From 659e332fc26260486c86f96d8cd219b1bb0638c4 Mon Sep 17 00:00:00 2001 From: AntoinePrv Date: Tue, 16 Sep 2025 17:01:31 +0200 Subject: [PATCH 01/76] Add large bit_width unpack64 tests --- cpp/src/arrow/util/bpacking_test.cc | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/cpp/src/arrow/util/bpacking_test.cc b/cpp/src/arrow/util/bpacking_test.cc index c2dd4748a44..a5f8b5ea1e1 100644 --- a/cpp/src/arrow/util/bpacking_test.cc +++ b/cpp/src/arrow/util/bpacking_test.cc @@ -201,6 +201,11 @@ class TestUnpack : public ::testing::TestWithParam { template void TestAll(UnpackFunc unpack) { + const auto [_, bit_width] = GetParam(); + if (static_cast(bit_width) > sizeof(Int) * 8) { + GTEST_SKIP() << "Not defined for this bit width"; + } + // Known values TestUnpackZeros(unpack); TestUnpackOnes(unpack); @@ -219,7 +224,8 @@ INSTANTIATE_TEST_SUITE_P( TestUnpackSize{128, 31}, TestUnpackSize{2048, 1}, TestUnpackSize{2048, 8}, TestUnpackSize{2048, 13}, TestUnpackSize{2048, 16}, TestUnpackSize{2048, 31}, - TestUnpackSize{2048, 32})); + TestUnpackSize{2048, 32}, TestUnpackSize{2048, 63}, + TestUnpackSize{2048, 64})); TEST_P(TestUnpack, Unpack32Scalar) { this->TestAll(&unpack32_scalar); } TEST_P(TestUnpack, Unpack64Scalar) { this->TestAll(&unpack64_scalar); } From 45767c9819e5677d2244a0b612aeb1d180b77444 Mon Sep 17 00:00:00 2001 From: AntoinePrv Date: Wed, 3 Sep 2025 14:09:02 +0200 Subject: [PATCH 02/76] Handle 16bit unpacking generation --- cpp/src/arrow/util/bpacking_simd_codegen.py | 162 ++++++++++++-------- 1 file changed, 96 insertions(+), 66 deletions(-) diff --git a/cpp/src/arrow/util/bpacking_simd_codegen.py b/cpp/src/arrow/util/bpacking_simd_codegen.py index 9464908c021..992afa95abf 100755 --- a/cpp/src/arrow/util/bpacking_simd_codegen.py +++ b/cpp/src/arrow/util/bpacking_simd_codegen.py @@ -22,106 +22,133 @@ # python bpacking_simd_codegen.py 256 > bpacking_simd256_generated_internal.h # python bpacking_simd_codegen.py 512 > bpacking_simd512_generated_internal.h -from functools import partial import sys from textwrap import dedent, indent class UnpackGenerator: - - def __init__(self, simd_width): + def __init__(self, simd_width, out_width, out_type): self.simd_width = simd_width - if simd_width % 32 != 0: - raise("SIMD bit width should be a multiple of 32") + self.out_width = out_width + if simd_width % out_width != 0: + raise ("SIMD bit width should be a multiple of output width") self.simd_byte_width = simd_width // 8 + self.out_byte_width = out_width // 8 + self.out_type = out_type def print_unpack_bit0_func(self): + ty = self.out_type print( - "inline static const uint32_t* unpack0_32(const uint32_t* in, uint32_t* out) {") - print(" memset(out, 0x0, 32 * sizeof(*out));") - print(" out += 32;") + f"inline static const {ty}* unpack0_{self.out_width}(const {ty}* in, {ty}* out) {{" + ) + print(f" std::memset(out, 0x0, {self.out_width} * sizeof(*out));") + print(f" out += {self.out_width};") print("") print(" return in;") print("}") - - def print_unpack_bit32_func(self): + def print_unpack_bitmax_func(self): + ty = self.out_type print( - "inline static const uint32_t* unpack32_32(const uint32_t* in, uint32_t* out) {") - print(" memcpy(out, in, 32 * sizeof(*out));") - print(" in += 32;") - print(" out += 32;") + f"inline static const {ty}* unpack{self.out_width}_{self.out_width}(const {ty}* in, {ty}* out) {{" + ) + print(f" std::memcpy(out, in, {self.out_width} * sizeof(*out));") + print(f" in += {self.out_width};") + print(f" out += {self.out_width};") print("") print(" return in;") print("}") def print_unpack_bit_func(self, bit): - def p(code): - print(indent(code, prefix=' ')) + def p(code, level=1): + print(indent(code, prefix=" " * level)) - shift = 0 - shifts = [] - in_index = 0 - inls = [] mask = (1 << bit) - 1 - bracket = "{" + ty = self.out_type + bytes_per_batch = self.simd_byte_width + words_per_batch = bytes_per_batch // self.out_byte_width + + print( + f"inline static const {ty}* unpack{bit}_{self.out_width}(const {ty}* in, {ty}* out) {{" + ) + p( + dedent(f"""\ + using simd_batch = xsimd::make_sized_batch_t<{ty}, {self.simd_width // self.out_width}>; - print(f"inline static const uint32_t* unpack{bit}_32(const uint32_t* in, uint32_t* out) {{") - p(dedent(f"""\ - uint32_t mask = 0x{mask:0x}; + {ty} mask = 0x{mask:0x}; simd_batch masks(mask); simd_batch words, shifts; simd_batch results; - """)) + """) + ) def safe_load(index): - return f"SafeLoad(in + {index})" + return f"SafeLoad<{ty}>(in + {index})" + + def static_cast_as_needed(str): + if self.out_width < 32: + return f"static_cast<{ty}>({str})" + return str - for i in range(32): - if shift + bit == 32: + shift = 0 + shifts = [] + in_index = 0 + inls = [] + + for i in range(self.out_width): + if shift + bit == self.out_width: shifts.append(shift) inls.append(safe_load(in_index)) in_index += 1 shift = 0 - elif shift + bit > 32: # cross the boundary + elif shift + bit > self.out_width: # cross the boundary inls.append( - f"{safe_load(in_index)} >> {shift} | {safe_load(in_index + 1)} << {32 - shift}") + static_cast_as_needed( + f"{safe_load(in_index)} >> {shift} " + f"| {safe_load(in_index + 1)} << {self.out_width - shift}" + ) + ) in_index += 1 - shift = bit - (32 - shift) + shift = bit - (self.out_width - shift) shifts.append(0) # zero shift else: shifts.append(shift) inls.append(safe_load(in_index)) shift += bit - bytes_per_batch = self.simd_byte_width - words_per_batch = bytes_per_batch // 4 - one_word_template = dedent("""\ - words = simd_batch{{ {words} }}; shifts = simd_batch{{ {shifts} }}; results = (words >> shifts) & masks; results.store_unaligned(out); out += {words_per_batch}; """) - for start in range(0, 32, words_per_batch): - stop = start + words_per_batch; + for start in range(0, self.out_width, words_per_batch): + stop = start + words_per_batch p(f"""// extract {bit}-bit bundles {start} to {stop - 1}""") - p(one_word_template.format( - words=", ".join(inls[start:stop]), - shifts=", ".join(map(str, shifts[start:stop])), - words_per_batch=words_per_batch)) - - p(dedent(f"""\ + p("words = simd_batch{") + for word_part in inls[start:stop]: + p(f"{word_part},", level=2) + p("};") + p( + one_word_template.format( + shifts=", ".join(map(str, shifts[start:stop])), + words_per_batch=words_per_batch, + ) + ) + + p( + dedent(f"""\ in += {bit}; - return in;""")) + return in;""") + ) print("}") def print_copyright(): - print(dedent("""\ + print( + dedent("""\ // Licensed to the Apache Software Foundation (ASF) under one // or more contributor license agreements. See the NOTICE file // distributed with this work for additional information @@ -138,7 +165,8 @@ def print_copyright(): // KIND, either express or implied. See the License for the // specific language governing permissions and limitations // under the License. - """)) + """) + ) def print_note(): @@ -146,7 +174,7 @@ def print_note(): print() -def main(simd_width): +def main(simd_width, outputs): print_copyright() print_note() @@ -156,7 +184,8 @@ def main(simd_width): # potential name collisions if there are several UnpackBits generations # with the same SIMD width on a given architecture. - print(dedent(f"""\ + print( + dedent(f"""\ #pragma once #include @@ -167,37 +196,37 @@ def main(simd_width): #include "arrow/util/dispatch_internal.h" #include "arrow/util/ubsan.h" - namespace arrow {{ - namespace internal {{ + namespace arrow::internal {{ namespace {{ using ::arrow::util::SafeLoad; template struct {struct_name} {{ + """) + ) - using simd_batch = xsimd::make_sized_batch_t; - """)) - - gen = UnpackGenerator(simd_width) - gen.print_unpack_bit0_func() - print() - for i in range(1, 32): - gen.print_unpack_bit_func(i) + for out_width, out_type in outputs: + gen = UnpackGenerator(simd_width, out_width, out_type) + gen.print_unpack_bit0_func() + print() + for i in range(1, out_width): + gen.print_unpack_bit_func(i) + print() + gen.print_unpack_bitmax_func() print() - gen.print_unpack_bit32_func() - print() - print(dedent(f"""\ + print( + dedent(f"""\ }}; // struct {struct_name} }} // namespace - }} // namespace internal - }} // namespace arrow - """)) + }} // namespace arrow::internal + """) + ) -if __name__ == '__main__': +if __name__ == "__main__": usage = f"""Usage: {__file__} """ if len(sys.argv) != 2: raise ValueError(usage) @@ -206,4 +235,5 @@ def main(simd_width): except ValueError: raise ValueError(usage) - main(simd_width) + outputs = [(16, "uint16_t"), (32, "uint32_t")] + main(simd_width, outputs) From a62ba1c418f961df1dba3cc344d67dec559ec1c4 Mon Sep 17 00:00:00 2001 From: AntoinePrv Date: Tue, 16 Sep 2025 14:19:11 +0200 Subject: [PATCH 03/76] Use uint8_t* input for simd unpack --- cpp/src/arrow/util/bpacking_avx2.cc | 4 ++-- cpp/src/arrow/util/bpacking_avx512.cc | 4 ++-- cpp/src/arrow/util/bpacking_neon.cc | 4 ++-- cpp/src/arrow/util/bpacking_simd_codegen.py | 14 +++++++------- cpp/src/arrow/util/bpacking_simd_internal.h | 2 +- 5 files changed, 14 insertions(+), 14 deletions(-) diff --git a/cpp/src/arrow/util/bpacking_avx2.cc b/cpp/src/arrow/util/bpacking_avx2.cc index 84f091594c1..c6d1b4546ce 100644 --- a/cpp/src/arrow/util/bpacking_avx2.cc +++ b/cpp/src/arrow/util/bpacking_avx2.cc @@ -22,8 +22,8 @@ namespace arrow::internal { int unpack32_avx2(const uint8_t* in, uint32_t* out, int batch_size, int num_bits) { - return unpack32_specialized>( - reinterpret_cast(in), out, batch_size, num_bits); + return unpack32_specialized>(in, out, batch_size, + num_bits); } } // namespace arrow::internal diff --git a/cpp/src/arrow/util/bpacking_avx512.cc b/cpp/src/arrow/util/bpacking_avx512.cc index 35de0dd5b47..29a7c133f30 100644 --- a/cpp/src/arrow/util/bpacking_avx512.cc +++ b/cpp/src/arrow/util/bpacking_avx512.cc @@ -22,8 +22,8 @@ namespace arrow::internal { int unpack32_avx512(const uint8_t* in, uint32_t* out, int batch_size, int num_bits) { - return unpack32_specialized>( - reinterpret_cast(in), out, batch_size, num_bits); + return unpack32_specialized>(in, out, batch_size, + num_bits); } } // namespace arrow::internal diff --git a/cpp/src/arrow/util/bpacking_neon.cc b/cpp/src/arrow/util/bpacking_neon.cc index 407b309b7e8..517e2d95a70 100644 --- a/cpp/src/arrow/util/bpacking_neon.cc +++ b/cpp/src/arrow/util/bpacking_neon.cc @@ -22,8 +22,8 @@ namespace arrow::internal { int unpack32_neon(const uint8_t* in, uint32_t* out, int batch_size, int num_bits) { - return unpack32_specialized>( - reinterpret_cast(in), out, batch_size, num_bits); + return unpack32_specialized>(in, out, batch_size, + num_bits); } } // namespace arrow::internal diff --git a/cpp/src/arrow/util/bpacking_simd_codegen.py b/cpp/src/arrow/util/bpacking_simd_codegen.py index 992afa95abf..b0c81c1a272 100755 --- a/cpp/src/arrow/util/bpacking_simd_codegen.py +++ b/cpp/src/arrow/util/bpacking_simd_codegen.py @@ -39,7 +39,7 @@ def __init__(self, simd_width, out_width, out_type): def print_unpack_bit0_func(self): ty = self.out_type print( - f"inline static const {ty}* unpack0_{self.out_width}(const {ty}* in, {ty}* out) {{" + f"inline static const uint8_t* unpack0_{self.out_width}(const uint8_t* in, {ty}* out) {{" ) print(f" std::memset(out, 0x0, {self.out_width} * sizeof(*out));") print(f" out += {self.out_width};") @@ -50,10 +50,10 @@ def print_unpack_bit0_func(self): def print_unpack_bitmax_func(self): ty = self.out_type print( - f"inline static const {ty}* unpack{self.out_width}_{self.out_width}(const {ty}* in, {ty}* out) {{" + f"inline static const uint8_t* unpack{self.out_width}_{self.out_width}(const uint8_t* in, {ty}* out) {{" ) print(f" std::memcpy(out, in, {self.out_width} * sizeof(*out));") - print(f" in += {self.out_width};") + print(f" in += {self.out_byte_width} * {self.out_width};") print(f" out += {self.out_width};") print("") print(" return in;") @@ -69,7 +69,7 @@ def p(code, level=1): words_per_batch = bytes_per_batch // self.out_byte_width print( - f"inline static const {ty}* unpack{bit}_{self.out_width}(const {ty}* in, {ty}* out) {{" + f"inline static const uint8_t* unpack{bit}_{self.out_width}(const uint8_t* in, {ty}* out) {{" ) p( dedent(f"""\ @@ -84,7 +84,7 @@ def p(code, level=1): ) def safe_load(index): - return f"SafeLoad<{ty}>(in + {index})" + return f"SafeLoadAs<{ty}>(in + {self.out_byte_width} * {index})" def static_cast_as_needed(str): if self.out_width < 32: @@ -140,7 +140,7 @@ def static_cast_as_needed(str): p( dedent(f"""\ - in += {bit}; + in += {bit} * {self.out_byte_width}; return in;""") ) print("}") @@ -199,7 +199,7 @@ def main(simd_width, outputs): namespace arrow::internal {{ namespace {{ - using ::arrow::util::SafeLoad; + using ::arrow::util::SafeLoadAs; template struct {struct_name} {{ diff --git a/cpp/src/arrow/util/bpacking_simd_internal.h b/cpp/src/arrow/util/bpacking_simd_internal.h index 98e192e7cb0..8a3dc0d8af7 100644 --- a/cpp/src/arrow/util/bpacking_simd_internal.h +++ b/cpp/src/arrow/util/bpacking_simd_internal.h @@ -22,7 +22,7 @@ namespace arrow { namespace internal { template -static int unpack32_specialized(const uint32_t* in, uint32_t* out, int batch_size, +static int unpack32_specialized(const uint8_t* in, uint32_t* out, int batch_size, int num_bits) { batch_size = batch_size / 32 * 32; int num_loops = batch_size / 32; From b9d3a20f2681536c264e36469aca9db7122ecf52 Mon Sep 17 00:00:00 2001 From: AntoinePrv Date: Tue, 16 Sep 2025 14:21:20 +0200 Subject: [PATCH 04/76] Gen: regenerate bpacking_simd --- .../bpacking_simd128_generated_internal.h | 2630 +++++++++++++++-- .../bpacking_simd256_generated_internal.h | 2138 ++++++++++++-- .../bpacking_simd512_generated_internal.h | 1952 +++++++++++- 3 files changed, 6067 insertions(+), 653 deletions(-) diff --git a/cpp/src/arrow/util/bpacking_simd128_generated_internal.h b/cpp/src/arrow/util/bpacking_simd128_generated_internal.h index 5beecad4210..b47e2d3a627 100644 --- a/cpp/src/arrow/util/bpacking_simd128_generated_internal.h +++ b/cpp/src/arrow/util/bpacking_simd128_generated_internal.h @@ -27,25 +27,714 @@ #include "arrow/util/dispatch_internal.h" #include "arrow/util/ubsan.h" -namespace arrow { -namespace internal { +namespace arrow::internal { namespace { -using ::arrow::util::SafeLoad; +using ::arrow::util::SafeLoadAs; template struct UnpackBits128 { -using simd_batch = xsimd::make_sized_batch_t; +inline static const uint8_t* unpack0_16(const uint8_t* in, uint16_t* out) { + std::memset(out, 0x0, 16 * sizeof(*out)); + out += 16; -inline static const uint32_t* unpack0_32(const uint32_t* in, uint32_t* out) { - memset(out, 0x0, 32 * sizeof(*out)); + return in; +} + +inline static const uint8_t* unpack1_16(const uint8_t* in, uint16_t* out) { + using simd_batch = xsimd::make_sized_batch_t; + + uint16_t mask = 0x1; + + simd_batch masks(mask); + simd_batch words, shifts; + simd_batch results; + + // extract 1-bit bundles 0 to 7 + words = simd_batch{ + SafeLoadAs(in + 2 * 0), + SafeLoadAs(in + 2 * 0), + SafeLoadAs(in + 2 * 0), + SafeLoadAs(in + 2 * 0), + SafeLoadAs(in + 2 * 0), + SafeLoadAs(in + 2 * 0), + SafeLoadAs(in + 2 * 0), + SafeLoadAs(in + 2 * 0), + }; + shifts = simd_batch{ 0, 1, 2, 3, 4, 5, 6, 7 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 8; + + // extract 1-bit bundles 8 to 15 + words = simd_batch{ + SafeLoadAs(in + 2 * 0), + SafeLoadAs(in + 2 * 0), + SafeLoadAs(in + 2 * 0), + SafeLoadAs(in + 2 * 0), + SafeLoadAs(in + 2 * 0), + SafeLoadAs(in + 2 * 0), + SafeLoadAs(in + 2 * 0), + SafeLoadAs(in + 2 * 0), + }; + shifts = simd_batch{ 8, 9, 10, 11, 12, 13, 14, 15 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 8; + + in += 1 * 2; + return in; +} + +inline static const uint8_t* unpack2_16(const uint8_t* in, uint16_t* out) { + using simd_batch = xsimd::make_sized_batch_t; + + uint16_t mask = 0x3; + + simd_batch masks(mask); + simd_batch words, shifts; + simd_batch results; + + // extract 2-bit bundles 0 to 7 + words = simd_batch{ + SafeLoadAs(in + 2 * 0), + SafeLoadAs(in + 2 * 0), + SafeLoadAs(in + 2 * 0), + SafeLoadAs(in + 2 * 0), + SafeLoadAs(in + 2 * 0), + SafeLoadAs(in + 2 * 0), + SafeLoadAs(in + 2 * 0), + SafeLoadAs(in + 2 * 0), + }; + shifts = simd_batch{ 0, 2, 4, 6, 8, 10, 12, 14 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 8; + + // extract 2-bit bundles 8 to 15 + words = simd_batch{ + SafeLoadAs(in + 2 * 1), + SafeLoadAs(in + 2 * 1), + SafeLoadAs(in + 2 * 1), + SafeLoadAs(in + 2 * 1), + SafeLoadAs(in + 2 * 1), + SafeLoadAs(in + 2 * 1), + SafeLoadAs(in + 2 * 1), + SafeLoadAs(in + 2 * 1), + }; + shifts = simd_batch{ 0, 2, 4, 6, 8, 10, 12, 14 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 8; + + in += 2 * 2; + return in; +} + +inline static const uint8_t* unpack3_16(const uint8_t* in, uint16_t* out) { + using simd_batch = xsimd::make_sized_batch_t; + + uint16_t mask = 0x7; + + simd_batch masks(mask); + simd_batch words, shifts; + simd_batch results; + + // extract 3-bit bundles 0 to 7 + words = simd_batch{ + SafeLoadAs(in + 2 * 0), + SafeLoadAs(in + 2 * 0), + SafeLoadAs(in + 2 * 0), + SafeLoadAs(in + 2 * 0), + SafeLoadAs(in + 2 * 0), + static_cast(SafeLoadAs(in + 2 * 0) >> 15 | SafeLoadAs(in + 2 * 1) << 1), + SafeLoadAs(in + 2 * 1), + SafeLoadAs(in + 2 * 1), + }; + shifts = simd_batch{ 0, 3, 6, 9, 12, 0, 2, 5 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 8; + + // extract 3-bit bundles 8 to 15 + words = simd_batch{ + SafeLoadAs(in + 2 * 1), + SafeLoadAs(in + 2 * 1), + static_cast(SafeLoadAs(in + 2 * 1) >> 14 | SafeLoadAs(in + 2 * 2) << 2), + SafeLoadAs(in + 2 * 2), + SafeLoadAs(in + 2 * 2), + SafeLoadAs(in + 2 * 2), + SafeLoadAs(in + 2 * 2), + SafeLoadAs(in + 2 * 2), + }; + shifts = simd_batch{ 8, 11, 0, 1, 4, 7, 10, 13 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 8; + + in += 3 * 2; + return in; +} + +inline static const uint8_t* unpack4_16(const uint8_t* in, uint16_t* out) { + using simd_batch = xsimd::make_sized_batch_t; + + uint16_t mask = 0xf; + + simd_batch masks(mask); + simd_batch words, shifts; + simd_batch results; + + // extract 4-bit bundles 0 to 7 + words = simd_batch{ + SafeLoadAs(in + 2 * 0), + SafeLoadAs(in + 2 * 0), + SafeLoadAs(in + 2 * 0), + SafeLoadAs(in + 2 * 0), + SafeLoadAs(in + 2 * 1), + SafeLoadAs(in + 2 * 1), + SafeLoadAs(in + 2 * 1), + SafeLoadAs(in + 2 * 1), + }; + shifts = simd_batch{ 0, 4, 8, 12, 0, 4, 8, 12 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 8; + + // extract 4-bit bundles 8 to 15 + words = simd_batch{ + SafeLoadAs(in + 2 * 2), + SafeLoadAs(in + 2 * 2), + SafeLoadAs(in + 2 * 2), + SafeLoadAs(in + 2 * 2), + SafeLoadAs(in + 2 * 3), + SafeLoadAs(in + 2 * 3), + SafeLoadAs(in + 2 * 3), + SafeLoadAs(in + 2 * 3), + }; + shifts = simd_batch{ 0, 4, 8, 12, 0, 4, 8, 12 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 8; + + in += 4 * 2; + return in; +} + +inline static const uint8_t* unpack5_16(const uint8_t* in, uint16_t* out) { + using simd_batch = xsimd::make_sized_batch_t; + + uint16_t mask = 0x1f; + + simd_batch masks(mask); + simd_batch words, shifts; + simd_batch results; + + // extract 5-bit bundles 0 to 7 + words = simd_batch{ + SafeLoadAs(in + 2 * 0), + SafeLoadAs(in + 2 * 0), + SafeLoadAs(in + 2 * 0), + static_cast(SafeLoadAs(in + 2 * 0) >> 15 | SafeLoadAs(in + 2 * 1) << 1), + SafeLoadAs(in + 2 * 1), + SafeLoadAs(in + 2 * 1), + static_cast(SafeLoadAs(in + 2 * 1) >> 14 | SafeLoadAs(in + 2 * 2) << 2), + SafeLoadAs(in + 2 * 2), + }; + shifts = simd_batch{ 0, 5, 10, 0, 4, 9, 0, 3 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 8; + + // extract 5-bit bundles 8 to 15 + words = simd_batch{ + SafeLoadAs(in + 2 * 2), + static_cast(SafeLoadAs(in + 2 * 2) >> 13 | SafeLoadAs(in + 2 * 3) << 3), + SafeLoadAs(in + 2 * 3), + SafeLoadAs(in + 2 * 3), + static_cast(SafeLoadAs(in + 2 * 3) >> 12 | SafeLoadAs(in + 2 * 4) << 4), + SafeLoadAs(in + 2 * 4), + SafeLoadAs(in + 2 * 4), + SafeLoadAs(in + 2 * 4), + }; + shifts = simd_batch{ 8, 0, 2, 7, 0, 1, 6, 11 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 8; + + in += 5 * 2; + return in; +} + +inline static const uint8_t* unpack6_16(const uint8_t* in, uint16_t* out) { + using simd_batch = xsimd::make_sized_batch_t; + + uint16_t mask = 0x3f; + + simd_batch masks(mask); + simd_batch words, shifts; + simd_batch results; + + // extract 6-bit bundles 0 to 7 + words = simd_batch{ + SafeLoadAs(in + 2 * 0), + SafeLoadAs(in + 2 * 0), + static_cast(SafeLoadAs(in + 2 * 0) >> 12 | SafeLoadAs(in + 2 * 1) << 4), + SafeLoadAs(in + 2 * 1), + SafeLoadAs(in + 2 * 1), + static_cast(SafeLoadAs(in + 2 * 1) >> 14 | SafeLoadAs(in + 2 * 2) << 2), + SafeLoadAs(in + 2 * 2), + SafeLoadAs(in + 2 * 2), + }; + shifts = simd_batch{ 0, 6, 0, 2, 8, 0, 4, 10 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 8; + + // extract 6-bit bundles 8 to 15 + words = simd_batch{ + SafeLoadAs(in + 2 * 3), + SafeLoadAs(in + 2 * 3), + static_cast(SafeLoadAs(in + 2 * 3) >> 12 | SafeLoadAs(in + 2 * 4) << 4), + SafeLoadAs(in + 2 * 4), + SafeLoadAs(in + 2 * 4), + static_cast(SafeLoadAs(in + 2 * 4) >> 14 | SafeLoadAs(in + 2 * 5) << 2), + SafeLoadAs(in + 2 * 5), + SafeLoadAs(in + 2 * 5), + }; + shifts = simd_batch{ 0, 6, 0, 2, 8, 0, 4, 10 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 8; + + in += 6 * 2; + return in; +} + +inline static const uint8_t* unpack7_16(const uint8_t* in, uint16_t* out) { + using simd_batch = xsimd::make_sized_batch_t; + + uint16_t mask = 0x7f; + + simd_batch masks(mask); + simd_batch words, shifts; + simd_batch results; + + // extract 7-bit bundles 0 to 7 + words = simd_batch{ + SafeLoadAs(in + 2 * 0), + SafeLoadAs(in + 2 * 0), + static_cast(SafeLoadAs(in + 2 * 0) >> 14 | SafeLoadAs(in + 2 * 1) << 2), + SafeLoadAs(in + 2 * 1), + static_cast(SafeLoadAs(in + 2 * 1) >> 12 | SafeLoadAs(in + 2 * 2) << 4), + SafeLoadAs(in + 2 * 2), + static_cast(SafeLoadAs(in + 2 * 2) >> 10 | SafeLoadAs(in + 2 * 3) << 6), + SafeLoadAs(in + 2 * 3), + }; + shifts = simd_batch{ 0, 7, 0, 5, 0, 3, 0, 1 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 8; + + // extract 7-bit bundles 8 to 15 + words = simd_batch{ + SafeLoadAs(in + 2 * 3), + static_cast(SafeLoadAs(in + 2 * 3) >> 15 | SafeLoadAs(in + 2 * 4) << 1), + SafeLoadAs(in + 2 * 4), + static_cast(SafeLoadAs(in + 2 * 4) >> 13 | SafeLoadAs(in + 2 * 5) << 3), + SafeLoadAs(in + 2 * 5), + static_cast(SafeLoadAs(in + 2 * 5) >> 11 | SafeLoadAs(in + 2 * 6) << 5), + SafeLoadAs(in + 2 * 6), + SafeLoadAs(in + 2 * 6), + }; + shifts = simd_batch{ 8, 0, 6, 0, 4, 0, 2, 9 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 8; + + in += 7 * 2; + return in; +} + +inline static const uint8_t* unpack8_16(const uint8_t* in, uint16_t* out) { + using simd_batch = xsimd::make_sized_batch_t; + + uint16_t mask = 0xff; + + simd_batch masks(mask); + simd_batch words, shifts; + simd_batch results; + + // extract 8-bit bundles 0 to 7 + words = simd_batch{ + SafeLoadAs(in + 2 * 0), + SafeLoadAs(in + 2 * 0), + SafeLoadAs(in + 2 * 1), + SafeLoadAs(in + 2 * 1), + SafeLoadAs(in + 2 * 2), + SafeLoadAs(in + 2 * 2), + SafeLoadAs(in + 2 * 3), + SafeLoadAs(in + 2 * 3), + }; + shifts = simd_batch{ 0, 8, 0, 8, 0, 8, 0, 8 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 8; + + // extract 8-bit bundles 8 to 15 + words = simd_batch{ + SafeLoadAs(in + 2 * 4), + SafeLoadAs(in + 2 * 4), + SafeLoadAs(in + 2 * 5), + SafeLoadAs(in + 2 * 5), + SafeLoadAs(in + 2 * 6), + SafeLoadAs(in + 2 * 6), + SafeLoadAs(in + 2 * 7), + SafeLoadAs(in + 2 * 7), + }; + shifts = simd_batch{ 0, 8, 0, 8, 0, 8, 0, 8 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 8; + + in += 8 * 2; + return in; +} + +inline static const uint8_t* unpack9_16(const uint8_t* in, uint16_t* out) { + using simd_batch = xsimd::make_sized_batch_t; + + uint16_t mask = 0x1ff; + + simd_batch masks(mask); + simd_batch words, shifts; + simd_batch results; + + // extract 9-bit bundles 0 to 7 + words = simd_batch{ + SafeLoadAs(in + 2 * 0), + static_cast(SafeLoadAs(in + 2 * 0) >> 9 | SafeLoadAs(in + 2 * 1) << 7), + SafeLoadAs(in + 2 * 1), + static_cast(SafeLoadAs(in + 2 * 1) >> 11 | SafeLoadAs(in + 2 * 2) << 5), + SafeLoadAs(in + 2 * 2), + static_cast(SafeLoadAs(in + 2 * 2) >> 13 | SafeLoadAs(in + 2 * 3) << 3), + SafeLoadAs(in + 2 * 3), + static_cast(SafeLoadAs(in + 2 * 3) >> 15 | SafeLoadAs(in + 2 * 4) << 1), + }; + shifts = simd_batch{ 0, 0, 2, 0, 4, 0, 6, 0 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 8; + + // extract 9-bit bundles 8 to 15 + words = simd_batch{ + static_cast(SafeLoadAs(in + 2 * 4) >> 8 | SafeLoadAs(in + 2 * 5) << 8), + SafeLoadAs(in + 2 * 5), + static_cast(SafeLoadAs(in + 2 * 5) >> 10 | SafeLoadAs(in + 2 * 6) << 6), + SafeLoadAs(in + 2 * 6), + static_cast(SafeLoadAs(in + 2 * 6) >> 12 | SafeLoadAs(in + 2 * 7) << 4), + SafeLoadAs(in + 2 * 7), + static_cast(SafeLoadAs(in + 2 * 7) >> 14 | SafeLoadAs(in + 2 * 8) << 2), + SafeLoadAs(in + 2 * 8), + }; + shifts = simd_batch{ 0, 1, 0, 3, 0, 5, 0, 7 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 8; + + in += 9 * 2; + return in; +} + +inline static const uint8_t* unpack10_16(const uint8_t* in, uint16_t* out) { + using simd_batch = xsimd::make_sized_batch_t; + + uint16_t mask = 0x3ff; + + simd_batch masks(mask); + simd_batch words, shifts; + simd_batch results; + + // extract 10-bit bundles 0 to 7 + words = simd_batch{ + SafeLoadAs(in + 2 * 0), + static_cast(SafeLoadAs(in + 2 * 0) >> 10 | SafeLoadAs(in + 2 * 1) << 6), + SafeLoadAs(in + 2 * 1), + static_cast(SafeLoadAs(in + 2 * 1) >> 14 | SafeLoadAs(in + 2 * 2) << 2), + static_cast(SafeLoadAs(in + 2 * 2) >> 8 | SafeLoadAs(in + 2 * 3) << 8), + SafeLoadAs(in + 2 * 3), + static_cast(SafeLoadAs(in + 2 * 3) >> 12 | SafeLoadAs(in + 2 * 4) << 4), + SafeLoadAs(in + 2 * 4), + }; + shifts = simd_batch{ 0, 0, 4, 0, 0, 2, 0, 6 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 8; + + // extract 10-bit bundles 8 to 15 + words = simd_batch{ + SafeLoadAs(in + 2 * 5), + static_cast(SafeLoadAs(in + 2 * 5) >> 10 | SafeLoadAs(in + 2 * 6) << 6), + SafeLoadAs(in + 2 * 6), + static_cast(SafeLoadAs(in + 2 * 6) >> 14 | SafeLoadAs(in + 2 * 7) << 2), + static_cast(SafeLoadAs(in + 2 * 7) >> 8 | SafeLoadAs(in + 2 * 8) << 8), + SafeLoadAs(in + 2 * 8), + static_cast(SafeLoadAs(in + 2 * 8) >> 12 | SafeLoadAs(in + 2 * 9) << 4), + SafeLoadAs(in + 2 * 9), + }; + shifts = simd_batch{ 0, 0, 4, 0, 0, 2, 0, 6 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 8; + + in += 10 * 2; + return in; +} + +inline static const uint8_t* unpack11_16(const uint8_t* in, uint16_t* out) { + using simd_batch = xsimd::make_sized_batch_t; + + uint16_t mask = 0x7ff; + + simd_batch masks(mask); + simd_batch words, shifts; + simd_batch results; + + // extract 11-bit bundles 0 to 7 + words = simd_batch{ + SafeLoadAs(in + 2 * 0), + static_cast(SafeLoadAs(in + 2 * 0) >> 11 | SafeLoadAs(in + 2 * 1) << 5), + static_cast(SafeLoadAs(in + 2 * 1) >> 6 | SafeLoadAs(in + 2 * 2) << 10), + SafeLoadAs(in + 2 * 2), + static_cast(SafeLoadAs(in + 2 * 2) >> 12 | SafeLoadAs(in + 2 * 3) << 4), + static_cast(SafeLoadAs(in + 2 * 3) >> 7 | SafeLoadAs(in + 2 * 4) << 9), + SafeLoadAs(in + 2 * 4), + static_cast(SafeLoadAs(in + 2 * 4) >> 13 | SafeLoadAs(in + 2 * 5) << 3), + }; + shifts = simd_batch{ 0, 0, 0, 1, 0, 0, 2, 0 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 8; + + // extract 11-bit bundles 8 to 15 + words = simd_batch{ + static_cast(SafeLoadAs(in + 2 * 5) >> 8 | SafeLoadAs(in + 2 * 6) << 8), + SafeLoadAs(in + 2 * 6), + static_cast(SafeLoadAs(in + 2 * 6) >> 14 | SafeLoadAs(in + 2 * 7) << 2), + static_cast(SafeLoadAs(in + 2 * 7) >> 9 | SafeLoadAs(in + 2 * 8) << 7), + SafeLoadAs(in + 2 * 8), + static_cast(SafeLoadAs(in + 2 * 8) >> 15 | SafeLoadAs(in + 2 * 9) << 1), + static_cast(SafeLoadAs(in + 2 * 9) >> 10 | SafeLoadAs(in + 2 * 10) << 6), + SafeLoadAs(in + 2 * 10), + }; + shifts = simd_batch{ 0, 3, 0, 0, 4, 0, 0, 5 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 8; + + in += 11 * 2; + return in; +} + +inline static const uint8_t* unpack12_16(const uint8_t* in, uint16_t* out) { + using simd_batch = xsimd::make_sized_batch_t; + + uint16_t mask = 0xfff; + + simd_batch masks(mask); + simd_batch words, shifts; + simd_batch results; + + // extract 12-bit bundles 0 to 7 + words = simd_batch{ + SafeLoadAs(in + 2 * 0), + static_cast(SafeLoadAs(in + 2 * 0) >> 12 | SafeLoadAs(in + 2 * 1) << 4), + static_cast(SafeLoadAs(in + 2 * 1) >> 8 | SafeLoadAs(in + 2 * 2) << 8), + SafeLoadAs(in + 2 * 2), + SafeLoadAs(in + 2 * 3), + static_cast(SafeLoadAs(in + 2 * 3) >> 12 | SafeLoadAs(in + 2 * 4) << 4), + static_cast(SafeLoadAs(in + 2 * 4) >> 8 | SafeLoadAs(in + 2 * 5) << 8), + SafeLoadAs(in + 2 * 5), + }; + shifts = simd_batch{ 0, 0, 0, 4, 0, 0, 0, 4 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 8; + + // extract 12-bit bundles 8 to 15 + words = simd_batch{ + SafeLoadAs(in + 2 * 6), + static_cast(SafeLoadAs(in + 2 * 6) >> 12 | SafeLoadAs(in + 2 * 7) << 4), + static_cast(SafeLoadAs(in + 2 * 7) >> 8 | SafeLoadAs(in + 2 * 8) << 8), + SafeLoadAs(in + 2 * 8), + SafeLoadAs(in + 2 * 9), + static_cast(SafeLoadAs(in + 2 * 9) >> 12 | SafeLoadAs(in + 2 * 10) << 4), + static_cast(SafeLoadAs(in + 2 * 10) >> 8 | SafeLoadAs(in + 2 * 11) << 8), + SafeLoadAs(in + 2 * 11), + }; + shifts = simd_batch{ 0, 0, 0, 4, 0, 0, 0, 4 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 8; + + in += 12 * 2; + return in; +} + +inline static const uint8_t* unpack13_16(const uint8_t* in, uint16_t* out) { + using simd_batch = xsimd::make_sized_batch_t; + + uint16_t mask = 0x1fff; + + simd_batch masks(mask); + simd_batch words, shifts; + simd_batch results; + + // extract 13-bit bundles 0 to 7 + words = simd_batch{ + SafeLoadAs(in + 2 * 0), + static_cast(SafeLoadAs(in + 2 * 0) >> 13 | SafeLoadAs(in + 2 * 1) << 3), + static_cast(SafeLoadAs(in + 2 * 1) >> 10 | SafeLoadAs(in + 2 * 2) << 6), + static_cast(SafeLoadAs(in + 2 * 2) >> 7 | SafeLoadAs(in + 2 * 3) << 9), + static_cast(SafeLoadAs(in + 2 * 3) >> 4 | SafeLoadAs(in + 2 * 4) << 12), + SafeLoadAs(in + 2 * 4), + static_cast(SafeLoadAs(in + 2 * 4) >> 14 | SafeLoadAs(in + 2 * 5) << 2), + static_cast(SafeLoadAs(in + 2 * 5) >> 11 | SafeLoadAs(in + 2 * 6) << 5), + }; + shifts = simd_batch{ 0, 0, 0, 0, 0, 1, 0, 0 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 8; + + // extract 13-bit bundles 8 to 15 + words = simd_batch{ + static_cast(SafeLoadAs(in + 2 * 6) >> 8 | SafeLoadAs(in + 2 * 7) << 8), + static_cast(SafeLoadAs(in + 2 * 7) >> 5 | SafeLoadAs(in + 2 * 8) << 11), + SafeLoadAs(in + 2 * 8), + static_cast(SafeLoadAs(in + 2 * 8) >> 15 | SafeLoadAs(in + 2 * 9) << 1), + static_cast(SafeLoadAs(in + 2 * 9) >> 12 | SafeLoadAs(in + 2 * 10) << 4), + static_cast(SafeLoadAs(in + 2 * 10) >> 9 | SafeLoadAs(in + 2 * 11) << 7), + static_cast(SafeLoadAs(in + 2 * 11) >> 6 | SafeLoadAs(in + 2 * 12) << 10), + SafeLoadAs(in + 2 * 12), + }; + shifts = simd_batch{ 0, 0, 2, 0, 0, 0, 0, 3 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 8; + + in += 13 * 2; + return in; +} + +inline static const uint8_t* unpack14_16(const uint8_t* in, uint16_t* out) { + using simd_batch = xsimd::make_sized_batch_t; + + uint16_t mask = 0x3fff; + + simd_batch masks(mask); + simd_batch words, shifts; + simd_batch results; + + // extract 14-bit bundles 0 to 7 + words = simd_batch{ + SafeLoadAs(in + 2 * 0), + static_cast(SafeLoadAs(in + 2 * 0) >> 14 | SafeLoadAs(in + 2 * 1) << 2), + static_cast(SafeLoadAs(in + 2 * 1) >> 12 | SafeLoadAs(in + 2 * 2) << 4), + static_cast(SafeLoadAs(in + 2 * 2) >> 10 | SafeLoadAs(in + 2 * 3) << 6), + static_cast(SafeLoadAs(in + 2 * 3) >> 8 | SafeLoadAs(in + 2 * 4) << 8), + static_cast(SafeLoadAs(in + 2 * 4) >> 6 | SafeLoadAs(in + 2 * 5) << 10), + static_cast(SafeLoadAs(in + 2 * 5) >> 4 | SafeLoadAs(in + 2 * 6) << 12), + SafeLoadAs(in + 2 * 6), + }; + shifts = simd_batch{ 0, 0, 0, 0, 0, 0, 0, 2 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 8; + + // extract 14-bit bundles 8 to 15 + words = simd_batch{ + SafeLoadAs(in + 2 * 7), + static_cast(SafeLoadAs(in + 2 * 7) >> 14 | SafeLoadAs(in + 2 * 8) << 2), + static_cast(SafeLoadAs(in + 2 * 8) >> 12 | SafeLoadAs(in + 2 * 9) << 4), + static_cast(SafeLoadAs(in + 2 * 9) >> 10 | SafeLoadAs(in + 2 * 10) << 6), + static_cast(SafeLoadAs(in + 2 * 10) >> 8 | SafeLoadAs(in + 2 * 11) << 8), + static_cast(SafeLoadAs(in + 2 * 11) >> 6 | SafeLoadAs(in + 2 * 12) << 10), + static_cast(SafeLoadAs(in + 2 * 12) >> 4 | SafeLoadAs(in + 2 * 13) << 12), + SafeLoadAs(in + 2 * 13), + }; + shifts = simd_batch{ 0, 0, 0, 0, 0, 0, 0, 2 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 8; + + in += 14 * 2; + return in; +} + +inline static const uint8_t* unpack15_16(const uint8_t* in, uint16_t* out) { + using simd_batch = xsimd::make_sized_batch_t; + + uint16_t mask = 0x7fff; + + simd_batch masks(mask); + simd_batch words, shifts; + simd_batch results; + + // extract 15-bit bundles 0 to 7 + words = simd_batch{ + SafeLoadAs(in + 2 * 0), + static_cast(SafeLoadAs(in + 2 * 0) >> 15 | SafeLoadAs(in + 2 * 1) << 1), + static_cast(SafeLoadAs(in + 2 * 1) >> 14 | SafeLoadAs(in + 2 * 2) << 2), + static_cast(SafeLoadAs(in + 2 * 2) >> 13 | SafeLoadAs(in + 2 * 3) << 3), + static_cast(SafeLoadAs(in + 2 * 3) >> 12 | SafeLoadAs(in + 2 * 4) << 4), + static_cast(SafeLoadAs(in + 2 * 4) >> 11 | SafeLoadAs(in + 2 * 5) << 5), + static_cast(SafeLoadAs(in + 2 * 5) >> 10 | SafeLoadAs(in + 2 * 6) << 6), + static_cast(SafeLoadAs(in + 2 * 6) >> 9 | SafeLoadAs(in + 2 * 7) << 7), + }; + shifts = simd_batch{ 0, 0, 0, 0, 0, 0, 0, 0 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 8; + + // extract 15-bit bundles 8 to 15 + words = simd_batch{ + static_cast(SafeLoadAs(in + 2 * 7) >> 8 | SafeLoadAs(in + 2 * 8) << 8), + static_cast(SafeLoadAs(in + 2 * 8) >> 7 | SafeLoadAs(in + 2 * 9) << 9), + static_cast(SafeLoadAs(in + 2 * 9) >> 6 | SafeLoadAs(in + 2 * 10) << 10), + static_cast(SafeLoadAs(in + 2 * 10) >> 5 | SafeLoadAs(in + 2 * 11) << 11), + static_cast(SafeLoadAs(in + 2 * 11) >> 4 | SafeLoadAs(in + 2 * 12) << 12), + static_cast(SafeLoadAs(in + 2 * 12) >> 3 | SafeLoadAs(in + 2 * 13) << 13), + static_cast(SafeLoadAs(in + 2 * 13) >> 2 | SafeLoadAs(in + 2 * 14) << 14), + SafeLoadAs(in + 2 * 14), + }; + shifts = simd_batch{ 0, 0, 0, 0, 0, 0, 0, 1 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 8; + + in += 15 * 2; + return in; +} + +inline static const uint8_t* unpack16_16(const uint8_t* in, uint16_t* out) { + std::memcpy(out, in, 16 * sizeof(*out)); + in += 2 * 16; + out += 16; + + return in; +} + +inline static const uint8_t* unpack0_32(const uint8_t* in, uint32_t* out) { + std::memset(out, 0x0, 32 * sizeof(*out)); out += 32; return in; } -inline static const uint32_t* unpack1_32(const uint32_t* in, uint32_t* out) { +inline static const uint8_t* unpack1_32(const uint8_t* in, uint32_t* out) { + using simd_batch = xsimd::make_sized_batch_t; + uint32_t mask = 0x1; simd_batch masks(mask); @@ -53,66 +742,108 @@ inline static const uint32_t* unpack1_32(const uint32_t* in, uint32_t* out) { simd_batch results; // extract 1-bit bundles 0 to 3 - words = simd_batch{ SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0) }; + words = simd_batch{ + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0), + }; shifts = simd_batch{ 0, 1, 2, 3 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 1-bit bundles 4 to 7 - words = simd_batch{ SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0) }; + words = simd_batch{ + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0), + }; shifts = simd_batch{ 4, 5, 6, 7 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 1-bit bundles 8 to 11 - words = simd_batch{ SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0) }; + words = simd_batch{ + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0), + }; shifts = simd_batch{ 8, 9, 10, 11 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 1-bit bundles 12 to 15 - words = simd_batch{ SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0) }; + words = simd_batch{ + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0), + }; shifts = simd_batch{ 12, 13, 14, 15 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 1-bit bundles 16 to 19 - words = simd_batch{ SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0) }; + words = simd_batch{ + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0), + }; shifts = simd_batch{ 16, 17, 18, 19 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 1-bit bundles 20 to 23 - words = simd_batch{ SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0) }; + words = simd_batch{ + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0), + }; shifts = simd_batch{ 20, 21, 22, 23 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 1-bit bundles 24 to 27 - words = simd_batch{ SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0) }; + words = simd_batch{ + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0), + }; shifts = simd_batch{ 24, 25, 26, 27 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 1-bit bundles 28 to 31 - words = simd_batch{ SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0) }; + words = simd_batch{ + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0), + }; shifts = simd_batch{ 28, 29, 30, 31 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; - in += 1; + in += 1 * 4; return in; } -inline static const uint32_t* unpack2_32(const uint32_t* in, uint32_t* out) { +inline static const uint8_t* unpack2_32(const uint8_t* in, uint32_t* out) { + using simd_batch = xsimd::make_sized_batch_t; + uint32_t mask = 0x3; simd_batch masks(mask); @@ -120,66 +851,108 @@ inline static const uint32_t* unpack2_32(const uint32_t* in, uint32_t* out) { simd_batch results; // extract 2-bit bundles 0 to 3 - words = simd_batch{ SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0) }; + words = simd_batch{ + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0), + }; shifts = simd_batch{ 0, 2, 4, 6 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 2-bit bundles 4 to 7 - words = simd_batch{ SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0) }; + words = simd_batch{ + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0), + }; shifts = simd_batch{ 8, 10, 12, 14 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 2-bit bundles 8 to 11 - words = simd_batch{ SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0) }; + words = simd_batch{ + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0), + }; shifts = simd_batch{ 16, 18, 20, 22 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 2-bit bundles 12 to 15 - words = simd_batch{ SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0) }; + words = simd_batch{ + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0), + }; shifts = simd_batch{ 24, 26, 28, 30 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 2-bit bundles 16 to 19 - words = simd_batch{ SafeLoad(in + 1), SafeLoad(in + 1), SafeLoad(in + 1), SafeLoad(in + 1) }; + words = simd_batch{ + SafeLoadAs(in + 4 * 1), + SafeLoadAs(in + 4 * 1), + SafeLoadAs(in + 4 * 1), + SafeLoadAs(in + 4 * 1), + }; shifts = simd_batch{ 0, 2, 4, 6 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 2-bit bundles 20 to 23 - words = simd_batch{ SafeLoad(in + 1), SafeLoad(in + 1), SafeLoad(in + 1), SafeLoad(in + 1) }; + words = simd_batch{ + SafeLoadAs(in + 4 * 1), + SafeLoadAs(in + 4 * 1), + SafeLoadAs(in + 4 * 1), + SafeLoadAs(in + 4 * 1), + }; shifts = simd_batch{ 8, 10, 12, 14 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 2-bit bundles 24 to 27 - words = simd_batch{ SafeLoad(in + 1), SafeLoad(in + 1), SafeLoad(in + 1), SafeLoad(in + 1) }; + words = simd_batch{ + SafeLoadAs(in + 4 * 1), + SafeLoadAs(in + 4 * 1), + SafeLoadAs(in + 4 * 1), + SafeLoadAs(in + 4 * 1), + }; shifts = simd_batch{ 16, 18, 20, 22 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 2-bit bundles 28 to 31 - words = simd_batch{ SafeLoad(in + 1), SafeLoad(in + 1), SafeLoad(in + 1), SafeLoad(in + 1) }; + words = simd_batch{ + SafeLoadAs(in + 4 * 1), + SafeLoadAs(in + 4 * 1), + SafeLoadAs(in + 4 * 1), + SafeLoadAs(in + 4 * 1), + }; shifts = simd_batch{ 24, 26, 28, 30 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; - in += 2; + in += 2 * 4; return in; } -inline static const uint32_t* unpack3_32(const uint32_t* in, uint32_t* out) { +inline static const uint8_t* unpack3_32(const uint8_t* in, uint32_t* out) { + using simd_batch = xsimd::make_sized_batch_t; + uint32_t mask = 0x7; simd_batch masks(mask); @@ -187,66 +960,108 @@ inline static const uint32_t* unpack3_32(const uint32_t* in, uint32_t* out) { simd_batch results; // extract 3-bit bundles 0 to 3 - words = simd_batch{ SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0) }; + words = simd_batch{ + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0), + }; shifts = simd_batch{ 0, 3, 6, 9 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 3-bit bundles 4 to 7 - words = simd_batch{ SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0) }; + words = simd_batch{ + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0), + }; shifts = simd_batch{ 12, 15, 18, 21 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 3-bit bundles 8 to 11 - words = simd_batch{ SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0) >> 30 | SafeLoad(in + 1) << 2, SafeLoad(in + 1) }; + words = simd_batch{ + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0) >> 30 | SafeLoadAs(in + 4 * 1) << 2, + SafeLoadAs(in + 4 * 1), + }; shifts = simd_batch{ 24, 27, 0, 1 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 3-bit bundles 12 to 15 - words = simd_batch{ SafeLoad(in + 1), SafeLoad(in + 1), SafeLoad(in + 1), SafeLoad(in + 1) }; + words = simd_batch{ + SafeLoadAs(in + 4 * 1), + SafeLoadAs(in + 4 * 1), + SafeLoadAs(in + 4 * 1), + SafeLoadAs(in + 4 * 1), + }; shifts = simd_batch{ 4, 7, 10, 13 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 3-bit bundles 16 to 19 - words = simd_batch{ SafeLoad(in + 1), SafeLoad(in + 1), SafeLoad(in + 1), SafeLoad(in + 1) }; + words = simd_batch{ + SafeLoadAs(in + 4 * 1), + SafeLoadAs(in + 4 * 1), + SafeLoadAs(in + 4 * 1), + SafeLoadAs(in + 4 * 1), + }; shifts = simd_batch{ 16, 19, 22, 25 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 3-bit bundles 20 to 23 - words = simd_batch{ SafeLoad(in + 1), SafeLoad(in + 1) >> 31 | SafeLoad(in + 2) << 1, SafeLoad(in + 2), SafeLoad(in + 2) }; + words = simd_batch{ + SafeLoadAs(in + 4 * 1), + SafeLoadAs(in + 4 * 1) >> 31 | SafeLoadAs(in + 4 * 2) << 1, + SafeLoadAs(in + 4 * 2), + SafeLoadAs(in + 4 * 2), + }; shifts = simd_batch{ 28, 0, 2, 5 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 3-bit bundles 24 to 27 - words = simd_batch{ SafeLoad(in + 2), SafeLoad(in + 2), SafeLoad(in + 2), SafeLoad(in + 2) }; + words = simd_batch{ + SafeLoadAs(in + 4 * 2), + SafeLoadAs(in + 4 * 2), + SafeLoadAs(in + 4 * 2), + SafeLoadAs(in + 4 * 2), + }; shifts = simd_batch{ 8, 11, 14, 17 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 3-bit bundles 28 to 31 - words = simd_batch{ SafeLoad(in + 2), SafeLoad(in + 2), SafeLoad(in + 2), SafeLoad(in + 2) }; + words = simd_batch{ + SafeLoadAs(in + 4 * 2), + SafeLoadAs(in + 4 * 2), + SafeLoadAs(in + 4 * 2), + SafeLoadAs(in + 4 * 2), + }; shifts = simd_batch{ 20, 23, 26, 29 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; - in += 3; + in += 3 * 4; return in; } -inline static const uint32_t* unpack4_32(const uint32_t* in, uint32_t* out) { +inline static const uint8_t* unpack4_32(const uint8_t* in, uint32_t* out) { + using simd_batch = xsimd::make_sized_batch_t; + uint32_t mask = 0xf; simd_batch masks(mask); @@ -254,66 +1069,108 @@ inline static const uint32_t* unpack4_32(const uint32_t* in, uint32_t* out) { simd_batch results; // extract 4-bit bundles 0 to 3 - words = simd_batch{ SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0) }; + words = simd_batch{ + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0), + }; shifts = simd_batch{ 0, 4, 8, 12 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 4-bit bundles 4 to 7 - words = simd_batch{ SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0) }; + words = simd_batch{ + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0), + }; shifts = simd_batch{ 16, 20, 24, 28 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 4-bit bundles 8 to 11 - words = simd_batch{ SafeLoad(in + 1), SafeLoad(in + 1), SafeLoad(in + 1), SafeLoad(in + 1) }; + words = simd_batch{ + SafeLoadAs(in + 4 * 1), + SafeLoadAs(in + 4 * 1), + SafeLoadAs(in + 4 * 1), + SafeLoadAs(in + 4 * 1), + }; shifts = simd_batch{ 0, 4, 8, 12 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 4-bit bundles 12 to 15 - words = simd_batch{ SafeLoad(in + 1), SafeLoad(in + 1), SafeLoad(in + 1), SafeLoad(in + 1) }; + words = simd_batch{ + SafeLoadAs(in + 4 * 1), + SafeLoadAs(in + 4 * 1), + SafeLoadAs(in + 4 * 1), + SafeLoadAs(in + 4 * 1), + }; shifts = simd_batch{ 16, 20, 24, 28 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 4-bit bundles 16 to 19 - words = simd_batch{ SafeLoad(in + 2), SafeLoad(in + 2), SafeLoad(in + 2), SafeLoad(in + 2) }; + words = simd_batch{ + SafeLoadAs(in + 4 * 2), + SafeLoadAs(in + 4 * 2), + SafeLoadAs(in + 4 * 2), + SafeLoadAs(in + 4 * 2), + }; shifts = simd_batch{ 0, 4, 8, 12 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 4-bit bundles 20 to 23 - words = simd_batch{ SafeLoad(in + 2), SafeLoad(in + 2), SafeLoad(in + 2), SafeLoad(in + 2) }; + words = simd_batch{ + SafeLoadAs(in + 4 * 2), + SafeLoadAs(in + 4 * 2), + SafeLoadAs(in + 4 * 2), + SafeLoadAs(in + 4 * 2), + }; shifts = simd_batch{ 16, 20, 24, 28 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 4-bit bundles 24 to 27 - words = simd_batch{ SafeLoad(in + 3), SafeLoad(in + 3), SafeLoad(in + 3), SafeLoad(in + 3) }; + words = simd_batch{ + SafeLoadAs(in + 4 * 3), + SafeLoadAs(in + 4 * 3), + SafeLoadAs(in + 4 * 3), + SafeLoadAs(in + 4 * 3), + }; shifts = simd_batch{ 0, 4, 8, 12 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 4-bit bundles 28 to 31 - words = simd_batch{ SafeLoad(in + 3), SafeLoad(in + 3), SafeLoad(in + 3), SafeLoad(in + 3) }; + words = simd_batch{ + SafeLoadAs(in + 4 * 3), + SafeLoadAs(in + 4 * 3), + SafeLoadAs(in + 4 * 3), + SafeLoadAs(in + 4 * 3), + }; shifts = simd_batch{ 16, 20, 24, 28 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; - in += 4; + in += 4 * 4; return in; } -inline static const uint32_t* unpack5_32(const uint32_t* in, uint32_t* out) { +inline static const uint8_t* unpack5_32(const uint8_t* in, uint32_t* out) { + using simd_batch = xsimd::make_sized_batch_t; + uint32_t mask = 0x1f; simd_batch masks(mask); @@ -321,66 +1178,108 @@ inline static const uint32_t* unpack5_32(const uint32_t* in, uint32_t* out) { simd_batch results; // extract 5-bit bundles 0 to 3 - words = simd_batch{ SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0) }; + words = simd_batch{ + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0), + }; shifts = simd_batch{ 0, 5, 10, 15 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 5-bit bundles 4 to 7 - words = simd_batch{ SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0) >> 30 | SafeLoad(in + 1) << 2, SafeLoad(in + 1) }; + words = simd_batch{ + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0) >> 30 | SafeLoadAs(in + 4 * 1) << 2, + SafeLoadAs(in + 4 * 1), + }; shifts = simd_batch{ 20, 25, 0, 3 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 5-bit bundles 8 to 11 - words = simd_batch{ SafeLoad(in + 1), SafeLoad(in + 1), SafeLoad(in + 1), SafeLoad(in + 1) }; + words = simd_batch{ + SafeLoadAs(in + 4 * 1), + SafeLoadAs(in + 4 * 1), + SafeLoadAs(in + 4 * 1), + SafeLoadAs(in + 4 * 1), + }; shifts = simd_batch{ 8, 13, 18, 23 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 5-bit bundles 12 to 15 - words = simd_batch{ SafeLoad(in + 1) >> 28 | SafeLoad(in + 2) << 4, SafeLoad(in + 2), SafeLoad(in + 2), SafeLoad(in + 2) }; + words = simd_batch{ + SafeLoadAs(in + 4 * 1) >> 28 | SafeLoadAs(in + 4 * 2) << 4, + SafeLoadAs(in + 4 * 2), + SafeLoadAs(in + 4 * 2), + SafeLoadAs(in + 4 * 2), + }; shifts = simd_batch{ 0, 1, 6, 11 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 5-bit bundles 16 to 19 - words = simd_batch{ SafeLoad(in + 2), SafeLoad(in + 2), SafeLoad(in + 2), SafeLoad(in + 2) >> 31 | SafeLoad(in + 3) << 1 }; + words = simd_batch{ + SafeLoadAs(in + 4 * 2), + SafeLoadAs(in + 4 * 2), + SafeLoadAs(in + 4 * 2), + SafeLoadAs(in + 4 * 2) >> 31 | SafeLoadAs(in + 4 * 3) << 1, + }; shifts = simd_batch{ 16, 21, 26, 0 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 5-bit bundles 20 to 23 - words = simd_batch{ SafeLoad(in + 3), SafeLoad(in + 3), SafeLoad(in + 3), SafeLoad(in + 3) }; + words = simd_batch{ + SafeLoadAs(in + 4 * 3), + SafeLoadAs(in + 4 * 3), + SafeLoadAs(in + 4 * 3), + SafeLoadAs(in + 4 * 3), + }; shifts = simd_batch{ 4, 9, 14, 19 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 5-bit bundles 24 to 27 - words = simd_batch{ SafeLoad(in + 3), SafeLoad(in + 3) >> 29 | SafeLoad(in + 4) << 3, SafeLoad(in + 4), SafeLoad(in + 4) }; + words = simd_batch{ + SafeLoadAs(in + 4 * 3), + SafeLoadAs(in + 4 * 3) >> 29 | SafeLoadAs(in + 4 * 4) << 3, + SafeLoadAs(in + 4 * 4), + SafeLoadAs(in + 4 * 4), + }; shifts = simd_batch{ 24, 0, 2, 7 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 5-bit bundles 28 to 31 - words = simd_batch{ SafeLoad(in + 4), SafeLoad(in + 4), SafeLoad(in + 4), SafeLoad(in + 4) }; + words = simd_batch{ + SafeLoadAs(in + 4 * 4), + SafeLoadAs(in + 4 * 4), + SafeLoadAs(in + 4 * 4), + SafeLoadAs(in + 4 * 4), + }; shifts = simd_batch{ 12, 17, 22, 27 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; - in += 5; + in += 5 * 4; return in; } -inline static const uint32_t* unpack6_32(const uint32_t* in, uint32_t* out) { +inline static const uint8_t* unpack6_32(const uint8_t* in, uint32_t* out) { + using simd_batch = xsimd::make_sized_batch_t; + uint32_t mask = 0x3f; simd_batch masks(mask); @@ -388,66 +1287,108 @@ inline static const uint32_t* unpack6_32(const uint32_t* in, uint32_t* out) { simd_batch results; // extract 6-bit bundles 0 to 3 - words = simd_batch{ SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0) }; + words = simd_batch{ + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0), + }; shifts = simd_batch{ 0, 6, 12, 18 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 6-bit bundles 4 to 7 - words = simd_batch{ SafeLoad(in + 0), SafeLoad(in + 0) >> 30 | SafeLoad(in + 1) << 2, SafeLoad(in + 1), SafeLoad(in + 1) }; + words = simd_batch{ + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0) >> 30 | SafeLoadAs(in + 4 * 1) << 2, + SafeLoadAs(in + 4 * 1), + SafeLoadAs(in + 4 * 1), + }; shifts = simd_batch{ 24, 0, 4, 10 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 6-bit bundles 8 to 11 - words = simd_batch{ SafeLoad(in + 1), SafeLoad(in + 1), SafeLoad(in + 1) >> 28 | SafeLoad(in + 2) << 4, SafeLoad(in + 2) }; + words = simd_batch{ + SafeLoadAs(in + 4 * 1), + SafeLoadAs(in + 4 * 1), + SafeLoadAs(in + 4 * 1) >> 28 | SafeLoadAs(in + 4 * 2) << 4, + SafeLoadAs(in + 4 * 2), + }; shifts = simd_batch{ 16, 22, 0, 2 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 6-bit bundles 12 to 15 - words = simd_batch{ SafeLoad(in + 2), SafeLoad(in + 2), SafeLoad(in + 2), SafeLoad(in + 2) }; + words = simd_batch{ + SafeLoadAs(in + 4 * 2), + SafeLoadAs(in + 4 * 2), + SafeLoadAs(in + 4 * 2), + SafeLoadAs(in + 4 * 2), + }; shifts = simd_batch{ 8, 14, 20, 26 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 6-bit bundles 16 to 19 - words = simd_batch{ SafeLoad(in + 3), SafeLoad(in + 3), SafeLoad(in + 3), SafeLoad(in + 3) }; + words = simd_batch{ + SafeLoadAs(in + 4 * 3), + SafeLoadAs(in + 4 * 3), + SafeLoadAs(in + 4 * 3), + SafeLoadAs(in + 4 * 3), + }; shifts = simd_batch{ 0, 6, 12, 18 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 6-bit bundles 20 to 23 - words = simd_batch{ SafeLoad(in + 3), SafeLoad(in + 3) >> 30 | SafeLoad(in + 4) << 2, SafeLoad(in + 4), SafeLoad(in + 4) }; + words = simd_batch{ + SafeLoadAs(in + 4 * 3), + SafeLoadAs(in + 4 * 3) >> 30 | SafeLoadAs(in + 4 * 4) << 2, + SafeLoadAs(in + 4 * 4), + SafeLoadAs(in + 4 * 4), + }; shifts = simd_batch{ 24, 0, 4, 10 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 6-bit bundles 24 to 27 - words = simd_batch{ SafeLoad(in + 4), SafeLoad(in + 4), SafeLoad(in + 4) >> 28 | SafeLoad(in + 5) << 4, SafeLoad(in + 5) }; + words = simd_batch{ + SafeLoadAs(in + 4 * 4), + SafeLoadAs(in + 4 * 4), + SafeLoadAs(in + 4 * 4) >> 28 | SafeLoadAs(in + 4 * 5) << 4, + SafeLoadAs(in + 4 * 5), + }; shifts = simd_batch{ 16, 22, 0, 2 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 6-bit bundles 28 to 31 - words = simd_batch{ SafeLoad(in + 5), SafeLoad(in + 5), SafeLoad(in + 5), SafeLoad(in + 5) }; + words = simd_batch{ + SafeLoadAs(in + 4 * 5), + SafeLoadAs(in + 4 * 5), + SafeLoadAs(in + 4 * 5), + SafeLoadAs(in + 4 * 5), + }; shifts = simd_batch{ 8, 14, 20, 26 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; - in += 6; + in += 6 * 4; return in; } -inline static const uint32_t* unpack7_32(const uint32_t* in, uint32_t* out) { +inline static const uint8_t* unpack7_32(const uint8_t* in, uint32_t* out) { + using simd_batch = xsimd::make_sized_batch_t; + uint32_t mask = 0x7f; simd_batch masks(mask); @@ -455,66 +1396,108 @@ inline static const uint32_t* unpack7_32(const uint32_t* in, uint32_t* out) { simd_batch results; // extract 7-bit bundles 0 to 3 - words = simd_batch{ SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0) }; + words = simd_batch{ + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0), + }; shifts = simd_batch{ 0, 7, 14, 21 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 7-bit bundles 4 to 7 - words = simd_batch{ SafeLoad(in + 0) >> 28 | SafeLoad(in + 1) << 4, SafeLoad(in + 1), SafeLoad(in + 1), SafeLoad(in + 1) }; + words = simd_batch{ + SafeLoadAs(in + 4 * 0) >> 28 | SafeLoadAs(in + 4 * 1) << 4, + SafeLoadAs(in + 4 * 1), + SafeLoadAs(in + 4 * 1), + SafeLoadAs(in + 4 * 1), + }; shifts = simd_batch{ 0, 3, 10, 17 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 7-bit bundles 8 to 11 - words = simd_batch{ SafeLoad(in + 1), SafeLoad(in + 1) >> 31 | SafeLoad(in + 2) << 1, SafeLoad(in + 2), SafeLoad(in + 2) }; + words = simd_batch{ + SafeLoadAs(in + 4 * 1), + SafeLoadAs(in + 4 * 1) >> 31 | SafeLoadAs(in + 4 * 2) << 1, + SafeLoadAs(in + 4 * 2), + SafeLoadAs(in + 4 * 2), + }; shifts = simd_batch{ 24, 0, 6, 13 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 7-bit bundles 12 to 15 - words = simd_batch{ SafeLoad(in + 2), SafeLoad(in + 2) >> 27 | SafeLoad(in + 3) << 5, SafeLoad(in + 3), SafeLoad(in + 3) }; + words = simd_batch{ + SafeLoadAs(in + 4 * 2), + SafeLoadAs(in + 4 * 2) >> 27 | SafeLoadAs(in + 4 * 3) << 5, + SafeLoadAs(in + 4 * 3), + SafeLoadAs(in + 4 * 3), + }; shifts = simd_batch{ 20, 0, 2, 9 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 7-bit bundles 16 to 19 - words = simd_batch{ SafeLoad(in + 3), SafeLoad(in + 3), SafeLoad(in + 3) >> 30 | SafeLoad(in + 4) << 2, SafeLoad(in + 4) }; + words = simd_batch{ + SafeLoadAs(in + 4 * 3), + SafeLoadAs(in + 4 * 3), + SafeLoadAs(in + 4 * 3) >> 30 | SafeLoadAs(in + 4 * 4) << 2, + SafeLoadAs(in + 4 * 4), + }; shifts = simd_batch{ 16, 23, 0, 5 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 7-bit bundles 20 to 23 - words = simd_batch{ SafeLoad(in + 4), SafeLoad(in + 4), SafeLoad(in + 4) >> 26 | SafeLoad(in + 5) << 6, SafeLoad(in + 5) }; + words = simd_batch{ + SafeLoadAs(in + 4 * 4), + SafeLoadAs(in + 4 * 4), + SafeLoadAs(in + 4 * 4) >> 26 | SafeLoadAs(in + 4 * 5) << 6, + SafeLoadAs(in + 4 * 5), + }; shifts = simd_batch{ 12, 19, 0, 1 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 7-bit bundles 24 to 27 - words = simd_batch{ SafeLoad(in + 5), SafeLoad(in + 5), SafeLoad(in + 5), SafeLoad(in + 5) >> 29 | SafeLoad(in + 6) << 3 }; + words = simd_batch{ + SafeLoadAs(in + 4 * 5), + SafeLoadAs(in + 4 * 5), + SafeLoadAs(in + 4 * 5), + SafeLoadAs(in + 4 * 5) >> 29 | SafeLoadAs(in + 4 * 6) << 3, + }; shifts = simd_batch{ 8, 15, 22, 0 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 7-bit bundles 28 to 31 - words = simd_batch{ SafeLoad(in + 6), SafeLoad(in + 6), SafeLoad(in + 6), SafeLoad(in + 6) }; + words = simd_batch{ + SafeLoadAs(in + 4 * 6), + SafeLoadAs(in + 4 * 6), + SafeLoadAs(in + 4 * 6), + SafeLoadAs(in + 4 * 6), + }; shifts = simd_batch{ 4, 11, 18, 25 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; - in += 7; + in += 7 * 4; return in; } -inline static const uint32_t* unpack8_32(const uint32_t* in, uint32_t* out) { +inline static const uint8_t* unpack8_32(const uint8_t* in, uint32_t* out) { + using simd_batch = xsimd::make_sized_batch_t; + uint32_t mask = 0xff; simd_batch masks(mask); @@ -522,66 +1505,108 @@ inline static const uint32_t* unpack8_32(const uint32_t* in, uint32_t* out) { simd_batch results; // extract 8-bit bundles 0 to 3 - words = simd_batch{ SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0) }; + words = simd_batch{ + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0), + }; shifts = simd_batch{ 0, 8, 16, 24 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 8-bit bundles 4 to 7 - words = simd_batch{ SafeLoad(in + 1), SafeLoad(in + 1), SafeLoad(in + 1), SafeLoad(in + 1) }; + words = simd_batch{ + SafeLoadAs(in + 4 * 1), + SafeLoadAs(in + 4 * 1), + SafeLoadAs(in + 4 * 1), + SafeLoadAs(in + 4 * 1), + }; shifts = simd_batch{ 0, 8, 16, 24 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 8-bit bundles 8 to 11 - words = simd_batch{ SafeLoad(in + 2), SafeLoad(in + 2), SafeLoad(in + 2), SafeLoad(in + 2) }; + words = simd_batch{ + SafeLoadAs(in + 4 * 2), + SafeLoadAs(in + 4 * 2), + SafeLoadAs(in + 4 * 2), + SafeLoadAs(in + 4 * 2), + }; shifts = simd_batch{ 0, 8, 16, 24 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 8-bit bundles 12 to 15 - words = simd_batch{ SafeLoad(in + 3), SafeLoad(in + 3), SafeLoad(in + 3), SafeLoad(in + 3) }; + words = simd_batch{ + SafeLoadAs(in + 4 * 3), + SafeLoadAs(in + 4 * 3), + SafeLoadAs(in + 4 * 3), + SafeLoadAs(in + 4 * 3), + }; shifts = simd_batch{ 0, 8, 16, 24 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 8-bit bundles 16 to 19 - words = simd_batch{ SafeLoad(in + 4), SafeLoad(in + 4), SafeLoad(in + 4), SafeLoad(in + 4) }; + words = simd_batch{ + SafeLoadAs(in + 4 * 4), + SafeLoadAs(in + 4 * 4), + SafeLoadAs(in + 4 * 4), + SafeLoadAs(in + 4 * 4), + }; shifts = simd_batch{ 0, 8, 16, 24 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 8-bit bundles 20 to 23 - words = simd_batch{ SafeLoad(in + 5), SafeLoad(in + 5), SafeLoad(in + 5), SafeLoad(in + 5) }; + words = simd_batch{ + SafeLoadAs(in + 4 * 5), + SafeLoadAs(in + 4 * 5), + SafeLoadAs(in + 4 * 5), + SafeLoadAs(in + 4 * 5), + }; shifts = simd_batch{ 0, 8, 16, 24 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 8-bit bundles 24 to 27 - words = simd_batch{ SafeLoad(in + 6), SafeLoad(in + 6), SafeLoad(in + 6), SafeLoad(in + 6) }; + words = simd_batch{ + SafeLoadAs(in + 4 * 6), + SafeLoadAs(in + 4 * 6), + SafeLoadAs(in + 4 * 6), + SafeLoadAs(in + 4 * 6), + }; shifts = simd_batch{ 0, 8, 16, 24 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 8-bit bundles 28 to 31 - words = simd_batch{ SafeLoad(in + 7), SafeLoad(in + 7), SafeLoad(in + 7), SafeLoad(in + 7) }; + words = simd_batch{ + SafeLoadAs(in + 4 * 7), + SafeLoadAs(in + 4 * 7), + SafeLoadAs(in + 4 * 7), + SafeLoadAs(in + 4 * 7), + }; shifts = simd_batch{ 0, 8, 16, 24 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; - in += 8; + in += 8 * 4; return in; } -inline static const uint32_t* unpack9_32(const uint32_t* in, uint32_t* out) { +inline static const uint8_t* unpack9_32(const uint8_t* in, uint32_t* out) { + using simd_batch = xsimd::make_sized_batch_t; + uint32_t mask = 0x1ff; simd_batch masks(mask); @@ -589,66 +1614,108 @@ inline static const uint32_t* unpack9_32(const uint32_t* in, uint32_t* out) { simd_batch results; // extract 9-bit bundles 0 to 3 - words = simd_batch{ SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0) >> 27 | SafeLoad(in + 1) << 5 }; + words = simd_batch{ + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0) >> 27 | SafeLoadAs(in + 4 * 1) << 5, + }; shifts = simd_batch{ 0, 9, 18, 0 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 9-bit bundles 4 to 7 - words = simd_batch{ SafeLoad(in + 1), SafeLoad(in + 1), SafeLoad(in + 1), SafeLoad(in + 1) >> 31 | SafeLoad(in + 2) << 1 }; + words = simd_batch{ + SafeLoadAs(in + 4 * 1), + SafeLoadAs(in + 4 * 1), + SafeLoadAs(in + 4 * 1), + SafeLoadAs(in + 4 * 1) >> 31 | SafeLoadAs(in + 4 * 2) << 1, + }; shifts = simd_batch{ 4, 13, 22, 0 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 9-bit bundles 8 to 11 - words = simd_batch{ SafeLoad(in + 2), SafeLoad(in + 2), SafeLoad(in + 2) >> 26 | SafeLoad(in + 3) << 6, SafeLoad(in + 3) }; + words = simd_batch{ + SafeLoadAs(in + 4 * 2), + SafeLoadAs(in + 4 * 2), + SafeLoadAs(in + 4 * 2) >> 26 | SafeLoadAs(in + 4 * 3) << 6, + SafeLoadAs(in + 4 * 3), + }; shifts = simd_batch{ 8, 17, 0, 3 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 9-bit bundles 12 to 15 - words = simd_batch{ SafeLoad(in + 3), SafeLoad(in + 3), SafeLoad(in + 3) >> 30 | SafeLoad(in + 4) << 2, SafeLoad(in + 4) }; + words = simd_batch{ + SafeLoadAs(in + 4 * 3), + SafeLoadAs(in + 4 * 3), + SafeLoadAs(in + 4 * 3) >> 30 | SafeLoadAs(in + 4 * 4) << 2, + SafeLoadAs(in + 4 * 4), + }; shifts = simd_batch{ 12, 21, 0, 7 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 9-bit bundles 16 to 19 - words = simd_batch{ SafeLoad(in + 4), SafeLoad(in + 4) >> 25 | SafeLoad(in + 5) << 7, SafeLoad(in + 5), SafeLoad(in + 5) }; + words = simd_batch{ + SafeLoadAs(in + 4 * 4), + SafeLoadAs(in + 4 * 4) >> 25 | SafeLoadAs(in + 4 * 5) << 7, + SafeLoadAs(in + 4 * 5), + SafeLoadAs(in + 4 * 5), + }; shifts = simd_batch{ 16, 0, 2, 11 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 9-bit bundles 20 to 23 - words = simd_batch{ SafeLoad(in + 5), SafeLoad(in + 5) >> 29 | SafeLoad(in + 6) << 3, SafeLoad(in + 6), SafeLoad(in + 6) }; + words = simd_batch{ + SafeLoadAs(in + 4 * 5), + SafeLoadAs(in + 4 * 5) >> 29 | SafeLoadAs(in + 4 * 6) << 3, + SafeLoadAs(in + 4 * 6), + SafeLoadAs(in + 4 * 6), + }; shifts = simd_batch{ 20, 0, 6, 15 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 9-bit bundles 24 to 27 - words = simd_batch{ SafeLoad(in + 6) >> 24 | SafeLoad(in + 7) << 8, SafeLoad(in + 7), SafeLoad(in + 7), SafeLoad(in + 7) }; + words = simd_batch{ + SafeLoadAs(in + 4 * 6) >> 24 | SafeLoadAs(in + 4 * 7) << 8, + SafeLoadAs(in + 4 * 7), + SafeLoadAs(in + 4 * 7), + SafeLoadAs(in + 4 * 7), + }; shifts = simd_batch{ 0, 1, 10, 19 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 9-bit bundles 28 to 31 - words = simd_batch{ SafeLoad(in + 7) >> 28 | SafeLoad(in + 8) << 4, SafeLoad(in + 8), SafeLoad(in + 8), SafeLoad(in + 8) }; + words = simd_batch{ + SafeLoadAs(in + 4 * 7) >> 28 | SafeLoadAs(in + 4 * 8) << 4, + SafeLoadAs(in + 4 * 8), + SafeLoadAs(in + 4 * 8), + SafeLoadAs(in + 4 * 8), + }; shifts = simd_batch{ 0, 5, 14, 23 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; - in += 9; + in += 9 * 4; return in; } -inline static const uint32_t* unpack10_32(const uint32_t* in, uint32_t* out) { +inline static const uint8_t* unpack10_32(const uint8_t* in, uint32_t* out) { + using simd_batch = xsimd::make_sized_batch_t; + uint32_t mask = 0x3ff; simd_batch masks(mask); @@ -656,66 +1723,108 @@ inline static const uint32_t* unpack10_32(const uint32_t* in, uint32_t* out) { simd_batch results; // extract 10-bit bundles 0 to 3 - words = simd_batch{ SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0) >> 30 | SafeLoad(in + 1) << 2 }; + words = simd_batch{ + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0) >> 30 | SafeLoadAs(in + 4 * 1) << 2, + }; shifts = simd_batch{ 0, 10, 20, 0 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 10-bit bundles 4 to 7 - words = simd_batch{ SafeLoad(in + 1), SafeLoad(in + 1), SafeLoad(in + 1) >> 28 | SafeLoad(in + 2) << 4, SafeLoad(in + 2) }; + words = simd_batch{ + SafeLoadAs(in + 4 * 1), + SafeLoadAs(in + 4 * 1), + SafeLoadAs(in + 4 * 1) >> 28 | SafeLoadAs(in + 4 * 2) << 4, + SafeLoadAs(in + 4 * 2), + }; shifts = simd_batch{ 8, 18, 0, 6 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 10-bit bundles 8 to 11 - words = simd_batch{ SafeLoad(in + 2), SafeLoad(in + 2) >> 26 | SafeLoad(in + 3) << 6, SafeLoad(in + 3), SafeLoad(in + 3) }; + words = simd_batch{ + SafeLoadAs(in + 4 * 2), + SafeLoadAs(in + 4 * 2) >> 26 | SafeLoadAs(in + 4 * 3) << 6, + SafeLoadAs(in + 4 * 3), + SafeLoadAs(in + 4 * 3), + }; shifts = simd_batch{ 16, 0, 4, 14 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 10-bit bundles 12 to 15 - words = simd_batch{ SafeLoad(in + 3) >> 24 | SafeLoad(in + 4) << 8, SafeLoad(in + 4), SafeLoad(in + 4), SafeLoad(in + 4) }; + words = simd_batch{ + SafeLoadAs(in + 4 * 3) >> 24 | SafeLoadAs(in + 4 * 4) << 8, + SafeLoadAs(in + 4 * 4), + SafeLoadAs(in + 4 * 4), + SafeLoadAs(in + 4 * 4), + }; shifts = simd_batch{ 0, 2, 12, 22 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 10-bit bundles 16 to 19 - words = simd_batch{ SafeLoad(in + 5), SafeLoad(in + 5), SafeLoad(in + 5), SafeLoad(in + 5) >> 30 | SafeLoad(in + 6) << 2 }; + words = simd_batch{ + SafeLoadAs(in + 4 * 5), + SafeLoadAs(in + 4 * 5), + SafeLoadAs(in + 4 * 5), + SafeLoadAs(in + 4 * 5) >> 30 | SafeLoadAs(in + 4 * 6) << 2, + }; shifts = simd_batch{ 0, 10, 20, 0 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 10-bit bundles 20 to 23 - words = simd_batch{ SafeLoad(in + 6), SafeLoad(in + 6), SafeLoad(in + 6) >> 28 | SafeLoad(in + 7) << 4, SafeLoad(in + 7) }; + words = simd_batch{ + SafeLoadAs(in + 4 * 6), + SafeLoadAs(in + 4 * 6), + SafeLoadAs(in + 4 * 6) >> 28 | SafeLoadAs(in + 4 * 7) << 4, + SafeLoadAs(in + 4 * 7), + }; shifts = simd_batch{ 8, 18, 0, 6 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 10-bit bundles 24 to 27 - words = simd_batch{ SafeLoad(in + 7), SafeLoad(in + 7) >> 26 | SafeLoad(in + 8) << 6, SafeLoad(in + 8), SafeLoad(in + 8) }; + words = simd_batch{ + SafeLoadAs(in + 4 * 7), + SafeLoadAs(in + 4 * 7) >> 26 | SafeLoadAs(in + 4 * 8) << 6, + SafeLoadAs(in + 4 * 8), + SafeLoadAs(in + 4 * 8), + }; shifts = simd_batch{ 16, 0, 4, 14 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 10-bit bundles 28 to 31 - words = simd_batch{ SafeLoad(in + 8) >> 24 | SafeLoad(in + 9) << 8, SafeLoad(in + 9), SafeLoad(in + 9), SafeLoad(in + 9) }; + words = simd_batch{ + SafeLoadAs(in + 4 * 8) >> 24 | SafeLoadAs(in + 4 * 9) << 8, + SafeLoadAs(in + 4 * 9), + SafeLoadAs(in + 4 * 9), + SafeLoadAs(in + 4 * 9), + }; shifts = simd_batch{ 0, 2, 12, 22 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; - in += 10; + in += 10 * 4; return in; } -inline static const uint32_t* unpack11_32(const uint32_t* in, uint32_t* out) { +inline static const uint8_t* unpack11_32(const uint8_t* in, uint32_t* out) { + using simd_batch = xsimd::make_sized_batch_t; + uint32_t mask = 0x7ff; simd_batch masks(mask); @@ -723,66 +1832,108 @@ inline static const uint32_t* unpack11_32(const uint32_t* in, uint32_t* out) { simd_batch results; // extract 11-bit bundles 0 to 3 - words = simd_batch{ SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0) >> 22 | SafeLoad(in + 1) << 10, SafeLoad(in + 1) }; + words = simd_batch{ + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0) >> 22 | SafeLoadAs(in + 4 * 1) << 10, + SafeLoadAs(in + 4 * 1), + }; shifts = simd_batch{ 0, 11, 0, 1 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 11-bit bundles 4 to 7 - words = simd_batch{ SafeLoad(in + 1), SafeLoad(in + 1) >> 23 | SafeLoad(in + 2) << 9, SafeLoad(in + 2), SafeLoad(in + 2) }; + words = simd_batch{ + SafeLoadAs(in + 4 * 1), + SafeLoadAs(in + 4 * 1) >> 23 | SafeLoadAs(in + 4 * 2) << 9, + SafeLoadAs(in + 4 * 2), + SafeLoadAs(in + 4 * 2), + }; shifts = simd_batch{ 12, 0, 2, 13 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 11-bit bundles 8 to 11 - words = simd_batch{ SafeLoad(in + 2) >> 24 | SafeLoad(in + 3) << 8, SafeLoad(in + 3), SafeLoad(in + 3), SafeLoad(in + 3) >> 25 | SafeLoad(in + 4) << 7 }; + words = simd_batch{ + SafeLoadAs(in + 4 * 2) >> 24 | SafeLoadAs(in + 4 * 3) << 8, + SafeLoadAs(in + 4 * 3), + SafeLoadAs(in + 4 * 3), + SafeLoadAs(in + 4 * 3) >> 25 | SafeLoadAs(in + 4 * 4) << 7, + }; shifts = simd_batch{ 0, 3, 14, 0 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 11-bit bundles 12 to 15 - words = simd_batch{ SafeLoad(in + 4), SafeLoad(in + 4), SafeLoad(in + 4) >> 26 | SafeLoad(in + 5) << 6, SafeLoad(in + 5) }; + words = simd_batch{ + SafeLoadAs(in + 4 * 4), + SafeLoadAs(in + 4 * 4), + SafeLoadAs(in + 4 * 4) >> 26 | SafeLoadAs(in + 4 * 5) << 6, + SafeLoadAs(in + 4 * 5), + }; shifts = simd_batch{ 4, 15, 0, 5 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 11-bit bundles 16 to 19 - words = simd_batch{ SafeLoad(in + 5), SafeLoad(in + 5) >> 27 | SafeLoad(in + 6) << 5, SafeLoad(in + 6), SafeLoad(in + 6) }; + words = simd_batch{ + SafeLoadAs(in + 4 * 5), + SafeLoadAs(in + 4 * 5) >> 27 | SafeLoadAs(in + 4 * 6) << 5, + SafeLoadAs(in + 4 * 6), + SafeLoadAs(in + 4 * 6), + }; shifts = simd_batch{ 16, 0, 6, 17 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 11-bit bundles 20 to 23 - words = simd_batch{ SafeLoad(in + 6) >> 28 | SafeLoad(in + 7) << 4, SafeLoad(in + 7), SafeLoad(in + 7), SafeLoad(in + 7) >> 29 | SafeLoad(in + 8) << 3 }; + words = simd_batch{ + SafeLoadAs(in + 4 * 6) >> 28 | SafeLoadAs(in + 4 * 7) << 4, + SafeLoadAs(in + 4 * 7), + SafeLoadAs(in + 4 * 7), + SafeLoadAs(in + 4 * 7) >> 29 | SafeLoadAs(in + 4 * 8) << 3, + }; shifts = simd_batch{ 0, 7, 18, 0 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 11-bit bundles 24 to 27 - words = simd_batch{ SafeLoad(in + 8), SafeLoad(in + 8), SafeLoad(in + 8) >> 30 | SafeLoad(in + 9) << 2, SafeLoad(in + 9) }; + words = simd_batch{ + SafeLoadAs(in + 4 * 8), + SafeLoadAs(in + 4 * 8), + SafeLoadAs(in + 4 * 8) >> 30 | SafeLoadAs(in + 4 * 9) << 2, + SafeLoadAs(in + 4 * 9), + }; shifts = simd_batch{ 8, 19, 0, 9 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 11-bit bundles 28 to 31 - words = simd_batch{ SafeLoad(in + 9), SafeLoad(in + 9) >> 31 | SafeLoad(in + 10) << 1, SafeLoad(in + 10), SafeLoad(in + 10) }; + words = simd_batch{ + SafeLoadAs(in + 4 * 9), + SafeLoadAs(in + 4 * 9) >> 31 | SafeLoadAs(in + 4 * 10) << 1, + SafeLoadAs(in + 4 * 10), + SafeLoadAs(in + 4 * 10), + }; shifts = simd_batch{ 20, 0, 10, 21 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; - in += 11; + in += 11 * 4; return in; } -inline static const uint32_t* unpack12_32(const uint32_t* in, uint32_t* out) { +inline static const uint8_t* unpack12_32(const uint8_t* in, uint32_t* out) { + using simd_batch = xsimd::make_sized_batch_t; + uint32_t mask = 0xfff; simd_batch masks(mask); @@ -790,66 +1941,108 @@ inline static const uint32_t* unpack12_32(const uint32_t* in, uint32_t* out) { simd_batch results; // extract 12-bit bundles 0 to 3 - words = simd_batch{ SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0) >> 24 | SafeLoad(in + 1) << 8, SafeLoad(in + 1) }; + words = simd_batch{ + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0) >> 24 | SafeLoadAs(in + 4 * 1) << 8, + SafeLoadAs(in + 4 * 1), + }; shifts = simd_batch{ 0, 12, 0, 4 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 12-bit bundles 4 to 7 - words = simd_batch{ SafeLoad(in + 1), SafeLoad(in + 1) >> 28 | SafeLoad(in + 2) << 4, SafeLoad(in + 2), SafeLoad(in + 2) }; + words = simd_batch{ + SafeLoadAs(in + 4 * 1), + SafeLoadAs(in + 4 * 1) >> 28 | SafeLoadAs(in + 4 * 2) << 4, + SafeLoadAs(in + 4 * 2), + SafeLoadAs(in + 4 * 2), + }; shifts = simd_batch{ 16, 0, 8, 20 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 12-bit bundles 8 to 11 - words = simd_batch{ SafeLoad(in + 3), SafeLoad(in + 3), SafeLoad(in + 3) >> 24 | SafeLoad(in + 4) << 8, SafeLoad(in + 4) }; + words = simd_batch{ + SafeLoadAs(in + 4 * 3), + SafeLoadAs(in + 4 * 3), + SafeLoadAs(in + 4 * 3) >> 24 | SafeLoadAs(in + 4 * 4) << 8, + SafeLoadAs(in + 4 * 4), + }; shifts = simd_batch{ 0, 12, 0, 4 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 12-bit bundles 12 to 15 - words = simd_batch{ SafeLoad(in + 4), SafeLoad(in + 4) >> 28 | SafeLoad(in + 5) << 4, SafeLoad(in + 5), SafeLoad(in + 5) }; + words = simd_batch{ + SafeLoadAs(in + 4 * 4), + SafeLoadAs(in + 4 * 4) >> 28 | SafeLoadAs(in + 4 * 5) << 4, + SafeLoadAs(in + 4 * 5), + SafeLoadAs(in + 4 * 5), + }; shifts = simd_batch{ 16, 0, 8, 20 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 12-bit bundles 16 to 19 - words = simd_batch{ SafeLoad(in + 6), SafeLoad(in + 6), SafeLoad(in + 6) >> 24 | SafeLoad(in + 7) << 8, SafeLoad(in + 7) }; + words = simd_batch{ + SafeLoadAs(in + 4 * 6), + SafeLoadAs(in + 4 * 6), + SafeLoadAs(in + 4 * 6) >> 24 | SafeLoadAs(in + 4 * 7) << 8, + SafeLoadAs(in + 4 * 7), + }; shifts = simd_batch{ 0, 12, 0, 4 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 12-bit bundles 20 to 23 - words = simd_batch{ SafeLoad(in + 7), SafeLoad(in + 7) >> 28 | SafeLoad(in + 8) << 4, SafeLoad(in + 8), SafeLoad(in + 8) }; + words = simd_batch{ + SafeLoadAs(in + 4 * 7), + SafeLoadAs(in + 4 * 7) >> 28 | SafeLoadAs(in + 4 * 8) << 4, + SafeLoadAs(in + 4 * 8), + SafeLoadAs(in + 4 * 8), + }; shifts = simd_batch{ 16, 0, 8, 20 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 12-bit bundles 24 to 27 - words = simd_batch{ SafeLoad(in + 9), SafeLoad(in + 9), SafeLoad(in + 9) >> 24 | SafeLoad(in + 10) << 8, SafeLoad(in + 10) }; + words = simd_batch{ + SafeLoadAs(in + 4 * 9), + SafeLoadAs(in + 4 * 9), + SafeLoadAs(in + 4 * 9) >> 24 | SafeLoadAs(in + 4 * 10) << 8, + SafeLoadAs(in + 4 * 10), + }; shifts = simd_batch{ 0, 12, 0, 4 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 12-bit bundles 28 to 31 - words = simd_batch{ SafeLoad(in + 10), SafeLoad(in + 10) >> 28 | SafeLoad(in + 11) << 4, SafeLoad(in + 11), SafeLoad(in + 11) }; + words = simd_batch{ + SafeLoadAs(in + 4 * 10), + SafeLoadAs(in + 4 * 10) >> 28 | SafeLoadAs(in + 4 * 11) << 4, + SafeLoadAs(in + 4 * 11), + SafeLoadAs(in + 4 * 11), + }; shifts = simd_batch{ 16, 0, 8, 20 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; - in += 12; + in += 12 * 4; return in; } -inline static const uint32_t* unpack13_32(const uint32_t* in, uint32_t* out) { +inline static const uint8_t* unpack13_32(const uint8_t* in, uint32_t* out) { + using simd_batch = xsimd::make_sized_batch_t; + uint32_t mask = 0x1fff; simd_batch masks(mask); @@ -857,66 +2050,108 @@ inline static const uint32_t* unpack13_32(const uint32_t* in, uint32_t* out) { simd_batch results; // extract 13-bit bundles 0 to 3 - words = simd_batch{ SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0) >> 26 | SafeLoad(in + 1) << 6, SafeLoad(in + 1) }; + words = simd_batch{ + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0) >> 26 | SafeLoadAs(in + 4 * 1) << 6, + SafeLoadAs(in + 4 * 1), + }; shifts = simd_batch{ 0, 13, 0, 7 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 13-bit bundles 4 to 7 - words = simd_batch{ SafeLoad(in + 1) >> 20 | SafeLoad(in + 2) << 12, SafeLoad(in + 2), SafeLoad(in + 2), SafeLoad(in + 2) >> 27 | SafeLoad(in + 3) << 5 }; + words = simd_batch{ + SafeLoadAs(in + 4 * 1) >> 20 | SafeLoadAs(in + 4 * 2) << 12, + SafeLoadAs(in + 4 * 2), + SafeLoadAs(in + 4 * 2), + SafeLoadAs(in + 4 * 2) >> 27 | SafeLoadAs(in + 4 * 3) << 5, + }; shifts = simd_batch{ 0, 1, 14, 0 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 13-bit bundles 8 to 11 - words = simd_batch{ SafeLoad(in + 3), SafeLoad(in + 3) >> 21 | SafeLoad(in + 4) << 11, SafeLoad(in + 4), SafeLoad(in + 4) }; + words = simd_batch{ + SafeLoadAs(in + 4 * 3), + SafeLoadAs(in + 4 * 3) >> 21 | SafeLoadAs(in + 4 * 4) << 11, + SafeLoadAs(in + 4 * 4), + SafeLoadAs(in + 4 * 4), + }; shifts = simd_batch{ 8, 0, 2, 15 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 13-bit bundles 12 to 15 - words = simd_batch{ SafeLoad(in + 4) >> 28 | SafeLoad(in + 5) << 4, SafeLoad(in + 5), SafeLoad(in + 5) >> 22 | SafeLoad(in + 6) << 10, SafeLoad(in + 6) }; + words = simd_batch{ + SafeLoadAs(in + 4 * 4) >> 28 | SafeLoadAs(in + 4 * 5) << 4, + SafeLoadAs(in + 4 * 5), + SafeLoadAs(in + 4 * 5) >> 22 | SafeLoadAs(in + 4 * 6) << 10, + SafeLoadAs(in + 4 * 6), + }; shifts = simd_batch{ 0, 9, 0, 3 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 13-bit bundles 16 to 19 - words = simd_batch{ SafeLoad(in + 6), SafeLoad(in + 6) >> 29 | SafeLoad(in + 7) << 3, SafeLoad(in + 7), SafeLoad(in + 7) >> 23 | SafeLoad(in + 8) << 9 }; + words = simd_batch{ + SafeLoadAs(in + 4 * 6), + SafeLoadAs(in + 4 * 6) >> 29 | SafeLoadAs(in + 4 * 7) << 3, + SafeLoadAs(in + 4 * 7), + SafeLoadAs(in + 4 * 7) >> 23 | SafeLoadAs(in + 4 * 8) << 9, + }; shifts = simd_batch{ 16, 0, 10, 0 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 13-bit bundles 20 to 23 - words = simd_batch{ SafeLoad(in + 8), SafeLoad(in + 8), SafeLoad(in + 8) >> 30 | SafeLoad(in + 9) << 2, SafeLoad(in + 9) }; + words = simd_batch{ + SafeLoadAs(in + 4 * 8), + SafeLoadAs(in + 4 * 8), + SafeLoadAs(in + 4 * 8) >> 30 | SafeLoadAs(in + 4 * 9) << 2, + SafeLoadAs(in + 4 * 9), + }; shifts = simd_batch{ 4, 17, 0, 11 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 13-bit bundles 24 to 27 - words = simd_batch{ SafeLoad(in + 9) >> 24 | SafeLoad(in + 10) << 8, SafeLoad(in + 10), SafeLoad(in + 10), SafeLoad(in + 10) >> 31 | SafeLoad(in + 11) << 1 }; + words = simd_batch{ + SafeLoadAs(in + 4 * 9) >> 24 | SafeLoadAs(in + 4 * 10) << 8, + SafeLoadAs(in + 4 * 10), + SafeLoadAs(in + 4 * 10), + SafeLoadAs(in + 4 * 10) >> 31 | SafeLoadAs(in + 4 * 11) << 1, + }; shifts = simd_batch{ 0, 5, 18, 0 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 13-bit bundles 28 to 31 - words = simd_batch{ SafeLoad(in + 11), SafeLoad(in + 11) >> 25 | SafeLoad(in + 12) << 7, SafeLoad(in + 12), SafeLoad(in + 12) }; + words = simd_batch{ + SafeLoadAs(in + 4 * 11), + SafeLoadAs(in + 4 * 11) >> 25 | SafeLoadAs(in + 4 * 12) << 7, + SafeLoadAs(in + 4 * 12), + SafeLoadAs(in + 4 * 12), + }; shifts = simd_batch{ 12, 0, 6, 19 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; - in += 13; + in += 13 * 4; return in; } -inline static const uint32_t* unpack14_32(const uint32_t* in, uint32_t* out) { +inline static const uint8_t* unpack14_32(const uint8_t* in, uint32_t* out) { + using simd_batch = xsimd::make_sized_batch_t; + uint32_t mask = 0x3fff; simd_batch masks(mask); @@ -924,66 +2159,108 @@ inline static const uint32_t* unpack14_32(const uint32_t* in, uint32_t* out) { simd_batch results; // extract 14-bit bundles 0 to 3 - words = simd_batch{ SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0) >> 28 | SafeLoad(in + 1) << 4, SafeLoad(in + 1) }; + words = simd_batch{ + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0) >> 28 | SafeLoadAs(in + 4 * 1) << 4, + SafeLoadAs(in + 4 * 1), + }; shifts = simd_batch{ 0, 14, 0, 10 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 14-bit bundles 4 to 7 - words = simd_batch{ SafeLoad(in + 1) >> 24 | SafeLoad(in + 2) << 8, SafeLoad(in + 2), SafeLoad(in + 2) >> 20 | SafeLoad(in + 3) << 12, SafeLoad(in + 3) }; + words = simd_batch{ + SafeLoadAs(in + 4 * 1) >> 24 | SafeLoadAs(in + 4 * 2) << 8, + SafeLoadAs(in + 4 * 2), + SafeLoadAs(in + 4 * 2) >> 20 | SafeLoadAs(in + 4 * 3) << 12, + SafeLoadAs(in + 4 * 3), + }; shifts = simd_batch{ 0, 6, 0, 2 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 14-bit bundles 8 to 11 - words = simd_batch{ SafeLoad(in + 3), SafeLoad(in + 3) >> 30 | SafeLoad(in + 4) << 2, SafeLoad(in + 4), SafeLoad(in + 4) >> 26 | SafeLoad(in + 5) << 6 }; + words = simd_batch{ + SafeLoadAs(in + 4 * 3), + SafeLoadAs(in + 4 * 3) >> 30 | SafeLoadAs(in + 4 * 4) << 2, + SafeLoadAs(in + 4 * 4), + SafeLoadAs(in + 4 * 4) >> 26 | SafeLoadAs(in + 4 * 5) << 6, + }; shifts = simd_batch{ 16, 0, 12, 0 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 14-bit bundles 12 to 15 - words = simd_batch{ SafeLoad(in + 5), SafeLoad(in + 5) >> 22 | SafeLoad(in + 6) << 10, SafeLoad(in + 6), SafeLoad(in + 6) }; + words = simd_batch{ + SafeLoadAs(in + 4 * 5), + SafeLoadAs(in + 4 * 5) >> 22 | SafeLoadAs(in + 4 * 6) << 10, + SafeLoadAs(in + 4 * 6), + SafeLoadAs(in + 4 * 6), + }; shifts = simd_batch{ 8, 0, 4, 18 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 14-bit bundles 16 to 19 - words = simd_batch{ SafeLoad(in + 7), SafeLoad(in + 7), SafeLoad(in + 7) >> 28 | SafeLoad(in + 8) << 4, SafeLoad(in + 8) }; + words = simd_batch{ + SafeLoadAs(in + 4 * 7), + SafeLoadAs(in + 4 * 7), + SafeLoadAs(in + 4 * 7) >> 28 | SafeLoadAs(in + 4 * 8) << 4, + SafeLoadAs(in + 4 * 8), + }; shifts = simd_batch{ 0, 14, 0, 10 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 14-bit bundles 20 to 23 - words = simd_batch{ SafeLoad(in + 8) >> 24 | SafeLoad(in + 9) << 8, SafeLoad(in + 9), SafeLoad(in + 9) >> 20 | SafeLoad(in + 10) << 12, SafeLoad(in + 10) }; + words = simd_batch{ + SafeLoadAs(in + 4 * 8) >> 24 | SafeLoadAs(in + 4 * 9) << 8, + SafeLoadAs(in + 4 * 9), + SafeLoadAs(in + 4 * 9) >> 20 | SafeLoadAs(in + 4 * 10) << 12, + SafeLoadAs(in + 4 * 10), + }; shifts = simd_batch{ 0, 6, 0, 2 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 14-bit bundles 24 to 27 - words = simd_batch{ SafeLoad(in + 10), SafeLoad(in + 10) >> 30 | SafeLoad(in + 11) << 2, SafeLoad(in + 11), SafeLoad(in + 11) >> 26 | SafeLoad(in + 12) << 6 }; + words = simd_batch{ + SafeLoadAs(in + 4 * 10), + SafeLoadAs(in + 4 * 10) >> 30 | SafeLoadAs(in + 4 * 11) << 2, + SafeLoadAs(in + 4 * 11), + SafeLoadAs(in + 4 * 11) >> 26 | SafeLoadAs(in + 4 * 12) << 6, + }; shifts = simd_batch{ 16, 0, 12, 0 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 14-bit bundles 28 to 31 - words = simd_batch{ SafeLoad(in + 12), SafeLoad(in + 12) >> 22 | SafeLoad(in + 13) << 10, SafeLoad(in + 13), SafeLoad(in + 13) }; + words = simd_batch{ + SafeLoadAs(in + 4 * 12), + SafeLoadAs(in + 4 * 12) >> 22 | SafeLoadAs(in + 4 * 13) << 10, + SafeLoadAs(in + 4 * 13), + SafeLoadAs(in + 4 * 13), + }; shifts = simd_batch{ 8, 0, 4, 18 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; - in += 14; + in += 14 * 4; return in; } -inline static const uint32_t* unpack15_32(const uint32_t* in, uint32_t* out) { +inline static const uint8_t* unpack15_32(const uint8_t* in, uint32_t* out) { + using simd_batch = xsimd::make_sized_batch_t; + uint32_t mask = 0x7fff; simd_batch masks(mask); @@ -991,66 +2268,108 @@ inline static const uint32_t* unpack15_32(const uint32_t* in, uint32_t* out) { simd_batch results; // extract 15-bit bundles 0 to 3 - words = simd_batch{ SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0) >> 30 | SafeLoad(in + 1) << 2, SafeLoad(in + 1) }; + words = simd_batch{ + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0) >> 30 | SafeLoadAs(in + 4 * 1) << 2, + SafeLoadAs(in + 4 * 1), + }; shifts = simd_batch{ 0, 15, 0, 13 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 15-bit bundles 4 to 7 - words = simd_batch{ SafeLoad(in + 1) >> 28 | SafeLoad(in + 2) << 4, SafeLoad(in + 2), SafeLoad(in + 2) >> 26 | SafeLoad(in + 3) << 6, SafeLoad(in + 3) }; + words = simd_batch{ + SafeLoadAs(in + 4 * 1) >> 28 | SafeLoadAs(in + 4 * 2) << 4, + SafeLoadAs(in + 4 * 2), + SafeLoadAs(in + 4 * 2) >> 26 | SafeLoadAs(in + 4 * 3) << 6, + SafeLoadAs(in + 4 * 3), + }; shifts = simd_batch{ 0, 11, 0, 9 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 15-bit bundles 8 to 11 - words = simd_batch{ SafeLoad(in + 3) >> 24 | SafeLoad(in + 4) << 8, SafeLoad(in + 4), SafeLoad(in + 4) >> 22 | SafeLoad(in + 5) << 10, SafeLoad(in + 5) }; + words = simd_batch{ + SafeLoadAs(in + 4 * 3) >> 24 | SafeLoadAs(in + 4 * 4) << 8, + SafeLoadAs(in + 4 * 4), + SafeLoadAs(in + 4 * 4) >> 22 | SafeLoadAs(in + 4 * 5) << 10, + SafeLoadAs(in + 4 * 5), + }; shifts = simd_batch{ 0, 7, 0, 5 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 15-bit bundles 12 to 15 - words = simd_batch{ SafeLoad(in + 5) >> 20 | SafeLoad(in + 6) << 12, SafeLoad(in + 6), SafeLoad(in + 6) >> 18 | SafeLoad(in + 7) << 14, SafeLoad(in + 7) }; + words = simd_batch{ + SafeLoadAs(in + 4 * 5) >> 20 | SafeLoadAs(in + 4 * 6) << 12, + SafeLoadAs(in + 4 * 6), + SafeLoadAs(in + 4 * 6) >> 18 | SafeLoadAs(in + 4 * 7) << 14, + SafeLoadAs(in + 4 * 7), + }; shifts = simd_batch{ 0, 3, 0, 1 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 15-bit bundles 16 to 19 - words = simd_batch{ SafeLoad(in + 7), SafeLoad(in + 7) >> 31 | SafeLoad(in + 8) << 1, SafeLoad(in + 8), SafeLoad(in + 8) >> 29 | SafeLoad(in + 9) << 3 }; + words = simd_batch{ + SafeLoadAs(in + 4 * 7), + SafeLoadAs(in + 4 * 7) >> 31 | SafeLoadAs(in + 4 * 8) << 1, + SafeLoadAs(in + 4 * 8), + SafeLoadAs(in + 4 * 8) >> 29 | SafeLoadAs(in + 4 * 9) << 3, + }; shifts = simd_batch{ 16, 0, 14, 0 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 15-bit bundles 20 to 23 - words = simd_batch{ SafeLoad(in + 9), SafeLoad(in + 9) >> 27 | SafeLoad(in + 10) << 5, SafeLoad(in + 10), SafeLoad(in + 10) >> 25 | SafeLoad(in + 11) << 7 }; + words = simd_batch{ + SafeLoadAs(in + 4 * 9), + SafeLoadAs(in + 4 * 9) >> 27 | SafeLoadAs(in + 4 * 10) << 5, + SafeLoadAs(in + 4 * 10), + SafeLoadAs(in + 4 * 10) >> 25 | SafeLoadAs(in + 4 * 11) << 7, + }; shifts = simd_batch{ 12, 0, 10, 0 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 15-bit bundles 24 to 27 - words = simd_batch{ SafeLoad(in + 11), SafeLoad(in + 11) >> 23 | SafeLoad(in + 12) << 9, SafeLoad(in + 12), SafeLoad(in + 12) >> 21 | SafeLoad(in + 13) << 11 }; + words = simd_batch{ + SafeLoadAs(in + 4 * 11), + SafeLoadAs(in + 4 * 11) >> 23 | SafeLoadAs(in + 4 * 12) << 9, + SafeLoadAs(in + 4 * 12), + SafeLoadAs(in + 4 * 12) >> 21 | SafeLoadAs(in + 4 * 13) << 11, + }; shifts = simd_batch{ 8, 0, 6, 0 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 15-bit bundles 28 to 31 - words = simd_batch{ SafeLoad(in + 13), SafeLoad(in + 13) >> 19 | SafeLoad(in + 14) << 13, SafeLoad(in + 14), SafeLoad(in + 14) }; + words = simd_batch{ + SafeLoadAs(in + 4 * 13), + SafeLoadAs(in + 4 * 13) >> 19 | SafeLoadAs(in + 4 * 14) << 13, + SafeLoadAs(in + 4 * 14), + SafeLoadAs(in + 4 * 14), + }; shifts = simd_batch{ 4, 0, 2, 17 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; - in += 15; + in += 15 * 4; return in; } -inline static const uint32_t* unpack16_32(const uint32_t* in, uint32_t* out) { +inline static const uint8_t* unpack16_32(const uint8_t* in, uint32_t* out) { + using simd_batch = xsimd::make_sized_batch_t; + uint32_t mask = 0xffff; simd_batch masks(mask); @@ -1058,66 +2377,108 @@ inline static const uint32_t* unpack16_32(const uint32_t* in, uint32_t* out) { simd_batch results; // extract 16-bit bundles 0 to 3 - words = simd_batch{ SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 1), SafeLoad(in + 1) }; + words = simd_batch{ + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 1), + SafeLoadAs(in + 4 * 1), + }; shifts = simd_batch{ 0, 16, 0, 16 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 16-bit bundles 4 to 7 - words = simd_batch{ SafeLoad(in + 2), SafeLoad(in + 2), SafeLoad(in + 3), SafeLoad(in + 3) }; + words = simd_batch{ + SafeLoadAs(in + 4 * 2), + SafeLoadAs(in + 4 * 2), + SafeLoadAs(in + 4 * 3), + SafeLoadAs(in + 4 * 3), + }; shifts = simd_batch{ 0, 16, 0, 16 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 16-bit bundles 8 to 11 - words = simd_batch{ SafeLoad(in + 4), SafeLoad(in + 4), SafeLoad(in + 5), SafeLoad(in + 5) }; + words = simd_batch{ + SafeLoadAs(in + 4 * 4), + SafeLoadAs(in + 4 * 4), + SafeLoadAs(in + 4 * 5), + SafeLoadAs(in + 4 * 5), + }; shifts = simd_batch{ 0, 16, 0, 16 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 16-bit bundles 12 to 15 - words = simd_batch{ SafeLoad(in + 6), SafeLoad(in + 6), SafeLoad(in + 7), SafeLoad(in + 7) }; + words = simd_batch{ + SafeLoadAs(in + 4 * 6), + SafeLoadAs(in + 4 * 6), + SafeLoadAs(in + 4 * 7), + SafeLoadAs(in + 4 * 7), + }; shifts = simd_batch{ 0, 16, 0, 16 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 16-bit bundles 16 to 19 - words = simd_batch{ SafeLoad(in + 8), SafeLoad(in + 8), SafeLoad(in + 9), SafeLoad(in + 9) }; + words = simd_batch{ + SafeLoadAs(in + 4 * 8), + SafeLoadAs(in + 4 * 8), + SafeLoadAs(in + 4 * 9), + SafeLoadAs(in + 4 * 9), + }; shifts = simd_batch{ 0, 16, 0, 16 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 16-bit bundles 20 to 23 - words = simd_batch{ SafeLoad(in + 10), SafeLoad(in + 10), SafeLoad(in + 11), SafeLoad(in + 11) }; + words = simd_batch{ + SafeLoadAs(in + 4 * 10), + SafeLoadAs(in + 4 * 10), + SafeLoadAs(in + 4 * 11), + SafeLoadAs(in + 4 * 11), + }; shifts = simd_batch{ 0, 16, 0, 16 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 16-bit bundles 24 to 27 - words = simd_batch{ SafeLoad(in + 12), SafeLoad(in + 12), SafeLoad(in + 13), SafeLoad(in + 13) }; + words = simd_batch{ + SafeLoadAs(in + 4 * 12), + SafeLoadAs(in + 4 * 12), + SafeLoadAs(in + 4 * 13), + SafeLoadAs(in + 4 * 13), + }; shifts = simd_batch{ 0, 16, 0, 16 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 16-bit bundles 28 to 31 - words = simd_batch{ SafeLoad(in + 14), SafeLoad(in + 14), SafeLoad(in + 15), SafeLoad(in + 15) }; + words = simd_batch{ + SafeLoadAs(in + 4 * 14), + SafeLoadAs(in + 4 * 14), + SafeLoadAs(in + 4 * 15), + SafeLoadAs(in + 4 * 15), + }; shifts = simd_batch{ 0, 16, 0, 16 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; - in += 16; + in += 16 * 4; return in; } -inline static const uint32_t* unpack17_32(const uint32_t* in, uint32_t* out) { +inline static const uint8_t* unpack17_32(const uint8_t* in, uint32_t* out) { + using simd_batch = xsimd::make_sized_batch_t; + uint32_t mask = 0x1ffff; simd_batch masks(mask); @@ -1125,66 +2486,108 @@ inline static const uint32_t* unpack17_32(const uint32_t* in, uint32_t* out) { simd_batch results; // extract 17-bit bundles 0 to 3 - words = simd_batch{ SafeLoad(in + 0), SafeLoad(in + 0) >> 17 | SafeLoad(in + 1) << 15, SafeLoad(in + 1), SafeLoad(in + 1) >> 19 | SafeLoad(in + 2) << 13 }; + words = simd_batch{ + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0) >> 17 | SafeLoadAs(in + 4 * 1) << 15, + SafeLoadAs(in + 4 * 1), + SafeLoadAs(in + 4 * 1) >> 19 | SafeLoadAs(in + 4 * 2) << 13, + }; shifts = simd_batch{ 0, 0, 2, 0 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 17-bit bundles 4 to 7 - words = simd_batch{ SafeLoad(in + 2), SafeLoad(in + 2) >> 21 | SafeLoad(in + 3) << 11, SafeLoad(in + 3), SafeLoad(in + 3) >> 23 | SafeLoad(in + 4) << 9 }; + words = simd_batch{ + SafeLoadAs(in + 4 * 2), + SafeLoadAs(in + 4 * 2) >> 21 | SafeLoadAs(in + 4 * 3) << 11, + SafeLoadAs(in + 4 * 3), + SafeLoadAs(in + 4 * 3) >> 23 | SafeLoadAs(in + 4 * 4) << 9, + }; shifts = simd_batch{ 4, 0, 6, 0 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 17-bit bundles 8 to 11 - words = simd_batch{ SafeLoad(in + 4), SafeLoad(in + 4) >> 25 | SafeLoad(in + 5) << 7, SafeLoad(in + 5), SafeLoad(in + 5) >> 27 | SafeLoad(in + 6) << 5 }; + words = simd_batch{ + SafeLoadAs(in + 4 * 4), + SafeLoadAs(in + 4 * 4) >> 25 | SafeLoadAs(in + 4 * 5) << 7, + SafeLoadAs(in + 4 * 5), + SafeLoadAs(in + 4 * 5) >> 27 | SafeLoadAs(in + 4 * 6) << 5, + }; shifts = simd_batch{ 8, 0, 10, 0 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 17-bit bundles 12 to 15 - words = simd_batch{ SafeLoad(in + 6), SafeLoad(in + 6) >> 29 | SafeLoad(in + 7) << 3, SafeLoad(in + 7), SafeLoad(in + 7) >> 31 | SafeLoad(in + 8) << 1 }; + words = simd_batch{ + SafeLoadAs(in + 4 * 6), + SafeLoadAs(in + 4 * 6) >> 29 | SafeLoadAs(in + 4 * 7) << 3, + SafeLoadAs(in + 4 * 7), + SafeLoadAs(in + 4 * 7) >> 31 | SafeLoadAs(in + 4 * 8) << 1, + }; shifts = simd_batch{ 12, 0, 14, 0 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 17-bit bundles 16 to 19 - words = simd_batch{ SafeLoad(in + 8) >> 16 | SafeLoad(in + 9) << 16, SafeLoad(in + 9), SafeLoad(in + 9) >> 18 | SafeLoad(in + 10) << 14, SafeLoad(in + 10) }; + words = simd_batch{ + SafeLoadAs(in + 4 * 8) >> 16 | SafeLoadAs(in + 4 * 9) << 16, + SafeLoadAs(in + 4 * 9), + SafeLoadAs(in + 4 * 9) >> 18 | SafeLoadAs(in + 4 * 10) << 14, + SafeLoadAs(in + 4 * 10), + }; shifts = simd_batch{ 0, 1, 0, 3 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 17-bit bundles 20 to 23 - words = simd_batch{ SafeLoad(in + 10) >> 20 | SafeLoad(in + 11) << 12, SafeLoad(in + 11), SafeLoad(in + 11) >> 22 | SafeLoad(in + 12) << 10, SafeLoad(in + 12) }; + words = simd_batch{ + SafeLoadAs(in + 4 * 10) >> 20 | SafeLoadAs(in + 4 * 11) << 12, + SafeLoadAs(in + 4 * 11), + SafeLoadAs(in + 4 * 11) >> 22 | SafeLoadAs(in + 4 * 12) << 10, + SafeLoadAs(in + 4 * 12), + }; shifts = simd_batch{ 0, 5, 0, 7 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 17-bit bundles 24 to 27 - words = simd_batch{ SafeLoad(in + 12) >> 24 | SafeLoad(in + 13) << 8, SafeLoad(in + 13), SafeLoad(in + 13) >> 26 | SafeLoad(in + 14) << 6, SafeLoad(in + 14) }; + words = simd_batch{ + SafeLoadAs(in + 4 * 12) >> 24 | SafeLoadAs(in + 4 * 13) << 8, + SafeLoadAs(in + 4 * 13), + SafeLoadAs(in + 4 * 13) >> 26 | SafeLoadAs(in + 4 * 14) << 6, + SafeLoadAs(in + 4 * 14), + }; shifts = simd_batch{ 0, 9, 0, 11 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 17-bit bundles 28 to 31 - words = simd_batch{ SafeLoad(in + 14) >> 28 | SafeLoad(in + 15) << 4, SafeLoad(in + 15), SafeLoad(in + 15) >> 30 | SafeLoad(in + 16) << 2, SafeLoad(in + 16) }; + words = simd_batch{ + SafeLoadAs(in + 4 * 14) >> 28 | SafeLoadAs(in + 4 * 15) << 4, + SafeLoadAs(in + 4 * 15), + SafeLoadAs(in + 4 * 15) >> 30 | SafeLoadAs(in + 4 * 16) << 2, + SafeLoadAs(in + 4 * 16), + }; shifts = simd_batch{ 0, 13, 0, 15 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; - in += 17; + in += 17 * 4; return in; } -inline static const uint32_t* unpack18_32(const uint32_t* in, uint32_t* out) { +inline static const uint8_t* unpack18_32(const uint8_t* in, uint32_t* out) { + using simd_batch = xsimd::make_sized_batch_t; + uint32_t mask = 0x3ffff; simd_batch masks(mask); @@ -1192,66 +2595,108 @@ inline static const uint32_t* unpack18_32(const uint32_t* in, uint32_t* out) { simd_batch results; // extract 18-bit bundles 0 to 3 - words = simd_batch{ SafeLoad(in + 0), SafeLoad(in + 0) >> 18 | SafeLoad(in + 1) << 14, SafeLoad(in + 1), SafeLoad(in + 1) >> 22 | SafeLoad(in + 2) << 10 }; + words = simd_batch{ + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0) >> 18 | SafeLoadAs(in + 4 * 1) << 14, + SafeLoadAs(in + 4 * 1), + SafeLoadAs(in + 4 * 1) >> 22 | SafeLoadAs(in + 4 * 2) << 10, + }; shifts = simd_batch{ 0, 0, 4, 0 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 18-bit bundles 4 to 7 - words = simd_batch{ SafeLoad(in + 2), SafeLoad(in + 2) >> 26 | SafeLoad(in + 3) << 6, SafeLoad(in + 3), SafeLoad(in + 3) >> 30 | SafeLoad(in + 4) << 2 }; + words = simd_batch{ + SafeLoadAs(in + 4 * 2), + SafeLoadAs(in + 4 * 2) >> 26 | SafeLoadAs(in + 4 * 3) << 6, + SafeLoadAs(in + 4 * 3), + SafeLoadAs(in + 4 * 3) >> 30 | SafeLoadAs(in + 4 * 4) << 2, + }; shifts = simd_batch{ 8, 0, 12, 0 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 18-bit bundles 8 to 11 - words = simd_batch{ SafeLoad(in + 4) >> 16 | SafeLoad(in + 5) << 16, SafeLoad(in + 5), SafeLoad(in + 5) >> 20 | SafeLoad(in + 6) << 12, SafeLoad(in + 6) }; + words = simd_batch{ + SafeLoadAs(in + 4 * 4) >> 16 | SafeLoadAs(in + 4 * 5) << 16, + SafeLoadAs(in + 4 * 5), + SafeLoadAs(in + 4 * 5) >> 20 | SafeLoadAs(in + 4 * 6) << 12, + SafeLoadAs(in + 4 * 6), + }; shifts = simd_batch{ 0, 2, 0, 6 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 18-bit bundles 12 to 15 - words = simd_batch{ SafeLoad(in + 6) >> 24 | SafeLoad(in + 7) << 8, SafeLoad(in + 7), SafeLoad(in + 7) >> 28 | SafeLoad(in + 8) << 4, SafeLoad(in + 8) }; + words = simd_batch{ + SafeLoadAs(in + 4 * 6) >> 24 | SafeLoadAs(in + 4 * 7) << 8, + SafeLoadAs(in + 4 * 7), + SafeLoadAs(in + 4 * 7) >> 28 | SafeLoadAs(in + 4 * 8) << 4, + SafeLoadAs(in + 4 * 8), + }; shifts = simd_batch{ 0, 10, 0, 14 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 18-bit bundles 16 to 19 - words = simd_batch{ SafeLoad(in + 9), SafeLoad(in + 9) >> 18 | SafeLoad(in + 10) << 14, SafeLoad(in + 10), SafeLoad(in + 10) >> 22 | SafeLoad(in + 11) << 10 }; + words = simd_batch{ + SafeLoadAs(in + 4 * 9), + SafeLoadAs(in + 4 * 9) >> 18 | SafeLoadAs(in + 4 * 10) << 14, + SafeLoadAs(in + 4 * 10), + SafeLoadAs(in + 4 * 10) >> 22 | SafeLoadAs(in + 4 * 11) << 10, + }; shifts = simd_batch{ 0, 0, 4, 0 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 18-bit bundles 20 to 23 - words = simd_batch{ SafeLoad(in + 11), SafeLoad(in + 11) >> 26 | SafeLoad(in + 12) << 6, SafeLoad(in + 12), SafeLoad(in + 12) >> 30 | SafeLoad(in + 13) << 2 }; + words = simd_batch{ + SafeLoadAs(in + 4 * 11), + SafeLoadAs(in + 4 * 11) >> 26 | SafeLoadAs(in + 4 * 12) << 6, + SafeLoadAs(in + 4 * 12), + SafeLoadAs(in + 4 * 12) >> 30 | SafeLoadAs(in + 4 * 13) << 2, + }; shifts = simd_batch{ 8, 0, 12, 0 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 18-bit bundles 24 to 27 - words = simd_batch{ SafeLoad(in + 13) >> 16 | SafeLoad(in + 14) << 16, SafeLoad(in + 14), SafeLoad(in + 14) >> 20 | SafeLoad(in + 15) << 12, SafeLoad(in + 15) }; + words = simd_batch{ + SafeLoadAs(in + 4 * 13) >> 16 | SafeLoadAs(in + 4 * 14) << 16, + SafeLoadAs(in + 4 * 14), + SafeLoadAs(in + 4 * 14) >> 20 | SafeLoadAs(in + 4 * 15) << 12, + SafeLoadAs(in + 4 * 15), + }; shifts = simd_batch{ 0, 2, 0, 6 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 18-bit bundles 28 to 31 - words = simd_batch{ SafeLoad(in + 15) >> 24 | SafeLoad(in + 16) << 8, SafeLoad(in + 16), SafeLoad(in + 16) >> 28 | SafeLoad(in + 17) << 4, SafeLoad(in + 17) }; + words = simd_batch{ + SafeLoadAs(in + 4 * 15) >> 24 | SafeLoadAs(in + 4 * 16) << 8, + SafeLoadAs(in + 4 * 16), + SafeLoadAs(in + 4 * 16) >> 28 | SafeLoadAs(in + 4 * 17) << 4, + SafeLoadAs(in + 4 * 17), + }; shifts = simd_batch{ 0, 10, 0, 14 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; - in += 18; + in += 18 * 4; return in; } -inline static const uint32_t* unpack19_32(const uint32_t* in, uint32_t* out) { +inline static const uint8_t* unpack19_32(const uint8_t* in, uint32_t* out) { + using simd_batch = xsimd::make_sized_batch_t; + uint32_t mask = 0x7ffff; simd_batch masks(mask); @@ -1259,66 +2704,108 @@ inline static const uint32_t* unpack19_32(const uint32_t* in, uint32_t* out) { simd_batch results; // extract 19-bit bundles 0 to 3 - words = simd_batch{ SafeLoad(in + 0), SafeLoad(in + 0) >> 19 | SafeLoad(in + 1) << 13, SafeLoad(in + 1), SafeLoad(in + 1) >> 25 | SafeLoad(in + 2) << 7 }; + words = simd_batch{ + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0) >> 19 | SafeLoadAs(in + 4 * 1) << 13, + SafeLoadAs(in + 4 * 1), + SafeLoadAs(in + 4 * 1) >> 25 | SafeLoadAs(in + 4 * 2) << 7, + }; shifts = simd_batch{ 0, 0, 6, 0 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 19-bit bundles 4 to 7 - words = simd_batch{ SafeLoad(in + 2), SafeLoad(in + 2) >> 31 | SafeLoad(in + 3) << 1, SafeLoad(in + 3) >> 18 | SafeLoad(in + 4) << 14, SafeLoad(in + 4) }; + words = simd_batch{ + SafeLoadAs(in + 4 * 2), + SafeLoadAs(in + 4 * 2) >> 31 | SafeLoadAs(in + 4 * 3) << 1, + SafeLoadAs(in + 4 * 3) >> 18 | SafeLoadAs(in + 4 * 4) << 14, + SafeLoadAs(in + 4 * 4), + }; shifts = simd_batch{ 12, 0, 0, 5 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 19-bit bundles 8 to 11 - words = simd_batch{ SafeLoad(in + 4) >> 24 | SafeLoad(in + 5) << 8, SafeLoad(in + 5), SafeLoad(in + 5) >> 30 | SafeLoad(in + 6) << 2, SafeLoad(in + 6) >> 17 | SafeLoad(in + 7) << 15 }; + words = simd_batch{ + SafeLoadAs(in + 4 * 4) >> 24 | SafeLoadAs(in + 4 * 5) << 8, + SafeLoadAs(in + 4 * 5), + SafeLoadAs(in + 4 * 5) >> 30 | SafeLoadAs(in + 4 * 6) << 2, + SafeLoadAs(in + 4 * 6) >> 17 | SafeLoadAs(in + 4 * 7) << 15, + }; shifts = simd_batch{ 0, 11, 0, 0 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 19-bit bundles 12 to 15 - words = simd_batch{ SafeLoad(in + 7), SafeLoad(in + 7) >> 23 | SafeLoad(in + 8) << 9, SafeLoad(in + 8), SafeLoad(in + 8) >> 29 | SafeLoad(in + 9) << 3 }; + words = simd_batch{ + SafeLoadAs(in + 4 * 7), + SafeLoadAs(in + 4 * 7) >> 23 | SafeLoadAs(in + 4 * 8) << 9, + SafeLoadAs(in + 4 * 8), + SafeLoadAs(in + 4 * 8) >> 29 | SafeLoadAs(in + 4 * 9) << 3, + }; shifts = simd_batch{ 4, 0, 10, 0 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 19-bit bundles 16 to 19 - words = simd_batch{ SafeLoad(in + 9) >> 16 | SafeLoad(in + 10) << 16, SafeLoad(in + 10), SafeLoad(in + 10) >> 22 | SafeLoad(in + 11) << 10, SafeLoad(in + 11) }; + words = simd_batch{ + SafeLoadAs(in + 4 * 9) >> 16 | SafeLoadAs(in + 4 * 10) << 16, + SafeLoadAs(in + 4 * 10), + SafeLoadAs(in + 4 * 10) >> 22 | SafeLoadAs(in + 4 * 11) << 10, + SafeLoadAs(in + 4 * 11), + }; shifts = simd_batch{ 0, 3, 0, 9 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 19-bit bundles 20 to 23 - words = simd_batch{ SafeLoad(in + 11) >> 28 | SafeLoad(in + 12) << 4, SafeLoad(in + 12) >> 15 | SafeLoad(in + 13) << 17, SafeLoad(in + 13), SafeLoad(in + 13) >> 21 | SafeLoad(in + 14) << 11 }; + words = simd_batch{ + SafeLoadAs(in + 4 * 11) >> 28 | SafeLoadAs(in + 4 * 12) << 4, + SafeLoadAs(in + 4 * 12) >> 15 | SafeLoadAs(in + 4 * 13) << 17, + SafeLoadAs(in + 4 * 13), + SafeLoadAs(in + 4 * 13) >> 21 | SafeLoadAs(in + 4 * 14) << 11, + }; shifts = simd_batch{ 0, 0, 2, 0 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 19-bit bundles 24 to 27 - words = simd_batch{ SafeLoad(in + 14), SafeLoad(in + 14) >> 27 | SafeLoad(in + 15) << 5, SafeLoad(in + 15) >> 14 | SafeLoad(in + 16) << 18, SafeLoad(in + 16) }; + words = simd_batch{ + SafeLoadAs(in + 4 * 14), + SafeLoadAs(in + 4 * 14) >> 27 | SafeLoadAs(in + 4 * 15) << 5, + SafeLoadAs(in + 4 * 15) >> 14 | SafeLoadAs(in + 4 * 16) << 18, + SafeLoadAs(in + 4 * 16), + }; shifts = simd_batch{ 8, 0, 0, 1 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 19-bit bundles 28 to 31 - words = simd_batch{ SafeLoad(in + 16) >> 20 | SafeLoad(in + 17) << 12, SafeLoad(in + 17), SafeLoad(in + 17) >> 26 | SafeLoad(in + 18) << 6, SafeLoad(in + 18) }; + words = simd_batch{ + SafeLoadAs(in + 4 * 16) >> 20 | SafeLoadAs(in + 4 * 17) << 12, + SafeLoadAs(in + 4 * 17), + SafeLoadAs(in + 4 * 17) >> 26 | SafeLoadAs(in + 4 * 18) << 6, + SafeLoadAs(in + 4 * 18), + }; shifts = simd_batch{ 0, 7, 0, 13 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; - in += 19; + in += 19 * 4; return in; } -inline static const uint32_t* unpack20_32(const uint32_t* in, uint32_t* out) { +inline static const uint8_t* unpack20_32(const uint8_t* in, uint32_t* out) { + using simd_batch = xsimd::make_sized_batch_t; + uint32_t mask = 0xfffff; simd_batch masks(mask); @@ -1326,66 +2813,108 @@ inline static const uint32_t* unpack20_32(const uint32_t* in, uint32_t* out) { simd_batch results; // extract 20-bit bundles 0 to 3 - words = simd_batch{ SafeLoad(in + 0), SafeLoad(in + 0) >> 20 | SafeLoad(in + 1) << 12, SafeLoad(in + 1), SafeLoad(in + 1) >> 28 | SafeLoad(in + 2) << 4 }; + words = simd_batch{ + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0) >> 20 | SafeLoadAs(in + 4 * 1) << 12, + SafeLoadAs(in + 4 * 1), + SafeLoadAs(in + 4 * 1) >> 28 | SafeLoadAs(in + 4 * 2) << 4, + }; shifts = simd_batch{ 0, 0, 8, 0 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 20-bit bundles 4 to 7 - words = simd_batch{ SafeLoad(in + 2) >> 16 | SafeLoad(in + 3) << 16, SafeLoad(in + 3), SafeLoad(in + 3) >> 24 | SafeLoad(in + 4) << 8, SafeLoad(in + 4) }; + words = simd_batch{ + SafeLoadAs(in + 4 * 2) >> 16 | SafeLoadAs(in + 4 * 3) << 16, + SafeLoadAs(in + 4 * 3), + SafeLoadAs(in + 4 * 3) >> 24 | SafeLoadAs(in + 4 * 4) << 8, + SafeLoadAs(in + 4 * 4), + }; shifts = simd_batch{ 0, 4, 0, 12 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 20-bit bundles 8 to 11 - words = simd_batch{ SafeLoad(in + 5), SafeLoad(in + 5) >> 20 | SafeLoad(in + 6) << 12, SafeLoad(in + 6), SafeLoad(in + 6) >> 28 | SafeLoad(in + 7) << 4 }; + words = simd_batch{ + SafeLoadAs(in + 4 * 5), + SafeLoadAs(in + 4 * 5) >> 20 | SafeLoadAs(in + 4 * 6) << 12, + SafeLoadAs(in + 4 * 6), + SafeLoadAs(in + 4 * 6) >> 28 | SafeLoadAs(in + 4 * 7) << 4, + }; shifts = simd_batch{ 0, 0, 8, 0 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 20-bit bundles 12 to 15 - words = simd_batch{ SafeLoad(in + 7) >> 16 | SafeLoad(in + 8) << 16, SafeLoad(in + 8), SafeLoad(in + 8) >> 24 | SafeLoad(in + 9) << 8, SafeLoad(in + 9) }; + words = simd_batch{ + SafeLoadAs(in + 4 * 7) >> 16 | SafeLoadAs(in + 4 * 8) << 16, + SafeLoadAs(in + 4 * 8), + SafeLoadAs(in + 4 * 8) >> 24 | SafeLoadAs(in + 4 * 9) << 8, + SafeLoadAs(in + 4 * 9), + }; shifts = simd_batch{ 0, 4, 0, 12 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 20-bit bundles 16 to 19 - words = simd_batch{ SafeLoad(in + 10), SafeLoad(in + 10) >> 20 | SafeLoad(in + 11) << 12, SafeLoad(in + 11), SafeLoad(in + 11) >> 28 | SafeLoad(in + 12) << 4 }; + words = simd_batch{ + SafeLoadAs(in + 4 * 10), + SafeLoadAs(in + 4 * 10) >> 20 | SafeLoadAs(in + 4 * 11) << 12, + SafeLoadAs(in + 4 * 11), + SafeLoadAs(in + 4 * 11) >> 28 | SafeLoadAs(in + 4 * 12) << 4, + }; shifts = simd_batch{ 0, 0, 8, 0 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 20-bit bundles 20 to 23 - words = simd_batch{ SafeLoad(in + 12) >> 16 | SafeLoad(in + 13) << 16, SafeLoad(in + 13), SafeLoad(in + 13) >> 24 | SafeLoad(in + 14) << 8, SafeLoad(in + 14) }; + words = simd_batch{ + SafeLoadAs(in + 4 * 12) >> 16 | SafeLoadAs(in + 4 * 13) << 16, + SafeLoadAs(in + 4 * 13), + SafeLoadAs(in + 4 * 13) >> 24 | SafeLoadAs(in + 4 * 14) << 8, + SafeLoadAs(in + 4 * 14), + }; shifts = simd_batch{ 0, 4, 0, 12 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 20-bit bundles 24 to 27 - words = simd_batch{ SafeLoad(in + 15), SafeLoad(in + 15) >> 20 | SafeLoad(in + 16) << 12, SafeLoad(in + 16), SafeLoad(in + 16) >> 28 | SafeLoad(in + 17) << 4 }; + words = simd_batch{ + SafeLoadAs(in + 4 * 15), + SafeLoadAs(in + 4 * 15) >> 20 | SafeLoadAs(in + 4 * 16) << 12, + SafeLoadAs(in + 4 * 16), + SafeLoadAs(in + 4 * 16) >> 28 | SafeLoadAs(in + 4 * 17) << 4, + }; shifts = simd_batch{ 0, 0, 8, 0 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 20-bit bundles 28 to 31 - words = simd_batch{ SafeLoad(in + 17) >> 16 | SafeLoad(in + 18) << 16, SafeLoad(in + 18), SafeLoad(in + 18) >> 24 | SafeLoad(in + 19) << 8, SafeLoad(in + 19) }; + words = simd_batch{ + SafeLoadAs(in + 4 * 17) >> 16 | SafeLoadAs(in + 4 * 18) << 16, + SafeLoadAs(in + 4 * 18), + SafeLoadAs(in + 4 * 18) >> 24 | SafeLoadAs(in + 4 * 19) << 8, + SafeLoadAs(in + 4 * 19), + }; shifts = simd_batch{ 0, 4, 0, 12 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; - in += 20; + in += 20 * 4; return in; } -inline static const uint32_t* unpack21_32(const uint32_t* in, uint32_t* out) { +inline static const uint8_t* unpack21_32(const uint8_t* in, uint32_t* out) { + using simd_batch = xsimd::make_sized_batch_t; + uint32_t mask = 0x1fffff; simd_batch masks(mask); @@ -1393,66 +2922,108 @@ inline static const uint32_t* unpack21_32(const uint32_t* in, uint32_t* out) { simd_batch results; // extract 21-bit bundles 0 to 3 - words = simd_batch{ SafeLoad(in + 0), SafeLoad(in + 0) >> 21 | SafeLoad(in + 1) << 11, SafeLoad(in + 1), SafeLoad(in + 1) >> 31 | SafeLoad(in + 2) << 1 }; + words = simd_batch{ + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0) >> 21 | SafeLoadAs(in + 4 * 1) << 11, + SafeLoadAs(in + 4 * 1), + SafeLoadAs(in + 4 * 1) >> 31 | SafeLoadAs(in + 4 * 2) << 1, + }; shifts = simd_batch{ 0, 0, 10, 0 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 21-bit bundles 4 to 7 - words = simd_batch{ SafeLoad(in + 2) >> 20 | SafeLoad(in + 3) << 12, SafeLoad(in + 3), SafeLoad(in + 3) >> 30 | SafeLoad(in + 4) << 2, SafeLoad(in + 4) >> 19 | SafeLoad(in + 5) << 13 }; + words = simd_batch{ + SafeLoadAs(in + 4 * 2) >> 20 | SafeLoadAs(in + 4 * 3) << 12, + SafeLoadAs(in + 4 * 3), + SafeLoadAs(in + 4 * 3) >> 30 | SafeLoadAs(in + 4 * 4) << 2, + SafeLoadAs(in + 4 * 4) >> 19 | SafeLoadAs(in + 4 * 5) << 13, + }; shifts = simd_batch{ 0, 9, 0, 0 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 21-bit bundles 8 to 11 - words = simd_batch{ SafeLoad(in + 5), SafeLoad(in + 5) >> 29 | SafeLoad(in + 6) << 3, SafeLoad(in + 6) >> 18 | SafeLoad(in + 7) << 14, SafeLoad(in + 7) }; + words = simd_batch{ + SafeLoadAs(in + 4 * 5), + SafeLoadAs(in + 4 * 5) >> 29 | SafeLoadAs(in + 4 * 6) << 3, + SafeLoadAs(in + 4 * 6) >> 18 | SafeLoadAs(in + 4 * 7) << 14, + SafeLoadAs(in + 4 * 7), + }; shifts = simd_batch{ 8, 0, 0, 7 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 21-bit bundles 12 to 15 - words = simd_batch{ SafeLoad(in + 7) >> 28 | SafeLoad(in + 8) << 4, SafeLoad(in + 8) >> 17 | SafeLoad(in + 9) << 15, SafeLoad(in + 9), SafeLoad(in + 9) >> 27 | SafeLoad(in + 10) << 5 }; + words = simd_batch{ + SafeLoadAs(in + 4 * 7) >> 28 | SafeLoadAs(in + 4 * 8) << 4, + SafeLoadAs(in + 4 * 8) >> 17 | SafeLoadAs(in + 4 * 9) << 15, + SafeLoadAs(in + 4 * 9), + SafeLoadAs(in + 4 * 9) >> 27 | SafeLoadAs(in + 4 * 10) << 5, + }; shifts = simd_batch{ 0, 0, 6, 0 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 21-bit bundles 16 to 19 - words = simd_batch{ SafeLoad(in + 10) >> 16 | SafeLoad(in + 11) << 16, SafeLoad(in + 11), SafeLoad(in + 11) >> 26 | SafeLoad(in + 12) << 6, SafeLoad(in + 12) >> 15 | SafeLoad(in + 13) << 17 }; + words = simd_batch{ + SafeLoadAs(in + 4 * 10) >> 16 | SafeLoadAs(in + 4 * 11) << 16, + SafeLoadAs(in + 4 * 11), + SafeLoadAs(in + 4 * 11) >> 26 | SafeLoadAs(in + 4 * 12) << 6, + SafeLoadAs(in + 4 * 12) >> 15 | SafeLoadAs(in + 4 * 13) << 17, + }; shifts = simd_batch{ 0, 5, 0, 0 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 21-bit bundles 20 to 23 - words = simd_batch{ SafeLoad(in + 13), SafeLoad(in + 13) >> 25 | SafeLoad(in + 14) << 7, SafeLoad(in + 14) >> 14 | SafeLoad(in + 15) << 18, SafeLoad(in + 15) }; + words = simd_batch{ + SafeLoadAs(in + 4 * 13), + SafeLoadAs(in + 4 * 13) >> 25 | SafeLoadAs(in + 4 * 14) << 7, + SafeLoadAs(in + 4 * 14) >> 14 | SafeLoadAs(in + 4 * 15) << 18, + SafeLoadAs(in + 4 * 15), + }; shifts = simd_batch{ 4, 0, 0, 3 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 21-bit bundles 24 to 27 - words = simd_batch{ SafeLoad(in + 15) >> 24 | SafeLoad(in + 16) << 8, SafeLoad(in + 16) >> 13 | SafeLoad(in + 17) << 19, SafeLoad(in + 17), SafeLoad(in + 17) >> 23 | SafeLoad(in + 18) << 9 }; + words = simd_batch{ + SafeLoadAs(in + 4 * 15) >> 24 | SafeLoadAs(in + 4 * 16) << 8, + SafeLoadAs(in + 4 * 16) >> 13 | SafeLoadAs(in + 4 * 17) << 19, + SafeLoadAs(in + 4 * 17), + SafeLoadAs(in + 4 * 17) >> 23 | SafeLoadAs(in + 4 * 18) << 9, + }; shifts = simd_batch{ 0, 0, 2, 0 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 21-bit bundles 28 to 31 - words = simd_batch{ SafeLoad(in + 18) >> 12 | SafeLoad(in + 19) << 20, SafeLoad(in + 19), SafeLoad(in + 19) >> 22 | SafeLoad(in + 20) << 10, SafeLoad(in + 20) }; + words = simd_batch{ + SafeLoadAs(in + 4 * 18) >> 12 | SafeLoadAs(in + 4 * 19) << 20, + SafeLoadAs(in + 4 * 19), + SafeLoadAs(in + 4 * 19) >> 22 | SafeLoadAs(in + 4 * 20) << 10, + SafeLoadAs(in + 4 * 20), + }; shifts = simd_batch{ 0, 1, 0, 11 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; - in += 21; + in += 21 * 4; return in; } -inline static const uint32_t* unpack22_32(const uint32_t* in, uint32_t* out) { +inline static const uint8_t* unpack22_32(const uint8_t* in, uint32_t* out) { + using simd_batch = xsimd::make_sized_batch_t; + uint32_t mask = 0x3fffff; simd_batch masks(mask); @@ -1460,66 +3031,108 @@ inline static const uint32_t* unpack22_32(const uint32_t* in, uint32_t* out) { simd_batch results; // extract 22-bit bundles 0 to 3 - words = simd_batch{ SafeLoad(in + 0), SafeLoad(in + 0) >> 22 | SafeLoad(in + 1) << 10, SafeLoad(in + 1) >> 12 | SafeLoad(in + 2) << 20, SafeLoad(in + 2) }; + words = simd_batch{ + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0) >> 22 | SafeLoadAs(in + 4 * 1) << 10, + SafeLoadAs(in + 4 * 1) >> 12 | SafeLoadAs(in + 4 * 2) << 20, + SafeLoadAs(in + 4 * 2), + }; shifts = simd_batch{ 0, 0, 0, 2 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 22-bit bundles 4 to 7 - words = simd_batch{ SafeLoad(in + 2) >> 24 | SafeLoad(in + 3) << 8, SafeLoad(in + 3) >> 14 | SafeLoad(in + 4) << 18, SafeLoad(in + 4), SafeLoad(in + 4) >> 26 | SafeLoad(in + 5) << 6 }; + words = simd_batch{ + SafeLoadAs(in + 4 * 2) >> 24 | SafeLoadAs(in + 4 * 3) << 8, + SafeLoadAs(in + 4 * 3) >> 14 | SafeLoadAs(in + 4 * 4) << 18, + SafeLoadAs(in + 4 * 4), + SafeLoadAs(in + 4 * 4) >> 26 | SafeLoadAs(in + 4 * 5) << 6, + }; shifts = simd_batch{ 0, 0, 4, 0 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 22-bit bundles 8 to 11 - words = simd_batch{ SafeLoad(in + 5) >> 16 | SafeLoad(in + 6) << 16, SafeLoad(in + 6), SafeLoad(in + 6) >> 28 | SafeLoad(in + 7) << 4, SafeLoad(in + 7) >> 18 | SafeLoad(in + 8) << 14 }; + words = simd_batch{ + SafeLoadAs(in + 4 * 5) >> 16 | SafeLoadAs(in + 4 * 6) << 16, + SafeLoadAs(in + 4 * 6), + SafeLoadAs(in + 4 * 6) >> 28 | SafeLoadAs(in + 4 * 7) << 4, + SafeLoadAs(in + 4 * 7) >> 18 | SafeLoadAs(in + 4 * 8) << 14, + }; shifts = simd_batch{ 0, 6, 0, 0 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 22-bit bundles 12 to 15 - words = simd_batch{ SafeLoad(in + 8), SafeLoad(in + 8) >> 30 | SafeLoad(in + 9) << 2, SafeLoad(in + 9) >> 20 | SafeLoad(in + 10) << 12, SafeLoad(in + 10) }; + words = simd_batch{ + SafeLoadAs(in + 4 * 8), + SafeLoadAs(in + 4 * 8) >> 30 | SafeLoadAs(in + 4 * 9) << 2, + SafeLoadAs(in + 4 * 9) >> 20 | SafeLoadAs(in + 4 * 10) << 12, + SafeLoadAs(in + 4 * 10), + }; shifts = simd_batch{ 8, 0, 0, 10 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 22-bit bundles 16 to 19 - words = simd_batch{ SafeLoad(in + 11), SafeLoad(in + 11) >> 22 | SafeLoad(in + 12) << 10, SafeLoad(in + 12) >> 12 | SafeLoad(in + 13) << 20, SafeLoad(in + 13) }; + words = simd_batch{ + SafeLoadAs(in + 4 * 11), + SafeLoadAs(in + 4 * 11) >> 22 | SafeLoadAs(in + 4 * 12) << 10, + SafeLoadAs(in + 4 * 12) >> 12 | SafeLoadAs(in + 4 * 13) << 20, + SafeLoadAs(in + 4 * 13), + }; shifts = simd_batch{ 0, 0, 0, 2 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 22-bit bundles 20 to 23 - words = simd_batch{ SafeLoad(in + 13) >> 24 | SafeLoad(in + 14) << 8, SafeLoad(in + 14) >> 14 | SafeLoad(in + 15) << 18, SafeLoad(in + 15), SafeLoad(in + 15) >> 26 | SafeLoad(in + 16) << 6 }; + words = simd_batch{ + SafeLoadAs(in + 4 * 13) >> 24 | SafeLoadAs(in + 4 * 14) << 8, + SafeLoadAs(in + 4 * 14) >> 14 | SafeLoadAs(in + 4 * 15) << 18, + SafeLoadAs(in + 4 * 15), + SafeLoadAs(in + 4 * 15) >> 26 | SafeLoadAs(in + 4 * 16) << 6, + }; shifts = simd_batch{ 0, 0, 4, 0 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 22-bit bundles 24 to 27 - words = simd_batch{ SafeLoad(in + 16) >> 16 | SafeLoad(in + 17) << 16, SafeLoad(in + 17), SafeLoad(in + 17) >> 28 | SafeLoad(in + 18) << 4, SafeLoad(in + 18) >> 18 | SafeLoad(in + 19) << 14 }; + words = simd_batch{ + SafeLoadAs(in + 4 * 16) >> 16 | SafeLoadAs(in + 4 * 17) << 16, + SafeLoadAs(in + 4 * 17), + SafeLoadAs(in + 4 * 17) >> 28 | SafeLoadAs(in + 4 * 18) << 4, + SafeLoadAs(in + 4 * 18) >> 18 | SafeLoadAs(in + 4 * 19) << 14, + }; shifts = simd_batch{ 0, 6, 0, 0 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 22-bit bundles 28 to 31 - words = simd_batch{ SafeLoad(in + 19), SafeLoad(in + 19) >> 30 | SafeLoad(in + 20) << 2, SafeLoad(in + 20) >> 20 | SafeLoad(in + 21) << 12, SafeLoad(in + 21) }; + words = simd_batch{ + SafeLoadAs(in + 4 * 19), + SafeLoadAs(in + 4 * 19) >> 30 | SafeLoadAs(in + 4 * 20) << 2, + SafeLoadAs(in + 4 * 20) >> 20 | SafeLoadAs(in + 4 * 21) << 12, + SafeLoadAs(in + 4 * 21), + }; shifts = simd_batch{ 8, 0, 0, 10 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; - in += 22; + in += 22 * 4; return in; } -inline static const uint32_t* unpack23_32(const uint32_t* in, uint32_t* out) { +inline static const uint8_t* unpack23_32(const uint8_t* in, uint32_t* out) { + using simd_batch = xsimd::make_sized_batch_t; + uint32_t mask = 0x7fffff; simd_batch masks(mask); @@ -1527,66 +3140,108 @@ inline static const uint32_t* unpack23_32(const uint32_t* in, uint32_t* out) { simd_batch results; // extract 23-bit bundles 0 to 3 - words = simd_batch{ SafeLoad(in + 0), SafeLoad(in + 0) >> 23 | SafeLoad(in + 1) << 9, SafeLoad(in + 1) >> 14 | SafeLoad(in + 2) << 18, SafeLoad(in + 2) }; + words = simd_batch{ + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0) >> 23 | SafeLoadAs(in + 4 * 1) << 9, + SafeLoadAs(in + 4 * 1) >> 14 | SafeLoadAs(in + 4 * 2) << 18, + SafeLoadAs(in + 4 * 2), + }; shifts = simd_batch{ 0, 0, 0, 5 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 23-bit bundles 4 to 7 - words = simd_batch{ SafeLoad(in + 2) >> 28 | SafeLoad(in + 3) << 4, SafeLoad(in + 3) >> 19 | SafeLoad(in + 4) << 13, SafeLoad(in + 4) >> 10 | SafeLoad(in + 5) << 22, SafeLoad(in + 5) }; + words = simd_batch{ + SafeLoadAs(in + 4 * 2) >> 28 | SafeLoadAs(in + 4 * 3) << 4, + SafeLoadAs(in + 4 * 3) >> 19 | SafeLoadAs(in + 4 * 4) << 13, + SafeLoadAs(in + 4 * 4) >> 10 | SafeLoadAs(in + 4 * 5) << 22, + SafeLoadAs(in + 4 * 5), + }; shifts = simd_batch{ 0, 0, 0, 1 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 23-bit bundles 8 to 11 - words = simd_batch{ SafeLoad(in + 5) >> 24 | SafeLoad(in + 6) << 8, SafeLoad(in + 6) >> 15 | SafeLoad(in + 7) << 17, SafeLoad(in + 7), SafeLoad(in + 7) >> 29 | SafeLoad(in + 8) << 3 }; + words = simd_batch{ + SafeLoadAs(in + 4 * 5) >> 24 | SafeLoadAs(in + 4 * 6) << 8, + SafeLoadAs(in + 4 * 6) >> 15 | SafeLoadAs(in + 4 * 7) << 17, + SafeLoadAs(in + 4 * 7), + SafeLoadAs(in + 4 * 7) >> 29 | SafeLoadAs(in + 4 * 8) << 3, + }; shifts = simd_batch{ 0, 0, 6, 0 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 23-bit bundles 12 to 15 - words = simd_batch{ SafeLoad(in + 8) >> 20 | SafeLoad(in + 9) << 12, SafeLoad(in + 9) >> 11 | SafeLoad(in + 10) << 21, SafeLoad(in + 10), SafeLoad(in + 10) >> 25 | SafeLoad(in + 11) << 7 }; + words = simd_batch{ + SafeLoadAs(in + 4 * 8) >> 20 | SafeLoadAs(in + 4 * 9) << 12, + SafeLoadAs(in + 4 * 9) >> 11 | SafeLoadAs(in + 4 * 10) << 21, + SafeLoadAs(in + 4 * 10), + SafeLoadAs(in + 4 * 10) >> 25 | SafeLoadAs(in + 4 * 11) << 7, + }; shifts = simd_batch{ 0, 0, 2, 0 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 23-bit bundles 16 to 19 - words = simd_batch{ SafeLoad(in + 11) >> 16 | SafeLoad(in + 12) << 16, SafeLoad(in + 12), SafeLoad(in + 12) >> 30 | SafeLoad(in + 13) << 2, SafeLoad(in + 13) >> 21 | SafeLoad(in + 14) << 11 }; + words = simd_batch{ + SafeLoadAs(in + 4 * 11) >> 16 | SafeLoadAs(in + 4 * 12) << 16, + SafeLoadAs(in + 4 * 12), + SafeLoadAs(in + 4 * 12) >> 30 | SafeLoadAs(in + 4 * 13) << 2, + SafeLoadAs(in + 4 * 13) >> 21 | SafeLoadAs(in + 4 * 14) << 11, + }; shifts = simd_batch{ 0, 7, 0, 0 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 23-bit bundles 20 to 23 - words = simd_batch{ SafeLoad(in + 14) >> 12 | SafeLoad(in + 15) << 20, SafeLoad(in + 15), SafeLoad(in + 15) >> 26 | SafeLoad(in + 16) << 6, SafeLoad(in + 16) >> 17 | SafeLoad(in + 17) << 15 }; + words = simd_batch{ + SafeLoadAs(in + 4 * 14) >> 12 | SafeLoadAs(in + 4 * 15) << 20, + SafeLoadAs(in + 4 * 15), + SafeLoadAs(in + 4 * 15) >> 26 | SafeLoadAs(in + 4 * 16) << 6, + SafeLoadAs(in + 4 * 16) >> 17 | SafeLoadAs(in + 4 * 17) << 15, + }; shifts = simd_batch{ 0, 3, 0, 0 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 23-bit bundles 24 to 27 - words = simd_batch{ SafeLoad(in + 17), SafeLoad(in + 17) >> 31 | SafeLoad(in + 18) << 1, SafeLoad(in + 18) >> 22 | SafeLoad(in + 19) << 10, SafeLoad(in + 19) >> 13 | SafeLoad(in + 20) << 19 }; + words = simd_batch{ + SafeLoadAs(in + 4 * 17), + SafeLoadAs(in + 4 * 17) >> 31 | SafeLoadAs(in + 4 * 18) << 1, + SafeLoadAs(in + 4 * 18) >> 22 | SafeLoadAs(in + 4 * 19) << 10, + SafeLoadAs(in + 4 * 19) >> 13 | SafeLoadAs(in + 4 * 20) << 19, + }; shifts = simd_batch{ 8, 0, 0, 0 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 23-bit bundles 28 to 31 - words = simd_batch{ SafeLoad(in + 20), SafeLoad(in + 20) >> 27 | SafeLoad(in + 21) << 5, SafeLoad(in + 21) >> 18 | SafeLoad(in + 22) << 14, SafeLoad(in + 22) }; + words = simd_batch{ + SafeLoadAs(in + 4 * 20), + SafeLoadAs(in + 4 * 20) >> 27 | SafeLoadAs(in + 4 * 21) << 5, + SafeLoadAs(in + 4 * 21) >> 18 | SafeLoadAs(in + 4 * 22) << 14, + SafeLoadAs(in + 4 * 22), + }; shifts = simd_batch{ 4, 0, 0, 9 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; - in += 23; + in += 23 * 4; return in; } -inline static const uint32_t* unpack24_32(const uint32_t* in, uint32_t* out) { +inline static const uint8_t* unpack24_32(const uint8_t* in, uint32_t* out) { + using simd_batch = xsimd::make_sized_batch_t; + uint32_t mask = 0xffffff; simd_batch masks(mask); @@ -1594,66 +3249,108 @@ inline static const uint32_t* unpack24_32(const uint32_t* in, uint32_t* out) { simd_batch results; // extract 24-bit bundles 0 to 3 - words = simd_batch{ SafeLoad(in + 0), SafeLoad(in + 0) >> 24 | SafeLoad(in + 1) << 8, SafeLoad(in + 1) >> 16 | SafeLoad(in + 2) << 16, SafeLoad(in + 2) }; + words = simd_batch{ + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0) >> 24 | SafeLoadAs(in + 4 * 1) << 8, + SafeLoadAs(in + 4 * 1) >> 16 | SafeLoadAs(in + 4 * 2) << 16, + SafeLoadAs(in + 4 * 2), + }; shifts = simd_batch{ 0, 0, 0, 8 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 24-bit bundles 4 to 7 - words = simd_batch{ SafeLoad(in + 3), SafeLoad(in + 3) >> 24 | SafeLoad(in + 4) << 8, SafeLoad(in + 4) >> 16 | SafeLoad(in + 5) << 16, SafeLoad(in + 5) }; + words = simd_batch{ + SafeLoadAs(in + 4 * 3), + SafeLoadAs(in + 4 * 3) >> 24 | SafeLoadAs(in + 4 * 4) << 8, + SafeLoadAs(in + 4 * 4) >> 16 | SafeLoadAs(in + 4 * 5) << 16, + SafeLoadAs(in + 4 * 5), + }; shifts = simd_batch{ 0, 0, 0, 8 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 24-bit bundles 8 to 11 - words = simd_batch{ SafeLoad(in + 6), SafeLoad(in + 6) >> 24 | SafeLoad(in + 7) << 8, SafeLoad(in + 7) >> 16 | SafeLoad(in + 8) << 16, SafeLoad(in + 8) }; + words = simd_batch{ + SafeLoadAs(in + 4 * 6), + SafeLoadAs(in + 4 * 6) >> 24 | SafeLoadAs(in + 4 * 7) << 8, + SafeLoadAs(in + 4 * 7) >> 16 | SafeLoadAs(in + 4 * 8) << 16, + SafeLoadAs(in + 4 * 8), + }; shifts = simd_batch{ 0, 0, 0, 8 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 24-bit bundles 12 to 15 - words = simd_batch{ SafeLoad(in + 9), SafeLoad(in + 9) >> 24 | SafeLoad(in + 10) << 8, SafeLoad(in + 10) >> 16 | SafeLoad(in + 11) << 16, SafeLoad(in + 11) }; + words = simd_batch{ + SafeLoadAs(in + 4 * 9), + SafeLoadAs(in + 4 * 9) >> 24 | SafeLoadAs(in + 4 * 10) << 8, + SafeLoadAs(in + 4 * 10) >> 16 | SafeLoadAs(in + 4 * 11) << 16, + SafeLoadAs(in + 4 * 11), + }; shifts = simd_batch{ 0, 0, 0, 8 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 24-bit bundles 16 to 19 - words = simd_batch{ SafeLoad(in + 12), SafeLoad(in + 12) >> 24 | SafeLoad(in + 13) << 8, SafeLoad(in + 13) >> 16 | SafeLoad(in + 14) << 16, SafeLoad(in + 14) }; + words = simd_batch{ + SafeLoadAs(in + 4 * 12), + SafeLoadAs(in + 4 * 12) >> 24 | SafeLoadAs(in + 4 * 13) << 8, + SafeLoadAs(in + 4 * 13) >> 16 | SafeLoadAs(in + 4 * 14) << 16, + SafeLoadAs(in + 4 * 14), + }; shifts = simd_batch{ 0, 0, 0, 8 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 24-bit bundles 20 to 23 - words = simd_batch{ SafeLoad(in + 15), SafeLoad(in + 15) >> 24 | SafeLoad(in + 16) << 8, SafeLoad(in + 16) >> 16 | SafeLoad(in + 17) << 16, SafeLoad(in + 17) }; + words = simd_batch{ + SafeLoadAs(in + 4 * 15), + SafeLoadAs(in + 4 * 15) >> 24 | SafeLoadAs(in + 4 * 16) << 8, + SafeLoadAs(in + 4 * 16) >> 16 | SafeLoadAs(in + 4 * 17) << 16, + SafeLoadAs(in + 4 * 17), + }; shifts = simd_batch{ 0, 0, 0, 8 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 24-bit bundles 24 to 27 - words = simd_batch{ SafeLoad(in + 18), SafeLoad(in + 18) >> 24 | SafeLoad(in + 19) << 8, SafeLoad(in + 19) >> 16 | SafeLoad(in + 20) << 16, SafeLoad(in + 20) }; + words = simd_batch{ + SafeLoadAs(in + 4 * 18), + SafeLoadAs(in + 4 * 18) >> 24 | SafeLoadAs(in + 4 * 19) << 8, + SafeLoadAs(in + 4 * 19) >> 16 | SafeLoadAs(in + 4 * 20) << 16, + SafeLoadAs(in + 4 * 20), + }; shifts = simd_batch{ 0, 0, 0, 8 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 24-bit bundles 28 to 31 - words = simd_batch{ SafeLoad(in + 21), SafeLoad(in + 21) >> 24 | SafeLoad(in + 22) << 8, SafeLoad(in + 22) >> 16 | SafeLoad(in + 23) << 16, SafeLoad(in + 23) }; + words = simd_batch{ + SafeLoadAs(in + 4 * 21), + SafeLoadAs(in + 4 * 21) >> 24 | SafeLoadAs(in + 4 * 22) << 8, + SafeLoadAs(in + 4 * 22) >> 16 | SafeLoadAs(in + 4 * 23) << 16, + SafeLoadAs(in + 4 * 23), + }; shifts = simd_batch{ 0, 0, 0, 8 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; - in += 24; + in += 24 * 4; return in; } -inline static const uint32_t* unpack25_32(const uint32_t* in, uint32_t* out) { +inline static const uint8_t* unpack25_32(const uint8_t* in, uint32_t* out) { + using simd_batch = xsimd::make_sized_batch_t; + uint32_t mask = 0x1ffffff; simd_batch masks(mask); @@ -1661,66 +3358,108 @@ inline static const uint32_t* unpack25_32(const uint32_t* in, uint32_t* out) { simd_batch results; // extract 25-bit bundles 0 to 3 - words = simd_batch{ SafeLoad(in + 0), SafeLoad(in + 0) >> 25 | SafeLoad(in + 1) << 7, SafeLoad(in + 1) >> 18 | SafeLoad(in + 2) << 14, SafeLoad(in + 2) >> 11 | SafeLoad(in + 3) << 21 }; + words = simd_batch{ + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0) >> 25 | SafeLoadAs(in + 4 * 1) << 7, + SafeLoadAs(in + 4 * 1) >> 18 | SafeLoadAs(in + 4 * 2) << 14, + SafeLoadAs(in + 4 * 2) >> 11 | SafeLoadAs(in + 4 * 3) << 21, + }; shifts = simd_batch{ 0, 0, 0, 0 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 25-bit bundles 4 to 7 - words = simd_batch{ SafeLoad(in + 3), SafeLoad(in + 3) >> 29 | SafeLoad(in + 4) << 3, SafeLoad(in + 4) >> 22 | SafeLoad(in + 5) << 10, SafeLoad(in + 5) >> 15 | SafeLoad(in + 6) << 17 }; + words = simd_batch{ + SafeLoadAs(in + 4 * 3), + SafeLoadAs(in + 4 * 3) >> 29 | SafeLoadAs(in + 4 * 4) << 3, + SafeLoadAs(in + 4 * 4) >> 22 | SafeLoadAs(in + 4 * 5) << 10, + SafeLoadAs(in + 4 * 5) >> 15 | SafeLoadAs(in + 4 * 6) << 17, + }; shifts = simd_batch{ 4, 0, 0, 0 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 25-bit bundles 8 to 11 - words = simd_batch{ SafeLoad(in + 6) >> 8 | SafeLoad(in + 7) << 24, SafeLoad(in + 7), SafeLoad(in + 7) >> 26 | SafeLoad(in + 8) << 6, SafeLoad(in + 8) >> 19 | SafeLoad(in + 9) << 13 }; + words = simd_batch{ + SafeLoadAs(in + 4 * 6) >> 8 | SafeLoadAs(in + 4 * 7) << 24, + SafeLoadAs(in + 4 * 7), + SafeLoadAs(in + 4 * 7) >> 26 | SafeLoadAs(in + 4 * 8) << 6, + SafeLoadAs(in + 4 * 8) >> 19 | SafeLoadAs(in + 4 * 9) << 13, + }; shifts = simd_batch{ 0, 1, 0, 0 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 25-bit bundles 12 to 15 - words = simd_batch{ SafeLoad(in + 9) >> 12 | SafeLoad(in + 10) << 20, SafeLoad(in + 10), SafeLoad(in + 10) >> 30 | SafeLoad(in + 11) << 2, SafeLoad(in + 11) >> 23 | SafeLoad(in + 12) << 9 }; + words = simd_batch{ + SafeLoadAs(in + 4 * 9) >> 12 | SafeLoadAs(in + 4 * 10) << 20, + SafeLoadAs(in + 4 * 10), + SafeLoadAs(in + 4 * 10) >> 30 | SafeLoadAs(in + 4 * 11) << 2, + SafeLoadAs(in + 4 * 11) >> 23 | SafeLoadAs(in + 4 * 12) << 9, + }; shifts = simd_batch{ 0, 5, 0, 0 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 25-bit bundles 16 to 19 - words = simd_batch{ SafeLoad(in + 12) >> 16 | SafeLoad(in + 13) << 16, SafeLoad(in + 13) >> 9 | SafeLoad(in + 14) << 23, SafeLoad(in + 14), SafeLoad(in + 14) >> 27 | SafeLoad(in + 15) << 5 }; + words = simd_batch{ + SafeLoadAs(in + 4 * 12) >> 16 | SafeLoadAs(in + 4 * 13) << 16, + SafeLoadAs(in + 4 * 13) >> 9 | SafeLoadAs(in + 4 * 14) << 23, + SafeLoadAs(in + 4 * 14), + SafeLoadAs(in + 4 * 14) >> 27 | SafeLoadAs(in + 4 * 15) << 5, + }; shifts = simd_batch{ 0, 0, 2, 0 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 25-bit bundles 20 to 23 - words = simd_batch{ SafeLoad(in + 15) >> 20 | SafeLoad(in + 16) << 12, SafeLoad(in + 16) >> 13 | SafeLoad(in + 17) << 19, SafeLoad(in + 17), SafeLoad(in + 17) >> 31 | SafeLoad(in + 18) << 1 }; + words = simd_batch{ + SafeLoadAs(in + 4 * 15) >> 20 | SafeLoadAs(in + 4 * 16) << 12, + SafeLoadAs(in + 4 * 16) >> 13 | SafeLoadAs(in + 4 * 17) << 19, + SafeLoadAs(in + 4 * 17), + SafeLoadAs(in + 4 * 17) >> 31 | SafeLoadAs(in + 4 * 18) << 1, + }; shifts = simd_batch{ 0, 0, 6, 0 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 25-bit bundles 24 to 27 - words = simd_batch{ SafeLoad(in + 18) >> 24 | SafeLoad(in + 19) << 8, SafeLoad(in + 19) >> 17 | SafeLoad(in + 20) << 15, SafeLoad(in + 20) >> 10 | SafeLoad(in + 21) << 22, SafeLoad(in + 21) }; + words = simd_batch{ + SafeLoadAs(in + 4 * 18) >> 24 | SafeLoadAs(in + 4 * 19) << 8, + SafeLoadAs(in + 4 * 19) >> 17 | SafeLoadAs(in + 4 * 20) << 15, + SafeLoadAs(in + 4 * 20) >> 10 | SafeLoadAs(in + 4 * 21) << 22, + SafeLoadAs(in + 4 * 21), + }; shifts = simd_batch{ 0, 0, 0, 3 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 25-bit bundles 28 to 31 - words = simd_batch{ SafeLoad(in + 21) >> 28 | SafeLoad(in + 22) << 4, SafeLoad(in + 22) >> 21 | SafeLoad(in + 23) << 11, SafeLoad(in + 23) >> 14 | SafeLoad(in + 24) << 18, SafeLoad(in + 24) }; + words = simd_batch{ + SafeLoadAs(in + 4 * 21) >> 28 | SafeLoadAs(in + 4 * 22) << 4, + SafeLoadAs(in + 4 * 22) >> 21 | SafeLoadAs(in + 4 * 23) << 11, + SafeLoadAs(in + 4 * 23) >> 14 | SafeLoadAs(in + 4 * 24) << 18, + SafeLoadAs(in + 4 * 24), + }; shifts = simd_batch{ 0, 0, 0, 7 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; - in += 25; + in += 25 * 4; return in; } -inline static const uint32_t* unpack26_32(const uint32_t* in, uint32_t* out) { +inline static const uint8_t* unpack26_32(const uint8_t* in, uint32_t* out) { + using simd_batch = xsimd::make_sized_batch_t; + uint32_t mask = 0x3ffffff; simd_batch masks(mask); @@ -1728,66 +3467,108 @@ inline static const uint32_t* unpack26_32(const uint32_t* in, uint32_t* out) { simd_batch results; // extract 26-bit bundles 0 to 3 - words = simd_batch{ SafeLoad(in + 0), SafeLoad(in + 0) >> 26 | SafeLoad(in + 1) << 6, SafeLoad(in + 1) >> 20 | SafeLoad(in + 2) << 12, SafeLoad(in + 2) >> 14 | SafeLoad(in + 3) << 18 }; + words = simd_batch{ + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0) >> 26 | SafeLoadAs(in + 4 * 1) << 6, + SafeLoadAs(in + 4 * 1) >> 20 | SafeLoadAs(in + 4 * 2) << 12, + SafeLoadAs(in + 4 * 2) >> 14 | SafeLoadAs(in + 4 * 3) << 18, + }; shifts = simd_batch{ 0, 0, 0, 0 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 26-bit bundles 4 to 7 - words = simd_batch{ SafeLoad(in + 3) >> 8 | SafeLoad(in + 4) << 24, SafeLoad(in + 4), SafeLoad(in + 4) >> 28 | SafeLoad(in + 5) << 4, SafeLoad(in + 5) >> 22 | SafeLoad(in + 6) << 10 }; + words = simd_batch{ + SafeLoadAs(in + 4 * 3) >> 8 | SafeLoadAs(in + 4 * 4) << 24, + SafeLoadAs(in + 4 * 4), + SafeLoadAs(in + 4 * 4) >> 28 | SafeLoadAs(in + 4 * 5) << 4, + SafeLoadAs(in + 4 * 5) >> 22 | SafeLoadAs(in + 4 * 6) << 10, + }; shifts = simd_batch{ 0, 2, 0, 0 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 26-bit bundles 8 to 11 - words = simd_batch{ SafeLoad(in + 6) >> 16 | SafeLoad(in + 7) << 16, SafeLoad(in + 7) >> 10 | SafeLoad(in + 8) << 22, SafeLoad(in + 8), SafeLoad(in + 8) >> 30 | SafeLoad(in + 9) << 2 }; + words = simd_batch{ + SafeLoadAs(in + 4 * 6) >> 16 | SafeLoadAs(in + 4 * 7) << 16, + SafeLoadAs(in + 4 * 7) >> 10 | SafeLoadAs(in + 4 * 8) << 22, + SafeLoadAs(in + 4 * 8), + SafeLoadAs(in + 4 * 8) >> 30 | SafeLoadAs(in + 4 * 9) << 2, + }; shifts = simd_batch{ 0, 0, 4, 0 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 26-bit bundles 12 to 15 - words = simd_batch{ SafeLoad(in + 9) >> 24 | SafeLoad(in + 10) << 8, SafeLoad(in + 10) >> 18 | SafeLoad(in + 11) << 14, SafeLoad(in + 11) >> 12 | SafeLoad(in + 12) << 20, SafeLoad(in + 12) }; + words = simd_batch{ + SafeLoadAs(in + 4 * 9) >> 24 | SafeLoadAs(in + 4 * 10) << 8, + SafeLoadAs(in + 4 * 10) >> 18 | SafeLoadAs(in + 4 * 11) << 14, + SafeLoadAs(in + 4 * 11) >> 12 | SafeLoadAs(in + 4 * 12) << 20, + SafeLoadAs(in + 4 * 12), + }; shifts = simd_batch{ 0, 0, 0, 6 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 26-bit bundles 16 to 19 - words = simd_batch{ SafeLoad(in + 13), SafeLoad(in + 13) >> 26 | SafeLoad(in + 14) << 6, SafeLoad(in + 14) >> 20 | SafeLoad(in + 15) << 12, SafeLoad(in + 15) >> 14 | SafeLoad(in + 16) << 18 }; + words = simd_batch{ + SafeLoadAs(in + 4 * 13), + SafeLoadAs(in + 4 * 13) >> 26 | SafeLoadAs(in + 4 * 14) << 6, + SafeLoadAs(in + 4 * 14) >> 20 | SafeLoadAs(in + 4 * 15) << 12, + SafeLoadAs(in + 4 * 15) >> 14 | SafeLoadAs(in + 4 * 16) << 18, + }; shifts = simd_batch{ 0, 0, 0, 0 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 26-bit bundles 20 to 23 - words = simd_batch{ SafeLoad(in + 16) >> 8 | SafeLoad(in + 17) << 24, SafeLoad(in + 17), SafeLoad(in + 17) >> 28 | SafeLoad(in + 18) << 4, SafeLoad(in + 18) >> 22 | SafeLoad(in + 19) << 10 }; + words = simd_batch{ + SafeLoadAs(in + 4 * 16) >> 8 | SafeLoadAs(in + 4 * 17) << 24, + SafeLoadAs(in + 4 * 17), + SafeLoadAs(in + 4 * 17) >> 28 | SafeLoadAs(in + 4 * 18) << 4, + SafeLoadAs(in + 4 * 18) >> 22 | SafeLoadAs(in + 4 * 19) << 10, + }; shifts = simd_batch{ 0, 2, 0, 0 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 26-bit bundles 24 to 27 - words = simd_batch{ SafeLoad(in + 19) >> 16 | SafeLoad(in + 20) << 16, SafeLoad(in + 20) >> 10 | SafeLoad(in + 21) << 22, SafeLoad(in + 21), SafeLoad(in + 21) >> 30 | SafeLoad(in + 22) << 2 }; + words = simd_batch{ + SafeLoadAs(in + 4 * 19) >> 16 | SafeLoadAs(in + 4 * 20) << 16, + SafeLoadAs(in + 4 * 20) >> 10 | SafeLoadAs(in + 4 * 21) << 22, + SafeLoadAs(in + 4 * 21), + SafeLoadAs(in + 4 * 21) >> 30 | SafeLoadAs(in + 4 * 22) << 2, + }; shifts = simd_batch{ 0, 0, 4, 0 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 26-bit bundles 28 to 31 - words = simd_batch{ SafeLoad(in + 22) >> 24 | SafeLoad(in + 23) << 8, SafeLoad(in + 23) >> 18 | SafeLoad(in + 24) << 14, SafeLoad(in + 24) >> 12 | SafeLoad(in + 25) << 20, SafeLoad(in + 25) }; + words = simd_batch{ + SafeLoadAs(in + 4 * 22) >> 24 | SafeLoadAs(in + 4 * 23) << 8, + SafeLoadAs(in + 4 * 23) >> 18 | SafeLoadAs(in + 4 * 24) << 14, + SafeLoadAs(in + 4 * 24) >> 12 | SafeLoadAs(in + 4 * 25) << 20, + SafeLoadAs(in + 4 * 25), + }; shifts = simd_batch{ 0, 0, 0, 6 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; - in += 26; + in += 26 * 4; return in; } -inline static const uint32_t* unpack27_32(const uint32_t* in, uint32_t* out) { +inline static const uint8_t* unpack27_32(const uint8_t* in, uint32_t* out) { + using simd_batch = xsimd::make_sized_batch_t; + uint32_t mask = 0x7ffffff; simd_batch masks(mask); @@ -1795,66 +3576,108 @@ inline static const uint32_t* unpack27_32(const uint32_t* in, uint32_t* out) { simd_batch results; // extract 27-bit bundles 0 to 3 - words = simd_batch{ SafeLoad(in + 0), SafeLoad(in + 0) >> 27 | SafeLoad(in + 1) << 5, SafeLoad(in + 1) >> 22 | SafeLoad(in + 2) << 10, SafeLoad(in + 2) >> 17 | SafeLoad(in + 3) << 15 }; + words = simd_batch{ + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0) >> 27 | SafeLoadAs(in + 4 * 1) << 5, + SafeLoadAs(in + 4 * 1) >> 22 | SafeLoadAs(in + 4 * 2) << 10, + SafeLoadAs(in + 4 * 2) >> 17 | SafeLoadAs(in + 4 * 3) << 15, + }; shifts = simd_batch{ 0, 0, 0, 0 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 27-bit bundles 4 to 7 - words = simd_batch{ SafeLoad(in + 3) >> 12 | SafeLoad(in + 4) << 20, SafeLoad(in + 4) >> 7 | SafeLoad(in + 5) << 25, SafeLoad(in + 5), SafeLoad(in + 5) >> 29 | SafeLoad(in + 6) << 3 }; + words = simd_batch{ + SafeLoadAs(in + 4 * 3) >> 12 | SafeLoadAs(in + 4 * 4) << 20, + SafeLoadAs(in + 4 * 4) >> 7 | SafeLoadAs(in + 4 * 5) << 25, + SafeLoadAs(in + 4 * 5), + SafeLoadAs(in + 4 * 5) >> 29 | SafeLoadAs(in + 4 * 6) << 3, + }; shifts = simd_batch{ 0, 0, 2, 0 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 27-bit bundles 8 to 11 - words = simd_batch{ SafeLoad(in + 6) >> 24 | SafeLoad(in + 7) << 8, SafeLoad(in + 7) >> 19 | SafeLoad(in + 8) << 13, SafeLoad(in + 8) >> 14 | SafeLoad(in + 9) << 18, SafeLoad(in + 9) >> 9 | SafeLoad(in + 10) << 23 }; + words = simd_batch{ + SafeLoadAs(in + 4 * 6) >> 24 | SafeLoadAs(in + 4 * 7) << 8, + SafeLoadAs(in + 4 * 7) >> 19 | SafeLoadAs(in + 4 * 8) << 13, + SafeLoadAs(in + 4 * 8) >> 14 | SafeLoadAs(in + 4 * 9) << 18, + SafeLoadAs(in + 4 * 9) >> 9 | SafeLoadAs(in + 4 * 10) << 23, + }; shifts = simd_batch{ 0, 0, 0, 0 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 27-bit bundles 12 to 15 - words = simd_batch{ SafeLoad(in + 10), SafeLoad(in + 10) >> 31 | SafeLoad(in + 11) << 1, SafeLoad(in + 11) >> 26 | SafeLoad(in + 12) << 6, SafeLoad(in + 12) >> 21 | SafeLoad(in + 13) << 11 }; + words = simd_batch{ + SafeLoadAs(in + 4 * 10), + SafeLoadAs(in + 4 * 10) >> 31 | SafeLoadAs(in + 4 * 11) << 1, + SafeLoadAs(in + 4 * 11) >> 26 | SafeLoadAs(in + 4 * 12) << 6, + SafeLoadAs(in + 4 * 12) >> 21 | SafeLoadAs(in + 4 * 13) << 11, + }; shifts = simd_batch{ 4, 0, 0, 0 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 27-bit bundles 16 to 19 - words = simd_batch{ SafeLoad(in + 13) >> 16 | SafeLoad(in + 14) << 16, SafeLoad(in + 14) >> 11 | SafeLoad(in + 15) << 21, SafeLoad(in + 15) >> 6 | SafeLoad(in + 16) << 26, SafeLoad(in + 16) }; + words = simd_batch{ + SafeLoadAs(in + 4 * 13) >> 16 | SafeLoadAs(in + 4 * 14) << 16, + SafeLoadAs(in + 4 * 14) >> 11 | SafeLoadAs(in + 4 * 15) << 21, + SafeLoadAs(in + 4 * 15) >> 6 | SafeLoadAs(in + 4 * 16) << 26, + SafeLoadAs(in + 4 * 16), + }; shifts = simd_batch{ 0, 0, 0, 1 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 27-bit bundles 20 to 23 - words = simd_batch{ SafeLoad(in + 16) >> 28 | SafeLoad(in + 17) << 4, SafeLoad(in + 17) >> 23 | SafeLoad(in + 18) << 9, SafeLoad(in + 18) >> 18 | SafeLoad(in + 19) << 14, SafeLoad(in + 19) >> 13 | SafeLoad(in + 20) << 19 }; + words = simd_batch{ + SafeLoadAs(in + 4 * 16) >> 28 | SafeLoadAs(in + 4 * 17) << 4, + SafeLoadAs(in + 4 * 17) >> 23 | SafeLoadAs(in + 4 * 18) << 9, + SafeLoadAs(in + 4 * 18) >> 18 | SafeLoadAs(in + 4 * 19) << 14, + SafeLoadAs(in + 4 * 19) >> 13 | SafeLoadAs(in + 4 * 20) << 19, + }; shifts = simd_batch{ 0, 0, 0, 0 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 27-bit bundles 24 to 27 - words = simd_batch{ SafeLoad(in + 20) >> 8 | SafeLoad(in + 21) << 24, SafeLoad(in + 21), SafeLoad(in + 21) >> 30 | SafeLoad(in + 22) << 2, SafeLoad(in + 22) >> 25 | SafeLoad(in + 23) << 7 }; + words = simd_batch{ + SafeLoadAs(in + 4 * 20) >> 8 | SafeLoadAs(in + 4 * 21) << 24, + SafeLoadAs(in + 4 * 21), + SafeLoadAs(in + 4 * 21) >> 30 | SafeLoadAs(in + 4 * 22) << 2, + SafeLoadAs(in + 4 * 22) >> 25 | SafeLoadAs(in + 4 * 23) << 7, + }; shifts = simd_batch{ 0, 3, 0, 0 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 27-bit bundles 28 to 31 - words = simd_batch{ SafeLoad(in + 23) >> 20 | SafeLoad(in + 24) << 12, SafeLoad(in + 24) >> 15 | SafeLoad(in + 25) << 17, SafeLoad(in + 25) >> 10 | SafeLoad(in + 26) << 22, SafeLoad(in + 26) }; + words = simd_batch{ + SafeLoadAs(in + 4 * 23) >> 20 | SafeLoadAs(in + 4 * 24) << 12, + SafeLoadAs(in + 4 * 24) >> 15 | SafeLoadAs(in + 4 * 25) << 17, + SafeLoadAs(in + 4 * 25) >> 10 | SafeLoadAs(in + 4 * 26) << 22, + SafeLoadAs(in + 4 * 26), + }; shifts = simd_batch{ 0, 0, 0, 5 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; - in += 27; + in += 27 * 4; return in; } -inline static const uint32_t* unpack28_32(const uint32_t* in, uint32_t* out) { +inline static const uint8_t* unpack28_32(const uint8_t* in, uint32_t* out) { + using simd_batch = xsimd::make_sized_batch_t; + uint32_t mask = 0xfffffff; simd_batch masks(mask); @@ -1862,66 +3685,108 @@ inline static const uint32_t* unpack28_32(const uint32_t* in, uint32_t* out) { simd_batch results; // extract 28-bit bundles 0 to 3 - words = simd_batch{ SafeLoad(in + 0), SafeLoad(in + 0) >> 28 | SafeLoad(in + 1) << 4, SafeLoad(in + 1) >> 24 | SafeLoad(in + 2) << 8, SafeLoad(in + 2) >> 20 | SafeLoad(in + 3) << 12 }; + words = simd_batch{ + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0) >> 28 | SafeLoadAs(in + 4 * 1) << 4, + SafeLoadAs(in + 4 * 1) >> 24 | SafeLoadAs(in + 4 * 2) << 8, + SafeLoadAs(in + 4 * 2) >> 20 | SafeLoadAs(in + 4 * 3) << 12, + }; shifts = simd_batch{ 0, 0, 0, 0 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 28-bit bundles 4 to 7 - words = simd_batch{ SafeLoad(in + 3) >> 16 | SafeLoad(in + 4) << 16, SafeLoad(in + 4) >> 12 | SafeLoad(in + 5) << 20, SafeLoad(in + 5) >> 8 | SafeLoad(in + 6) << 24, SafeLoad(in + 6) }; + words = simd_batch{ + SafeLoadAs(in + 4 * 3) >> 16 | SafeLoadAs(in + 4 * 4) << 16, + SafeLoadAs(in + 4 * 4) >> 12 | SafeLoadAs(in + 4 * 5) << 20, + SafeLoadAs(in + 4 * 5) >> 8 | SafeLoadAs(in + 4 * 6) << 24, + SafeLoadAs(in + 4 * 6), + }; shifts = simd_batch{ 0, 0, 0, 4 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 28-bit bundles 8 to 11 - words = simd_batch{ SafeLoad(in + 7), SafeLoad(in + 7) >> 28 | SafeLoad(in + 8) << 4, SafeLoad(in + 8) >> 24 | SafeLoad(in + 9) << 8, SafeLoad(in + 9) >> 20 | SafeLoad(in + 10) << 12 }; + words = simd_batch{ + SafeLoadAs(in + 4 * 7), + SafeLoadAs(in + 4 * 7) >> 28 | SafeLoadAs(in + 4 * 8) << 4, + SafeLoadAs(in + 4 * 8) >> 24 | SafeLoadAs(in + 4 * 9) << 8, + SafeLoadAs(in + 4 * 9) >> 20 | SafeLoadAs(in + 4 * 10) << 12, + }; shifts = simd_batch{ 0, 0, 0, 0 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 28-bit bundles 12 to 15 - words = simd_batch{ SafeLoad(in + 10) >> 16 | SafeLoad(in + 11) << 16, SafeLoad(in + 11) >> 12 | SafeLoad(in + 12) << 20, SafeLoad(in + 12) >> 8 | SafeLoad(in + 13) << 24, SafeLoad(in + 13) }; + words = simd_batch{ + SafeLoadAs(in + 4 * 10) >> 16 | SafeLoadAs(in + 4 * 11) << 16, + SafeLoadAs(in + 4 * 11) >> 12 | SafeLoadAs(in + 4 * 12) << 20, + SafeLoadAs(in + 4 * 12) >> 8 | SafeLoadAs(in + 4 * 13) << 24, + SafeLoadAs(in + 4 * 13), + }; shifts = simd_batch{ 0, 0, 0, 4 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 28-bit bundles 16 to 19 - words = simd_batch{ SafeLoad(in + 14), SafeLoad(in + 14) >> 28 | SafeLoad(in + 15) << 4, SafeLoad(in + 15) >> 24 | SafeLoad(in + 16) << 8, SafeLoad(in + 16) >> 20 | SafeLoad(in + 17) << 12 }; + words = simd_batch{ + SafeLoadAs(in + 4 * 14), + SafeLoadAs(in + 4 * 14) >> 28 | SafeLoadAs(in + 4 * 15) << 4, + SafeLoadAs(in + 4 * 15) >> 24 | SafeLoadAs(in + 4 * 16) << 8, + SafeLoadAs(in + 4 * 16) >> 20 | SafeLoadAs(in + 4 * 17) << 12, + }; shifts = simd_batch{ 0, 0, 0, 0 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 28-bit bundles 20 to 23 - words = simd_batch{ SafeLoad(in + 17) >> 16 | SafeLoad(in + 18) << 16, SafeLoad(in + 18) >> 12 | SafeLoad(in + 19) << 20, SafeLoad(in + 19) >> 8 | SafeLoad(in + 20) << 24, SafeLoad(in + 20) }; + words = simd_batch{ + SafeLoadAs(in + 4 * 17) >> 16 | SafeLoadAs(in + 4 * 18) << 16, + SafeLoadAs(in + 4 * 18) >> 12 | SafeLoadAs(in + 4 * 19) << 20, + SafeLoadAs(in + 4 * 19) >> 8 | SafeLoadAs(in + 4 * 20) << 24, + SafeLoadAs(in + 4 * 20), + }; shifts = simd_batch{ 0, 0, 0, 4 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 28-bit bundles 24 to 27 - words = simd_batch{ SafeLoad(in + 21), SafeLoad(in + 21) >> 28 | SafeLoad(in + 22) << 4, SafeLoad(in + 22) >> 24 | SafeLoad(in + 23) << 8, SafeLoad(in + 23) >> 20 | SafeLoad(in + 24) << 12 }; + words = simd_batch{ + SafeLoadAs(in + 4 * 21), + SafeLoadAs(in + 4 * 21) >> 28 | SafeLoadAs(in + 4 * 22) << 4, + SafeLoadAs(in + 4 * 22) >> 24 | SafeLoadAs(in + 4 * 23) << 8, + SafeLoadAs(in + 4 * 23) >> 20 | SafeLoadAs(in + 4 * 24) << 12, + }; shifts = simd_batch{ 0, 0, 0, 0 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 28-bit bundles 28 to 31 - words = simd_batch{ SafeLoad(in + 24) >> 16 | SafeLoad(in + 25) << 16, SafeLoad(in + 25) >> 12 | SafeLoad(in + 26) << 20, SafeLoad(in + 26) >> 8 | SafeLoad(in + 27) << 24, SafeLoad(in + 27) }; + words = simd_batch{ + SafeLoadAs(in + 4 * 24) >> 16 | SafeLoadAs(in + 4 * 25) << 16, + SafeLoadAs(in + 4 * 25) >> 12 | SafeLoadAs(in + 4 * 26) << 20, + SafeLoadAs(in + 4 * 26) >> 8 | SafeLoadAs(in + 4 * 27) << 24, + SafeLoadAs(in + 4 * 27), + }; shifts = simd_batch{ 0, 0, 0, 4 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; - in += 28; + in += 28 * 4; return in; } -inline static const uint32_t* unpack29_32(const uint32_t* in, uint32_t* out) { +inline static const uint8_t* unpack29_32(const uint8_t* in, uint32_t* out) { + using simd_batch = xsimd::make_sized_batch_t; + uint32_t mask = 0x1fffffff; simd_batch masks(mask); @@ -1929,66 +3794,108 @@ inline static const uint32_t* unpack29_32(const uint32_t* in, uint32_t* out) { simd_batch results; // extract 29-bit bundles 0 to 3 - words = simd_batch{ SafeLoad(in + 0), SafeLoad(in + 0) >> 29 | SafeLoad(in + 1) << 3, SafeLoad(in + 1) >> 26 | SafeLoad(in + 2) << 6, SafeLoad(in + 2) >> 23 | SafeLoad(in + 3) << 9 }; + words = simd_batch{ + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0) >> 29 | SafeLoadAs(in + 4 * 1) << 3, + SafeLoadAs(in + 4 * 1) >> 26 | SafeLoadAs(in + 4 * 2) << 6, + SafeLoadAs(in + 4 * 2) >> 23 | SafeLoadAs(in + 4 * 3) << 9, + }; shifts = simd_batch{ 0, 0, 0, 0 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 29-bit bundles 4 to 7 - words = simd_batch{ SafeLoad(in + 3) >> 20 | SafeLoad(in + 4) << 12, SafeLoad(in + 4) >> 17 | SafeLoad(in + 5) << 15, SafeLoad(in + 5) >> 14 | SafeLoad(in + 6) << 18, SafeLoad(in + 6) >> 11 | SafeLoad(in + 7) << 21 }; + words = simd_batch{ + SafeLoadAs(in + 4 * 3) >> 20 | SafeLoadAs(in + 4 * 4) << 12, + SafeLoadAs(in + 4 * 4) >> 17 | SafeLoadAs(in + 4 * 5) << 15, + SafeLoadAs(in + 4 * 5) >> 14 | SafeLoadAs(in + 4 * 6) << 18, + SafeLoadAs(in + 4 * 6) >> 11 | SafeLoadAs(in + 4 * 7) << 21, + }; shifts = simd_batch{ 0, 0, 0, 0 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 29-bit bundles 8 to 11 - words = simd_batch{ SafeLoad(in + 7) >> 8 | SafeLoad(in + 8) << 24, SafeLoad(in + 8) >> 5 | SafeLoad(in + 9) << 27, SafeLoad(in + 9), SafeLoad(in + 9) >> 31 | SafeLoad(in + 10) << 1 }; + words = simd_batch{ + SafeLoadAs(in + 4 * 7) >> 8 | SafeLoadAs(in + 4 * 8) << 24, + SafeLoadAs(in + 4 * 8) >> 5 | SafeLoadAs(in + 4 * 9) << 27, + SafeLoadAs(in + 4 * 9), + SafeLoadAs(in + 4 * 9) >> 31 | SafeLoadAs(in + 4 * 10) << 1, + }; shifts = simd_batch{ 0, 0, 2, 0 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 29-bit bundles 12 to 15 - words = simd_batch{ SafeLoad(in + 10) >> 28 | SafeLoad(in + 11) << 4, SafeLoad(in + 11) >> 25 | SafeLoad(in + 12) << 7, SafeLoad(in + 12) >> 22 | SafeLoad(in + 13) << 10, SafeLoad(in + 13) >> 19 | SafeLoad(in + 14) << 13 }; + words = simd_batch{ + SafeLoadAs(in + 4 * 10) >> 28 | SafeLoadAs(in + 4 * 11) << 4, + SafeLoadAs(in + 4 * 11) >> 25 | SafeLoadAs(in + 4 * 12) << 7, + SafeLoadAs(in + 4 * 12) >> 22 | SafeLoadAs(in + 4 * 13) << 10, + SafeLoadAs(in + 4 * 13) >> 19 | SafeLoadAs(in + 4 * 14) << 13, + }; shifts = simd_batch{ 0, 0, 0, 0 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 29-bit bundles 16 to 19 - words = simd_batch{ SafeLoad(in + 14) >> 16 | SafeLoad(in + 15) << 16, SafeLoad(in + 15) >> 13 | SafeLoad(in + 16) << 19, SafeLoad(in + 16) >> 10 | SafeLoad(in + 17) << 22, SafeLoad(in + 17) >> 7 | SafeLoad(in + 18) << 25 }; + words = simd_batch{ + SafeLoadAs(in + 4 * 14) >> 16 | SafeLoadAs(in + 4 * 15) << 16, + SafeLoadAs(in + 4 * 15) >> 13 | SafeLoadAs(in + 4 * 16) << 19, + SafeLoadAs(in + 4 * 16) >> 10 | SafeLoadAs(in + 4 * 17) << 22, + SafeLoadAs(in + 4 * 17) >> 7 | SafeLoadAs(in + 4 * 18) << 25, + }; shifts = simd_batch{ 0, 0, 0, 0 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 29-bit bundles 20 to 23 - words = simd_batch{ SafeLoad(in + 18) >> 4 | SafeLoad(in + 19) << 28, SafeLoad(in + 19), SafeLoad(in + 19) >> 30 | SafeLoad(in + 20) << 2, SafeLoad(in + 20) >> 27 | SafeLoad(in + 21) << 5 }; + words = simd_batch{ + SafeLoadAs(in + 4 * 18) >> 4 | SafeLoadAs(in + 4 * 19) << 28, + SafeLoadAs(in + 4 * 19), + SafeLoadAs(in + 4 * 19) >> 30 | SafeLoadAs(in + 4 * 20) << 2, + SafeLoadAs(in + 4 * 20) >> 27 | SafeLoadAs(in + 4 * 21) << 5, + }; shifts = simd_batch{ 0, 1, 0, 0 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 29-bit bundles 24 to 27 - words = simd_batch{ SafeLoad(in + 21) >> 24 | SafeLoad(in + 22) << 8, SafeLoad(in + 22) >> 21 | SafeLoad(in + 23) << 11, SafeLoad(in + 23) >> 18 | SafeLoad(in + 24) << 14, SafeLoad(in + 24) >> 15 | SafeLoad(in + 25) << 17 }; + words = simd_batch{ + SafeLoadAs(in + 4 * 21) >> 24 | SafeLoadAs(in + 4 * 22) << 8, + SafeLoadAs(in + 4 * 22) >> 21 | SafeLoadAs(in + 4 * 23) << 11, + SafeLoadAs(in + 4 * 23) >> 18 | SafeLoadAs(in + 4 * 24) << 14, + SafeLoadAs(in + 4 * 24) >> 15 | SafeLoadAs(in + 4 * 25) << 17, + }; shifts = simd_batch{ 0, 0, 0, 0 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 29-bit bundles 28 to 31 - words = simd_batch{ SafeLoad(in + 25) >> 12 | SafeLoad(in + 26) << 20, SafeLoad(in + 26) >> 9 | SafeLoad(in + 27) << 23, SafeLoad(in + 27) >> 6 | SafeLoad(in + 28) << 26, SafeLoad(in + 28) }; + words = simd_batch{ + SafeLoadAs(in + 4 * 25) >> 12 | SafeLoadAs(in + 4 * 26) << 20, + SafeLoadAs(in + 4 * 26) >> 9 | SafeLoadAs(in + 4 * 27) << 23, + SafeLoadAs(in + 4 * 27) >> 6 | SafeLoadAs(in + 4 * 28) << 26, + SafeLoadAs(in + 4 * 28), + }; shifts = simd_batch{ 0, 0, 0, 3 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; - in += 29; + in += 29 * 4; return in; } -inline static const uint32_t* unpack30_32(const uint32_t* in, uint32_t* out) { +inline static const uint8_t* unpack30_32(const uint8_t* in, uint32_t* out) { + using simd_batch = xsimd::make_sized_batch_t; + uint32_t mask = 0x3fffffff; simd_batch masks(mask); @@ -1996,66 +3903,108 @@ inline static const uint32_t* unpack30_32(const uint32_t* in, uint32_t* out) { simd_batch results; // extract 30-bit bundles 0 to 3 - words = simd_batch{ SafeLoad(in + 0), SafeLoad(in + 0) >> 30 | SafeLoad(in + 1) << 2, SafeLoad(in + 1) >> 28 | SafeLoad(in + 2) << 4, SafeLoad(in + 2) >> 26 | SafeLoad(in + 3) << 6 }; + words = simd_batch{ + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0) >> 30 | SafeLoadAs(in + 4 * 1) << 2, + SafeLoadAs(in + 4 * 1) >> 28 | SafeLoadAs(in + 4 * 2) << 4, + SafeLoadAs(in + 4 * 2) >> 26 | SafeLoadAs(in + 4 * 3) << 6, + }; shifts = simd_batch{ 0, 0, 0, 0 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 30-bit bundles 4 to 7 - words = simd_batch{ SafeLoad(in + 3) >> 24 | SafeLoad(in + 4) << 8, SafeLoad(in + 4) >> 22 | SafeLoad(in + 5) << 10, SafeLoad(in + 5) >> 20 | SafeLoad(in + 6) << 12, SafeLoad(in + 6) >> 18 | SafeLoad(in + 7) << 14 }; + words = simd_batch{ + SafeLoadAs(in + 4 * 3) >> 24 | SafeLoadAs(in + 4 * 4) << 8, + SafeLoadAs(in + 4 * 4) >> 22 | SafeLoadAs(in + 4 * 5) << 10, + SafeLoadAs(in + 4 * 5) >> 20 | SafeLoadAs(in + 4 * 6) << 12, + SafeLoadAs(in + 4 * 6) >> 18 | SafeLoadAs(in + 4 * 7) << 14, + }; shifts = simd_batch{ 0, 0, 0, 0 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 30-bit bundles 8 to 11 - words = simd_batch{ SafeLoad(in + 7) >> 16 | SafeLoad(in + 8) << 16, SafeLoad(in + 8) >> 14 | SafeLoad(in + 9) << 18, SafeLoad(in + 9) >> 12 | SafeLoad(in + 10) << 20, SafeLoad(in + 10) >> 10 | SafeLoad(in + 11) << 22 }; + words = simd_batch{ + SafeLoadAs(in + 4 * 7) >> 16 | SafeLoadAs(in + 4 * 8) << 16, + SafeLoadAs(in + 4 * 8) >> 14 | SafeLoadAs(in + 4 * 9) << 18, + SafeLoadAs(in + 4 * 9) >> 12 | SafeLoadAs(in + 4 * 10) << 20, + SafeLoadAs(in + 4 * 10) >> 10 | SafeLoadAs(in + 4 * 11) << 22, + }; shifts = simd_batch{ 0, 0, 0, 0 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 30-bit bundles 12 to 15 - words = simd_batch{ SafeLoad(in + 11) >> 8 | SafeLoad(in + 12) << 24, SafeLoad(in + 12) >> 6 | SafeLoad(in + 13) << 26, SafeLoad(in + 13) >> 4 | SafeLoad(in + 14) << 28, SafeLoad(in + 14) }; + words = simd_batch{ + SafeLoadAs(in + 4 * 11) >> 8 | SafeLoadAs(in + 4 * 12) << 24, + SafeLoadAs(in + 4 * 12) >> 6 | SafeLoadAs(in + 4 * 13) << 26, + SafeLoadAs(in + 4 * 13) >> 4 | SafeLoadAs(in + 4 * 14) << 28, + SafeLoadAs(in + 4 * 14), + }; shifts = simd_batch{ 0, 0, 0, 2 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 30-bit bundles 16 to 19 - words = simd_batch{ SafeLoad(in + 15), SafeLoad(in + 15) >> 30 | SafeLoad(in + 16) << 2, SafeLoad(in + 16) >> 28 | SafeLoad(in + 17) << 4, SafeLoad(in + 17) >> 26 | SafeLoad(in + 18) << 6 }; + words = simd_batch{ + SafeLoadAs(in + 4 * 15), + SafeLoadAs(in + 4 * 15) >> 30 | SafeLoadAs(in + 4 * 16) << 2, + SafeLoadAs(in + 4 * 16) >> 28 | SafeLoadAs(in + 4 * 17) << 4, + SafeLoadAs(in + 4 * 17) >> 26 | SafeLoadAs(in + 4 * 18) << 6, + }; shifts = simd_batch{ 0, 0, 0, 0 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 30-bit bundles 20 to 23 - words = simd_batch{ SafeLoad(in + 18) >> 24 | SafeLoad(in + 19) << 8, SafeLoad(in + 19) >> 22 | SafeLoad(in + 20) << 10, SafeLoad(in + 20) >> 20 | SafeLoad(in + 21) << 12, SafeLoad(in + 21) >> 18 | SafeLoad(in + 22) << 14 }; + words = simd_batch{ + SafeLoadAs(in + 4 * 18) >> 24 | SafeLoadAs(in + 4 * 19) << 8, + SafeLoadAs(in + 4 * 19) >> 22 | SafeLoadAs(in + 4 * 20) << 10, + SafeLoadAs(in + 4 * 20) >> 20 | SafeLoadAs(in + 4 * 21) << 12, + SafeLoadAs(in + 4 * 21) >> 18 | SafeLoadAs(in + 4 * 22) << 14, + }; shifts = simd_batch{ 0, 0, 0, 0 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 30-bit bundles 24 to 27 - words = simd_batch{ SafeLoad(in + 22) >> 16 | SafeLoad(in + 23) << 16, SafeLoad(in + 23) >> 14 | SafeLoad(in + 24) << 18, SafeLoad(in + 24) >> 12 | SafeLoad(in + 25) << 20, SafeLoad(in + 25) >> 10 | SafeLoad(in + 26) << 22 }; + words = simd_batch{ + SafeLoadAs(in + 4 * 22) >> 16 | SafeLoadAs(in + 4 * 23) << 16, + SafeLoadAs(in + 4 * 23) >> 14 | SafeLoadAs(in + 4 * 24) << 18, + SafeLoadAs(in + 4 * 24) >> 12 | SafeLoadAs(in + 4 * 25) << 20, + SafeLoadAs(in + 4 * 25) >> 10 | SafeLoadAs(in + 4 * 26) << 22, + }; shifts = simd_batch{ 0, 0, 0, 0 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 30-bit bundles 28 to 31 - words = simd_batch{ SafeLoad(in + 26) >> 8 | SafeLoad(in + 27) << 24, SafeLoad(in + 27) >> 6 | SafeLoad(in + 28) << 26, SafeLoad(in + 28) >> 4 | SafeLoad(in + 29) << 28, SafeLoad(in + 29) }; + words = simd_batch{ + SafeLoadAs(in + 4 * 26) >> 8 | SafeLoadAs(in + 4 * 27) << 24, + SafeLoadAs(in + 4 * 27) >> 6 | SafeLoadAs(in + 4 * 28) << 26, + SafeLoadAs(in + 4 * 28) >> 4 | SafeLoadAs(in + 4 * 29) << 28, + SafeLoadAs(in + 4 * 29), + }; shifts = simd_batch{ 0, 0, 0, 2 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; - in += 30; + in += 30 * 4; return in; } -inline static const uint32_t* unpack31_32(const uint32_t* in, uint32_t* out) { +inline static const uint8_t* unpack31_32(const uint8_t* in, uint32_t* out) { + using simd_batch = xsimd::make_sized_batch_t; + uint32_t mask = 0x7fffffff; simd_batch masks(mask); @@ -2063,68 +4012,108 @@ inline static const uint32_t* unpack31_32(const uint32_t* in, uint32_t* out) { simd_batch results; // extract 31-bit bundles 0 to 3 - words = simd_batch{ SafeLoad(in + 0), SafeLoad(in + 0) >> 31 | SafeLoad(in + 1) << 1, SafeLoad(in + 1) >> 30 | SafeLoad(in + 2) << 2, SafeLoad(in + 2) >> 29 | SafeLoad(in + 3) << 3 }; + words = simd_batch{ + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0) >> 31 | SafeLoadAs(in + 4 * 1) << 1, + SafeLoadAs(in + 4 * 1) >> 30 | SafeLoadAs(in + 4 * 2) << 2, + SafeLoadAs(in + 4 * 2) >> 29 | SafeLoadAs(in + 4 * 3) << 3, + }; shifts = simd_batch{ 0, 0, 0, 0 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 31-bit bundles 4 to 7 - words = simd_batch{ SafeLoad(in + 3) >> 28 | SafeLoad(in + 4) << 4, SafeLoad(in + 4) >> 27 | SafeLoad(in + 5) << 5, SafeLoad(in + 5) >> 26 | SafeLoad(in + 6) << 6, SafeLoad(in + 6) >> 25 | SafeLoad(in + 7) << 7 }; + words = simd_batch{ + SafeLoadAs(in + 4 * 3) >> 28 | SafeLoadAs(in + 4 * 4) << 4, + SafeLoadAs(in + 4 * 4) >> 27 | SafeLoadAs(in + 4 * 5) << 5, + SafeLoadAs(in + 4 * 5) >> 26 | SafeLoadAs(in + 4 * 6) << 6, + SafeLoadAs(in + 4 * 6) >> 25 | SafeLoadAs(in + 4 * 7) << 7, + }; shifts = simd_batch{ 0, 0, 0, 0 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 31-bit bundles 8 to 11 - words = simd_batch{ SafeLoad(in + 7) >> 24 | SafeLoad(in + 8) << 8, SafeLoad(in + 8) >> 23 | SafeLoad(in + 9) << 9, SafeLoad(in + 9) >> 22 | SafeLoad(in + 10) << 10, SafeLoad(in + 10) >> 21 | SafeLoad(in + 11) << 11 }; + words = simd_batch{ + SafeLoadAs(in + 4 * 7) >> 24 | SafeLoadAs(in + 4 * 8) << 8, + SafeLoadAs(in + 4 * 8) >> 23 | SafeLoadAs(in + 4 * 9) << 9, + SafeLoadAs(in + 4 * 9) >> 22 | SafeLoadAs(in + 4 * 10) << 10, + SafeLoadAs(in + 4 * 10) >> 21 | SafeLoadAs(in + 4 * 11) << 11, + }; shifts = simd_batch{ 0, 0, 0, 0 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 31-bit bundles 12 to 15 - words = simd_batch{ SafeLoad(in + 11) >> 20 | SafeLoad(in + 12) << 12, SafeLoad(in + 12) >> 19 | SafeLoad(in + 13) << 13, SafeLoad(in + 13) >> 18 | SafeLoad(in + 14) << 14, SafeLoad(in + 14) >> 17 | SafeLoad(in + 15) << 15 }; + words = simd_batch{ + SafeLoadAs(in + 4 * 11) >> 20 | SafeLoadAs(in + 4 * 12) << 12, + SafeLoadAs(in + 4 * 12) >> 19 | SafeLoadAs(in + 4 * 13) << 13, + SafeLoadAs(in + 4 * 13) >> 18 | SafeLoadAs(in + 4 * 14) << 14, + SafeLoadAs(in + 4 * 14) >> 17 | SafeLoadAs(in + 4 * 15) << 15, + }; shifts = simd_batch{ 0, 0, 0, 0 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 31-bit bundles 16 to 19 - words = simd_batch{ SafeLoad(in + 15) >> 16 | SafeLoad(in + 16) << 16, SafeLoad(in + 16) >> 15 | SafeLoad(in + 17) << 17, SafeLoad(in + 17) >> 14 | SafeLoad(in + 18) << 18, SafeLoad(in + 18) >> 13 | SafeLoad(in + 19) << 19 }; + words = simd_batch{ + SafeLoadAs(in + 4 * 15) >> 16 | SafeLoadAs(in + 4 * 16) << 16, + SafeLoadAs(in + 4 * 16) >> 15 | SafeLoadAs(in + 4 * 17) << 17, + SafeLoadAs(in + 4 * 17) >> 14 | SafeLoadAs(in + 4 * 18) << 18, + SafeLoadAs(in + 4 * 18) >> 13 | SafeLoadAs(in + 4 * 19) << 19, + }; shifts = simd_batch{ 0, 0, 0, 0 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 31-bit bundles 20 to 23 - words = simd_batch{ SafeLoad(in + 19) >> 12 | SafeLoad(in + 20) << 20, SafeLoad(in + 20) >> 11 | SafeLoad(in + 21) << 21, SafeLoad(in + 21) >> 10 | SafeLoad(in + 22) << 22, SafeLoad(in + 22) >> 9 | SafeLoad(in + 23) << 23 }; + words = simd_batch{ + SafeLoadAs(in + 4 * 19) >> 12 | SafeLoadAs(in + 4 * 20) << 20, + SafeLoadAs(in + 4 * 20) >> 11 | SafeLoadAs(in + 4 * 21) << 21, + SafeLoadAs(in + 4 * 21) >> 10 | SafeLoadAs(in + 4 * 22) << 22, + SafeLoadAs(in + 4 * 22) >> 9 | SafeLoadAs(in + 4 * 23) << 23, + }; shifts = simd_batch{ 0, 0, 0, 0 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 31-bit bundles 24 to 27 - words = simd_batch{ SafeLoad(in + 23) >> 8 | SafeLoad(in + 24) << 24, SafeLoad(in + 24) >> 7 | SafeLoad(in + 25) << 25, SafeLoad(in + 25) >> 6 | SafeLoad(in + 26) << 26, SafeLoad(in + 26) >> 5 | SafeLoad(in + 27) << 27 }; + words = simd_batch{ + SafeLoadAs(in + 4 * 23) >> 8 | SafeLoadAs(in + 4 * 24) << 24, + SafeLoadAs(in + 4 * 24) >> 7 | SafeLoadAs(in + 4 * 25) << 25, + SafeLoadAs(in + 4 * 25) >> 6 | SafeLoadAs(in + 4 * 26) << 26, + SafeLoadAs(in + 4 * 26) >> 5 | SafeLoadAs(in + 4 * 27) << 27, + }; shifts = simd_batch{ 0, 0, 0, 0 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 31-bit bundles 28 to 31 - words = simd_batch{ SafeLoad(in + 27) >> 4 | SafeLoad(in + 28) << 28, SafeLoad(in + 28) >> 3 | SafeLoad(in + 29) << 29, SafeLoad(in + 29) >> 2 | SafeLoad(in + 30) << 30, SafeLoad(in + 30) }; + words = simd_batch{ + SafeLoadAs(in + 4 * 27) >> 4 | SafeLoadAs(in + 4 * 28) << 28, + SafeLoadAs(in + 4 * 28) >> 3 | SafeLoadAs(in + 4 * 29) << 29, + SafeLoadAs(in + 4 * 29) >> 2 | SafeLoadAs(in + 4 * 30) << 30, + SafeLoadAs(in + 4 * 30), + }; shifts = simd_batch{ 0, 0, 0, 1 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; - in += 31; + in += 31 * 4; return in; } -inline static const uint32_t* unpack32_32(const uint32_t* in, uint32_t* out) { - memcpy(out, in, 32 * sizeof(*out)); - in += 32; +inline static const uint8_t* unpack32_32(const uint8_t* in, uint32_t* out) { + std::memcpy(out, in, 32 * sizeof(*out)); + in += 4 * 32; out += 32; return in; @@ -2133,6 +4122,5 @@ inline static const uint32_t* unpack32_32(const uint32_t* in, uint32_t* out) { }; // struct UnpackBits128 } // namespace -} // namespace internal -} // namespace arrow +} // namespace arrow::internal diff --git a/cpp/src/arrow/util/bpacking_simd256_generated_internal.h b/cpp/src/arrow/util/bpacking_simd256_generated_internal.h index 3dccb1745f7..6e187831f1e 100644 --- a/cpp/src/arrow/util/bpacking_simd256_generated_internal.h +++ b/cpp/src/arrow/util/bpacking_simd256_generated_internal.h @@ -27,25 +27,594 @@ #include "arrow/util/dispatch_internal.h" #include "arrow/util/ubsan.h" -namespace arrow { -namespace internal { +namespace arrow::internal { namespace { -using ::arrow::util::SafeLoad; +using ::arrow::util::SafeLoadAs; template struct UnpackBits256 { -using simd_batch = xsimd::make_sized_batch_t; +inline static const uint8_t* unpack0_16(const uint8_t* in, uint16_t* out) { + std::memset(out, 0x0, 16 * sizeof(*out)); + out += 16; -inline static const uint32_t* unpack0_32(const uint32_t* in, uint32_t* out) { - memset(out, 0x0, 32 * sizeof(*out)); + return in; +} + +inline static const uint8_t* unpack1_16(const uint8_t* in, uint16_t* out) { + using simd_batch = xsimd::make_sized_batch_t; + + uint16_t mask = 0x1; + + simd_batch masks(mask); + simd_batch words, shifts; + simd_batch results; + + // extract 1-bit bundles 0 to 15 + words = simd_batch{ + SafeLoadAs(in + 2 * 0), + SafeLoadAs(in + 2 * 0), + SafeLoadAs(in + 2 * 0), + SafeLoadAs(in + 2 * 0), + SafeLoadAs(in + 2 * 0), + SafeLoadAs(in + 2 * 0), + SafeLoadAs(in + 2 * 0), + SafeLoadAs(in + 2 * 0), + SafeLoadAs(in + 2 * 0), + SafeLoadAs(in + 2 * 0), + SafeLoadAs(in + 2 * 0), + SafeLoadAs(in + 2 * 0), + SafeLoadAs(in + 2 * 0), + SafeLoadAs(in + 2 * 0), + SafeLoadAs(in + 2 * 0), + SafeLoadAs(in + 2 * 0), + }; + shifts = simd_batch{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 16; + + in += 1 * 2; + return in; +} + +inline static const uint8_t* unpack2_16(const uint8_t* in, uint16_t* out) { + using simd_batch = xsimd::make_sized_batch_t; + + uint16_t mask = 0x3; + + simd_batch masks(mask); + simd_batch words, shifts; + simd_batch results; + + // extract 2-bit bundles 0 to 15 + words = simd_batch{ + SafeLoadAs(in + 2 * 0), + SafeLoadAs(in + 2 * 0), + SafeLoadAs(in + 2 * 0), + SafeLoadAs(in + 2 * 0), + SafeLoadAs(in + 2 * 0), + SafeLoadAs(in + 2 * 0), + SafeLoadAs(in + 2 * 0), + SafeLoadAs(in + 2 * 0), + SafeLoadAs(in + 2 * 1), + SafeLoadAs(in + 2 * 1), + SafeLoadAs(in + 2 * 1), + SafeLoadAs(in + 2 * 1), + SafeLoadAs(in + 2 * 1), + SafeLoadAs(in + 2 * 1), + SafeLoadAs(in + 2 * 1), + SafeLoadAs(in + 2 * 1), + }; + shifts = simd_batch{ 0, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 16; + + in += 2 * 2; + return in; +} + +inline static const uint8_t* unpack3_16(const uint8_t* in, uint16_t* out) { + using simd_batch = xsimd::make_sized_batch_t; + + uint16_t mask = 0x7; + + simd_batch masks(mask); + simd_batch words, shifts; + simd_batch results; + + // extract 3-bit bundles 0 to 15 + words = simd_batch{ + SafeLoadAs(in + 2 * 0), + SafeLoadAs(in + 2 * 0), + SafeLoadAs(in + 2 * 0), + SafeLoadAs(in + 2 * 0), + SafeLoadAs(in + 2 * 0), + static_cast(SafeLoadAs(in + 2 * 0) >> 15 | SafeLoadAs(in + 2 * 1) << 1), + SafeLoadAs(in + 2 * 1), + SafeLoadAs(in + 2 * 1), + SafeLoadAs(in + 2 * 1), + SafeLoadAs(in + 2 * 1), + static_cast(SafeLoadAs(in + 2 * 1) >> 14 | SafeLoadAs(in + 2 * 2) << 2), + SafeLoadAs(in + 2 * 2), + SafeLoadAs(in + 2 * 2), + SafeLoadAs(in + 2 * 2), + SafeLoadAs(in + 2 * 2), + SafeLoadAs(in + 2 * 2), + }; + shifts = simd_batch{ 0, 3, 6, 9, 12, 0, 2, 5, 8, 11, 0, 1, 4, 7, 10, 13 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 16; + + in += 3 * 2; + return in; +} + +inline static const uint8_t* unpack4_16(const uint8_t* in, uint16_t* out) { + using simd_batch = xsimd::make_sized_batch_t; + + uint16_t mask = 0xf; + + simd_batch masks(mask); + simd_batch words, shifts; + simd_batch results; + + // extract 4-bit bundles 0 to 15 + words = simd_batch{ + SafeLoadAs(in + 2 * 0), + SafeLoadAs(in + 2 * 0), + SafeLoadAs(in + 2 * 0), + SafeLoadAs(in + 2 * 0), + SafeLoadAs(in + 2 * 1), + SafeLoadAs(in + 2 * 1), + SafeLoadAs(in + 2 * 1), + SafeLoadAs(in + 2 * 1), + SafeLoadAs(in + 2 * 2), + SafeLoadAs(in + 2 * 2), + SafeLoadAs(in + 2 * 2), + SafeLoadAs(in + 2 * 2), + SafeLoadAs(in + 2 * 3), + SafeLoadAs(in + 2 * 3), + SafeLoadAs(in + 2 * 3), + SafeLoadAs(in + 2 * 3), + }; + shifts = simd_batch{ 0, 4, 8, 12, 0, 4, 8, 12, 0, 4, 8, 12, 0, 4, 8, 12 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 16; + + in += 4 * 2; + return in; +} + +inline static const uint8_t* unpack5_16(const uint8_t* in, uint16_t* out) { + using simd_batch = xsimd::make_sized_batch_t; + + uint16_t mask = 0x1f; + + simd_batch masks(mask); + simd_batch words, shifts; + simd_batch results; + + // extract 5-bit bundles 0 to 15 + words = simd_batch{ + SafeLoadAs(in + 2 * 0), + SafeLoadAs(in + 2 * 0), + SafeLoadAs(in + 2 * 0), + static_cast(SafeLoadAs(in + 2 * 0) >> 15 | SafeLoadAs(in + 2 * 1) << 1), + SafeLoadAs(in + 2 * 1), + SafeLoadAs(in + 2 * 1), + static_cast(SafeLoadAs(in + 2 * 1) >> 14 | SafeLoadAs(in + 2 * 2) << 2), + SafeLoadAs(in + 2 * 2), + SafeLoadAs(in + 2 * 2), + static_cast(SafeLoadAs(in + 2 * 2) >> 13 | SafeLoadAs(in + 2 * 3) << 3), + SafeLoadAs(in + 2 * 3), + SafeLoadAs(in + 2 * 3), + static_cast(SafeLoadAs(in + 2 * 3) >> 12 | SafeLoadAs(in + 2 * 4) << 4), + SafeLoadAs(in + 2 * 4), + SafeLoadAs(in + 2 * 4), + SafeLoadAs(in + 2 * 4), + }; + shifts = simd_batch{ 0, 5, 10, 0, 4, 9, 0, 3, 8, 0, 2, 7, 0, 1, 6, 11 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 16; + + in += 5 * 2; + return in; +} + +inline static const uint8_t* unpack6_16(const uint8_t* in, uint16_t* out) { + using simd_batch = xsimd::make_sized_batch_t; + + uint16_t mask = 0x3f; + + simd_batch masks(mask); + simd_batch words, shifts; + simd_batch results; + + // extract 6-bit bundles 0 to 15 + words = simd_batch{ + SafeLoadAs(in + 2 * 0), + SafeLoadAs(in + 2 * 0), + static_cast(SafeLoadAs(in + 2 * 0) >> 12 | SafeLoadAs(in + 2 * 1) << 4), + SafeLoadAs(in + 2 * 1), + SafeLoadAs(in + 2 * 1), + static_cast(SafeLoadAs(in + 2 * 1) >> 14 | SafeLoadAs(in + 2 * 2) << 2), + SafeLoadAs(in + 2 * 2), + SafeLoadAs(in + 2 * 2), + SafeLoadAs(in + 2 * 3), + SafeLoadAs(in + 2 * 3), + static_cast(SafeLoadAs(in + 2 * 3) >> 12 | SafeLoadAs(in + 2 * 4) << 4), + SafeLoadAs(in + 2 * 4), + SafeLoadAs(in + 2 * 4), + static_cast(SafeLoadAs(in + 2 * 4) >> 14 | SafeLoadAs(in + 2 * 5) << 2), + SafeLoadAs(in + 2 * 5), + SafeLoadAs(in + 2 * 5), + }; + shifts = simd_batch{ 0, 6, 0, 2, 8, 0, 4, 10, 0, 6, 0, 2, 8, 0, 4, 10 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 16; + + in += 6 * 2; + return in; +} + +inline static const uint8_t* unpack7_16(const uint8_t* in, uint16_t* out) { + using simd_batch = xsimd::make_sized_batch_t; + + uint16_t mask = 0x7f; + + simd_batch masks(mask); + simd_batch words, shifts; + simd_batch results; + + // extract 7-bit bundles 0 to 15 + words = simd_batch{ + SafeLoadAs(in + 2 * 0), + SafeLoadAs(in + 2 * 0), + static_cast(SafeLoadAs(in + 2 * 0) >> 14 | SafeLoadAs(in + 2 * 1) << 2), + SafeLoadAs(in + 2 * 1), + static_cast(SafeLoadAs(in + 2 * 1) >> 12 | SafeLoadAs(in + 2 * 2) << 4), + SafeLoadAs(in + 2 * 2), + static_cast(SafeLoadAs(in + 2 * 2) >> 10 | SafeLoadAs(in + 2 * 3) << 6), + SafeLoadAs(in + 2 * 3), + SafeLoadAs(in + 2 * 3), + static_cast(SafeLoadAs(in + 2 * 3) >> 15 | SafeLoadAs(in + 2 * 4) << 1), + SafeLoadAs(in + 2 * 4), + static_cast(SafeLoadAs(in + 2 * 4) >> 13 | SafeLoadAs(in + 2 * 5) << 3), + SafeLoadAs(in + 2 * 5), + static_cast(SafeLoadAs(in + 2 * 5) >> 11 | SafeLoadAs(in + 2 * 6) << 5), + SafeLoadAs(in + 2 * 6), + SafeLoadAs(in + 2 * 6), + }; + shifts = simd_batch{ 0, 7, 0, 5, 0, 3, 0, 1, 8, 0, 6, 0, 4, 0, 2, 9 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 16; + + in += 7 * 2; + return in; +} + +inline static const uint8_t* unpack8_16(const uint8_t* in, uint16_t* out) { + using simd_batch = xsimd::make_sized_batch_t; + + uint16_t mask = 0xff; + + simd_batch masks(mask); + simd_batch words, shifts; + simd_batch results; + + // extract 8-bit bundles 0 to 15 + words = simd_batch{ + SafeLoadAs(in + 2 * 0), + SafeLoadAs(in + 2 * 0), + SafeLoadAs(in + 2 * 1), + SafeLoadAs(in + 2 * 1), + SafeLoadAs(in + 2 * 2), + SafeLoadAs(in + 2 * 2), + SafeLoadAs(in + 2 * 3), + SafeLoadAs(in + 2 * 3), + SafeLoadAs(in + 2 * 4), + SafeLoadAs(in + 2 * 4), + SafeLoadAs(in + 2 * 5), + SafeLoadAs(in + 2 * 5), + SafeLoadAs(in + 2 * 6), + SafeLoadAs(in + 2 * 6), + SafeLoadAs(in + 2 * 7), + SafeLoadAs(in + 2 * 7), + }; + shifts = simd_batch{ 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 16; + + in += 8 * 2; + return in; +} + +inline static const uint8_t* unpack9_16(const uint8_t* in, uint16_t* out) { + using simd_batch = xsimd::make_sized_batch_t; + + uint16_t mask = 0x1ff; + + simd_batch masks(mask); + simd_batch words, shifts; + simd_batch results; + + // extract 9-bit bundles 0 to 15 + words = simd_batch{ + SafeLoadAs(in + 2 * 0), + static_cast(SafeLoadAs(in + 2 * 0) >> 9 | SafeLoadAs(in + 2 * 1) << 7), + SafeLoadAs(in + 2 * 1), + static_cast(SafeLoadAs(in + 2 * 1) >> 11 | SafeLoadAs(in + 2 * 2) << 5), + SafeLoadAs(in + 2 * 2), + static_cast(SafeLoadAs(in + 2 * 2) >> 13 | SafeLoadAs(in + 2 * 3) << 3), + SafeLoadAs(in + 2 * 3), + static_cast(SafeLoadAs(in + 2 * 3) >> 15 | SafeLoadAs(in + 2 * 4) << 1), + static_cast(SafeLoadAs(in + 2 * 4) >> 8 | SafeLoadAs(in + 2 * 5) << 8), + SafeLoadAs(in + 2 * 5), + static_cast(SafeLoadAs(in + 2 * 5) >> 10 | SafeLoadAs(in + 2 * 6) << 6), + SafeLoadAs(in + 2 * 6), + static_cast(SafeLoadAs(in + 2 * 6) >> 12 | SafeLoadAs(in + 2 * 7) << 4), + SafeLoadAs(in + 2 * 7), + static_cast(SafeLoadAs(in + 2 * 7) >> 14 | SafeLoadAs(in + 2 * 8) << 2), + SafeLoadAs(in + 2 * 8), + }; + shifts = simd_batch{ 0, 0, 2, 0, 4, 0, 6, 0, 0, 1, 0, 3, 0, 5, 0, 7 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 16; + + in += 9 * 2; + return in; +} + +inline static const uint8_t* unpack10_16(const uint8_t* in, uint16_t* out) { + using simd_batch = xsimd::make_sized_batch_t; + + uint16_t mask = 0x3ff; + + simd_batch masks(mask); + simd_batch words, shifts; + simd_batch results; + + // extract 10-bit bundles 0 to 15 + words = simd_batch{ + SafeLoadAs(in + 2 * 0), + static_cast(SafeLoadAs(in + 2 * 0) >> 10 | SafeLoadAs(in + 2 * 1) << 6), + SafeLoadAs(in + 2 * 1), + static_cast(SafeLoadAs(in + 2 * 1) >> 14 | SafeLoadAs(in + 2 * 2) << 2), + static_cast(SafeLoadAs(in + 2 * 2) >> 8 | SafeLoadAs(in + 2 * 3) << 8), + SafeLoadAs(in + 2 * 3), + static_cast(SafeLoadAs(in + 2 * 3) >> 12 | SafeLoadAs(in + 2 * 4) << 4), + SafeLoadAs(in + 2 * 4), + SafeLoadAs(in + 2 * 5), + static_cast(SafeLoadAs(in + 2 * 5) >> 10 | SafeLoadAs(in + 2 * 6) << 6), + SafeLoadAs(in + 2 * 6), + static_cast(SafeLoadAs(in + 2 * 6) >> 14 | SafeLoadAs(in + 2 * 7) << 2), + static_cast(SafeLoadAs(in + 2 * 7) >> 8 | SafeLoadAs(in + 2 * 8) << 8), + SafeLoadAs(in + 2 * 8), + static_cast(SafeLoadAs(in + 2 * 8) >> 12 | SafeLoadAs(in + 2 * 9) << 4), + SafeLoadAs(in + 2 * 9), + }; + shifts = simd_batch{ 0, 0, 4, 0, 0, 2, 0, 6, 0, 0, 4, 0, 0, 2, 0, 6 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 16; + + in += 10 * 2; + return in; +} + +inline static const uint8_t* unpack11_16(const uint8_t* in, uint16_t* out) { + using simd_batch = xsimd::make_sized_batch_t; + + uint16_t mask = 0x7ff; + + simd_batch masks(mask); + simd_batch words, shifts; + simd_batch results; + + // extract 11-bit bundles 0 to 15 + words = simd_batch{ + SafeLoadAs(in + 2 * 0), + static_cast(SafeLoadAs(in + 2 * 0) >> 11 | SafeLoadAs(in + 2 * 1) << 5), + static_cast(SafeLoadAs(in + 2 * 1) >> 6 | SafeLoadAs(in + 2 * 2) << 10), + SafeLoadAs(in + 2 * 2), + static_cast(SafeLoadAs(in + 2 * 2) >> 12 | SafeLoadAs(in + 2 * 3) << 4), + static_cast(SafeLoadAs(in + 2 * 3) >> 7 | SafeLoadAs(in + 2 * 4) << 9), + SafeLoadAs(in + 2 * 4), + static_cast(SafeLoadAs(in + 2 * 4) >> 13 | SafeLoadAs(in + 2 * 5) << 3), + static_cast(SafeLoadAs(in + 2 * 5) >> 8 | SafeLoadAs(in + 2 * 6) << 8), + SafeLoadAs(in + 2 * 6), + static_cast(SafeLoadAs(in + 2 * 6) >> 14 | SafeLoadAs(in + 2 * 7) << 2), + static_cast(SafeLoadAs(in + 2 * 7) >> 9 | SafeLoadAs(in + 2 * 8) << 7), + SafeLoadAs(in + 2 * 8), + static_cast(SafeLoadAs(in + 2 * 8) >> 15 | SafeLoadAs(in + 2 * 9) << 1), + static_cast(SafeLoadAs(in + 2 * 9) >> 10 | SafeLoadAs(in + 2 * 10) << 6), + SafeLoadAs(in + 2 * 10), + }; + shifts = simd_batch{ 0, 0, 0, 1, 0, 0, 2, 0, 0, 3, 0, 0, 4, 0, 0, 5 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 16; + + in += 11 * 2; + return in; +} + +inline static const uint8_t* unpack12_16(const uint8_t* in, uint16_t* out) { + using simd_batch = xsimd::make_sized_batch_t; + + uint16_t mask = 0xfff; + + simd_batch masks(mask); + simd_batch words, shifts; + simd_batch results; + + // extract 12-bit bundles 0 to 15 + words = simd_batch{ + SafeLoadAs(in + 2 * 0), + static_cast(SafeLoadAs(in + 2 * 0) >> 12 | SafeLoadAs(in + 2 * 1) << 4), + static_cast(SafeLoadAs(in + 2 * 1) >> 8 | SafeLoadAs(in + 2 * 2) << 8), + SafeLoadAs(in + 2 * 2), + SafeLoadAs(in + 2 * 3), + static_cast(SafeLoadAs(in + 2 * 3) >> 12 | SafeLoadAs(in + 2 * 4) << 4), + static_cast(SafeLoadAs(in + 2 * 4) >> 8 | SafeLoadAs(in + 2 * 5) << 8), + SafeLoadAs(in + 2 * 5), + SafeLoadAs(in + 2 * 6), + static_cast(SafeLoadAs(in + 2 * 6) >> 12 | SafeLoadAs(in + 2 * 7) << 4), + static_cast(SafeLoadAs(in + 2 * 7) >> 8 | SafeLoadAs(in + 2 * 8) << 8), + SafeLoadAs(in + 2 * 8), + SafeLoadAs(in + 2 * 9), + static_cast(SafeLoadAs(in + 2 * 9) >> 12 | SafeLoadAs(in + 2 * 10) << 4), + static_cast(SafeLoadAs(in + 2 * 10) >> 8 | SafeLoadAs(in + 2 * 11) << 8), + SafeLoadAs(in + 2 * 11), + }; + shifts = simd_batch{ 0, 0, 0, 4, 0, 0, 0, 4, 0, 0, 0, 4, 0, 0, 0, 4 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 16; + + in += 12 * 2; + return in; +} + +inline static const uint8_t* unpack13_16(const uint8_t* in, uint16_t* out) { + using simd_batch = xsimd::make_sized_batch_t; + + uint16_t mask = 0x1fff; + + simd_batch masks(mask); + simd_batch words, shifts; + simd_batch results; + + // extract 13-bit bundles 0 to 15 + words = simd_batch{ + SafeLoadAs(in + 2 * 0), + static_cast(SafeLoadAs(in + 2 * 0) >> 13 | SafeLoadAs(in + 2 * 1) << 3), + static_cast(SafeLoadAs(in + 2 * 1) >> 10 | SafeLoadAs(in + 2 * 2) << 6), + static_cast(SafeLoadAs(in + 2 * 2) >> 7 | SafeLoadAs(in + 2 * 3) << 9), + static_cast(SafeLoadAs(in + 2 * 3) >> 4 | SafeLoadAs(in + 2 * 4) << 12), + SafeLoadAs(in + 2 * 4), + static_cast(SafeLoadAs(in + 2 * 4) >> 14 | SafeLoadAs(in + 2 * 5) << 2), + static_cast(SafeLoadAs(in + 2 * 5) >> 11 | SafeLoadAs(in + 2 * 6) << 5), + static_cast(SafeLoadAs(in + 2 * 6) >> 8 | SafeLoadAs(in + 2 * 7) << 8), + static_cast(SafeLoadAs(in + 2 * 7) >> 5 | SafeLoadAs(in + 2 * 8) << 11), + SafeLoadAs(in + 2 * 8), + static_cast(SafeLoadAs(in + 2 * 8) >> 15 | SafeLoadAs(in + 2 * 9) << 1), + static_cast(SafeLoadAs(in + 2 * 9) >> 12 | SafeLoadAs(in + 2 * 10) << 4), + static_cast(SafeLoadAs(in + 2 * 10) >> 9 | SafeLoadAs(in + 2 * 11) << 7), + static_cast(SafeLoadAs(in + 2 * 11) >> 6 | SafeLoadAs(in + 2 * 12) << 10), + SafeLoadAs(in + 2 * 12), + }; + shifts = simd_batch{ 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 2, 0, 0, 0, 0, 3 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 16; + + in += 13 * 2; + return in; +} + +inline static const uint8_t* unpack14_16(const uint8_t* in, uint16_t* out) { + using simd_batch = xsimd::make_sized_batch_t; + + uint16_t mask = 0x3fff; + + simd_batch masks(mask); + simd_batch words, shifts; + simd_batch results; + + // extract 14-bit bundles 0 to 15 + words = simd_batch{ + SafeLoadAs(in + 2 * 0), + static_cast(SafeLoadAs(in + 2 * 0) >> 14 | SafeLoadAs(in + 2 * 1) << 2), + static_cast(SafeLoadAs(in + 2 * 1) >> 12 | SafeLoadAs(in + 2 * 2) << 4), + static_cast(SafeLoadAs(in + 2 * 2) >> 10 | SafeLoadAs(in + 2 * 3) << 6), + static_cast(SafeLoadAs(in + 2 * 3) >> 8 | SafeLoadAs(in + 2 * 4) << 8), + static_cast(SafeLoadAs(in + 2 * 4) >> 6 | SafeLoadAs(in + 2 * 5) << 10), + static_cast(SafeLoadAs(in + 2 * 5) >> 4 | SafeLoadAs(in + 2 * 6) << 12), + SafeLoadAs(in + 2 * 6), + SafeLoadAs(in + 2 * 7), + static_cast(SafeLoadAs(in + 2 * 7) >> 14 | SafeLoadAs(in + 2 * 8) << 2), + static_cast(SafeLoadAs(in + 2 * 8) >> 12 | SafeLoadAs(in + 2 * 9) << 4), + static_cast(SafeLoadAs(in + 2 * 9) >> 10 | SafeLoadAs(in + 2 * 10) << 6), + static_cast(SafeLoadAs(in + 2 * 10) >> 8 | SafeLoadAs(in + 2 * 11) << 8), + static_cast(SafeLoadAs(in + 2 * 11) >> 6 | SafeLoadAs(in + 2 * 12) << 10), + static_cast(SafeLoadAs(in + 2 * 12) >> 4 | SafeLoadAs(in + 2 * 13) << 12), + SafeLoadAs(in + 2 * 13), + }; + shifts = simd_batch{ 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 2 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 16; + + in += 14 * 2; + return in; +} + +inline static const uint8_t* unpack15_16(const uint8_t* in, uint16_t* out) { + using simd_batch = xsimd::make_sized_batch_t; + + uint16_t mask = 0x7fff; + + simd_batch masks(mask); + simd_batch words, shifts; + simd_batch results; + + // extract 15-bit bundles 0 to 15 + words = simd_batch{ + SafeLoadAs(in + 2 * 0), + static_cast(SafeLoadAs(in + 2 * 0) >> 15 | SafeLoadAs(in + 2 * 1) << 1), + static_cast(SafeLoadAs(in + 2 * 1) >> 14 | SafeLoadAs(in + 2 * 2) << 2), + static_cast(SafeLoadAs(in + 2 * 2) >> 13 | SafeLoadAs(in + 2 * 3) << 3), + static_cast(SafeLoadAs(in + 2 * 3) >> 12 | SafeLoadAs(in + 2 * 4) << 4), + static_cast(SafeLoadAs(in + 2 * 4) >> 11 | SafeLoadAs(in + 2 * 5) << 5), + static_cast(SafeLoadAs(in + 2 * 5) >> 10 | SafeLoadAs(in + 2 * 6) << 6), + static_cast(SafeLoadAs(in + 2 * 6) >> 9 | SafeLoadAs(in + 2 * 7) << 7), + static_cast(SafeLoadAs(in + 2 * 7) >> 8 | SafeLoadAs(in + 2 * 8) << 8), + static_cast(SafeLoadAs(in + 2 * 8) >> 7 | SafeLoadAs(in + 2 * 9) << 9), + static_cast(SafeLoadAs(in + 2 * 9) >> 6 | SafeLoadAs(in + 2 * 10) << 10), + static_cast(SafeLoadAs(in + 2 * 10) >> 5 | SafeLoadAs(in + 2 * 11) << 11), + static_cast(SafeLoadAs(in + 2 * 11) >> 4 | SafeLoadAs(in + 2 * 12) << 12), + static_cast(SafeLoadAs(in + 2 * 12) >> 3 | SafeLoadAs(in + 2 * 13) << 13), + static_cast(SafeLoadAs(in + 2 * 13) >> 2 | SafeLoadAs(in + 2 * 14) << 14), + SafeLoadAs(in + 2 * 14), + }; + shifts = simd_batch{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 16; + + in += 15 * 2; + return in; +} + +inline static const uint8_t* unpack16_16(const uint8_t* in, uint16_t* out) { + std::memcpy(out, in, 16 * sizeof(*out)); + in += 2 * 16; + out += 16; + + return in; +} + +inline static const uint8_t* unpack0_32(const uint8_t* in, uint32_t* out) { + std::memset(out, 0x0, 32 * sizeof(*out)); out += 32; return in; } -inline static const uint32_t* unpack1_32(const uint32_t* in, uint32_t* out) { +inline static const uint8_t* unpack1_32(const uint8_t* in, uint32_t* out) { + using simd_batch = xsimd::make_sized_batch_t; + uint32_t mask = 0x1; simd_batch masks(mask); @@ -53,38 +622,76 @@ inline static const uint32_t* unpack1_32(const uint32_t* in, uint32_t* out) { simd_batch results; // extract 1-bit bundles 0 to 7 - words = simd_batch{ SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0) }; + words = simd_batch{ + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0), + }; shifts = simd_batch{ 0, 1, 2, 3, 4, 5, 6, 7 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 8; // extract 1-bit bundles 8 to 15 - words = simd_batch{ SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0) }; + words = simd_batch{ + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0), + }; shifts = simd_batch{ 8, 9, 10, 11, 12, 13, 14, 15 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 8; // extract 1-bit bundles 16 to 23 - words = simd_batch{ SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0) }; + words = simd_batch{ + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0), + }; shifts = simd_batch{ 16, 17, 18, 19, 20, 21, 22, 23 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 8; // extract 1-bit bundles 24 to 31 - words = simd_batch{ SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0) }; + words = simd_batch{ + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0), + }; shifts = simd_batch{ 24, 25, 26, 27, 28, 29, 30, 31 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 8; - in += 1; + in += 1 * 4; return in; } -inline static const uint32_t* unpack2_32(const uint32_t* in, uint32_t* out) { +inline static const uint8_t* unpack2_32(const uint8_t* in, uint32_t* out) { + using simd_batch = xsimd::make_sized_batch_t; + uint32_t mask = 0x3; simd_batch masks(mask); @@ -92,38 +699,76 @@ inline static const uint32_t* unpack2_32(const uint32_t* in, uint32_t* out) { simd_batch results; // extract 2-bit bundles 0 to 7 - words = simd_batch{ SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0) }; + words = simd_batch{ + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0), + }; shifts = simd_batch{ 0, 2, 4, 6, 8, 10, 12, 14 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 8; // extract 2-bit bundles 8 to 15 - words = simd_batch{ SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0) }; + words = simd_batch{ + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0), + }; shifts = simd_batch{ 16, 18, 20, 22, 24, 26, 28, 30 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 8; // extract 2-bit bundles 16 to 23 - words = simd_batch{ SafeLoad(in + 1), SafeLoad(in + 1), SafeLoad(in + 1), SafeLoad(in + 1), SafeLoad(in + 1), SafeLoad(in + 1), SafeLoad(in + 1), SafeLoad(in + 1) }; + words = simd_batch{ + SafeLoadAs(in + 4 * 1), + SafeLoadAs(in + 4 * 1), + SafeLoadAs(in + 4 * 1), + SafeLoadAs(in + 4 * 1), + SafeLoadAs(in + 4 * 1), + SafeLoadAs(in + 4 * 1), + SafeLoadAs(in + 4 * 1), + SafeLoadAs(in + 4 * 1), + }; shifts = simd_batch{ 0, 2, 4, 6, 8, 10, 12, 14 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 8; // extract 2-bit bundles 24 to 31 - words = simd_batch{ SafeLoad(in + 1), SafeLoad(in + 1), SafeLoad(in + 1), SafeLoad(in + 1), SafeLoad(in + 1), SafeLoad(in + 1), SafeLoad(in + 1), SafeLoad(in + 1) }; + words = simd_batch{ + SafeLoadAs(in + 4 * 1), + SafeLoadAs(in + 4 * 1), + SafeLoadAs(in + 4 * 1), + SafeLoadAs(in + 4 * 1), + SafeLoadAs(in + 4 * 1), + SafeLoadAs(in + 4 * 1), + SafeLoadAs(in + 4 * 1), + SafeLoadAs(in + 4 * 1), + }; shifts = simd_batch{ 16, 18, 20, 22, 24, 26, 28, 30 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 8; - in += 2; + in += 2 * 4; return in; } -inline static const uint32_t* unpack3_32(const uint32_t* in, uint32_t* out) { +inline static const uint8_t* unpack3_32(const uint8_t* in, uint32_t* out) { + using simd_batch = xsimd::make_sized_batch_t; + uint32_t mask = 0x7; simd_batch masks(mask); @@ -131,38 +776,76 @@ inline static const uint32_t* unpack3_32(const uint32_t* in, uint32_t* out) { simd_batch results; // extract 3-bit bundles 0 to 7 - words = simd_batch{ SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0) }; + words = simd_batch{ + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0), + }; shifts = simd_batch{ 0, 3, 6, 9, 12, 15, 18, 21 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 8; // extract 3-bit bundles 8 to 15 - words = simd_batch{ SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0) >> 30 | SafeLoad(in + 1) << 2, SafeLoad(in + 1), SafeLoad(in + 1), SafeLoad(in + 1), SafeLoad(in + 1), SafeLoad(in + 1) }; + words = simd_batch{ + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0) >> 30 | SafeLoadAs(in + 4 * 1) << 2, + SafeLoadAs(in + 4 * 1), + SafeLoadAs(in + 4 * 1), + SafeLoadAs(in + 4 * 1), + SafeLoadAs(in + 4 * 1), + SafeLoadAs(in + 4 * 1), + }; shifts = simd_batch{ 24, 27, 0, 1, 4, 7, 10, 13 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 8; // extract 3-bit bundles 16 to 23 - words = simd_batch{ SafeLoad(in + 1), SafeLoad(in + 1), SafeLoad(in + 1), SafeLoad(in + 1), SafeLoad(in + 1), SafeLoad(in + 1) >> 31 | SafeLoad(in + 2) << 1, SafeLoad(in + 2), SafeLoad(in + 2) }; + words = simd_batch{ + SafeLoadAs(in + 4 * 1), + SafeLoadAs(in + 4 * 1), + SafeLoadAs(in + 4 * 1), + SafeLoadAs(in + 4 * 1), + SafeLoadAs(in + 4 * 1), + SafeLoadAs(in + 4 * 1) >> 31 | SafeLoadAs(in + 4 * 2) << 1, + SafeLoadAs(in + 4 * 2), + SafeLoadAs(in + 4 * 2), + }; shifts = simd_batch{ 16, 19, 22, 25, 28, 0, 2, 5 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 8; // extract 3-bit bundles 24 to 31 - words = simd_batch{ SafeLoad(in + 2), SafeLoad(in + 2), SafeLoad(in + 2), SafeLoad(in + 2), SafeLoad(in + 2), SafeLoad(in + 2), SafeLoad(in + 2), SafeLoad(in + 2) }; + words = simd_batch{ + SafeLoadAs(in + 4 * 2), + SafeLoadAs(in + 4 * 2), + SafeLoadAs(in + 4 * 2), + SafeLoadAs(in + 4 * 2), + SafeLoadAs(in + 4 * 2), + SafeLoadAs(in + 4 * 2), + SafeLoadAs(in + 4 * 2), + SafeLoadAs(in + 4 * 2), + }; shifts = simd_batch{ 8, 11, 14, 17, 20, 23, 26, 29 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 8; - in += 3; + in += 3 * 4; return in; } -inline static const uint32_t* unpack4_32(const uint32_t* in, uint32_t* out) { +inline static const uint8_t* unpack4_32(const uint8_t* in, uint32_t* out) { + using simd_batch = xsimd::make_sized_batch_t; + uint32_t mask = 0xf; simd_batch masks(mask); @@ -170,38 +853,76 @@ inline static const uint32_t* unpack4_32(const uint32_t* in, uint32_t* out) { simd_batch results; // extract 4-bit bundles 0 to 7 - words = simd_batch{ SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0) }; + words = simd_batch{ + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0), + }; shifts = simd_batch{ 0, 4, 8, 12, 16, 20, 24, 28 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 8; // extract 4-bit bundles 8 to 15 - words = simd_batch{ SafeLoad(in + 1), SafeLoad(in + 1), SafeLoad(in + 1), SafeLoad(in + 1), SafeLoad(in + 1), SafeLoad(in + 1), SafeLoad(in + 1), SafeLoad(in + 1) }; + words = simd_batch{ + SafeLoadAs(in + 4 * 1), + SafeLoadAs(in + 4 * 1), + SafeLoadAs(in + 4 * 1), + SafeLoadAs(in + 4 * 1), + SafeLoadAs(in + 4 * 1), + SafeLoadAs(in + 4 * 1), + SafeLoadAs(in + 4 * 1), + SafeLoadAs(in + 4 * 1), + }; shifts = simd_batch{ 0, 4, 8, 12, 16, 20, 24, 28 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 8; // extract 4-bit bundles 16 to 23 - words = simd_batch{ SafeLoad(in + 2), SafeLoad(in + 2), SafeLoad(in + 2), SafeLoad(in + 2), SafeLoad(in + 2), SafeLoad(in + 2), SafeLoad(in + 2), SafeLoad(in + 2) }; + words = simd_batch{ + SafeLoadAs(in + 4 * 2), + SafeLoadAs(in + 4 * 2), + SafeLoadAs(in + 4 * 2), + SafeLoadAs(in + 4 * 2), + SafeLoadAs(in + 4 * 2), + SafeLoadAs(in + 4 * 2), + SafeLoadAs(in + 4 * 2), + SafeLoadAs(in + 4 * 2), + }; shifts = simd_batch{ 0, 4, 8, 12, 16, 20, 24, 28 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 8; // extract 4-bit bundles 24 to 31 - words = simd_batch{ SafeLoad(in + 3), SafeLoad(in + 3), SafeLoad(in + 3), SafeLoad(in + 3), SafeLoad(in + 3), SafeLoad(in + 3), SafeLoad(in + 3), SafeLoad(in + 3) }; + words = simd_batch{ + SafeLoadAs(in + 4 * 3), + SafeLoadAs(in + 4 * 3), + SafeLoadAs(in + 4 * 3), + SafeLoadAs(in + 4 * 3), + SafeLoadAs(in + 4 * 3), + SafeLoadAs(in + 4 * 3), + SafeLoadAs(in + 4 * 3), + SafeLoadAs(in + 4 * 3), + }; shifts = simd_batch{ 0, 4, 8, 12, 16, 20, 24, 28 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 8; - in += 4; + in += 4 * 4; return in; } -inline static const uint32_t* unpack5_32(const uint32_t* in, uint32_t* out) { +inline static const uint8_t* unpack5_32(const uint8_t* in, uint32_t* out) { + using simd_batch = xsimd::make_sized_batch_t; + uint32_t mask = 0x1f; simd_batch masks(mask); @@ -209,38 +930,76 @@ inline static const uint32_t* unpack5_32(const uint32_t* in, uint32_t* out) { simd_batch results; // extract 5-bit bundles 0 to 7 - words = simd_batch{ SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0) >> 30 | SafeLoad(in + 1) << 2, SafeLoad(in + 1) }; + words = simd_batch{ + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0) >> 30 | SafeLoadAs(in + 4 * 1) << 2, + SafeLoadAs(in + 4 * 1), + }; shifts = simd_batch{ 0, 5, 10, 15, 20, 25, 0, 3 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 8; // extract 5-bit bundles 8 to 15 - words = simd_batch{ SafeLoad(in + 1), SafeLoad(in + 1), SafeLoad(in + 1), SafeLoad(in + 1), SafeLoad(in + 1) >> 28 | SafeLoad(in + 2) << 4, SafeLoad(in + 2), SafeLoad(in + 2), SafeLoad(in + 2) }; + words = simd_batch{ + SafeLoadAs(in + 4 * 1), + SafeLoadAs(in + 4 * 1), + SafeLoadAs(in + 4 * 1), + SafeLoadAs(in + 4 * 1), + SafeLoadAs(in + 4 * 1) >> 28 | SafeLoadAs(in + 4 * 2) << 4, + SafeLoadAs(in + 4 * 2), + SafeLoadAs(in + 4 * 2), + SafeLoadAs(in + 4 * 2), + }; shifts = simd_batch{ 8, 13, 18, 23, 0, 1, 6, 11 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 8; // extract 5-bit bundles 16 to 23 - words = simd_batch{ SafeLoad(in + 2), SafeLoad(in + 2), SafeLoad(in + 2), SafeLoad(in + 2) >> 31 | SafeLoad(in + 3) << 1, SafeLoad(in + 3), SafeLoad(in + 3), SafeLoad(in + 3), SafeLoad(in + 3) }; + words = simd_batch{ + SafeLoadAs(in + 4 * 2), + SafeLoadAs(in + 4 * 2), + SafeLoadAs(in + 4 * 2), + SafeLoadAs(in + 4 * 2) >> 31 | SafeLoadAs(in + 4 * 3) << 1, + SafeLoadAs(in + 4 * 3), + SafeLoadAs(in + 4 * 3), + SafeLoadAs(in + 4 * 3), + SafeLoadAs(in + 4 * 3), + }; shifts = simd_batch{ 16, 21, 26, 0, 4, 9, 14, 19 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 8; // extract 5-bit bundles 24 to 31 - words = simd_batch{ SafeLoad(in + 3), SafeLoad(in + 3) >> 29 | SafeLoad(in + 4) << 3, SafeLoad(in + 4), SafeLoad(in + 4), SafeLoad(in + 4), SafeLoad(in + 4), SafeLoad(in + 4), SafeLoad(in + 4) }; + words = simd_batch{ + SafeLoadAs(in + 4 * 3), + SafeLoadAs(in + 4 * 3) >> 29 | SafeLoadAs(in + 4 * 4) << 3, + SafeLoadAs(in + 4 * 4), + SafeLoadAs(in + 4 * 4), + SafeLoadAs(in + 4 * 4), + SafeLoadAs(in + 4 * 4), + SafeLoadAs(in + 4 * 4), + SafeLoadAs(in + 4 * 4), + }; shifts = simd_batch{ 24, 0, 2, 7, 12, 17, 22, 27 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 8; - in += 5; + in += 5 * 4; return in; } -inline static const uint32_t* unpack6_32(const uint32_t* in, uint32_t* out) { +inline static const uint8_t* unpack6_32(const uint8_t* in, uint32_t* out) { + using simd_batch = xsimd::make_sized_batch_t; + uint32_t mask = 0x3f; simd_batch masks(mask); @@ -248,38 +1007,76 @@ inline static const uint32_t* unpack6_32(const uint32_t* in, uint32_t* out) { simd_batch results; // extract 6-bit bundles 0 to 7 - words = simd_batch{ SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0) >> 30 | SafeLoad(in + 1) << 2, SafeLoad(in + 1), SafeLoad(in + 1) }; + words = simd_batch{ + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0) >> 30 | SafeLoadAs(in + 4 * 1) << 2, + SafeLoadAs(in + 4 * 1), + SafeLoadAs(in + 4 * 1), + }; shifts = simd_batch{ 0, 6, 12, 18, 24, 0, 4, 10 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 8; // extract 6-bit bundles 8 to 15 - words = simd_batch{ SafeLoad(in + 1), SafeLoad(in + 1), SafeLoad(in + 1) >> 28 | SafeLoad(in + 2) << 4, SafeLoad(in + 2), SafeLoad(in + 2), SafeLoad(in + 2), SafeLoad(in + 2), SafeLoad(in + 2) }; + words = simd_batch{ + SafeLoadAs(in + 4 * 1), + SafeLoadAs(in + 4 * 1), + SafeLoadAs(in + 4 * 1) >> 28 | SafeLoadAs(in + 4 * 2) << 4, + SafeLoadAs(in + 4 * 2), + SafeLoadAs(in + 4 * 2), + SafeLoadAs(in + 4 * 2), + SafeLoadAs(in + 4 * 2), + SafeLoadAs(in + 4 * 2), + }; shifts = simd_batch{ 16, 22, 0, 2, 8, 14, 20, 26 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 8; // extract 6-bit bundles 16 to 23 - words = simd_batch{ SafeLoad(in + 3), SafeLoad(in + 3), SafeLoad(in + 3), SafeLoad(in + 3), SafeLoad(in + 3), SafeLoad(in + 3) >> 30 | SafeLoad(in + 4) << 2, SafeLoad(in + 4), SafeLoad(in + 4) }; + words = simd_batch{ + SafeLoadAs(in + 4 * 3), + SafeLoadAs(in + 4 * 3), + SafeLoadAs(in + 4 * 3), + SafeLoadAs(in + 4 * 3), + SafeLoadAs(in + 4 * 3), + SafeLoadAs(in + 4 * 3) >> 30 | SafeLoadAs(in + 4 * 4) << 2, + SafeLoadAs(in + 4 * 4), + SafeLoadAs(in + 4 * 4), + }; shifts = simd_batch{ 0, 6, 12, 18, 24, 0, 4, 10 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 8; // extract 6-bit bundles 24 to 31 - words = simd_batch{ SafeLoad(in + 4), SafeLoad(in + 4), SafeLoad(in + 4) >> 28 | SafeLoad(in + 5) << 4, SafeLoad(in + 5), SafeLoad(in + 5), SafeLoad(in + 5), SafeLoad(in + 5), SafeLoad(in + 5) }; + words = simd_batch{ + SafeLoadAs(in + 4 * 4), + SafeLoadAs(in + 4 * 4), + SafeLoadAs(in + 4 * 4) >> 28 | SafeLoadAs(in + 4 * 5) << 4, + SafeLoadAs(in + 4 * 5), + SafeLoadAs(in + 4 * 5), + SafeLoadAs(in + 4 * 5), + SafeLoadAs(in + 4 * 5), + SafeLoadAs(in + 4 * 5), + }; shifts = simd_batch{ 16, 22, 0, 2, 8, 14, 20, 26 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 8; - in += 6; + in += 6 * 4; return in; } -inline static const uint32_t* unpack7_32(const uint32_t* in, uint32_t* out) { +inline static const uint8_t* unpack7_32(const uint8_t* in, uint32_t* out) { + using simd_batch = xsimd::make_sized_batch_t; + uint32_t mask = 0x7f; simd_batch masks(mask); @@ -287,38 +1084,76 @@ inline static const uint32_t* unpack7_32(const uint32_t* in, uint32_t* out) { simd_batch results; // extract 7-bit bundles 0 to 7 - words = simd_batch{ SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0) >> 28 | SafeLoad(in + 1) << 4, SafeLoad(in + 1), SafeLoad(in + 1), SafeLoad(in + 1) }; + words = simd_batch{ + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0) >> 28 | SafeLoadAs(in + 4 * 1) << 4, + SafeLoadAs(in + 4 * 1), + SafeLoadAs(in + 4 * 1), + SafeLoadAs(in + 4 * 1), + }; shifts = simd_batch{ 0, 7, 14, 21, 0, 3, 10, 17 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 8; // extract 7-bit bundles 8 to 15 - words = simd_batch{ SafeLoad(in + 1), SafeLoad(in + 1) >> 31 | SafeLoad(in + 2) << 1, SafeLoad(in + 2), SafeLoad(in + 2), SafeLoad(in + 2), SafeLoad(in + 2) >> 27 | SafeLoad(in + 3) << 5, SafeLoad(in + 3), SafeLoad(in + 3) }; + words = simd_batch{ + SafeLoadAs(in + 4 * 1), + SafeLoadAs(in + 4 * 1) >> 31 | SafeLoadAs(in + 4 * 2) << 1, + SafeLoadAs(in + 4 * 2), + SafeLoadAs(in + 4 * 2), + SafeLoadAs(in + 4 * 2), + SafeLoadAs(in + 4 * 2) >> 27 | SafeLoadAs(in + 4 * 3) << 5, + SafeLoadAs(in + 4 * 3), + SafeLoadAs(in + 4 * 3), + }; shifts = simd_batch{ 24, 0, 6, 13, 20, 0, 2, 9 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 8; // extract 7-bit bundles 16 to 23 - words = simd_batch{ SafeLoad(in + 3), SafeLoad(in + 3), SafeLoad(in + 3) >> 30 | SafeLoad(in + 4) << 2, SafeLoad(in + 4), SafeLoad(in + 4), SafeLoad(in + 4), SafeLoad(in + 4) >> 26 | SafeLoad(in + 5) << 6, SafeLoad(in + 5) }; + words = simd_batch{ + SafeLoadAs(in + 4 * 3), + SafeLoadAs(in + 4 * 3), + SafeLoadAs(in + 4 * 3) >> 30 | SafeLoadAs(in + 4 * 4) << 2, + SafeLoadAs(in + 4 * 4), + SafeLoadAs(in + 4 * 4), + SafeLoadAs(in + 4 * 4), + SafeLoadAs(in + 4 * 4) >> 26 | SafeLoadAs(in + 4 * 5) << 6, + SafeLoadAs(in + 4 * 5), + }; shifts = simd_batch{ 16, 23, 0, 5, 12, 19, 0, 1 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 8; // extract 7-bit bundles 24 to 31 - words = simd_batch{ SafeLoad(in + 5), SafeLoad(in + 5), SafeLoad(in + 5), SafeLoad(in + 5) >> 29 | SafeLoad(in + 6) << 3, SafeLoad(in + 6), SafeLoad(in + 6), SafeLoad(in + 6), SafeLoad(in + 6) }; + words = simd_batch{ + SafeLoadAs(in + 4 * 5), + SafeLoadAs(in + 4 * 5), + SafeLoadAs(in + 4 * 5), + SafeLoadAs(in + 4 * 5) >> 29 | SafeLoadAs(in + 4 * 6) << 3, + SafeLoadAs(in + 4 * 6), + SafeLoadAs(in + 4 * 6), + SafeLoadAs(in + 4 * 6), + SafeLoadAs(in + 4 * 6), + }; shifts = simd_batch{ 8, 15, 22, 0, 4, 11, 18, 25 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 8; - in += 7; + in += 7 * 4; return in; } -inline static const uint32_t* unpack8_32(const uint32_t* in, uint32_t* out) { +inline static const uint8_t* unpack8_32(const uint8_t* in, uint32_t* out) { + using simd_batch = xsimd::make_sized_batch_t; + uint32_t mask = 0xff; simd_batch masks(mask); @@ -326,38 +1161,76 @@ inline static const uint32_t* unpack8_32(const uint32_t* in, uint32_t* out) { simd_batch results; // extract 8-bit bundles 0 to 7 - words = simd_batch{ SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 1), SafeLoad(in + 1), SafeLoad(in + 1), SafeLoad(in + 1) }; + words = simd_batch{ + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 1), + SafeLoadAs(in + 4 * 1), + SafeLoadAs(in + 4 * 1), + SafeLoadAs(in + 4 * 1), + }; shifts = simd_batch{ 0, 8, 16, 24, 0, 8, 16, 24 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 8; // extract 8-bit bundles 8 to 15 - words = simd_batch{ SafeLoad(in + 2), SafeLoad(in + 2), SafeLoad(in + 2), SafeLoad(in + 2), SafeLoad(in + 3), SafeLoad(in + 3), SafeLoad(in + 3), SafeLoad(in + 3) }; + words = simd_batch{ + SafeLoadAs(in + 4 * 2), + SafeLoadAs(in + 4 * 2), + SafeLoadAs(in + 4 * 2), + SafeLoadAs(in + 4 * 2), + SafeLoadAs(in + 4 * 3), + SafeLoadAs(in + 4 * 3), + SafeLoadAs(in + 4 * 3), + SafeLoadAs(in + 4 * 3), + }; shifts = simd_batch{ 0, 8, 16, 24, 0, 8, 16, 24 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 8; // extract 8-bit bundles 16 to 23 - words = simd_batch{ SafeLoad(in + 4), SafeLoad(in + 4), SafeLoad(in + 4), SafeLoad(in + 4), SafeLoad(in + 5), SafeLoad(in + 5), SafeLoad(in + 5), SafeLoad(in + 5) }; + words = simd_batch{ + SafeLoadAs(in + 4 * 4), + SafeLoadAs(in + 4 * 4), + SafeLoadAs(in + 4 * 4), + SafeLoadAs(in + 4 * 4), + SafeLoadAs(in + 4 * 5), + SafeLoadAs(in + 4 * 5), + SafeLoadAs(in + 4 * 5), + SafeLoadAs(in + 4 * 5), + }; shifts = simd_batch{ 0, 8, 16, 24, 0, 8, 16, 24 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 8; // extract 8-bit bundles 24 to 31 - words = simd_batch{ SafeLoad(in + 6), SafeLoad(in + 6), SafeLoad(in + 6), SafeLoad(in + 6), SafeLoad(in + 7), SafeLoad(in + 7), SafeLoad(in + 7), SafeLoad(in + 7) }; + words = simd_batch{ + SafeLoadAs(in + 4 * 6), + SafeLoadAs(in + 4 * 6), + SafeLoadAs(in + 4 * 6), + SafeLoadAs(in + 4 * 6), + SafeLoadAs(in + 4 * 7), + SafeLoadAs(in + 4 * 7), + SafeLoadAs(in + 4 * 7), + SafeLoadAs(in + 4 * 7), + }; shifts = simd_batch{ 0, 8, 16, 24, 0, 8, 16, 24 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 8; - in += 8; + in += 8 * 4; return in; } -inline static const uint32_t* unpack9_32(const uint32_t* in, uint32_t* out) { +inline static const uint8_t* unpack9_32(const uint8_t* in, uint32_t* out) { + using simd_batch = xsimd::make_sized_batch_t; + uint32_t mask = 0x1ff; simd_batch masks(mask); @@ -365,38 +1238,76 @@ inline static const uint32_t* unpack9_32(const uint32_t* in, uint32_t* out) { simd_batch results; // extract 9-bit bundles 0 to 7 - words = simd_batch{ SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0) >> 27 | SafeLoad(in + 1) << 5, SafeLoad(in + 1), SafeLoad(in + 1), SafeLoad(in + 1), SafeLoad(in + 1) >> 31 | SafeLoad(in + 2) << 1 }; + words = simd_batch{ + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0) >> 27 | SafeLoadAs(in + 4 * 1) << 5, + SafeLoadAs(in + 4 * 1), + SafeLoadAs(in + 4 * 1), + SafeLoadAs(in + 4 * 1), + SafeLoadAs(in + 4 * 1) >> 31 | SafeLoadAs(in + 4 * 2) << 1, + }; shifts = simd_batch{ 0, 9, 18, 0, 4, 13, 22, 0 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 8; // extract 9-bit bundles 8 to 15 - words = simd_batch{ SafeLoad(in + 2), SafeLoad(in + 2), SafeLoad(in + 2) >> 26 | SafeLoad(in + 3) << 6, SafeLoad(in + 3), SafeLoad(in + 3), SafeLoad(in + 3), SafeLoad(in + 3) >> 30 | SafeLoad(in + 4) << 2, SafeLoad(in + 4) }; + words = simd_batch{ + SafeLoadAs(in + 4 * 2), + SafeLoadAs(in + 4 * 2), + SafeLoadAs(in + 4 * 2) >> 26 | SafeLoadAs(in + 4 * 3) << 6, + SafeLoadAs(in + 4 * 3), + SafeLoadAs(in + 4 * 3), + SafeLoadAs(in + 4 * 3), + SafeLoadAs(in + 4 * 3) >> 30 | SafeLoadAs(in + 4 * 4) << 2, + SafeLoadAs(in + 4 * 4), + }; shifts = simd_batch{ 8, 17, 0, 3, 12, 21, 0, 7 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 8; // extract 9-bit bundles 16 to 23 - words = simd_batch{ SafeLoad(in + 4), SafeLoad(in + 4) >> 25 | SafeLoad(in + 5) << 7, SafeLoad(in + 5), SafeLoad(in + 5), SafeLoad(in + 5), SafeLoad(in + 5) >> 29 | SafeLoad(in + 6) << 3, SafeLoad(in + 6), SafeLoad(in + 6) }; + words = simd_batch{ + SafeLoadAs(in + 4 * 4), + SafeLoadAs(in + 4 * 4) >> 25 | SafeLoadAs(in + 4 * 5) << 7, + SafeLoadAs(in + 4 * 5), + SafeLoadAs(in + 4 * 5), + SafeLoadAs(in + 4 * 5), + SafeLoadAs(in + 4 * 5) >> 29 | SafeLoadAs(in + 4 * 6) << 3, + SafeLoadAs(in + 4 * 6), + SafeLoadAs(in + 4 * 6), + }; shifts = simd_batch{ 16, 0, 2, 11, 20, 0, 6, 15 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 8; // extract 9-bit bundles 24 to 31 - words = simd_batch{ SafeLoad(in + 6) >> 24 | SafeLoad(in + 7) << 8, SafeLoad(in + 7), SafeLoad(in + 7), SafeLoad(in + 7), SafeLoad(in + 7) >> 28 | SafeLoad(in + 8) << 4, SafeLoad(in + 8), SafeLoad(in + 8), SafeLoad(in + 8) }; + words = simd_batch{ + SafeLoadAs(in + 4 * 6) >> 24 | SafeLoadAs(in + 4 * 7) << 8, + SafeLoadAs(in + 4 * 7), + SafeLoadAs(in + 4 * 7), + SafeLoadAs(in + 4 * 7), + SafeLoadAs(in + 4 * 7) >> 28 | SafeLoadAs(in + 4 * 8) << 4, + SafeLoadAs(in + 4 * 8), + SafeLoadAs(in + 4 * 8), + SafeLoadAs(in + 4 * 8), + }; shifts = simd_batch{ 0, 1, 10, 19, 0, 5, 14, 23 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 8; - in += 9; + in += 9 * 4; return in; } -inline static const uint32_t* unpack10_32(const uint32_t* in, uint32_t* out) { +inline static const uint8_t* unpack10_32(const uint8_t* in, uint32_t* out) { + using simd_batch = xsimd::make_sized_batch_t; + uint32_t mask = 0x3ff; simd_batch masks(mask); @@ -404,38 +1315,76 @@ inline static const uint32_t* unpack10_32(const uint32_t* in, uint32_t* out) { simd_batch results; // extract 10-bit bundles 0 to 7 - words = simd_batch{ SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0) >> 30 | SafeLoad(in + 1) << 2, SafeLoad(in + 1), SafeLoad(in + 1), SafeLoad(in + 1) >> 28 | SafeLoad(in + 2) << 4, SafeLoad(in + 2) }; + words = simd_batch{ + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0) >> 30 | SafeLoadAs(in + 4 * 1) << 2, + SafeLoadAs(in + 4 * 1), + SafeLoadAs(in + 4 * 1), + SafeLoadAs(in + 4 * 1) >> 28 | SafeLoadAs(in + 4 * 2) << 4, + SafeLoadAs(in + 4 * 2), + }; shifts = simd_batch{ 0, 10, 20, 0, 8, 18, 0, 6 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 8; // extract 10-bit bundles 8 to 15 - words = simd_batch{ SafeLoad(in + 2), SafeLoad(in + 2) >> 26 | SafeLoad(in + 3) << 6, SafeLoad(in + 3), SafeLoad(in + 3), SafeLoad(in + 3) >> 24 | SafeLoad(in + 4) << 8, SafeLoad(in + 4), SafeLoad(in + 4), SafeLoad(in + 4) }; + words = simd_batch{ + SafeLoadAs(in + 4 * 2), + SafeLoadAs(in + 4 * 2) >> 26 | SafeLoadAs(in + 4 * 3) << 6, + SafeLoadAs(in + 4 * 3), + SafeLoadAs(in + 4 * 3), + SafeLoadAs(in + 4 * 3) >> 24 | SafeLoadAs(in + 4 * 4) << 8, + SafeLoadAs(in + 4 * 4), + SafeLoadAs(in + 4 * 4), + SafeLoadAs(in + 4 * 4), + }; shifts = simd_batch{ 16, 0, 4, 14, 0, 2, 12, 22 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 8; // extract 10-bit bundles 16 to 23 - words = simd_batch{ SafeLoad(in + 5), SafeLoad(in + 5), SafeLoad(in + 5), SafeLoad(in + 5) >> 30 | SafeLoad(in + 6) << 2, SafeLoad(in + 6), SafeLoad(in + 6), SafeLoad(in + 6) >> 28 | SafeLoad(in + 7) << 4, SafeLoad(in + 7) }; + words = simd_batch{ + SafeLoadAs(in + 4 * 5), + SafeLoadAs(in + 4 * 5), + SafeLoadAs(in + 4 * 5), + SafeLoadAs(in + 4 * 5) >> 30 | SafeLoadAs(in + 4 * 6) << 2, + SafeLoadAs(in + 4 * 6), + SafeLoadAs(in + 4 * 6), + SafeLoadAs(in + 4 * 6) >> 28 | SafeLoadAs(in + 4 * 7) << 4, + SafeLoadAs(in + 4 * 7), + }; shifts = simd_batch{ 0, 10, 20, 0, 8, 18, 0, 6 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 8; // extract 10-bit bundles 24 to 31 - words = simd_batch{ SafeLoad(in + 7), SafeLoad(in + 7) >> 26 | SafeLoad(in + 8) << 6, SafeLoad(in + 8), SafeLoad(in + 8), SafeLoad(in + 8) >> 24 | SafeLoad(in + 9) << 8, SafeLoad(in + 9), SafeLoad(in + 9), SafeLoad(in + 9) }; + words = simd_batch{ + SafeLoadAs(in + 4 * 7), + SafeLoadAs(in + 4 * 7) >> 26 | SafeLoadAs(in + 4 * 8) << 6, + SafeLoadAs(in + 4 * 8), + SafeLoadAs(in + 4 * 8), + SafeLoadAs(in + 4 * 8) >> 24 | SafeLoadAs(in + 4 * 9) << 8, + SafeLoadAs(in + 4 * 9), + SafeLoadAs(in + 4 * 9), + SafeLoadAs(in + 4 * 9), + }; shifts = simd_batch{ 16, 0, 4, 14, 0, 2, 12, 22 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 8; - in += 10; + in += 10 * 4; return in; } -inline static const uint32_t* unpack11_32(const uint32_t* in, uint32_t* out) { +inline static const uint8_t* unpack11_32(const uint8_t* in, uint32_t* out) { + using simd_batch = xsimd::make_sized_batch_t; + uint32_t mask = 0x7ff; simd_batch masks(mask); @@ -443,38 +1392,76 @@ inline static const uint32_t* unpack11_32(const uint32_t* in, uint32_t* out) { simd_batch results; // extract 11-bit bundles 0 to 7 - words = simd_batch{ SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0) >> 22 | SafeLoad(in + 1) << 10, SafeLoad(in + 1), SafeLoad(in + 1), SafeLoad(in + 1) >> 23 | SafeLoad(in + 2) << 9, SafeLoad(in + 2), SafeLoad(in + 2) }; + words = simd_batch{ + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0) >> 22 | SafeLoadAs(in + 4 * 1) << 10, + SafeLoadAs(in + 4 * 1), + SafeLoadAs(in + 4 * 1), + SafeLoadAs(in + 4 * 1) >> 23 | SafeLoadAs(in + 4 * 2) << 9, + SafeLoadAs(in + 4 * 2), + SafeLoadAs(in + 4 * 2), + }; shifts = simd_batch{ 0, 11, 0, 1, 12, 0, 2, 13 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 8; // extract 11-bit bundles 8 to 15 - words = simd_batch{ SafeLoad(in + 2) >> 24 | SafeLoad(in + 3) << 8, SafeLoad(in + 3), SafeLoad(in + 3), SafeLoad(in + 3) >> 25 | SafeLoad(in + 4) << 7, SafeLoad(in + 4), SafeLoad(in + 4), SafeLoad(in + 4) >> 26 | SafeLoad(in + 5) << 6, SafeLoad(in + 5) }; + words = simd_batch{ + SafeLoadAs(in + 4 * 2) >> 24 | SafeLoadAs(in + 4 * 3) << 8, + SafeLoadAs(in + 4 * 3), + SafeLoadAs(in + 4 * 3), + SafeLoadAs(in + 4 * 3) >> 25 | SafeLoadAs(in + 4 * 4) << 7, + SafeLoadAs(in + 4 * 4), + SafeLoadAs(in + 4 * 4), + SafeLoadAs(in + 4 * 4) >> 26 | SafeLoadAs(in + 4 * 5) << 6, + SafeLoadAs(in + 4 * 5), + }; shifts = simd_batch{ 0, 3, 14, 0, 4, 15, 0, 5 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 8; // extract 11-bit bundles 16 to 23 - words = simd_batch{ SafeLoad(in + 5), SafeLoad(in + 5) >> 27 | SafeLoad(in + 6) << 5, SafeLoad(in + 6), SafeLoad(in + 6), SafeLoad(in + 6) >> 28 | SafeLoad(in + 7) << 4, SafeLoad(in + 7), SafeLoad(in + 7), SafeLoad(in + 7) >> 29 | SafeLoad(in + 8) << 3 }; + words = simd_batch{ + SafeLoadAs(in + 4 * 5), + SafeLoadAs(in + 4 * 5) >> 27 | SafeLoadAs(in + 4 * 6) << 5, + SafeLoadAs(in + 4 * 6), + SafeLoadAs(in + 4 * 6), + SafeLoadAs(in + 4 * 6) >> 28 | SafeLoadAs(in + 4 * 7) << 4, + SafeLoadAs(in + 4 * 7), + SafeLoadAs(in + 4 * 7), + SafeLoadAs(in + 4 * 7) >> 29 | SafeLoadAs(in + 4 * 8) << 3, + }; shifts = simd_batch{ 16, 0, 6, 17, 0, 7, 18, 0 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 8; // extract 11-bit bundles 24 to 31 - words = simd_batch{ SafeLoad(in + 8), SafeLoad(in + 8), SafeLoad(in + 8) >> 30 | SafeLoad(in + 9) << 2, SafeLoad(in + 9), SafeLoad(in + 9), SafeLoad(in + 9) >> 31 | SafeLoad(in + 10) << 1, SafeLoad(in + 10), SafeLoad(in + 10) }; + words = simd_batch{ + SafeLoadAs(in + 4 * 8), + SafeLoadAs(in + 4 * 8), + SafeLoadAs(in + 4 * 8) >> 30 | SafeLoadAs(in + 4 * 9) << 2, + SafeLoadAs(in + 4 * 9), + SafeLoadAs(in + 4 * 9), + SafeLoadAs(in + 4 * 9) >> 31 | SafeLoadAs(in + 4 * 10) << 1, + SafeLoadAs(in + 4 * 10), + SafeLoadAs(in + 4 * 10), + }; shifts = simd_batch{ 8, 19, 0, 9, 20, 0, 10, 21 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 8; - in += 11; + in += 11 * 4; return in; } -inline static const uint32_t* unpack12_32(const uint32_t* in, uint32_t* out) { +inline static const uint8_t* unpack12_32(const uint8_t* in, uint32_t* out) { + using simd_batch = xsimd::make_sized_batch_t; + uint32_t mask = 0xfff; simd_batch masks(mask); @@ -482,38 +1469,76 @@ inline static const uint32_t* unpack12_32(const uint32_t* in, uint32_t* out) { simd_batch results; // extract 12-bit bundles 0 to 7 - words = simd_batch{ SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0) >> 24 | SafeLoad(in + 1) << 8, SafeLoad(in + 1), SafeLoad(in + 1), SafeLoad(in + 1) >> 28 | SafeLoad(in + 2) << 4, SafeLoad(in + 2), SafeLoad(in + 2) }; + words = simd_batch{ + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0) >> 24 | SafeLoadAs(in + 4 * 1) << 8, + SafeLoadAs(in + 4 * 1), + SafeLoadAs(in + 4 * 1), + SafeLoadAs(in + 4 * 1) >> 28 | SafeLoadAs(in + 4 * 2) << 4, + SafeLoadAs(in + 4 * 2), + SafeLoadAs(in + 4 * 2), + }; shifts = simd_batch{ 0, 12, 0, 4, 16, 0, 8, 20 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 8; // extract 12-bit bundles 8 to 15 - words = simd_batch{ SafeLoad(in + 3), SafeLoad(in + 3), SafeLoad(in + 3) >> 24 | SafeLoad(in + 4) << 8, SafeLoad(in + 4), SafeLoad(in + 4), SafeLoad(in + 4) >> 28 | SafeLoad(in + 5) << 4, SafeLoad(in + 5), SafeLoad(in + 5) }; + words = simd_batch{ + SafeLoadAs(in + 4 * 3), + SafeLoadAs(in + 4 * 3), + SafeLoadAs(in + 4 * 3) >> 24 | SafeLoadAs(in + 4 * 4) << 8, + SafeLoadAs(in + 4 * 4), + SafeLoadAs(in + 4 * 4), + SafeLoadAs(in + 4 * 4) >> 28 | SafeLoadAs(in + 4 * 5) << 4, + SafeLoadAs(in + 4 * 5), + SafeLoadAs(in + 4 * 5), + }; shifts = simd_batch{ 0, 12, 0, 4, 16, 0, 8, 20 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 8; // extract 12-bit bundles 16 to 23 - words = simd_batch{ SafeLoad(in + 6), SafeLoad(in + 6), SafeLoad(in + 6) >> 24 | SafeLoad(in + 7) << 8, SafeLoad(in + 7), SafeLoad(in + 7), SafeLoad(in + 7) >> 28 | SafeLoad(in + 8) << 4, SafeLoad(in + 8), SafeLoad(in + 8) }; + words = simd_batch{ + SafeLoadAs(in + 4 * 6), + SafeLoadAs(in + 4 * 6), + SafeLoadAs(in + 4 * 6) >> 24 | SafeLoadAs(in + 4 * 7) << 8, + SafeLoadAs(in + 4 * 7), + SafeLoadAs(in + 4 * 7), + SafeLoadAs(in + 4 * 7) >> 28 | SafeLoadAs(in + 4 * 8) << 4, + SafeLoadAs(in + 4 * 8), + SafeLoadAs(in + 4 * 8), + }; shifts = simd_batch{ 0, 12, 0, 4, 16, 0, 8, 20 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 8; // extract 12-bit bundles 24 to 31 - words = simd_batch{ SafeLoad(in + 9), SafeLoad(in + 9), SafeLoad(in + 9) >> 24 | SafeLoad(in + 10) << 8, SafeLoad(in + 10), SafeLoad(in + 10), SafeLoad(in + 10) >> 28 | SafeLoad(in + 11) << 4, SafeLoad(in + 11), SafeLoad(in + 11) }; + words = simd_batch{ + SafeLoadAs(in + 4 * 9), + SafeLoadAs(in + 4 * 9), + SafeLoadAs(in + 4 * 9) >> 24 | SafeLoadAs(in + 4 * 10) << 8, + SafeLoadAs(in + 4 * 10), + SafeLoadAs(in + 4 * 10), + SafeLoadAs(in + 4 * 10) >> 28 | SafeLoadAs(in + 4 * 11) << 4, + SafeLoadAs(in + 4 * 11), + SafeLoadAs(in + 4 * 11), + }; shifts = simd_batch{ 0, 12, 0, 4, 16, 0, 8, 20 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 8; - in += 12; + in += 12 * 4; return in; } -inline static const uint32_t* unpack13_32(const uint32_t* in, uint32_t* out) { +inline static const uint8_t* unpack13_32(const uint8_t* in, uint32_t* out) { + using simd_batch = xsimd::make_sized_batch_t; + uint32_t mask = 0x1fff; simd_batch masks(mask); @@ -521,38 +1546,76 @@ inline static const uint32_t* unpack13_32(const uint32_t* in, uint32_t* out) { simd_batch results; // extract 13-bit bundles 0 to 7 - words = simd_batch{ SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0) >> 26 | SafeLoad(in + 1) << 6, SafeLoad(in + 1), SafeLoad(in + 1) >> 20 | SafeLoad(in + 2) << 12, SafeLoad(in + 2), SafeLoad(in + 2), SafeLoad(in + 2) >> 27 | SafeLoad(in + 3) << 5 }; + words = simd_batch{ + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0) >> 26 | SafeLoadAs(in + 4 * 1) << 6, + SafeLoadAs(in + 4 * 1), + SafeLoadAs(in + 4 * 1) >> 20 | SafeLoadAs(in + 4 * 2) << 12, + SafeLoadAs(in + 4 * 2), + SafeLoadAs(in + 4 * 2), + SafeLoadAs(in + 4 * 2) >> 27 | SafeLoadAs(in + 4 * 3) << 5, + }; shifts = simd_batch{ 0, 13, 0, 7, 0, 1, 14, 0 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 8; // extract 13-bit bundles 8 to 15 - words = simd_batch{ SafeLoad(in + 3), SafeLoad(in + 3) >> 21 | SafeLoad(in + 4) << 11, SafeLoad(in + 4), SafeLoad(in + 4), SafeLoad(in + 4) >> 28 | SafeLoad(in + 5) << 4, SafeLoad(in + 5), SafeLoad(in + 5) >> 22 | SafeLoad(in + 6) << 10, SafeLoad(in + 6) }; + words = simd_batch{ + SafeLoadAs(in + 4 * 3), + SafeLoadAs(in + 4 * 3) >> 21 | SafeLoadAs(in + 4 * 4) << 11, + SafeLoadAs(in + 4 * 4), + SafeLoadAs(in + 4 * 4), + SafeLoadAs(in + 4 * 4) >> 28 | SafeLoadAs(in + 4 * 5) << 4, + SafeLoadAs(in + 4 * 5), + SafeLoadAs(in + 4 * 5) >> 22 | SafeLoadAs(in + 4 * 6) << 10, + SafeLoadAs(in + 4 * 6), + }; shifts = simd_batch{ 8, 0, 2, 15, 0, 9, 0, 3 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 8; // extract 13-bit bundles 16 to 23 - words = simd_batch{ SafeLoad(in + 6), SafeLoad(in + 6) >> 29 | SafeLoad(in + 7) << 3, SafeLoad(in + 7), SafeLoad(in + 7) >> 23 | SafeLoad(in + 8) << 9, SafeLoad(in + 8), SafeLoad(in + 8), SafeLoad(in + 8) >> 30 | SafeLoad(in + 9) << 2, SafeLoad(in + 9) }; + words = simd_batch{ + SafeLoadAs(in + 4 * 6), + SafeLoadAs(in + 4 * 6) >> 29 | SafeLoadAs(in + 4 * 7) << 3, + SafeLoadAs(in + 4 * 7), + SafeLoadAs(in + 4 * 7) >> 23 | SafeLoadAs(in + 4 * 8) << 9, + SafeLoadAs(in + 4 * 8), + SafeLoadAs(in + 4 * 8), + SafeLoadAs(in + 4 * 8) >> 30 | SafeLoadAs(in + 4 * 9) << 2, + SafeLoadAs(in + 4 * 9), + }; shifts = simd_batch{ 16, 0, 10, 0, 4, 17, 0, 11 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 8; // extract 13-bit bundles 24 to 31 - words = simd_batch{ SafeLoad(in + 9) >> 24 | SafeLoad(in + 10) << 8, SafeLoad(in + 10), SafeLoad(in + 10), SafeLoad(in + 10) >> 31 | SafeLoad(in + 11) << 1, SafeLoad(in + 11), SafeLoad(in + 11) >> 25 | SafeLoad(in + 12) << 7, SafeLoad(in + 12), SafeLoad(in + 12) }; + words = simd_batch{ + SafeLoadAs(in + 4 * 9) >> 24 | SafeLoadAs(in + 4 * 10) << 8, + SafeLoadAs(in + 4 * 10), + SafeLoadAs(in + 4 * 10), + SafeLoadAs(in + 4 * 10) >> 31 | SafeLoadAs(in + 4 * 11) << 1, + SafeLoadAs(in + 4 * 11), + SafeLoadAs(in + 4 * 11) >> 25 | SafeLoadAs(in + 4 * 12) << 7, + SafeLoadAs(in + 4 * 12), + SafeLoadAs(in + 4 * 12), + }; shifts = simd_batch{ 0, 5, 18, 0, 12, 0, 6, 19 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 8; - in += 13; + in += 13 * 4; return in; } -inline static const uint32_t* unpack14_32(const uint32_t* in, uint32_t* out) { +inline static const uint8_t* unpack14_32(const uint8_t* in, uint32_t* out) { + using simd_batch = xsimd::make_sized_batch_t; + uint32_t mask = 0x3fff; simd_batch masks(mask); @@ -560,38 +1623,76 @@ inline static const uint32_t* unpack14_32(const uint32_t* in, uint32_t* out) { simd_batch results; // extract 14-bit bundles 0 to 7 - words = simd_batch{ SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0) >> 28 | SafeLoad(in + 1) << 4, SafeLoad(in + 1), SafeLoad(in + 1) >> 24 | SafeLoad(in + 2) << 8, SafeLoad(in + 2), SafeLoad(in + 2) >> 20 | SafeLoad(in + 3) << 12, SafeLoad(in + 3) }; + words = simd_batch{ + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0) >> 28 | SafeLoadAs(in + 4 * 1) << 4, + SafeLoadAs(in + 4 * 1), + SafeLoadAs(in + 4 * 1) >> 24 | SafeLoadAs(in + 4 * 2) << 8, + SafeLoadAs(in + 4 * 2), + SafeLoadAs(in + 4 * 2) >> 20 | SafeLoadAs(in + 4 * 3) << 12, + SafeLoadAs(in + 4 * 3), + }; shifts = simd_batch{ 0, 14, 0, 10, 0, 6, 0, 2 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 8; // extract 14-bit bundles 8 to 15 - words = simd_batch{ SafeLoad(in + 3), SafeLoad(in + 3) >> 30 | SafeLoad(in + 4) << 2, SafeLoad(in + 4), SafeLoad(in + 4) >> 26 | SafeLoad(in + 5) << 6, SafeLoad(in + 5), SafeLoad(in + 5) >> 22 | SafeLoad(in + 6) << 10, SafeLoad(in + 6), SafeLoad(in + 6) }; + words = simd_batch{ + SafeLoadAs(in + 4 * 3), + SafeLoadAs(in + 4 * 3) >> 30 | SafeLoadAs(in + 4 * 4) << 2, + SafeLoadAs(in + 4 * 4), + SafeLoadAs(in + 4 * 4) >> 26 | SafeLoadAs(in + 4 * 5) << 6, + SafeLoadAs(in + 4 * 5), + SafeLoadAs(in + 4 * 5) >> 22 | SafeLoadAs(in + 4 * 6) << 10, + SafeLoadAs(in + 4 * 6), + SafeLoadAs(in + 4 * 6), + }; shifts = simd_batch{ 16, 0, 12, 0, 8, 0, 4, 18 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 8; // extract 14-bit bundles 16 to 23 - words = simd_batch{ SafeLoad(in + 7), SafeLoad(in + 7), SafeLoad(in + 7) >> 28 | SafeLoad(in + 8) << 4, SafeLoad(in + 8), SafeLoad(in + 8) >> 24 | SafeLoad(in + 9) << 8, SafeLoad(in + 9), SafeLoad(in + 9) >> 20 | SafeLoad(in + 10) << 12, SafeLoad(in + 10) }; + words = simd_batch{ + SafeLoadAs(in + 4 * 7), + SafeLoadAs(in + 4 * 7), + SafeLoadAs(in + 4 * 7) >> 28 | SafeLoadAs(in + 4 * 8) << 4, + SafeLoadAs(in + 4 * 8), + SafeLoadAs(in + 4 * 8) >> 24 | SafeLoadAs(in + 4 * 9) << 8, + SafeLoadAs(in + 4 * 9), + SafeLoadAs(in + 4 * 9) >> 20 | SafeLoadAs(in + 4 * 10) << 12, + SafeLoadAs(in + 4 * 10), + }; shifts = simd_batch{ 0, 14, 0, 10, 0, 6, 0, 2 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 8; // extract 14-bit bundles 24 to 31 - words = simd_batch{ SafeLoad(in + 10), SafeLoad(in + 10) >> 30 | SafeLoad(in + 11) << 2, SafeLoad(in + 11), SafeLoad(in + 11) >> 26 | SafeLoad(in + 12) << 6, SafeLoad(in + 12), SafeLoad(in + 12) >> 22 | SafeLoad(in + 13) << 10, SafeLoad(in + 13), SafeLoad(in + 13) }; + words = simd_batch{ + SafeLoadAs(in + 4 * 10), + SafeLoadAs(in + 4 * 10) >> 30 | SafeLoadAs(in + 4 * 11) << 2, + SafeLoadAs(in + 4 * 11), + SafeLoadAs(in + 4 * 11) >> 26 | SafeLoadAs(in + 4 * 12) << 6, + SafeLoadAs(in + 4 * 12), + SafeLoadAs(in + 4 * 12) >> 22 | SafeLoadAs(in + 4 * 13) << 10, + SafeLoadAs(in + 4 * 13), + SafeLoadAs(in + 4 * 13), + }; shifts = simd_batch{ 16, 0, 12, 0, 8, 0, 4, 18 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 8; - in += 14; + in += 14 * 4; return in; } -inline static const uint32_t* unpack15_32(const uint32_t* in, uint32_t* out) { +inline static const uint8_t* unpack15_32(const uint8_t* in, uint32_t* out) { + using simd_batch = xsimd::make_sized_batch_t; + uint32_t mask = 0x7fff; simd_batch masks(mask); @@ -599,38 +1700,76 @@ inline static const uint32_t* unpack15_32(const uint32_t* in, uint32_t* out) { simd_batch results; // extract 15-bit bundles 0 to 7 - words = simd_batch{ SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0) >> 30 | SafeLoad(in + 1) << 2, SafeLoad(in + 1), SafeLoad(in + 1) >> 28 | SafeLoad(in + 2) << 4, SafeLoad(in + 2), SafeLoad(in + 2) >> 26 | SafeLoad(in + 3) << 6, SafeLoad(in + 3) }; + words = simd_batch{ + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0) >> 30 | SafeLoadAs(in + 4 * 1) << 2, + SafeLoadAs(in + 4 * 1), + SafeLoadAs(in + 4 * 1) >> 28 | SafeLoadAs(in + 4 * 2) << 4, + SafeLoadAs(in + 4 * 2), + SafeLoadAs(in + 4 * 2) >> 26 | SafeLoadAs(in + 4 * 3) << 6, + SafeLoadAs(in + 4 * 3), + }; shifts = simd_batch{ 0, 15, 0, 13, 0, 11, 0, 9 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 8; // extract 15-bit bundles 8 to 15 - words = simd_batch{ SafeLoad(in + 3) >> 24 | SafeLoad(in + 4) << 8, SafeLoad(in + 4), SafeLoad(in + 4) >> 22 | SafeLoad(in + 5) << 10, SafeLoad(in + 5), SafeLoad(in + 5) >> 20 | SafeLoad(in + 6) << 12, SafeLoad(in + 6), SafeLoad(in + 6) >> 18 | SafeLoad(in + 7) << 14, SafeLoad(in + 7) }; + words = simd_batch{ + SafeLoadAs(in + 4 * 3) >> 24 | SafeLoadAs(in + 4 * 4) << 8, + SafeLoadAs(in + 4 * 4), + SafeLoadAs(in + 4 * 4) >> 22 | SafeLoadAs(in + 4 * 5) << 10, + SafeLoadAs(in + 4 * 5), + SafeLoadAs(in + 4 * 5) >> 20 | SafeLoadAs(in + 4 * 6) << 12, + SafeLoadAs(in + 4 * 6), + SafeLoadAs(in + 4 * 6) >> 18 | SafeLoadAs(in + 4 * 7) << 14, + SafeLoadAs(in + 4 * 7), + }; shifts = simd_batch{ 0, 7, 0, 5, 0, 3, 0, 1 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 8; // extract 15-bit bundles 16 to 23 - words = simd_batch{ SafeLoad(in + 7), SafeLoad(in + 7) >> 31 | SafeLoad(in + 8) << 1, SafeLoad(in + 8), SafeLoad(in + 8) >> 29 | SafeLoad(in + 9) << 3, SafeLoad(in + 9), SafeLoad(in + 9) >> 27 | SafeLoad(in + 10) << 5, SafeLoad(in + 10), SafeLoad(in + 10) >> 25 | SafeLoad(in + 11) << 7 }; + words = simd_batch{ + SafeLoadAs(in + 4 * 7), + SafeLoadAs(in + 4 * 7) >> 31 | SafeLoadAs(in + 4 * 8) << 1, + SafeLoadAs(in + 4 * 8), + SafeLoadAs(in + 4 * 8) >> 29 | SafeLoadAs(in + 4 * 9) << 3, + SafeLoadAs(in + 4 * 9), + SafeLoadAs(in + 4 * 9) >> 27 | SafeLoadAs(in + 4 * 10) << 5, + SafeLoadAs(in + 4 * 10), + SafeLoadAs(in + 4 * 10) >> 25 | SafeLoadAs(in + 4 * 11) << 7, + }; shifts = simd_batch{ 16, 0, 14, 0, 12, 0, 10, 0 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 8; // extract 15-bit bundles 24 to 31 - words = simd_batch{ SafeLoad(in + 11), SafeLoad(in + 11) >> 23 | SafeLoad(in + 12) << 9, SafeLoad(in + 12), SafeLoad(in + 12) >> 21 | SafeLoad(in + 13) << 11, SafeLoad(in + 13), SafeLoad(in + 13) >> 19 | SafeLoad(in + 14) << 13, SafeLoad(in + 14), SafeLoad(in + 14) }; + words = simd_batch{ + SafeLoadAs(in + 4 * 11), + SafeLoadAs(in + 4 * 11) >> 23 | SafeLoadAs(in + 4 * 12) << 9, + SafeLoadAs(in + 4 * 12), + SafeLoadAs(in + 4 * 12) >> 21 | SafeLoadAs(in + 4 * 13) << 11, + SafeLoadAs(in + 4 * 13), + SafeLoadAs(in + 4 * 13) >> 19 | SafeLoadAs(in + 4 * 14) << 13, + SafeLoadAs(in + 4 * 14), + SafeLoadAs(in + 4 * 14), + }; shifts = simd_batch{ 8, 0, 6, 0, 4, 0, 2, 17 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 8; - in += 15; + in += 15 * 4; return in; } -inline static const uint32_t* unpack16_32(const uint32_t* in, uint32_t* out) { +inline static const uint8_t* unpack16_32(const uint8_t* in, uint32_t* out) { + using simd_batch = xsimd::make_sized_batch_t; + uint32_t mask = 0xffff; simd_batch masks(mask); @@ -638,38 +1777,76 @@ inline static const uint32_t* unpack16_32(const uint32_t* in, uint32_t* out) { simd_batch results; // extract 16-bit bundles 0 to 7 - words = simd_batch{ SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 1), SafeLoad(in + 1), SafeLoad(in + 2), SafeLoad(in + 2), SafeLoad(in + 3), SafeLoad(in + 3) }; + words = simd_batch{ + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 1), + SafeLoadAs(in + 4 * 1), + SafeLoadAs(in + 4 * 2), + SafeLoadAs(in + 4 * 2), + SafeLoadAs(in + 4 * 3), + SafeLoadAs(in + 4 * 3), + }; shifts = simd_batch{ 0, 16, 0, 16, 0, 16, 0, 16 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 8; // extract 16-bit bundles 8 to 15 - words = simd_batch{ SafeLoad(in + 4), SafeLoad(in + 4), SafeLoad(in + 5), SafeLoad(in + 5), SafeLoad(in + 6), SafeLoad(in + 6), SafeLoad(in + 7), SafeLoad(in + 7) }; + words = simd_batch{ + SafeLoadAs(in + 4 * 4), + SafeLoadAs(in + 4 * 4), + SafeLoadAs(in + 4 * 5), + SafeLoadAs(in + 4 * 5), + SafeLoadAs(in + 4 * 6), + SafeLoadAs(in + 4 * 6), + SafeLoadAs(in + 4 * 7), + SafeLoadAs(in + 4 * 7), + }; shifts = simd_batch{ 0, 16, 0, 16, 0, 16, 0, 16 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 8; // extract 16-bit bundles 16 to 23 - words = simd_batch{ SafeLoad(in + 8), SafeLoad(in + 8), SafeLoad(in + 9), SafeLoad(in + 9), SafeLoad(in + 10), SafeLoad(in + 10), SafeLoad(in + 11), SafeLoad(in + 11) }; + words = simd_batch{ + SafeLoadAs(in + 4 * 8), + SafeLoadAs(in + 4 * 8), + SafeLoadAs(in + 4 * 9), + SafeLoadAs(in + 4 * 9), + SafeLoadAs(in + 4 * 10), + SafeLoadAs(in + 4 * 10), + SafeLoadAs(in + 4 * 11), + SafeLoadAs(in + 4 * 11), + }; shifts = simd_batch{ 0, 16, 0, 16, 0, 16, 0, 16 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 8; // extract 16-bit bundles 24 to 31 - words = simd_batch{ SafeLoad(in + 12), SafeLoad(in + 12), SafeLoad(in + 13), SafeLoad(in + 13), SafeLoad(in + 14), SafeLoad(in + 14), SafeLoad(in + 15), SafeLoad(in + 15) }; + words = simd_batch{ + SafeLoadAs(in + 4 * 12), + SafeLoadAs(in + 4 * 12), + SafeLoadAs(in + 4 * 13), + SafeLoadAs(in + 4 * 13), + SafeLoadAs(in + 4 * 14), + SafeLoadAs(in + 4 * 14), + SafeLoadAs(in + 4 * 15), + SafeLoadAs(in + 4 * 15), + }; shifts = simd_batch{ 0, 16, 0, 16, 0, 16, 0, 16 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 8; - in += 16; + in += 16 * 4; return in; } -inline static const uint32_t* unpack17_32(const uint32_t* in, uint32_t* out) { +inline static const uint8_t* unpack17_32(const uint8_t* in, uint32_t* out) { + using simd_batch = xsimd::make_sized_batch_t; + uint32_t mask = 0x1ffff; simd_batch masks(mask); @@ -677,38 +1854,76 @@ inline static const uint32_t* unpack17_32(const uint32_t* in, uint32_t* out) { simd_batch results; // extract 17-bit bundles 0 to 7 - words = simd_batch{ SafeLoad(in + 0), SafeLoad(in + 0) >> 17 | SafeLoad(in + 1) << 15, SafeLoad(in + 1), SafeLoad(in + 1) >> 19 | SafeLoad(in + 2) << 13, SafeLoad(in + 2), SafeLoad(in + 2) >> 21 | SafeLoad(in + 3) << 11, SafeLoad(in + 3), SafeLoad(in + 3) >> 23 | SafeLoad(in + 4) << 9 }; + words = simd_batch{ + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0) >> 17 | SafeLoadAs(in + 4 * 1) << 15, + SafeLoadAs(in + 4 * 1), + SafeLoadAs(in + 4 * 1) >> 19 | SafeLoadAs(in + 4 * 2) << 13, + SafeLoadAs(in + 4 * 2), + SafeLoadAs(in + 4 * 2) >> 21 | SafeLoadAs(in + 4 * 3) << 11, + SafeLoadAs(in + 4 * 3), + SafeLoadAs(in + 4 * 3) >> 23 | SafeLoadAs(in + 4 * 4) << 9, + }; shifts = simd_batch{ 0, 0, 2, 0, 4, 0, 6, 0 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 8; // extract 17-bit bundles 8 to 15 - words = simd_batch{ SafeLoad(in + 4), SafeLoad(in + 4) >> 25 | SafeLoad(in + 5) << 7, SafeLoad(in + 5), SafeLoad(in + 5) >> 27 | SafeLoad(in + 6) << 5, SafeLoad(in + 6), SafeLoad(in + 6) >> 29 | SafeLoad(in + 7) << 3, SafeLoad(in + 7), SafeLoad(in + 7) >> 31 | SafeLoad(in + 8) << 1 }; + words = simd_batch{ + SafeLoadAs(in + 4 * 4), + SafeLoadAs(in + 4 * 4) >> 25 | SafeLoadAs(in + 4 * 5) << 7, + SafeLoadAs(in + 4 * 5), + SafeLoadAs(in + 4 * 5) >> 27 | SafeLoadAs(in + 4 * 6) << 5, + SafeLoadAs(in + 4 * 6), + SafeLoadAs(in + 4 * 6) >> 29 | SafeLoadAs(in + 4 * 7) << 3, + SafeLoadAs(in + 4 * 7), + SafeLoadAs(in + 4 * 7) >> 31 | SafeLoadAs(in + 4 * 8) << 1, + }; shifts = simd_batch{ 8, 0, 10, 0, 12, 0, 14, 0 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 8; // extract 17-bit bundles 16 to 23 - words = simd_batch{ SafeLoad(in + 8) >> 16 | SafeLoad(in + 9) << 16, SafeLoad(in + 9), SafeLoad(in + 9) >> 18 | SafeLoad(in + 10) << 14, SafeLoad(in + 10), SafeLoad(in + 10) >> 20 | SafeLoad(in + 11) << 12, SafeLoad(in + 11), SafeLoad(in + 11) >> 22 | SafeLoad(in + 12) << 10, SafeLoad(in + 12) }; + words = simd_batch{ + SafeLoadAs(in + 4 * 8) >> 16 | SafeLoadAs(in + 4 * 9) << 16, + SafeLoadAs(in + 4 * 9), + SafeLoadAs(in + 4 * 9) >> 18 | SafeLoadAs(in + 4 * 10) << 14, + SafeLoadAs(in + 4 * 10), + SafeLoadAs(in + 4 * 10) >> 20 | SafeLoadAs(in + 4 * 11) << 12, + SafeLoadAs(in + 4 * 11), + SafeLoadAs(in + 4 * 11) >> 22 | SafeLoadAs(in + 4 * 12) << 10, + SafeLoadAs(in + 4 * 12), + }; shifts = simd_batch{ 0, 1, 0, 3, 0, 5, 0, 7 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 8; // extract 17-bit bundles 24 to 31 - words = simd_batch{ SafeLoad(in + 12) >> 24 | SafeLoad(in + 13) << 8, SafeLoad(in + 13), SafeLoad(in + 13) >> 26 | SafeLoad(in + 14) << 6, SafeLoad(in + 14), SafeLoad(in + 14) >> 28 | SafeLoad(in + 15) << 4, SafeLoad(in + 15), SafeLoad(in + 15) >> 30 | SafeLoad(in + 16) << 2, SafeLoad(in + 16) }; + words = simd_batch{ + SafeLoadAs(in + 4 * 12) >> 24 | SafeLoadAs(in + 4 * 13) << 8, + SafeLoadAs(in + 4 * 13), + SafeLoadAs(in + 4 * 13) >> 26 | SafeLoadAs(in + 4 * 14) << 6, + SafeLoadAs(in + 4 * 14), + SafeLoadAs(in + 4 * 14) >> 28 | SafeLoadAs(in + 4 * 15) << 4, + SafeLoadAs(in + 4 * 15), + SafeLoadAs(in + 4 * 15) >> 30 | SafeLoadAs(in + 4 * 16) << 2, + SafeLoadAs(in + 4 * 16), + }; shifts = simd_batch{ 0, 9, 0, 11, 0, 13, 0, 15 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 8; - in += 17; + in += 17 * 4; return in; } -inline static const uint32_t* unpack18_32(const uint32_t* in, uint32_t* out) { +inline static const uint8_t* unpack18_32(const uint8_t* in, uint32_t* out) { + using simd_batch = xsimd::make_sized_batch_t; + uint32_t mask = 0x3ffff; simd_batch masks(mask); @@ -716,38 +1931,76 @@ inline static const uint32_t* unpack18_32(const uint32_t* in, uint32_t* out) { simd_batch results; // extract 18-bit bundles 0 to 7 - words = simd_batch{ SafeLoad(in + 0), SafeLoad(in + 0) >> 18 | SafeLoad(in + 1) << 14, SafeLoad(in + 1), SafeLoad(in + 1) >> 22 | SafeLoad(in + 2) << 10, SafeLoad(in + 2), SafeLoad(in + 2) >> 26 | SafeLoad(in + 3) << 6, SafeLoad(in + 3), SafeLoad(in + 3) >> 30 | SafeLoad(in + 4) << 2 }; + words = simd_batch{ + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0) >> 18 | SafeLoadAs(in + 4 * 1) << 14, + SafeLoadAs(in + 4 * 1), + SafeLoadAs(in + 4 * 1) >> 22 | SafeLoadAs(in + 4 * 2) << 10, + SafeLoadAs(in + 4 * 2), + SafeLoadAs(in + 4 * 2) >> 26 | SafeLoadAs(in + 4 * 3) << 6, + SafeLoadAs(in + 4 * 3), + SafeLoadAs(in + 4 * 3) >> 30 | SafeLoadAs(in + 4 * 4) << 2, + }; shifts = simd_batch{ 0, 0, 4, 0, 8, 0, 12, 0 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 8; // extract 18-bit bundles 8 to 15 - words = simd_batch{ SafeLoad(in + 4) >> 16 | SafeLoad(in + 5) << 16, SafeLoad(in + 5), SafeLoad(in + 5) >> 20 | SafeLoad(in + 6) << 12, SafeLoad(in + 6), SafeLoad(in + 6) >> 24 | SafeLoad(in + 7) << 8, SafeLoad(in + 7), SafeLoad(in + 7) >> 28 | SafeLoad(in + 8) << 4, SafeLoad(in + 8) }; + words = simd_batch{ + SafeLoadAs(in + 4 * 4) >> 16 | SafeLoadAs(in + 4 * 5) << 16, + SafeLoadAs(in + 4 * 5), + SafeLoadAs(in + 4 * 5) >> 20 | SafeLoadAs(in + 4 * 6) << 12, + SafeLoadAs(in + 4 * 6), + SafeLoadAs(in + 4 * 6) >> 24 | SafeLoadAs(in + 4 * 7) << 8, + SafeLoadAs(in + 4 * 7), + SafeLoadAs(in + 4 * 7) >> 28 | SafeLoadAs(in + 4 * 8) << 4, + SafeLoadAs(in + 4 * 8), + }; shifts = simd_batch{ 0, 2, 0, 6, 0, 10, 0, 14 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 8; // extract 18-bit bundles 16 to 23 - words = simd_batch{ SafeLoad(in + 9), SafeLoad(in + 9) >> 18 | SafeLoad(in + 10) << 14, SafeLoad(in + 10), SafeLoad(in + 10) >> 22 | SafeLoad(in + 11) << 10, SafeLoad(in + 11), SafeLoad(in + 11) >> 26 | SafeLoad(in + 12) << 6, SafeLoad(in + 12), SafeLoad(in + 12) >> 30 | SafeLoad(in + 13) << 2 }; + words = simd_batch{ + SafeLoadAs(in + 4 * 9), + SafeLoadAs(in + 4 * 9) >> 18 | SafeLoadAs(in + 4 * 10) << 14, + SafeLoadAs(in + 4 * 10), + SafeLoadAs(in + 4 * 10) >> 22 | SafeLoadAs(in + 4 * 11) << 10, + SafeLoadAs(in + 4 * 11), + SafeLoadAs(in + 4 * 11) >> 26 | SafeLoadAs(in + 4 * 12) << 6, + SafeLoadAs(in + 4 * 12), + SafeLoadAs(in + 4 * 12) >> 30 | SafeLoadAs(in + 4 * 13) << 2, + }; shifts = simd_batch{ 0, 0, 4, 0, 8, 0, 12, 0 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 8; // extract 18-bit bundles 24 to 31 - words = simd_batch{ SafeLoad(in + 13) >> 16 | SafeLoad(in + 14) << 16, SafeLoad(in + 14), SafeLoad(in + 14) >> 20 | SafeLoad(in + 15) << 12, SafeLoad(in + 15), SafeLoad(in + 15) >> 24 | SafeLoad(in + 16) << 8, SafeLoad(in + 16), SafeLoad(in + 16) >> 28 | SafeLoad(in + 17) << 4, SafeLoad(in + 17) }; + words = simd_batch{ + SafeLoadAs(in + 4 * 13) >> 16 | SafeLoadAs(in + 4 * 14) << 16, + SafeLoadAs(in + 4 * 14), + SafeLoadAs(in + 4 * 14) >> 20 | SafeLoadAs(in + 4 * 15) << 12, + SafeLoadAs(in + 4 * 15), + SafeLoadAs(in + 4 * 15) >> 24 | SafeLoadAs(in + 4 * 16) << 8, + SafeLoadAs(in + 4 * 16), + SafeLoadAs(in + 4 * 16) >> 28 | SafeLoadAs(in + 4 * 17) << 4, + SafeLoadAs(in + 4 * 17), + }; shifts = simd_batch{ 0, 2, 0, 6, 0, 10, 0, 14 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 8; - in += 18; + in += 18 * 4; return in; } -inline static const uint32_t* unpack19_32(const uint32_t* in, uint32_t* out) { +inline static const uint8_t* unpack19_32(const uint8_t* in, uint32_t* out) { + using simd_batch = xsimd::make_sized_batch_t; + uint32_t mask = 0x7ffff; simd_batch masks(mask); @@ -755,38 +2008,76 @@ inline static const uint32_t* unpack19_32(const uint32_t* in, uint32_t* out) { simd_batch results; // extract 19-bit bundles 0 to 7 - words = simd_batch{ SafeLoad(in + 0), SafeLoad(in + 0) >> 19 | SafeLoad(in + 1) << 13, SafeLoad(in + 1), SafeLoad(in + 1) >> 25 | SafeLoad(in + 2) << 7, SafeLoad(in + 2), SafeLoad(in + 2) >> 31 | SafeLoad(in + 3) << 1, SafeLoad(in + 3) >> 18 | SafeLoad(in + 4) << 14, SafeLoad(in + 4) }; + words = simd_batch{ + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0) >> 19 | SafeLoadAs(in + 4 * 1) << 13, + SafeLoadAs(in + 4 * 1), + SafeLoadAs(in + 4 * 1) >> 25 | SafeLoadAs(in + 4 * 2) << 7, + SafeLoadAs(in + 4 * 2), + SafeLoadAs(in + 4 * 2) >> 31 | SafeLoadAs(in + 4 * 3) << 1, + SafeLoadAs(in + 4 * 3) >> 18 | SafeLoadAs(in + 4 * 4) << 14, + SafeLoadAs(in + 4 * 4), + }; shifts = simd_batch{ 0, 0, 6, 0, 12, 0, 0, 5 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 8; // extract 19-bit bundles 8 to 15 - words = simd_batch{ SafeLoad(in + 4) >> 24 | SafeLoad(in + 5) << 8, SafeLoad(in + 5), SafeLoad(in + 5) >> 30 | SafeLoad(in + 6) << 2, SafeLoad(in + 6) >> 17 | SafeLoad(in + 7) << 15, SafeLoad(in + 7), SafeLoad(in + 7) >> 23 | SafeLoad(in + 8) << 9, SafeLoad(in + 8), SafeLoad(in + 8) >> 29 | SafeLoad(in + 9) << 3 }; + words = simd_batch{ + SafeLoadAs(in + 4 * 4) >> 24 | SafeLoadAs(in + 4 * 5) << 8, + SafeLoadAs(in + 4 * 5), + SafeLoadAs(in + 4 * 5) >> 30 | SafeLoadAs(in + 4 * 6) << 2, + SafeLoadAs(in + 4 * 6) >> 17 | SafeLoadAs(in + 4 * 7) << 15, + SafeLoadAs(in + 4 * 7), + SafeLoadAs(in + 4 * 7) >> 23 | SafeLoadAs(in + 4 * 8) << 9, + SafeLoadAs(in + 4 * 8), + SafeLoadAs(in + 4 * 8) >> 29 | SafeLoadAs(in + 4 * 9) << 3, + }; shifts = simd_batch{ 0, 11, 0, 0, 4, 0, 10, 0 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 8; // extract 19-bit bundles 16 to 23 - words = simd_batch{ SafeLoad(in + 9) >> 16 | SafeLoad(in + 10) << 16, SafeLoad(in + 10), SafeLoad(in + 10) >> 22 | SafeLoad(in + 11) << 10, SafeLoad(in + 11), SafeLoad(in + 11) >> 28 | SafeLoad(in + 12) << 4, SafeLoad(in + 12) >> 15 | SafeLoad(in + 13) << 17, SafeLoad(in + 13), SafeLoad(in + 13) >> 21 | SafeLoad(in + 14) << 11 }; + words = simd_batch{ + SafeLoadAs(in + 4 * 9) >> 16 | SafeLoadAs(in + 4 * 10) << 16, + SafeLoadAs(in + 4 * 10), + SafeLoadAs(in + 4 * 10) >> 22 | SafeLoadAs(in + 4 * 11) << 10, + SafeLoadAs(in + 4 * 11), + SafeLoadAs(in + 4 * 11) >> 28 | SafeLoadAs(in + 4 * 12) << 4, + SafeLoadAs(in + 4 * 12) >> 15 | SafeLoadAs(in + 4 * 13) << 17, + SafeLoadAs(in + 4 * 13), + SafeLoadAs(in + 4 * 13) >> 21 | SafeLoadAs(in + 4 * 14) << 11, + }; shifts = simd_batch{ 0, 3, 0, 9, 0, 0, 2, 0 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 8; // extract 19-bit bundles 24 to 31 - words = simd_batch{ SafeLoad(in + 14), SafeLoad(in + 14) >> 27 | SafeLoad(in + 15) << 5, SafeLoad(in + 15) >> 14 | SafeLoad(in + 16) << 18, SafeLoad(in + 16), SafeLoad(in + 16) >> 20 | SafeLoad(in + 17) << 12, SafeLoad(in + 17), SafeLoad(in + 17) >> 26 | SafeLoad(in + 18) << 6, SafeLoad(in + 18) }; + words = simd_batch{ + SafeLoadAs(in + 4 * 14), + SafeLoadAs(in + 4 * 14) >> 27 | SafeLoadAs(in + 4 * 15) << 5, + SafeLoadAs(in + 4 * 15) >> 14 | SafeLoadAs(in + 4 * 16) << 18, + SafeLoadAs(in + 4 * 16), + SafeLoadAs(in + 4 * 16) >> 20 | SafeLoadAs(in + 4 * 17) << 12, + SafeLoadAs(in + 4 * 17), + SafeLoadAs(in + 4 * 17) >> 26 | SafeLoadAs(in + 4 * 18) << 6, + SafeLoadAs(in + 4 * 18), + }; shifts = simd_batch{ 8, 0, 0, 1, 0, 7, 0, 13 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 8; - in += 19; + in += 19 * 4; return in; } -inline static const uint32_t* unpack20_32(const uint32_t* in, uint32_t* out) { +inline static const uint8_t* unpack20_32(const uint8_t* in, uint32_t* out) { + using simd_batch = xsimd::make_sized_batch_t; + uint32_t mask = 0xfffff; simd_batch masks(mask); @@ -794,38 +2085,76 @@ inline static const uint32_t* unpack20_32(const uint32_t* in, uint32_t* out) { simd_batch results; // extract 20-bit bundles 0 to 7 - words = simd_batch{ SafeLoad(in + 0), SafeLoad(in + 0) >> 20 | SafeLoad(in + 1) << 12, SafeLoad(in + 1), SafeLoad(in + 1) >> 28 | SafeLoad(in + 2) << 4, SafeLoad(in + 2) >> 16 | SafeLoad(in + 3) << 16, SafeLoad(in + 3), SafeLoad(in + 3) >> 24 | SafeLoad(in + 4) << 8, SafeLoad(in + 4) }; + words = simd_batch{ + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0) >> 20 | SafeLoadAs(in + 4 * 1) << 12, + SafeLoadAs(in + 4 * 1), + SafeLoadAs(in + 4 * 1) >> 28 | SafeLoadAs(in + 4 * 2) << 4, + SafeLoadAs(in + 4 * 2) >> 16 | SafeLoadAs(in + 4 * 3) << 16, + SafeLoadAs(in + 4 * 3), + SafeLoadAs(in + 4 * 3) >> 24 | SafeLoadAs(in + 4 * 4) << 8, + SafeLoadAs(in + 4 * 4), + }; shifts = simd_batch{ 0, 0, 8, 0, 0, 4, 0, 12 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 8; // extract 20-bit bundles 8 to 15 - words = simd_batch{ SafeLoad(in + 5), SafeLoad(in + 5) >> 20 | SafeLoad(in + 6) << 12, SafeLoad(in + 6), SafeLoad(in + 6) >> 28 | SafeLoad(in + 7) << 4, SafeLoad(in + 7) >> 16 | SafeLoad(in + 8) << 16, SafeLoad(in + 8), SafeLoad(in + 8) >> 24 | SafeLoad(in + 9) << 8, SafeLoad(in + 9) }; + words = simd_batch{ + SafeLoadAs(in + 4 * 5), + SafeLoadAs(in + 4 * 5) >> 20 | SafeLoadAs(in + 4 * 6) << 12, + SafeLoadAs(in + 4 * 6), + SafeLoadAs(in + 4 * 6) >> 28 | SafeLoadAs(in + 4 * 7) << 4, + SafeLoadAs(in + 4 * 7) >> 16 | SafeLoadAs(in + 4 * 8) << 16, + SafeLoadAs(in + 4 * 8), + SafeLoadAs(in + 4 * 8) >> 24 | SafeLoadAs(in + 4 * 9) << 8, + SafeLoadAs(in + 4 * 9), + }; shifts = simd_batch{ 0, 0, 8, 0, 0, 4, 0, 12 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 8; // extract 20-bit bundles 16 to 23 - words = simd_batch{ SafeLoad(in + 10), SafeLoad(in + 10) >> 20 | SafeLoad(in + 11) << 12, SafeLoad(in + 11), SafeLoad(in + 11) >> 28 | SafeLoad(in + 12) << 4, SafeLoad(in + 12) >> 16 | SafeLoad(in + 13) << 16, SafeLoad(in + 13), SafeLoad(in + 13) >> 24 | SafeLoad(in + 14) << 8, SafeLoad(in + 14) }; + words = simd_batch{ + SafeLoadAs(in + 4 * 10), + SafeLoadAs(in + 4 * 10) >> 20 | SafeLoadAs(in + 4 * 11) << 12, + SafeLoadAs(in + 4 * 11), + SafeLoadAs(in + 4 * 11) >> 28 | SafeLoadAs(in + 4 * 12) << 4, + SafeLoadAs(in + 4 * 12) >> 16 | SafeLoadAs(in + 4 * 13) << 16, + SafeLoadAs(in + 4 * 13), + SafeLoadAs(in + 4 * 13) >> 24 | SafeLoadAs(in + 4 * 14) << 8, + SafeLoadAs(in + 4 * 14), + }; shifts = simd_batch{ 0, 0, 8, 0, 0, 4, 0, 12 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 8; // extract 20-bit bundles 24 to 31 - words = simd_batch{ SafeLoad(in + 15), SafeLoad(in + 15) >> 20 | SafeLoad(in + 16) << 12, SafeLoad(in + 16), SafeLoad(in + 16) >> 28 | SafeLoad(in + 17) << 4, SafeLoad(in + 17) >> 16 | SafeLoad(in + 18) << 16, SafeLoad(in + 18), SafeLoad(in + 18) >> 24 | SafeLoad(in + 19) << 8, SafeLoad(in + 19) }; + words = simd_batch{ + SafeLoadAs(in + 4 * 15), + SafeLoadAs(in + 4 * 15) >> 20 | SafeLoadAs(in + 4 * 16) << 12, + SafeLoadAs(in + 4 * 16), + SafeLoadAs(in + 4 * 16) >> 28 | SafeLoadAs(in + 4 * 17) << 4, + SafeLoadAs(in + 4 * 17) >> 16 | SafeLoadAs(in + 4 * 18) << 16, + SafeLoadAs(in + 4 * 18), + SafeLoadAs(in + 4 * 18) >> 24 | SafeLoadAs(in + 4 * 19) << 8, + SafeLoadAs(in + 4 * 19), + }; shifts = simd_batch{ 0, 0, 8, 0, 0, 4, 0, 12 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 8; - in += 20; + in += 20 * 4; return in; } -inline static const uint32_t* unpack21_32(const uint32_t* in, uint32_t* out) { +inline static const uint8_t* unpack21_32(const uint8_t* in, uint32_t* out) { + using simd_batch = xsimd::make_sized_batch_t; + uint32_t mask = 0x1fffff; simd_batch masks(mask); @@ -833,38 +2162,76 @@ inline static const uint32_t* unpack21_32(const uint32_t* in, uint32_t* out) { simd_batch results; // extract 21-bit bundles 0 to 7 - words = simd_batch{ SafeLoad(in + 0), SafeLoad(in + 0) >> 21 | SafeLoad(in + 1) << 11, SafeLoad(in + 1), SafeLoad(in + 1) >> 31 | SafeLoad(in + 2) << 1, SafeLoad(in + 2) >> 20 | SafeLoad(in + 3) << 12, SafeLoad(in + 3), SafeLoad(in + 3) >> 30 | SafeLoad(in + 4) << 2, SafeLoad(in + 4) >> 19 | SafeLoad(in + 5) << 13 }; + words = simd_batch{ + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0) >> 21 | SafeLoadAs(in + 4 * 1) << 11, + SafeLoadAs(in + 4 * 1), + SafeLoadAs(in + 4 * 1) >> 31 | SafeLoadAs(in + 4 * 2) << 1, + SafeLoadAs(in + 4 * 2) >> 20 | SafeLoadAs(in + 4 * 3) << 12, + SafeLoadAs(in + 4 * 3), + SafeLoadAs(in + 4 * 3) >> 30 | SafeLoadAs(in + 4 * 4) << 2, + SafeLoadAs(in + 4 * 4) >> 19 | SafeLoadAs(in + 4 * 5) << 13, + }; shifts = simd_batch{ 0, 0, 10, 0, 0, 9, 0, 0 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 8; // extract 21-bit bundles 8 to 15 - words = simd_batch{ SafeLoad(in + 5), SafeLoad(in + 5) >> 29 | SafeLoad(in + 6) << 3, SafeLoad(in + 6) >> 18 | SafeLoad(in + 7) << 14, SafeLoad(in + 7), SafeLoad(in + 7) >> 28 | SafeLoad(in + 8) << 4, SafeLoad(in + 8) >> 17 | SafeLoad(in + 9) << 15, SafeLoad(in + 9), SafeLoad(in + 9) >> 27 | SafeLoad(in + 10) << 5 }; + words = simd_batch{ + SafeLoadAs(in + 4 * 5), + SafeLoadAs(in + 4 * 5) >> 29 | SafeLoadAs(in + 4 * 6) << 3, + SafeLoadAs(in + 4 * 6) >> 18 | SafeLoadAs(in + 4 * 7) << 14, + SafeLoadAs(in + 4 * 7), + SafeLoadAs(in + 4 * 7) >> 28 | SafeLoadAs(in + 4 * 8) << 4, + SafeLoadAs(in + 4 * 8) >> 17 | SafeLoadAs(in + 4 * 9) << 15, + SafeLoadAs(in + 4 * 9), + SafeLoadAs(in + 4 * 9) >> 27 | SafeLoadAs(in + 4 * 10) << 5, + }; shifts = simd_batch{ 8, 0, 0, 7, 0, 0, 6, 0 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 8; // extract 21-bit bundles 16 to 23 - words = simd_batch{ SafeLoad(in + 10) >> 16 | SafeLoad(in + 11) << 16, SafeLoad(in + 11), SafeLoad(in + 11) >> 26 | SafeLoad(in + 12) << 6, SafeLoad(in + 12) >> 15 | SafeLoad(in + 13) << 17, SafeLoad(in + 13), SafeLoad(in + 13) >> 25 | SafeLoad(in + 14) << 7, SafeLoad(in + 14) >> 14 | SafeLoad(in + 15) << 18, SafeLoad(in + 15) }; + words = simd_batch{ + SafeLoadAs(in + 4 * 10) >> 16 | SafeLoadAs(in + 4 * 11) << 16, + SafeLoadAs(in + 4 * 11), + SafeLoadAs(in + 4 * 11) >> 26 | SafeLoadAs(in + 4 * 12) << 6, + SafeLoadAs(in + 4 * 12) >> 15 | SafeLoadAs(in + 4 * 13) << 17, + SafeLoadAs(in + 4 * 13), + SafeLoadAs(in + 4 * 13) >> 25 | SafeLoadAs(in + 4 * 14) << 7, + SafeLoadAs(in + 4 * 14) >> 14 | SafeLoadAs(in + 4 * 15) << 18, + SafeLoadAs(in + 4 * 15), + }; shifts = simd_batch{ 0, 5, 0, 0, 4, 0, 0, 3 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 8; // extract 21-bit bundles 24 to 31 - words = simd_batch{ SafeLoad(in + 15) >> 24 | SafeLoad(in + 16) << 8, SafeLoad(in + 16) >> 13 | SafeLoad(in + 17) << 19, SafeLoad(in + 17), SafeLoad(in + 17) >> 23 | SafeLoad(in + 18) << 9, SafeLoad(in + 18) >> 12 | SafeLoad(in + 19) << 20, SafeLoad(in + 19), SafeLoad(in + 19) >> 22 | SafeLoad(in + 20) << 10, SafeLoad(in + 20) }; + words = simd_batch{ + SafeLoadAs(in + 4 * 15) >> 24 | SafeLoadAs(in + 4 * 16) << 8, + SafeLoadAs(in + 4 * 16) >> 13 | SafeLoadAs(in + 4 * 17) << 19, + SafeLoadAs(in + 4 * 17), + SafeLoadAs(in + 4 * 17) >> 23 | SafeLoadAs(in + 4 * 18) << 9, + SafeLoadAs(in + 4 * 18) >> 12 | SafeLoadAs(in + 4 * 19) << 20, + SafeLoadAs(in + 4 * 19), + SafeLoadAs(in + 4 * 19) >> 22 | SafeLoadAs(in + 4 * 20) << 10, + SafeLoadAs(in + 4 * 20), + }; shifts = simd_batch{ 0, 0, 2, 0, 0, 1, 0, 11 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 8; - in += 21; + in += 21 * 4; return in; } -inline static const uint32_t* unpack22_32(const uint32_t* in, uint32_t* out) { +inline static const uint8_t* unpack22_32(const uint8_t* in, uint32_t* out) { + using simd_batch = xsimd::make_sized_batch_t; + uint32_t mask = 0x3fffff; simd_batch masks(mask); @@ -872,38 +2239,76 @@ inline static const uint32_t* unpack22_32(const uint32_t* in, uint32_t* out) { simd_batch results; // extract 22-bit bundles 0 to 7 - words = simd_batch{ SafeLoad(in + 0), SafeLoad(in + 0) >> 22 | SafeLoad(in + 1) << 10, SafeLoad(in + 1) >> 12 | SafeLoad(in + 2) << 20, SafeLoad(in + 2), SafeLoad(in + 2) >> 24 | SafeLoad(in + 3) << 8, SafeLoad(in + 3) >> 14 | SafeLoad(in + 4) << 18, SafeLoad(in + 4), SafeLoad(in + 4) >> 26 | SafeLoad(in + 5) << 6 }; + words = simd_batch{ + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0) >> 22 | SafeLoadAs(in + 4 * 1) << 10, + SafeLoadAs(in + 4 * 1) >> 12 | SafeLoadAs(in + 4 * 2) << 20, + SafeLoadAs(in + 4 * 2), + SafeLoadAs(in + 4 * 2) >> 24 | SafeLoadAs(in + 4 * 3) << 8, + SafeLoadAs(in + 4 * 3) >> 14 | SafeLoadAs(in + 4 * 4) << 18, + SafeLoadAs(in + 4 * 4), + SafeLoadAs(in + 4 * 4) >> 26 | SafeLoadAs(in + 4 * 5) << 6, + }; shifts = simd_batch{ 0, 0, 0, 2, 0, 0, 4, 0 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 8; // extract 22-bit bundles 8 to 15 - words = simd_batch{ SafeLoad(in + 5) >> 16 | SafeLoad(in + 6) << 16, SafeLoad(in + 6), SafeLoad(in + 6) >> 28 | SafeLoad(in + 7) << 4, SafeLoad(in + 7) >> 18 | SafeLoad(in + 8) << 14, SafeLoad(in + 8), SafeLoad(in + 8) >> 30 | SafeLoad(in + 9) << 2, SafeLoad(in + 9) >> 20 | SafeLoad(in + 10) << 12, SafeLoad(in + 10) }; + words = simd_batch{ + SafeLoadAs(in + 4 * 5) >> 16 | SafeLoadAs(in + 4 * 6) << 16, + SafeLoadAs(in + 4 * 6), + SafeLoadAs(in + 4 * 6) >> 28 | SafeLoadAs(in + 4 * 7) << 4, + SafeLoadAs(in + 4 * 7) >> 18 | SafeLoadAs(in + 4 * 8) << 14, + SafeLoadAs(in + 4 * 8), + SafeLoadAs(in + 4 * 8) >> 30 | SafeLoadAs(in + 4 * 9) << 2, + SafeLoadAs(in + 4 * 9) >> 20 | SafeLoadAs(in + 4 * 10) << 12, + SafeLoadAs(in + 4 * 10), + }; shifts = simd_batch{ 0, 6, 0, 0, 8, 0, 0, 10 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 8; // extract 22-bit bundles 16 to 23 - words = simd_batch{ SafeLoad(in + 11), SafeLoad(in + 11) >> 22 | SafeLoad(in + 12) << 10, SafeLoad(in + 12) >> 12 | SafeLoad(in + 13) << 20, SafeLoad(in + 13), SafeLoad(in + 13) >> 24 | SafeLoad(in + 14) << 8, SafeLoad(in + 14) >> 14 | SafeLoad(in + 15) << 18, SafeLoad(in + 15), SafeLoad(in + 15) >> 26 | SafeLoad(in + 16) << 6 }; + words = simd_batch{ + SafeLoadAs(in + 4 * 11), + SafeLoadAs(in + 4 * 11) >> 22 | SafeLoadAs(in + 4 * 12) << 10, + SafeLoadAs(in + 4 * 12) >> 12 | SafeLoadAs(in + 4 * 13) << 20, + SafeLoadAs(in + 4 * 13), + SafeLoadAs(in + 4 * 13) >> 24 | SafeLoadAs(in + 4 * 14) << 8, + SafeLoadAs(in + 4 * 14) >> 14 | SafeLoadAs(in + 4 * 15) << 18, + SafeLoadAs(in + 4 * 15), + SafeLoadAs(in + 4 * 15) >> 26 | SafeLoadAs(in + 4 * 16) << 6, + }; shifts = simd_batch{ 0, 0, 0, 2, 0, 0, 4, 0 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 8; // extract 22-bit bundles 24 to 31 - words = simd_batch{ SafeLoad(in + 16) >> 16 | SafeLoad(in + 17) << 16, SafeLoad(in + 17), SafeLoad(in + 17) >> 28 | SafeLoad(in + 18) << 4, SafeLoad(in + 18) >> 18 | SafeLoad(in + 19) << 14, SafeLoad(in + 19), SafeLoad(in + 19) >> 30 | SafeLoad(in + 20) << 2, SafeLoad(in + 20) >> 20 | SafeLoad(in + 21) << 12, SafeLoad(in + 21) }; + words = simd_batch{ + SafeLoadAs(in + 4 * 16) >> 16 | SafeLoadAs(in + 4 * 17) << 16, + SafeLoadAs(in + 4 * 17), + SafeLoadAs(in + 4 * 17) >> 28 | SafeLoadAs(in + 4 * 18) << 4, + SafeLoadAs(in + 4 * 18) >> 18 | SafeLoadAs(in + 4 * 19) << 14, + SafeLoadAs(in + 4 * 19), + SafeLoadAs(in + 4 * 19) >> 30 | SafeLoadAs(in + 4 * 20) << 2, + SafeLoadAs(in + 4 * 20) >> 20 | SafeLoadAs(in + 4 * 21) << 12, + SafeLoadAs(in + 4 * 21), + }; shifts = simd_batch{ 0, 6, 0, 0, 8, 0, 0, 10 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 8; - in += 22; + in += 22 * 4; return in; } -inline static const uint32_t* unpack23_32(const uint32_t* in, uint32_t* out) { +inline static const uint8_t* unpack23_32(const uint8_t* in, uint32_t* out) { + using simd_batch = xsimd::make_sized_batch_t; + uint32_t mask = 0x7fffff; simd_batch masks(mask); @@ -911,38 +2316,76 @@ inline static const uint32_t* unpack23_32(const uint32_t* in, uint32_t* out) { simd_batch results; // extract 23-bit bundles 0 to 7 - words = simd_batch{ SafeLoad(in + 0), SafeLoad(in + 0) >> 23 | SafeLoad(in + 1) << 9, SafeLoad(in + 1) >> 14 | SafeLoad(in + 2) << 18, SafeLoad(in + 2), SafeLoad(in + 2) >> 28 | SafeLoad(in + 3) << 4, SafeLoad(in + 3) >> 19 | SafeLoad(in + 4) << 13, SafeLoad(in + 4) >> 10 | SafeLoad(in + 5) << 22, SafeLoad(in + 5) }; + words = simd_batch{ + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0) >> 23 | SafeLoadAs(in + 4 * 1) << 9, + SafeLoadAs(in + 4 * 1) >> 14 | SafeLoadAs(in + 4 * 2) << 18, + SafeLoadAs(in + 4 * 2), + SafeLoadAs(in + 4 * 2) >> 28 | SafeLoadAs(in + 4 * 3) << 4, + SafeLoadAs(in + 4 * 3) >> 19 | SafeLoadAs(in + 4 * 4) << 13, + SafeLoadAs(in + 4 * 4) >> 10 | SafeLoadAs(in + 4 * 5) << 22, + SafeLoadAs(in + 4 * 5), + }; shifts = simd_batch{ 0, 0, 0, 5, 0, 0, 0, 1 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 8; // extract 23-bit bundles 8 to 15 - words = simd_batch{ SafeLoad(in + 5) >> 24 | SafeLoad(in + 6) << 8, SafeLoad(in + 6) >> 15 | SafeLoad(in + 7) << 17, SafeLoad(in + 7), SafeLoad(in + 7) >> 29 | SafeLoad(in + 8) << 3, SafeLoad(in + 8) >> 20 | SafeLoad(in + 9) << 12, SafeLoad(in + 9) >> 11 | SafeLoad(in + 10) << 21, SafeLoad(in + 10), SafeLoad(in + 10) >> 25 | SafeLoad(in + 11) << 7 }; + words = simd_batch{ + SafeLoadAs(in + 4 * 5) >> 24 | SafeLoadAs(in + 4 * 6) << 8, + SafeLoadAs(in + 4 * 6) >> 15 | SafeLoadAs(in + 4 * 7) << 17, + SafeLoadAs(in + 4 * 7), + SafeLoadAs(in + 4 * 7) >> 29 | SafeLoadAs(in + 4 * 8) << 3, + SafeLoadAs(in + 4 * 8) >> 20 | SafeLoadAs(in + 4 * 9) << 12, + SafeLoadAs(in + 4 * 9) >> 11 | SafeLoadAs(in + 4 * 10) << 21, + SafeLoadAs(in + 4 * 10), + SafeLoadAs(in + 4 * 10) >> 25 | SafeLoadAs(in + 4 * 11) << 7, + }; shifts = simd_batch{ 0, 0, 6, 0, 0, 0, 2, 0 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 8; // extract 23-bit bundles 16 to 23 - words = simd_batch{ SafeLoad(in + 11) >> 16 | SafeLoad(in + 12) << 16, SafeLoad(in + 12), SafeLoad(in + 12) >> 30 | SafeLoad(in + 13) << 2, SafeLoad(in + 13) >> 21 | SafeLoad(in + 14) << 11, SafeLoad(in + 14) >> 12 | SafeLoad(in + 15) << 20, SafeLoad(in + 15), SafeLoad(in + 15) >> 26 | SafeLoad(in + 16) << 6, SafeLoad(in + 16) >> 17 | SafeLoad(in + 17) << 15 }; + words = simd_batch{ + SafeLoadAs(in + 4 * 11) >> 16 | SafeLoadAs(in + 4 * 12) << 16, + SafeLoadAs(in + 4 * 12), + SafeLoadAs(in + 4 * 12) >> 30 | SafeLoadAs(in + 4 * 13) << 2, + SafeLoadAs(in + 4 * 13) >> 21 | SafeLoadAs(in + 4 * 14) << 11, + SafeLoadAs(in + 4 * 14) >> 12 | SafeLoadAs(in + 4 * 15) << 20, + SafeLoadAs(in + 4 * 15), + SafeLoadAs(in + 4 * 15) >> 26 | SafeLoadAs(in + 4 * 16) << 6, + SafeLoadAs(in + 4 * 16) >> 17 | SafeLoadAs(in + 4 * 17) << 15, + }; shifts = simd_batch{ 0, 7, 0, 0, 0, 3, 0, 0 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 8; // extract 23-bit bundles 24 to 31 - words = simd_batch{ SafeLoad(in + 17), SafeLoad(in + 17) >> 31 | SafeLoad(in + 18) << 1, SafeLoad(in + 18) >> 22 | SafeLoad(in + 19) << 10, SafeLoad(in + 19) >> 13 | SafeLoad(in + 20) << 19, SafeLoad(in + 20), SafeLoad(in + 20) >> 27 | SafeLoad(in + 21) << 5, SafeLoad(in + 21) >> 18 | SafeLoad(in + 22) << 14, SafeLoad(in + 22) }; + words = simd_batch{ + SafeLoadAs(in + 4 * 17), + SafeLoadAs(in + 4 * 17) >> 31 | SafeLoadAs(in + 4 * 18) << 1, + SafeLoadAs(in + 4 * 18) >> 22 | SafeLoadAs(in + 4 * 19) << 10, + SafeLoadAs(in + 4 * 19) >> 13 | SafeLoadAs(in + 4 * 20) << 19, + SafeLoadAs(in + 4 * 20), + SafeLoadAs(in + 4 * 20) >> 27 | SafeLoadAs(in + 4 * 21) << 5, + SafeLoadAs(in + 4 * 21) >> 18 | SafeLoadAs(in + 4 * 22) << 14, + SafeLoadAs(in + 4 * 22), + }; shifts = simd_batch{ 8, 0, 0, 0, 4, 0, 0, 9 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 8; - in += 23; + in += 23 * 4; return in; } -inline static const uint32_t* unpack24_32(const uint32_t* in, uint32_t* out) { +inline static const uint8_t* unpack24_32(const uint8_t* in, uint32_t* out) { + using simd_batch = xsimd::make_sized_batch_t; + uint32_t mask = 0xffffff; simd_batch masks(mask); @@ -950,38 +2393,76 @@ inline static const uint32_t* unpack24_32(const uint32_t* in, uint32_t* out) { simd_batch results; // extract 24-bit bundles 0 to 7 - words = simd_batch{ SafeLoad(in + 0), SafeLoad(in + 0) >> 24 | SafeLoad(in + 1) << 8, SafeLoad(in + 1) >> 16 | SafeLoad(in + 2) << 16, SafeLoad(in + 2), SafeLoad(in + 3), SafeLoad(in + 3) >> 24 | SafeLoad(in + 4) << 8, SafeLoad(in + 4) >> 16 | SafeLoad(in + 5) << 16, SafeLoad(in + 5) }; + words = simd_batch{ + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0) >> 24 | SafeLoadAs(in + 4 * 1) << 8, + SafeLoadAs(in + 4 * 1) >> 16 | SafeLoadAs(in + 4 * 2) << 16, + SafeLoadAs(in + 4 * 2), + SafeLoadAs(in + 4 * 3), + SafeLoadAs(in + 4 * 3) >> 24 | SafeLoadAs(in + 4 * 4) << 8, + SafeLoadAs(in + 4 * 4) >> 16 | SafeLoadAs(in + 4 * 5) << 16, + SafeLoadAs(in + 4 * 5), + }; shifts = simd_batch{ 0, 0, 0, 8, 0, 0, 0, 8 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 8; // extract 24-bit bundles 8 to 15 - words = simd_batch{ SafeLoad(in + 6), SafeLoad(in + 6) >> 24 | SafeLoad(in + 7) << 8, SafeLoad(in + 7) >> 16 | SafeLoad(in + 8) << 16, SafeLoad(in + 8), SafeLoad(in + 9), SafeLoad(in + 9) >> 24 | SafeLoad(in + 10) << 8, SafeLoad(in + 10) >> 16 | SafeLoad(in + 11) << 16, SafeLoad(in + 11) }; + words = simd_batch{ + SafeLoadAs(in + 4 * 6), + SafeLoadAs(in + 4 * 6) >> 24 | SafeLoadAs(in + 4 * 7) << 8, + SafeLoadAs(in + 4 * 7) >> 16 | SafeLoadAs(in + 4 * 8) << 16, + SafeLoadAs(in + 4 * 8), + SafeLoadAs(in + 4 * 9), + SafeLoadAs(in + 4 * 9) >> 24 | SafeLoadAs(in + 4 * 10) << 8, + SafeLoadAs(in + 4 * 10) >> 16 | SafeLoadAs(in + 4 * 11) << 16, + SafeLoadAs(in + 4 * 11), + }; shifts = simd_batch{ 0, 0, 0, 8, 0, 0, 0, 8 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 8; // extract 24-bit bundles 16 to 23 - words = simd_batch{ SafeLoad(in + 12), SafeLoad(in + 12) >> 24 | SafeLoad(in + 13) << 8, SafeLoad(in + 13) >> 16 | SafeLoad(in + 14) << 16, SafeLoad(in + 14), SafeLoad(in + 15), SafeLoad(in + 15) >> 24 | SafeLoad(in + 16) << 8, SafeLoad(in + 16) >> 16 | SafeLoad(in + 17) << 16, SafeLoad(in + 17) }; + words = simd_batch{ + SafeLoadAs(in + 4 * 12), + SafeLoadAs(in + 4 * 12) >> 24 | SafeLoadAs(in + 4 * 13) << 8, + SafeLoadAs(in + 4 * 13) >> 16 | SafeLoadAs(in + 4 * 14) << 16, + SafeLoadAs(in + 4 * 14), + SafeLoadAs(in + 4 * 15), + SafeLoadAs(in + 4 * 15) >> 24 | SafeLoadAs(in + 4 * 16) << 8, + SafeLoadAs(in + 4 * 16) >> 16 | SafeLoadAs(in + 4 * 17) << 16, + SafeLoadAs(in + 4 * 17), + }; shifts = simd_batch{ 0, 0, 0, 8, 0, 0, 0, 8 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 8; // extract 24-bit bundles 24 to 31 - words = simd_batch{ SafeLoad(in + 18), SafeLoad(in + 18) >> 24 | SafeLoad(in + 19) << 8, SafeLoad(in + 19) >> 16 | SafeLoad(in + 20) << 16, SafeLoad(in + 20), SafeLoad(in + 21), SafeLoad(in + 21) >> 24 | SafeLoad(in + 22) << 8, SafeLoad(in + 22) >> 16 | SafeLoad(in + 23) << 16, SafeLoad(in + 23) }; + words = simd_batch{ + SafeLoadAs(in + 4 * 18), + SafeLoadAs(in + 4 * 18) >> 24 | SafeLoadAs(in + 4 * 19) << 8, + SafeLoadAs(in + 4 * 19) >> 16 | SafeLoadAs(in + 4 * 20) << 16, + SafeLoadAs(in + 4 * 20), + SafeLoadAs(in + 4 * 21), + SafeLoadAs(in + 4 * 21) >> 24 | SafeLoadAs(in + 4 * 22) << 8, + SafeLoadAs(in + 4 * 22) >> 16 | SafeLoadAs(in + 4 * 23) << 16, + SafeLoadAs(in + 4 * 23), + }; shifts = simd_batch{ 0, 0, 0, 8, 0, 0, 0, 8 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 8; - in += 24; + in += 24 * 4; return in; } -inline static const uint32_t* unpack25_32(const uint32_t* in, uint32_t* out) { +inline static const uint8_t* unpack25_32(const uint8_t* in, uint32_t* out) { + using simd_batch = xsimd::make_sized_batch_t; + uint32_t mask = 0x1ffffff; simd_batch masks(mask); @@ -989,38 +2470,76 @@ inline static const uint32_t* unpack25_32(const uint32_t* in, uint32_t* out) { simd_batch results; // extract 25-bit bundles 0 to 7 - words = simd_batch{ SafeLoad(in + 0), SafeLoad(in + 0) >> 25 | SafeLoad(in + 1) << 7, SafeLoad(in + 1) >> 18 | SafeLoad(in + 2) << 14, SafeLoad(in + 2) >> 11 | SafeLoad(in + 3) << 21, SafeLoad(in + 3), SafeLoad(in + 3) >> 29 | SafeLoad(in + 4) << 3, SafeLoad(in + 4) >> 22 | SafeLoad(in + 5) << 10, SafeLoad(in + 5) >> 15 | SafeLoad(in + 6) << 17 }; + words = simd_batch{ + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0) >> 25 | SafeLoadAs(in + 4 * 1) << 7, + SafeLoadAs(in + 4 * 1) >> 18 | SafeLoadAs(in + 4 * 2) << 14, + SafeLoadAs(in + 4 * 2) >> 11 | SafeLoadAs(in + 4 * 3) << 21, + SafeLoadAs(in + 4 * 3), + SafeLoadAs(in + 4 * 3) >> 29 | SafeLoadAs(in + 4 * 4) << 3, + SafeLoadAs(in + 4 * 4) >> 22 | SafeLoadAs(in + 4 * 5) << 10, + SafeLoadAs(in + 4 * 5) >> 15 | SafeLoadAs(in + 4 * 6) << 17, + }; shifts = simd_batch{ 0, 0, 0, 0, 4, 0, 0, 0 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 8; // extract 25-bit bundles 8 to 15 - words = simd_batch{ SafeLoad(in + 6) >> 8 | SafeLoad(in + 7) << 24, SafeLoad(in + 7), SafeLoad(in + 7) >> 26 | SafeLoad(in + 8) << 6, SafeLoad(in + 8) >> 19 | SafeLoad(in + 9) << 13, SafeLoad(in + 9) >> 12 | SafeLoad(in + 10) << 20, SafeLoad(in + 10), SafeLoad(in + 10) >> 30 | SafeLoad(in + 11) << 2, SafeLoad(in + 11) >> 23 | SafeLoad(in + 12) << 9 }; + words = simd_batch{ + SafeLoadAs(in + 4 * 6) >> 8 | SafeLoadAs(in + 4 * 7) << 24, + SafeLoadAs(in + 4 * 7), + SafeLoadAs(in + 4 * 7) >> 26 | SafeLoadAs(in + 4 * 8) << 6, + SafeLoadAs(in + 4 * 8) >> 19 | SafeLoadAs(in + 4 * 9) << 13, + SafeLoadAs(in + 4 * 9) >> 12 | SafeLoadAs(in + 4 * 10) << 20, + SafeLoadAs(in + 4 * 10), + SafeLoadAs(in + 4 * 10) >> 30 | SafeLoadAs(in + 4 * 11) << 2, + SafeLoadAs(in + 4 * 11) >> 23 | SafeLoadAs(in + 4 * 12) << 9, + }; shifts = simd_batch{ 0, 1, 0, 0, 0, 5, 0, 0 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 8; // extract 25-bit bundles 16 to 23 - words = simd_batch{ SafeLoad(in + 12) >> 16 | SafeLoad(in + 13) << 16, SafeLoad(in + 13) >> 9 | SafeLoad(in + 14) << 23, SafeLoad(in + 14), SafeLoad(in + 14) >> 27 | SafeLoad(in + 15) << 5, SafeLoad(in + 15) >> 20 | SafeLoad(in + 16) << 12, SafeLoad(in + 16) >> 13 | SafeLoad(in + 17) << 19, SafeLoad(in + 17), SafeLoad(in + 17) >> 31 | SafeLoad(in + 18) << 1 }; + words = simd_batch{ + SafeLoadAs(in + 4 * 12) >> 16 | SafeLoadAs(in + 4 * 13) << 16, + SafeLoadAs(in + 4 * 13) >> 9 | SafeLoadAs(in + 4 * 14) << 23, + SafeLoadAs(in + 4 * 14), + SafeLoadAs(in + 4 * 14) >> 27 | SafeLoadAs(in + 4 * 15) << 5, + SafeLoadAs(in + 4 * 15) >> 20 | SafeLoadAs(in + 4 * 16) << 12, + SafeLoadAs(in + 4 * 16) >> 13 | SafeLoadAs(in + 4 * 17) << 19, + SafeLoadAs(in + 4 * 17), + SafeLoadAs(in + 4 * 17) >> 31 | SafeLoadAs(in + 4 * 18) << 1, + }; shifts = simd_batch{ 0, 0, 2, 0, 0, 0, 6, 0 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 8; // extract 25-bit bundles 24 to 31 - words = simd_batch{ SafeLoad(in + 18) >> 24 | SafeLoad(in + 19) << 8, SafeLoad(in + 19) >> 17 | SafeLoad(in + 20) << 15, SafeLoad(in + 20) >> 10 | SafeLoad(in + 21) << 22, SafeLoad(in + 21), SafeLoad(in + 21) >> 28 | SafeLoad(in + 22) << 4, SafeLoad(in + 22) >> 21 | SafeLoad(in + 23) << 11, SafeLoad(in + 23) >> 14 | SafeLoad(in + 24) << 18, SafeLoad(in + 24) }; + words = simd_batch{ + SafeLoadAs(in + 4 * 18) >> 24 | SafeLoadAs(in + 4 * 19) << 8, + SafeLoadAs(in + 4 * 19) >> 17 | SafeLoadAs(in + 4 * 20) << 15, + SafeLoadAs(in + 4 * 20) >> 10 | SafeLoadAs(in + 4 * 21) << 22, + SafeLoadAs(in + 4 * 21), + SafeLoadAs(in + 4 * 21) >> 28 | SafeLoadAs(in + 4 * 22) << 4, + SafeLoadAs(in + 4 * 22) >> 21 | SafeLoadAs(in + 4 * 23) << 11, + SafeLoadAs(in + 4 * 23) >> 14 | SafeLoadAs(in + 4 * 24) << 18, + SafeLoadAs(in + 4 * 24), + }; shifts = simd_batch{ 0, 0, 0, 3, 0, 0, 0, 7 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 8; - in += 25; + in += 25 * 4; return in; } -inline static const uint32_t* unpack26_32(const uint32_t* in, uint32_t* out) { +inline static const uint8_t* unpack26_32(const uint8_t* in, uint32_t* out) { + using simd_batch = xsimd::make_sized_batch_t; + uint32_t mask = 0x3ffffff; simd_batch masks(mask); @@ -1028,38 +2547,76 @@ inline static const uint32_t* unpack26_32(const uint32_t* in, uint32_t* out) { simd_batch results; // extract 26-bit bundles 0 to 7 - words = simd_batch{ SafeLoad(in + 0), SafeLoad(in + 0) >> 26 | SafeLoad(in + 1) << 6, SafeLoad(in + 1) >> 20 | SafeLoad(in + 2) << 12, SafeLoad(in + 2) >> 14 | SafeLoad(in + 3) << 18, SafeLoad(in + 3) >> 8 | SafeLoad(in + 4) << 24, SafeLoad(in + 4), SafeLoad(in + 4) >> 28 | SafeLoad(in + 5) << 4, SafeLoad(in + 5) >> 22 | SafeLoad(in + 6) << 10 }; + words = simd_batch{ + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0) >> 26 | SafeLoadAs(in + 4 * 1) << 6, + SafeLoadAs(in + 4 * 1) >> 20 | SafeLoadAs(in + 4 * 2) << 12, + SafeLoadAs(in + 4 * 2) >> 14 | SafeLoadAs(in + 4 * 3) << 18, + SafeLoadAs(in + 4 * 3) >> 8 | SafeLoadAs(in + 4 * 4) << 24, + SafeLoadAs(in + 4 * 4), + SafeLoadAs(in + 4 * 4) >> 28 | SafeLoadAs(in + 4 * 5) << 4, + SafeLoadAs(in + 4 * 5) >> 22 | SafeLoadAs(in + 4 * 6) << 10, + }; shifts = simd_batch{ 0, 0, 0, 0, 0, 2, 0, 0 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 8; // extract 26-bit bundles 8 to 15 - words = simd_batch{ SafeLoad(in + 6) >> 16 | SafeLoad(in + 7) << 16, SafeLoad(in + 7) >> 10 | SafeLoad(in + 8) << 22, SafeLoad(in + 8), SafeLoad(in + 8) >> 30 | SafeLoad(in + 9) << 2, SafeLoad(in + 9) >> 24 | SafeLoad(in + 10) << 8, SafeLoad(in + 10) >> 18 | SafeLoad(in + 11) << 14, SafeLoad(in + 11) >> 12 | SafeLoad(in + 12) << 20, SafeLoad(in + 12) }; + words = simd_batch{ + SafeLoadAs(in + 4 * 6) >> 16 | SafeLoadAs(in + 4 * 7) << 16, + SafeLoadAs(in + 4 * 7) >> 10 | SafeLoadAs(in + 4 * 8) << 22, + SafeLoadAs(in + 4 * 8), + SafeLoadAs(in + 4 * 8) >> 30 | SafeLoadAs(in + 4 * 9) << 2, + SafeLoadAs(in + 4 * 9) >> 24 | SafeLoadAs(in + 4 * 10) << 8, + SafeLoadAs(in + 4 * 10) >> 18 | SafeLoadAs(in + 4 * 11) << 14, + SafeLoadAs(in + 4 * 11) >> 12 | SafeLoadAs(in + 4 * 12) << 20, + SafeLoadAs(in + 4 * 12), + }; shifts = simd_batch{ 0, 0, 4, 0, 0, 0, 0, 6 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 8; // extract 26-bit bundles 16 to 23 - words = simd_batch{ SafeLoad(in + 13), SafeLoad(in + 13) >> 26 | SafeLoad(in + 14) << 6, SafeLoad(in + 14) >> 20 | SafeLoad(in + 15) << 12, SafeLoad(in + 15) >> 14 | SafeLoad(in + 16) << 18, SafeLoad(in + 16) >> 8 | SafeLoad(in + 17) << 24, SafeLoad(in + 17), SafeLoad(in + 17) >> 28 | SafeLoad(in + 18) << 4, SafeLoad(in + 18) >> 22 | SafeLoad(in + 19) << 10 }; + words = simd_batch{ + SafeLoadAs(in + 4 * 13), + SafeLoadAs(in + 4 * 13) >> 26 | SafeLoadAs(in + 4 * 14) << 6, + SafeLoadAs(in + 4 * 14) >> 20 | SafeLoadAs(in + 4 * 15) << 12, + SafeLoadAs(in + 4 * 15) >> 14 | SafeLoadAs(in + 4 * 16) << 18, + SafeLoadAs(in + 4 * 16) >> 8 | SafeLoadAs(in + 4 * 17) << 24, + SafeLoadAs(in + 4 * 17), + SafeLoadAs(in + 4 * 17) >> 28 | SafeLoadAs(in + 4 * 18) << 4, + SafeLoadAs(in + 4 * 18) >> 22 | SafeLoadAs(in + 4 * 19) << 10, + }; shifts = simd_batch{ 0, 0, 0, 0, 0, 2, 0, 0 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 8; // extract 26-bit bundles 24 to 31 - words = simd_batch{ SafeLoad(in + 19) >> 16 | SafeLoad(in + 20) << 16, SafeLoad(in + 20) >> 10 | SafeLoad(in + 21) << 22, SafeLoad(in + 21), SafeLoad(in + 21) >> 30 | SafeLoad(in + 22) << 2, SafeLoad(in + 22) >> 24 | SafeLoad(in + 23) << 8, SafeLoad(in + 23) >> 18 | SafeLoad(in + 24) << 14, SafeLoad(in + 24) >> 12 | SafeLoad(in + 25) << 20, SafeLoad(in + 25) }; + words = simd_batch{ + SafeLoadAs(in + 4 * 19) >> 16 | SafeLoadAs(in + 4 * 20) << 16, + SafeLoadAs(in + 4 * 20) >> 10 | SafeLoadAs(in + 4 * 21) << 22, + SafeLoadAs(in + 4 * 21), + SafeLoadAs(in + 4 * 21) >> 30 | SafeLoadAs(in + 4 * 22) << 2, + SafeLoadAs(in + 4 * 22) >> 24 | SafeLoadAs(in + 4 * 23) << 8, + SafeLoadAs(in + 4 * 23) >> 18 | SafeLoadAs(in + 4 * 24) << 14, + SafeLoadAs(in + 4 * 24) >> 12 | SafeLoadAs(in + 4 * 25) << 20, + SafeLoadAs(in + 4 * 25), + }; shifts = simd_batch{ 0, 0, 4, 0, 0, 0, 0, 6 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 8; - in += 26; + in += 26 * 4; return in; } -inline static const uint32_t* unpack27_32(const uint32_t* in, uint32_t* out) { +inline static const uint8_t* unpack27_32(const uint8_t* in, uint32_t* out) { + using simd_batch = xsimd::make_sized_batch_t; + uint32_t mask = 0x7ffffff; simd_batch masks(mask); @@ -1067,38 +2624,76 @@ inline static const uint32_t* unpack27_32(const uint32_t* in, uint32_t* out) { simd_batch results; // extract 27-bit bundles 0 to 7 - words = simd_batch{ SafeLoad(in + 0), SafeLoad(in + 0) >> 27 | SafeLoad(in + 1) << 5, SafeLoad(in + 1) >> 22 | SafeLoad(in + 2) << 10, SafeLoad(in + 2) >> 17 | SafeLoad(in + 3) << 15, SafeLoad(in + 3) >> 12 | SafeLoad(in + 4) << 20, SafeLoad(in + 4) >> 7 | SafeLoad(in + 5) << 25, SafeLoad(in + 5), SafeLoad(in + 5) >> 29 | SafeLoad(in + 6) << 3 }; + words = simd_batch{ + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0) >> 27 | SafeLoadAs(in + 4 * 1) << 5, + SafeLoadAs(in + 4 * 1) >> 22 | SafeLoadAs(in + 4 * 2) << 10, + SafeLoadAs(in + 4 * 2) >> 17 | SafeLoadAs(in + 4 * 3) << 15, + SafeLoadAs(in + 4 * 3) >> 12 | SafeLoadAs(in + 4 * 4) << 20, + SafeLoadAs(in + 4 * 4) >> 7 | SafeLoadAs(in + 4 * 5) << 25, + SafeLoadAs(in + 4 * 5), + SafeLoadAs(in + 4 * 5) >> 29 | SafeLoadAs(in + 4 * 6) << 3, + }; shifts = simd_batch{ 0, 0, 0, 0, 0, 0, 2, 0 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 8; // extract 27-bit bundles 8 to 15 - words = simd_batch{ SafeLoad(in + 6) >> 24 | SafeLoad(in + 7) << 8, SafeLoad(in + 7) >> 19 | SafeLoad(in + 8) << 13, SafeLoad(in + 8) >> 14 | SafeLoad(in + 9) << 18, SafeLoad(in + 9) >> 9 | SafeLoad(in + 10) << 23, SafeLoad(in + 10), SafeLoad(in + 10) >> 31 | SafeLoad(in + 11) << 1, SafeLoad(in + 11) >> 26 | SafeLoad(in + 12) << 6, SafeLoad(in + 12) >> 21 | SafeLoad(in + 13) << 11 }; + words = simd_batch{ + SafeLoadAs(in + 4 * 6) >> 24 | SafeLoadAs(in + 4 * 7) << 8, + SafeLoadAs(in + 4 * 7) >> 19 | SafeLoadAs(in + 4 * 8) << 13, + SafeLoadAs(in + 4 * 8) >> 14 | SafeLoadAs(in + 4 * 9) << 18, + SafeLoadAs(in + 4 * 9) >> 9 | SafeLoadAs(in + 4 * 10) << 23, + SafeLoadAs(in + 4 * 10), + SafeLoadAs(in + 4 * 10) >> 31 | SafeLoadAs(in + 4 * 11) << 1, + SafeLoadAs(in + 4 * 11) >> 26 | SafeLoadAs(in + 4 * 12) << 6, + SafeLoadAs(in + 4 * 12) >> 21 | SafeLoadAs(in + 4 * 13) << 11, + }; shifts = simd_batch{ 0, 0, 0, 0, 4, 0, 0, 0 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 8; // extract 27-bit bundles 16 to 23 - words = simd_batch{ SafeLoad(in + 13) >> 16 | SafeLoad(in + 14) << 16, SafeLoad(in + 14) >> 11 | SafeLoad(in + 15) << 21, SafeLoad(in + 15) >> 6 | SafeLoad(in + 16) << 26, SafeLoad(in + 16), SafeLoad(in + 16) >> 28 | SafeLoad(in + 17) << 4, SafeLoad(in + 17) >> 23 | SafeLoad(in + 18) << 9, SafeLoad(in + 18) >> 18 | SafeLoad(in + 19) << 14, SafeLoad(in + 19) >> 13 | SafeLoad(in + 20) << 19 }; + words = simd_batch{ + SafeLoadAs(in + 4 * 13) >> 16 | SafeLoadAs(in + 4 * 14) << 16, + SafeLoadAs(in + 4 * 14) >> 11 | SafeLoadAs(in + 4 * 15) << 21, + SafeLoadAs(in + 4 * 15) >> 6 | SafeLoadAs(in + 4 * 16) << 26, + SafeLoadAs(in + 4 * 16), + SafeLoadAs(in + 4 * 16) >> 28 | SafeLoadAs(in + 4 * 17) << 4, + SafeLoadAs(in + 4 * 17) >> 23 | SafeLoadAs(in + 4 * 18) << 9, + SafeLoadAs(in + 4 * 18) >> 18 | SafeLoadAs(in + 4 * 19) << 14, + SafeLoadAs(in + 4 * 19) >> 13 | SafeLoadAs(in + 4 * 20) << 19, + }; shifts = simd_batch{ 0, 0, 0, 1, 0, 0, 0, 0 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 8; // extract 27-bit bundles 24 to 31 - words = simd_batch{ SafeLoad(in + 20) >> 8 | SafeLoad(in + 21) << 24, SafeLoad(in + 21), SafeLoad(in + 21) >> 30 | SafeLoad(in + 22) << 2, SafeLoad(in + 22) >> 25 | SafeLoad(in + 23) << 7, SafeLoad(in + 23) >> 20 | SafeLoad(in + 24) << 12, SafeLoad(in + 24) >> 15 | SafeLoad(in + 25) << 17, SafeLoad(in + 25) >> 10 | SafeLoad(in + 26) << 22, SafeLoad(in + 26) }; + words = simd_batch{ + SafeLoadAs(in + 4 * 20) >> 8 | SafeLoadAs(in + 4 * 21) << 24, + SafeLoadAs(in + 4 * 21), + SafeLoadAs(in + 4 * 21) >> 30 | SafeLoadAs(in + 4 * 22) << 2, + SafeLoadAs(in + 4 * 22) >> 25 | SafeLoadAs(in + 4 * 23) << 7, + SafeLoadAs(in + 4 * 23) >> 20 | SafeLoadAs(in + 4 * 24) << 12, + SafeLoadAs(in + 4 * 24) >> 15 | SafeLoadAs(in + 4 * 25) << 17, + SafeLoadAs(in + 4 * 25) >> 10 | SafeLoadAs(in + 4 * 26) << 22, + SafeLoadAs(in + 4 * 26), + }; shifts = simd_batch{ 0, 3, 0, 0, 0, 0, 0, 5 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 8; - in += 27; + in += 27 * 4; return in; } -inline static const uint32_t* unpack28_32(const uint32_t* in, uint32_t* out) { +inline static const uint8_t* unpack28_32(const uint8_t* in, uint32_t* out) { + using simd_batch = xsimd::make_sized_batch_t; + uint32_t mask = 0xfffffff; simd_batch masks(mask); @@ -1106,38 +2701,76 @@ inline static const uint32_t* unpack28_32(const uint32_t* in, uint32_t* out) { simd_batch results; // extract 28-bit bundles 0 to 7 - words = simd_batch{ SafeLoad(in + 0), SafeLoad(in + 0) >> 28 | SafeLoad(in + 1) << 4, SafeLoad(in + 1) >> 24 | SafeLoad(in + 2) << 8, SafeLoad(in + 2) >> 20 | SafeLoad(in + 3) << 12, SafeLoad(in + 3) >> 16 | SafeLoad(in + 4) << 16, SafeLoad(in + 4) >> 12 | SafeLoad(in + 5) << 20, SafeLoad(in + 5) >> 8 | SafeLoad(in + 6) << 24, SafeLoad(in + 6) }; + words = simd_batch{ + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0) >> 28 | SafeLoadAs(in + 4 * 1) << 4, + SafeLoadAs(in + 4 * 1) >> 24 | SafeLoadAs(in + 4 * 2) << 8, + SafeLoadAs(in + 4 * 2) >> 20 | SafeLoadAs(in + 4 * 3) << 12, + SafeLoadAs(in + 4 * 3) >> 16 | SafeLoadAs(in + 4 * 4) << 16, + SafeLoadAs(in + 4 * 4) >> 12 | SafeLoadAs(in + 4 * 5) << 20, + SafeLoadAs(in + 4 * 5) >> 8 | SafeLoadAs(in + 4 * 6) << 24, + SafeLoadAs(in + 4 * 6), + }; shifts = simd_batch{ 0, 0, 0, 0, 0, 0, 0, 4 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 8; // extract 28-bit bundles 8 to 15 - words = simd_batch{ SafeLoad(in + 7), SafeLoad(in + 7) >> 28 | SafeLoad(in + 8) << 4, SafeLoad(in + 8) >> 24 | SafeLoad(in + 9) << 8, SafeLoad(in + 9) >> 20 | SafeLoad(in + 10) << 12, SafeLoad(in + 10) >> 16 | SafeLoad(in + 11) << 16, SafeLoad(in + 11) >> 12 | SafeLoad(in + 12) << 20, SafeLoad(in + 12) >> 8 | SafeLoad(in + 13) << 24, SafeLoad(in + 13) }; + words = simd_batch{ + SafeLoadAs(in + 4 * 7), + SafeLoadAs(in + 4 * 7) >> 28 | SafeLoadAs(in + 4 * 8) << 4, + SafeLoadAs(in + 4 * 8) >> 24 | SafeLoadAs(in + 4 * 9) << 8, + SafeLoadAs(in + 4 * 9) >> 20 | SafeLoadAs(in + 4 * 10) << 12, + SafeLoadAs(in + 4 * 10) >> 16 | SafeLoadAs(in + 4 * 11) << 16, + SafeLoadAs(in + 4 * 11) >> 12 | SafeLoadAs(in + 4 * 12) << 20, + SafeLoadAs(in + 4 * 12) >> 8 | SafeLoadAs(in + 4 * 13) << 24, + SafeLoadAs(in + 4 * 13), + }; shifts = simd_batch{ 0, 0, 0, 0, 0, 0, 0, 4 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 8; // extract 28-bit bundles 16 to 23 - words = simd_batch{ SafeLoad(in + 14), SafeLoad(in + 14) >> 28 | SafeLoad(in + 15) << 4, SafeLoad(in + 15) >> 24 | SafeLoad(in + 16) << 8, SafeLoad(in + 16) >> 20 | SafeLoad(in + 17) << 12, SafeLoad(in + 17) >> 16 | SafeLoad(in + 18) << 16, SafeLoad(in + 18) >> 12 | SafeLoad(in + 19) << 20, SafeLoad(in + 19) >> 8 | SafeLoad(in + 20) << 24, SafeLoad(in + 20) }; + words = simd_batch{ + SafeLoadAs(in + 4 * 14), + SafeLoadAs(in + 4 * 14) >> 28 | SafeLoadAs(in + 4 * 15) << 4, + SafeLoadAs(in + 4 * 15) >> 24 | SafeLoadAs(in + 4 * 16) << 8, + SafeLoadAs(in + 4 * 16) >> 20 | SafeLoadAs(in + 4 * 17) << 12, + SafeLoadAs(in + 4 * 17) >> 16 | SafeLoadAs(in + 4 * 18) << 16, + SafeLoadAs(in + 4 * 18) >> 12 | SafeLoadAs(in + 4 * 19) << 20, + SafeLoadAs(in + 4 * 19) >> 8 | SafeLoadAs(in + 4 * 20) << 24, + SafeLoadAs(in + 4 * 20), + }; shifts = simd_batch{ 0, 0, 0, 0, 0, 0, 0, 4 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 8; // extract 28-bit bundles 24 to 31 - words = simd_batch{ SafeLoad(in + 21), SafeLoad(in + 21) >> 28 | SafeLoad(in + 22) << 4, SafeLoad(in + 22) >> 24 | SafeLoad(in + 23) << 8, SafeLoad(in + 23) >> 20 | SafeLoad(in + 24) << 12, SafeLoad(in + 24) >> 16 | SafeLoad(in + 25) << 16, SafeLoad(in + 25) >> 12 | SafeLoad(in + 26) << 20, SafeLoad(in + 26) >> 8 | SafeLoad(in + 27) << 24, SafeLoad(in + 27) }; + words = simd_batch{ + SafeLoadAs(in + 4 * 21), + SafeLoadAs(in + 4 * 21) >> 28 | SafeLoadAs(in + 4 * 22) << 4, + SafeLoadAs(in + 4 * 22) >> 24 | SafeLoadAs(in + 4 * 23) << 8, + SafeLoadAs(in + 4 * 23) >> 20 | SafeLoadAs(in + 4 * 24) << 12, + SafeLoadAs(in + 4 * 24) >> 16 | SafeLoadAs(in + 4 * 25) << 16, + SafeLoadAs(in + 4 * 25) >> 12 | SafeLoadAs(in + 4 * 26) << 20, + SafeLoadAs(in + 4 * 26) >> 8 | SafeLoadAs(in + 4 * 27) << 24, + SafeLoadAs(in + 4 * 27), + }; shifts = simd_batch{ 0, 0, 0, 0, 0, 0, 0, 4 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 8; - in += 28; + in += 28 * 4; return in; } -inline static const uint32_t* unpack29_32(const uint32_t* in, uint32_t* out) { +inline static const uint8_t* unpack29_32(const uint8_t* in, uint32_t* out) { + using simd_batch = xsimd::make_sized_batch_t; + uint32_t mask = 0x1fffffff; simd_batch masks(mask); @@ -1145,38 +2778,76 @@ inline static const uint32_t* unpack29_32(const uint32_t* in, uint32_t* out) { simd_batch results; // extract 29-bit bundles 0 to 7 - words = simd_batch{ SafeLoad(in + 0), SafeLoad(in + 0) >> 29 | SafeLoad(in + 1) << 3, SafeLoad(in + 1) >> 26 | SafeLoad(in + 2) << 6, SafeLoad(in + 2) >> 23 | SafeLoad(in + 3) << 9, SafeLoad(in + 3) >> 20 | SafeLoad(in + 4) << 12, SafeLoad(in + 4) >> 17 | SafeLoad(in + 5) << 15, SafeLoad(in + 5) >> 14 | SafeLoad(in + 6) << 18, SafeLoad(in + 6) >> 11 | SafeLoad(in + 7) << 21 }; + words = simd_batch{ + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0) >> 29 | SafeLoadAs(in + 4 * 1) << 3, + SafeLoadAs(in + 4 * 1) >> 26 | SafeLoadAs(in + 4 * 2) << 6, + SafeLoadAs(in + 4 * 2) >> 23 | SafeLoadAs(in + 4 * 3) << 9, + SafeLoadAs(in + 4 * 3) >> 20 | SafeLoadAs(in + 4 * 4) << 12, + SafeLoadAs(in + 4 * 4) >> 17 | SafeLoadAs(in + 4 * 5) << 15, + SafeLoadAs(in + 4 * 5) >> 14 | SafeLoadAs(in + 4 * 6) << 18, + SafeLoadAs(in + 4 * 6) >> 11 | SafeLoadAs(in + 4 * 7) << 21, + }; shifts = simd_batch{ 0, 0, 0, 0, 0, 0, 0, 0 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 8; // extract 29-bit bundles 8 to 15 - words = simd_batch{ SafeLoad(in + 7) >> 8 | SafeLoad(in + 8) << 24, SafeLoad(in + 8) >> 5 | SafeLoad(in + 9) << 27, SafeLoad(in + 9), SafeLoad(in + 9) >> 31 | SafeLoad(in + 10) << 1, SafeLoad(in + 10) >> 28 | SafeLoad(in + 11) << 4, SafeLoad(in + 11) >> 25 | SafeLoad(in + 12) << 7, SafeLoad(in + 12) >> 22 | SafeLoad(in + 13) << 10, SafeLoad(in + 13) >> 19 | SafeLoad(in + 14) << 13 }; + words = simd_batch{ + SafeLoadAs(in + 4 * 7) >> 8 | SafeLoadAs(in + 4 * 8) << 24, + SafeLoadAs(in + 4 * 8) >> 5 | SafeLoadAs(in + 4 * 9) << 27, + SafeLoadAs(in + 4 * 9), + SafeLoadAs(in + 4 * 9) >> 31 | SafeLoadAs(in + 4 * 10) << 1, + SafeLoadAs(in + 4 * 10) >> 28 | SafeLoadAs(in + 4 * 11) << 4, + SafeLoadAs(in + 4 * 11) >> 25 | SafeLoadAs(in + 4 * 12) << 7, + SafeLoadAs(in + 4 * 12) >> 22 | SafeLoadAs(in + 4 * 13) << 10, + SafeLoadAs(in + 4 * 13) >> 19 | SafeLoadAs(in + 4 * 14) << 13, + }; shifts = simd_batch{ 0, 0, 2, 0, 0, 0, 0, 0 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 8; // extract 29-bit bundles 16 to 23 - words = simd_batch{ SafeLoad(in + 14) >> 16 | SafeLoad(in + 15) << 16, SafeLoad(in + 15) >> 13 | SafeLoad(in + 16) << 19, SafeLoad(in + 16) >> 10 | SafeLoad(in + 17) << 22, SafeLoad(in + 17) >> 7 | SafeLoad(in + 18) << 25, SafeLoad(in + 18) >> 4 | SafeLoad(in + 19) << 28, SafeLoad(in + 19), SafeLoad(in + 19) >> 30 | SafeLoad(in + 20) << 2, SafeLoad(in + 20) >> 27 | SafeLoad(in + 21) << 5 }; + words = simd_batch{ + SafeLoadAs(in + 4 * 14) >> 16 | SafeLoadAs(in + 4 * 15) << 16, + SafeLoadAs(in + 4 * 15) >> 13 | SafeLoadAs(in + 4 * 16) << 19, + SafeLoadAs(in + 4 * 16) >> 10 | SafeLoadAs(in + 4 * 17) << 22, + SafeLoadAs(in + 4 * 17) >> 7 | SafeLoadAs(in + 4 * 18) << 25, + SafeLoadAs(in + 4 * 18) >> 4 | SafeLoadAs(in + 4 * 19) << 28, + SafeLoadAs(in + 4 * 19), + SafeLoadAs(in + 4 * 19) >> 30 | SafeLoadAs(in + 4 * 20) << 2, + SafeLoadAs(in + 4 * 20) >> 27 | SafeLoadAs(in + 4 * 21) << 5, + }; shifts = simd_batch{ 0, 0, 0, 0, 0, 1, 0, 0 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 8; // extract 29-bit bundles 24 to 31 - words = simd_batch{ SafeLoad(in + 21) >> 24 | SafeLoad(in + 22) << 8, SafeLoad(in + 22) >> 21 | SafeLoad(in + 23) << 11, SafeLoad(in + 23) >> 18 | SafeLoad(in + 24) << 14, SafeLoad(in + 24) >> 15 | SafeLoad(in + 25) << 17, SafeLoad(in + 25) >> 12 | SafeLoad(in + 26) << 20, SafeLoad(in + 26) >> 9 | SafeLoad(in + 27) << 23, SafeLoad(in + 27) >> 6 | SafeLoad(in + 28) << 26, SafeLoad(in + 28) }; + words = simd_batch{ + SafeLoadAs(in + 4 * 21) >> 24 | SafeLoadAs(in + 4 * 22) << 8, + SafeLoadAs(in + 4 * 22) >> 21 | SafeLoadAs(in + 4 * 23) << 11, + SafeLoadAs(in + 4 * 23) >> 18 | SafeLoadAs(in + 4 * 24) << 14, + SafeLoadAs(in + 4 * 24) >> 15 | SafeLoadAs(in + 4 * 25) << 17, + SafeLoadAs(in + 4 * 25) >> 12 | SafeLoadAs(in + 4 * 26) << 20, + SafeLoadAs(in + 4 * 26) >> 9 | SafeLoadAs(in + 4 * 27) << 23, + SafeLoadAs(in + 4 * 27) >> 6 | SafeLoadAs(in + 4 * 28) << 26, + SafeLoadAs(in + 4 * 28), + }; shifts = simd_batch{ 0, 0, 0, 0, 0, 0, 0, 3 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 8; - in += 29; + in += 29 * 4; return in; } -inline static const uint32_t* unpack30_32(const uint32_t* in, uint32_t* out) { +inline static const uint8_t* unpack30_32(const uint8_t* in, uint32_t* out) { + using simd_batch = xsimd::make_sized_batch_t; + uint32_t mask = 0x3fffffff; simd_batch masks(mask); @@ -1184,38 +2855,76 @@ inline static const uint32_t* unpack30_32(const uint32_t* in, uint32_t* out) { simd_batch results; // extract 30-bit bundles 0 to 7 - words = simd_batch{ SafeLoad(in + 0), SafeLoad(in + 0) >> 30 | SafeLoad(in + 1) << 2, SafeLoad(in + 1) >> 28 | SafeLoad(in + 2) << 4, SafeLoad(in + 2) >> 26 | SafeLoad(in + 3) << 6, SafeLoad(in + 3) >> 24 | SafeLoad(in + 4) << 8, SafeLoad(in + 4) >> 22 | SafeLoad(in + 5) << 10, SafeLoad(in + 5) >> 20 | SafeLoad(in + 6) << 12, SafeLoad(in + 6) >> 18 | SafeLoad(in + 7) << 14 }; + words = simd_batch{ + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0) >> 30 | SafeLoadAs(in + 4 * 1) << 2, + SafeLoadAs(in + 4 * 1) >> 28 | SafeLoadAs(in + 4 * 2) << 4, + SafeLoadAs(in + 4 * 2) >> 26 | SafeLoadAs(in + 4 * 3) << 6, + SafeLoadAs(in + 4 * 3) >> 24 | SafeLoadAs(in + 4 * 4) << 8, + SafeLoadAs(in + 4 * 4) >> 22 | SafeLoadAs(in + 4 * 5) << 10, + SafeLoadAs(in + 4 * 5) >> 20 | SafeLoadAs(in + 4 * 6) << 12, + SafeLoadAs(in + 4 * 6) >> 18 | SafeLoadAs(in + 4 * 7) << 14, + }; shifts = simd_batch{ 0, 0, 0, 0, 0, 0, 0, 0 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 8; // extract 30-bit bundles 8 to 15 - words = simd_batch{ SafeLoad(in + 7) >> 16 | SafeLoad(in + 8) << 16, SafeLoad(in + 8) >> 14 | SafeLoad(in + 9) << 18, SafeLoad(in + 9) >> 12 | SafeLoad(in + 10) << 20, SafeLoad(in + 10) >> 10 | SafeLoad(in + 11) << 22, SafeLoad(in + 11) >> 8 | SafeLoad(in + 12) << 24, SafeLoad(in + 12) >> 6 | SafeLoad(in + 13) << 26, SafeLoad(in + 13) >> 4 | SafeLoad(in + 14) << 28, SafeLoad(in + 14) }; + words = simd_batch{ + SafeLoadAs(in + 4 * 7) >> 16 | SafeLoadAs(in + 4 * 8) << 16, + SafeLoadAs(in + 4 * 8) >> 14 | SafeLoadAs(in + 4 * 9) << 18, + SafeLoadAs(in + 4 * 9) >> 12 | SafeLoadAs(in + 4 * 10) << 20, + SafeLoadAs(in + 4 * 10) >> 10 | SafeLoadAs(in + 4 * 11) << 22, + SafeLoadAs(in + 4 * 11) >> 8 | SafeLoadAs(in + 4 * 12) << 24, + SafeLoadAs(in + 4 * 12) >> 6 | SafeLoadAs(in + 4 * 13) << 26, + SafeLoadAs(in + 4 * 13) >> 4 | SafeLoadAs(in + 4 * 14) << 28, + SafeLoadAs(in + 4 * 14), + }; shifts = simd_batch{ 0, 0, 0, 0, 0, 0, 0, 2 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 8; // extract 30-bit bundles 16 to 23 - words = simd_batch{ SafeLoad(in + 15), SafeLoad(in + 15) >> 30 | SafeLoad(in + 16) << 2, SafeLoad(in + 16) >> 28 | SafeLoad(in + 17) << 4, SafeLoad(in + 17) >> 26 | SafeLoad(in + 18) << 6, SafeLoad(in + 18) >> 24 | SafeLoad(in + 19) << 8, SafeLoad(in + 19) >> 22 | SafeLoad(in + 20) << 10, SafeLoad(in + 20) >> 20 | SafeLoad(in + 21) << 12, SafeLoad(in + 21) >> 18 | SafeLoad(in + 22) << 14 }; + words = simd_batch{ + SafeLoadAs(in + 4 * 15), + SafeLoadAs(in + 4 * 15) >> 30 | SafeLoadAs(in + 4 * 16) << 2, + SafeLoadAs(in + 4 * 16) >> 28 | SafeLoadAs(in + 4 * 17) << 4, + SafeLoadAs(in + 4 * 17) >> 26 | SafeLoadAs(in + 4 * 18) << 6, + SafeLoadAs(in + 4 * 18) >> 24 | SafeLoadAs(in + 4 * 19) << 8, + SafeLoadAs(in + 4 * 19) >> 22 | SafeLoadAs(in + 4 * 20) << 10, + SafeLoadAs(in + 4 * 20) >> 20 | SafeLoadAs(in + 4 * 21) << 12, + SafeLoadAs(in + 4 * 21) >> 18 | SafeLoadAs(in + 4 * 22) << 14, + }; shifts = simd_batch{ 0, 0, 0, 0, 0, 0, 0, 0 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 8; // extract 30-bit bundles 24 to 31 - words = simd_batch{ SafeLoad(in + 22) >> 16 | SafeLoad(in + 23) << 16, SafeLoad(in + 23) >> 14 | SafeLoad(in + 24) << 18, SafeLoad(in + 24) >> 12 | SafeLoad(in + 25) << 20, SafeLoad(in + 25) >> 10 | SafeLoad(in + 26) << 22, SafeLoad(in + 26) >> 8 | SafeLoad(in + 27) << 24, SafeLoad(in + 27) >> 6 | SafeLoad(in + 28) << 26, SafeLoad(in + 28) >> 4 | SafeLoad(in + 29) << 28, SafeLoad(in + 29) }; + words = simd_batch{ + SafeLoadAs(in + 4 * 22) >> 16 | SafeLoadAs(in + 4 * 23) << 16, + SafeLoadAs(in + 4 * 23) >> 14 | SafeLoadAs(in + 4 * 24) << 18, + SafeLoadAs(in + 4 * 24) >> 12 | SafeLoadAs(in + 4 * 25) << 20, + SafeLoadAs(in + 4 * 25) >> 10 | SafeLoadAs(in + 4 * 26) << 22, + SafeLoadAs(in + 4 * 26) >> 8 | SafeLoadAs(in + 4 * 27) << 24, + SafeLoadAs(in + 4 * 27) >> 6 | SafeLoadAs(in + 4 * 28) << 26, + SafeLoadAs(in + 4 * 28) >> 4 | SafeLoadAs(in + 4 * 29) << 28, + SafeLoadAs(in + 4 * 29), + }; shifts = simd_batch{ 0, 0, 0, 0, 0, 0, 0, 2 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 8; - in += 30; + in += 30 * 4; return in; } -inline static const uint32_t* unpack31_32(const uint32_t* in, uint32_t* out) { +inline static const uint8_t* unpack31_32(const uint8_t* in, uint32_t* out) { + using simd_batch = xsimd::make_sized_batch_t; + uint32_t mask = 0x7fffffff; simd_batch masks(mask); @@ -1223,40 +2932,76 @@ inline static const uint32_t* unpack31_32(const uint32_t* in, uint32_t* out) { simd_batch results; // extract 31-bit bundles 0 to 7 - words = simd_batch{ SafeLoad(in + 0), SafeLoad(in + 0) >> 31 | SafeLoad(in + 1) << 1, SafeLoad(in + 1) >> 30 | SafeLoad(in + 2) << 2, SafeLoad(in + 2) >> 29 | SafeLoad(in + 3) << 3, SafeLoad(in + 3) >> 28 | SafeLoad(in + 4) << 4, SafeLoad(in + 4) >> 27 | SafeLoad(in + 5) << 5, SafeLoad(in + 5) >> 26 | SafeLoad(in + 6) << 6, SafeLoad(in + 6) >> 25 | SafeLoad(in + 7) << 7 }; + words = simd_batch{ + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0) >> 31 | SafeLoadAs(in + 4 * 1) << 1, + SafeLoadAs(in + 4 * 1) >> 30 | SafeLoadAs(in + 4 * 2) << 2, + SafeLoadAs(in + 4 * 2) >> 29 | SafeLoadAs(in + 4 * 3) << 3, + SafeLoadAs(in + 4 * 3) >> 28 | SafeLoadAs(in + 4 * 4) << 4, + SafeLoadAs(in + 4 * 4) >> 27 | SafeLoadAs(in + 4 * 5) << 5, + SafeLoadAs(in + 4 * 5) >> 26 | SafeLoadAs(in + 4 * 6) << 6, + SafeLoadAs(in + 4 * 6) >> 25 | SafeLoadAs(in + 4 * 7) << 7, + }; shifts = simd_batch{ 0, 0, 0, 0, 0, 0, 0, 0 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 8; // extract 31-bit bundles 8 to 15 - words = simd_batch{ SafeLoad(in + 7) >> 24 | SafeLoad(in + 8) << 8, SafeLoad(in + 8) >> 23 | SafeLoad(in + 9) << 9, SafeLoad(in + 9) >> 22 | SafeLoad(in + 10) << 10, SafeLoad(in + 10) >> 21 | SafeLoad(in + 11) << 11, SafeLoad(in + 11) >> 20 | SafeLoad(in + 12) << 12, SafeLoad(in + 12) >> 19 | SafeLoad(in + 13) << 13, SafeLoad(in + 13) >> 18 | SafeLoad(in + 14) << 14, SafeLoad(in + 14) >> 17 | SafeLoad(in + 15) << 15 }; + words = simd_batch{ + SafeLoadAs(in + 4 * 7) >> 24 | SafeLoadAs(in + 4 * 8) << 8, + SafeLoadAs(in + 4 * 8) >> 23 | SafeLoadAs(in + 4 * 9) << 9, + SafeLoadAs(in + 4 * 9) >> 22 | SafeLoadAs(in + 4 * 10) << 10, + SafeLoadAs(in + 4 * 10) >> 21 | SafeLoadAs(in + 4 * 11) << 11, + SafeLoadAs(in + 4 * 11) >> 20 | SafeLoadAs(in + 4 * 12) << 12, + SafeLoadAs(in + 4 * 12) >> 19 | SafeLoadAs(in + 4 * 13) << 13, + SafeLoadAs(in + 4 * 13) >> 18 | SafeLoadAs(in + 4 * 14) << 14, + SafeLoadAs(in + 4 * 14) >> 17 | SafeLoadAs(in + 4 * 15) << 15, + }; shifts = simd_batch{ 0, 0, 0, 0, 0, 0, 0, 0 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 8; // extract 31-bit bundles 16 to 23 - words = simd_batch{ SafeLoad(in + 15) >> 16 | SafeLoad(in + 16) << 16, SafeLoad(in + 16) >> 15 | SafeLoad(in + 17) << 17, SafeLoad(in + 17) >> 14 | SafeLoad(in + 18) << 18, SafeLoad(in + 18) >> 13 | SafeLoad(in + 19) << 19, SafeLoad(in + 19) >> 12 | SafeLoad(in + 20) << 20, SafeLoad(in + 20) >> 11 | SafeLoad(in + 21) << 21, SafeLoad(in + 21) >> 10 | SafeLoad(in + 22) << 22, SafeLoad(in + 22) >> 9 | SafeLoad(in + 23) << 23 }; + words = simd_batch{ + SafeLoadAs(in + 4 * 15) >> 16 | SafeLoadAs(in + 4 * 16) << 16, + SafeLoadAs(in + 4 * 16) >> 15 | SafeLoadAs(in + 4 * 17) << 17, + SafeLoadAs(in + 4 * 17) >> 14 | SafeLoadAs(in + 4 * 18) << 18, + SafeLoadAs(in + 4 * 18) >> 13 | SafeLoadAs(in + 4 * 19) << 19, + SafeLoadAs(in + 4 * 19) >> 12 | SafeLoadAs(in + 4 * 20) << 20, + SafeLoadAs(in + 4 * 20) >> 11 | SafeLoadAs(in + 4 * 21) << 21, + SafeLoadAs(in + 4 * 21) >> 10 | SafeLoadAs(in + 4 * 22) << 22, + SafeLoadAs(in + 4 * 22) >> 9 | SafeLoadAs(in + 4 * 23) << 23, + }; shifts = simd_batch{ 0, 0, 0, 0, 0, 0, 0, 0 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 8; // extract 31-bit bundles 24 to 31 - words = simd_batch{ SafeLoad(in + 23) >> 8 | SafeLoad(in + 24) << 24, SafeLoad(in + 24) >> 7 | SafeLoad(in + 25) << 25, SafeLoad(in + 25) >> 6 | SafeLoad(in + 26) << 26, SafeLoad(in + 26) >> 5 | SafeLoad(in + 27) << 27, SafeLoad(in + 27) >> 4 | SafeLoad(in + 28) << 28, SafeLoad(in + 28) >> 3 | SafeLoad(in + 29) << 29, SafeLoad(in + 29) >> 2 | SafeLoad(in + 30) << 30, SafeLoad(in + 30) }; + words = simd_batch{ + SafeLoadAs(in + 4 * 23) >> 8 | SafeLoadAs(in + 4 * 24) << 24, + SafeLoadAs(in + 4 * 24) >> 7 | SafeLoadAs(in + 4 * 25) << 25, + SafeLoadAs(in + 4 * 25) >> 6 | SafeLoadAs(in + 4 * 26) << 26, + SafeLoadAs(in + 4 * 26) >> 5 | SafeLoadAs(in + 4 * 27) << 27, + SafeLoadAs(in + 4 * 27) >> 4 | SafeLoadAs(in + 4 * 28) << 28, + SafeLoadAs(in + 4 * 28) >> 3 | SafeLoadAs(in + 4 * 29) << 29, + SafeLoadAs(in + 4 * 29) >> 2 | SafeLoadAs(in + 4 * 30) << 30, + SafeLoadAs(in + 4 * 30), + }; shifts = simd_batch{ 0, 0, 0, 0, 0, 0, 0, 1 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 8; - in += 31; + in += 31 * 4; return in; } -inline static const uint32_t* unpack32_32(const uint32_t* in, uint32_t* out) { - memcpy(out, in, 32 * sizeof(*out)); - in += 32; +inline static const uint8_t* unpack32_32(const uint8_t* in, uint32_t* out) { + std::memcpy(out, in, 32 * sizeof(*out)); + in += 4 * 32; out += 32; return in; @@ -1265,6 +3010,5 @@ inline static const uint32_t* unpack32_32(const uint32_t* in, uint32_t* out) { }; // struct UnpackBits256 } // namespace -} // namespace internal -} // namespace arrow +} // namespace arrow::internal diff --git a/cpp/src/arrow/util/bpacking_simd512_generated_internal.h b/cpp/src/arrow/util/bpacking_simd512_generated_internal.h index 4f2aeaeeb4b..da53ddfd381 100644 --- a/cpp/src/arrow/util/bpacking_simd512_generated_internal.h +++ b/cpp/src/arrow/util/bpacking_simd512_generated_internal.h @@ -27,25 +27,594 @@ #include "arrow/util/dispatch_internal.h" #include "arrow/util/ubsan.h" -namespace arrow { -namespace internal { +namespace arrow::internal { namespace { -using ::arrow::util::SafeLoad; +using ::arrow::util::SafeLoadAs; template struct UnpackBits512 { -using simd_batch = xsimd::make_sized_batch_t; +inline static const uint8_t* unpack0_16(const uint8_t* in, uint16_t* out) { + std::memset(out, 0x0, 16 * sizeof(*out)); + out += 16; + + return in; +} + +inline static const uint8_t* unpack1_16(const uint8_t* in, uint16_t* out) { + using simd_batch = xsimd::make_sized_batch_t; + + uint16_t mask = 0x1; + + simd_batch masks(mask); + simd_batch words, shifts; + simd_batch results; + + // extract 1-bit bundles 0 to 31 + words = simd_batch{ + SafeLoadAs(in + 2 * 0), + SafeLoadAs(in + 2 * 0), + SafeLoadAs(in + 2 * 0), + SafeLoadAs(in + 2 * 0), + SafeLoadAs(in + 2 * 0), + SafeLoadAs(in + 2 * 0), + SafeLoadAs(in + 2 * 0), + SafeLoadAs(in + 2 * 0), + SafeLoadAs(in + 2 * 0), + SafeLoadAs(in + 2 * 0), + SafeLoadAs(in + 2 * 0), + SafeLoadAs(in + 2 * 0), + SafeLoadAs(in + 2 * 0), + SafeLoadAs(in + 2 * 0), + SafeLoadAs(in + 2 * 0), + SafeLoadAs(in + 2 * 0), + }; + shifts = simd_batch{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 32; + + in += 1 * 2; + return in; +} + +inline static const uint8_t* unpack2_16(const uint8_t* in, uint16_t* out) { + using simd_batch = xsimd::make_sized_batch_t; + + uint16_t mask = 0x3; + + simd_batch masks(mask); + simd_batch words, shifts; + simd_batch results; + + // extract 2-bit bundles 0 to 31 + words = simd_batch{ + SafeLoadAs(in + 2 * 0), + SafeLoadAs(in + 2 * 0), + SafeLoadAs(in + 2 * 0), + SafeLoadAs(in + 2 * 0), + SafeLoadAs(in + 2 * 0), + SafeLoadAs(in + 2 * 0), + SafeLoadAs(in + 2 * 0), + SafeLoadAs(in + 2 * 0), + SafeLoadAs(in + 2 * 1), + SafeLoadAs(in + 2 * 1), + SafeLoadAs(in + 2 * 1), + SafeLoadAs(in + 2 * 1), + SafeLoadAs(in + 2 * 1), + SafeLoadAs(in + 2 * 1), + SafeLoadAs(in + 2 * 1), + SafeLoadAs(in + 2 * 1), + }; + shifts = simd_batch{ 0, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 32; + + in += 2 * 2; + return in; +} + +inline static const uint8_t* unpack3_16(const uint8_t* in, uint16_t* out) { + using simd_batch = xsimd::make_sized_batch_t; + + uint16_t mask = 0x7; + + simd_batch masks(mask); + simd_batch words, shifts; + simd_batch results; + + // extract 3-bit bundles 0 to 31 + words = simd_batch{ + SafeLoadAs(in + 2 * 0), + SafeLoadAs(in + 2 * 0), + SafeLoadAs(in + 2 * 0), + SafeLoadAs(in + 2 * 0), + SafeLoadAs(in + 2 * 0), + static_cast(SafeLoadAs(in + 2 * 0) >> 15 | SafeLoadAs(in + 2 * 1) << 1), + SafeLoadAs(in + 2 * 1), + SafeLoadAs(in + 2 * 1), + SafeLoadAs(in + 2 * 1), + SafeLoadAs(in + 2 * 1), + static_cast(SafeLoadAs(in + 2 * 1) >> 14 | SafeLoadAs(in + 2 * 2) << 2), + SafeLoadAs(in + 2 * 2), + SafeLoadAs(in + 2 * 2), + SafeLoadAs(in + 2 * 2), + SafeLoadAs(in + 2 * 2), + SafeLoadAs(in + 2 * 2), + }; + shifts = simd_batch{ 0, 3, 6, 9, 12, 0, 2, 5, 8, 11, 0, 1, 4, 7, 10, 13 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 32; + + in += 3 * 2; + return in; +} + +inline static const uint8_t* unpack4_16(const uint8_t* in, uint16_t* out) { + using simd_batch = xsimd::make_sized_batch_t; -inline static const uint32_t* unpack0_32(const uint32_t* in, uint32_t* out) { - memset(out, 0x0, 32 * sizeof(*out)); + uint16_t mask = 0xf; + + simd_batch masks(mask); + simd_batch words, shifts; + simd_batch results; + + // extract 4-bit bundles 0 to 31 + words = simd_batch{ + SafeLoadAs(in + 2 * 0), + SafeLoadAs(in + 2 * 0), + SafeLoadAs(in + 2 * 0), + SafeLoadAs(in + 2 * 0), + SafeLoadAs(in + 2 * 1), + SafeLoadAs(in + 2 * 1), + SafeLoadAs(in + 2 * 1), + SafeLoadAs(in + 2 * 1), + SafeLoadAs(in + 2 * 2), + SafeLoadAs(in + 2 * 2), + SafeLoadAs(in + 2 * 2), + SafeLoadAs(in + 2 * 2), + SafeLoadAs(in + 2 * 3), + SafeLoadAs(in + 2 * 3), + SafeLoadAs(in + 2 * 3), + SafeLoadAs(in + 2 * 3), + }; + shifts = simd_batch{ 0, 4, 8, 12, 0, 4, 8, 12, 0, 4, 8, 12, 0, 4, 8, 12 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); out += 32; + in += 4 * 2; return in; } -inline static const uint32_t* unpack1_32(const uint32_t* in, uint32_t* out) { +inline static const uint8_t* unpack5_16(const uint8_t* in, uint16_t* out) { + using simd_batch = xsimd::make_sized_batch_t; + + uint16_t mask = 0x1f; + + simd_batch masks(mask); + simd_batch words, shifts; + simd_batch results; + + // extract 5-bit bundles 0 to 31 + words = simd_batch{ + SafeLoadAs(in + 2 * 0), + SafeLoadAs(in + 2 * 0), + SafeLoadAs(in + 2 * 0), + static_cast(SafeLoadAs(in + 2 * 0) >> 15 | SafeLoadAs(in + 2 * 1) << 1), + SafeLoadAs(in + 2 * 1), + SafeLoadAs(in + 2 * 1), + static_cast(SafeLoadAs(in + 2 * 1) >> 14 | SafeLoadAs(in + 2 * 2) << 2), + SafeLoadAs(in + 2 * 2), + SafeLoadAs(in + 2 * 2), + static_cast(SafeLoadAs(in + 2 * 2) >> 13 | SafeLoadAs(in + 2 * 3) << 3), + SafeLoadAs(in + 2 * 3), + SafeLoadAs(in + 2 * 3), + static_cast(SafeLoadAs(in + 2 * 3) >> 12 | SafeLoadAs(in + 2 * 4) << 4), + SafeLoadAs(in + 2 * 4), + SafeLoadAs(in + 2 * 4), + SafeLoadAs(in + 2 * 4), + }; + shifts = simd_batch{ 0, 5, 10, 0, 4, 9, 0, 3, 8, 0, 2, 7, 0, 1, 6, 11 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 32; + + in += 5 * 2; + return in; +} + +inline static const uint8_t* unpack6_16(const uint8_t* in, uint16_t* out) { + using simd_batch = xsimd::make_sized_batch_t; + + uint16_t mask = 0x3f; + + simd_batch masks(mask); + simd_batch words, shifts; + simd_batch results; + + // extract 6-bit bundles 0 to 31 + words = simd_batch{ + SafeLoadAs(in + 2 * 0), + SafeLoadAs(in + 2 * 0), + static_cast(SafeLoadAs(in + 2 * 0) >> 12 | SafeLoadAs(in + 2 * 1) << 4), + SafeLoadAs(in + 2 * 1), + SafeLoadAs(in + 2 * 1), + static_cast(SafeLoadAs(in + 2 * 1) >> 14 | SafeLoadAs(in + 2 * 2) << 2), + SafeLoadAs(in + 2 * 2), + SafeLoadAs(in + 2 * 2), + SafeLoadAs(in + 2 * 3), + SafeLoadAs(in + 2 * 3), + static_cast(SafeLoadAs(in + 2 * 3) >> 12 | SafeLoadAs(in + 2 * 4) << 4), + SafeLoadAs(in + 2 * 4), + SafeLoadAs(in + 2 * 4), + static_cast(SafeLoadAs(in + 2 * 4) >> 14 | SafeLoadAs(in + 2 * 5) << 2), + SafeLoadAs(in + 2 * 5), + SafeLoadAs(in + 2 * 5), + }; + shifts = simd_batch{ 0, 6, 0, 2, 8, 0, 4, 10, 0, 6, 0, 2, 8, 0, 4, 10 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 32; + + in += 6 * 2; + return in; +} + +inline static const uint8_t* unpack7_16(const uint8_t* in, uint16_t* out) { + using simd_batch = xsimd::make_sized_batch_t; + + uint16_t mask = 0x7f; + + simd_batch masks(mask); + simd_batch words, shifts; + simd_batch results; + + // extract 7-bit bundles 0 to 31 + words = simd_batch{ + SafeLoadAs(in + 2 * 0), + SafeLoadAs(in + 2 * 0), + static_cast(SafeLoadAs(in + 2 * 0) >> 14 | SafeLoadAs(in + 2 * 1) << 2), + SafeLoadAs(in + 2 * 1), + static_cast(SafeLoadAs(in + 2 * 1) >> 12 | SafeLoadAs(in + 2 * 2) << 4), + SafeLoadAs(in + 2 * 2), + static_cast(SafeLoadAs(in + 2 * 2) >> 10 | SafeLoadAs(in + 2 * 3) << 6), + SafeLoadAs(in + 2 * 3), + SafeLoadAs(in + 2 * 3), + static_cast(SafeLoadAs(in + 2 * 3) >> 15 | SafeLoadAs(in + 2 * 4) << 1), + SafeLoadAs(in + 2 * 4), + static_cast(SafeLoadAs(in + 2 * 4) >> 13 | SafeLoadAs(in + 2 * 5) << 3), + SafeLoadAs(in + 2 * 5), + static_cast(SafeLoadAs(in + 2 * 5) >> 11 | SafeLoadAs(in + 2 * 6) << 5), + SafeLoadAs(in + 2 * 6), + SafeLoadAs(in + 2 * 6), + }; + shifts = simd_batch{ 0, 7, 0, 5, 0, 3, 0, 1, 8, 0, 6, 0, 4, 0, 2, 9 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 32; + + in += 7 * 2; + return in; +} + +inline static const uint8_t* unpack8_16(const uint8_t* in, uint16_t* out) { + using simd_batch = xsimd::make_sized_batch_t; + + uint16_t mask = 0xff; + + simd_batch masks(mask); + simd_batch words, shifts; + simd_batch results; + + // extract 8-bit bundles 0 to 31 + words = simd_batch{ + SafeLoadAs(in + 2 * 0), + SafeLoadAs(in + 2 * 0), + SafeLoadAs(in + 2 * 1), + SafeLoadAs(in + 2 * 1), + SafeLoadAs(in + 2 * 2), + SafeLoadAs(in + 2 * 2), + SafeLoadAs(in + 2 * 3), + SafeLoadAs(in + 2 * 3), + SafeLoadAs(in + 2 * 4), + SafeLoadAs(in + 2 * 4), + SafeLoadAs(in + 2 * 5), + SafeLoadAs(in + 2 * 5), + SafeLoadAs(in + 2 * 6), + SafeLoadAs(in + 2 * 6), + SafeLoadAs(in + 2 * 7), + SafeLoadAs(in + 2 * 7), + }; + shifts = simd_batch{ 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 32; + + in += 8 * 2; + return in; +} + +inline static const uint8_t* unpack9_16(const uint8_t* in, uint16_t* out) { + using simd_batch = xsimd::make_sized_batch_t; + + uint16_t mask = 0x1ff; + + simd_batch masks(mask); + simd_batch words, shifts; + simd_batch results; + + // extract 9-bit bundles 0 to 31 + words = simd_batch{ + SafeLoadAs(in + 2 * 0), + static_cast(SafeLoadAs(in + 2 * 0) >> 9 | SafeLoadAs(in + 2 * 1) << 7), + SafeLoadAs(in + 2 * 1), + static_cast(SafeLoadAs(in + 2 * 1) >> 11 | SafeLoadAs(in + 2 * 2) << 5), + SafeLoadAs(in + 2 * 2), + static_cast(SafeLoadAs(in + 2 * 2) >> 13 | SafeLoadAs(in + 2 * 3) << 3), + SafeLoadAs(in + 2 * 3), + static_cast(SafeLoadAs(in + 2 * 3) >> 15 | SafeLoadAs(in + 2 * 4) << 1), + static_cast(SafeLoadAs(in + 2 * 4) >> 8 | SafeLoadAs(in + 2 * 5) << 8), + SafeLoadAs(in + 2 * 5), + static_cast(SafeLoadAs(in + 2 * 5) >> 10 | SafeLoadAs(in + 2 * 6) << 6), + SafeLoadAs(in + 2 * 6), + static_cast(SafeLoadAs(in + 2 * 6) >> 12 | SafeLoadAs(in + 2 * 7) << 4), + SafeLoadAs(in + 2 * 7), + static_cast(SafeLoadAs(in + 2 * 7) >> 14 | SafeLoadAs(in + 2 * 8) << 2), + SafeLoadAs(in + 2 * 8), + }; + shifts = simd_batch{ 0, 0, 2, 0, 4, 0, 6, 0, 0, 1, 0, 3, 0, 5, 0, 7 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 32; + + in += 9 * 2; + return in; +} + +inline static const uint8_t* unpack10_16(const uint8_t* in, uint16_t* out) { + using simd_batch = xsimd::make_sized_batch_t; + + uint16_t mask = 0x3ff; + + simd_batch masks(mask); + simd_batch words, shifts; + simd_batch results; + + // extract 10-bit bundles 0 to 31 + words = simd_batch{ + SafeLoadAs(in + 2 * 0), + static_cast(SafeLoadAs(in + 2 * 0) >> 10 | SafeLoadAs(in + 2 * 1) << 6), + SafeLoadAs(in + 2 * 1), + static_cast(SafeLoadAs(in + 2 * 1) >> 14 | SafeLoadAs(in + 2 * 2) << 2), + static_cast(SafeLoadAs(in + 2 * 2) >> 8 | SafeLoadAs(in + 2 * 3) << 8), + SafeLoadAs(in + 2 * 3), + static_cast(SafeLoadAs(in + 2 * 3) >> 12 | SafeLoadAs(in + 2 * 4) << 4), + SafeLoadAs(in + 2 * 4), + SafeLoadAs(in + 2 * 5), + static_cast(SafeLoadAs(in + 2 * 5) >> 10 | SafeLoadAs(in + 2 * 6) << 6), + SafeLoadAs(in + 2 * 6), + static_cast(SafeLoadAs(in + 2 * 6) >> 14 | SafeLoadAs(in + 2 * 7) << 2), + static_cast(SafeLoadAs(in + 2 * 7) >> 8 | SafeLoadAs(in + 2 * 8) << 8), + SafeLoadAs(in + 2 * 8), + static_cast(SafeLoadAs(in + 2 * 8) >> 12 | SafeLoadAs(in + 2 * 9) << 4), + SafeLoadAs(in + 2 * 9), + }; + shifts = simd_batch{ 0, 0, 4, 0, 0, 2, 0, 6, 0, 0, 4, 0, 0, 2, 0, 6 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 32; + + in += 10 * 2; + return in; +} + +inline static const uint8_t* unpack11_16(const uint8_t* in, uint16_t* out) { + using simd_batch = xsimd::make_sized_batch_t; + + uint16_t mask = 0x7ff; + + simd_batch masks(mask); + simd_batch words, shifts; + simd_batch results; + + // extract 11-bit bundles 0 to 31 + words = simd_batch{ + SafeLoadAs(in + 2 * 0), + static_cast(SafeLoadAs(in + 2 * 0) >> 11 | SafeLoadAs(in + 2 * 1) << 5), + static_cast(SafeLoadAs(in + 2 * 1) >> 6 | SafeLoadAs(in + 2 * 2) << 10), + SafeLoadAs(in + 2 * 2), + static_cast(SafeLoadAs(in + 2 * 2) >> 12 | SafeLoadAs(in + 2 * 3) << 4), + static_cast(SafeLoadAs(in + 2 * 3) >> 7 | SafeLoadAs(in + 2 * 4) << 9), + SafeLoadAs(in + 2 * 4), + static_cast(SafeLoadAs(in + 2 * 4) >> 13 | SafeLoadAs(in + 2 * 5) << 3), + static_cast(SafeLoadAs(in + 2 * 5) >> 8 | SafeLoadAs(in + 2 * 6) << 8), + SafeLoadAs(in + 2 * 6), + static_cast(SafeLoadAs(in + 2 * 6) >> 14 | SafeLoadAs(in + 2 * 7) << 2), + static_cast(SafeLoadAs(in + 2 * 7) >> 9 | SafeLoadAs(in + 2 * 8) << 7), + SafeLoadAs(in + 2 * 8), + static_cast(SafeLoadAs(in + 2 * 8) >> 15 | SafeLoadAs(in + 2 * 9) << 1), + static_cast(SafeLoadAs(in + 2 * 9) >> 10 | SafeLoadAs(in + 2 * 10) << 6), + SafeLoadAs(in + 2 * 10), + }; + shifts = simd_batch{ 0, 0, 0, 1, 0, 0, 2, 0, 0, 3, 0, 0, 4, 0, 0, 5 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 32; + + in += 11 * 2; + return in; +} + +inline static const uint8_t* unpack12_16(const uint8_t* in, uint16_t* out) { + using simd_batch = xsimd::make_sized_batch_t; + + uint16_t mask = 0xfff; + + simd_batch masks(mask); + simd_batch words, shifts; + simd_batch results; + + // extract 12-bit bundles 0 to 31 + words = simd_batch{ + SafeLoadAs(in + 2 * 0), + static_cast(SafeLoadAs(in + 2 * 0) >> 12 | SafeLoadAs(in + 2 * 1) << 4), + static_cast(SafeLoadAs(in + 2 * 1) >> 8 | SafeLoadAs(in + 2 * 2) << 8), + SafeLoadAs(in + 2 * 2), + SafeLoadAs(in + 2 * 3), + static_cast(SafeLoadAs(in + 2 * 3) >> 12 | SafeLoadAs(in + 2 * 4) << 4), + static_cast(SafeLoadAs(in + 2 * 4) >> 8 | SafeLoadAs(in + 2 * 5) << 8), + SafeLoadAs(in + 2 * 5), + SafeLoadAs(in + 2 * 6), + static_cast(SafeLoadAs(in + 2 * 6) >> 12 | SafeLoadAs(in + 2 * 7) << 4), + static_cast(SafeLoadAs(in + 2 * 7) >> 8 | SafeLoadAs(in + 2 * 8) << 8), + SafeLoadAs(in + 2 * 8), + SafeLoadAs(in + 2 * 9), + static_cast(SafeLoadAs(in + 2 * 9) >> 12 | SafeLoadAs(in + 2 * 10) << 4), + static_cast(SafeLoadAs(in + 2 * 10) >> 8 | SafeLoadAs(in + 2 * 11) << 8), + SafeLoadAs(in + 2 * 11), + }; + shifts = simd_batch{ 0, 0, 0, 4, 0, 0, 0, 4, 0, 0, 0, 4, 0, 0, 0, 4 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 32; + + in += 12 * 2; + return in; +} + +inline static const uint8_t* unpack13_16(const uint8_t* in, uint16_t* out) { + using simd_batch = xsimd::make_sized_batch_t; + + uint16_t mask = 0x1fff; + + simd_batch masks(mask); + simd_batch words, shifts; + simd_batch results; + + // extract 13-bit bundles 0 to 31 + words = simd_batch{ + SafeLoadAs(in + 2 * 0), + static_cast(SafeLoadAs(in + 2 * 0) >> 13 | SafeLoadAs(in + 2 * 1) << 3), + static_cast(SafeLoadAs(in + 2 * 1) >> 10 | SafeLoadAs(in + 2 * 2) << 6), + static_cast(SafeLoadAs(in + 2 * 2) >> 7 | SafeLoadAs(in + 2 * 3) << 9), + static_cast(SafeLoadAs(in + 2 * 3) >> 4 | SafeLoadAs(in + 2 * 4) << 12), + SafeLoadAs(in + 2 * 4), + static_cast(SafeLoadAs(in + 2 * 4) >> 14 | SafeLoadAs(in + 2 * 5) << 2), + static_cast(SafeLoadAs(in + 2 * 5) >> 11 | SafeLoadAs(in + 2 * 6) << 5), + static_cast(SafeLoadAs(in + 2 * 6) >> 8 | SafeLoadAs(in + 2 * 7) << 8), + static_cast(SafeLoadAs(in + 2 * 7) >> 5 | SafeLoadAs(in + 2 * 8) << 11), + SafeLoadAs(in + 2 * 8), + static_cast(SafeLoadAs(in + 2 * 8) >> 15 | SafeLoadAs(in + 2 * 9) << 1), + static_cast(SafeLoadAs(in + 2 * 9) >> 12 | SafeLoadAs(in + 2 * 10) << 4), + static_cast(SafeLoadAs(in + 2 * 10) >> 9 | SafeLoadAs(in + 2 * 11) << 7), + static_cast(SafeLoadAs(in + 2 * 11) >> 6 | SafeLoadAs(in + 2 * 12) << 10), + SafeLoadAs(in + 2 * 12), + }; + shifts = simd_batch{ 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 2, 0, 0, 0, 0, 3 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 32; + + in += 13 * 2; + return in; +} + +inline static const uint8_t* unpack14_16(const uint8_t* in, uint16_t* out) { + using simd_batch = xsimd::make_sized_batch_t; + + uint16_t mask = 0x3fff; + + simd_batch masks(mask); + simd_batch words, shifts; + simd_batch results; + + // extract 14-bit bundles 0 to 31 + words = simd_batch{ + SafeLoadAs(in + 2 * 0), + static_cast(SafeLoadAs(in + 2 * 0) >> 14 | SafeLoadAs(in + 2 * 1) << 2), + static_cast(SafeLoadAs(in + 2 * 1) >> 12 | SafeLoadAs(in + 2 * 2) << 4), + static_cast(SafeLoadAs(in + 2 * 2) >> 10 | SafeLoadAs(in + 2 * 3) << 6), + static_cast(SafeLoadAs(in + 2 * 3) >> 8 | SafeLoadAs(in + 2 * 4) << 8), + static_cast(SafeLoadAs(in + 2 * 4) >> 6 | SafeLoadAs(in + 2 * 5) << 10), + static_cast(SafeLoadAs(in + 2 * 5) >> 4 | SafeLoadAs(in + 2 * 6) << 12), + SafeLoadAs(in + 2 * 6), + SafeLoadAs(in + 2 * 7), + static_cast(SafeLoadAs(in + 2 * 7) >> 14 | SafeLoadAs(in + 2 * 8) << 2), + static_cast(SafeLoadAs(in + 2 * 8) >> 12 | SafeLoadAs(in + 2 * 9) << 4), + static_cast(SafeLoadAs(in + 2 * 9) >> 10 | SafeLoadAs(in + 2 * 10) << 6), + static_cast(SafeLoadAs(in + 2 * 10) >> 8 | SafeLoadAs(in + 2 * 11) << 8), + static_cast(SafeLoadAs(in + 2 * 11) >> 6 | SafeLoadAs(in + 2 * 12) << 10), + static_cast(SafeLoadAs(in + 2 * 12) >> 4 | SafeLoadAs(in + 2 * 13) << 12), + SafeLoadAs(in + 2 * 13), + }; + shifts = simd_batch{ 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 2 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 32; + + in += 14 * 2; + return in; +} + +inline static const uint8_t* unpack15_16(const uint8_t* in, uint16_t* out) { + using simd_batch = xsimd::make_sized_batch_t; + + uint16_t mask = 0x7fff; + + simd_batch masks(mask); + simd_batch words, shifts; + simd_batch results; + + // extract 15-bit bundles 0 to 31 + words = simd_batch{ + SafeLoadAs(in + 2 * 0), + static_cast(SafeLoadAs(in + 2 * 0) >> 15 | SafeLoadAs(in + 2 * 1) << 1), + static_cast(SafeLoadAs(in + 2 * 1) >> 14 | SafeLoadAs(in + 2 * 2) << 2), + static_cast(SafeLoadAs(in + 2 * 2) >> 13 | SafeLoadAs(in + 2 * 3) << 3), + static_cast(SafeLoadAs(in + 2 * 3) >> 12 | SafeLoadAs(in + 2 * 4) << 4), + static_cast(SafeLoadAs(in + 2 * 4) >> 11 | SafeLoadAs(in + 2 * 5) << 5), + static_cast(SafeLoadAs(in + 2 * 5) >> 10 | SafeLoadAs(in + 2 * 6) << 6), + static_cast(SafeLoadAs(in + 2 * 6) >> 9 | SafeLoadAs(in + 2 * 7) << 7), + static_cast(SafeLoadAs(in + 2 * 7) >> 8 | SafeLoadAs(in + 2 * 8) << 8), + static_cast(SafeLoadAs(in + 2 * 8) >> 7 | SafeLoadAs(in + 2 * 9) << 9), + static_cast(SafeLoadAs(in + 2 * 9) >> 6 | SafeLoadAs(in + 2 * 10) << 10), + static_cast(SafeLoadAs(in + 2 * 10) >> 5 | SafeLoadAs(in + 2 * 11) << 11), + static_cast(SafeLoadAs(in + 2 * 11) >> 4 | SafeLoadAs(in + 2 * 12) << 12), + static_cast(SafeLoadAs(in + 2 * 12) >> 3 | SafeLoadAs(in + 2 * 13) << 13), + static_cast(SafeLoadAs(in + 2 * 13) >> 2 | SafeLoadAs(in + 2 * 14) << 14), + SafeLoadAs(in + 2 * 14), + }; + shifts = simd_batch{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 32; + + in += 15 * 2; + return in; +} + +inline static const uint8_t* unpack16_16(const uint8_t* in, uint16_t* out) { + std::memcpy(out, in, 16 * sizeof(*out)); + in += 2 * 16; + out += 16; + + return in; +} + +inline static const uint8_t* unpack0_32(const uint8_t* in, uint32_t* out) { + std::memset(out, 0x0, 32 * sizeof(*out)); + out += 32; + + return in; +} + +inline static const uint8_t* unpack1_32(const uint8_t* in, uint32_t* out) { + using simd_batch = xsimd::make_sized_batch_t; + uint32_t mask = 0x1; simd_batch masks(mask); @@ -53,24 +622,60 @@ inline static const uint32_t* unpack1_32(const uint32_t* in, uint32_t* out) { simd_batch results; // extract 1-bit bundles 0 to 15 - words = simd_batch{ SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0) }; + words = simd_batch{ + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0), + }; shifts = simd_batch{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 16; // extract 1-bit bundles 16 to 31 - words = simd_batch{ SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0) }; + words = simd_batch{ + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0), + }; shifts = simd_batch{ 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 16; - in += 1; + in += 1 * 4; return in; } -inline static const uint32_t* unpack2_32(const uint32_t* in, uint32_t* out) { +inline static const uint8_t* unpack2_32(const uint8_t* in, uint32_t* out) { + using simd_batch = xsimd::make_sized_batch_t; + uint32_t mask = 0x3; simd_batch masks(mask); @@ -78,24 +683,60 @@ inline static const uint32_t* unpack2_32(const uint32_t* in, uint32_t* out) { simd_batch results; // extract 2-bit bundles 0 to 15 - words = simd_batch{ SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0) }; + words = simd_batch{ + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0), + }; shifts = simd_batch{ 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 16; // extract 2-bit bundles 16 to 31 - words = simd_batch{ SafeLoad(in + 1), SafeLoad(in + 1), SafeLoad(in + 1), SafeLoad(in + 1), SafeLoad(in + 1), SafeLoad(in + 1), SafeLoad(in + 1), SafeLoad(in + 1), SafeLoad(in + 1), SafeLoad(in + 1), SafeLoad(in + 1), SafeLoad(in + 1), SafeLoad(in + 1), SafeLoad(in + 1), SafeLoad(in + 1), SafeLoad(in + 1) }; + words = simd_batch{ + SafeLoadAs(in + 4 * 1), + SafeLoadAs(in + 4 * 1), + SafeLoadAs(in + 4 * 1), + SafeLoadAs(in + 4 * 1), + SafeLoadAs(in + 4 * 1), + SafeLoadAs(in + 4 * 1), + SafeLoadAs(in + 4 * 1), + SafeLoadAs(in + 4 * 1), + SafeLoadAs(in + 4 * 1), + SafeLoadAs(in + 4 * 1), + SafeLoadAs(in + 4 * 1), + SafeLoadAs(in + 4 * 1), + SafeLoadAs(in + 4 * 1), + SafeLoadAs(in + 4 * 1), + SafeLoadAs(in + 4 * 1), + SafeLoadAs(in + 4 * 1), + }; shifts = simd_batch{ 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 16; - in += 2; + in += 2 * 4; return in; } -inline static const uint32_t* unpack3_32(const uint32_t* in, uint32_t* out) { +inline static const uint8_t* unpack3_32(const uint8_t* in, uint32_t* out) { + using simd_batch = xsimd::make_sized_batch_t; + uint32_t mask = 0x7; simd_batch masks(mask); @@ -103,24 +744,60 @@ inline static const uint32_t* unpack3_32(const uint32_t* in, uint32_t* out) { simd_batch results; // extract 3-bit bundles 0 to 15 - words = simd_batch{ SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0) >> 30 | SafeLoad(in + 1) << 2, SafeLoad(in + 1), SafeLoad(in + 1), SafeLoad(in + 1), SafeLoad(in + 1), SafeLoad(in + 1) }; + words = simd_batch{ + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0) >> 30 | SafeLoadAs(in + 4 * 1) << 2, + SafeLoadAs(in + 4 * 1), + SafeLoadAs(in + 4 * 1), + SafeLoadAs(in + 4 * 1), + SafeLoadAs(in + 4 * 1), + SafeLoadAs(in + 4 * 1), + }; shifts = simd_batch{ 0, 3, 6, 9, 12, 15, 18, 21, 24, 27, 0, 1, 4, 7, 10, 13 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 16; // extract 3-bit bundles 16 to 31 - words = simd_batch{ SafeLoad(in + 1), SafeLoad(in + 1), SafeLoad(in + 1), SafeLoad(in + 1), SafeLoad(in + 1), SafeLoad(in + 1) >> 31 | SafeLoad(in + 2) << 1, SafeLoad(in + 2), SafeLoad(in + 2), SafeLoad(in + 2), SafeLoad(in + 2), SafeLoad(in + 2), SafeLoad(in + 2), SafeLoad(in + 2), SafeLoad(in + 2), SafeLoad(in + 2), SafeLoad(in + 2) }; + words = simd_batch{ + SafeLoadAs(in + 4 * 1), + SafeLoadAs(in + 4 * 1), + SafeLoadAs(in + 4 * 1), + SafeLoadAs(in + 4 * 1), + SafeLoadAs(in + 4 * 1), + SafeLoadAs(in + 4 * 1) >> 31 | SafeLoadAs(in + 4 * 2) << 1, + SafeLoadAs(in + 4 * 2), + SafeLoadAs(in + 4 * 2), + SafeLoadAs(in + 4 * 2), + SafeLoadAs(in + 4 * 2), + SafeLoadAs(in + 4 * 2), + SafeLoadAs(in + 4 * 2), + SafeLoadAs(in + 4 * 2), + SafeLoadAs(in + 4 * 2), + SafeLoadAs(in + 4 * 2), + SafeLoadAs(in + 4 * 2), + }; shifts = simd_batch{ 16, 19, 22, 25, 28, 0, 2, 5, 8, 11, 14, 17, 20, 23, 26, 29 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 16; - in += 3; + in += 3 * 4; return in; } -inline static const uint32_t* unpack4_32(const uint32_t* in, uint32_t* out) { +inline static const uint8_t* unpack4_32(const uint8_t* in, uint32_t* out) { + using simd_batch = xsimd::make_sized_batch_t; + uint32_t mask = 0xf; simd_batch masks(mask); @@ -128,24 +805,60 @@ inline static const uint32_t* unpack4_32(const uint32_t* in, uint32_t* out) { simd_batch results; // extract 4-bit bundles 0 to 15 - words = simd_batch{ SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 1), SafeLoad(in + 1), SafeLoad(in + 1), SafeLoad(in + 1), SafeLoad(in + 1), SafeLoad(in + 1), SafeLoad(in + 1), SafeLoad(in + 1) }; + words = simd_batch{ + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 1), + SafeLoadAs(in + 4 * 1), + SafeLoadAs(in + 4 * 1), + SafeLoadAs(in + 4 * 1), + SafeLoadAs(in + 4 * 1), + SafeLoadAs(in + 4 * 1), + SafeLoadAs(in + 4 * 1), + SafeLoadAs(in + 4 * 1), + }; shifts = simd_batch{ 0, 4, 8, 12, 16, 20, 24, 28, 0, 4, 8, 12, 16, 20, 24, 28 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 16; // extract 4-bit bundles 16 to 31 - words = simd_batch{ SafeLoad(in + 2), SafeLoad(in + 2), SafeLoad(in + 2), SafeLoad(in + 2), SafeLoad(in + 2), SafeLoad(in + 2), SafeLoad(in + 2), SafeLoad(in + 2), SafeLoad(in + 3), SafeLoad(in + 3), SafeLoad(in + 3), SafeLoad(in + 3), SafeLoad(in + 3), SafeLoad(in + 3), SafeLoad(in + 3), SafeLoad(in + 3) }; + words = simd_batch{ + SafeLoadAs(in + 4 * 2), + SafeLoadAs(in + 4 * 2), + SafeLoadAs(in + 4 * 2), + SafeLoadAs(in + 4 * 2), + SafeLoadAs(in + 4 * 2), + SafeLoadAs(in + 4 * 2), + SafeLoadAs(in + 4 * 2), + SafeLoadAs(in + 4 * 2), + SafeLoadAs(in + 4 * 3), + SafeLoadAs(in + 4 * 3), + SafeLoadAs(in + 4 * 3), + SafeLoadAs(in + 4 * 3), + SafeLoadAs(in + 4 * 3), + SafeLoadAs(in + 4 * 3), + SafeLoadAs(in + 4 * 3), + SafeLoadAs(in + 4 * 3), + }; shifts = simd_batch{ 0, 4, 8, 12, 16, 20, 24, 28, 0, 4, 8, 12, 16, 20, 24, 28 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 16; - in += 4; + in += 4 * 4; return in; } -inline static const uint32_t* unpack5_32(const uint32_t* in, uint32_t* out) { +inline static const uint8_t* unpack5_32(const uint8_t* in, uint32_t* out) { + using simd_batch = xsimd::make_sized_batch_t; + uint32_t mask = 0x1f; simd_batch masks(mask); @@ -153,24 +866,60 @@ inline static const uint32_t* unpack5_32(const uint32_t* in, uint32_t* out) { simd_batch results; // extract 5-bit bundles 0 to 15 - words = simd_batch{ SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0) >> 30 | SafeLoad(in + 1) << 2, SafeLoad(in + 1), SafeLoad(in + 1), SafeLoad(in + 1), SafeLoad(in + 1), SafeLoad(in + 1), SafeLoad(in + 1) >> 28 | SafeLoad(in + 2) << 4, SafeLoad(in + 2), SafeLoad(in + 2), SafeLoad(in + 2) }; + words = simd_batch{ + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0) >> 30 | SafeLoadAs(in + 4 * 1) << 2, + SafeLoadAs(in + 4 * 1), + SafeLoadAs(in + 4 * 1), + SafeLoadAs(in + 4 * 1), + SafeLoadAs(in + 4 * 1), + SafeLoadAs(in + 4 * 1), + SafeLoadAs(in + 4 * 1) >> 28 | SafeLoadAs(in + 4 * 2) << 4, + SafeLoadAs(in + 4 * 2), + SafeLoadAs(in + 4 * 2), + SafeLoadAs(in + 4 * 2), + }; shifts = simd_batch{ 0, 5, 10, 15, 20, 25, 0, 3, 8, 13, 18, 23, 0, 1, 6, 11 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 16; // extract 5-bit bundles 16 to 31 - words = simd_batch{ SafeLoad(in + 2), SafeLoad(in + 2), SafeLoad(in + 2), SafeLoad(in + 2) >> 31 | SafeLoad(in + 3) << 1, SafeLoad(in + 3), SafeLoad(in + 3), SafeLoad(in + 3), SafeLoad(in + 3), SafeLoad(in + 3), SafeLoad(in + 3) >> 29 | SafeLoad(in + 4) << 3, SafeLoad(in + 4), SafeLoad(in + 4), SafeLoad(in + 4), SafeLoad(in + 4), SafeLoad(in + 4), SafeLoad(in + 4) }; + words = simd_batch{ + SafeLoadAs(in + 4 * 2), + SafeLoadAs(in + 4 * 2), + SafeLoadAs(in + 4 * 2), + SafeLoadAs(in + 4 * 2) >> 31 | SafeLoadAs(in + 4 * 3) << 1, + SafeLoadAs(in + 4 * 3), + SafeLoadAs(in + 4 * 3), + SafeLoadAs(in + 4 * 3), + SafeLoadAs(in + 4 * 3), + SafeLoadAs(in + 4 * 3), + SafeLoadAs(in + 4 * 3) >> 29 | SafeLoadAs(in + 4 * 4) << 3, + SafeLoadAs(in + 4 * 4), + SafeLoadAs(in + 4 * 4), + SafeLoadAs(in + 4 * 4), + SafeLoadAs(in + 4 * 4), + SafeLoadAs(in + 4 * 4), + SafeLoadAs(in + 4 * 4), + }; shifts = simd_batch{ 16, 21, 26, 0, 4, 9, 14, 19, 24, 0, 2, 7, 12, 17, 22, 27 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 16; - in += 5; + in += 5 * 4; return in; } -inline static const uint32_t* unpack6_32(const uint32_t* in, uint32_t* out) { +inline static const uint8_t* unpack6_32(const uint8_t* in, uint32_t* out) { + using simd_batch = xsimd::make_sized_batch_t; + uint32_t mask = 0x3f; simd_batch masks(mask); @@ -178,24 +927,60 @@ inline static const uint32_t* unpack6_32(const uint32_t* in, uint32_t* out) { simd_batch results; // extract 6-bit bundles 0 to 15 - words = simd_batch{ SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0) >> 30 | SafeLoad(in + 1) << 2, SafeLoad(in + 1), SafeLoad(in + 1), SafeLoad(in + 1), SafeLoad(in + 1), SafeLoad(in + 1) >> 28 | SafeLoad(in + 2) << 4, SafeLoad(in + 2), SafeLoad(in + 2), SafeLoad(in + 2), SafeLoad(in + 2), SafeLoad(in + 2) }; + words = simd_batch{ + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0) >> 30 | SafeLoadAs(in + 4 * 1) << 2, + SafeLoadAs(in + 4 * 1), + SafeLoadAs(in + 4 * 1), + SafeLoadAs(in + 4 * 1), + SafeLoadAs(in + 4 * 1), + SafeLoadAs(in + 4 * 1) >> 28 | SafeLoadAs(in + 4 * 2) << 4, + SafeLoadAs(in + 4 * 2), + SafeLoadAs(in + 4 * 2), + SafeLoadAs(in + 4 * 2), + SafeLoadAs(in + 4 * 2), + SafeLoadAs(in + 4 * 2), + }; shifts = simd_batch{ 0, 6, 12, 18, 24, 0, 4, 10, 16, 22, 0, 2, 8, 14, 20, 26 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 16; // extract 6-bit bundles 16 to 31 - words = simd_batch{ SafeLoad(in + 3), SafeLoad(in + 3), SafeLoad(in + 3), SafeLoad(in + 3), SafeLoad(in + 3), SafeLoad(in + 3) >> 30 | SafeLoad(in + 4) << 2, SafeLoad(in + 4), SafeLoad(in + 4), SafeLoad(in + 4), SafeLoad(in + 4), SafeLoad(in + 4) >> 28 | SafeLoad(in + 5) << 4, SafeLoad(in + 5), SafeLoad(in + 5), SafeLoad(in + 5), SafeLoad(in + 5), SafeLoad(in + 5) }; + words = simd_batch{ + SafeLoadAs(in + 4 * 3), + SafeLoadAs(in + 4 * 3), + SafeLoadAs(in + 4 * 3), + SafeLoadAs(in + 4 * 3), + SafeLoadAs(in + 4 * 3), + SafeLoadAs(in + 4 * 3) >> 30 | SafeLoadAs(in + 4 * 4) << 2, + SafeLoadAs(in + 4 * 4), + SafeLoadAs(in + 4 * 4), + SafeLoadAs(in + 4 * 4), + SafeLoadAs(in + 4 * 4), + SafeLoadAs(in + 4 * 4) >> 28 | SafeLoadAs(in + 4 * 5) << 4, + SafeLoadAs(in + 4 * 5), + SafeLoadAs(in + 4 * 5), + SafeLoadAs(in + 4 * 5), + SafeLoadAs(in + 4 * 5), + SafeLoadAs(in + 4 * 5), + }; shifts = simd_batch{ 0, 6, 12, 18, 24, 0, 4, 10, 16, 22, 0, 2, 8, 14, 20, 26 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 16; - in += 6; + in += 6 * 4; return in; } -inline static const uint32_t* unpack7_32(const uint32_t* in, uint32_t* out) { +inline static const uint8_t* unpack7_32(const uint8_t* in, uint32_t* out) { + using simd_batch = xsimd::make_sized_batch_t; + uint32_t mask = 0x7f; simd_batch masks(mask); @@ -203,24 +988,60 @@ inline static const uint32_t* unpack7_32(const uint32_t* in, uint32_t* out) { simd_batch results; // extract 7-bit bundles 0 to 15 - words = simd_batch{ SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0) >> 28 | SafeLoad(in + 1) << 4, SafeLoad(in + 1), SafeLoad(in + 1), SafeLoad(in + 1), SafeLoad(in + 1), SafeLoad(in + 1) >> 31 | SafeLoad(in + 2) << 1, SafeLoad(in + 2), SafeLoad(in + 2), SafeLoad(in + 2), SafeLoad(in + 2) >> 27 | SafeLoad(in + 3) << 5, SafeLoad(in + 3), SafeLoad(in + 3) }; + words = simd_batch{ + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0) >> 28 | SafeLoadAs(in + 4 * 1) << 4, + SafeLoadAs(in + 4 * 1), + SafeLoadAs(in + 4 * 1), + SafeLoadAs(in + 4 * 1), + SafeLoadAs(in + 4 * 1), + SafeLoadAs(in + 4 * 1) >> 31 | SafeLoadAs(in + 4 * 2) << 1, + SafeLoadAs(in + 4 * 2), + SafeLoadAs(in + 4 * 2), + SafeLoadAs(in + 4 * 2), + SafeLoadAs(in + 4 * 2) >> 27 | SafeLoadAs(in + 4 * 3) << 5, + SafeLoadAs(in + 4 * 3), + SafeLoadAs(in + 4 * 3), + }; shifts = simd_batch{ 0, 7, 14, 21, 0, 3, 10, 17, 24, 0, 6, 13, 20, 0, 2, 9 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 16; // extract 7-bit bundles 16 to 31 - words = simd_batch{ SafeLoad(in + 3), SafeLoad(in + 3), SafeLoad(in + 3) >> 30 | SafeLoad(in + 4) << 2, SafeLoad(in + 4), SafeLoad(in + 4), SafeLoad(in + 4), SafeLoad(in + 4) >> 26 | SafeLoad(in + 5) << 6, SafeLoad(in + 5), SafeLoad(in + 5), SafeLoad(in + 5), SafeLoad(in + 5), SafeLoad(in + 5) >> 29 | SafeLoad(in + 6) << 3, SafeLoad(in + 6), SafeLoad(in + 6), SafeLoad(in + 6), SafeLoad(in + 6) }; + words = simd_batch{ + SafeLoadAs(in + 4 * 3), + SafeLoadAs(in + 4 * 3), + SafeLoadAs(in + 4 * 3) >> 30 | SafeLoadAs(in + 4 * 4) << 2, + SafeLoadAs(in + 4 * 4), + SafeLoadAs(in + 4 * 4), + SafeLoadAs(in + 4 * 4), + SafeLoadAs(in + 4 * 4) >> 26 | SafeLoadAs(in + 4 * 5) << 6, + SafeLoadAs(in + 4 * 5), + SafeLoadAs(in + 4 * 5), + SafeLoadAs(in + 4 * 5), + SafeLoadAs(in + 4 * 5), + SafeLoadAs(in + 4 * 5) >> 29 | SafeLoadAs(in + 4 * 6) << 3, + SafeLoadAs(in + 4 * 6), + SafeLoadAs(in + 4 * 6), + SafeLoadAs(in + 4 * 6), + SafeLoadAs(in + 4 * 6), + }; shifts = simd_batch{ 16, 23, 0, 5, 12, 19, 0, 1, 8, 15, 22, 0, 4, 11, 18, 25 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 16; - in += 7; + in += 7 * 4; return in; } -inline static const uint32_t* unpack8_32(const uint32_t* in, uint32_t* out) { +inline static const uint8_t* unpack8_32(const uint8_t* in, uint32_t* out) { + using simd_batch = xsimd::make_sized_batch_t; + uint32_t mask = 0xff; simd_batch masks(mask); @@ -228,24 +1049,60 @@ inline static const uint32_t* unpack8_32(const uint32_t* in, uint32_t* out) { simd_batch results; // extract 8-bit bundles 0 to 15 - words = simd_batch{ SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 1), SafeLoad(in + 1), SafeLoad(in + 1), SafeLoad(in + 1), SafeLoad(in + 2), SafeLoad(in + 2), SafeLoad(in + 2), SafeLoad(in + 2), SafeLoad(in + 3), SafeLoad(in + 3), SafeLoad(in + 3), SafeLoad(in + 3) }; + words = simd_batch{ + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 1), + SafeLoadAs(in + 4 * 1), + SafeLoadAs(in + 4 * 1), + SafeLoadAs(in + 4 * 1), + SafeLoadAs(in + 4 * 2), + SafeLoadAs(in + 4 * 2), + SafeLoadAs(in + 4 * 2), + SafeLoadAs(in + 4 * 2), + SafeLoadAs(in + 4 * 3), + SafeLoadAs(in + 4 * 3), + SafeLoadAs(in + 4 * 3), + SafeLoadAs(in + 4 * 3), + }; shifts = simd_batch{ 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 16; // extract 8-bit bundles 16 to 31 - words = simd_batch{ SafeLoad(in + 4), SafeLoad(in + 4), SafeLoad(in + 4), SafeLoad(in + 4), SafeLoad(in + 5), SafeLoad(in + 5), SafeLoad(in + 5), SafeLoad(in + 5), SafeLoad(in + 6), SafeLoad(in + 6), SafeLoad(in + 6), SafeLoad(in + 6), SafeLoad(in + 7), SafeLoad(in + 7), SafeLoad(in + 7), SafeLoad(in + 7) }; + words = simd_batch{ + SafeLoadAs(in + 4 * 4), + SafeLoadAs(in + 4 * 4), + SafeLoadAs(in + 4 * 4), + SafeLoadAs(in + 4 * 4), + SafeLoadAs(in + 4 * 5), + SafeLoadAs(in + 4 * 5), + SafeLoadAs(in + 4 * 5), + SafeLoadAs(in + 4 * 5), + SafeLoadAs(in + 4 * 6), + SafeLoadAs(in + 4 * 6), + SafeLoadAs(in + 4 * 6), + SafeLoadAs(in + 4 * 6), + SafeLoadAs(in + 4 * 7), + SafeLoadAs(in + 4 * 7), + SafeLoadAs(in + 4 * 7), + SafeLoadAs(in + 4 * 7), + }; shifts = simd_batch{ 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 16; - in += 8; + in += 8 * 4; return in; } -inline static const uint32_t* unpack9_32(const uint32_t* in, uint32_t* out) { +inline static const uint8_t* unpack9_32(const uint8_t* in, uint32_t* out) { + using simd_batch = xsimd::make_sized_batch_t; + uint32_t mask = 0x1ff; simd_batch masks(mask); @@ -253,24 +1110,60 @@ inline static const uint32_t* unpack9_32(const uint32_t* in, uint32_t* out) { simd_batch results; // extract 9-bit bundles 0 to 15 - words = simd_batch{ SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0) >> 27 | SafeLoad(in + 1) << 5, SafeLoad(in + 1), SafeLoad(in + 1), SafeLoad(in + 1), SafeLoad(in + 1) >> 31 | SafeLoad(in + 2) << 1, SafeLoad(in + 2), SafeLoad(in + 2), SafeLoad(in + 2) >> 26 | SafeLoad(in + 3) << 6, SafeLoad(in + 3), SafeLoad(in + 3), SafeLoad(in + 3), SafeLoad(in + 3) >> 30 | SafeLoad(in + 4) << 2, SafeLoad(in + 4) }; + words = simd_batch{ + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0) >> 27 | SafeLoadAs(in + 4 * 1) << 5, + SafeLoadAs(in + 4 * 1), + SafeLoadAs(in + 4 * 1), + SafeLoadAs(in + 4 * 1), + SafeLoadAs(in + 4 * 1) >> 31 | SafeLoadAs(in + 4 * 2) << 1, + SafeLoadAs(in + 4 * 2), + SafeLoadAs(in + 4 * 2), + SafeLoadAs(in + 4 * 2) >> 26 | SafeLoadAs(in + 4 * 3) << 6, + SafeLoadAs(in + 4 * 3), + SafeLoadAs(in + 4 * 3), + SafeLoadAs(in + 4 * 3), + SafeLoadAs(in + 4 * 3) >> 30 | SafeLoadAs(in + 4 * 4) << 2, + SafeLoadAs(in + 4 * 4), + }; shifts = simd_batch{ 0, 9, 18, 0, 4, 13, 22, 0, 8, 17, 0, 3, 12, 21, 0, 7 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 16; // extract 9-bit bundles 16 to 31 - words = simd_batch{ SafeLoad(in + 4), SafeLoad(in + 4) >> 25 | SafeLoad(in + 5) << 7, SafeLoad(in + 5), SafeLoad(in + 5), SafeLoad(in + 5), SafeLoad(in + 5) >> 29 | SafeLoad(in + 6) << 3, SafeLoad(in + 6), SafeLoad(in + 6), SafeLoad(in + 6) >> 24 | SafeLoad(in + 7) << 8, SafeLoad(in + 7), SafeLoad(in + 7), SafeLoad(in + 7), SafeLoad(in + 7) >> 28 | SafeLoad(in + 8) << 4, SafeLoad(in + 8), SafeLoad(in + 8), SafeLoad(in + 8) }; + words = simd_batch{ + SafeLoadAs(in + 4 * 4), + SafeLoadAs(in + 4 * 4) >> 25 | SafeLoadAs(in + 4 * 5) << 7, + SafeLoadAs(in + 4 * 5), + SafeLoadAs(in + 4 * 5), + SafeLoadAs(in + 4 * 5), + SafeLoadAs(in + 4 * 5) >> 29 | SafeLoadAs(in + 4 * 6) << 3, + SafeLoadAs(in + 4 * 6), + SafeLoadAs(in + 4 * 6), + SafeLoadAs(in + 4 * 6) >> 24 | SafeLoadAs(in + 4 * 7) << 8, + SafeLoadAs(in + 4 * 7), + SafeLoadAs(in + 4 * 7), + SafeLoadAs(in + 4 * 7), + SafeLoadAs(in + 4 * 7) >> 28 | SafeLoadAs(in + 4 * 8) << 4, + SafeLoadAs(in + 4 * 8), + SafeLoadAs(in + 4 * 8), + SafeLoadAs(in + 4 * 8), + }; shifts = simd_batch{ 16, 0, 2, 11, 20, 0, 6, 15, 0, 1, 10, 19, 0, 5, 14, 23 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 16; - in += 9; + in += 9 * 4; return in; } -inline static const uint32_t* unpack10_32(const uint32_t* in, uint32_t* out) { +inline static const uint8_t* unpack10_32(const uint8_t* in, uint32_t* out) { + using simd_batch = xsimd::make_sized_batch_t; + uint32_t mask = 0x3ff; simd_batch masks(mask); @@ -278,24 +1171,60 @@ inline static const uint32_t* unpack10_32(const uint32_t* in, uint32_t* out) { simd_batch results; // extract 10-bit bundles 0 to 15 - words = simd_batch{ SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0) >> 30 | SafeLoad(in + 1) << 2, SafeLoad(in + 1), SafeLoad(in + 1), SafeLoad(in + 1) >> 28 | SafeLoad(in + 2) << 4, SafeLoad(in + 2), SafeLoad(in + 2), SafeLoad(in + 2) >> 26 | SafeLoad(in + 3) << 6, SafeLoad(in + 3), SafeLoad(in + 3), SafeLoad(in + 3) >> 24 | SafeLoad(in + 4) << 8, SafeLoad(in + 4), SafeLoad(in + 4), SafeLoad(in + 4) }; + words = simd_batch{ + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0) >> 30 | SafeLoadAs(in + 4 * 1) << 2, + SafeLoadAs(in + 4 * 1), + SafeLoadAs(in + 4 * 1), + SafeLoadAs(in + 4 * 1) >> 28 | SafeLoadAs(in + 4 * 2) << 4, + SafeLoadAs(in + 4 * 2), + SafeLoadAs(in + 4 * 2), + SafeLoadAs(in + 4 * 2) >> 26 | SafeLoadAs(in + 4 * 3) << 6, + SafeLoadAs(in + 4 * 3), + SafeLoadAs(in + 4 * 3), + SafeLoadAs(in + 4 * 3) >> 24 | SafeLoadAs(in + 4 * 4) << 8, + SafeLoadAs(in + 4 * 4), + SafeLoadAs(in + 4 * 4), + SafeLoadAs(in + 4 * 4), + }; shifts = simd_batch{ 0, 10, 20, 0, 8, 18, 0, 6, 16, 0, 4, 14, 0, 2, 12, 22 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 16; // extract 10-bit bundles 16 to 31 - words = simd_batch{ SafeLoad(in + 5), SafeLoad(in + 5), SafeLoad(in + 5), SafeLoad(in + 5) >> 30 | SafeLoad(in + 6) << 2, SafeLoad(in + 6), SafeLoad(in + 6), SafeLoad(in + 6) >> 28 | SafeLoad(in + 7) << 4, SafeLoad(in + 7), SafeLoad(in + 7), SafeLoad(in + 7) >> 26 | SafeLoad(in + 8) << 6, SafeLoad(in + 8), SafeLoad(in + 8), SafeLoad(in + 8) >> 24 | SafeLoad(in + 9) << 8, SafeLoad(in + 9), SafeLoad(in + 9), SafeLoad(in + 9) }; + words = simd_batch{ + SafeLoadAs(in + 4 * 5), + SafeLoadAs(in + 4 * 5), + SafeLoadAs(in + 4 * 5), + SafeLoadAs(in + 4 * 5) >> 30 | SafeLoadAs(in + 4 * 6) << 2, + SafeLoadAs(in + 4 * 6), + SafeLoadAs(in + 4 * 6), + SafeLoadAs(in + 4 * 6) >> 28 | SafeLoadAs(in + 4 * 7) << 4, + SafeLoadAs(in + 4 * 7), + SafeLoadAs(in + 4 * 7), + SafeLoadAs(in + 4 * 7) >> 26 | SafeLoadAs(in + 4 * 8) << 6, + SafeLoadAs(in + 4 * 8), + SafeLoadAs(in + 4 * 8), + SafeLoadAs(in + 4 * 8) >> 24 | SafeLoadAs(in + 4 * 9) << 8, + SafeLoadAs(in + 4 * 9), + SafeLoadAs(in + 4 * 9), + SafeLoadAs(in + 4 * 9), + }; shifts = simd_batch{ 0, 10, 20, 0, 8, 18, 0, 6, 16, 0, 4, 14, 0, 2, 12, 22 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 16; - in += 10; + in += 10 * 4; return in; } -inline static const uint32_t* unpack11_32(const uint32_t* in, uint32_t* out) { +inline static const uint8_t* unpack11_32(const uint8_t* in, uint32_t* out) { + using simd_batch = xsimd::make_sized_batch_t; + uint32_t mask = 0x7ff; simd_batch masks(mask); @@ -303,24 +1232,60 @@ inline static const uint32_t* unpack11_32(const uint32_t* in, uint32_t* out) { simd_batch results; // extract 11-bit bundles 0 to 15 - words = simd_batch{ SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0) >> 22 | SafeLoad(in + 1) << 10, SafeLoad(in + 1), SafeLoad(in + 1), SafeLoad(in + 1) >> 23 | SafeLoad(in + 2) << 9, SafeLoad(in + 2), SafeLoad(in + 2), SafeLoad(in + 2) >> 24 | SafeLoad(in + 3) << 8, SafeLoad(in + 3), SafeLoad(in + 3), SafeLoad(in + 3) >> 25 | SafeLoad(in + 4) << 7, SafeLoad(in + 4), SafeLoad(in + 4), SafeLoad(in + 4) >> 26 | SafeLoad(in + 5) << 6, SafeLoad(in + 5) }; + words = simd_batch{ + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0) >> 22 | SafeLoadAs(in + 4 * 1) << 10, + SafeLoadAs(in + 4 * 1), + SafeLoadAs(in + 4 * 1), + SafeLoadAs(in + 4 * 1) >> 23 | SafeLoadAs(in + 4 * 2) << 9, + SafeLoadAs(in + 4 * 2), + SafeLoadAs(in + 4 * 2), + SafeLoadAs(in + 4 * 2) >> 24 | SafeLoadAs(in + 4 * 3) << 8, + SafeLoadAs(in + 4 * 3), + SafeLoadAs(in + 4 * 3), + SafeLoadAs(in + 4 * 3) >> 25 | SafeLoadAs(in + 4 * 4) << 7, + SafeLoadAs(in + 4 * 4), + SafeLoadAs(in + 4 * 4), + SafeLoadAs(in + 4 * 4) >> 26 | SafeLoadAs(in + 4 * 5) << 6, + SafeLoadAs(in + 4 * 5), + }; shifts = simd_batch{ 0, 11, 0, 1, 12, 0, 2, 13, 0, 3, 14, 0, 4, 15, 0, 5 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 16; // extract 11-bit bundles 16 to 31 - words = simd_batch{ SafeLoad(in + 5), SafeLoad(in + 5) >> 27 | SafeLoad(in + 6) << 5, SafeLoad(in + 6), SafeLoad(in + 6), SafeLoad(in + 6) >> 28 | SafeLoad(in + 7) << 4, SafeLoad(in + 7), SafeLoad(in + 7), SafeLoad(in + 7) >> 29 | SafeLoad(in + 8) << 3, SafeLoad(in + 8), SafeLoad(in + 8), SafeLoad(in + 8) >> 30 | SafeLoad(in + 9) << 2, SafeLoad(in + 9), SafeLoad(in + 9), SafeLoad(in + 9) >> 31 | SafeLoad(in + 10) << 1, SafeLoad(in + 10), SafeLoad(in + 10) }; + words = simd_batch{ + SafeLoadAs(in + 4 * 5), + SafeLoadAs(in + 4 * 5) >> 27 | SafeLoadAs(in + 4 * 6) << 5, + SafeLoadAs(in + 4 * 6), + SafeLoadAs(in + 4 * 6), + SafeLoadAs(in + 4 * 6) >> 28 | SafeLoadAs(in + 4 * 7) << 4, + SafeLoadAs(in + 4 * 7), + SafeLoadAs(in + 4 * 7), + SafeLoadAs(in + 4 * 7) >> 29 | SafeLoadAs(in + 4 * 8) << 3, + SafeLoadAs(in + 4 * 8), + SafeLoadAs(in + 4 * 8), + SafeLoadAs(in + 4 * 8) >> 30 | SafeLoadAs(in + 4 * 9) << 2, + SafeLoadAs(in + 4 * 9), + SafeLoadAs(in + 4 * 9), + SafeLoadAs(in + 4 * 9) >> 31 | SafeLoadAs(in + 4 * 10) << 1, + SafeLoadAs(in + 4 * 10), + SafeLoadAs(in + 4 * 10), + }; shifts = simd_batch{ 16, 0, 6, 17, 0, 7, 18, 0, 8, 19, 0, 9, 20, 0, 10, 21 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 16; - in += 11; + in += 11 * 4; return in; } -inline static const uint32_t* unpack12_32(const uint32_t* in, uint32_t* out) { +inline static const uint8_t* unpack12_32(const uint8_t* in, uint32_t* out) { + using simd_batch = xsimd::make_sized_batch_t; + uint32_t mask = 0xfff; simd_batch masks(mask); @@ -328,24 +1293,60 @@ inline static const uint32_t* unpack12_32(const uint32_t* in, uint32_t* out) { simd_batch results; // extract 12-bit bundles 0 to 15 - words = simd_batch{ SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0) >> 24 | SafeLoad(in + 1) << 8, SafeLoad(in + 1), SafeLoad(in + 1), SafeLoad(in + 1) >> 28 | SafeLoad(in + 2) << 4, SafeLoad(in + 2), SafeLoad(in + 2), SafeLoad(in + 3), SafeLoad(in + 3), SafeLoad(in + 3) >> 24 | SafeLoad(in + 4) << 8, SafeLoad(in + 4), SafeLoad(in + 4), SafeLoad(in + 4) >> 28 | SafeLoad(in + 5) << 4, SafeLoad(in + 5), SafeLoad(in + 5) }; + words = simd_batch{ + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0) >> 24 | SafeLoadAs(in + 4 * 1) << 8, + SafeLoadAs(in + 4 * 1), + SafeLoadAs(in + 4 * 1), + SafeLoadAs(in + 4 * 1) >> 28 | SafeLoadAs(in + 4 * 2) << 4, + SafeLoadAs(in + 4 * 2), + SafeLoadAs(in + 4 * 2), + SafeLoadAs(in + 4 * 3), + SafeLoadAs(in + 4 * 3), + SafeLoadAs(in + 4 * 3) >> 24 | SafeLoadAs(in + 4 * 4) << 8, + SafeLoadAs(in + 4 * 4), + SafeLoadAs(in + 4 * 4), + SafeLoadAs(in + 4 * 4) >> 28 | SafeLoadAs(in + 4 * 5) << 4, + SafeLoadAs(in + 4 * 5), + SafeLoadAs(in + 4 * 5), + }; shifts = simd_batch{ 0, 12, 0, 4, 16, 0, 8, 20, 0, 12, 0, 4, 16, 0, 8, 20 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 16; // extract 12-bit bundles 16 to 31 - words = simd_batch{ SafeLoad(in + 6), SafeLoad(in + 6), SafeLoad(in + 6) >> 24 | SafeLoad(in + 7) << 8, SafeLoad(in + 7), SafeLoad(in + 7), SafeLoad(in + 7) >> 28 | SafeLoad(in + 8) << 4, SafeLoad(in + 8), SafeLoad(in + 8), SafeLoad(in + 9), SafeLoad(in + 9), SafeLoad(in + 9) >> 24 | SafeLoad(in + 10) << 8, SafeLoad(in + 10), SafeLoad(in + 10), SafeLoad(in + 10) >> 28 | SafeLoad(in + 11) << 4, SafeLoad(in + 11), SafeLoad(in + 11) }; + words = simd_batch{ + SafeLoadAs(in + 4 * 6), + SafeLoadAs(in + 4 * 6), + SafeLoadAs(in + 4 * 6) >> 24 | SafeLoadAs(in + 4 * 7) << 8, + SafeLoadAs(in + 4 * 7), + SafeLoadAs(in + 4 * 7), + SafeLoadAs(in + 4 * 7) >> 28 | SafeLoadAs(in + 4 * 8) << 4, + SafeLoadAs(in + 4 * 8), + SafeLoadAs(in + 4 * 8), + SafeLoadAs(in + 4 * 9), + SafeLoadAs(in + 4 * 9), + SafeLoadAs(in + 4 * 9) >> 24 | SafeLoadAs(in + 4 * 10) << 8, + SafeLoadAs(in + 4 * 10), + SafeLoadAs(in + 4 * 10), + SafeLoadAs(in + 4 * 10) >> 28 | SafeLoadAs(in + 4 * 11) << 4, + SafeLoadAs(in + 4 * 11), + SafeLoadAs(in + 4 * 11), + }; shifts = simd_batch{ 0, 12, 0, 4, 16, 0, 8, 20, 0, 12, 0, 4, 16, 0, 8, 20 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 16; - in += 12; + in += 12 * 4; return in; } -inline static const uint32_t* unpack13_32(const uint32_t* in, uint32_t* out) { +inline static const uint8_t* unpack13_32(const uint8_t* in, uint32_t* out) { + using simd_batch = xsimd::make_sized_batch_t; + uint32_t mask = 0x1fff; simd_batch masks(mask); @@ -353,24 +1354,60 @@ inline static const uint32_t* unpack13_32(const uint32_t* in, uint32_t* out) { simd_batch results; // extract 13-bit bundles 0 to 15 - words = simd_batch{ SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0) >> 26 | SafeLoad(in + 1) << 6, SafeLoad(in + 1), SafeLoad(in + 1) >> 20 | SafeLoad(in + 2) << 12, SafeLoad(in + 2), SafeLoad(in + 2), SafeLoad(in + 2) >> 27 | SafeLoad(in + 3) << 5, SafeLoad(in + 3), SafeLoad(in + 3) >> 21 | SafeLoad(in + 4) << 11, SafeLoad(in + 4), SafeLoad(in + 4), SafeLoad(in + 4) >> 28 | SafeLoad(in + 5) << 4, SafeLoad(in + 5), SafeLoad(in + 5) >> 22 | SafeLoad(in + 6) << 10, SafeLoad(in + 6) }; + words = simd_batch{ + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0) >> 26 | SafeLoadAs(in + 4 * 1) << 6, + SafeLoadAs(in + 4 * 1), + SafeLoadAs(in + 4 * 1) >> 20 | SafeLoadAs(in + 4 * 2) << 12, + SafeLoadAs(in + 4 * 2), + SafeLoadAs(in + 4 * 2), + SafeLoadAs(in + 4 * 2) >> 27 | SafeLoadAs(in + 4 * 3) << 5, + SafeLoadAs(in + 4 * 3), + SafeLoadAs(in + 4 * 3) >> 21 | SafeLoadAs(in + 4 * 4) << 11, + SafeLoadAs(in + 4 * 4), + SafeLoadAs(in + 4 * 4), + SafeLoadAs(in + 4 * 4) >> 28 | SafeLoadAs(in + 4 * 5) << 4, + SafeLoadAs(in + 4 * 5), + SafeLoadAs(in + 4 * 5) >> 22 | SafeLoadAs(in + 4 * 6) << 10, + SafeLoadAs(in + 4 * 6), + }; shifts = simd_batch{ 0, 13, 0, 7, 0, 1, 14, 0, 8, 0, 2, 15, 0, 9, 0, 3 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 16; // extract 13-bit bundles 16 to 31 - words = simd_batch{ SafeLoad(in + 6), SafeLoad(in + 6) >> 29 | SafeLoad(in + 7) << 3, SafeLoad(in + 7), SafeLoad(in + 7) >> 23 | SafeLoad(in + 8) << 9, SafeLoad(in + 8), SafeLoad(in + 8), SafeLoad(in + 8) >> 30 | SafeLoad(in + 9) << 2, SafeLoad(in + 9), SafeLoad(in + 9) >> 24 | SafeLoad(in + 10) << 8, SafeLoad(in + 10), SafeLoad(in + 10), SafeLoad(in + 10) >> 31 | SafeLoad(in + 11) << 1, SafeLoad(in + 11), SafeLoad(in + 11) >> 25 | SafeLoad(in + 12) << 7, SafeLoad(in + 12), SafeLoad(in + 12) }; + words = simd_batch{ + SafeLoadAs(in + 4 * 6), + SafeLoadAs(in + 4 * 6) >> 29 | SafeLoadAs(in + 4 * 7) << 3, + SafeLoadAs(in + 4 * 7), + SafeLoadAs(in + 4 * 7) >> 23 | SafeLoadAs(in + 4 * 8) << 9, + SafeLoadAs(in + 4 * 8), + SafeLoadAs(in + 4 * 8), + SafeLoadAs(in + 4 * 8) >> 30 | SafeLoadAs(in + 4 * 9) << 2, + SafeLoadAs(in + 4 * 9), + SafeLoadAs(in + 4 * 9) >> 24 | SafeLoadAs(in + 4 * 10) << 8, + SafeLoadAs(in + 4 * 10), + SafeLoadAs(in + 4 * 10), + SafeLoadAs(in + 4 * 10) >> 31 | SafeLoadAs(in + 4 * 11) << 1, + SafeLoadAs(in + 4 * 11), + SafeLoadAs(in + 4 * 11) >> 25 | SafeLoadAs(in + 4 * 12) << 7, + SafeLoadAs(in + 4 * 12), + SafeLoadAs(in + 4 * 12), + }; shifts = simd_batch{ 16, 0, 10, 0, 4, 17, 0, 11, 0, 5, 18, 0, 12, 0, 6, 19 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 16; - in += 13; + in += 13 * 4; return in; } -inline static const uint32_t* unpack14_32(const uint32_t* in, uint32_t* out) { +inline static const uint8_t* unpack14_32(const uint8_t* in, uint32_t* out) { + using simd_batch = xsimd::make_sized_batch_t; + uint32_t mask = 0x3fff; simd_batch masks(mask); @@ -378,24 +1415,60 @@ inline static const uint32_t* unpack14_32(const uint32_t* in, uint32_t* out) { simd_batch results; // extract 14-bit bundles 0 to 15 - words = simd_batch{ SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0) >> 28 | SafeLoad(in + 1) << 4, SafeLoad(in + 1), SafeLoad(in + 1) >> 24 | SafeLoad(in + 2) << 8, SafeLoad(in + 2), SafeLoad(in + 2) >> 20 | SafeLoad(in + 3) << 12, SafeLoad(in + 3), SafeLoad(in + 3), SafeLoad(in + 3) >> 30 | SafeLoad(in + 4) << 2, SafeLoad(in + 4), SafeLoad(in + 4) >> 26 | SafeLoad(in + 5) << 6, SafeLoad(in + 5), SafeLoad(in + 5) >> 22 | SafeLoad(in + 6) << 10, SafeLoad(in + 6), SafeLoad(in + 6) }; + words = simd_batch{ + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0) >> 28 | SafeLoadAs(in + 4 * 1) << 4, + SafeLoadAs(in + 4 * 1), + SafeLoadAs(in + 4 * 1) >> 24 | SafeLoadAs(in + 4 * 2) << 8, + SafeLoadAs(in + 4 * 2), + SafeLoadAs(in + 4 * 2) >> 20 | SafeLoadAs(in + 4 * 3) << 12, + SafeLoadAs(in + 4 * 3), + SafeLoadAs(in + 4 * 3), + SafeLoadAs(in + 4 * 3) >> 30 | SafeLoadAs(in + 4 * 4) << 2, + SafeLoadAs(in + 4 * 4), + SafeLoadAs(in + 4 * 4) >> 26 | SafeLoadAs(in + 4 * 5) << 6, + SafeLoadAs(in + 4 * 5), + SafeLoadAs(in + 4 * 5) >> 22 | SafeLoadAs(in + 4 * 6) << 10, + SafeLoadAs(in + 4 * 6), + SafeLoadAs(in + 4 * 6), + }; shifts = simd_batch{ 0, 14, 0, 10, 0, 6, 0, 2, 16, 0, 12, 0, 8, 0, 4, 18 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 16; // extract 14-bit bundles 16 to 31 - words = simd_batch{ SafeLoad(in + 7), SafeLoad(in + 7), SafeLoad(in + 7) >> 28 | SafeLoad(in + 8) << 4, SafeLoad(in + 8), SafeLoad(in + 8) >> 24 | SafeLoad(in + 9) << 8, SafeLoad(in + 9), SafeLoad(in + 9) >> 20 | SafeLoad(in + 10) << 12, SafeLoad(in + 10), SafeLoad(in + 10), SafeLoad(in + 10) >> 30 | SafeLoad(in + 11) << 2, SafeLoad(in + 11), SafeLoad(in + 11) >> 26 | SafeLoad(in + 12) << 6, SafeLoad(in + 12), SafeLoad(in + 12) >> 22 | SafeLoad(in + 13) << 10, SafeLoad(in + 13), SafeLoad(in + 13) }; + words = simd_batch{ + SafeLoadAs(in + 4 * 7), + SafeLoadAs(in + 4 * 7), + SafeLoadAs(in + 4 * 7) >> 28 | SafeLoadAs(in + 4 * 8) << 4, + SafeLoadAs(in + 4 * 8), + SafeLoadAs(in + 4 * 8) >> 24 | SafeLoadAs(in + 4 * 9) << 8, + SafeLoadAs(in + 4 * 9), + SafeLoadAs(in + 4 * 9) >> 20 | SafeLoadAs(in + 4 * 10) << 12, + SafeLoadAs(in + 4 * 10), + SafeLoadAs(in + 4 * 10), + SafeLoadAs(in + 4 * 10) >> 30 | SafeLoadAs(in + 4 * 11) << 2, + SafeLoadAs(in + 4 * 11), + SafeLoadAs(in + 4 * 11) >> 26 | SafeLoadAs(in + 4 * 12) << 6, + SafeLoadAs(in + 4 * 12), + SafeLoadAs(in + 4 * 12) >> 22 | SafeLoadAs(in + 4 * 13) << 10, + SafeLoadAs(in + 4 * 13), + SafeLoadAs(in + 4 * 13), + }; shifts = simd_batch{ 0, 14, 0, 10, 0, 6, 0, 2, 16, 0, 12, 0, 8, 0, 4, 18 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 16; - in += 14; + in += 14 * 4; return in; } -inline static const uint32_t* unpack15_32(const uint32_t* in, uint32_t* out) { +inline static const uint8_t* unpack15_32(const uint8_t* in, uint32_t* out) { + using simd_batch = xsimd::make_sized_batch_t; + uint32_t mask = 0x7fff; simd_batch masks(mask); @@ -403,24 +1476,60 @@ inline static const uint32_t* unpack15_32(const uint32_t* in, uint32_t* out) { simd_batch results; // extract 15-bit bundles 0 to 15 - words = simd_batch{ SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0) >> 30 | SafeLoad(in + 1) << 2, SafeLoad(in + 1), SafeLoad(in + 1) >> 28 | SafeLoad(in + 2) << 4, SafeLoad(in + 2), SafeLoad(in + 2) >> 26 | SafeLoad(in + 3) << 6, SafeLoad(in + 3), SafeLoad(in + 3) >> 24 | SafeLoad(in + 4) << 8, SafeLoad(in + 4), SafeLoad(in + 4) >> 22 | SafeLoad(in + 5) << 10, SafeLoad(in + 5), SafeLoad(in + 5) >> 20 | SafeLoad(in + 6) << 12, SafeLoad(in + 6), SafeLoad(in + 6) >> 18 | SafeLoad(in + 7) << 14, SafeLoad(in + 7) }; + words = simd_batch{ + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0) >> 30 | SafeLoadAs(in + 4 * 1) << 2, + SafeLoadAs(in + 4 * 1), + SafeLoadAs(in + 4 * 1) >> 28 | SafeLoadAs(in + 4 * 2) << 4, + SafeLoadAs(in + 4 * 2), + SafeLoadAs(in + 4 * 2) >> 26 | SafeLoadAs(in + 4 * 3) << 6, + SafeLoadAs(in + 4 * 3), + SafeLoadAs(in + 4 * 3) >> 24 | SafeLoadAs(in + 4 * 4) << 8, + SafeLoadAs(in + 4 * 4), + SafeLoadAs(in + 4 * 4) >> 22 | SafeLoadAs(in + 4 * 5) << 10, + SafeLoadAs(in + 4 * 5), + SafeLoadAs(in + 4 * 5) >> 20 | SafeLoadAs(in + 4 * 6) << 12, + SafeLoadAs(in + 4 * 6), + SafeLoadAs(in + 4 * 6) >> 18 | SafeLoadAs(in + 4 * 7) << 14, + SafeLoadAs(in + 4 * 7), + }; shifts = simd_batch{ 0, 15, 0, 13, 0, 11, 0, 9, 0, 7, 0, 5, 0, 3, 0, 1 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 16; // extract 15-bit bundles 16 to 31 - words = simd_batch{ SafeLoad(in + 7), SafeLoad(in + 7) >> 31 | SafeLoad(in + 8) << 1, SafeLoad(in + 8), SafeLoad(in + 8) >> 29 | SafeLoad(in + 9) << 3, SafeLoad(in + 9), SafeLoad(in + 9) >> 27 | SafeLoad(in + 10) << 5, SafeLoad(in + 10), SafeLoad(in + 10) >> 25 | SafeLoad(in + 11) << 7, SafeLoad(in + 11), SafeLoad(in + 11) >> 23 | SafeLoad(in + 12) << 9, SafeLoad(in + 12), SafeLoad(in + 12) >> 21 | SafeLoad(in + 13) << 11, SafeLoad(in + 13), SafeLoad(in + 13) >> 19 | SafeLoad(in + 14) << 13, SafeLoad(in + 14), SafeLoad(in + 14) }; + words = simd_batch{ + SafeLoadAs(in + 4 * 7), + SafeLoadAs(in + 4 * 7) >> 31 | SafeLoadAs(in + 4 * 8) << 1, + SafeLoadAs(in + 4 * 8), + SafeLoadAs(in + 4 * 8) >> 29 | SafeLoadAs(in + 4 * 9) << 3, + SafeLoadAs(in + 4 * 9), + SafeLoadAs(in + 4 * 9) >> 27 | SafeLoadAs(in + 4 * 10) << 5, + SafeLoadAs(in + 4 * 10), + SafeLoadAs(in + 4 * 10) >> 25 | SafeLoadAs(in + 4 * 11) << 7, + SafeLoadAs(in + 4 * 11), + SafeLoadAs(in + 4 * 11) >> 23 | SafeLoadAs(in + 4 * 12) << 9, + SafeLoadAs(in + 4 * 12), + SafeLoadAs(in + 4 * 12) >> 21 | SafeLoadAs(in + 4 * 13) << 11, + SafeLoadAs(in + 4 * 13), + SafeLoadAs(in + 4 * 13) >> 19 | SafeLoadAs(in + 4 * 14) << 13, + SafeLoadAs(in + 4 * 14), + SafeLoadAs(in + 4 * 14), + }; shifts = simd_batch{ 16, 0, 14, 0, 12, 0, 10, 0, 8, 0, 6, 0, 4, 0, 2, 17 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 16; - in += 15; + in += 15 * 4; return in; } -inline static const uint32_t* unpack16_32(const uint32_t* in, uint32_t* out) { +inline static const uint8_t* unpack16_32(const uint8_t* in, uint32_t* out) { + using simd_batch = xsimd::make_sized_batch_t; + uint32_t mask = 0xffff; simd_batch masks(mask); @@ -428,24 +1537,60 @@ inline static const uint32_t* unpack16_32(const uint32_t* in, uint32_t* out) { simd_batch results; // extract 16-bit bundles 0 to 15 - words = simd_batch{ SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 1), SafeLoad(in + 1), SafeLoad(in + 2), SafeLoad(in + 2), SafeLoad(in + 3), SafeLoad(in + 3), SafeLoad(in + 4), SafeLoad(in + 4), SafeLoad(in + 5), SafeLoad(in + 5), SafeLoad(in + 6), SafeLoad(in + 6), SafeLoad(in + 7), SafeLoad(in + 7) }; + words = simd_batch{ + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 1), + SafeLoadAs(in + 4 * 1), + SafeLoadAs(in + 4 * 2), + SafeLoadAs(in + 4 * 2), + SafeLoadAs(in + 4 * 3), + SafeLoadAs(in + 4 * 3), + SafeLoadAs(in + 4 * 4), + SafeLoadAs(in + 4 * 4), + SafeLoadAs(in + 4 * 5), + SafeLoadAs(in + 4 * 5), + SafeLoadAs(in + 4 * 6), + SafeLoadAs(in + 4 * 6), + SafeLoadAs(in + 4 * 7), + SafeLoadAs(in + 4 * 7), + }; shifts = simd_batch{ 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 16; // extract 16-bit bundles 16 to 31 - words = simd_batch{ SafeLoad(in + 8), SafeLoad(in + 8), SafeLoad(in + 9), SafeLoad(in + 9), SafeLoad(in + 10), SafeLoad(in + 10), SafeLoad(in + 11), SafeLoad(in + 11), SafeLoad(in + 12), SafeLoad(in + 12), SafeLoad(in + 13), SafeLoad(in + 13), SafeLoad(in + 14), SafeLoad(in + 14), SafeLoad(in + 15), SafeLoad(in + 15) }; + words = simd_batch{ + SafeLoadAs(in + 4 * 8), + SafeLoadAs(in + 4 * 8), + SafeLoadAs(in + 4 * 9), + SafeLoadAs(in + 4 * 9), + SafeLoadAs(in + 4 * 10), + SafeLoadAs(in + 4 * 10), + SafeLoadAs(in + 4 * 11), + SafeLoadAs(in + 4 * 11), + SafeLoadAs(in + 4 * 12), + SafeLoadAs(in + 4 * 12), + SafeLoadAs(in + 4 * 13), + SafeLoadAs(in + 4 * 13), + SafeLoadAs(in + 4 * 14), + SafeLoadAs(in + 4 * 14), + SafeLoadAs(in + 4 * 15), + SafeLoadAs(in + 4 * 15), + }; shifts = simd_batch{ 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 16; - in += 16; + in += 16 * 4; return in; } -inline static const uint32_t* unpack17_32(const uint32_t* in, uint32_t* out) { +inline static const uint8_t* unpack17_32(const uint8_t* in, uint32_t* out) { + using simd_batch = xsimd::make_sized_batch_t; + uint32_t mask = 0x1ffff; simd_batch masks(mask); @@ -453,24 +1598,60 @@ inline static const uint32_t* unpack17_32(const uint32_t* in, uint32_t* out) { simd_batch results; // extract 17-bit bundles 0 to 15 - words = simd_batch{ SafeLoad(in + 0), SafeLoad(in + 0) >> 17 | SafeLoad(in + 1) << 15, SafeLoad(in + 1), SafeLoad(in + 1) >> 19 | SafeLoad(in + 2) << 13, SafeLoad(in + 2), SafeLoad(in + 2) >> 21 | SafeLoad(in + 3) << 11, SafeLoad(in + 3), SafeLoad(in + 3) >> 23 | SafeLoad(in + 4) << 9, SafeLoad(in + 4), SafeLoad(in + 4) >> 25 | SafeLoad(in + 5) << 7, SafeLoad(in + 5), SafeLoad(in + 5) >> 27 | SafeLoad(in + 6) << 5, SafeLoad(in + 6), SafeLoad(in + 6) >> 29 | SafeLoad(in + 7) << 3, SafeLoad(in + 7), SafeLoad(in + 7) >> 31 | SafeLoad(in + 8) << 1 }; + words = simd_batch{ + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0) >> 17 | SafeLoadAs(in + 4 * 1) << 15, + SafeLoadAs(in + 4 * 1), + SafeLoadAs(in + 4 * 1) >> 19 | SafeLoadAs(in + 4 * 2) << 13, + SafeLoadAs(in + 4 * 2), + SafeLoadAs(in + 4 * 2) >> 21 | SafeLoadAs(in + 4 * 3) << 11, + SafeLoadAs(in + 4 * 3), + SafeLoadAs(in + 4 * 3) >> 23 | SafeLoadAs(in + 4 * 4) << 9, + SafeLoadAs(in + 4 * 4), + SafeLoadAs(in + 4 * 4) >> 25 | SafeLoadAs(in + 4 * 5) << 7, + SafeLoadAs(in + 4 * 5), + SafeLoadAs(in + 4 * 5) >> 27 | SafeLoadAs(in + 4 * 6) << 5, + SafeLoadAs(in + 4 * 6), + SafeLoadAs(in + 4 * 6) >> 29 | SafeLoadAs(in + 4 * 7) << 3, + SafeLoadAs(in + 4 * 7), + SafeLoadAs(in + 4 * 7) >> 31 | SafeLoadAs(in + 4 * 8) << 1, + }; shifts = simd_batch{ 0, 0, 2, 0, 4, 0, 6, 0, 8, 0, 10, 0, 12, 0, 14, 0 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 16; // extract 17-bit bundles 16 to 31 - words = simd_batch{ SafeLoad(in + 8) >> 16 | SafeLoad(in + 9) << 16, SafeLoad(in + 9), SafeLoad(in + 9) >> 18 | SafeLoad(in + 10) << 14, SafeLoad(in + 10), SafeLoad(in + 10) >> 20 | SafeLoad(in + 11) << 12, SafeLoad(in + 11), SafeLoad(in + 11) >> 22 | SafeLoad(in + 12) << 10, SafeLoad(in + 12), SafeLoad(in + 12) >> 24 | SafeLoad(in + 13) << 8, SafeLoad(in + 13), SafeLoad(in + 13) >> 26 | SafeLoad(in + 14) << 6, SafeLoad(in + 14), SafeLoad(in + 14) >> 28 | SafeLoad(in + 15) << 4, SafeLoad(in + 15), SafeLoad(in + 15) >> 30 | SafeLoad(in + 16) << 2, SafeLoad(in + 16) }; + words = simd_batch{ + SafeLoadAs(in + 4 * 8) >> 16 | SafeLoadAs(in + 4 * 9) << 16, + SafeLoadAs(in + 4 * 9), + SafeLoadAs(in + 4 * 9) >> 18 | SafeLoadAs(in + 4 * 10) << 14, + SafeLoadAs(in + 4 * 10), + SafeLoadAs(in + 4 * 10) >> 20 | SafeLoadAs(in + 4 * 11) << 12, + SafeLoadAs(in + 4 * 11), + SafeLoadAs(in + 4 * 11) >> 22 | SafeLoadAs(in + 4 * 12) << 10, + SafeLoadAs(in + 4 * 12), + SafeLoadAs(in + 4 * 12) >> 24 | SafeLoadAs(in + 4 * 13) << 8, + SafeLoadAs(in + 4 * 13), + SafeLoadAs(in + 4 * 13) >> 26 | SafeLoadAs(in + 4 * 14) << 6, + SafeLoadAs(in + 4 * 14), + SafeLoadAs(in + 4 * 14) >> 28 | SafeLoadAs(in + 4 * 15) << 4, + SafeLoadAs(in + 4 * 15), + SafeLoadAs(in + 4 * 15) >> 30 | SafeLoadAs(in + 4 * 16) << 2, + SafeLoadAs(in + 4 * 16), + }; shifts = simd_batch{ 0, 1, 0, 3, 0, 5, 0, 7, 0, 9, 0, 11, 0, 13, 0, 15 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 16; - in += 17; + in += 17 * 4; return in; } -inline static const uint32_t* unpack18_32(const uint32_t* in, uint32_t* out) { +inline static const uint8_t* unpack18_32(const uint8_t* in, uint32_t* out) { + using simd_batch = xsimd::make_sized_batch_t; + uint32_t mask = 0x3ffff; simd_batch masks(mask); @@ -478,24 +1659,60 @@ inline static const uint32_t* unpack18_32(const uint32_t* in, uint32_t* out) { simd_batch results; // extract 18-bit bundles 0 to 15 - words = simd_batch{ SafeLoad(in + 0), SafeLoad(in + 0) >> 18 | SafeLoad(in + 1) << 14, SafeLoad(in + 1), SafeLoad(in + 1) >> 22 | SafeLoad(in + 2) << 10, SafeLoad(in + 2), SafeLoad(in + 2) >> 26 | SafeLoad(in + 3) << 6, SafeLoad(in + 3), SafeLoad(in + 3) >> 30 | SafeLoad(in + 4) << 2, SafeLoad(in + 4) >> 16 | SafeLoad(in + 5) << 16, SafeLoad(in + 5), SafeLoad(in + 5) >> 20 | SafeLoad(in + 6) << 12, SafeLoad(in + 6), SafeLoad(in + 6) >> 24 | SafeLoad(in + 7) << 8, SafeLoad(in + 7), SafeLoad(in + 7) >> 28 | SafeLoad(in + 8) << 4, SafeLoad(in + 8) }; + words = simd_batch{ + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0) >> 18 | SafeLoadAs(in + 4 * 1) << 14, + SafeLoadAs(in + 4 * 1), + SafeLoadAs(in + 4 * 1) >> 22 | SafeLoadAs(in + 4 * 2) << 10, + SafeLoadAs(in + 4 * 2), + SafeLoadAs(in + 4 * 2) >> 26 | SafeLoadAs(in + 4 * 3) << 6, + SafeLoadAs(in + 4 * 3), + SafeLoadAs(in + 4 * 3) >> 30 | SafeLoadAs(in + 4 * 4) << 2, + SafeLoadAs(in + 4 * 4) >> 16 | SafeLoadAs(in + 4 * 5) << 16, + SafeLoadAs(in + 4 * 5), + SafeLoadAs(in + 4 * 5) >> 20 | SafeLoadAs(in + 4 * 6) << 12, + SafeLoadAs(in + 4 * 6), + SafeLoadAs(in + 4 * 6) >> 24 | SafeLoadAs(in + 4 * 7) << 8, + SafeLoadAs(in + 4 * 7), + SafeLoadAs(in + 4 * 7) >> 28 | SafeLoadAs(in + 4 * 8) << 4, + SafeLoadAs(in + 4 * 8), + }; shifts = simd_batch{ 0, 0, 4, 0, 8, 0, 12, 0, 0, 2, 0, 6, 0, 10, 0, 14 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 16; // extract 18-bit bundles 16 to 31 - words = simd_batch{ SafeLoad(in + 9), SafeLoad(in + 9) >> 18 | SafeLoad(in + 10) << 14, SafeLoad(in + 10), SafeLoad(in + 10) >> 22 | SafeLoad(in + 11) << 10, SafeLoad(in + 11), SafeLoad(in + 11) >> 26 | SafeLoad(in + 12) << 6, SafeLoad(in + 12), SafeLoad(in + 12) >> 30 | SafeLoad(in + 13) << 2, SafeLoad(in + 13) >> 16 | SafeLoad(in + 14) << 16, SafeLoad(in + 14), SafeLoad(in + 14) >> 20 | SafeLoad(in + 15) << 12, SafeLoad(in + 15), SafeLoad(in + 15) >> 24 | SafeLoad(in + 16) << 8, SafeLoad(in + 16), SafeLoad(in + 16) >> 28 | SafeLoad(in + 17) << 4, SafeLoad(in + 17) }; + words = simd_batch{ + SafeLoadAs(in + 4 * 9), + SafeLoadAs(in + 4 * 9) >> 18 | SafeLoadAs(in + 4 * 10) << 14, + SafeLoadAs(in + 4 * 10), + SafeLoadAs(in + 4 * 10) >> 22 | SafeLoadAs(in + 4 * 11) << 10, + SafeLoadAs(in + 4 * 11), + SafeLoadAs(in + 4 * 11) >> 26 | SafeLoadAs(in + 4 * 12) << 6, + SafeLoadAs(in + 4 * 12), + SafeLoadAs(in + 4 * 12) >> 30 | SafeLoadAs(in + 4 * 13) << 2, + SafeLoadAs(in + 4 * 13) >> 16 | SafeLoadAs(in + 4 * 14) << 16, + SafeLoadAs(in + 4 * 14), + SafeLoadAs(in + 4 * 14) >> 20 | SafeLoadAs(in + 4 * 15) << 12, + SafeLoadAs(in + 4 * 15), + SafeLoadAs(in + 4 * 15) >> 24 | SafeLoadAs(in + 4 * 16) << 8, + SafeLoadAs(in + 4 * 16), + SafeLoadAs(in + 4 * 16) >> 28 | SafeLoadAs(in + 4 * 17) << 4, + SafeLoadAs(in + 4 * 17), + }; shifts = simd_batch{ 0, 0, 4, 0, 8, 0, 12, 0, 0, 2, 0, 6, 0, 10, 0, 14 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 16; - in += 18; + in += 18 * 4; return in; } -inline static const uint32_t* unpack19_32(const uint32_t* in, uint32_t* out) { +inline static const uint8_t* unpack19_32(const uint8_t* in, uint32_t* out) { + using simd_batch = xsimd::make_sized_batch_t; + uint32_t mask = 0x7ffff; simd_batch masks(mask); @@ -503,24 +1720,60 @@ inline static const uint32_t* unpack19_32(const uint32_t* in, uint32_t* out) { simd_batch results; // extract 19-bit bundles 0 to 15 - words = simd_batch{ SafeLoad(in + 0), SafeLoad(in + 0) >> 19 | SafeLoad(in + 1) << 13, SafeLoad(in + 1), SafeLoad(in + 1) >> 25 | SafeLoad(in + 2) << 7, SafeLoad(in + 2), SafeLoad(in + 2) >> 31 | SafeLoad(in + 3) << 1, SafeLoad(in + 3) >> 18 | SafeLoad(in + 4) << 14, SafeLoad(in + 4), SafeLoad(in + 4) >> 24 | SafeLoad(in + 5) << 8, SafeLoad(in + 5), SafeLoad(in + 5) >> 30 | SafeLoad(in + 6) << 2, SafeLoad(in + 6) >> 17 | SafeLoad(in + 7) << 15, SafeLoad(in + 7), SafeLoad(in + 7) >> 23 | SafeLoad(in + 8) << 9, SafeLoad(in + 8), SafeLoad(in + 8) >> 29 | SafeLoad(in + 9) << 3 }; + words = simd_batch{ + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0) >> 19 | SafeLoadAs(in + 4 * 1) << 13, + SafeLoadAs(in + 4 * 1), + SafeLoadAs(in + 4 * 1) >> 25 | SafeLoadAs(in + 4 * 2) << 7, + SafeLoadAs(in + 4 * 2), + SafeLoadAs(in + 4 * 2) >> 31 | SafeLoadAs(in + 4 * 3) << 1, + SafeLoadAs(in + 4 * 3) >> 18 | SafeLoadAs(in + 4 * 4) << 14, + SafeLoadAs(in + 4 * 4), + SafeLoadAs(in + 4 * 4) >> 24 | SafeLoadAs(in + 4 * 5) << 8, + SafeLoadAs(in + 4 * 5), + SafeLoadAs(in + 4 * 5) >> 30 | SafeLoadAs(in + 4 * 6) << 2, + SafeLoadAs(in + 4 * 6) >> 17 | SafeLoadAs(in + 4 * 7) << 15, + SafeLoadAs(in + 4 * 7), + SafeLoadAs(in + 4 * 7) >> 23 | SafeLoadAs(in + 4 * 8) << 9, + SafeLoadAs(in + 4 * 8), + SafeLoadAs(in + 4 * 8) >> 29 | SafeLoadAs(in + 4 * 9) << 3, + }; shifts = simd_batch{ 0, 0, 6, 0, 12, 0, 0, 5, 0, 11, 0, 0, 4, 0, 10, 0 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 16; // extract 19-bit bundles 16 to 31 - words = simd_batch{ SafeLoad(in + 9) >> 16 | SafeLoad(in + 10) << 16, SafeLoad(in + 10), SafeLoad(in + 10) >> 22 | SafeLoad(in + 11) << 10, SafeLoad(in + 11), SafeLoad(in + 11) >> 28 | SafeLoad(in + 12) << 4, SafeLoad(in + 12) >> 15 | SafeLoad(in + 13) << 17, SafeLoad(in + 13), SafeLoad(in + 13) >> 21 | SafeLoad(in + 14) << 11, SafeLoad(in + 14), SafeLoad(in + 14) >> 27 | SafeLoad(in + 15) << 5, SafeLoad(in + 15) >> 14 | SafeLoad(in + 16) << 18, SafeLoad(in + 16), SafeLoad(in + 16) >> 20 | SafeLoad(in + 17) << 12, SafeLoad(in + 17), SafeLoad(in + 17) >> 26 | SafeLoad(in + 18) << 6, SafeLoad(in + 18) }; + words = simd_batch{ + SafeLoadAs(in + 4 * 9) >> 16 | SafeLoadAs(in + 4 * 10) << 16, + SafeLoadAs(in + 4 * 10), + SafeLoadAs(in + 4 * 10) >> 22 | SafeLoadAs(in + 4 * 11) << 10, + SafeLoadAs(in + 4 * 11), + SafeLoadAs(in + 4 * 11) >> 28 | SafeLoadAs(in + 4 * 12) << 4, + SafeLoadAs(in + 4 * 12) >> 15 | SafeLoadAs(in + 4 * 13) << 17, + SafeLoadAs(in + 4 * 13), + SafeLoadAs(in + 4 * 13) >> 21 | SafeLoadAs(in + 4 * 14) << 11, + SafeLoadAs(in + 4 * 14), + SafeLoadAs(in + 4 * 14) >> 27 | SafeLoadAs(in + 4 * 15) << 5, + SafeLoadAs(in + 4 * 15) >> 14 | SafeLoadAs(in + 4 * 16) << 18, + SafeLoadAs(in + 4 * 16), + SafeLoadAs(in + 4 * 16) >> 20 | SafeLoadAs(in + 4 * 17) << 12, + SafeLoadAs(in + 4 * 17), + SafeLoadAs(in + 4 * 17) >> 26 | SafeLoadAs(in + 4 * 18) << 6, + SafeLoadAs(in + 4 * 18), + }; shifts = simd_batch{ 0, 3, 0, 9, 0, 0, 2, 0, 8, 0, 0, 1, 0, 7, 0, 13 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 16; - in += 19; + in += 19 * 4; return in; } -inline static const uint32_t* unpack20_32(const uint32_t* in, uint32_t* out) { +inline static const uint8_t* unpack20_32(const uint8_t* in, uint32_t* out) { + using simd_batch = xsimd::make_sized_batch_t; + uint32_t mask = 0xfffff; simd_batch masks(mask); @@ -528,24 +1781,60 @@ inline static const uint32_t* unpack20_32(const uint32_t* in, uint32_t* out) { simd_batch results; // extract 20-bit bundles 0 to 15 - words = simd_batch{ SafeLoad(in + 0), SafeLoad(in + 0) >> 20 | SafeLoad(in + 1) << 12, SafeLoad(in + 1), SafeLoad(in + 1) >> 28 | SafeLoad(in + 2) << 4, SafeLoad(in + 2) >> 16 | SafeLoad(in + 3) << 16, SafeLoad(in + 3), SafeLoad(in + 3) >> 24 | SafeLoad(in + 4) << 8, SafeLoad(in + 4), SafeLoad(in + 5), SafeLoad(in + 5) >> 20 | SafeLoad(in + 6) << 12, SafeLoad(in + 6), SafeLoad(in + 6) >> 28 | SafeLoad(in + 7) << 4, SafeLoad(in + 7) >> 16 | SafeLoad(in + 8) << 16, SafeLoad(in + 8), SafeLoad(in + 8) >> 24 | SafeLoad(in + 9) << 8, SafeLoad(in + 9) }; + words = simd_batch{ + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0) >> 20 | SafeLoadAs(in + 4 * 1) << 12, + SafeLoadAs(in + 4 * 1), + SafeLoadAs(in + 4 * 1) >> 28 | SafeLoadAs(in + 4 * 2) << 4, + SafeLoadAs(in + 4 * 2) >> 16 | SafeLoadAs(in + 4 * 3) << 16, + SafeLoadAs(in + 4 * 3), + SafeLoadAs(in + 4 * 3) >> 24 | SafeLoadAs(in + 4 * 4) << 8, + SafeLoadAs(in + 4 * 4), + SafeLoadAs(in + 4 * 5), + SafeLoadAs(in + 4 * 5) >> 20 | SafeLoadAs(in + 4 * 6) << 12, + SafeLoadAs(in + 4 * 6), + SafeLoadAs(in + 4 * 6) >> 28 | SafeLoadAs(in + 4 * 7) << 4, + SafeLoadAs(in + 4 * 7) >> 16 | SafeLoadAs(in + 4 * 8) << 16, + SafeLoadAs(in + 4 * 8), + SafeLoadAs(in + 4 * 8) >> 24 | SafeLoadAs(in + 4 * 9) << 8, + SafeLoadAs(in + 4 * 9), + }; shifts = simd_batch{ 0, 0, 8, 0, 0, 4, 0, 12, 0, 0, 8, 0, 0, 4, 0, 12 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 16; // extract 20-bit bundles 16 to 31 - words = simd_batch{ SafeLoad(in + 10), SafeLoad(in + 10) >> 20 | SafeLoad(in + 11) << 12, SafeLoad(in + 11), SafeLoad(in + 11) >> 28 | SafeLoad(in + 12) << 4, SafeLoad(in + 12) >> 16 | SafeLoad(in + 13) << 16, SafeLoad(in + 13), SafeLoad(in + 13) >> 24 | SafeLoad(in + 14) << 8, SafeLoad(in + 14), SafeLoad(in + 15), SafeLoad(in + 15) >> 20 | SafeLoad(in + 16) << 12, SafeLoad(in + 16), SafeLoad(in + 16) >> 28 | SafeLoad(in + 17) << 4, SafeLoad(in + 17) >> 16 | SafeLoad(in + 18) << 16, SafeLoad(in + 18), SafeLoad(in + 18) >> 24 | SafeLoad(in + 19) << 8, SafeLoad(in + 19) }; + words = simd_batch{ + SafeLoadAs(in + 4 * 10), + SafeLoadAs(in + 4 * 10) >> 20 | SafeLoadAs(in + 4 * 11) << 12, + SafeLoadAs(in + 4 * 11), + SafeLoadAs(in + 4 * 11) >> 28 | SafeLoadAs(in + 4 * 12) << 4, + SafeLoadAs(in + 4 * 12) >> 16 | SafeLoadAs(in + 4 * 13) << 16, + SafeLoadAs(in + 4 * 13), + SafeLoadAs(in + 4 * 13) >> 24 | SafeLoadAs(in + 4 * 14) << 8, + SafeLoadAs(in + 4 * 14), + SafeLoadAs(in + 4 * 15), + SafeLoadAs(in + 4 * 15) >> 20 | SafeLoadAs(in + 4 * 16) << 12, + SafeLoadAs(in + 4 * 16), + SafeLoadAs(in + 4 * 16) >> 28 | SafeLoadAs(in + 4 * 17) << 4, + SafeLoadAs(in + 4 * 17) >> 16 | SafeLoadAs(in + 4 * 18) << 16, + SafeLoadAs(in + 4 * 18), + SafeLoadAs(in + 4 * 18) >> 24 | SafeLoadAs(in + 4 * 19) << 8, + SafeLoadAs(in + 4 * 19), + }; shifts = simd_batch{ 0, 0, 8, 0, 0, 4, 0, 12, 0, 0, 8, 0, 0, 4, 0, 12 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 16; - in += 20; + in += 20 * 4; return in; } -inline static const uint32_t* unpack21_32(const uint32_t* in, uint32_t* out) { +inline static const uint8_t* unpack21_32(const uint8_t* in, uint32_t* out) { + using simd_batch = xsimd::make_sized_batch_t; + uint32_t mask = 0x1fffff; simd_batch masks(mask); @@ -553,24 +1842,60 @@ inline static const uint32_t* unpack21_32(const uint32_t* in, uint32_t* out) { simd_batch results; // extract 21-bit bundles 0 to 15 - words = simd_batch{ SafeLoad(in + 0), SafeLoad(in + 0) >> 21 | SafeLoad(in + 1) << 11, SafeLoad(in + 1), SafeLoad(in + 1) >> 31 | SafeLoad(in + 2) << 1, SafeLoad(in + 2) >> 20 | SafeLoad(in + 3) << 12, SafeLoad(in + 3), SafeLoad(in + 3) >> 30 | SafeLoad(in + 4) << 2, SafeLoad(in + 4) >> 19 | SafeLoad(in + 5) << 13, SafeLoad(in + 5), SafeLoad(in + 5) >> 29 | SafeLoad(in + 6) << 3, SafeLoad(in + 6) >> 18 | SafeLoad(in + 7) << 14, SafeLoad(in + 7), SafeLoad(in + 7) >> 28 | SafeLoad(in + 8) << 4, SafeLoad(in + 8) >> 17 | SafeLoad(in + 9) << 15, SafeLoad(in + 9), SafeLoad(in + 9) >> 27 | SafeLoad(in + 10) << 5 }; + words = simd_batch{ + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0) >> 21 | SafeLoadAs(in + 4 * 1) << 11, + SafeLoadAs(in + 4 * 1), + SafeLoadAs(in + 4 * 1) >> 31 | SafeLoadAs(in + 4 * 2) << 1, + SafeLoadAs(in + 4 * 2) >> 20 | SafeLoadAs(in + 4 * 3) << 12, + SafeLoadAs(in + 4 * 3), + SafeLoadAs(in + 4 * 3) >> 30 | SafeLoadAs(in + 4 * 4) << 2, + SafeLoadAs(in + 4 * 4) >> 19 | SafeLoadAs(in + 4 * 5) << 13, + SafeLoadAs(in + 4 * 5), + SafeLoadAs(in + 4 * 5) >> 29 | SafeLoadAs(in + 4 * 6) << 3, + SafeLoadAs(in + 4 * 6) >> 18 | SafeLoadAs(in + 4 * 7) << 14, + SafeLoadAs(in + 4 * 7), + SafeLoadAs(in + 4 * 7) >> 28 | SafeLoadAs(in + 4 * 8) << 4, + SafeLoadAs(in + 4 * 8) >> 17 | SafeLoadAs(in + 4 * 9) << 15, + SafeLoadAs(in + 4 * 9), + SafeLoadAs(in + 4 * 9) >> 27 | SafeLoadAs(in + 4 * 10) << 5, + }; shifts = simd_batch{ 0, 0, 10, 0, 0, 9, 0, 0, 8, 0, 0, 7, 0, 0, 6, 0 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 16; // extract 21-bit bundles 16 to 31 - words = simd_batch{ SafeLoad(in + 10) >> 16 | SafeLoad(in + 11) << 16, SafeLoad(in + 11), SafeLoad(in + 11) >> 26 | SafeLoad(in + 12) << 6, SafeLoad(in + 12) >> 15 | SafeLoad(in + 13) << 17, SafeLoad(in + 13), SafeLoad(in + 13) >> 25 | SafeLoad(in + 14) << 7, SafeLoad(in + 14) >> 14 | SafeLoad(in + 15) << 18, SafeLoad(in + 15), SafeLoad(in + 15) >> 24 | SafeLoad(in + 16) << 8, SafeLoad(in + 16) >> 13 | SafeLoad(in + 17) << 19, SafeLoad(in + 17), SafeLoad(in + 17) >> 23 | SafeLoad(in + 18) << 9, SafeLoad(in + 18) >> 12 | SafeLoad(in + 19) << 20, SafeLoad(in + 19), SafeLoad(in + 19) >> 22 | SafeLoad(in + 20) << 10, SafeLoad(in + 20) }; + words = simd_batch{ + SafeLoadAs(in + 4 * 10) >> 16 | SafeLoadAs(in + 4 * 11) << 16, + SafeLoadAs(in + 4 * 11), + SafeLoadAs(in + 4 * 11) >> 26 | SafeLoadAs(in + 4 * 12) << 6, + SafeLoadAs(in + 4 * 12) >> 15 | SafeLoadAs(in + 4 * 13) << 17, + SafeLoadAs(in + 4 * 13), + SafeLoadAs(in + 4 * 13) >> 25 | SafeLoadAs(in + 4 * 14) << 7, + SafeLoadAs(in + 4 * 14) >> 14 | SafeLoadAs(in + 4 * 15) << 18, + SafeLoadAs(in + 4 * 15), + SafeLoadAs(in + 4 * 15) >> 24 | SafeLoadAs(in + 4 * 16) << 8, + SafeLoadAs(in + 4 * 16) >> 13 | SafeLoadAs(in + 4 * 17) << 19, + SafeLoadAs(in + 4 * 17), + SafeLoadAs(in + 4 * 17) >> 23 | SafeLoadAs(in + 4 * 18) << 9, + SafeLoadAs(in + 4 * 18) >> 12 | SafeLoadAs(in + 4 * 19) << 20, + SafeLoadAs(in + 4 * 19), + SafeLoadAs(in + 4 * 19) >> 22 | SafeLoadAs(in + 4 * 20) << 10, + SafeLoadAs(in + 4 * 20), + }; shifts = simd_batch{ 0, 5, 0, 0, 4, 0, 0, 3, 0, 0, 2, 0, 0, 1, 0, 11 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 16; - in += 21; + in += 21 * 4; return in; } -inline static const uint32_t* unpack22_32(const uint32_t* in, uint32_t* out) { +inline static const uint8_t* unpack22_32(const uint8_t* in, uint32_t* out) { + using simd_batch = xsimd::make_sized_batch_t; + uint32_t mask = 0x3fffff; simd_batch masks(mask); @@ -578,24 +1903,60 @@ inline static const uint32_t* unpack22_32(const uint32_t* in, uint32_t* out) { simd_batch results; // extract 22-bit bundles 0 to 15 - words = simd_batch{ SafeLoad(in + 0), SafeLoad(in + 0) >> 22 | SafeLoad(in + 1) << 10, SafeLoad(in + 1) >> 12 | SafeLoad(in + 2) << 20, SafeLoad(in + 2), SafeLoad(in + 2) >> 24 | SafeLoad(in + 3) << 8, SafeLoad(in + 3) >> 14 | SafeLoad(in + 4) << 18, SafeLoad(in + 4), SafeLoad(in + 4) >> 26 | SafeLoad(in + 5) << 6, SafeLoad(in + 5) >> 16 | SafeLoad(in + 6) << 16, SafeLoad(in + 6), SafeLoad(in + 6) >> 28 | SafeLoad(in + 7) << 4, SafeLoad(in + 7) >> 18 | SafeLoad(in + 8) << 14, SafeLoad(in + 8), SafeLoad(in + 8) >> 30 | SafeLoad(in + 9) << 2, SafeLoad(in + 9) >> 20 | SafeLoad(in + 10) << 12, SafeLoad(in + 10) }; + words = simd_batch{ + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0) >> 22 | SafeLoadAs(in + 4 * 1) << 10, + SafeLoadAs(in + 4 * 1) >> 12 | SafeLoadAs(in + 4 * 2) << 20, + SafeLoadAs(in + 4 * 2), + SafeLoadAs(in + 4 * 2) >> 24 | SafeLoadAs(in + 4 * 3) << 8, + SafeLoadAs(in + 4 * 3) >> 14 | SafeLoadAs(in + 4 * 4) << 18, + SafeLoadAs(in + 4 * 4), + SafeLoadAs(in + 4 * 4) >> 26 | SafeLoadAs(in + 4 * 5) << 6, + SafeLoadAs(in + 4 * 5) >> 16 | SafeLoadAs(in + 4 * 6) << 16, + SafeLoadAs(in + 4 * 6), + SafeLoadAs(in + 4 * 6) >> 28 | SafeLoadAs(in + 4 * 7) << 4, + SafeLoadAs(in + 4 * 7) >> 18 | SafeLoadAs(in + 4 * 8) << 14, + SafeLoadAs(in + 4 * 8), + SafeLoadAs(in + 4 * 8) >> 30 | SafeLoadAs(in + 4 * 9) << 2, + SafeLoadAs(in + 4 * 9) >> 20 | SafeLoadAs(in + 4 * 10) << 12, + SafeLoadAs(in + 4 * 10), + }; shifts = simd_batch{ 0, 0, 0, 2, 0, 0, 4, 0, 0, 6, 0, 0, 8, 0, 0, 10 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 16; // extract 22-bit bundles 16 to 31 - words = simd_batch{ SafeLoad(in + 11), SafeLoad(in + 11) >> 22 | SafeLoad(in + 12) << 10, SafeLoad(in + 12) >> 12 | SafeLoad(in + 13) << 20, SafeLoad(in + 13), SafeLoad(in + 13) >> 24 | SafeLoad(in + 14) << 8, SafeLoad(in + 14) >> 14 | SafeLoad(in + 15) << 18, SafeLoad(in + 15), SafeLoad(in + 15) >> 26 | SafeLoad(in + 16) << 6, SafeLoad(in + 16) >> 16 | SafeLoad(in + 17) << 16, SafeLoad(in + 17), SafeLoad(in + 17) >> 28 | SafeLoad(in + 18) << 4, SafeLoad(in + 18) >> 18 | SafeLoad(in + 19) << 14, SafeLoad(in + 19), SafeLoad(in + 19) >> 30 | SafeLoad(in + 20) << 2, SafeLoad(in + 20) >> 20 | SafeLoad(in + 21) << 12, SafeLoad(in + 21) }; + words = simd_batch{ + SafeLoadAs(in + 4 * 11), + SafeLoadAs(in + 4 * 11) >> 22 | SafeLoadAs(in + 4 * 12) << 10, + SafeLoadAs(in + 4 * 12) >> 12 | SafeLoadAs(in + 4 * 13) << 20, + SafeLoadAs(in + 4 * 13), + SafeLoadAs(in + 4 * 13) >> 24 | SafeLoadAs(in + 4 * 14) << 8, + SafeLoadAs(in + 4 * 14) >> 14 | SafeLoadAs(in + 4 * 15) << 18, + SafeLoadAs(in + 4 * 15), + SafeLoadAs(in + 4 * 15) >> 26 | SafeLoadAs(in + 4 * 16) << 6, + SafeLoadAs(in + 4 * 16) >> 16 | SafeLoadAs(in + 4 * 17) << 16, + SafeLoadAs(in + 4 * 17), + SafeLoadAs(in + 4 * 17) >> 28 | SafeLoadAs(in + 4 * 18) << 4, + SafeLoadAs(in + 4 * 18) >> 18 | SafeLoadAs(in + 4 * 19) << 14, + SafeLoadAs(in + 4 * 19), + SafeLoadAs(in + 4 * 19) >> 30 | SafeLoadAs(in + 4 * 20) << 2, + SafeLoadAs(in + 4 * 20) >> 20 | SafeLoadAs(in + 4 * 21) << 12, + SafeLoadAs(in + 4 * 21), + }; shifts = simd_batch{ 0, 0, 0, 2, 0, 0, 4, 0, 0, 6, 0, 0, 8, 0, 0, 10 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 16; - in += 22; + in += 22 * 4; return in; } -inline static const uint32_t* unpack23_32(const uint32_t* in, uint32_t* out) { +inline static const uint8_t* unpack23_32(const uint8_t* in, uint32_t* out) { + using simd_batch = xsimd::make_sized_batch_t; + uint32_t mask = 0x7fffff; simd_batch masks(mask); @@ -603,24 +1964,60 @@ inline static const uint32_t* unpack23_32(const uint32_t* in, uint32_t* out) { simd_batch results; // extract 23-bit bundles 0 to 15 - words = simd_batch{ SafeLoad(in + 0), SafeLoad(in + 0) >> 23 | SafeLoad(in + 1) << 9, SafeLoad(in + 1) >> 14 | SafeLoad(in + 2) << 18, SafeLoad(in + 2), SafeLoad(in + 2) >> 28 | SafeLoad(in + 3) << 4, SafeLoad(in + 3) >> 19 | SafeLoad(in + 4) << 13, SafeLoad(in + 4) >> 10 | SafeLoad(in + 5) << 22, SafeLoad(in + 5), SafeLoad(in + 5) >> 24 | SafeLoad(in + 6) << 8, SafeLoad(in + 6) >> 15 | SafeLoad(in + 7) << 17, SafeLoad(in + 7), SafeLoad(in + 7) >> 29 | SafeLoad(in + 8) << 3, SafeLoad(in + 8) >> 20 | SafeLoad(in + 9) << 12, SafeLoad(in + 9) >> 11 | SafeLoad(in + 10) << 21, SafeLoad(in + 10), SafeLoad(in + 10) >> 25 | SafeLoad(in + 11) << 7 }; + words = simd_batch{ + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0) >> 23 | SafeLoadAs(in + 4 * 1) << 9, + SafeLoadAs(in + 4 * 1) >> 14 | SafeLoadAs(in + 4 * 2) << 18, + SafeLoadAs(in + 4 * 2), + SafeLoadAs(in + 4 * 2) >> 28 | SafeLoadAs(in + 4 * 3) << 4, + SafeLoadAs(in + 4 * 3) >> 19 | SafeLoadAs(in + 4 * 4) << 13, + SafeLoadAs(in + 4 * 4) >> 10 | SafeLoadAs(in + 4 * 5) << 22, + SafeLoadAs(in + 4 * 5), + SafeLoadAs(in + 4 * 5) >> 24 | SafeLoadAs(in + 4 * 6) << 8, + SafeLoadAs(in + 4 * 6) >> 15 | SafeLoadAs(in + 4 * 7) << 17, + SafeLoadAs(in + 4 * 7), + SafeLoadAs(in + 4 * 7) >> 29 | SafeLoadAs(in + 4 * 8) << 3, + SafeLoadAs(in + 4 * 8) >> 20 | SafeLoadAs(in + 4 * 9) << 12, + SafeLoadAs(in + 4 * 9) >> 11 | SafeLoadAs(in + 4 * 10) << 21, + SafeLoadAs(in + 4 * 10), + SafeLoadAs(in + 4 * 10) >> 25 | SafeLoadAs(in + 4 * 11) << 7, + }; shifts = simd_batch{ 0, 0, 0, 5, 0, 0, 0, 1, 0, 0, 6, 0, 0, 0, 2, 0 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 16; // extract 23-bit bundles 16 to 31 - words = simd_batch{ SafeLoad(in + 11) >> 16 | SafeLoad(in + 12) << 16, SafeLoad(in + 12), SafeLoad(in + 12) >> 30 | SafeLoad(in + 13) << 2, SafeLoad(in + 13) >> 21 | SafeLoad(in + 14) << 11, SafeLoad(in + 14) >> 12 | SafeLoad(in + 15) << 20, SafeLoad(in + 15), SafeLoad(in + 15) >> 26 | SafeLoad(in + 16) << 6, SafeLoad(in + 16) >> 17 | SafeLoad(in + 17) << 15, SafeLoad(in + 17), SafeLoad(in + 17) >> 31 | SafeLoad(in + 18) << 1, SafeLoad(in + 18) >> 22 | SafeLoad(in + 19) << 10, SafeLoad(in + 19) >> 13 | SafeLoad(in + 20) << 19, SafeLoad(in + 20), SafeLoad(in + 20) >> 27 | SafeLoad(in + 21) << 5, SafeLoad(in + 21) >> 18 | SafeLoad(in + 22) << 14, SafeLoad(in + 22) }; + words = simd_batch{ + SafeLoadAs(in + 4 * 11) >> 16 | SafeLoadAs(in + 4 * 12) << 16, + SafeLoadAs(in + 4 * 12), + SafeLoadAs(in + 4 * 12) >> 30 | SafeLoadAs(in + 4 * 13) << 2, + SafeLoadAs(in + 4 * 13) >> 21 | SafeLoadAs(in + 4 * 14) << 11, + SafeLoadAs(in + 4 * 14) >> 12 | SafeLoadAs(in + 4 * 15) << 20, + SafeLoadAs(in + 4 * 15), + SafeLoadAs(in + 4 * 15) >> 26 | SafeLoadAs(in + 4 * 16) << 6, + SafeLoadAs(in + 4 * 16) >> 17 | SafeLoadAs(in + 4 * 17) << 15, + SafeLoadAs(in + 4 * 17), + SafeLoadAs(in + 4 * 17) >> 31 | SafeLoadAs(in + 4 * 18) << 1, + SafeLoadAs(in + 4 * 18) >> 22 | SafeLoadAs(in + 4 * 19) << 10, + SafeLoadAs(in + 4 * 19) >> 13 | SafeLoadAs(in + 4 * 20) << 19, + SafeLoadAs(in + 4 * 20), + SafeLoadAs(in + 4 * 20) >> 27 | SafeLoadAs(in + 4 * 21) << 5, + SafeLoadAs(in + 4 * 21) >> 18 | SafeLoadAs(in + 4 * 22) << 14, + SafeLoadAs(in + 4 * 22), + }; shifts = simd_batch{ 0, 7, 0, 0, 0, 3, 0, 0, 8, 0, 0, 0, 4, 0, 0, 9 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 16; - in += 23; + in += 23 * 4; return in; } -inline static const uint32_t* unpack24_32(const uint32_t* in, uint32_t* out) { +inline static const uint8_t* unpack24_32(const uint8_t* in, uint32_t* out) { + using simd_batch = xsimd::make_sized_batch_t; + uint32_t mask = 0xffffff; simd_batch masks(mask); @@ -628,24 +2025,60 @@ inline static const uint32_t* unpack24_32(const uint32_t* in, uint32_t* out) { simd_batch results; // extract 24-bit bundles 0 to 15 - words = simd_batch{ SafeLoad(in + 0), SafeLoad(in + 0) >> 24 | SafeLoad(in + 1) << 8, SafeLoad(in + 1) >> 16 | SafeLoad(in + 2) << 16, SafeLoad(in + 2), SafeLoad(in + 3), SafeLoad(in + 3) >> 24 | SafeLoad(in + 4) << 8, SafeLoad(in + 4) >> 16 | SafeLoad(in + 5) << 16, SafeLoad(in + 5), SafeLoad(in + 6), SafeLoad(in + 6) >> 24 | SafeLoad(in + 7) << 8, SafeLoad(in + 7) >> 16 | SafeLoad(in + 8) << 16, SafeLoad(in + 8), SafeLoad(in + 9), SafeLoad(in + 9) >> 24 | SafeLoad(in + 10) << 8, SafeLoad(in + 10) >> 16 | SafeLoad(in + 11) << 16, SafeLoad(in + 11) }; + words = simd_batch{ + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0) >> 24 | SafeLoadAs(in + 4 * 1) << 8, + SafeLoadAs(in + 4 * 1) >> 16 | SafeLoadAs(in + 4 * 2) << 16, + SafeLoadAs(in + 4 * 2), + SafeLoadAs(in + 4 * 3), + SafeLoadAs(in + 4 * 3) >> 24 | SafeLoadAs(in + 4 * 4) << 8, + SafeLoadAs(in + 4 * 4) >> 16 | SafeLoadAs(in + 4 * 5) << 16, + SafeLoadAs(in + 4 * 5), + SafeLoadAs(in + 4 * 6), + SafeLoadAs(in + 4 * 6) >> 24 | SafeLoadAs(in + 4 * 7) << 8, + SafeLoadAs(in + 4 * 7) >> 16 | SafeLoadAs(in + 4 * 8) << 16, + SafeLoadAs(in + 4 * 8), + SafeLoadAs(in + 4 * 9), + SafeLoadAs(in + 4 * 9) >> 24 | SafeLoadAs(in + 4 * 10) << 8, + SafeLoadAs(in + 4 * 10) >> 16 | SafeLoadAs(in + 4 * 11) << 16, + SafeLoadAs(in + 4 * 11), + }; shifts = simd_batch{ 0, 0, 0, 8, 0, 0, 0, 8, 0, 0, 0, 8, 0, 0, 0, 8 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 16; // extract 24-bit bundles 16 to 31 - words = simd_batch{ SafeLoad(in + 12), SafeLoad(in + 12) >> 24 | SafeLoad(in + 13) << 8, SafeLoad(in + 13) >> 16 | SafeLoad(in + 14) << 16, SafeLoad(in + 14), SafeLoad(in + 15), SafeLoad(in + 15) >> 24 | SafeLoad(in + 16) << 8, SafeLoad(in + 16) >> 16 | SafeLoad(in + 17) << 16, SafeLoad(in + 17), SafeLoad(in + 18), SafeLoad(in + 18) >> 24 | SafeLoad(in + 19) << 8, SafeLoad(in + 19) >> 16 | SafeLoad(in + 20) << 16, SafeLoad(in + 20), SafeLoad(in + 21), SafeLoad(in + 21) >> 24 | SafeLoad(in + 22) << 8, SafeLoad(in + 22) >> 16 | SafeLoad(in + 23) << 16, SafeLoad(in + 23) }; + words = simd_batch{ + SafeLoadAs(in + 4 * 12), + SafeLoadAs(in + 4 * 12) >> 24 | SafeLoadAs(in + 4 * 13) << 8, + SafeLoadAs(in + 4 * 13) >> 16 | SafeLoadAs(in + 4 * 14) << 16, + SafeLoadAs(in + 4 * 14), + SafeLoadAs(in + 4 * 15), + SafeLoadAs(in + 4 * 15) >> 24 | SafeLoadAs(in + 4 * 16) << 8, + SafeLoadAs(in + 4 * 16) >> 16 | SafeLoadAs(in + 4 * 17) << 16, + SafeLoadAs(in + 4 * 17), + SafeLoadAs(in + 4 * 18), + SafeLoadAs(in + 4 * 18) >> 24 | SafeLoadAs(in + 4 * 19) << 8, + SafeLoadAs(in + 4 * 19) >> 16 | SafeLoadAs(in + 4 * 20) << 16, + SafeLoadAs(in + 4 * 20), + SafeLoadAs(in + 4 * 21), + SafeLoadAs(in + 4 * 21) >> 24 | SafeLoadAs(in + 4 * 22) << 8, + SafeLoadAs(in + 4 * 22) >> 16 | SafeLoadAs(in + 4 * 23) << 16, + SafeLoadAs(in + 4 * 23), + }; shifts = simd_batch{ 0, 0, 0, 8, 0, 0, 0, 8, 0, 0, 0, 8, 0, 0, 0, 8 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 16; - in += 24; + in += 24 * 4; return in; } -inline static const uint32_t* unpack25_32(const uint32_t* in, uint32_t* out) { +inline static const uint8_t* unpack25_32(const uint8_t* in, uint32_t* out) { + using simd_batch = xsimd::make_sized_batch_t; + uint32_t mask = 0x1ffffff; simd_batch masks(mask); @@ -653,24 +2086,60 @@ inline static const uint32_t* unpack25_32(const uint32_t* in, uint32_t* out) { simd_batch results; // extract 25-bit bundles 0 to 15 - words = simd_batch{ SafeLoad(in + 0), SafeLoad(in + 0) >> 25 | SafeLoad(in + 1) << 7, SafeLoad(in + 1) >> 18 | SafeLoad(in + 2) << 14, SafeLoad(in + 2) >> 11 | SafeLoad(in + 3) << 21, SafeLoad(in + 3), SafeLoad(in + 3) >> 29 | SafeLoad(in + 4) << 3, SafeLoad(in + 4) >> 22 | SafeLoad(in + 5) << 10, SafeLoad(in + 5) >> 15 | SafeLoad(in + 6) << 17, SafeLoad(in + 6) >> 8 | SafeLoad(in + 7) << 24, SafeLoad(in + 7), SafeLoad(in + 7) >> 26 | SafeLoad(in + 8) << 6, SafeLoad(in + 8) >> 19 | SafeLoad(in + 9) << 13, SafeLoad(in + 9) >> 12 | SafeLoad(in + 10) << 20, SafeLoad(in + 10), SafeLoad(in + 10) >> 30 | SafeLoad(in + 11) << 2, SafeLoad(in + 11) >> 23 | SafeLoad(in + 12) << 9 }; + words = simd_batch{ + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0) >> 25 | SafeLoadAs(in + 4 * 1) << 7, + SafeLoadAs(in + 4 * 1) >> 18 | SafeLoadAs(in + 4 * 2) << 14, + SafeLoadAs(in + 4 * 2) >> 11 | SafeLoadAs(in + 4 * 3) << 21, + SafeLoadAs(in + 4 * 3), + SafeLoadAs(in + 4 * 3) >> 29 | SafeLoadAs(in + 4 * 4) << 3, + SafeLoadAs(in + 4 * 4) >> 22 | SafeLoadAs(in + 4 * 5) << 10, + SafeLoadAs(in + 4 * 5) >> 15 | SafeLoadAs(in + 4 * 6) << 17, + SafeLoadAs(in + 4 * 6) >> 8 | SafeLoadAs(in + 4 * 7) << 24, + SafeLoadAs(in + 4 * 7), + SafeLoadAs(in + 4 * 7) >> 26 | SafeLoadAs(in + 4 * 8) << 6, + SafeLoadAs(in + 4 * 8) >> 19 | SafeLoadAs(in + 4 * 9) << 13, + SafeLoadAs(in + 4 * 9) >> 12 | SafeLoadAs(in + 4 * 10) << 20, + SafeLoadAs(in + 4 * 10), + SafeLoadAs(in + 4 * 10) >> 30 | SafeLoadAs(in + 4 * 11) << 2, + SafeLoadAs(in + 4 * 11) >> 23 | SafeLoadAs(in + 4 * 12) << 9, + }; shifts = simd_batch{ 0, 0, 0, 0, 4, 0, 0, 0, 0, 1, 0, 0, 0, 5, 0, 0 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 16; // extract 25-bit bundles 16 to 31 - words = simd_batch{ SafeLoad(in + 12) >> 16 | SafeLoad(in + 13) << 16, SafeLoad(in + 13) >> 9 | SafeLoad(in + 14) << 23, SafeLoad(in + 14), SafeLoad(in + 14) >> 27 | SafeLoad(in + 15) << 5, SafeLoad(in + 15) >> 20 | SafeLoad(in + 16) << 12, SafeLoad(in + 16) >> 13 | SafeLoad(in + 17) << 19, SafeLoad(in + 17), SafeLoad(in + 17) >> 31 | SafeLoad(in + 18) << 1, SafeLoad(in + 18) >> 24 | SafeLoad(in + 19) << 8, SafeLoad(in + 19) >> 17 | SafeLoad(in + 20) << 15, SafeLoad(in + 20) >> 10 | SafeLoad(in + 21) << 22, SafeLoad(in + 21), SafeLoad(in + 21) >> 28 | SafeLoad(in + 22) << 4, SafeLoad(in + 22) >> 21 | SafeLoad(in + 23) << 11, SafeLoad(in + 23) >> 14 | SafeLoad(in + 24) << 18, SafeLoad(in + 24) }; + words = simd_batch{ + SafeLoadAs(in + 4 * 12) >> 16 | SafeLoadAs(in + 4 * 13) << 16, + SafeLoadAs(in + 4 * 13) >> 9 | SafeLoadAs(in + 4 * 14) << 23, + SafeLoadAs(in + 4 * 14), + SafeLoadAs(in + 4 * 14) >> 27 | SafeLoadAs(in + 4 * 15) << 5, + SafeLoadAs(in + 4 * 15) >> 20 | SafeLoadAs(in + 4 * 16) << 12, + SafeLoadAs(in + 4 * 16) >> 13 | SafeLoadAs(in + 4 * 17) << 19, + SafeLoadAs(in + 4 * 17), + SafeLoadAs(in + 4 * 17) >> 31 | SafeLoadAs(in + 4 * 18) << 1, + SafeLoadAs(in + 4 * 18) >> 24 | SafeLoadAs(in + 4 * 19) << 8, + SafeLoadAs(in + 4 * 19) >> 17 | SafeLoadAs(in + 4 * 20) << 15, + SafeLoadAs(in + 4 * 20) >> 10 | SafeLoadAs(in + 4 * 21) << 22, + SafeLoadAs(in + 4 * 21), + SafeLoadAs(in + 4 * 21) >> 28 | SafeLoadAs(in + 4 * 22) << 4, + SafeLoadAs(in + 4 * 22) >> 21 | SafeLoadAs(in + 4 * 23) << 11, + SafeLoadAs(in + 4 * 23) >> 14 | SafeLoadAs(in + 4 * 24) << 18, + SafeLoadAs(in + 4 * 24), + }; shifts = simd_batch{ 0, 0, 2, 0, 0, 0, 6, 0, 0, 0, 0, 3, 0, 0, 0, 7 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 16; - in += 25; + in += 25 * 4; return in; } -inline static const uint32_t* unpack26_32(const uint32_t* in, uint32_t* out) { +inline static const uint8_t* unpack26_32(const uint8_t* in, uint32_t* out) { + using simd_batch = xsimd::make_sized_batch_t; + uint32_t mask = 0x3ffffff; simd_batch masks(mask); @@ -678,24 +2147,60 @@ inline static const uint32_t* unpack26_32(const uint32_t* in, uint32_t* out) { simd_batch results; // extract 26-bit bundles 0 to 15 - words = simd_batch{ SafeLoad(in + 0), SafeLoad(in + 0) >> 26 | SafeLoad(in + 1) << 6, SafeLoad(in + 1) >> 20 | SafeLoad(in + 2) << 12, SafeLoad(in + 2) >> 14 | SafeLoad(in + 3) << 18, SafeLoad(in + 3) >> 8 | SafeLoad(in + 4) << 24, SafeLoad(in + 4), SafeLoad(in + 4) >> 28 | SafeLoad(in + 5) << 4, SafeLoad(in + 5) >> 22 | SafeLoad(in + 6) << 10, SafeLoad(in + 6) >> 16 | SafeLoad(in + 7) << 16, SafeLoad(in + 7) >> 10 | SafeLoad(in + 8) << 22, SafeLoad(in + 8), SafeLoad(in + 8) >> 30 | SafeLoad(in + 9) << 2, SafeLoad(in + 9) >> 24 | SafeLoad(in + 10) << 8, SafeLoad(in + 10) >> 18 | SafeLoad(in + 11) << 14, SafeLoad(in + 11) >> 12 | SafeLoad(in + 12) << 20, SafeLoad(in + 12) }; + words = simd_batch{ + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0) >> 26 | SafeLoadAs(in + 4 * 1) << 6, + SafeLoadAs(in + 4 * 1) >> 20 | SafeLoadAs(in + 4 * 2) << 12, + SafeLoadAs(in + 4 * 2) >> 14 | SafeLoadAs(in + 4 * 3) << 18, + SafeLoadAs(in + 4 * 3) >> 8 | SafeLoadAs(in + 4 * 4) << 24, + SafeLoadAs(in + 4 * 4), + SafeLoadAs(in + 4 * 4) >> 28 | SafeLoadAs(in + 4 * 5) << 4, + SafeLoadAs(in + 4 * 5) >> 22 | SafeLoadAs(in + 4 * 6) << 10, + SafeLoadAs(in + 4 * 6) >> 16 | SafeLoadAs(in + 4 * 7) << 16, + SafeLoadAs(in + 4 * 7) >> 10 | SafeLoadAs(in + 4 * 8) << 22, + SafeLoadAs(in + 4 * 8), + SafeLoadAs(in + 4 * 8) >> 30 | SafeLoadAs(in + 4 * 9) << 2, + SafeLoadAs(in + 4 * 9) >> 24 | SafeLoadAs(in + 4 * 10) << 8, + SafeLoadAs(in + 4 * 10) >> 18 | SafeLoadAs(in + 4 * 11) << 14, + SafeLoadAs(in + 4 * 11) >> 12 | SafeLoadAs(in + 4 * 12) << 20, + SafeLoadAs(in + 4 * 12), + }; shifts = simd_batch{ 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 4, 0, 0, 0, 0, 6 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 16; // extract 26-bit bundles 16 to 31 - words = simd_batch{ SafeLoad(in + 13), SafeLoad(in + 13) >> 26 | SafeLoad(in + 14) << 6, SafeLoad(in + 14) >> 20 | SafeLoad(in + 15) << 12, SafeLoad(in + 15) >> 14 | SafeLoad(in + 16) << 18, SafeLoad(in + 16) >> 8 | SafeLoad(in + 17) << 24, SafeLoad(in + 17), SafeLoad(in + 17) >> 28 | SafeLoad(in + 18) << 4, SafeLoad(in + 18) >> 22 | SafeLoad(in + 19) << 10, SafeLoad(in + 19) >> 16 | SafeLoad(in + 20) << 16, SafeLoad(in + 20) >> 10 | SafeLoad(in + 21) << 22, SafeLoad(in + 21), SafeLoad(in + 21) >> 30 | SafeLoad(in + 22) << 2, SafeLoad(in + 22) >> 24 | SafeLoad(in + 23) << 8, SafeLoad(in + 23) >> 18 | SafeLoad(in + 24) << 14, SafeLoad(in + 24) >> 12 | SafeLoad(in + 25) << 20, SafeLoad(in + 25) }; + words = simd_batch{ + SafeLoadAs(in + 4 * 13), + SafeLoadAs(in + 4 * 13) >> 26 | SafeLoadAs(in + 4 * 14) << 6, + SafeLoadAs(in + 4 * 14) >> 20 | SafeLoadAs(in + 4 * 15) << 12, + SafeLoadAs(in + 4 * 15) >> 14 | SafeLoadAs(in + 4 * 16) << 18, + SafeLoadAs(in + 4 * 16) >> 8 | SafeLoadAs(in + 4 * 17) << 24, + SafeLoadAs(in + 4 * 17), + SafeLoadAs(in + 4 * 17) >> 28 | SafeLoadAs(in + 4 * 18) << 4, + SafeLoadAs(in + 4 * 18) >> 22 | SafeLoadAs(in + 4 * 19) << 10, + SafeLoadAs(in + 4 * 19) >> 16 | SafeLoadAs(in + 4 * 20) << 16, + SafeLoadAs(in + 4 * 20) >> 10 | SafeLoadAs(in + 4 * 21) << 22, + SafeLoadAs(in + 4 * 21), + SafeLoadAs(in + 4 * 21) >> 30 | SafeLoadAs(in + 4 * 22) << 2, + SafeLoadAs(in + 4 * 22) >> 24 | SafeLoadAs(in + 4 * 23) << 8, + SafeLoadAs(in + 4 * 23) >> 18 | SafeLoadAs(in + 4 * 24) << 14, + SafeLoadAs(in + 4 * 24) >> 12 | SafeLoadAs(in + 4 * 25) << 20, + SafeLoadAs(in + 4 * 25), + }; shifts = simd_batch{ 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 4, 0, 0, 0, 0, 6 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 16; - in += 26; + in += 26 * 4; return in; } -inline static const uint32_t* unpack27_32(const uint32_t* in, uint32_t* out) { +inline static const uint8_t* unpack27_32(const uint8_t* in, uint32_t* out) { + using simd_batch = xsimd::make_sized_batch_t; + uint32_t mask = 0x7ffffff; simd_batch masks(mask); @@ -703,24 +2208,60 @@ inline static const uint32_t* unpack27_32(const uint32_t* in, uint32_t* out) { simd_batch results; // extract 27-bit bundles 0 to 15 - words = simd_batch{ SafeLoad(in + 0), SafeLoad(in + 0) >> 27 | SafeLoad(in + 1) << 5, SafeLoad(in + 1) >> 22 | SafeLoad(in + 2) << 10, SafeLoad(in + 2) >> 17 | SafeLoad(in + 3) << 15, SafeLoad(in + 3) >> 12 | SafeLoad(in + 4) << 20, SafeLoad(in + 4) >> 7 | SafeLoad(in + 5) << 25, SafeLoad(in + 5), SafeLoad(in + 5) >> 29 | SafeLoad(in + 6) << 3, SafeLoad(in + 6) >> 24 | SafeLoad(in + 7) << 8, SafeLoad(in + 7) >> 19 | SafeLoad(in + 8) << 13, SafeLoad(in + 8) >> 14 | SafeLoad(in + 9) << 18, SafeLoad(in + 9) >> 9 | SafeLoad(in + 10) << 23, SafeLoad(in + 10), SafeLoad(in + 10) >> 31 | SafeLoad(in + 11) << 1, SafeLoad(in + 11) >> 26 | SafeLoad(in + 12) << 6, SafeLoad(in + 12) >> 21 | SafeLoad(in + 13) << 11 }; + words = simd_batch{ + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0) >> 27 | SafeLoadAs(in + 4 * 1) << 5, + SafeLoadAs(in + 4 * 1) >> 22 | SafeLoadAs(in + 4 * 2) << 10, + SafeLoadAs(in + 4 * 2) >> 17 | SafeLoadAs(in + 4 * 3) << 15, + SafeLoadAs(in + 4 * 3) >> 12 | SafeLoadAs(in + 4 * 4) << 20, + SafeLoadAs(in + 4 * 4) >> 7 | SafeLoadAs(in + 4 * 5) << 25, + SafeLoadAs(in + 4 * 5), + SafeLoadAs(in + 4 * 5) >> 29 | SafeLoadAs(in + 4 * 6) << 3, + SafeLoadAs(in + 4 * 6) >> 24 | SafeLoadAs(in + 4 * 7) << 8, + SafeLoadAs(in + 4 * 7) >> 19 | SafeLoadAs(in + 4 * 8) << 13, + SafeLoadAs(in + 4 * 8) >> 14 | SafeLoadAs(in + 4 * 9) << 18, + SafeLoadAs(in + 4 * 9) >> 9 | SafeLoadAs(in + 4 * 10) << 23, + SafeLoadAs(in + 4 * 10), + SafeLoadAs(in + 4 * 10) >> 31 | SafeLoadAs(in + 4 * 11) << 1, + SafeLoadAs(in + 4 * 11) >> 26 | SafeLoadAs(in + 4 * 12) << 6, + SafeLoadAs(in + 4 * 12) >> 21 | SafeLoadAs(in + 4 * 13) << 11, + }; shifts = simd_batch{ 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 4, 0, 0, 0 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 16; // extract 27-bit bundles 16 to 31 - words = simd_batch{ SafeLoad(in + 13) >> 16 | SafeLoad(in + 14) << 16, SafeLoad(in + 14) >> 11 | SafeLoad(in + 15) << 21, SafeLoad(in + 15) >> 6 | SafeLoad(in + 16) << 26, SafeLoad(in + 16), SafeLoad(in + 16) >> 28 | SafeLoad(in + 17) << 4, SafeLoad(in + 17) >> 23 | SafeLoad(in + 18) << 9, SafeLoad(in + 18) >> 18 | SafeLoad(in + 19) << 14, SafeLoad(in + 19) >> 13 | SafeLoad(in + 20) << 19, SafeLoad(in + 20) >> 8 | SafeLoad(in + 21) << 24, SafeLoad(in + 21), SafeLoad(in + 21) >> 30 | SafeLoad(in + 22) << 2, SafeLoad(in + 22) >> 25 | SafeLoad(in + 23) << 7, SafeLoad(in + 23) >> 20 | SafeLoad(in + 24) << 12, SafeLoad(in + 24) >> 15 | SafeLoad(in + 25) << 17, SafeLoad(in + 25) >> 10 | SafeLoad(in + 26) << 22, SafeLoad(in + 26) }; + words = simd_batch{ + SafeLoadAs(in + 4 * 13) >> 16 | SafeLoadAs(in + 4 * 14) << 16, + SafeLoadAs(in + 4 * 14) >> 11 | SafeLoadAs(in + 4 * 15) << 21, + SafeLoadAs(in + 4 * 15) >> 6 | SafeLoadAs(in + 4 * 16) << 26, + SafeLoadAs(in + 4 * 16), + SafeLoadAs(in + 4 * 16) >> 28 | SafeLoadAs(in + 4 * 17) << 4, + SafeLoadAs(in + 4 * 17) >> 23 | SafeLoadAs(in + 4 * 18) << 9, + SafeLoadAs(in + 4 * 18) >> 18 | SafeLoadAs(in + 4 * 19) << 14, + SafeLoadAs(in + 4 * 19) >> 13 | SafeLoadAs(in + 4 * 20) << 19, + SafeLoadAs(in + 4 * 20) >> 8 | SafeLoadAs(in + 4 * 21) << 24, + SafeLoadAs(in + 4 * 21), + SafeLoadAs(in + 4 * 21) >> 30 | SafeLoadAs(in + 4 * 22) << 2, + SafeLoadAs(in + 4 * 22) >> 25 | SafeLoadAs(in + 4 * 23) << 7, + SafeLoadAs(in + 4 * 23) >> 20 | SafeLoadAs(in + 4 * 24) << 12, + SafeLoadAs(in + 4 * 24) >> 15 | SafeLoadAs(in + 4 * 25) << 17, + SafeLoadAs(in + 4 * 25) >> 10 | SafeLoadAs(in + 4 * 26) << 22, + SafeLoadAs(in + 4 * 26), + }; shifts = simd_batch{ 0, 0, 0, 1, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 5 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 16; - in += 27; + in += 27 * 4; return in; } -inline static const uint32_t* unpack28_32(const uint32_t* in, uint32_t* out) { +inline static const uint8_t* unpack28_32(const uint8_t* in, uint32_t* out) { + using simd_batch = xsimd::make_sized_batch_t; + uint32_t mask = 0xfffffff; simd_batch masks(mask); @@ -728,24 +2269,60 @@ inline static const uint32_t* unpack28_32(const uint32_t* in, uint32_t* out) { simd_batch results; // extract 28-bit bundles 0 to 15 - words = simd_batch{ SafeLoad(in + 0), SafeLoad(in + 0) >> 28 | SafeLoad(in + 1) << 4, SafeLoad(in + 1) >> 24 | SafeLoad(in + 2) << 8, SafeLoad(in + 2) >> 20 | SafeLoad(in + 3) << 12, SafeLoad(in + 3) >> 16 | SafeLoad(in + 4) << 16, SafeLoad(in + 4) >> 12 | SafeLoad(in + 5) << 20, SafeLoad(in + 5) >> 8 | SafeLoad(in + 6) << 24, SafeLoad(in + 6), SafeLoad(in + 7), SafeLoad(in + 7) >> 28 | SafeLoad(in + 8) << 4, SafeLoad(in + 8) >> 24 | SafeLoad(in + 9) << 8, SafeLoad(in + 9) >> 20 | SafeLoad(in + 10) << 12, SafeLoad(in + 10) >> 16 | SafeLoad(in + 11) << 16, SafeLoad(in + 11) >> 12 | SafeLoad(in + 12) << 20, SafeLoad(in + 12) >> 8 | SafeLoad(in + 13) << 24, SafeLoad(in + 13) }; + words = simd_batch{ + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0) >> 28 | SafeLoadAs(in + 4 * 1) << 4, + SafeLoadAs(in + 4 * 1) >> 24 | SafeLoadAs(in + 4 * 2) << 8, + SafeLoadAs(in + 4 * 2) >> 20 | SafeLoadAs(in + 4 * 3) << 12, + SafeLoadAs(in + 4 * 3) >> 16 | SafeLoadAs(in + 4 * 4) << 16, + SafeLoadAs(in + 4 * 4) >> 12 | SafeLoadAs(in + 4 * 5) << 20, + SafeLoadAs(in + 4 * 5) >> 8 | SafeLoadAs(in + 4 * 6) << 24, + SafeLoadAs(in + 4 * 6), + SafeLoadAs(in + 4 * 7), + SafeLoadAs(in + 4 * 7) >> 28 | SafeLoadAs(in + 4 * 8) << 4, + SafeLoadAs(in + 4 * 8) >> 24 | SafeLoadAs(in + 4 * 9) << 8, + SafeLoadAs(in + 4 * 9) >> 20 | SafeLoadAs(in + 4 * 10) << 12, + SafeLoadAs(in + 4 * 10) >> 16 | SafeLoadAs(in + 4 * 11) << 16, + SafeLoadAs(in + 4 * 11) >> 12 | SafeLoadAs(in + 4 * 12) << 20, + SafeLoadAs(in + 4 * 12) >> 8 | SafeLoadAs(in + 4 * 13) << 24, + SafeLoadAs(in + 4 * 13), + }; shifts = simd_batch{ 0, 0, 0, 0, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 4 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 16; // extract 28-bit bundles 16 to 31 - words = simd_batch{ SafeLoad(in + 14), SafeLoad(in + 14) >> 28 | SafeLoad(in + 15) << 4, SafeLoad(in + 15) >> 24 | SafeLoad(in + 16) << 8, SafeLoad(in + 16) >> 20 | SafeLoad(in + 17) << 12, SafeLoad(in + 17) >> 16 | SafeLoad(in + 18) << 16, SafeLoad(in + 18) >> 12 | SafeLoad(in + 19) << 20, SafeLoad(in + 19) >> 8 | SafeLoad(in + 20) << 24, SafeLoad(in + 20), SafeLoad(in + 21), SafeLoad(in + 21) >> 28 | SafeLoad(in + 22) << 4, SafeLoad(in + 22) >> 24 | SafeLoad(in + 23) << 8, SafeLoad(in + 23) >> 20 | SafeLoad(in + 24) << 12, SafeLoad(in + 24) >> 16 | SafeLoad(in + 25) << 16, SafeLoad(in + 25) >> 12 | SafeLoad(in + 26) << 20, SafeLoad(in + 26) >> 8 | SafeLoad(in + 27) << 24, SafeLoad(in + 27) }; + words = simd_batch{ + SafeLoadAs(in + 4 * 14), + SafeLoadAs(in + 4 * 14) >> 28 | SafeLoadAs(in + 4 * 15) << 4, + SafeLoadAs(in + 4 * 15) >> 24 | SafeLoadAs(in + 4 * 16) << 8, + SafeLoadAs(in + 4 * 16) >> 20 | SafeLoadAs(in + 4 * 17) << 12, + SafeLoadAs(in + 4 * 17) >> 16 | SafeLoadAs(in + 4 * 18) << 16, + SafeLoadAs(in + 4 * 18) >> 12 | SafeLoadAs(in + 4 * 19) << 20, + SafeLoadAs(in + 4 * 19) >> 8 | SafeLoadAs(in + 4 * 20) << 24, + SafeLoadAs(in + 4 * 20), + SafeLoadAs(in + 4 * 21), + SafeLoadAs(in + 4 * 21) >> 28 | SafeLoadAs(in + 4 * 22) << 4, + SafeLoadAs(in + 4 * 22) >> 24 | SafeLoadAs(in + 4 * 23) << 8, + SafeLoadAs(in + 4 * 23) >> 20 | SafeLoadAs(in + 4 * 24) << 12, + SafeLoadAs(in + 4 * 24) >> 16 | SafeLoadAs(in + 4 * 25) << 16, + SafeLoadAs(in + 4 * 25) >> 12 | SafeLoadAs(in + 4 * 26) << 20, + SafeLoadAs(in + 4 * 26) >> 8 | SafeLoadAs(in + 4 * 27) << 24, + SafeLoadAs(in + 4 * 27), + }; shifts = simd_batch{ 0, 0, 0, 0, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 4 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 16; - in += 28; + in += 28 * 4; return in; } -inline static const uint32_t* unpack29_32(const uint32_t* in, uint32_t* out) { +inline static const uint8_t* unpack29_32(const uint8_t* in, uint32_t* out) { + using simd_batch = xsimd::make_sized_batch_t; + uint32_t mask = 0x1fffffff; simd_batch masks(mask); @@ -753,24 +2330,60 @@ inline static const uint32_t* unpack29_32(const uint32_t* in, uint32_t* out) { simd_batch results; // extract 29-bit bundles 0 to 15 - words = simd_batch{ SafeLoad(in + 0), SafeLoad(in + 0) >> 29 | SafeLoad(in + 1) << 3, SafeLoad(in + 1) >> 26 | SafeLoad(in + 2) << 6, SafeLoad(in + 2) >> 23 | SafeLoad(in + 3) << 9, SafeLoad(in + 3) >> 20 | SafeLoad(in + 4) << 12, SafeLoad(in + 4) >> 17 | SafeLoad(in + 5) << 15, SafeLoad(in + 5) >> 14 | SafeLoad(in + 6) << 18, SafeLoad(in + 6) >> 11 | SafeLoad(in + 7) << 21, SafeLoad(in + 7) >> 8 | SafeLoad(in + 8) << 24, SafeLoad(in + 8) >> 5 | SafeLoad(in + 9) << 27, SafeLoad(in + 9), SafeLoad(in + 9) >> 31 | SafeLoad(in + 10) << 1, SafeLoad(in + 10) >> 28 | SafeLoad(in + 11) << 4, SafeLoad(in + 11) >> 25 | SafeLoad(in + 12) << 7, SafeLoad(in + 12) >> 22 | SafeLoad(in + 13) << 10, SafeLoad(in + 13) >> 19 | SafeLoad(in + 14) << 13 }; + words = simd_batch{ + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0) >> 29 | SafeLoadAs(in + 4 * 1) << 3, + SafeLoadAs(in + 4 * 1) >> 26 | SafeLoadAs(in + 4 * 2) << 6, + SafeLoadAs(in + 4 * 2) >> 23 | SafeLoadAs(in + 4 * 3) << 9, + SafeLoadAs(in + 4 * 3) >> 20 | SafeLoadAs(in + 4 * 4) << 12, + SafeLoadAs(in + 4 * 4) >> 17 | SafeLoadAs(in + 4 * 5) << 15, + SafeLoadAs(in + 4 * 5) >> 14 | SafeLoadAs(in + 4 * 6) << 18, + SafeLoadAs(in + 4 * 6) >> 11 | SafeLoadAs(in + 4 * 7) << 21, + SafeLoadAs(in + 4 * 7) >> 8 | SafeLoadAs(in + 4 * 8) << 24, + SafeLoadAs(in + 4 * 8) >> 5 | SafeLoadAs(in + 4 * 9) << 27, + SafeLoadAs(in + 4 * 9), + SafeLoadAs(in + 4 * 9) >> 31 | SafeLoadAs(in + 4 * 10) << 1, + SafeLoadAs(in + 4 * 10) >> 28 | SafeLoadAs(in + 4 * 11) << 4, + SafeLoadAs(in + 4 * 11) >> 25 | SafeLoadAs(in + 4 * 12) << 7, + SafeLoadAs(in + 4 * 12) >> 22 | SafeLoadAs(in + 4 * 13) << 10, + SafeLoadAs(in + 4 * 13) >> 19 | SafeLoadAs(in + 4 * 14) << 13, + }; shifts = simd_batch{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 16; // extract 29-bit bundles 16 to 31 - words = simd_batch{ SafeLoad(in + 14) >> 16 | SafeLoad(in + 15) << 16, SafeLoad(in + 15) >> 13 | SafeLoad(in + 16) << 19, SafeLoad(in + 16) >> 10 | SafeLoad(in + 17) << 22, SafeLoad(in + 17) >> 7 | SafeLoad(in + 18) << 25, SafeLoad(in + 18) >> 4 | SafeLoad(in + 19) << 28, SafeLoad(in + 19), SafeLoad(in + 19) >> 30 | SafeLoad(in + 20) << 2, SafeLoad(in + 20) >> 27 | SafeLoad(in + 21) << 5, SafeLoad(in + 21) >> 24 | SafeLoad(in + 22) << 8, SafeLoad(in + 22) >> 21 | SafeLoad(in + 23) << 11, SafeLoad(in + 23) >> 18 | SafeLoad(in + 24) << 14, SafeLoad(in + 24) >> 15 | SafeLoad(in + 25) << 17, SafeLoad(in + 25) >> 12 | SafeLoad(in + 26) << 20, SafeLoad(in + 26) >> 9 | SafeLoad(in + 27) << 23, SafeLoad(in + 27) >> 6 | SafeLoad(in + 28) << 26, SafeLoad(in + 28) }; + words = simd_batch{ + SafeLoadAs(in + 4 * 14) >> 16 | SafeLoadAs(in + 4 * 15) << 16, + SafeLoadAs(in + 4 * 15) >> 13 | SafeLoadAs(in + 4 * 16) << 19, + SafeLoadAs(in + 4 * 16) >> 10 | SafeLoadAs(in + 4 * 17) << 22, + SafeLoadAs(in + 4 * 17) >> 7 | SafeLoadAs(in + 4 * 18) << 25, + SafeLoadAs(in + 4 * 18) >> 4 | SafeLoadAs(in + 4 * 19) << 28, + SafeLoadAs(in + 4 * 19), + SafeLoadAs(in + 4 * 19) >> 30 | SafeLoadAs(in + 4 * 20) << 2, + SafeLoadAs(in + 4 * 20) >> 27 | SafeLoadAs(in + 4 * 21) << 5, + SafeLoadAs(in + 4 * 21) >> 24 | SafeLoadAs(in + 4 * 22) << 8, + SafeLoadAs(in + 4 * 22) >> 21 | SafeLoadAs(in + 4 * 23) << 11, + SafeLoadAs(in + 4 * 23) >> 18 | SafeLoadAs(in + 4 * 24) << 14, + SafeLoadAs(in + 4 * 24) >> 15 | SafeLoadAs(in + 4 * 25) << 17, + SafeLoadAs(in + 4 * 25) >> 12 | SafeLoadAs(in + 4 * 26) << 20, + SafeLoadAs(in + 4 * 26) >> 9 | SafeLoadAs(in + 4 * 27) << 23, + SafeLoadAs(in + 4 * 27) >> 6 | SafeLoadAs(in + 4 * 28) << 26, + SafeLoadAs(in + 4 * 28), + }; shifts = simd_batch{ 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 16; - in += 29; + in += 29 * 4; return in; } -inline static const uint32_t* unpack30_32(const uint32_t* in, uint32_t* out) { +inline static const uint8_t* unpack30_32(const uint8_t* in, uint32_t* out) { + using simd_batch = xsimd::make_sized_batch_t; + uint32_t mask = 0x3fffffff; simd_batch masks(mask); @@ -778,24 +2391,60 @@ inline static const uint32_t* unpack30_32(const uint32_t* in, uint32_t* out) { simd_batch results; // extract 30-bit bundles 0 to 15 - words = simd_batch{ SafeLoad(in + 0), SafeLoad(in + 0) >> 30 | SafeLoad(in + 1) << 2, SafeLoad(in + 1) >> 28 | SafeLoad(in + 2) << 4, SafeLoad(in + 2) >> 26 | SafeLoad(in + 3) << 6, SafeLoad(in + 3) >> 24 | SafeLoad(in + 4) << 8, SafeLoad(in + 4) >> 22 | SafeLoad(in + 5) << 10, SafeLoad(in + 5) >> 20 | SafeLoad(in + 6) << 12, SafeLoad(in + 6) >> 18 | SafeLoad(in + 7) << 14, SafeLoad(in + 7) >> 16 | SafeLoad(in + 8) << 16, SafeLoad(in + 8) >> 14 | SafeLoad(in + 9) << 18, SafeLoad(in + 9) >> 12 | SafeLoad(in + 10) << 20, SafeLoad(in + 10) >> 10 | SafeLoad(in + 11) << 22, SafeLoad(in + 11) >> 8 | SafeLoad(in + 12) << 24, SafeLoad(in + 12) >> 6 | SafeLoad(in + 13) << 26, SafeLoad(in + 13) >> 4 | SafeLoad(in + 14) << 28, SafeLoad(in + 14) }; + words = simd_batch{ + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0) >> 30 | SafeLoadAs(in + 4 * 1) << 2, + SafeLoadAs(in + 4 * 1) >> 28 | SafeLoadAs(in + 4 * 2) << 4, + SafeLoadAs(in + 4 * 2) >> 26 | SafeLoadAs(in + 4 * 3) << 6, + SafeLoadAs(in + 4 * 3) >> 24 | SafeLoadAs(in + 4 * 4) << 8, + SafeLoadAs(in + 4 * 4) >> 22 | SafeLoadAs(in + 4 * 5) << 10, + SafeLoadAs(in + 4 * 5) >> 20 | SafeLoadAs(in + 4 * 6) << 12, + SafeLoadAs(in + 4 * 6) >> 18 | SafeLoadAs(in + 4 * 7) << 14, + SafeLoadAs(in + 4 * 7) >> 16 | SafeLoadAs(in + 4 * 8) << 16, + SafeLoadAs(in + 4 * 8) >> 14 | SafeLoadAs(in + 4 * 9) << 18, + SafeLoadAs(in + 4 * 9) >> 12 | SafeLoadAs(in + 4 * 10) << 20, + SafeLoadAs(in + 4 * 10) >> 10 | SafeLoadAs(in + 4 * 11) << 22, + SafeLoadAs(in + 4 * 11) >> 8 | SafeLoadAs(in + 4 * 12) << 24, + SafeLoadAs(in + 4 * 12) >> 6 | SafeLoadAs(in + 4 * 13) << 26, + SafeLoadAs(in + 4 * 13) >> 4 | SafeLoadAs(in + 4 * 14) << 28, + SafeLoadAs(in + 4 * 14), + }; shifts = simd_batch{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 16; // extract 30-bit bundles 16 to 31 - words = simd_batch{ SafeLoad(in + 15), SafeLoad(in + 15) >> 30 | SafeLoad(in + 16) << 2, SafeLoad(in + 16) >> 28 | SafeLoad(in + 17) << 4, SafeLoad(in + 17) >> 26 | SafeLoad(in + 18) << 6, SafeLoad(in + 18) >> 24 | SafeLoad(in + 19) << 8, SafeLoad(in + 19) >> 22 | SafeLoad(in + 20) << 10, SafeLoad(in + 20) >> 20 | SafeLoad(in + 21) << 12, SafeLoad(in + 21) >> 18 | SafeLoad(in + 22) << 14, SafeLoad(in + 22) >> 16 | SafeLoad(in + 23) << 16, SafeLoad(in + 23) >> 14 | SafeLoad(in + 24) << 18, SafeLoad(in + 24) >> 12 | SafeLoad(in + 25) << 20, SafeLoad(in + 25) >> 10 | SafeLoad(in + 26) << 22, SafeLoad(in + 26) >> 8 | SafeLoad(in + 27) << 24, SafeLoad(in + 27) >> 6 | SafeLoad(in + 28) << 26, SafeLoad(in + 28) >> 4 | SafeLoad(in + 29) << 28, SafeLoad(in + 29) }; + words = simd_batch{ + SafeLoadAs(in + 4 * 15), + SafeLoadAs(in + 4 * 15) >> 30 | SafeLoadAs(in + 4 * 16) << 2, + SafeLoadAs(in + 4 * 16) >> 28 | SafeLoadAs(in + 4 * 17) << 4, + SafeLoadAs(in + 4 * 17) >> 26 | SafeLoadAs(in + 4 * 18) << 6, + SafeLoadAs(in + 4 * 18) >> 24 | SafeLoadAs(in + 4 * 19) << 8, + SafeLoadAs(in + 4 * 19) >> 22 | SafeLoadAs(in + 4 * 20) << 10, + SafeLoadAs(in + 4 * 20) >> 20 | SafeLoadAs(in + 4 * 21) << 12, + SafeLoadAs(in + 4 * 21) >> 18 | SafeLoadAs(in + 4 * 22) << 14, + SafeLoadAs(in + 4 * 22) >> 16 | SafeLoadAs(in + 4 * 23) << 16, + SafeLoadAs(in + 4 * 23) >> 14 | SafeLoadAs(in + 4 * 24) << 18, + SafeLoadAs(in + 4 * 24) >> 12 | SafeLoadAs(in + 4 * 25) << 20, + SafeLoadAs(in + 4 * 25) >> 10 | SafeLoadAs(in + 4 * 26) << 22, + SafeLoadAs(in + 4 * 26) >> 8 | SafeLoadAs(in + 4 * 27) << 24, + SafeLoadAs(in + 4 * 27) >> 6 | SafeLoadAs(in + 4 * 28) << 26, + SafeLoadAs(in + 4 * 28) >> 4 | SafeLoadAs(in + 4 * 29) << 28, + SafeLoadAs(in + 4 * 29), + }; shifts = simd_batch{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 16; - in += 30; + in += 30 * 4; return in; } -inline static const uint32_t* unpack31_32(const uint32_t* in, uint32_t* out) { +inline static const uint8_t* unpack31_32(const uint8_t* in, uint32_t* out) { + using simd_batch = xsimd::make_sized_batch_t; + uint32_t mask = 0x7fffffff; simd_batch masks(mask); @@ -803,26 +2452,60 @@ inline static const uint32_t* unpack31_32(const uint32_t* in, uint32_t* out) { simd_batch results; // extract 31-bit bundles 0 to 15 - words = simd_batch{ SafeLoad(in + 0), SafeLoad(in + 0) >> 31 | SafeLoad(in + 1) << 1, SafeLoad(in + 1) >> 30 | SafeLoad(in + 2) << 2, SafeLoad(in + 2) >> 29 | SafeLoad(in + 3) << 3, SafeLoad(in + 3) >> 28 | SafeLoad(in + 4) << 4, SafeLoad(in + 4) >> 27 | SafeLoad(in + 5) << 5, SafeLoad(in + 5) >> 26 | SafeLoad(in + 6) << 6, SafeLoad(in + 6) >> 25 | SafeLoad(in + 7) << 7, SafeLoad(in + 7) >> 24 | SafeLoad(in + 8) << 8, SafeLoad(in + 8) >> 23 | SafeLoad(in + 9) << 9, SafeLoad(in + 9) >> 22 | SafeLoad(in + 10) << 10, SafeLoad(in + 10) >> 21 | SafeLoad(in + 11) << 11, SafeLoad(in + 11) >> 20 | SafeLoad(in + 12) << 12, SafeLoad(in + 12) >> 19 | SafeLoad(in + 13) << 13, SafeLoad(in + 13) >> 18 | SafeLoad(in + 14) << 14, SafeLoad(in + 14) >> 17 | SafeLoad(in + 15) << 15 }; + words = simd_batch{ + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0) >> 31 | SafeLoadAs(in + 4 * 1) << 1, + SafeLoadAs(in + 4 * 1) >> 30 | SafeLoadAs(in + 4 * 2) << 2, + SafeLoadAs(in + 4 * 2) >> 29 | SafeLoadAs(in + 4 * 3) << 3, + SafeLoadAs(in + 4 * 3) >> 28 | SafeLoadAs(in + 4 * 4) << 4, + SafeLoadAs(in + 4 * 4) >> 27 | SafeLoadAs(in + 4 * 5) << 5, + SafeLoadAs(in + 4 * 5) >> 26 | SafeLoadAs(in + 4 * 6) << 6, + SafeLoadAs(in + 4 * 6) >> 25 | SafeLoadAs(in + 4 * 7) << 7, + SafeLoadAs(in + 4 * 7) >> 24 | SafeLoadAs(in + 4 * 8) << 8, + SafeLoadAs(in + 4 * 8) >> 23 | SafeLoadAs(in + 4 * 9) << 9, + SafeLoadAs(in + 4 * 9) >> 22 | SafeLoadAs(in + 4 * 10) << 10, + SafeLoadAs(in + 4 * 10) >> 21 | SafeLoadAs(in + 4 * 11) << 11, + SafeLoadAs(in + 4 * 11) >> 20 | SafeLoadAs(in + 4 * 12) << 12, + SafeLoadAs(in + 4 * 12) >> 19 | SafeLoadAs(in + 4 * 13) << 13, + SafeLoadAs(in + 4 * 13) >> 18 | SafeLoadAs(in + 4 * 14) << 14, + SafeLoadAs(in + 4 * 14) >> 17 | SafeLoadAs(in + 4 * 15) << 15, + }; shifts = simd_batch{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 16; // extract 31-bit bundles 16 to 31 - words = simd_batch{ SafeLoad(in + 15) >> 16 | SafeLoad(in + 16) << 16, SafeLoad(in + 16) >> 15 | SafeLoad(in + 17) << 17, SafeLoad(in + 17) >> 14 | SafeLoad(in + 18) << 18, SafeLoad(in + 18) >> 13 | SafeLoad(in + 19) << 19, SafeLoad(in + 19) >> 12 | SafeLoad(in + 20) << 20, SafeLoad(in + 20) >> 11 | SafeLoad(in + 21) << 21, SafeLoad(in + 21) >> 10 | SafeLoad(in + 22) << 22, SafeLoad(in + 22) >> 9 | SafeLoad(in + 23) << 23, SafeLoad(in + 23) >> 8 | SafeLoad(in + 24) << 24, SafeLoad(in + 24) >> 7 | SafeLoad(in + 25) << 25, SafeLoad(in + 25) >> 6 | SafeLoad(in + 26) << 26, SafeLoad(in + 26) >> 5 | SafeLoad(in + 27) << 27, SafeLoad(in + 27) >> 4 | SafeLoad(in + 28) << 28, SafeLoad(in + 28) >> 3 | SafeLoad(in + 29) << 29, SafeLoad(in + 29) >> 2 | SafeLoad(in + 30) << 30, SafeLoad(in + 30) }; + words = simd_batch{ + SafeLoadAs(in + 4 * 15) >> 16 | SafeLoadAs(in + 4 * 16) << 16, + SafeLoadAs(in + 4 * 16) >> 15 | SafeLoadAs(in + 4 * 17) << 17, + SafeLoadAs(in + 4 * 17) >> 14 | SafeLoadAs(in + 4 * 18) << 18, + SafeLoadAs(in + 4 * 18) >> 13 | SafeLoadAs(in + 4 * 19) << 19, + SafeLoadAs(in + 4 * 19) >> 12 | SafeLoadAs(in + 4 * 20) << 20, + SafeLoadAs(in + 4 * 20) >> 11 | SafeLoadAs(in + 4 * 21) << 21, + SafeLoadAs(in + 4 * 21) >> 10 | SafeLoadAs(in + 4 * 22) << 22, + SafeLoadAs(in + 4 * 22) >> 9 | SafeLoadAs(in + 4 * 23) << 23, + SafeLoadAs(in + 4 * 23) >> 8 | SafeLoadAs(in + 4 * 24) << 24, + SafeLoadAs(in + 4 * 24) >> 7 | SafeLoadAs(in + 4 * 25) << 25, + SafeLoadAs(in + 4 * 25) >> 6 | SafeLoadAs(in + 4 * 26) << 26, + SafeLoadAs(in + 4 * 26) >> 5 | SafeLoadAs(in + 4 * 27) << 27, + SafeLoadAs(in + 4 * 27) >> 4 | SafeLoadAs(in + 4 * 28) << 28, + SafeLoadAs(in + 4 * 28) >> 3 | SafeLoadAs(in + 4 * 29) << 29, + SafeLoadAs(in + 4 * 29) >> 2 | SafeLoadAs(in + 4 * 30) << 30, + SafeLoadAs(in + 4 * 30), + }; shifts = simd_batch{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1 }; results = (words >> shifts) & masks; results.store_unaligned(out); out += 16; - in += 31; + in += 31 * 4; return in; } -inline static const uint32_t* unpack32_32(const uint32_t* in, uint32_t* out) { - memcpy(out, in, 32 * sizeof(*out)); - in += 32; +inline static const uint8_t* unpack32_32(const uint8_t* in, uint32_t* out) { + std::memcpy(out, in, 32 * sizeof(*out)); + in += 4 * 32; out += 32; return in; @@ -831,6 +2514,5 @@ inline static const uint32_t* unpack32_32(const uint32_t* in, uint32_t* out) { }; // struct UnpackBits512 } // namespace -} // namespace internal -} // namespace arrow +} // namespace arrow::internal From c44ad174c68952b0bca5f60a4afd1b512a172405 Mon Sep 17 00:00:00 2001 From: AntoinePrv Date: Tue, 16 Sep 2025 14:54:39 +0200 Subject: [PATCH 05/76] Exclude Python codegen from doxygen --- cpp/apidoc/Doxyfile | 1 + 1 file changed, 1 insertion(+) diff --git a/cpp/apidoc/Doxyfile b/cpp/apidoc/Doxyfile index de7777a6c1c..baf848ca237 100644 --- a/cpp/apidoc/Doxyfile +++ b/cpp/apidoc/Doxyfile @@ -1095,6 +1095,7 @@ EXCLUDE_PATTERNS = *-test.cc \ *test* \ *_generated.h \ *-benchmark.cc \ + *_codegen.py \ *internal* # The EXCLUDE_SYMBOLS tag can be used to specify one or more symbol names From eccd09ac02d2e670cf52e6bf2c1e13ef83ffce5f Mon Sep 17 00:00:00 2001 From: AntoinePrv Date: Tue, 16 Sep 2025 17:00:39 +0200 Subject: [PATCH 06/76] Make generic scalar unpacking codegen --- cpp/src/arrow/util/bpacking.cc | 6 +- cpp/src/arrow/util/bpacking64_codegen.py | 128 ----------- cpp/src/arrow/util/bpacking_scalar_codegen.py | 203 ++++++++++++++++++ 3 files changed, 205 insertions(+), 132 deletions(-) delete mode 100644 cpp/src/arrow/util/bpacking64_codegen.py create mode 100644 cpp/src/arrow/util/bpacking_scalar_codegen.py diff --git a/cpp/src/arrow/util/bpacking.cc b/cpp/src/arrow/util/bpacking.cc index 990f76875aa..dc780b38598 100644 --- a/cpp/src/arrow/util/bpacking.cc +++ b/cpp/src/arrow/util/bpacking.cc @@ -36,15 +36,13 @@ namespace arrow { namespace internal { -int unpack32_scalar(const uint8_t* in_, uint32_t* out, int batch_size, int num_bits) { - const uint32_t* in = reinterpret_cast(in_); - +int unpack32_scalar(const uint8_t* in, uint32_t* out, int batch_size, int num_bits) { batch_size = batch_size / 32 * 32; int num_loops = batch_size / 32; switch (num_bits) { case 0: - for (int i = 0; i < num_loops; ++i) in = nullunpacker32(in, out + i * 32); + for (int i = 0; i < num_loops; ++i) in = unpack0_32(in, out + i * 32); break; case 1: for (int i = 0; i < num_loops; ++i) in = unpack1_32(in, out + i * 32); diff --git a/cpp/src/arrow/util/bpacking64_codegen.py b/cpp/src/arrow/util/bpacking64_codegen.py deleted file mode 100644 index 22135fcbb23..00000000000 --- a/cpp/src/arrow/util/bpacking64_codegen.py +++ /dev/null @@ -1,128 +0,0 @@ -#!/bin/python - -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -# This script is modified from its original version in GitHub. Original source: -# https://github.com/lemire/FrameOfReference/blob/146948b6058a976bc7767262ad3a2ce201486b93/scripts/turbopacking64.py - -# Usage: -# python bpacking64_codegen.py > bpacking64_default_internal.h - -def howmany(bit): - """ how many values are we going to pack? """ - return 32 - - -def howmanywords(bit): - return (howmany(bit) * bit + 63)//64 - - -def howmanybytes(bit): - return (howmany(bit) * bit + 7)//8 - - -print('''// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -// This file was generated by script which is modified from its original version in GitHub. -// Original source: -// https://github.com/lemire/FrameOfReference/blob/master/scripts/turbopacking64.py -// The original copyright notice follows. - -// This code is released under the -// Apache License Version 2.0 http://www.apache.org/licenses/. -// (c) Daniel Lemire 2013 - -#pragma once - -#include "arrow/util/bit_util.h" -#include "arrow/util/ubsan.h" - -namespace arrow { -namespace internal { -''') - - -print("inline const uint8_t* unpack0_64(const uint8_t* in, uint64_t* out) {") -print(f" for(int k = 0; k < {howmany(0)} ; k += 1) {{") -print(" out[k] = 0;") -print(" }") -print(" return in;") -print("}") - -for bit in range(1, 65): - print("") - print(f"inline const uint8_t* unpack{bit}_64(const uint8_t* in, uint64_t* out) {{") - - if(bit < 64): - print(f" const uint64_t mask = {((1 << bit)-1)}ULL;") - maskstr = " & mask" - if (bit == 64): - maskstr = "" # no need - - for k in range(howmanywords(bit)-1): - print(f" uint64_t w{k} = util::SafeLoadAs(in);") - print(f" w{k} = arrow::BitUtil::FromLittleEndian(w{k});") - print(" in += 8;") - k = howmanywords(bit) - 1 - if (bit % 2 == 0): - print(f" uint64_t w{k} = util::SafeLoadAs(in);") - print(f" w{k} = arrow::BitUtil::FromLittleEndian(w{k});") - print(" in += 8;") - else: - print(f" uint64_t w{k} = util::SafeLoadAs(in);") - print(f" w{k} = arrow::BitUtil::FromLittleEndian(w{k});") - print(" in += 4;") - - for j in range(howmany(bit)): - firstword = j * bit // 64 - secondword = (j * bit + bit - 1)//64 - firstshift = (j*bit) % 64 - firstshiftstr = f" >> {firstshift}" - if(firstshift == 0): - firstshiftstr = "" # no need - if(firstword == secondword): - if(firstshift + bit == 64): - print(f" out[{j}] = w{firstword}{firstshiftstr};") - else: - print(f" out[{j}] = (w{firstword}{firstshiftstr}){maskstr};") - else: - secondshift = (64-firstshift) - print(f" out[{j}] = ((w{firstword}{firstshiftstr}) | " - f"(w{firstword+1} << {secondshift})){maskstr};") - print("") - print(" return in;") - print("}") - -print(''' -} // namespace internal -} // namespace arrow''') diff --git a/cpp/src/arrow/util/bpacking_scalar_codegen.py b/cpp/src/arrow/util/bpacking_scalar_codegen.py new file mode 100644 index 00000000000..0a373ab2748 --- /dev/null +++ b/cpp/src/arrow/util/bpacking_scalar_codegen.py @@ -0,0 +1,203 @@ +#!/bin/python + +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# This script is modified from its original version in GitHub. Original source: +# https://github.com/lemire/FrameOfReference/blob/146948b6058a976bc7767262ad3a2ce201486b93/scripts/turbopacking64.py + +# Usage: +# python bpacking64_codegen.py > bpacking64_default_internal.h + + +import dataclasses +import sys + + +LICENSE = """// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +// This file was generated by script which is modified from its original version in GitHub. +// Original source: +// https://github.com/lemire/FrameOfReference/blob/master/scripts/turbopacking64.py +// The original copyright notice follows. + +// This code is released under the +// Apache License Version 2.0 http://www.apache.org/licenses/. +// (c) Daniel Lemire 2013 +""" + +HEADER = """ +#pragma once + +#include +#include + +#include "arrow/util/endian.h" +#include "arrow/util/ubsan.h" + +namespace arrow::internal { +""" + +FOOTER = """ +} // namespace arrow::internal +""" + + +@dataclasses.dataclass +class ScalarUnpackGenerator: + out_bit_width: int + smart_halve: bool + + @property + def out_byte_width(self) -> int: + return self.out_bit_width // 8 + + @property + def unsigned_type(self) -> str: + return f"uint{self.out_bit_width}_t" + + @property + def unsigned_type_half(self) -> str: + return f"uint{self.out_bit_width // 2}_t" + + @property + def howmany(self) -> int: + """How many values are we going to pack?""" + if self.smart_halve: + return self.out_bit_width // 2 + return self.out_bit_width + + def howmanywords(self, bit: int) -> int: + return (self.howmany * bit + self.out_bit_width - 1) // self.out_bit_width + + def howmanybytes(self, bit: int) -> int: + return (self.howmany * bit + self.out_byte_width - 1) // self.out_byte_width + + def unpack_signature(self, bit: int) -> str: + return ( + f"inline const uint8_t* unpack{bit}_{self.out_bit_width}" + f"(const uint8_t* in, {self.unsigned_type}* out)" + "{" + ) + + def print_unpack_0(self) -> None: + print(self.unpack_signature(0)) + print(f" std::memset(out, 0, {self.howmany} * {self.out_byte_width});") + print(" return in;") + print("}") + + def print_unpack_last(self) -> None: + print(self.unpack_signature(self.out_bit_width)) + print(f" for(int k = 0; k < {self.howmany}; k += 1) {{") + print(f" auto w = util::SafeLoadAs<{self.unsigned_type}>(in);") + print(" out[k] = bit_util::FromLittleEndian(w);") + print(f" in += {self.out_byte_width};") + print(" }") + print(" return in;") + print("}") + + def print_unpack_k(self, bit: int) -> None: + print(self.unpack_signature(bit)) + print( + f" constexpr {self.unsigned_type} mask = " + f"(({self.unsigned_type}{{1}} << {bit}) - {self.unsigned_type}{{1}});" + ) + print("") + maskstr = " & mask" + + for k in range(self.howmanywords(bit) - 1): + print( + f" const auto w{k} = " + f"bit_util::FromLittleEndian(util::SafeLoadAs<{self.unsigned_type}>(in));" + ) + print(f" in += {self.out_byte_width};") + + k = self.howmanywords(bit) - 1 + if self.smart_halve and bit % 2 == 1: + print( + f" auto w{k} = static_cast<{self.unsigned_type}>(util::SafeLoadAs<{self.unsigned_type_half}>(in));" + ) + print(f" w{k} = bit_util::FromLittleEndian(w{k});") + print(f" in += {self.out_byte_width // 2};") + else: + print( + f" const auto w{k} = " + f"bit_util::FromLittleEndian(util::SafeLoadAs<{self.unsigned_type}>(in));" + ) + print(f" in += {self.out_byte_width};") + + for j in range(self.howmany): + firstword = j * bit // self.out_bit_width + secondword = (j * bit + bit - 1) // self.out_bit_width + firstshift = (j * bit) % self.out_bit_width + firstshiftstr = f" >> {firstshift}" + if firstshift == 0: + firstshiftstr = "" # no need + if firstword == secondword: + if firstshift + bit == self.out_bit_width: + print(f" out[{j}] = w{firstword}{firstshiftstr};") + else: + print(f" out[{j}] = (w{firstword}{firstshiftstr}){maskstr};") + else: + secondshift = self.out_bit_width - firstshift + print( + f" out[{j}] = ((w{firstword}{firstshiftstr}) | " + f"(w{firstword + 1} << {secondshift})){maskstr};" + ) + print("") + print(" return in;") + print("}") + + def print_all(self) -> None: + self.print_unpack_0() + print("") + + for bit in range(1, self.out_bit_width): + self.print_unpack_k(bit) + print("") + + self.print_unpack_last() + + +if __name__ == "__main__": + print(LICENSE) + print("// WARNING: this file is generated, DO NOT EDIT.") + print("// Usage:") + print(f"// python {' '.join(sys.orig_argv[1:])}") + print(HEADER) + + ScalarUnpackGenerator(32, smart_halve=False).print_all() + print("") + + ScalarUnpackGenerator(64, smart_halve=True).print_all() + + print(FOOTER) From 9a5c2545b77a657d00a65eccd02d55839a0a47e0 Mon Sep 17 00:00:00 2001 From: AntoinePrv Date: Tue, 16 Sep 2025 17:06:04 +0200 Subject: [PATCH 07/76] Gen: regenerate scalar unpack function in single file --- cpp/src/arrow/util/bpacking.cc | 3 +- .../arrow/util/bpacking64_default_internal.h | 5640 -------------- .../arrow/util/bpacking_default_internal.h | 4251 ---------- .../util/bpacking_scalar_generated_internal.h | 6808 +++++++++++++++++ 4 files changed, 6809 insertions(+), 9893 deletions(-) delete mode 100644 cpp/src/arrow/util/bpacking64_default_internal.h delete mode 100644 cpp/src/arrow/util/bpacking_default_internal.h create mode 100644 cpp/src/arrow/util/bpacking_scalar_generated_internal.h diff --git a/cpp/src/arrow/util/bpacking.cc b/cpp/src/arrow/util/bpacking.cc index dc780b38598..fefca194518 100644 --- a/cpp/src/arrow/util/bpacking.cc +++ b/cpp/src/arrow/util/bpacking.cc @@ -17,8 +17,7 @@ #include "arrow/util/bpacking_internal.h" -#include "arrow/util/bpacking64_default_internal.h" -#include "arrow/util/bpacking_default_internal.h" +#include "arrow/util/bpacking_scalar_generated_internal.h" #include "arrow/util/cpu_info.h" #include "arrow/util/dispatch_internal.h" #include "arrow/util/logging_internal.h" diff --git a/cpp/src/arrow/util/bpacking64_default_internal.h b/cpp/src/arrow/util/bpacking64_default_internal.h deleted file mode 100644 index 256cdda87e3..00000000000 --- a/cpp/src/arrow/util/bpacking64_default_internal.h +++ /dev/null @@ -1,5640 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -// This file was generated by script which is modified from its original version in -// GitHub. Original source: -// https://github.com/lemire/FrameOfReference/blob/146948b6058a976bc7767262ad3a2ce201486b93/scripts/turbopacking64.py -// The original copyright notice follows. - -// This code is released under the -// Apache License Version 2.0 http://www.apache.org/licenses/. -// (c) Daniel Lemire 2013 - -#pragma once - -#include "arrow/util/endian.h" -#include "arrow/util/ubsan.h" - -namespace arrow::internal { - -inline const uint8_t* unpack0_64(const uint8_t* in, uint64_t* out) { - for (int k = 0; k < 32; k += 1) { - out[k] = 0; - } - return in; -} - -inline const uint8_t* unpack1_64(const uint8_t* in, uint64_t* out) { - const uint64_t mask = 1ULL; - uint64_t w0 = util::SafeLoadAs(in); - w0 = arrow::bit_util::FromLittleEndian(w0); - in += 4; - out[0] = (w0)&mask; - out[1] = (w0 >> 1) & mask; - out[2] = (w0 >> 2) & mask; - out[3] = (w0 >> 3) & mask; - out[4] = (w0 >> 4) & mask; - out[5] = (w0 >> 5) & mask; - out[6] = (w0 >> 6) & mask; - out[7] = (w0 >> 7) & mask; - out[8] = (w0 >> 8) & mask; - out[9] = (w0 >> 9) & mask; - out[10] = (w0 >> 10) & mask; - out[11] = (w0 >> 11) & mask; - out[12] = (w0 >> 12) & mask; - out[13] = (w0 >> 13) & mask; - out[14] = (w0 >> 14) & mask; - out[15] = (w0 >> 15) & mask; - out[16] = (w0 >> 16) & mask; - out[17] = (w0 >> 17) & mask; - out[18] = (w0 >> 18) & mask; - out[19] = (w0 >> 19) & mask; - out[20] = (w0 >> 20) & mask; - out[21] = (w0 >> 21) & mask; - out[22] = (w0 >> 22) & mask; - out[23] = (w0 >> 23) & mask; - out[24] = (w0 >> 24) & mask; - out[25] = (w0 >> 25) & mask; - out[26] = (w0 >> 26) & mask; - out[27] = (w0 >> 27) & mask; - out[28] = (w0 >> 28) & mask; - out[29] = (w0 >> 29) & mask; - out[30] = (w0 >> 30) & mask; - out[31] = (w0 >> 31) & mask; - - return in; -} - -inline const uint8_t* unpack2_64(const uint8_t* in, uint64_t* out) { - const uint64_t mask = 3ULL; - uint64_t w0 = util::SafeLoadAs(in); - w0 = arrow::bit_util::FromLittleEndian(w0); - in += 8; - out[0] = (w0)&mask; - out[1] = (w0 >> 2) & mask; - out[2] = (w0 >> 4) & mask; - out[3] = (w0 >> 6) & mask; - out[4] = (w0 >> 8) & mask; - out[5] = (w0 >> 10) & mask; - out[6] = (w0 >> 12) & mask; - out[7] = (w0 >> 14) & mask; - out[8] = (w0 >> 16) & mask; - out[9] = (w0 >> 18) & mask; - out[10] = (w0 >> 20) & mask; - out[11] = (w0 >> 22) & mask; - out[12] = (w0 >> 24) & mask; - out[13] = (w0 >> 26) & mask; - out[14] = (w0 >> 28) & mask; - out[15] = (w0 >> 30) & mask; - out[16] = (w0 >> 32) & mask; - out[17] = (w0 >> 34) & mask; - out[18] = (w0 >> 36) & mask; - out[19] = (w0 >> 38) & mask; - out[20] = (w0 >> 40) & mask; - out[21] = (w0 >> 42) & mask; - out[22] = (w0 >> 44) & mask; - out[23] = (w0 >> 46) & mask; - out[24] = (w0 >> 48) & mask; - out[25] = (w0 >> 50) & mask; - out[26] = (w0 >> 52) & mask; - out[27] = (w0 >> 54) & mask; - out[28] = (w0 >> 56) & mask; - out[29] = (w0 >> 58) & mask; - out[30] = (w0 >> 60) & mask; - out[31] = w0 >> 62; - - return in; -} - -inline const uint8_t* unpack3_64(const uint8_t* in, uint64_t* out) { - const uint64_t mask = 7ULL; - uint64_t w0 = util::SafeLoadAs(in); - w0 = arrow::bit_util::FromLittleEndian(w0); - in += 8; - uint64_t w1 = util::SafeLoadAs(in); - w1 = arrow::bit_util::FromLittleEndian(w1); - in += 4; - out[0] = (w0)&mask; - out[1] = (w0 >> 3) & mask; - out[2] = (w0 >> 6) & mask; - out[3] = (w0 >> 9) & mask; - out[4] = (w0 >> 12) & mask; - out[5] = (w0 >> 15) & mask; - out[6] = (w0 >> 18) & mask; - out[7] = (w0 >> 21) & mask; - out[8] = (w0 >> 24) & mask; - out[9] = (w0 >> 27) & mask; - out[10] = (w0 >> 30) & mask; - out[11] = (w0 >> 33) & mask; - out[12] = (w0 >> 36) & mask; - out[13] = (w0 >> 39) & mask; - out[14] = (w0 >> 42) & mask; - out[15] = (w0 >> 45) & mask; - out[16] = (w0 >> 48) & mask; - out[17] = (w0 >> 51) & mask; - out[18] = (w0 >> 54) & mask; - out[19] = (w0 >> 57) & mask; - out[20] = (w0 >> 60) & mask; - out[21] = ((w0 >> 63) | (w1 << 1)) & mask; - out[22] = (w1 >> 2) & mask; - out[23] = (w1 >> 5) & mask; - out[24] = (w1 >> 8) & mask; - out[25] = (w1 >> 11) & mask; - out[26] = (w1 >> 14) & mask; - out[27] = (w1 >> 17) & mask; - out[28] = (w1 >> 20) & mask; - out[29] = (w1 >> 23) & mask; - out[30] = (w1 >> 26) & mask; - out[31] = (w1 >> 29) & mask; - - return in; -} - -inline const uint8_t* unpack4_64(const uint8_t* in, uint64_t* out) { - const uint64_t mask = 15ULL; - uint64_t w0 = util::SafeLoadAs(in); - w0 = arrow::bit_util::FromLittleEndian(w0); - in += 8; - uint64_t w1 = util::SafeLoadAs(in); - w1 = arrow::bit_util::FromLittleEndian(w1); - in += 8; - out[0] = (w0)&mask; - out[1] = (w0 >> 4) & mask; - out[2] = (w0 >> 8) & mask; - out[3] = (w0 >> 12) & mask; - out[4] = (w0 >> 16) & mask; - out[5] = (w0 >> 20) & mask; - out[6] = (w0 >> 24) & mask; - out[7] = (w0 >> 28) & mask; - out[8] = (w0 >> 32) & mask; - out[9] = (w0 >> 36) & mask; - out[10] = (w0 >> 40) & mask; - out[11] = (w0 >> 44) & mask; - out[12] = (w0 >> 48) & mask; - out[13] = (w0 >> 52) & mask; - out[14] = (w0 >> 56) & mask; - out[15] = w0 >> 60; - out[16] = (w1)&mask; - out[17] = (w1 >> 4) & mask; - out[18] = (w1 >> 8) & mask; - out[19] = (w1 >> 12) & mask; - out[20] = (w1 >> 16) & mask; - out[21] = (w1 >> 20) & mask; - out[22] = (w1 >> 24) & mask; - out[23] = (w1 >> 28) & mask; - out[24] = (w1 >> 32) & mask; - out[25] = (w1 >> 36) & mask; - out[26] = (w1 >> 40) & mask; - out[27] = (w1 >> 44) & mask; - out[28] = (w1 >> 48) & mask; - out[29] = (w1 >> 52) & mask; - out[30] = (w1 >> 56) & mask; - out[31] = w1 >> 60; - - return in; -} - -inline const uint8_t* unpack5_64(const uint8_t* in, uint64_t* out) { - const uint64_t mask = 31ULL; - uint64_t w0 = util::SafeLoadAs(in); - w0 = arrow::bit_util::FromLittleEndian(w0); - in += 8; - uint64_t w1 = util::SafeLoadAs(in); - w1 = arrow::bit_util::FromLittleEndian(w1); - in += 8; - uint64_t w2 = util::SafeLoadAs(in); - w2 = arrow::bit_util::FromLittleEndian(w2); - in += 4; - out[0] = (w0)&mask; - out[1] = (w0 >> 5) & mask; - out[2] = (w0 >> 10) & mask; - out[3] = (w0 >> 15) & mask; - out[4] = (w0 >> 20) & mask; - out[5] = (w0 >> 25) & mask; - out[6] = (w0 >> 30) & mask; - out[7] = (w0 >> 35) & mask; - out[8] = (w0 >> 40) & mask; - out[9] = (w0 >> 45) & mask; - out[10] = (w0 >> 50) & mask; - out[11] = (w0 >> 55) & mask; - out[12] = ((w0 >> 60) | (w1 << 4)) & mask; - out[13] = (w1 >> 1) & mask; - out[14] = (w1 >> 6) & mask; - out[15] = (w1 >> 11) & mask; - out[16] = (w1 >> 16) & mask; - out[17] = (w1 >> 21) & mask; - out[18] = (w1 >> 26) & mask; - out[19] = (w1 >> 31) & mask; - out[20] = (w1 >> 36) & mask; - out[21] = (w1 >> 41) & mask; - out[22] = (w1 >> 46) & mask; - out[23] = (w1 >> 51) & mask; - out[24] = (w1 >> 56) & mask; - out[25] = ((w1 >> 61) | (w2 << 3)) & mask; - out[26] = (w2 >> 2) & mask; - out[27] = (w2 >> 7) & mask; - out[28] = (w2 >> 12) & mask; - out[29] = (w2 >> 17) & mask; - out[30] = (w2 >> 22) & mask; - out[31] = (w2 >> 27) & mask; - - return in; -} - -inline const uint8_t* unpack6_64(const uint8_t* in, uint64_t* out) { - const uint64_t mask = 63ULL; - uint64_t w0 = util::SafeLoadAs(in); - w0 = arrow::bit_util::FromLittleEndian(w0); - in += 8; - uint64_t w1 = util::SafeLoadAs(in); - w1 = arrow::bit_util::FromLittleEndian(w1); - in += 8; - uint64_t w2 = util::SafeLoadAs(in); - w2 = arrow::bit_util::FromLittleEndian(w2); - in += 8; - out[0] = (w0)&mask; - out[1] = (w0 >> 6) & mask; - out[2] = (w0 >> 12) & mask; - out[3] = (w0 >> 18) & mask; - out[4] = (w0 >> 24) & mask; - out[5] = (w0 >> 30) & mask; - out[6] = (w0 >> 36) & mask; - out[7] = (w0 >> 42) & mask; - out[8] = (w0 >> 48) & mask; - out[9] = (w0 >> 54) & mask; - out[10] = ((w0 >> 60) | (w1 << 4)) & mask; - out[11] = (w1 >> 2) & mask; - out[12] = (w1 >> 8) & mask; - out[13] = (w1 >> 14) & mask; - out[14] = (w1 >> 20) & mask; - out[15] = (w1 >> 26) & mask; - out[16] = (w1 >> 32) & mask; - out[17] = (w1 >> 38) & mask; - out[18] = (w1 >> 44) & mask; - out[19] = (w1 >> 50) & mask; - out[20] = (w1 >> 56) & mask; - out[21] = ((w1 >> 62) | (w2 << 2)) & mask; - out[22] = (w2 >> 4) & mask; - out[23] = (w2 >> 10) & mask; - out[24] = (w2 >> 16) & mask; - out[25] = (w2 >> 22) & mask; - out[26] = (w2 >> 28) & mask; - out[27] = (w2 >> 34) & mask; - out[28] = (w2 >> 40) & mask; - out[29] = (w2 >> 46) & mask; - out[30] = (w2 >> 52) & mask; - out[31] = w2 >> 58; - - return in; -} - -inline const uint8_t* unpack7_64(const uint8_t* in, uint64_t* out) { - const uint64_t mask = 127ULL; - uint64_t w0 = util::SafeLoadAs(in); - w0 = arrow::bit_util::FromLittleEndian(w0); - in += 8; - uint64_t w1 = util::SafeLoadAs(in); - w1 = arrow::bit_util::FromLittleEndian(w1); - in += 8; - uint64_t w2 = util::SafeLoadAs(in); - w2 = arrow::bit_util::FromLittleEndian(w2); - in += 8; - uint64_t w3 = util::SafeLoadAs(in); - w3 = arrow::bit_util::FromLittleEndian(w3); - in += 4; - out[0] = (w0)&mask; - out[1] = (w0 >> 7) & mask; - out[2] = (w0 >> 14) & mask; - out[3] = (w0 >> 21) & mask; - out[4] = (w0 >> 28) & mask; - out[5] = (w0 >> 35) & mask; - out[6] = (w0 >> 42) & mask; - out[7] = (w0 >> 49) & mask; - out[8] = (w0 >> 56) & mask; - out[9] = ((w0 >> 63) | (w1 << 1)) & mask; - out[10] = (w1 >> 6) & mask; - out[11] = (w1 >> 13) & mask; - out[12] = (w1 >> 20) & mask; - out[13] = (w1 >> 27) & mask; - out[14] = (w1 >> 34) & mask; - out[15] = (w1 >> 41) & mask; - out[16] = (w1 >> 48) & mask; - out[17] = (w1 >> 55) & mask; - out[18] = ((w1 >> 62) | (w2 << 2)) & mask; - out[19] = (w2 >> 5) & mask; - out[20] = (w2 >> 12) & mask; - out[21] = (w2 >> 19) & mask; - out[22] = (w2 >> 26) & mask; - out[23] = (w2 >> 33) & mask; - out[24] = (w2 >> 40) & mask; - out[25] = (w2 >> 47) & mask; - out[26] = (w2 >> 54) & mask; - out[27] = ((w2 >> 61) | (w3 << 3)) & mask; - out[28] = (w3 >> 4) & mask; - out[29] = (w3 >> 11) & mask; - out[30] = (w3 >> 18) & mask; - out[31] = (w3 >> 25) & mask; - - return in; -} - -inline const uint8_t* unpack8_64(const uint8_t* in, uint64_t* out) { - const uint64_t mask = 255ULL; - uint64_t w0 = util::SafeLoadAs(in); - w0 = arrow::bit_util::FromLittleEndian(w0); - in += 8; - uint64_t w1 = util::SafeLoadAs(in); - w1 = arrow::bit_util::FromLittleEndian(w1); - in += 8; - uint64_t w2 = util::SafeLoadAs(in); - w2 = arrow::bit_util::FromLittleEndian(w2); - in += 8; - uint64_t w3 = util::SafeLoadAs(in); - w3 = arrow::bit_util::FromLittleEndian(w3); - in += 8; - out[0] = (w0)&mask; - out[1] = (w0 >> 8) & mask; - out[2] = (w0 >> 16) & mask; - out[3] = (w0 >> 24) & mask; - out[4] = (w0 >> 32) & mask; - out[5] = (w0 >> 40) & mask; - out[6] = (w0 >> 48) & mask; - out[7] = w0 >> 56; - out[8] = (w1)&mask; - out[9] = (w1 >> 8) & mask; - out[10] = (w1 >> 16) & mask; - out[11] = (w1 >> 24) & mask; - out[12] = (w1 >> 32) & mask; - out[13] = (w1 >> 40) & mask; - out[14] = (w1 >> 48) & mask; - out[15] = w1 >> 56; - out[16] = (w2)&mask; - out[17] = (w2 >> 8) & mask; - out[18] = (w2 >> 16) & mask; - out[19] = (w2 >> 24) & mask; - out[20] = (w2 >> 32) & mask; - out[21] = (w2 >> 40) & mask; - out[22] = (w2 >> 48) & mask; - out[23] = w2 >> 56; - out[24] = (w3)&mask; - out[25] = (w3 >> 8) & mask; - out[26] = (w3 >> 16) & mask; - out[27] = (w3 >> 24) & mask; - out[28] = (w3 >> 32) & mask; - out[29] = (w3 >> 40) & mask; - out[30] = (w3 >> 48) & mask; - out[31] = w3 >> 56; - - return in; -} - -inline const uint8_t* unpack9_64(const uint8_t* in, uint64_t* out) { - const uint64_t mask = 511ULL; - uint64_t w0 = util::SafeLoadAs(in); - w0 = arrow::bit_util::FromLittleEndian(w0); - in += 8; - uint64_t w1 = util::SafeLoadAs(in); - w1 = arrow::bit_util::FromLittleEndian(w1); - in += 8; - uint64_t w2 = util::SafeLoadAs(in); - w2 = arrow::bit_util::FromLittleEndian(w2); - in += 8; - uint64_t w3 = util::SafeLoadAs(in); - w3 = arrow::bit_util::FromLittleEndian(w3); - in += 8; - uint64_t w4 = util::SafeLoadAs(in); - w4 = arrow::bit_util::FromLittleEndian(w4); - in += 4; - out[0] = (w0)&mask; - out[1] = (w0 >> 9) & mask; - out[2] = (w0 >> 18) & mask; - out[3] = (w0 >> 27) & mask; - out[4] = (w0 >> 36) & mask; - out[5] = (w0 >> 45) & mask; - out[6] = (w0 >> 54) & mask; - out[7] = ((w0 >> 63) | (w1 << 1)) & mask; - out[8] = (w1 >> 8) & mask; - out[9] = (w1 >> 17) & mask; - out[10] = (w1 >> 26) & mask; - out[11] = (w1 >> 35) & mask; - out[12] = (w1 >> 44) & mask; - out[13] = (w1 >> 53) & mask; - out[14] = ((w1 >> 62) | (w2 << 2)) & mask; - out[15] = (w2 >> 7) & mask; - out[16] = (w2 >> 16) & mask; - out[17] = (w2 >> 25) & mask; - out[18] = (w2 >> 34) & mask; - out[19] = (w2 >> 43) & mask; - out[20] = (w2 >> 52) & mask; - out[21] = ((w2 >> 61) | (w3 << 3)) & mask; - out[22] = (w3 >> 6) & mask; - out[23] = (w3 >> 15) & mask; - out[24] = (w3 >> 24) & mask; - out[25] = (w3 >> 33) & mask; - out[26] = (w3 >> 42) & mask; - out[27] = (w3 >> 51) & mask; - out[28] = ((w3 >> 60) | (w4 << 4)) & mask; - out[29] = (w4 >> 5) & mask; - out[30] = (w4 >> 14) & mask; - out[31] = (w4 >> 23) & mask; - - return in; -} - -inline const uint8_t* unpack10_64(const uint8_t* in, uint64_t* out) { - const uint64_t mask = 1023ULL; - uint64_t w0 = util::SafeLoadAs(in); - w0 = arrow::bit_util::FromLittleEndian(w0); - in += 8; - uint64_t w1 = util::SafeLoadAs(in); - w1 = arrow::bit_util::FromLittleEndian(w1); - in += 8; - uint64_t w2 = util::SafeLoadAs(in); - w2 = arrow::bit_util::FromLittleEndian(w2); - in += 8; - uint64_t w3 = util::SafeLoadAs(in); - w3 = arrow::bit_util::FromLittleEndian(w3); - in += 8; - uint64_t w4 = util::SafeLoadAs(in); - w4 = arrow::bit_util::FromLittleEndian(w4); - in += 8; - out[0] = (w0)&mask; - out[1] = (w0 >> 10) & mask; - out[2] = (w0 >> 20) & mask; - out[3] = (w0 >> 30) & mask; - out[4] = (w0 >> 40) & mask; - out[5] = (w0 >> 50) & mask; - out[6] = ((w0 >> 60) | (w1 << 4)) & mask; - out[7] = (w1 >> 6) & mask; - out[8] = (w1 >> 16) & mask; - out[9] = (w1 >> 26) & mask; - out[10] = (w1 >> 36) & mask; - out[11] = (w1 >> 46) & mask; - out[12] = ((w1 >> 56) | (w2 << 8)) & mask; - out[13] = (w2 >> 2) & mask; - out[14] = (w2 >> 12) & mask; - out[15] = (w2 >> 22) & mask; - out[16] = (w2 >> 32) & mask; - out[17] = (w2 >> 42) & mask; - out[18] = (w2 >> 52) & mask; - out[19] = ((w2 >> 62) | (w3 << 2)) & mask; - out[20] = (w3 >> 8) & mask; - out[21] = (w3 >> 18) & mask; - out[22] = (w3 >> 28) & mask; - out[23] = (w3 >> 38) & mask; - out[24] = (w3 >> 48) & mask; - out[25] = ((w3 >> 58) | (w4 << 6)) & mask; - out[26] = (w4 >> 4) & mask; - out[27] = (w4 >> 14) & mask; - out[28] = (w4 >> 24) & mask; - out[29] = (w4 >> 34) & mask; - out[30] = (w4 >> 44) & mask; - out[31] = w4 >> 54; - - return in; -} - -inline const uint8_t* unpack11_64(const uint8_t* in, uint64_t* out) { - const uint64_t mask = 2047ULL; - uint64_t w0 = util::SafeLoadAs(in); - w0 = arrow::bit_util::FromLittleEndian(w0); - in += 8; - uint64_t w1 = util::SafeLoadAs(in); - w1 = arrow::bit_util::FromLittleEndian(w1); - in += 8; - uint64_t w2 = util::SafeLoadAs(in); - w2 = arrow::bit_util::FromLittleEndian(w2); - in += 8; - uint64_t w3 = util::SafeLoadAs(in); - w3 = arrow::bit_util::FromLittleEndian(w3); - in += 8; - uint64_t w4 = util::SafeLoadAs(in); - w4 = arrow::bit_util::FromLittleEndian(w4); - in += 8; - uint64_t w5 = util::SafeLoadAs(in); - w5 = arrow::bit_util::FromLittleEndian(w5); - in += 4; - out[0] = (w0)&mask; - out[1] = (w0 >> 11) & mask; - out[2] = (w0 >> 22) & mask; - out[3] = (w0 >> 33) & mask; - out[4] = (w0 >> 44) & mask; - out[5] = ((w0 >> 55) | (w1 << 9)) & mask; - out[6] = (w1 >> 2) & mask; - out[7] = (w1 >> 13) & mask; - out[8] = (w1 >> 24) & mask; - out[9] = (w1 >> 35) & mask; - out[10] = (w1 >> 46) & mask; - out[11] = ((w1 >> 57) | (w2 << 7)) & mask; - out[12] = (w2 >> 4) & mask; - out[13] = (w2 >> 15) & mask; - out[14] = (w2 >> 26) & mask; - out[15] = (w2 >> 37) & mask; - out[16] = (w2 >> 48) & mask; - out[17] = ((w2 >> 59) | (w3 << 5)) & mask; - out[18] = (w3 >> 6) & mask; - out[19] = (w3 >> 17) & mask; - out[20] = (w3 >> 28) & mask; - out[21] = (w3 >> 39) & mask; - out[22] = (w3 >> 50) & mask; - out[23] = ((w3 >> 61) | (w4 << 3)) & mask; - out[24] = (w4 >> 8) & mask; - out[25] = (w4 >> 19) & mask; - out[26] = (w4 >> 30) & mask; - out[27] = (w4 >> 41) & mask; - out[28] = (w4 >> 52) & mask; - out[29] = ((w4 >> 63) | (w5 << 1)) & mask; - out[30] = (w5 >> 10) & mask; - out[31] = (w5 >> 21) & mask; - - return in; -} - -inline const uint8_t* unpack12_64(const uint8_t* in, uint64_t* out) { - const uint64_t mask = 4095ULL; - uint64_t w0 = util::SafeLoadAs(in); - w0 = arrow::bit_util::FromLittleEndian(w0); - in += 8; - uint64_t w1 = util::SafeLoadAs(in); - w1 = arrow::bit_util::FromLittleEndian(w1); - in += 8; - uint64_t w2 = util::SafeLoadAs(in); - w2 = arrow::bit_util::FromLittleEndian(w2); - in += 8; - uint64_t w3 = util::SafeLoadAs(in); - w3 = arrow::bit_util::FromLittleEndian(w3); - in += 8; - uint64_t w4 = util::SafeLoadAs(in); - w4 = arrow::bit_util::FromLittleEndian(w4); - in += 8; - uint64_t w5 = util::SafeLoadAs(in); - w5 = arrow::bit_util::FromLittleEndian(w5); - in += 8; - out[0] = (w0)&mask; - out[1] = (w0 >> 12) & mask; - out[2] = (w0 >> 24) & mask; - out[3] = (w0 >> 36) & mask; - out[4] = (w0 >> 48) & mask; - out[5] = ((w0 >> 60) | (w1 << 4)) & mask; - out[6] = (w1 >> 8) & mask; - out[7] = (w1 >> 20) & mask; - out[8] = (w1 >> 32) & mask; - out[9] = (w1 >> 44) & mask; - out[10] = ((w1 >> 56) | (w2 << 8)) & mask; - out[11] = (w2 >> 4) & mask; - out[12] = (w2 >> 16) & mask; - out[13] = (w2 >> 28) & mask; - out[14] = (w2 >> 40) & mask; - out[15] = w2 >> 52; - out[16] = (w3)&mask; - out[17] = (w3 >> 12) & mask; - out[18] = (w3 >> 24) & mask; - out[19] = (w3 >> 36) & mask; - out[20] = (w3 >> 48) & mask; - out[21] = ((w3 >> 60) | (w4 << 4)) & mask; - out[22] = (w4 >> 8) & mask; - out[23] = (w4 >> 20) & mask; - out[24] = (w4 >> 32) & mask; - out[25] = (w4 >> 44) & mask; - out[26] = ((w4 >> 56) | (w5 << 8)) & mask; - out[27] = (w5 >> 4) & mask; - out[28] = (w5 >> 16) & mask; - out[29] = (w5 >> 28) & mask; - out[30] = (w5 >> 40) & mask; - out[31] = w5 >> 52; - - return in; -} - -inline const uint8_t* unpack13_64(const uint8_t* in, uint64_t* out) { - const uint64_t mask = 8191ULL; - uint64_t w0 = util::SafeLoadAs(in); - w0 = arrow::bit_util::FromLittleEndian(w0); - in += 8; - uint64_t w1 = util::SafeLoadAs(in); - w1 = arrow::bit_util::FromLittleEndian(w1); - in += 8; - uint64_t w2 = util::SafeLoadAs(in); - w2 = arrow::bit_util::FromLittleEndian(w2); - in += 8; - uint64_t w3 = util::SafeLoadAs(in); - w3 = arrow::bit_util::FromLittleEndian(w3); - in += 8; - uint64_t w4 = util::SafeLoadAs(in); - w4 = arrow::bit_util::FromLittleEndian(w4); - in += 8; - uint64_t w5 = util::SafeLoadAs(in); - w5 = arrow::bit_util::FromLittleEndian(w5); - in += 8; - uint64_t w6 = util::SafeLoadAs(in); - w6 = arrow::bit_util::FromLittleEndian(w6); - in += 4; - out[0] = (w0)&mask; - out[1] = (w0 >> 13) & mask; - out[2] = (w0 >> 26) & mask; - out[3] = (w0 >> 39) & mask; - out[4] = ((w0 >> 52) | (w1 << 12)) & mask; - out[5] = (w1 >> 1) & mask; - out[6] = (w1 >> 14) & mask; - out[7] = (w1 >> 27) & mask; - out[8] = (w1 >> 40) & mask; - out[9] = ((w1 >> 53) | (w2 << 11)) & mask; - out[10] = (w2 >> 2) & mask; - out[11] = (w2 >> 15) & mask; - out[12] = (w2 >> 28) & mask; - out[13] = (w2 >> 41) & mask; - out[14] = ((w2 >> 54) | (w3 << 10)) & mask; - out[15] = (w3 >> 3) & mask; - out[16] = (w3 >> 16) & mask; - out[17] = (w3 >> 29) & mask; - out[18] = (w3 >> 42) & mask; - out[19] = ((w3 >> 55) | (w4 << 9)) & mask; - out[20] = (w4 >> 4) & mask; - out[21] = (w4 >> 17) & mask; - out[22] = (w4 >> 30) & mask; - out[23] = (w4 >> 43) & mask; - out[24] = ((w4 >> 56) | (w5 << 8)) & mask; - out[25] = (w5 >> 5) & mask; - out[26] = (w5 >> 18) & mask; - out[27] = (w5 >> 31) & mask; - out[28] = (w5 >> 44) & mask; - out[29] = ((w5 >> 57) | (w6 << 7)) & mask; - out[30] = (w6 >> 6) & mask; - out[31] = (w6 >> 19) & mask; - - return in; -} - -inline const uint8_t* unpack14_64(const uint8_t* in, uint64_t* out) { - const uint64_t mask = 16383ULL; - uint64_t w0 = util::SafeLoadAs(in); - w0 = arrow::bit_util::FromLittleEndian(w0); - in += 8; - uint64_t w1 = util::SafeLoadAs(in); - w1 = arrow::bit_util::FromLittleEndian(w1); - in += 8; - uint64_t w2 = util::SafeLoadAs(in); - w2 = arrow::bit_util::FromLittleEndian(w2); - in += 8; - uint64_t w3 = util::SafeLoadAs(in); - w3 = arrow::bit_util::FromLittleEndian(w3); - in += 8; - uint64_t w4 = util::SafeLoadAs(in); - w4 = arrow::bit_util::FromLittleEndian(w4); - in += 8; - uint64_t w5 = util::SafeLoadAs(in); - w5 = arrow::bit_util::FromLittleEndian(w5); - in += 8; - uint64_t w6 = util::SafeLoadAs(in); - w6 = arrow::bit_util::FromLittleEndian(w6); - in += 8; - out[0] = (w0)&mask; - out[1] = (w0 >> 14) & mask; - out[2] = (w0 >> 28) & mask; - out[3] = (w0 >> 42) & mask; - out[4] = ((w0 >> 56) | (w1 << 8)) & mask; - out[5] = (w1 >> 6) & mask; - out[6] = (w1 >> 20) & mask; - out[7] = (w1 >> 34) & mask; - out[8] = (w1 >> 48) & mask; - out[9] = ((w1 >> 62) | (w2 << 2)) & mask; - out[10] = (w2 >> 12) & mask; - out[11] = (w2 >> 26) & mask; - out[12] = (w2 >> 40) & mask; - out[13] = ((w2 >> 54) | (w3 << 10)) & mask; - out[14] = (w3 >> 4) & mask; - out[15] = (w3 >> 18) & mask; - out[16] = (w3 >> 32) & mask; - out[17] = (w3 >> 46) & mask; - out[18] = ((w3 >> 60) | (w4 << 4)) & mask; - out[19] = (w4 >> 10) & mask; - out[20] = (w4 >> 24) & mask; - out[21] = (w4 >> 38) & mask; - out[22] = ((w4 >> 52) | (w5 << 12)) & mask; - out[23] = (w5 >> 2) & mask; - out[24] = (w5 >> 16) & mask; - out[25] = (w5 >> 30) & mask; - out[26] = (w5 >> 44) & mask; - out[27] = ((w5 >> 58) | (w6 << 6)) & mask; - out[28] = (w6 >> 8) & mask; - out[29] = (w6 >> 22) & mask; - out[30] = (w6 >> 36) & mask; - out[31] = w6 >> 50; - - return in; -} - -inline const uint8_t* unpack15_64(const uint8_t* in, uint64_t* out) { - const uint64_t mask = 32767ULL; - uint64_t w0 = util::SafeLoadAs(in); - w0 = arrow::bit_util::FromLittleEndian(w0); - in += 8; - uint64_t w1 = util::SafeLoadAs(in); - w1 = arrow::bit_util::FromLittleEndian(w1); - in += 8; - uint64_t w2 = util::SafeLoadAs(in); - w2 = arrow::bit_util::FromLittleEndian(w2); - in += 8; - uint64_t w3 = util::SafeLoadAs(in); - w3 = arrow::bit_util::FromLittleEndian(w3); - in += 8; - uint64_t w4 = util::SafeLoadAs(in); - w4 = arrow::bit_util::FromLittleEndian(w4); - in += 8; - uint64_t w5 = util::SafeLoadAs(in); - w5 = arrow::bit_util::FromLittleEndian(w5); - in += 8; - uint64_t w6 = util::SafeLoadAs(in); - w6 = arrow::bit_util::FromLittleEndian(w6); - in += 8; - uint64_t w7 = util::SafeLoadAs(in); - w7 = arrow::bit_util::FromLittleEndian(w7); - in += 4; - out[0] = (w0)&mask; - out[1] = (w0 >> 15) & mask; - out[2] = (w0 >> 30) & mask; - out[3] = (w0 >> 45) & mask; - out[4] = ((w0 >> 60) | (w1 << 4)) & mask; - out[5] = (w1 >> 11) & mask; - out[6] = (w1 >> 26) & mask; - out[7] = (w1 >> 41) & mask; - out[8] = ((w1 >> 56) | (w2 << 8)) & mask; - out[9] = (w2 >> 7) & mask; - out[10] = (w2 >> 22) & mask; - out[11] = (w2 >> 37) & mask; - out[12] = ((w2 >> 52) | (w3 << 12)) & mask; - out[13] = (w3 >> 3) & mask; - out[14] = (w3 >> 18) & mask; - out[15] = (w3 >> 33) & mask; - out[16] = (w3 >> 48) & mask; - out[17] = ((w3 >> 63) | (w4 << 1)) & mask; - out[18] = (w4 >> 14) & mask; - out[19] = (w4 >> 29) & mask; - out[20] = (w4 >> 44) & mask; - out[21] = ((w4 >> 59) | (w5 << 5)) & mask; - out[22] = (w5 >> 10) & mask; - out[23] = (w5 >> 25) & mask; - out[24] = (w5 >> 40) & mask; - out[25] = ((w5 >> 55) | (w6 << 9)) & mask; - out[26] = (w6 >> 6) & mask; - out[27] = (w6 >> 21) & mask; - out[28] = (w6 >> 36) & mask; - out[29] = ((w6 >> 51) | (w7 << 13)) & mask; - out[30] = (w7 >> 2) & mask; - out[31] = (w7 >> 17) & mask; - - return in; -} - -inline const uint8_t* unpack16_64(const uint8_t* in, uint64_t* out) { - const uint64_t mask = 65535ULL; - uint64_t w0 = util::SafeLoadAs(in); - w0 = arrow::bit_util::FromLittleEndian(w0); - in += 8; - uint64_t w1 = util::SafeLoadAs(in); - w1 = arrow::bit_util::FromLittleEndian(w1); - in += 8; - uint64_t w2 = util::SafeLoadAs(in); - w2 = arrow::bit_util::FromLittleEndian(w2); - in += 8; - uint64_t w3 = util::SafeLoadAs(in); - w3 = arrow::bit_util::FromLittleEndian(w3); - in += 8; - uint64_t w4 = util::SafeLoadAs(in); - w4 = arrow::bit_util::FromLittleEndian(w4); - in += 8; - uint64_t w5 = util::SafeLoadAs(in); - w5 = arrow::bit_util::FromLittleEndian(w5); - in += 8; - uint64_t w6 = util::SafeLoadAs(in); - w6 = arrow::bit_util::FromLittleEndian(w6); - in += 8; - uint64_t w7 = util::SafeLoadAs(in); - w7 = arrow::bit_util::FromLittleEndian(w7); - in += 8; - out[0] = (w0)&mask; - out[1] = (w0 >> 16) & mask; - out[2] = (w0 >> 32) & mask; - out[3] = w0 >> 48; - out[4] = (w1)&mask; - out[5] = (w1 >> 16) & mask; - out[6] = (w1 >> 32) & mask; - out[7] = w1 >> 48; - out[8] = (w2)&mask; - out[9] = (w2 >> 16) & mask; - out[10] = (w2 >> 32) & mask; - out[11] = w2 >> 48; - out[12] = (w3)&mask; - out[13] = (w3 >> 16) & mask; - out[14] = (w3 >> 32) & mask; - out[15] = w3 >> 48; - out[16] = (w4)&mask; - out[17] = (w4 >> 16) & mask; - out[18] = (w4 >> 32) & mask; - out[19] = w4 >> 48; - out[20] = (w5)&mask; - out[21] = (w5 >> 16) & mask; - out[22] = (w5 >> 32) & mask; - out[23] = w5 >> 48; - out[24] = (w6)&mask; - out[25] = (w6 >> 16) & mask; - out[26] = (w6 >> 32) & mask; - out[27] = w6 >> 48; - out[28] = (w7)&mask; - out[29] = (w7 >> 16) & mask; - out[30] = (w7 >> 32) & mask; - out[31] = w7 >> 48; - - return in; -} - -inline const uint8_t* unpack17_64(const uint8_t* in, uint64_t* out) { - const uint64_t mask = 131071ULL; - uint64_t w0 = util::SafeLoadAs(in); - w0 = arrow::bit_util::FromLittleEndian(w0); - in += 8; - uint64_t w1 = util::SafeLoadAs(in); - w1 = arrow::bit_util::FromLittleEndian(w1); - in += 8; - uint64_t w2 = util::SafeLoadAs(in); - w2 = arrow::bit_util::FromLittleEndian(w2); - in += 8; - uint64_t w3 = util::SafeLoadAs(in); - w3 = arrow::bit_util::FromLittleEndian(w3); - in += 8; - uint64_t w4 = util::SafeLoadAs(in); - w4 = arrow::bit_util::FromLittleEndian(w4); - in += 8; - uint64_t w5 = util::SafeLoadAs(in); - w5 = arrow::bit_util::FromLittleEndian(w5); - in += 8; - uint64_t w6 = util::SafeLoadAs(in); - w6 = arrow::bit_util::FromLittleEndian(w6); - in += 8; - uint64_t w7 = util::SafeLoadAs(in); - w7 = arrow::bit_util::FromLittleEndian(w7); - in += 8; - uint64_t w8 = util::SafeLoadAs(in); - w8 = arrow::bit_util::FromLittleEndian(w8); - in += 4; - out[0] = (w0)&mask; - out[1] = (w0 >> 17) & mask; - out[2] = (w0 >> 34) & mask; - out[3] = ((w0 >> 51) | (w1 << 13)) & mask; - out[4] = (w1 >> 4) & mask; - out[5] = (w1 >> 21) & mask; - out[6] = (w1 >> 38) & mask; - out[7] = ((w1 >> 55) | (w2 << 9)) & mask; - out[8] = (w2 >> 8) & mask; - out[9] = (w2 >> 25) & mask; - out[10] = (w2 >> 42) & mask; - out[11] = ((w2 >> 59) | (w3 << 5)) & mask; - out[12] = (w3 >> 12) & mask; - out[13] = (w3 >> 29) & mask; - out[14] = (w3 >> 46) & mask; - out[15] = ((w3 >> 63) | (w4 << 1)) & mask; - out[16] = (w4 >> 16) & mask; - out[17] = (w4 >> 33) & mask; - out[18] = ((w4 >> 50) | (w5 << 14)) & mask; - out[19] = (w5 >> 3) & mask; - out[20] = (w5 >> 20) & mask; - out[21] = (w5 >> 37) & mask; - out[22] = ((w5 >> 54) | (w6 << 10)) & mask; - out[23] = (w6 >> 7) & mask; - out[24] = (w6 >> 24) & mask; - out[25] = (w6 >> 41) & mask; - out[26] = ((w6 >> 58) | (w7 << 6)) & mask; - out[27] = (w7 >> 11) & mask; - out[28] = (w7 >> 28) & mask; - out[29] = (w7 >> 45) & mask; - out[30] = ((w7 >> 62) | (w8 << 2)) & mask; - out[31] = (w8 >> 15) & mask; - - return in; -} - -inline const uint8_t* unpack18_64(const uint8_t* in, uint64_t* out) { - const uint64_t mask = 262143ULL; - uint64_t w0 = util::SafeLoadAs(in); - w0 = arrow::bit_util::FromLittleEndian(w0); - in += 8; - uint64_t w1 = util::SafeLoadAs(in); - w1 = arrow::bit_util::FromLittleEndian(w1); - in += 8; - uint64_t w2 = util::SafeLoadAs(in); - w2 = arrow::bit_util::FromLittleEndian(w2); - in += 8; - uint64_t w3 = util::SafeLoadAs(in); - w3 = arrow::bit_util::FromLittleEndian(w3); - in += 8; - uint64_t w4 = util::SafeLoadAs(in); - w4 = arrow::bit_util::FromLittleEndian(w4); - in += 8; - uint64_t w5 = util::SafeLoadAs(in); - w5 = arrow::bit_util::FromLittleEndian(w5); - in += 8; - uint64_t w6 = util::SafeLoadAs(in); - w6 = arrow::bit_util::FromLittleEndian(w6); - in += 8; - uint64_t w7 = util::SafeLoadAs(in); - w7 = arrow::bit_util::FromLittleEndian(w7); - in += 8; - uint64_t w8 = util::SafeLoadAs(in); - w8 = arrow::bit_util::FromLittleEndian(w8); - in += 8; - out[0] = (w0)&mask; - out[1] = (w0 >> 18) & mask; - out[2] = (w0 >> 36) & mask; - out[3] = ((w0 >> 54) | (w1 << 10)) & mask; - out[4] = (w1 >> 8) & mask; - out[5] = (w1 >> 26) & mask; - out[6] = (w1 >> 44) & mask; - out[7] = ((w1 >> 62) | (w2 << 2)) & mask; - out[8] = (w2 >> 16) & mask; - out[9] = (w2 >> 34) & mask; - out[10] = ((w2 >> 52) | (w3 << 12)) & mask; - out[11] = (w3 >> 6) & mask; - out[12] = (w3 >> 24) & mask; - out[13] = (w3 >> 42) & mask; - out[14] = ((w3 >> 60) | (w4 << 4)) & mask; - out[15] = (w4 >> 14) & mask; - out[16] = (w4 >> 32) & mask; - out[17] = ((w4 >> 50) | (w5 << 14)) & mask; - out[18] = (w5 >> 4) & mask; - out[19] = (w5 >> 22) & mask; - out[20] = (w5 >> 40) & mask; - out[21] = ((w5 >> 58) | (w6 << 6)) & mask; - out[22] = (w6 >> 12) & mask; - out[23] = (w6 >> 30) & mask; - out[24] = ((w6 >> 48) | (w7 << 16)) & mask; - out[25] = (w7 >> 2) & mask; - out[26] = (w7 >> 20) & mask; - out[27] = (w7 >> 38) & mask; - out[28] = ((w7 >> 56) | (w8 << 8)) & mask; - out[29] = (w8 >> 10) & mask; - out[30] = (w8 >> 28) & mask; - out[31] = w8 >> 46; - - return in; -} - -inline const uint8_t* unpack19_64(const uint8_t* in, uint64_t* out) { - const uint64_t mask = 524287ULL; - uint64_t w0 = util::SafeLoadAs(in); - w0 = arrow::bit_util::FromLittleEndian(w0); - in += 8; - uint64_t w1 = util::SafeLoadAs(in); - w1 = arrow::bit_util::FromLittleEndian(w1); - in += 8; - uint64_t w2 = util::SafeLoadAs(in); - w2 = arrow::bit_util::FromLittleEndian(w2); - in += 8; - uint64_t w3 = util::SafeLoadAs(in); - w3 = arrow::bit_util::FromLittleEndian(w3); - in += 8; - uint64_t w4 = util::SafeLoadAs(in); - w4 = arrow::bit_util::FromLittleEndian(w4); - in += 8; - uint64_t w5 = util::SafeLoadAs(in); - w5 = arrow::bit_util::FromLittleEndian(w5); - in += 8; - uint64_t w6 = util::SafeLoadAs(in); - w6 = arrow::bit_util::FromLittleEndian(w6); - in += 8; - uint64_t w7 = util::SafeLoadAs(in); - w7 = arrow::bit_util::FromLittleEndian(w7); - in += 8; - uint64_t w8 = util::SafeLoadAs(in); - w8 = arrow::bit_util::FromLittleEndian(w8); - in += 8; - uint64_t w9 = util::SafeLoadAs(in); - w9 = arrow::bit_util::FromLittleEndian(w9); - in += 4; - out[0] = (w0)&mask; - out[1] = (w0 >> 19) & mask; - out[2] = (w0 >> 38) & mask; - out[3] = ((w0 >> 57) | (w1 << 7)) & mask; - out[4] = (w1 >> 12) & mask; - out[5] = (w1 >> 31) & mask; - out[6] = ((w1 >> 50) | (w2 << 14)) & mask; - out[7] = (w2 >> 5) & mask; - out[8] = (w2 >> 24) & mask; - out[9] = (w2 >> 43) & mask; - out[10] = ((w2 >> 62) | (w3 << 2)) & mask; - out[11] = (w3 >> 17) & mask; - out[12] = (w3 >> 36) & mask; - out[13] = ((w3 >> 55) | (w4 << 9)) & mask; - out[14] = (w4 >> 10) & mask; - out[15] = (w4 >> 29) & mask; - out[16] = ((w4 >> 48) | (w5 << 16)) & mask; - out[17] = (w5 >> 3) & mask; - out[18] = (w5 >> 22) & mask; - out[19] = (w5 >> 41) & mask; - out[20] = ((w5 >> 60) | (w6 << 4)) & mask; - out[21] = (w6 >> 15) & mask; - out[22] = (w6 >> 34) & mask; - out[23] = ((w6 >> 53) | (w7 << 11)) & mask; - out[24] = (w7 >> 8) & mask; - out[25] = (w7 >> 27) & mask; - out[26] = ((w7 >> 46) | (w8 << 18)) & mask; - out[27] = (w8 >> 1) & mask; - out[28] = (w8 >> 20) & mask; - out[29] = (w8 >> 39) & mask; - out[30] = ((w8 >> 58) | (w9 << 6)) & mask; - out[31] = (w9 >> 13) & mask; - - return in; -} - -inline const uint8_t* unpack20_64(const uint8_t* in, uint64_t* out) { - const uint64_t mask = 1048575ULL; - uint64_t w0 = util::SafeLoadAs(in); - w0 = arrow::bit_util::FromLittleEndian(w0); - in += 8; - uint64_t w1 = util::SafeLoadAs(in); - w1 = arrow::bit_util::FromLittleEndian(w1); - in += 8; - uint64_t w2 = util::SafeLoadAs(in); - w2 = arrow::bit_util::FromLittleEndian(w2); - in += 8; - uint64_t w3 = util::SafeLoadAs(in); - w3 = arrow::bit_util::FromLittleEndian(w3); - in += 8; - uint64_t w4 = util::SafeLoadAs(in); - w4 = arrow::bit_util::FromLittleEndian(w4); - in += 8; - uint64_t w5 = util::SafeLoadAs(in); - w5 = arrow::bit_util::FromLittleEndian(w5); - in += 8; - uint64_t w6 = util::SafeLoadAs(in); - w6 = arrow::bit_util::FromLittleEndian(w6); - in += 8; - uint64_t w7 = util::SafeLoadAs(in); - w7 = arrow::bit_util::FromLittleEndian(w7); - in += 8; - uint64_t w8 = util::SafeLoadAs(in); - w8 = arrow::bit_util::FromLittleEndian(w8); - in += 8; - uint64_t w9 = util::SafeLoadAs(in); - w9 = arrow::bit_util::FromLittleEndian(w9); - in += 8; - out[0] = (w0)&mask; - out[1] = (w0 >> 20) & mask; - out[2] = (w0 >> 40) & mask; - out[3] = ((w0 >> 60) | (w1 << 4)) & mask; - out[4] = (w1 >> 16) & mask; - out[5] = (w1 >> 36) & mask; - out[6] = ((w1 >> 56) | (w2 << 8)) & mask; - out[7] = (w2 >> 12) & mask; - out[8] = (w2 >> 32) & mask; - out[9] = ((w2 >> 52) | (w3 << 12)) & mask; - out[10] = (w3 >> 8) & mask; - out[11] = (w3 >> 28) & mask; - out[12] = ((w3 >> 48) | (w4 << 16)) & mask; - out[13] = (w4 >> 4) & mask; - out[14] = (w4 >> 24) & mask; - out[15] = w4 >> 44; - out[16] = (w5)&mask; - out[17] = (w5 >> 20) & mask; - out[18] = (w5 >> 40) & mask; - out[19] = ((w5 >> 60) | (w6 << 4)) & mask; - out[20] = (w6 >> 16) & mask; - out[21] = (w6 >> 36) & mask; - out[22] = ((w6 >> 56) | (w7 << 8)) & mask; - out[23] = (w7 >> 12) & mask; - out[24] = (w7 >> 32) & mask; - out[25] = ((w7 >> 52) | (w8 << 12)) & mask; - out[26] = (w8 >> 8) & mask; - out[27] = (w8 >> 28) & mask; - out[28] = ((w8 >> 48) | (w9 << 16)) & mask; - out[29] = (w9 >> 4) & mask; - out[30] = (w9 >> 24) & mask; - out[31] = w9 >> 44; - - return in; -} - -inline const uint8_t* unpack21_64(const uint8_t* in, uint64_t* out) { - const uint64_t mask = 2097151ULL; - uint64_t w0 = util::SafeLoadAs(in); - w0 = arrow::bit_util::FromLittleEndian(w0); - in += 8; - uint64_t w1 = util::SafeLoadAs(in); - w1 = arrow::bit_util::FromLittleEndian(w1); - in += 8; - uint64_t w2 = util::SafeLoadAs(in); - w2 = arrow::bit_util::FromLittleEndian(w2); - in += 8; - uint64_t w3 = util::SafeLoadAs(in); - w3 = arrow::bit_util::FromLittleEndian(w3); - in += 8; - uint64_t w4 = util::SafeLoadAs(in); - w4 = arrow::bit_util::FromLittleEndian(w4); - in += 8; - uint64_t w5 = util::SafeLoadAs(in); - w5 = arrow::bit_util::FromLittleEndian(w5); - in += 8; - uint64_t w6 = util::SafeLoadAs(in); - w6 = arrow::bit_util::FromLittleEndian(w6); - in += 8; - uint64_t w7 = util::SafeLoadAs(in); - w7 = arrow::bit_util::FromLittleEndian(w7); - in += 8; - uint64_t w8 = util::SafeLoadAs(in); - w8 = arrow::bit_util::FromLittleEndian(w8); - in += 8; - uint64_t w9 = util::SafeLoadAs(in); - w9 = arrow::bit_util::FromLittleEndian(w9); - in += 8; - uint64_t w10 = util::SafeLoadAs(in); - w10 = arrow::bit_util::FromLittleEndian(w10); - in += 4; - out[0] = (w0)&mask; - out[1] = (w0 >> 21) & mask; - out[2] = (w0 >> 42) & mask; - out[3] = ((w0 >> 63) | (w1 << 1)) & mask; - out[4] = (w1 >> 20) & mask; - out[5] = (w1 >> 41) & mask; - out[6] = ((w1 >> 62) | (w2 << 2)) & mask; - out[7] = (w2 >> 19) & mask; - out[8] = (w2 >> 40) & mask; - out[9] = ((w2 >> 61) | (w3 << 3)) & mask; - out[10] = (w3 >> 18) & mask; - out[11] = (w3 >> 39) & mask; - out[12] = ((w3 >> 60) | (w4 << 4)) & mask; - out[13] = (w4 >> 17) & mask; - out[14] = (w4 >> 38) & mask; - out[15] = ((w4 >> 59) | (w5 << 5)) & mask; - out[16] = (w5 >> 16) & mask; - out[17] = (w5 >> 37) & mask; - out[18] = ((w5 >> 58) | (w6 << 6)) & mask; - out[19] = (w6 >> 15) & mask; - out[20] = (w6 >> 36) & mask; - out[21] = ((w6 >> 57) | (w7 << 7)) & mask; - out[22] = (w7 >> 14) & mask; - out[23] = (w7 >> 35) & mask; - out[24] = ((w7 >> 56) | (w8 << 8)) & mask; - out[25] = (w8 >> 13) & mask; - out[26] = (w8 >> 34) & mask; - out[27] = ((w8 >> 55) | (w9 << 9)) & mask; - out[28] = (w9 >> 12) & mask; - out[29] = (w9 >> 33) & mask; - out[30] = ((w9 >> 54) | (w10 << 10)) & mask; - out[31] = (w10 >> 11) & mask; - - return in; -} - -inline const uint8_t* unpack22_64(const uint8_t* in, uint64_t* out) { - const uint64_t mask = 4194303ULL; - uint64_t w0 = util::SafeLoadAs(in); - w0 = arrow::bit_util::FromLittleEndian(w0); - in += 8; - uint64_t w1 = util::SafeLoadAs(in); - w1 = arrow::bit_util::FromLittleEndian(w1); - in += 8; - uint64_t w2 = util::SafeLoadAs(in); - w2 = arrow::bit_util::FromLittleEndian(w2); - in += 8; - uint64_t w3 = util::SafeLoadAs(in); - w3 = arrow::bit_util::FromLittleEndian(w3); - in += 8; - uint64_t w4 = util::SafeLoadAs(in); - w4 = arrow::bit_util::FromLittleEndian(w4); - in += 8; - uint64_t w5 = util::SafeLoadAs(in); - w5 = arrow::bit_util::FromLittleEndian(w5); - in += 8; - uint64_t w6 = util::SafeLoadAs(in); - w6 = arrow::bit_util::FromLittleEndian(w6); - in += 8; - uint64_t w7 = util::SafeLoadAs(in); - w7 = arrow::bit_util::FromLittleEndian(w7); - in += 8; - uint64_t w8 = util::SafeLoadAs(in); - w8 = arrow::bit_util::FromLittleEndian(w8); - in += 8; - uint64_t w9 = util::SafeLoadAs(in); - w9 = arrow::bit_util::FromLittleEndian(w9); - in += 8; - uint64_t w10 = util::SafeLoadAs(in); - w10 = arrow::bit_util::FromLittleEndian(w10); - in += 8; - out[0] = (w0)&mask; - out[1] = (w0 >> 22) & mask; - out[2] = ((w0 >> 44) | (w1 << 20)) & mask; - out[3] = (w1 >> 2) & mask; - out[4] = (w1 >> 24) & mask; - out[5] = ((w1 >> 46) | (w2 << 18)) & mask; - out[6] = (w2 >> 4) & mask; - out[7] = (w2 >> 26) & mask; - out[8] = ((w2 >> 48) | (w3 << 16)) & mask; - out[9] = (w3 >> 6) & mask; - out[10] = (w3 >> 28) & mask; - out[11] = ((w3 >> 50) | (w4 << 14)) & mask; - out[12] = (w4 >> 8) & mask; - out[13] = (w4 >> 30) & mask; - out[14] = ((w4 >> 52) | (w5 << 12)) & mask; - out[15] = (w5 >> 10) & mask; - out[16] = (w5 >> 32) & mask; - out[17] = ((w5 >> 54) | (w6 << 10)) & mask; - out[18] = (w6 >> 12) & mask; - out[19] = (w6 >> 34) & mask; - out[20] = ((w6 >> 56) | (w7 << 8)) & mask; - out[21] = (w7 >> 14) & mask; - out[22] = (w7 >> 36) & mask; - out[23] = ((w7 >> 58) | (w8 << 6)) & mask; - out[24] = (w8 >> 16) & mask; - out[25] = (w8 >> 38) & mask; - out[26] = ((w8 >> 60) | (w9 << 4)) & mask; - out[27] = (w9 >> 18) & mask; - out[28] = (w9 >> 40) & mask; - out[29] = ((w9 >> 62) | (w10 << 2)) & mask; - out[30] = (w10 >> 20) & mask; - out[31] = w10 >> 42; - - return in; -} - -inline const uint8_t* unpack23_64(const uint8_t* in, uint64_t* out) { - const uint64_t mask = 8388607ULL; - uint64_t w0 = util::SafeLoadAs(in); - w0 = arrow::bit_util::FromLittleEndian(w0); - in += 8; - uint64_t w1 = util::SafeLoadAs(in); - w1 = arrow::bit_util::FromLittleEndian(w1); - in += 8; - uint64_t w2 = util::SafeLoadAs(in); - w2 = arrow::bit_util::FromLittleEndian(w2); - in += 8; - uint64_t w3 = util::SafeLoadAs(in); - w3 = arrow::bit_util::FromLittleEndian(w3); - in += 8; - uint64_t w4 = util::SafeLoadAs(in); - w4 = arrow::bit_util::FromLittleEndian(w4); - in += 8; - uint64_t w5 = util::SafeLoadAs(in); - w5 = arrow::bit_util::FromLittleEndian(w5); - in += 8; - uint64_t w6 = util::SafeLoadAs(in); - w6 = arrow::bit_util::FromLittleEndian(w6); - in += 8; - uint64_t w7 = util::SafeLoadAs(in); - w7 = arrow::bit_util::FromLittleEndian(w7); - in += 8; - uint64_t w8 = util::SafeLoadAs(in); - w8 = arrow::bit_util::FromLittleEndian(w8); - in += 8; - uint64_t w9 = util::SafeLoadAs(in); - w9 = arrow::bit_util::FromLittleEndian(w9); - in += 8; - uint64_t w10 = util::SafeLoadAs(in); - w10 = arrow::bit_util::FromLittleEndian(w10); - in += 8; - uint64_t w11 = util::SafeLoadAs(in); - w11 = arrow::bit_util::FromLittleEndian(w11); - in += 4; - out[0] = (w0)&mask; - out[1] = (w0 >> 23) & mask; - out[2] = ((w0 >> 46) | (w1 << 18)) & mask; - out[3] = (w1 >> 5) & mask; - out[4] = (w1 >> 28) & mask; - out[5] = ((w1 >> 51) | (w2 << 13)) & mask; - out[6] = (w2 >> 10) & mask; - out[7] = (w2 >> 33) & mask; - out[8] = ((w2 >> 56) | (w3 << 8)) & mask; - out[9] = (w3 >> 15) & mask; - out[10] = (w3 >> 38) & mask; - out[11] = ((w3 >> 61) | (w4 << 3)) & mask; - out[12] = (w4 >> 20) & mask; - out[13] = ((w4 >> 43) | (w5 << 21)) & mask; - out[14] = (w5 >> 2) & mask; - out[15] = (w5 >> 25) & mask; - out[16] = ((w5 >> 48) | (w6 << 16)) & mask; - out[17] = (w6 >> 7) & mask; - out[18] = (w6 >> 30) & mask; - out[19] = ((w6 >> 53) | (w7 << 11)) & mask; - out[20] = (w7 >> 12) & mask; - out[21] = (w7 >> 35) & mask; - out[22] = ((w7 >> 58) | (w8 << 6)) & mask; - out[23] = (w8 >> 17) & mask; - out[24] = (w8 >> 40) & mask; - out[25] = ((w8 >> 63) | (w9 << 1)) & mask; - out[26] = (w9 >> 22) & mask; - out[27] = ((w9 >> 45) | (w10 << 19)) & mask; - out[28] = (w10 >> 4) & mask; - out[29] = (w10 >> 27) & mask; - out[30] = ((w10 >> 50) | (w11 << 14)) & mask; - out[31] = (w11 >> 9) & mask; - - return in; -} - -inline const uint8_t* unpack24_64(const uint8_t* in, uint64_t* out) { - const uint64_t mask = 16777215ULL; - uint64_t w0 = util::SafeLoadAs(in); - w0 = arrow::bit_util::FromLittleEndian(w0); - in += 8; - uint64_t w1 = util::SafeLoadAs(in); - w1 = arrow::bit_util::FromLittleEndian(w1); - in += 8; - uint64_t w2 = util::SafeLoadAs(in); - w2 = arrow::bit_util::FromLittleEndian(w2); - in += 8; - uint64_t w3 = util::SafeLoadAs(in); - w3 = arrow::bit_util::FromLittleEndian(w3); - in += 8; - uint64_t w4 = util::SafeLoadAs(in); - w4 = arrow::bit_util::FromLittleEndian(w4); - in += 8; - uint64_t w5 = util::SafeLoadAs(in); - w5 = arrow::bit_util::FromLittleEndian(w5); - in += 8; - uint64_t w6 = util::SafeLoadAs(in); - w6 = arrow::bit_util::FromLittleEndian(w6); - in += 8; - uint64_t w7 = util::SafeLoadAs(in); - w7 = arrow::bit_util::FromLittleEndian(w7); - in += 8; - uint64_t w8 = util::SafeLoadAs(in); - w8 = arrow::bit_util::FromLittleEndian(w8); - in += 8; - uint64_t w9 = util::SafeLoadAs(in); - w9 = arrow::bit_util::FromLittleEndian(w9); - in += 8; - uint64_t w10 = util::SafeLoadAs(in); - w10 = arrow::bit_util::FromLittleEndian(w10); - in += 8; - uint64_t w11 = util::SafeLoadAs(in); - w11 = arrow::bit_util::FromLittleEndian(w11); - in += 8; - out[0] = (w0)&mask; - out[1] = (w0 >> 24) & mask; - out[2] = ((w0 >> 48) | (w1 << 16)) & mask; - out[3] = (w1 >> 8) & mask; - out[4] = (w1 >> 32) & mask; - out[5] = ((w1 >> 56) | (w2 << 8)) & mask; - out[6] = (w2 >> 16) & mask; - out[7] = w2 >> 40; - out[8] = (w3)&mask; - out[9] = (w3 >> 24) & mask; - out[10] = ((w3 >> 48) | (w4 << 16)) & mask; - out[11] = (w4 >> 8) & mask; - out[12] = (w4 >> 32) & mask; - out[13] = ((w4 >> 56) | (w5 << 8)) & mask; - out[14] = (w5 >> 16) & mask; - out[15] = w5 >> 40; - out[16] = (w6)&mask; - out[17] = (w6 >> 24) & mask; - out[18] = ((w6 >> 48) | (w7 << 16)) & mask; - out[19] = (w7 >> 8) & mask; - out[20] = (w7 >> 32) & mask; - out[21] = ((w7 >> 56) | (w8 << 8)) & mask; - out[22] = (w8 >> 16) & mask; - out[23] = w8 >> 40; - out[24] = (w9)&mask; - out[25] = (w9 >> 24) & mask; - out[26] = ((w9 >> 48) | (w10 << 16)) & mask; - out[27] = (w10 >> 8) & mask; - out[28] = (w10 >> 32) & mask; - out[29] = ((w10 >> 56) | (w11 << 8)) & mask; - out[30] = (w11 >> 16) & mask; - out[31] = w11 >> 40; - - return in; -} - -inline const uint8_t* unpack25_64(const uint8_t* in, uint64_t* out) { - const uint64_t mask = 33554431ULL; - uint64_t w0 = util::SafeLoadAs(in); - w0 = arrow::bit_util::FromLittleEndian(w0); - in += 8; - uint64_t w1 = util::SafeLoadAs(in); - w1 = arrow::bit_util::FromLittleEndian(w1); - in += 8; - uint64_t w2 = util::SafeLoadAs(in); - w2 = arrow::bit_util::FromLittleEndian(w2); - in += 8; - uint64_t w3 = util::SafeLoadAs(in); - w3 = arrow::bit_util::FromLittleEndian(w3); - in += 8; - uint64_t w4 = util::SafeLoadAs(in); - w4 = arrow::bit_util::FromLittleEndian(w4); - in += 8; - uint64_t w5 = util::SafeLoadAs(in); - w5 = arrow::bit_util::FromLittleEndian(w5); - in += 8; - uint64_t w6 = util::SafeLoadAs(in); - w6 = arrow::bit_util::FromLittleEndian(w6); - in += 8; - uint64_t w7 = util::SafeLoadAs(in); - w7 = arrow::bit_util::FromLittleEndian(w7); - in += 8; - uint64_t w8 = util::SafeLoadAs(in); - w8 = arrow::bit_util::FromLittleEndian(w8); - in += 8; - uint64_t w9 = util::SafeLoadAs(in); - w9 = arrow::bit_util::FromLittleEndian(w9); - in += 8; - uint64_t w10 = util::SafeLoadAs(in); - w10 = arrow::bit_util::FromLittleEndian(w10); - in += 8; - uint64_t w11 = util::SafeLoadAs(in); - w11 = arrow::bit_util::FromLittleEndian(w11); - in += 8; - uint64_t w12 = util::SafeLoadAs(in); - w12 = arrow::bit_util::FromLittleEndian(w12); - in += 4; - out[0] = (w0)&mask; - out[1] = (w0 >> 25) & mask; - out[2] = ((w0 >> 50) | (w1 << 14)) & mask; - out[3] = (w1 >> 11) & mask; - out[4] = (w1 >> 36) & mask; - out[5] = ((w1 >> 61) | (w2 << 3)) & mask; - out[6] = (w2 >> 22) & mask; - out[7] = ((w2 >> 47) | (w3 << 17)) & mask; - out[8] = (w3 >> 8) & mask; - out[9] = (w3 >> 33) & mask; - out[10] = ((w3 >> 58) | (w4 << 6)) & mask; - out[11] = (w4 >> 19) & mask; - out[12] = ((w4 >> 44) | (w5 << 20)) & mask; - out[13] = (w5 >> 5) & mask; - out[14] = (w5 >> 30) & mask; - out[15] = ((w5 >> 55) | (w6 << 9)) & mask; - out[16] = (w6 >> 16) & mask; - out[17] = ((w6 >> 41) | (w7 << 23)) & mask; - out[18] = (w7 >> 2) & mask; - out[19] = (w7 >> 27) & mask; - out[20] = ((w7 >> 52) | (w8 << 12)) & mask; - out[21] = (w8 >> 13) & mask; - out[22] = (w8 >> 38) & mask; - out[23] = ((w8 >> 63) | (w9 << 1)) & mask; - out[24] = (w9 >> 24) & mask; - out[25] = ((w9 >> 49) | (w10 << 15)) & mask; - out[26] = (w10 >> 10) & mask; - out[27] = (w10 >> 35) & mask; - out[28] = ((w10 >> 60) | (w11 << 4)) & mask; - out[29] = (w11 >> 21) & mask; - out[30] = ((w11 >> 46) | (w12 << 18)) & mask; - out[31] = (w12 >> 7) & mask; - - return in; -} - -inline const uint8_t* unpack26_64(const uint8_t* in, uint64_t* out) { - const uint64_t mask = 67108863ULL; - uint64_t w0 = util::SafeLoadAs(in); - w0 = arrow::bit_util::FromLittleEndian(w0); - in += 8; - uint64_t w1 = util::SafeLoadAs(in); - w1 = arrow::bit_util::FromLittleEndian(w1); - in += 8; - uint64_t w2 = util::SafeLoadAs(in); - w2 = arrow::bit_util::FromLittleEndian(w2); - in += 8; - uint64_t w3 = util::SafeLoadAs(in); - w3 = arrow::bit_util::FromLittleEndian(w3); - in += 8; - uint64_t w4 = util::SafeLoadAs(in); - w4 = arrow::bit_util::FromLittleEndian(w4); - in += 8; - uint64_t w5 = util::SafeLoadAs(in); - w5 = arrow::bit_util::FromLittleEndian(w5); - in += 8; - uint64_t w6 = util::SafeLoadAs(in); - w6 = arrow::bit_util::FromLittleEndian(w6); - in += 8; - uint64_t w7 = util::SafeLoadAs(in); - w7 = arrow::bit_util::FromLittleEndian(w7); - in += 8; - uint64_t w8 = util::SafeLoadAs(in); - w8 = arrow::bit_util::FromLittleEndian(w8); - in += 8; - uint64_t w9 = util::SafeLoadAs(in); - w9 = arrow::bit_util::FromLittleEndian(w9); - in += 8; - uint64_t w10 = util::SafeLoadAs(in); - w10 = arrow::bit_util::FromLittleEndian(w10); - in += 8; - uint64_t w11 = util::SafeLoadAs(in); - w11 = arrow::bit_util::FromLittleEndian(w11); - in += 8; - uint64_t w12 = util::SafeLoadAs(in); - w12 = arrow::bit_util::FromLittleEndian(w12); - in += 8; - out[0] = (w0)&mask; - out[1] = (w0 >> 26) & mask; - out[2] = ((w0 >> 52) | (w1 << 12)) & mask; - out[3] = (w1 >> 14) & mask; - out[4] = ((w1 >> 40) | (w2 << 24)) & mask; - out[5] = (w2 >> 2) & mask; - out[6] = (w2 >> 28) & mask; - out[7] = ((w2 >> 54) | (w3 << 10)) & mask; - out[8] = (w3 >> 16) & mask; - out[9] = ((w3 >> 42) | (w4 << 22)) & mask; - out[10] = (w4 >> 4) & mask; - out[11] = (w4 >> 30) & mask; - out[12] = ((w4 >> 56) | (w5 << 8)) & mask; - out[13] = (w5 >> 18) & mask; - out[14] = ((w5 >> 44) | (w6 << 20)) & mask; - out[15] = (w6 >> 6) & mask; - out[16] = (w6 >> 32) & mask; - out[17] = ((w6 >> 58) | (w7 << 6)) & mask; - out[18] = (w7 >> 20) & mask; - out[19] = ((w7 >> 46) | (w8 << 18)) & mask; - out[20] = (w8 >> 8) & mask; - out[21] = (w8 >> 34) & mask; - out[22] = ((w8 >> 60) | (w9 << 4)) & mask; - out[23] = (w9 >> 22) & mask; - out[24] = ((w9 >> 48) | (w10 << 16)) & mask; - out[25] = (w10 >> 10) & mask; - out[26] = (w10 >> 36) & mask; - out[27] = ((w10 >> 62) | (w11 << 2)) & mask; - out[28] = (w11 >> 24) & mask; - out[29] = ((w11 >> 50) | (w12 << 14)) & mask; - out[30] = (w12 >> 12) & mask; - out[31] = w12 >> 38; - - return in; -} - -inline const uint8_t* unpack27_64(const uint8_t* in, uint64_t* out) { - const uint64_t mask = 134217727ULL; - uint64_t w0 = util::SafeLoadAs(in); - w0 = arrow::bit_util::FromLittleEndian(w0); - in += 8; - uint64_t w1 = util::SafeLoadAs(in); - w1 = arrow::bit_util::FromLittleEndian(w1); - in += 8; - uint64_t w2 = util::SafeLoadAs(in); - w2 = arrow::bit_util::FromLittleEndian(w2); - in += 8; - uint64_t w3 = util::SafeLoadAs(in); - w3 = arrow::bit_util::FromLittleEndian(w3); - in += 8; - uint64_t w4 = util::SafeLoadAs(in); - w4 = arrow::bit_util::FromLittleEndian(w4); - in += 8; - uint64_t w5 = util::SafeLoadAs(in); - w5 = arrow::bit_util::FromLittleEndian(w5); - in += 8; - uint64_t w6 = util::SafeLoadAs(in); - w6 = arrow::bit_util::FromLittleEndian(w6); - in += 8; - uint64_t w7 = util::SafeLoadAs(in); - w7 = arrow::bit_util::FromLittleEndian(w7); - in += 8; - uint64_t w8 = util::SafeLoadAs(in); - w8 = arrow::bit_util::FromLittleEndian(w8); - in += 8; - uint64_t w9 = util::SafeLoadAs(in); - w9 = arrow::bit_util::FromLittleEndian(w9); - in += 8; - uint64_t w10 = util::SafeLoadAs(in); - w10 = arrow::bit_util::FromLittleEndian(w10); - in += 8; - uint64_t w11 = util::SafeLoadAs(in); - w11 = arrow::bit_util::FromLittleEndian(w11); - in += 8; - uint64_t w12 = util::SafeLoadAs(in); - w12 = arrow::bit_util::FromLittleEndian(w12); - in += 8; - uint64_t w13 = util::SafeLoadAs(in); - w13 = arrow::bit_util::FromLittleEndian(w13); - in += 4; - out[0] = (w0)&mask; - out[1] = (w0 >> 27) & mask; - out[2] = ((w0 >> 54) | (w1 << 10)) & mask; - out[3] = (w1 >> 17) & mask; - out[4] = ((w1 >> 44) | (w2 << 20)) & mask; - out[5] = (w2 >> 7) & mask; - out[6] = (w2 >> 34) & mask; - out[7] = ((w2 >> 61) | (w3 << 3)) & mask; - out[8] = (w3 >> 24) & mask; - out[9] = ((w3 >> 51) | (w4 << 13)) & mask; - out[10] = (w4 >> 14) & mask; - out[11] = ((w4 >> 41) | (w5 << 23)) & mask; - out[12] = (w5 >> 4) & mask; - out[13] = (w5 >> 31) & mask; - out[14] = ((w5 >> 58) | (w6 << 6)) & mask; - out[15] = (w6 >> 21) & mask; - out[16] = ((w6 >> 48) | (w7 << 16)) & mask; - out[17] = (w7 >> 11) & mask; - out[18] = ((w7 >> 38) | (w8 << 26)) & mask; - out[19] = (w8 >> 1) & mask; - out[20] = (w8 >> 28) & mask; - out[21] = ((w8 >> 55) | (w9 << 9)) & mask; - out[22] = (w9 >> 18) & mask; - out[23] = ((w9 >> 45) | (w10 << 19)) & mask; - out[24] = (w10 >> 8) & mask; - out[25] = (w10 >> 35) & mask; - out[26] = ((w10 >> 62) | (w11 << 2)) & mask; - out[27] = (w11 >> 25) & mask; - out[28] = ((w11 >> 52) | (w12 << 12)) & mask; - out[29] = (w12 >> 15) & mask; - out[30] = ((w12 >> 42) | (w13 << 22)) & mask; - out[31] = (w13 >> 5) & mask; - - return in; -} - -inline const uint8_t* unpack28_64(const uint8_t* in, uint64_t* out) { - const uint64_t mask = 268435455ULL; - uint64_t w0 = util::SafeLoadAs(in); - w0 = arrow::bit_util::FromLittleEndian(w0); - in += 8; - uint64_t w1 = util::SafeLoadAs(in); - w1 = arrow::bit_util::FromLittleEndian(w1); - in += 8; - uint64_t w2 = util::SafeLoadAs(in); - w2 = arrow::bit_util::FromLittleEndian(w2); - in += 8; - uint64_t w3 = util::SafeLoadAs(in); - w3 = arrow::bit_util::FromLittleEndian(w3); - in += 8; - uint64_t w4 = util::SafeLoadAs(in); - w4 = arrow::bit_util::FromLittleEndian(w4); - in += 8; - uint64_t w5 = util::SafeLoadAs(in); - w5 = arrow::bit_util::FromLittleEndian(w5); - in += 8; - uint64_t w6 = util::SafeLoadAs(in); - w6 = arrow::bit_util::FromLittleEndian(w6); - in += 8; - uint64_t w7 = util::SafeLoadAs(in); - w7 = arrow::bit_util::FromLittleEndian(w7); - in += 8; - uint64_t w8 = util::SafeLoadAs(in); - w8 = arrow::bit_util::FromLittleEndian(w8); - in += 8; - uint64_t w9 = util::SafeLoadAs(in); - w9 = arrow::bit_util::FromLittleEndian(w9); - in += 8; - uint64_t w10 = util::SafeLoadAs(in); - w10 = arrow::bit_util::FromLittleEndian(w10); - in += 8; - uint64_t w11 = util::SafeLoadAs(in); - w11 = arrow::bit_util::FromLittleEndian(w11); - in += 8; - uint64_t w12 = util::SafeLoadAs(in); - w12 = arrow::bit_util::FromLittleEndian(w12); - in += 8; - uint64_t w13 = util::SafeLoadAs(in); - w13 = arrow::bit_util::FromLittleEndian(w13); - in += 8; - out[0] = (w0)&mask; - out[1] = (w0 >> 28) & mask; - out[2] = ((w0 >> 56) | (w1 << 8)) & mask; - out[3] = (w1 >> 20) & mask; - out[4] = ((w1 >> 48) | (w2 << 16)) & mask; - out[5] = (w2 >> 12) & mask; - out[6] = ((w2 >> 40) | (w3 << 24)) & mask; - out[7] = (w3 >> 4) & mask; - out[8] = (w3 >> 32) & mask; - out[9] = ((w3 >> 60) | (w4 << 4)) & mask; - out[10] = (w4 >> 24) & mask; - out[11] = ((w4 >> 52) | (w5 << 12)) & mask; - out[12] = (w5 >> 16) & mask; - out[13] = ((w5 >> 44) | (w6 << 20)) & mask; - out[14] = (w6 >> 8) & mask; - out[15] = w6 >> 36; - out[16] = (w7)&mask; - out[17] = (w7 >> 28) & mask; - out[18] = ((w7 >> 56) | (w8 << 8)) & mask; - out[19] = (w8 >> 20) & mask; - out[20] = ((w8 >> 48) | (w9 << 16)) & mask; - out[21] = (w9 >> 12) & mask; - out[22] = ((w9 >> 40) | (w10 << 24)) & mask; - out[23] = (w10 >> 4) & mask; - out[24] = (w10 >> 32) & mask; - out[25] = ((w10 >> 60) | (w11 << 4)) & mask; - out[26] = (w11 >> 24) & mask; - out[27] = ((w11 >> 52) | (w12 << 12)) & mask; - out[28] = (w12 >> 16) & mask; - out[29] = ((w12 >> 44) | (w13 << 20)) & mask; - out[30] = (w13 >> 8) & mask; - out[31] = w13 >> 36; - - return in; -} - -inline const uint8_t* unpack29_64(const uint8_t* in, uint64_t* out) { - const uint64_t mask = 536870911ULL; - uint64_t w0 = util::SafeLoadAs(in); - w0 = arrow::bit_util::FromLittleEndian(w0); - in += 8; - uint64_t w1 = util::SafeLoadAs(in); - w1 = arrow::bit_util::FromLittleEndian(w1); - in += 8; - uint64_t w2 = util::SafeLoadAs(in); - w2 = arrow::bit_util::FromLittleEndian(w2); - in += 8; - uint64_t w3 = util::SafeLoadAs(in); - w3 = arrow::bit_util::FromLittleEndian(w3); - in += 8; - uint64_t w4 = util::SafeLoadAs(in); - w4 = arrow::bit_util::FromLittleEndian(w4); - in += 8; - uint64_t w5 = util::SafeLoadAs(in); - w5 = arrow::bit_util::FromLittleEndian(w5); - in += 8; - uint64_t w6 = util::SafeLoadAs(in); - w6 = arrow::bit_util::FromLittleEndian(w6); - in += 8; - uint64_t w7 = util::SafeLoadAs(in); - w7 = arrow::bit_util::FromLittleEndian(w7); - in += 8; - uint64_t w8 = util::SafeLoadAs(in); - w8 = arrow::bit_util::FromLittleEndian(w8); - in += 8; - uint64_t w9 = util::SafeLoadAs(in); - w9 = arrow::bit_util::FromLittleEndian(w9); - in += 8; - uint64_t w10 = util::SafeLoadAs(in); - w10 = arrow::bit_util::FromLittleEndian(w10); - in += 8; - uint64_t w11 = util::SafeLoadAs(in); - w11 = arrow::bit_util::FromLittleEndian(w11); - in += 8; - uint64_t w12 = util::SafeLoadAs(in); - w12 = arrow::bit_util::FromLittleEndian(w12); - in += 8; - uint64_t w13 = util::SafeLoadAs(in); - w13 = arrow::bit_util::FromLittleEndian(w13); - in += 8; - uint64_t w14 = util::SafeLoadAs(in); - w14 = arrow::bit_util::FromLittleEndian(w14); - in += 4; - out[0] = (w0)&mask; - out[1] = (w0 >> 29) & mask; - out[2] = ((w0 >> 58) | (w1 << 6)) & mask; - out[3] = (w1 >> 23) & mask; - out[4] = ((w1 >> 52) | (w2 << 12)) & mask; - out[5] = (w2 >> 17) & mask; - out[6] = ((w2 >> 46) | (w3 << 18)) & mask; - out[7] = (w3 >> 11) & mask; - out[8] = ((w3 >> 40) | (w4 << 24)) & mask; - out[9] = (w4 >> 5) & mask; - out[10] = (w4 >> 34) & mask; - out[11] = ((w4 >> 63) | (w5 << 1)) & mask; - out[12] = (w5 >> 28) & mask; - out[13] = ((w5 >> 57) | (w6 << 7)) & mask; - out[14] = (w6 >> 22) & mask; - out[15] = ((w6 >> 51) | (w7 << 13)) & mask; - out[16] = (w7 >> 16) & mask; - out[17] = ((w7 >> 45) | (w8 << 19)) & mask; - out[18] = (w8 >> 10) & mask; - out[19] = ((w8 >> 39) | (w9 << 25)) & mask; - out[20] = (w9 >> 4) & mask; - out[21] = (w9 >> 33) & mask; - out[22] = ((w9 >> 62) | (w10 << 2)) & mask; - out[23] = (w10 >> 27) & mask; - out[24] = ((w10 >> 56) | (w11 << 8)) & mask; - out[25] = (w11 >> 21) & mask; - out[26] = ((w11 >> 50) | (w12 << 14)) & mask; - out[27] = (w12 >> 15) & mask; - out[28] = ((w12 >> 44) | (w13 << 20)) & mask; - out[29] = (w13 >> 9) & mask; - out[30] = ((w13 >> 38) | (w14 << 26)) & mask; - out[31] = (w14 >> 3) & mask; - - return in; -} - -inline const uint8_t* unpack30_64(const uint8_t* in, uint64_t* out) { - const uint64_t mask = 1073741823ULL; - uint64_t w0 = util::SafeLoadAs(in); - w0 = arrow::bit_util::FromLittleEndian(w0); - in += 8; - uint64_t w1 = util::SafeLoadAs(in); - w1 = arrow::bit_util::FromLittleEndian(w1); - in += 8; - uint64_t w2 = util::SafeLoadAs(in); - w2 = arrow::bit_util::FromLittleEndian(w2); - in += 8; - uint64_t w3 = util::SafeLoadAs(in); - w3 = arrow::bit_util::FromLittleEndian(w3); - in += 8; - uint64_t w4 = util::SafeLoadAs(in); - w4 = arrow::bit_util::FromLittleEndian(w4); - in += 8; - uint64_t w5 = util::SafeLoadAs(in); - w5 = arrow::bit_util::FromLittleEndian(w5); - in += 8; - uint64_t w6 = util::SafeLoadAs(in); - w6 = arrow::bit_util::FromLittleEndian(w6); - in += 8; - uint64_t w7 = util::SafeLoadAs(in); - w7 = arrow::bit_util::FromLittleEndian(w7); - in += 8; - uint64_t w8 = util::SafeLoadAs(in); - w8 = arrow::bit_util::FromLittleEndian(w8); - in += 8; - uint64_t w9 = util::SafeLoadAs(in); - w9 = arrow::bit_util::FromLittleEndian(w9); - in += 8; - uint64_t w10 = util::SafeLoadAs(in); - w10 = arrow::bit_util::FromLittleEndian(w10); - in += 8; - uint64_t w11 = util::SafeLoadAs(in); - w11 = arrow::bit_util::FromLittleEndian(w11); - in += 8; - uint64_t w12 = util::SafeLoadAs(in); - w12 = arrow::bit_util::FromLittleEndian(w12); - in += 8; - uint64_t w13 = util::SafeLoadAs(in); - w13 = arrow::bit_util::FromLittleEndian(w13); - in += 8; - uint64_t w14 = util::SafeLoadAs(in); - w14 = arrow::bit_util::FromLittleEndian(w14); - in += 8; - out[0] = (w0)&mask; - out[1] = (w0 >> 30) & mask; - out[2] = ((w0 >> 60) | (w1 << 4)) & mask; - out[3] = (w1 >> 26) & mask; - out[4] = ((w1 >> 56) | (w2 << 8)) & mask; - out[5] = (w2 >> 22) & mask; - out[6] = ((w2 >> 52) | (w3 << 12)) & mask; - out[7] = (w3 >> 18) & mask; - out[8] = ((w3 >> 48) | (w4 << 16)) & mask; - out[9] = (w4 >> 14) & mask; - out[10] = ((w4 >> 44) | (w5 << 20)) & mask; - out[11] = (w5 >> 10) & mask; - out[12] = ((w5 >> 40) | (w6 << 24)) & mask; - out[13] = (w6 >> 6) & mask; - out[14] = ((w6 >> 36) | (w7 << 28)) & mask; - out[15] = (w7 >> 2) & mask; - out[16] = (w7 >> 32) & mask; - out[17] = ((w7 >> 62) | (w8 << 2)) & mask; - out[18] = (w8 >> 28) & mask; - out[19] = ((w8 >> 58) | (w9 << 6)) & mask; - out[20] = (w9 >> 24) & mask; - out[21] = ((w9 >> 54) | (w10 << 10)) & mask; - out[22] = (w10 >> 20) & mask; - out[23] = ((w10 >> 50) | (w11 << 14)) & mask; - out[24] = (w11 >> 16) & mask; - out[25] = ((w11 >> 46) | (w12 << 18)) & mask; - out[26] = (w12 >> 12) & mask; - out[27] = ((w12 >> 42) | (w13 << 22)) & mask; - out[28] = (w13 >> 8) & mask; - out[29] = ((w13 >> 38) | (w14 << 26)) & mask; - out[30] = (w14 >> 4) & mask; - out[31] = w14 >> 34; - - return in; -} - -inline const uint8_t* unpack31_64(const uint8_t* in, uint64_t* out) { - const uint64_t mask = 2147483647ULL; - uint64_t w0 = util::SafeLoadAs(in); - w0 = arrow::bit_util::FromLittleEndian(w0); - in += 8; - uint64_t w1 = util::SafeLoadAs(in); - w1 = arrow::bit_util::FromLittleEndian(w1); - in += 8; - uint64_t w2 = util::SafeLoadAs(in); - w2 = arrow::bit_util::FromLittleEndian(w2); - in += 8; - uint64_t w3 = util::SafeLoadAs(in); - w3 = arrow::bit_util::FromLittleEndian(w3); - in += 8; - uint64_t w4 = util::SafeLoadAs(in); - w4 = arrow::bit_util::FromLittleEndian(w4); - in += 8; - uint64_t w5 = util::SafeLoadAs(in); - w5 = arrow::bit_util::FromLittleEndian(w5); - in += 8; - uint64_t w6 = util::SafeLoadAs(in); - w6 = arrow::bit_util::FromLittleEndian(w6); - in += 8; - uint64_t w7 = util::SafeLoadAs(in); - w7 = arrow::bit_util::FromLittleEndian(w7); - in += 8; - uint64_t w8 = util::SafeLoadAs(in); - w8 = arrow::bit_util::FromLittleEndian(w8); - in += 8; - uint64_t w9 = util::SafeLoadAs(in); - w9 = arrow::bit_util::FromLittleEndian(w9); - in += 8; - uint64_t w10 = util::SafeLoadAs(in); - w10 = arrow::bit_util::FromLittleEndian(w10); - in += 8; - uint64_t w11 = util::SafeLoadAs(in); - w11 = arrow::bit_util::FromLittleEndian(w11); - in += 8; - uint64_t w12 = util::SafeLoadAs(in); - w12 = arrow::bit_util::FromLittleEndian(w12); - in += 8; - uint64_t w13 = util::SafeLoadAs(in); - w13 = arrow::bit_util::FromLittleEndian(w13); - in += 8; - uint64_t w14 = util::SafeLoadAs(in); - w14 = arrow::bit_util::FromLittleEndian(w14); - in += 8; - uint64_t w15 = util::SafeLoadAs(in); - w15 = arrow::bit_util::FromLittleEndian(w15); - in += 4; - out[0] = (w0)&mask; - out[1] = (w0 >> 31) & mask; - out[2] = ((w0 >> 62) | (w1 << 2)) & mask; - out[3] = (w1 >> 29) & mask; - out[4] = ((w1 >> 60) | (w2 << 4)) & mask; - out[5] = (w2 >> 27) & mask; - out[6] = ((w2 >> 58) | (w3 << 6)) & mask; - out[7] = (w3 >> 25) & mask; - out[8] = ((w3 >> 56) | (w4 << 8)) & mask; - out[9] = (w4 >> 23) & mask; - out[10] = ((w4 >> 54) | (w5 << 10)) & mask; - out[11] = (w5 >> 21) & mask; - out[12] = ((w5 >> 52) | (w6 << 12)) & mask; - out[13] = (w6 >> 19) & mask; - out[14] = ((w6 >> 50) | (w7 << 14)) & mask; - out[15] = (w7 >> 17) & mask; - out[16] = ((w7 >> 48) | (w8 << 16)) & mask; - out[17] = (w8 >> 15) & mask; - out[18] = ((w8 >> 46) | (w9 << 18)) & mask; - out[19] = (w9 >> 13) & mask; - out[20] = ((w9 >> 44) | (w10 << 20)) & mask; - out[21] = (w10 >> 11) & mask; - out[22] = ((w10 >> 42) | (w11 << 22)) & mask; - out[23] = (w11 >> 9) & mask; - out[24] = ((w11 >> 40) | (w12 << 24)) & mask; - out[25] = (w12 >> 7) & mask; - out[26] = ((w12 >> 38) | (w13 << 26)) & mask; - out[27] = (w13 >> 5) & mask; - out[28] = ((w13 >> 36) | (w14 << 28)) & mask; - out[29] = (w14 >> 3) & mask; - out[30] = ((w14 >> 34) | (w15 << 30)) & mask; - out[31] = (w15 >> 1) & mask; - - return in; -} - -inline const uint8_t* unpack32_64(const uint8_t* in, uint64_t* out) { - const uint64_t mask = 4294967295ULL; - uint64_t w0 = util::SafeLoadAs(in); - w0 = arrow::bit_util::FromLittleEndian(w0); - in += 8; - uint64_t w1 = util::SafeLoadAs(in); - w1 = arrow::bit_util::FromLittleEndian(w1); - in += 8; - uint64_t w2 = util::SafeLoadAs(in); - w2 = arrow::bit_util::FromLittleEndian(w2); - in += 8; - uint64_t w3 = util::SafeLoadAs(in); - w3 = arrow::bit_util::FromLittleEndian(w3); - in += 8; - uint64_t w4 = util::SafeLoadAs(in); - w4 = arrow::bit_util::FromLittleEndian(w4); - in += 8; - uint64_t w5 = util::SafeLoadAs(in); - w5 = arrow::bit_util::FromLittleEndian(w5); - in += 8; - uint64_t w6 = util::SafeLoadAs(in); - w6 = arrow::bit_util::FromLittleEndian(w6); - in += 8; - uint64_t w7 = util::SafeLoadAs(in); - w7 = arrow::bit_util::FromLittleEndian(w7); - in += 8; - uint64_t w8 = util::SafeLoadAs(in); - w8 = arrow::bit_util::FromLittleEndian(w8); - in += 8; - uint64_t w9 = util::SafeLoadAs(in); - w9 = arrow::bit_util::FromLittleEndian(w9); - in += 8; - uint64_t w10 = util::SafeLoadAs(in); - w10 = arrow::bit_util::FromLittleEndian(w10); - in += 8; - uint64_t w11 = util::SafeLoadAs(in); - w11 = arrow::bit_util::FromLittleEndian(w11); - in += 8; - uint64_t w12 = util::SafeLoadAs(in); - w12 = arrow::bit_util::FromLittleEndian(w12); - in += 8; - uint64_t w13 = util::SafeLoadAs(in); - w13 = arrow::bit_util::FromLittleEndian(w13); - in += 8; - uint64_t w14 = util::SafeLoadAs(in); - w14 = arrow::bit_util::FromLittleEndian(w14); - in += 8; - uint64_t w15 = util::SafeLoadAs(in); - w15 = arrow::bit_util::FromLittleEndian(w15); - in += 8; - out[0] = (w0)&mask; - out[1] = w0 >> 32; - out[2] = (w1)&mask; - out[3] = w1 >> 32; - out[4] = (w2)&mask; - out[5] = w2 >> 32; - out[6] = (w3)&mask; - out[7] = w3 >> 32; - out[8] = (w4)&mask; - out[9] = w4 >> 32; - out[10] = (w5)&mask; - out[11] = w5 >> 32; - out[12] = (w6)&mask; - out[13] = w6 >> 32; - out[14] = (w7)&mask; - out[15] = w7 >> 32; - out[16] = (w8)&mask; - out[17] = w8 >> 32; - out[18] = (w9)&mask; - out[19] = w9 >> 32; - out[20] = (w10)&mask; - out[21] = w10 >> 32; - out[22] = (w11)&mask; - out[23] = w11 >> 32; - out[24] = (w12)&mask; - out[25] = w12 >> 32; - out[26] = (w13)&mask; - out[27] = w13 >> 32; - out[28] = (w14)&mask; - out[29] = w14 >> 32; - out[30] = (w15)&mask; - out[31] = w15 >> 32; - - return in; -} - -inline const uint8_t* unpack33_64(const uint8_t* in, uint64_t* out) { - const uint64_t mask = 8589934591ULL; - uint64_t w0 = util::SafeLoadAs(in); - w0 = arrow::bit_util::FromLittleEndian(w0); - in += 8; - uint64_t w1 = util::SafeLoadAs(in); - w1 = arrow::bit_util::FromLittleEndian(w1); - in += 8; - uint64_t w2 = util::SafeLoadAs(in); - w2 = arrow::bit_util::FromLittleEndian(w2); - in += 8; - uint64_t w3 = util::SafeLoadAs(in); - w3 = arrow::bit_util::FromLittleEndian(w3); - in += 8; - uint64_t w4 = util::SafeLoadAs(in); - w4 = arrow::bit_util::FromLittleEndian(w4); - in += 8; - uint64_t w5 = util::SafeLoadAs(in); - w5 = arrow::bit_util::FromLittleEndian(w5); - in += 8; - uint64_t w6 = util::SafeLoadAs(in); - w6 = arrow::bit_util::FromLittleEndian(w6); - in += 8; - uint64_t w7 = util::SafeLoadAs(in); - w7 = arrow::bit_util::FromLittleEndian(w7); - in += 8; - uint64_t w8 = util::SafeLoadAs(in); - w8 = arrow::bit_util::FromLittleEndian(w8); - in += 8; - uint64_t w9 = util::SafeLoadAs(in); - w9 = arrow::bit_util::FromLittleEndian(w9); - in += 8; - uint64_t w10 = util::SafeLoadAs(in); - w10 = arrow::bit_util::FromLittleEndian(w10); - in += 8; - uint64_t w11 = util::SafeLoadAs(in); - w11 = arrow::bit_util::FromLittleEndian(w11); - in += 8; - uint64_t w12 = util::SafeLoadAs(in); - w12 = arrow::bit_util::FromLittleEndian(w12); - in += 8; - uint64_t w13 = util::SafeLoadAs(in); - w13 = arrow::bit_util::FromLittleEndian(w13); - in += 8; - uint64_t w14 = util::SafeLoadAs(in); - w14 = arrow::bit_util::FromLittleEndian(w14); - in += 8; - uint64_t w15 = util::SafeLoadAs(in); - w15 = arrow::bit_util::FromLittleEndian(w15); - in += 8; - uint64_t w16 = util::SafeLoadAs(in); - w16 = arrow::bit_util::FromLittleEndian(w16); - in += 4; - out[0] = (w0)&mask; - out[1] = ((w0 >> 33) | (w1 << 31)) & mask; - out[2] = (w1 >> 2) & mask; - out[3] = ((w1 >> 35) | (w2 << 29)) & mask; - out[4] = (w2 >> 4) & mask; - out[5] = ((w2 >> 37) | (w3 << 27)) & mask; - out[6] = (w3 >> 6) & mask; - out[7] = ((w3 >> 39) | (w4 << 25)) & mask; - out[8] = (w4 >> 8) & mask; - out[9] = ((w4 >> 41) | (w5 << 23)) & mask; - out[10] = (w5 >> 10) & mask; - out[11] = ((w5 >> 43) | (w6 << 21)) & mask; - out[12] = (w6 >> 12) & mask; - out[13] = ((w6 >> 45) | (w7 << 19)) & mask; - out[14] = (w7 >> 14) & mask; - out[15] = ((w7 >> 47) | (w8 << 17)) & mask; - out[16] = (w8 >> 16) & mask; - out[17] = ((w8 >> 49) | (w9 << 15)) & mask; - out[18] = (w9 >> 18) & mask; - out[19] = ((w9 >> 51) | (w10 << 13)) & mask; - out[20] = (w10 >> 20) & mask; - out[21] = ((w10 >> 53) | (w11 << 11)) & mask; - out[22] = (w11 >> 22) & mask; - out[23] = ((w11 >> 55) | (w12 << 9)) & mask; - out[24] = (w12 >> 24) & mask; - out[25] = ((w12 >> 57) | (w13 << 7)) & mask; - out[26] = (w13 >> 26) & mask; - out[27] = ((w13 >> 59) | (w14 << 5)) & mask; - out[28] = (w14 >> 28) & mask; - out[29] = ((w14 >> 61) | (w15 << 3)) & mask; - out[30] = (w15 >> 30) & mask; - out[31] = ((w15 >> 63) | (w16 << 1)) & mask; - - return in; -} - -inline const uint8_t* unpack34_64(const uint8_t* in, uint64_t* out) { - const uint64_t mask = 17179869183ULL; - uint64_t w0 = util::SafeLoadAs(in); - w0 = arrow::bit_util::FromLittleEndian(w0); - in += 8; - uint64_t w1 = util::SafeLoadAs(in); - w1 = arrow::bit_util::FromLittleEndian(w1); - in += 8; - uint64_t w2 = util::SafeLoadAs(in); - w2 = arrow::bit_util::FromLittleEndian(w2); - in += 8; - uint64_t w3 = util::SafeLoadAs(in); - w3 = arrow::bit_util::FromLittleEndian(w3); - in += 8; - uint64_t w4 = util::SafeLoadAs(in); - w4 = arrow::bit_util::FromLittleEndian(w4); - in += 8; - uint64_t w5 = util::SafeLoadAs(in); - w5 = arrow::bit_util::FromLittleEndian(w5); - in += 8; - uint64_t w6 = util::SafeLoadAs(in); - w6 = arrow::bit_util::FromLittleEndian(w6); - in += 8; - uint64_t w7 = util::SafeLoadAs(in); - w7 = arrow::bit_util::FromLittleEndian(w7); - in += 8; - uint64_t w8 = util::SafeLoadAs(in); - w8 = arrow::bit_util::FromLittleEndian(w8); - in += 8; - uint64_t w9 = util::SafeLoadAs(in); - w9 = arrow::bit_util::FromLittleEndian(w9); - in += 8; - uint64_t w10 = util::SafeLoadAs(in); - w10 = arrow::bit_util::FromLittleEndian(w10); - in += 8; - uint64_t w11 = util::SafeLoadAs(in); - w11 = arrow::bit_util::FromLittleEndian(w11); - in += 8; - uint64_t w12 = util::SafeLoadAs(in); - w12 = arrow::bit_util::FromLittleEndian(w12); - in += 8; - uint64_t w13 = util::SafeLoadAs(in); - w13 = arrow::bit_util::FromLittleEndian(w13); - in += 8; - uint64_t w14 = util::SafeLoadAs(in); - w14 = arrow::bit_util::FromLittleEndian(w14); - in += 8; - uint64_t w15 = util::SafeLoadAs(in); - w15 = arrow::bit_util::FromLittleEndian(w15); - in += 8; - uint64_t w16 = util::SafeLoadAs(in); - w16 = arrow::bit_util::FromLittleEndian(w16); - in += 8; - out[0] = (w0)&mask; - out[1] = ((w0 >> 34) | (w1 << 30)) & mask; - out[2] = (w1 >> 4) & mask; - out[3] = ((w1 >> 38) | (w2 << 26)) & mask; - out[4] = (w2 >> 8) & mask; - out[5] = ((w2 >> 42) | (w3 << 22)) & mask; - out[6] = (w3 >> 12) & mask; - out[7] = ((w3 >> 46) | (w4 << 18)) & mask; - out[8] = (w4 >> 16) & mask; - out[9] = ((w4 >> 50) | (w5 << 14)) & mask; - out[10] = (w5 >> 20) & mask; - out[11] = ((w5 >> 54) | (w6 << 10)) & mask; - out[12] = (w6 >> 24) & mask; - out[13] = ((w6 >> 58) | (w7 << 6)) & mask; - out[14] = (w7 >> 28) & mask; - out[15] = ((w7 >> 62) | (w8 << 2)) & mask; - out[16] = ((w8 >> 32) | (w9 << 32)) & mask; - out[17] = (w9 >> 2) & mask; - out[18] = ((w9 >> 36) | (w10 << 28)) & mask; - out[19] = (w10 >> 6) & mask; - out[20] = ((w10 >> 40) | (w11 << 24)) & mask; - out[21] = (w11 >> 10) & mask; - out[22] = ((w11 >> 44) | (w12 << 20)) & mask; - out[23] = (w12 >> 14) & mask; - out[24] = ((w12 >> 48) | (w13 << 16)) & mask; - out[25] = (w13 >> 18) & mask; - out[26] = ((w13 >> 52) | (w14 << 12)) & mask; - out[27] = (w14 >> 22) & mask; - out[28] = ((w14 >> 56) | (w15 << 8)) & mask; - out[29] = (w15 >> 26) & mask; - out[30] = ((w15 >> 60) | (w16 << 4)) & mask; - out[31] = w16 >> 30; - - return in; -} - -inline const uint8_t* unpack35_64(const uint8_t* in, uint64_t* out) { - const uint64_t mask = 34359738367ULL; - uint64_t w0 = util::SafeLoadAs(in); - w0 = arrow::bit_util::FromLittleEndian(w0); - in += 8; - uint64_t w1 = util::SafeLoadAs(in); - w1 = arrow::bit_util::FromLittleEndian(w1); - in += 8; - uint64_t w2 = util::SafeLoadAs(in); - w2 = arrow::bit_util::FromLittleEndian(w2); - in += 8; - uint64_t w3 = util::SafeLoadAs(in); - w3 = arrow::bit_util::FromLittleEndian(w3); - in += 8; - uint64_t w4 = util::SafeLoadAs(in); - w4 = arrow::bit_util::FromLittleEndian(w4); - in += 8; - uint64_t w5 = util::SafeLoadAs(in); - w5 = arrow::bit_util::FromLittleEndian(w5); - in += 8; - uint64_t w6 = util::SafeLoadAs(in); - w6 = arrow::bit_util::FromLittleEndian(w6); - in += 8; - uint64_t w7 = util::SafeLoadAs(in); - w7 = arrow::bit_util::FromLittleEndian(w7); - in += 8; - uint64_t w8 = util::SafeLoadAs(in); - w8 = arrow::bit_util::FromLittleEndian(w8); - in += 8; - uint64_t w9 = util::SafeLoadAs(in); - w9 = arrow::bit_util::FromLittleEndian(w9); - in += 8; - uint64_t w10 = util::SafeLoadAs(in); - w10 = arrow::bit_util::FromLittleEndian(w10); - in += 8; - uint64_t w11 = util::SafeLoadAs(in); - w11 = arrow::bit_util::FromLittleEndian(w11); - in += 8; - uint64_t w12 = util::SafeLoadAs(in); - w12 = arrow::bit_util::FromLittleEndian(w12); - in += 8; - uint64_t w13 = util::SafeLoadAs(in); - w13 = arrow::bit_util::FromLittleEndian(w13); - in += 8; - uint64_t w14 = util::SafeLoadAs(in); - w14 = arrow::bit_util::FromLittleEndian(w14); - in += 8; - uint64_t w15 = util::SafeLoadAs(in); - w15 = arrow::bit_util::FromLittleEndian(w15); - in += 8; - uint64_t w16 = util::SafeLoadAs(in); - w16 = arrow::bit_util::FromLittleEndian(w16); - in += 8; - uint64_t w17 = util::SafeLoadAs(in); - w17 = arrow::bit_util::FromLittleEndian(w17); - in += 4; - out[0] = (w0)&mask; - out[1] = ((w0 >> 35) | (w1 << 29)) & mask; - out[2] = (w1 >> 6) & mask; - out[3] = ((w1 >> 41) | (w2 << 23)) & mask; - out[4] = (w2 >> 12) & mask; - out[5] = ((w2 >> 47) | (w3 << 17)) & mask; - out[6] = (w3 >> 18) & mask; - out[7] = ((w3 >> 53) | (w4 << 11)) & mask; - out[8] = (w4 >> 24) & mask; - out[9] = ((w4 >> 59) | (w5 << 5)) & mask; - out[10] = ((w5 >> 30) | (w6 << 34)) & mask; - out[11] = (w6 >> 1) & mask; - out[12] = ((w6 >> 36) | (w7 << 28)) & mask; - out[13] = (w7 >> 7) & mask; - out[14] = ((w7 >> 42) | (w8 << 22)) & mask; - out[15] = (w8 >> 13) & mask; - out[16] = ((w8 >> 48) | (w9 << 16)) & mask; - out[17] = (w9 >> 19) & mask; - out[18] = ((w9 >> 54) | (w10 << 10)) & mask; - out[19] = (w10 >> 25) & mask; - out[20] = ((w10 >> 60) | (w11 << 4)) & mask; - out[21] = ((w11 >> 31) | (w12 << 33)) & mask; - out[22] = (w12 >> 2) & mask; - out[23] = ((w12 >> 37) | (w13 << 27)) & mask; - out[24] = (w13 >> 8) & mask; - out[25] = ((w13 >> 43) | (w14 << 21)) & mask; - out[26] = (w14 >> 14) & mask; - out[27] = ((w14 >> 49) | (w15 << 15)) & mask; - out[28] = (w15 >> 20) & mask; - out[29] = ((w15 >> 55) | (w16 << 9)) & mask; - out[30] = (w16 >> 26) & mask; - out[31] = ((w16 >> 61) | (w17 << 3)) & mask; - - return in; -} - -inline const uint8_t* unpack36_64(const uint8_t* in, uint64_t* out) { - const uint64_t mask = 68719476735ULL; - uint64_t w0 = util::SafeLoadAs(in); - w0 = arrow::bit_util::FromLittleEndian(w0); - in += 8; - uint64_t w1 = util::SafeLoadAs(in); - w1 = arrow::bit_util::FromLittleEndian(w1); - in += 8; - uint64_t w2 = util::SafeLoadAs(in); - w2 = arrow::bit_util::FromLittleEndian(w2); - in += 8; - uint64_t w3 = util::SafeLoadAs(in); - w3 = arrow::bit_util::FromLittleEndian(w3); - in += 8; - uint64_t w4 = util::SafeLoadAs(in); - w4 = arrow::bit_util::FromLittleEndian(w4); - in += 8; - uint64_t w5 = util::SafeLoadAs(in); - w5 = arrow::bit_util::FromLittleEndian(w5); - in += 8; - uint64_t w6 = util::SafeLoadAs(in); - w6 = arrow::bit_util::FromLittleEndian(w6); - in += 8; - uint64_t w7 = util::SafeLoadAs(in); - w7 = arrow::bit_util::FromLittleEndian(w7); - in += 8; - uint64_t w8 = util::SafeLoadAs(in); - w8 = arrow::bit_util::FromLittleEndian(w8); - in += 8; - uint64_t w9 = util::SafeLoadAs(in); - w9 = arrow::bit_util::FromLittleEndian(w9); - in += 8; - uint64_t w10 = util::SafeLoadAs(in); - w10 = arrow::bit_util::FromLittleEndian(w10); - in += 8; - uint64_t w11 = util::SafeLoadAs(in); - w11 = arrow::bit_util::FromLittleEndian(w11); - in += 8; - uint64_t w12 = util::SafeLoadAs(in); - w12 = arrow::bit_util::FromLittleEndian(w12); - in += 8; - uint64_t w13 = util::SafeLoadAs(in); - w13 = arrow::bit_util::FromLittleEndian(w13); - in += 8; - uint64_t w14 = util::SafeLoadAs(in); - w14 = arrow::bit_util::FromLittleEndian(w14); - in += 8; - uint64_t w15 = util::SafeLoadAs(in); - w15 = arrow::bit_util::FromLittleEndian(w15); - in += 8; - uint64_t w16 = util::SafeLoadAs(in); - w16 = arrow::bit_util::FromLittleEndian(w16); - in += 8; - uint64_t w17 = util::SafeLoadAs(in); - w17 = arrow::bit_util::FromLittleEndian(w17); - in += 8; - out[0] = (w0)&mask; - out[1] = ((w0 >> 36) | (w1 << 28)) & mask; - out[2] = (w1 >> 8) & mask; - out[3] = ((w1 >> 44) | (w2 << 20)) & mask; - out[4] = (w2 >> 16) & mask; - out[5] = ((w2 >> 52) | (w3 << 12)) & mask; - out[6] = (w3 >> 24) & mask; - out[7] = ((w3 >> 60) | (w4 << 4)) & mask; - out[8] = ((w4 >> 32) | (w5 << 32)) & mask; - out[9] = (w5 >> 4) & mask; - out[10] = ((w5 >> 40) | (w6 << 24)) & mask; - out[11] = (w6 >> 12) & mask; - out[12] = ((w6 >> 48) | (w7 << 16)) & mask; - out[13] = (w7 >> 20) & mask; - out[14] = ((w7 >> 56) | (w8 << 8)) & mask; - out[15] = w8 >> 28; - out[16] = (w9)&mask; - out[17] = ((w9 >> 36) | (w10 << 28)) & mask; - out[18] = (w10 >> 8) & mask; - out[19] = ((w10 >> 44) | (w11 << 20)) & mask; - out[20] = (w11 >> 16) & mask; - out[21] = ((w11 >> 52) | (w12 << 12)) & mask; - out[22] = (w12 >> 24) & mask; - out[23] = ((w12 >> 60) | (w13 << 4)) & mask; - out[24] = ((w13 >> 32) | (w14 << 32)) & mask; - out[25] = (w14 >> 4) & mask; - out[26] = ((w14 >> 40) | (w15 << 24)) & mask; - out[27] = (w15 >> 12) & mask; - out[28] = ((w15 >> 48) | (w16 << 16)) & mask; - out[29] = (w16 >> 20) & mask; - out[30] = ((w16 >> 56) | (w17 << 8)) & mask; - out[31] = w17 >> 28; - - return in; -} - -inline const uint8_t* unpack37_64(const uint8_t* in, uint64_t* out) { - const uint64_t mask = 137438953471ULL; - uint64_t w0 = util::SafeLoadAs(in); - w0 = arrow::bit_util::FromLittleEndian(w0); - in += 8; - uint64_t w1 = util::SafeLoadAs(in); - w1 = arrow::bit_util::FromLittleEndian(w1); - in += 8; - uint64_t w2 = util::SafeLoadAs(in); - w2 = arrow::bit_util::FromLittleEndian(w2); - in += 8; - uint64_t w3 = util::SafeLoadAs(in); - w3 = arrow::bit_util::FromLittleEndian(w3); - in += 8; - uint64_t w4 = util::SafeLoadAs(in); - w4 = arrow::bit_util::FromLittleEndian(w4); - in += 8; - uint64_t w5 = util::SafeLoadAs(in); - w5 = arrow::bit_util::FromLittleEndian(w5); - in += 8; - uint64_t w6 = util::SafeLoadAs(in); - w6 = arrow::bit_util::FromLittleEndian(w6); - in += 8; - uint64_t w7 = util::SafeLoadAs(in); - w7 = arrow::bit_util::FromLittleEndian(w7); - in += 8; - uint64_t w8 = util::SafeLoadAs(in); - w8 = arrow::bit_util::FromLittleEndian(w8); - in += 8; - uint64_t w9 = util::SafeLoadAs(in); - w9 = arrow::bit_util::FromLittleEndian(w9); - in += 8; - uint64_t w10 = util::SafeLoadAs(in); - w10 = arrow::bit_util::FromLittleEndian(w10); - in += 8; - uint64_t w11 = util::SafeLoadAs(in); - w11 = arrow::bit_util::FromLittleEndian(w11); - in += 8; - uint64_t w12 = util::SafeLoadAs(in); - w12 = arrow::bit_util::FromLittleEndian(w12); - in += 8; - uint64_t w13 = util::SafeLoadAs(in); - w13 = arrow::bit_util::FromLittleEndian(w13); - in += 8; - uint64_t w14 = util::SafeLoadAs(in); - w14 = arrow::bit_util::FromLittleEndian(w14); - in += 8; - uint64_t w15 = util::SafeLoadAs(in); - w15 = arrow::bit_util::FromLittleEndian(w15); - in += 8; - uint64_t w16 = util::SafeLoadAs(in); - w16 = arrow::bit_util::FromLittleEndian(w16); - in += 8; - uint64_t w17 = util::SafeLoadAs(in); - w17 = arrow::bit_util::FromLittleEndian(w17); - in += 8; - uint64_t w18 = util::SafeLoadAs(in); - w18 = arrow::bit_util::FromLittleEndian(w18); - in += 4; - out[0] = (w0)&mask; - out[1] = ((w0 >> 37) | (w1 << 27)) & mask; - out[2] = (w1 >> 10) & mask; - out[3] = ((w1 >> 47) | (w2 << 17)) & mask; - out[4] = (w2 >> 20) & mask; - out[5] = ((w2 >> 57) | (w3 << 7)) & mask; - out[6] = ((w3 >> 30) | (w4 << 34)) & mask; - out[7] = (w4 >> 3) & mask; - out[8] = ((w4 >> 40) | (w5 << 24)) & mask; - out[9] = (w5 >> 13) & mask; - out[10] = ((w5 >> 50) | (w6 << 14)) & mask; - out[11] = (w6 >> 23) & mask; - out[12] = ((w6 >> 60) | (w7 << 4)) & mask; - out[13] = ((w7 >> 33) | (w8 << 31)) & mask; - out[14] = (w8 >> 6) & mask; - out[15] = ((w8 >> 43) | (w9 << 21)) & mask; - out[16] = (w9 >> 16) & mask; - out[17] = ((w9 >> 53) | (w10 << 11)) & mask; - out[18] = (w10 >> 26) & mask; - out[19] = ((w10 >> 63) | (w11 << 1)) & mask; - out[20] = ((w11 >> 36) | (w12 << 28)) & mask; - out[21] = (w12 >> 9) & mask; - out[22] = ((w12 >> 46) | (w13 << 18)) & mask; - out[23] = (w13 >> 19) & mask; - out[24] = ((w13 >> 56) | (w14 << 8)) & mask; - out[25] = ((w14 >> 29) | (w15 << 35)) & mask; - out[26] = (w15 >> 2) & mask; - out[27] = ((w15 >> 39) | (w16 << 25)) & mask; - out[28] = (w16 >> 12) & mask; - out[29] = ((w16 >> 49) | (w17 << 15)) & mask; - out[30] = (w17 >> 22) & mask; - out[31] = ((w17 >> 59) | (w18 << 5)) & mask; - - return in; -} - -inline const uint8_t* unpack38_64(const uint8_t* in, uint64_t* out) { - const uint64_t mask = 274877906943ULL; - uint64_t w0 = util::SafeLoadAs(in); - w0 = arrow::bit_util::FromLittleEndian(w0); - in += 8; - uint64_t w1 = util::SafeLoadAs(in); - w1 = arrow::bit_util::FromLittleEndian(w1); - in += 8; - uint64_t w2 = util::SafeLoadAs(in); - w2 = arrow::bit_util::FromLittleEndian(w2); - in += 8; - uint64_t w3 = util::SafeLoadAs(in); - w3 = arrow::bit_util::FromLittleEndian(w3); - in += 8; - uint64_t w4 = util::SafeLoadAs(in); - w4 = arrow::bit_util::FromLittleEndian(w4); - in += 8; - uint64_t w5 = util::SafeLoadAs(in); - w5 = arrow::bit_util::FromLittleEndian(w5); - in += 8; - uint64_t w6 = util::SafeLoadAs(in); - w6 = arrow::bit_util::FromLittleEndian(w6); - in += 8; - uint64_t w7 = util::SafeLoadAs(in); - w7 = arrow::bit_util::FromLittleEndian(w7); - in += 8; - uint64_t w8 = util::SafeLoadAs(in); - w8 = arrow::bit_util::FromLittleEndian(w8); - in += 8; - uint64_t w9 = util::SafeLoadAs(in); - w9 = arrow::bit_util::FromLittleEndian(w9); - in += 8; - uint64_t w10 = util::SafeLoadAs(in); - w10 = arrow::bit_util::FromLittleEndian(w10); - in += 8; - uint64_t w11 = util::SafeLoadAs(in); - w11 = arrow::bit_util::FromLittleEndian(w11); - in += 8; - uint64_t w12 = util::SafeLoadAs(in); - w12 = arrow::bit_util::FromLittleEndian(w12); - in += 8; - uint64_t w13 = util::SafeLoadAs(in); - w13 = arrow::bit_util::FromLittleEndian(w13); - in += 8; - uint64_t w14 = util::SafeLoadAs(in); - w14 = arrow::bit_util::FromLittleEndian(w14); - in += 8; - uint64_t w15 = util::SafeLoadAs(in); - w15 = arrow::bit_util::FromLittleEndian(w15); - in += 8; - uint64_t w16 = util::SafeLoadAs(in); - w16 = arrow::bit_util::FromLittleEndian(w16); - in += 8; - uint64_t w17 = util::SafeLoadAs(in); - w17 = arrow::bit_util::FromLittleEndian(w17); - in += 8; - uint64_t w18 = util::SafeLoadAs(in); - w18 = arrow::bit_util::FromLittleEndian(w18); - in += 8; - out[0] = (w0)&mask; - out[1] = ((w0 >> 38) | (w1 << 26)) & mask; - out[2] = (w1 >> 12) & mask; - out[3] = ((w1 >> 50) | (w2 << 14)) & mask; - out[4] = (w2 >> 24) & mask; - out[5] = ((w2 >> 62) | (w3 << 2)) & mask; - out[6] = ((w3 >> 36) | (w4 << 28)) & mask; - out[7] = (w4 >> 10) & mask; - out[8] = ((w4 >> 48) | (w5 << 16)) & mask; - out[9] = (w5 >> 22) & mask; - out[10] = ((w5 >> 60) | (w6 << 4)) & mask; - out[11] = ((w6 >> 34) | (w7 << 30)) & mask; - out[12] = (w7 >> 8) & mask; - out[13] = ((w7 >> 46) | (w8 << 18)) & mask; - out[14] = (w8 >> 20) & mask; - out[15] = ((w8 >> 58) | (w9 << 6)) & mask; - out[16] = ((w9 >> 32) | (w10 << 32)) & mask; - out[17] = (w10 >> 6) & mask; - out[18] = ((w10 >> 44) | (w11 << 20)) & mask; - out[19] = (w11 >> 18) & mask; - out[20] = ((w11 >> 56) | (w12 << 8)) & mask; - out[21] = ((w12 >> 30) | (w13 << 34)) & mask; - out[22] = (w13 >> 4) & mask; - out[23] = ((w13 >> 42) | (w14 << 22)) & mask; - out[24] = (w14 >> 16) & mask; - out[25] = ((w14 >> 54) | (w15 << 10)) & mask; - out[26] = ((w15 >> 28) | (w16 << 36)) & mask; - out[27] = (w16 >> 2) & mask; - out[28] = ((w16 >> 40) | (w17 << 24)) & mask; - out[29] = (w17 >> 14) & mask; - out[30] = ((w17 >> 52) | (w18 << 12)) & mask; - out[31] = w18 >> 26; - - return in; -} - -inline const uint8_t* unpack39_64(const uint8_t* in, uint64_t* out) { - const uint64_t mask = 549755813887ULL; - uint64_t w0 = util::SafeLoadAs(in); - w0 = arrow::bit_util::FromLittleEndian(w0); - in += 8; - uint64_t w1 = util::SafeLoadAs(in); - w1 = arrow::bit_util::FromLittleEndian(w1); - in += 8; - uint64_t w2 = util::SafeLoadAs(in); - w2 = arrow::bit_util::FromLittleEndian(w2); - in += 8; - uint64_t w3 = util::SafeLoadAs(in); - w3 = arrow::bit_util::FromLittleEndian(w3); - in += 8; - uint64_t w4 = util::SafeLoadAs(in); - w4 = arrow::bit_util::FromLittleEndian(w4); - in += 8; - uint64_t w5 = util::SafeLoadAs(in); - w5 = arrow::bit_util::FromLittleEndian(w5); - in += 8; - uint64_t w6 = util::SafeLoadAs(in); - w6 = arrow::bit_util::FromLittleEndian(w6); - in += 8; - uint64_t w7 = util::SafeLoadAs(in); - w7 = arrow::bit_util::FromLittleEndian(w7); - in += 8; - uint64_t w8 = util::SafeLoadAs(in); - w8 = arrow::bit_util::FromLittleEndian(w8); - in += 8; - uint64_t w9 = util::SafeLoadAs(in); - w9 = arrow::bit_util::FromLittleEndian(w9); - in += 8; - uint64_t w10 = util::SafeLoadAs(in); - w10 = arrow::bit_util::FromLittleEndian(w10); - in += 8; - uint64_t w11 = util::SafeLoadAs(in); - w11 = arrow::bit_util::FromLittleEndian(w11); - in += 8; - uint64_t w12 = util::SafeLoadAs(in); - w12 = arrow::bit_util::FromLittleEndian(w12); - in += 8; - uint64_t w13 = util::SafeLoadAs(in); - w13 = arrow::bit_util::FromLittleEndian(w13); - in += 8; - uint64_t w14 = util::SafeLoadAs(in); - w14 = arrow::bit_util::FromLittleEndian(w14); - in += 8; - uint64_t w15 = util::SafeLoadAs(in); - w15 = arrow::bit_util::FromLittleEndian(w15); - in += 8; - uint64_t w16 = util::SafeLoadAs(in); - w16 = arrow::bit_util::FromLittleEndian(w16); - in += 8; - uint64_t w17 = util::SafeLoadAs(in); - w17 = arrow::bit_util::FromLittleEndian(w17); - in += 8; - uint64_t w18 = util::SafeLoadAs(in); - w18 = arrow::bit_util::FromLittleEndian(w18); - in += 8; - uint64_t w19 = util::SafeLoadAs(in); - w19 = arrow::bit_util::FromLittleEndian(w19); - in += 4; - out[0] = (w0)&mask; - out[1] = ((w0 >> 39) | (w1 << 25)) & mask; - out[2] = (w1 >> 14) & mask; - out[3] = ((w1 >> 53) | (w2 << 11)) & mask; - out[4] = ((w2 >> 28) | (w3 << 36)) & mask; - out[5] = (w3 >> 3) & mask; - out[6] = ((w3 >> 42) | (w4 << 22)) & mask; - out[7] = (w4 >> 17) & mask; - out[8] = ((w4 >> 56) | (w5 << 8)) & mask; - out[9] = ((w5 >> 31) | (w6 << 33)) & mask; - out[10] = (w6 >> 6) & mask; - out[11] = ((w6 >> 45) | (w7 << 19)) & mask; - out[12] = (w7 >> 20) & mask; - out[13] = ((w7 >> 59) | (w8 << 5)) & mask; - out[14] = ((w8 >> 34) | (w9 << 30)) & mask; - out[15] = (w9 >> 9) & mask; - out[16] = ((w9 >> 48) | (w10 << 16)) & mask; - out[17] = (w10 >> 23) & mask; - out[18] = ((w10 >> 62) | (w11 << 2)) & mask; - out[19] = ((w11 >> 37) | (w12 << 27)) & mask; - out[20] = (w12 >> 12) & mask; - out[21] = ((w12 >> 51) | (w13 << 13)) & mask; - out[22] = ((w13 >> 26) | (w14 << 38)) & mask; - out[23] = (w14 >> 1) & mask; - out[24] = ((w14 >> 40) | (w15 << 24)) & mask; - out[25] = (w15 >> 15) & mask; - out[26] = ((w15 >> 54) | (w16 << 10)) & mask; - out[27] = ((w16 >> 29) | (w17 << 35)) & mask; - out[28] = (w17 >> 4) & mask; - out[29] = ((w17 >> 43) | (w18 << 21)) & mask; - out[30] = (w18 >> 18) & mask; - out[31] = ((w18 >> 57) | (w19 << 7)) & mask; - - return in; -} - -inline const uint8_t* unpack40_64(const uint8_t* in, uint64_t* out) { - const uint64_t mask = 1099511627775ULL; - uint64_t w0 = util::SafeLoadAs(in); - w0 = arrow::bit_util::FromLittleEndian(w0); - in += 8; - uint64_t w1 = util::SafeLoadAs(in); - w1 = arrow::bit_util::FromLittleEndian(w1); - in += 8; - uint64_t w2 = util::SafeLoadAs(in); - w2 = arrow::bit_util::FromLittleEndian(w2); - in += 8; - uint64_t w3 = util::SafeLoadAs(in); - w3 = arrow::bit_util::FromLittleEndian(w3); - in += 8; - uint64_t w4 = util::SafeLoadAs(in); - w4 = arrow::bit_util::FromLittleEndian(w4); - in += 8; - uint64_t w5 = util::SafeLoadAs(in); - w5 = arrow::bit_util::FromLittleEndian(w5); - in += 8; - uint64_t w6 = util::SafeLoadAs(in); - w6 = arrow::bit_util::FromLittleEndian(w6); - in += 8; - uint64_t w7 = util::SafeLoadAs(in); - w7 = arrow::bit_util::FromLittleEndian(w7); - in += 8; - uint64_t w8 = util::SafeLoadAs(in); - w8 = arrow::bit_util::FromLittleEndian(w8); - in += 8; - uint64_t w9 = util::SafeLoadAs(in); - w9 = arrow::bit_util::FromLittleEndian(w9); - in += 8; - uint64_t w10 = util::SafeLoadAs(in); - w10 = arrow::bit_util::FromLittleEndian(w10); - in += 8; - uint64_t w11 = util::SafeLoadAs(in); - w11 = arrow::bit_util::FromLittleEndian(w11); - in += 8; - uint64_t w12 = util::SafeLoadAs(in); - w12 = arrow::bit_util::FromLittleEndian(w12); - in += 8; - uint64_t w13 = util::SafeLoadAs(in); - w13 = arrow::bit_util::FromLittleEndian(w13); - in += 8; - uint64_t w14 = util::SafeLoadAs(in); - w14 = arrow::bit_util::FromLittleEndian(w14); - in += 8; - uint64_t w15 = util::SafeLoadAs(in); - w15 = arrow::bit_util::FromLittleEndian(w15); - in += 8; - uint64_t w16 = util::SafeLoadAs(in); - w16 = arrow::bit_util::FromLittleEndian(w16); - in += 8; - uint64_t w17 = util::SafeLoadAs(in); - w17 = arrow::bit_util::FromLittleEndian(w17); - in += 8; - uint64_t w18 = util::SafeLoadAs(in); - w18 = arrow::bit_util::FromLittleEndian(w18); - in += 8; - uint64_t w19 = util::SafeLoadAs(in); - w19 = arrow::bit_util::FromLittleEndian(w19); - in += 8; - out[0] = (w0)&mask; - out[1] = ((w0 >> 40) | (w1 << 24)) & mask; - out[2] = (w1 >> 16) & mask; - out[3] = ((w1 >> 56) | (w2 << 8)) & mask; - out[4] = ((w2 >> 32) | (w3 << 32)) & mask; - out[5] = (w3 >> 8) & mask; - out[6] = ((w3 >> 48) | (w4 << 16)) & mask; - out[7] = w4 >> 24; - out[8] = (w5)&mask; - out[9] = ((w5 >> 40) | (w6 << 24)) & mask; - out[10] = (w6 >> 16) & mask; - out[11] = ((w6 >> 56) | (w7 << 8)) & mask; - out[12] = ((w7 >> 32) | (w8 << 32)) & mask; - out[13] = (w8 >> 8) & mask; - out[14] = ((w8 >> 48) | (w9 << 16)) & mask; - out[15] = w9 >> 24; - out[16] = (w10)&mask; - out[17] = ((w10 >> 40) | (w11 << 24)) & mask; - out[18] = (w11 >> 16) & mask; - out[19] = ((w11 >> 56) | (w12 << 8)) & mask; - out[20] = ((w12 >> 32) | (w13 << 32)) & mask; - out[21] = (w13 >> 8) & mask; - out[22] = ((w13 >> 48) | (w14 << 16)) & mask; - out[23] = w14 >> 24; - out[24] = (w15)&mask; - out[25] = ((w15 >> 40) | (w16 << 24)) & mask; - out[26] = (w16 >> 16) & mask; - out[27] = ((w16 >> 56) | (w17 << 8)) & mask; - out[28] = ((w17 >> 32) | (w18 << 32)) & mask; - out[29] = (w18 >> 8) & mask; - out[30] = ((w18 >> 48) | (w19 << 16)) & mask; - out[31] = w19 >> 24; - - return in; -} - -inline const uint8_t* unpack41_64(const uint8_t* in, uint64_t* out) { - const uint64_t mask = 2199023255551ULL; - uint64_t w0 = util::SafeLoadAs(in); - w0 = arrow::bit_util::FromLittleEndian(w0); - in += 8; - uint64_t w1 = util::SafeLoadAs(in); - w1 = arrow::bit_util::FromLittleEndian(w1); - in += 8; - uint64_t w2 = util::SafeLoadAs(in); - w2 = arrow::bit_util::FromLittleEndian(w2); - in += 8; - uint64_t w3 = util::SafeLoadAs(in); - w3 = arrow::bit_util::FromLittleEndian(w3); - in += 8; - uint64_t w4 = util::SafeLoadAs(in); - w4 = arrow::bit_util::FromLittleEndian(w4); - in += 8; - uint64_t w5 = util::SafeLoadAs(in); - w5 = arrow::bit_util::FromLittleEndian(w5); - in += 8; - uint64_t w6 = util::SafeLoadAs(in); - w6 = arrow::bit_util::FromLittleEndian(w6); - in += 8; - uint64_t w7 = util::SafeLoadAs(in); - w7 = arrow::bit_util::FromLittleEndian(w7); - in += 8; - uint64_t w8 = util::SafeLoadAs(in); - w8 = arrow::bit_util::FromLittleEndian(w8); - in += 8; - uint64_t w9 = util::SafeLoadAs(in); - w9 = arrow::bit_util::FromLittleEndian(w9); - in += 8; - uint64_t w10 = util::SafeLoadAs(in); - w10 = arrow::bit_util::FromLittleEndian(w10); - in += 8; - uint64_t w11 = util::SafeLoadAs(in); - w11 = arrow::bit_util::FromLittleEndian(w11); - in += 8; - uint64_t w12 = util::SafeLoadAs(in); - w12 = arrow::bit_util::FromLittleEndian(w12); - in += 8; - uint64_t w13 = util::SafeLoadAs(in); - w13 = arrow::bit_util::FromLittleEndian(w13); - in += 8; - uint64_t w14 = util::SafeLoadAs(in); - w14 = arrow::bit_util::FromLittleEndian(w14); - in += 8; - uint64_t w15 = util::SafeLoadAs(in); - w15 = arrow::bit_util::FromLittleEndian(w15); - in += 8; - uint64_t w16 = util::SafeLoadAs(in); - w16 = arrow::bit_util::FromLittleEndian(w16); - in += 8; - uint64_t w17 = util::SafeLoadAs(in); - w17 = arrow::bit_util::FromLittleEndian(w17); - in += 8; - uint64_t w18 = util::SafeLoadAs(in); - w18 = arrow::bit_util::FromLittleEndian(w18); - in += 8; - uint64_t w19 = util::SafeLoadAs(in); - w19 = arrow::bit_util::FromLittleEndian(w19); - in += 8; - uint64_t w20 = util::SafeLoadAs(in); - w20 = arrow::bit_util::FromLittleEndian(w20); - in += 4; - out[0] = (w0)&mask; - out[1] = ((w0 >> 41) | (w1 << 23)) & mask; - out[2] = (w1 >> 18) & mask; - out[3] = ((w1 >> 59) | (w2 << 5)) & mask; - out[4] = ((w2 >> 36) | (w3 << 28)) & mask; - out[5] = (w3 >> 13) & mask; - out[6] = ((w3 >> 54) | (w4 << 10)) & mask; - out[7] = ((w4 >> 31) | (w5 << 33)) & mask; - out[8] = (w5 >> 8) & mask; - out[9] = ((w5 >> 49) | (w6 << 15)) & mask; - out[10] = ((w6 >> 26) | (w7 << 38)) & mask; - out[11] = (w7 >> 3) & mask; - out[12] = ((w7 >> 44) | (w8 << 20)) & mask; - out[13] = (w8 >> 21) & mask; - out[14] = ((w8 >> 62) | (w9 << 2)) & mask; - out[15] = ((w9 >> 39) | (w10 << 25)) & mask; - out[16] = (w10 >> 16) & mask; - out[17] = ((w10 >> 57) | (w11 << 7)) & mask; - out[18] = ((w11 >> 34) | (w12 << 30)) & mask; - out[19] = (w12 >> 11) & mask; - out[20] = ((w12 >> 52) | (w13 << 12)) & mask; - out[21] = ((w13 >> 29) | (w14 << 35)) & mask; - out[22] = (w14 >> 6) & mask; - out[23] = ((w14 >> 47) | (w15 << 17)) & mask; - out[24] = ((w15 >> 24) | (w16 << 40)) & mask; - out[25] = (w16 >> 1) & mask; - out[26] = ((w16 >> 42) | (w17 << 22)) & mask; - out[27] = (w17 >> 19) & mask; - out[28] = ((w17 >> 60) | (w18 << 4)) & mask; - out[29] = ((w18 >> 37) | (w19 << 27)) & mask; - out[30] = (w19 >> 14) & mask; - out[31] = ((w19 >> 55) | (w20 << 9)) & mask; - - return in; -} - -inline const uint8_t* unpack42_64(const uint8_t* in, uint64_t* out) { - const uint64_t mask = 4398046511103ULL; - uint64_t w0 = util::SafeLoadAs(in); - w0 = arrow::bit_util::FromLittleEndian(w0); - in += 8; - uint64_t w1 = util::SafeLoadAs(in); - w1 = arrow::bit_util::FromLittleEndian(w1); - in += 8; - uint64_t w2 = util::SafeLoadAs(in); - w2 = arrow::bit_util::FromLittleEndian(w2); - in += 8; - uint64_t w3 = util::SafeLoadAs(in); - w3 = arrow::bit_util::FromLittleEndian(w3); - in += 8; - uint64_t w4 = util::SafeLoadAs(in); - w4 = arrow::bit_util::FromLittleEndian(w4); - in += 8; - uint64_t w5 = util::SafeLoadAs(in); - w5 = arrow::bit_util::FromLittleEndian(w5); - in += 8; - uint64_t w6 = util::SafeLoadAs(in); - w6 = arrow::bit_util::FromLittleEndian(w6); - in += 8; - uint64_t w7 = util::SafeLoadAs(in); - w7 = arrow::bit_util::FromLittleEndian(w7); - in += 8; - uint64_t w8 = util::SafeLoadAs(in); - w8 = arrow::bit_util::FromLittleEndian(w8); - in += 8; - uint64_t w9 = util::SafeLoadAs(in); - w9 = arrow::bit_util::FromLittleEndian(w9); - in += 8; - uint64_t w10 = util::SafeLoadAs(in); - w10 = arrow::bit_util::FromLittleEndian(w10); - in += 8; - uint64_t w11 = util::SafeLoadAs(in); - w11 = arrow::bit_util::FromLittleEndian(w11); - in += 8; - uint64_t w12 = util::SafeLoadAs(in); - w12 = arrow::bit_util::FromLittleEndian(w12); - in += 8; - uint64_t w13 = util::SafeLoadAs(in); - w13 = arrow::bit_util::FromLittleEndian(w13); - in += 8; - uint64_t w14 = util::SafeLoadAs(in); - w14 = arrow::bit_util::FromLittleEndian(w14); - in += 8; - uint64_t w15 = util::SafeLoadAs(in); - w15 = arrow::bit_util::FromLittleEndian(w15); - in += 8; - uint64_t w16 = util::SafeLoadAs(in); - w16 = arrow::bit_util::FromLittleEndian(w16); - in += 8; - uint64_t w17 = util::SafeLoadAs(in); - w17 = arrow::bit_util::FromLittleEndian(w17); - in += 8; - uint64_t w18 = util::SafeLoadAs(in); - w18 = arrow::bit_util::FromLittleEndian(w18); - in += 8; - uint64_t w19 = util::SafeLoadAs(in); - w19 = arrow::bit_util::FromLittleEndian(w19); - in += 8; - uint64_t w20 = util::SafeLoadAs(in); - w20 = arrow::bit_util::FromLittleEndian(w20); - in += 8; - out[0] = (w0)&mask; - out[1] = ((w0 >> 42) | (w1 << 22)) & mask; - out[2] = (w1 >> 20) & mask; - out[3] = ((w1 >> 62) | (w2 << 2)) & mask; - out[4] = ((w2 >> 40) | (w3 << 24)) & mask; - out[5] = (w3 >> 18) & mask; - out[6] = ((w3 >> 60) | (w4 << 4)) & mask; - out[7] = ((w4 >> 38) | (w5 << 26)) & mask; - out[8] = (w5 >> 16) & mask; - out[9] = ((w5 >> 58) | (w6 << 6)) & mask; - out[10] = ((w6 >> 36) | (w7 << 28)) & mask; - out[11] = (w7 >> 14) & mask; - out[12] = ((w7 >> 56) | (w8 << 8)) & mask; - out[13] = ((w8 >> 34) | (w9 << 30)) & mask; - out[14] = (w9 >> 12) & mask; - out[15] = ((w9 >> 54) | (w10 << 10)) & mask; - out[16] = ((w10 >> 32) | (w11 << 32)) & mask; - out[17] = (w11 >> 10) & mask; - out[18] = ((w11 >> 52) | (w12 << 12)) & mask; - out[19] = ((w12 >> 30) | (w13 << 34)) & mask; - out[20] = (w13 >> 8) & mask; - out[21] = ((w13 >> 50) | (w14 << 14)) & mask; - out[22] = ((w14 >> 28) | (w15 << 36)) & mask; - out[23] = (w15 >> 6) & mask; - out[24] = ((w15 >> 48) | (w16 << 16)) & mask; - out[25] = ((w16 >> 26) | (w17 << 38)) & mask; - out[26] = (w17 >> 4) & mask; - out[27] = ((w17 >> 46) | (w18 << 18)) & mask; - out[28] = ((w18 >> 24) | (w19 << 40)) & mask; - out[29] = (w19 >> 2) & mask; - out[30] = ((w19 >> 44) | (w20 << 20)) & mask; - out[31] = w20 >> 22; - - return in; -} - -inline const uint8_t* unpack43_64(const uint8_t* in, uint64_t* out) { - const uint64_t mask = 8796093022207ULL; - uint64_t w0 = util::SafeLoadAs(in); - w0 = arrow::bit_util::FromLittleEndian(w0); - in += 8; - uint64_t w1 = util::SafeLoadAs(in); - w1 = arrow::bit_util::FromLittleEndian(w1); - in += 8; - uint64_t w2 = util::SafeLoadAs(in); - w2 = arrow::bit_util::FromLittleEndian(w2); - in += 8; - uint64_t w3 = util::SafeLoadAs(in); - w3 = arrow::bit_util::FromLittleEndian(w3); - in += 8; - uint64_t w4 = util::SafeLoadAs(in); - w4 = arrow::bit_util::FromLittleEndian(w4); - in += 8; - uint64_t w5 = util::SafeLoadAs(in); - w5 = arrow::bit_util::FromLittleEndian(w5); - in += 8; - uint64_t w6 = util::SafeLoadAs(in); - w6 = arrow::bit_util::FromLittleEndian(w6); - in += 8; - uint64_t w7 = util::SafeLoadAs(in); - w7 = arrow::bit_util::FromLittleEndian(w7); - in += 8; - uint64_t w8 = util::SafeLoadAs(in); - w8 = arrow::bit_util::FromLittleEndian(w8); - in += 8; - uint64_t w9 = util::SafeLoadAs(in); - w9 = arrow::bit_util::FromLittleEndian(w9); - in += 8; - uint64_t w10 = util::SafeLoadAs(in); - w10 = arrow::bit_util::FromLittleEndian(w10); - in += 8; - uint64_t w11 = util::SafeLoadAs(in); - w11 = arrow::bit_util::FromLittleEndian(w11); - in += 8; - uint64_t w12 = util::SafeLoadAs(in); - w12 = arrow::bit_util::FromLittleEndian(w12); - in += 8; - uint64_t w13 = util::SafeLoadAs(in); - w13 = arrow::bit_util::FromLittleEndian(w13); - in += 8; - uint64_t w14 = util::SafeLoadAs(in); - w14 = arrow::bit_util::FromLittleEndian(w14); - in += 8; - uint64_t w15 = util::SafeLoadAs(in); - w15 = arrow::bit_util::FromLittleEndian(w15); - in += 8; - uint64_t w16 = util::SafeLoadAs(in); - w16 = arrow::bit_util::FromLittleEndian(w16); - in += 8; - uint64_t w17 = util::SafeLoadAs(in); - w17 = arrow::bit_util::FromLittleEndian(w17); - in += 8; - uint64_t w18 = util::SafeLoadAs(in); - w18 = arrow::bit_util::FromLittleEndian(w18); - in += 8; - uint64_t w19 = util::SafeLoadAs(in); - w19 = arrow::bit_util::FromLittleEndian(w19); - in += 8; - uint64_t w20 = util::SafeLoadAs(in); - w20 = arrow::bit_util::FromLittleEndian(w20); - in += 8; - uint64_t w21 = util::SafeLoadAs(in); - w21 = arrow::bit_util::FromLittleEndian(w21); - in += 4; - out[0] = (w0)&mask; - out[1] = ((w0 >> 43) | (w1 << 21)) & mask; - out[2] = ((w1 >> 22) | (w2 << 42)) & mask; - out[3] = (w2 >> 1) & mask; - out[4] = ((w2 >> 44) | (w3 << 20)) & mask; - out[5] = ((w3 >> 23) | (w4 << 41)) & mask; - out[6] = (w4 >> 2) & mask; - out[7] = ((w4 >> 45) | (w5 << 19)) & mask; - out[8] = ((w5 >> 24) | (w6 << 40)) & mask; - out[9] = (w6 >> 3) & mask; - out[10] = ((w6 >> 46) | (w7 << 18)) & mask; - out[11] = ((w7 >> 25) | (w8 << 39)) & mask; - out[12] = (w8 >> 4) & mask; - out[13] = ((w8 >> 47) | (w9 << 17)) & mask; - out[14] = ((w9 >> 26) | (w10 << 38)) & mask; - out[15] = (w10 >> 5) & mask; - out[16] = ((w10 >> 48) | (w11 << 16)) & mask; - out[17] = ((w11 >> 27) | (w12 << 37)) & mask; - out[18] = (w12 >> 6) & mask; - out[19] = ((w12 >> 49) | (w13 << 15)) & mask; - out[20] = ((w13 >> 28) | (w14 << 36)) & mask; - out[21] = (w14 >> 7) & mask; - out[22] = ((w14 >> 50) | (w15 << 14)) & mask; - out[23] = ((w15 >> 29) | (w16 << 35)) & mask; - out[24] = (w16 >> 8) & mask; - out[25] = ((w16 >> 51) | (w17 << 13)) & mask; - out[26] = ((w17 >> 30) | (w18 << 34)) & mask; - out[27] = (w18 >> 9) & mask; - out[28] = ((w18 >> 52) | (w19 << 12)) & mask; - out[29] = ((w19 >> 31) | (w20 << 33)) & mask; - out[30] = (w20 >> 10) & mask; - out[31] = ((w20 >> 53) | (w21 << 11)) & mask; - - return in; -} - -inline const uint8_t* unpack44_64(const uint8_t* in, uint64_t* out) { - const uint64_t mask = 17592186044415ULL; - uint64_t w0 = util::SafeLoadAs(in); - w0 = arrow::bit_util::FromLittleEndian(w0); - in += 8; - uint64_t w1 = util::SafeLoadAs(in); - w1 = arrow::bit_util::FromLittleEndian(w1); - in += 8; - uint64_t w2 = util::SafeLoadAs(in); - w2 = arrow::bit_util::FromLittleEndian(w2); - in += 8; - uint64_t w3 = util::SafeLoadAs(in); - w3 = arrow::bit_util::FromLittleEndian(w3); - in += 8; - uint64_t w4 = util::SafeLoadAs(in); - w4 = arrow::bit_util::FromLittleEndian(w4); - in += 8; - uint64_t w5 = util::SafeLoadAs(in); - w5 = arrow::bit_util::FromLittleEndian(w5); - in += 8; - uint64_t w6 = util::SafeLoadAs(in); - w6 = arrow::bit_util::FromLittleEndian(w6); - in += 8; - uint64_t w7 = util::SafeLoadAs(in); - w7 = arrow::bit_util::FromLittleEndian(w7); - in += 8; - uint64_t w8 = util::SafeLoadAs(in); - w8 = arrow::bit_util::FromLittleEndian(w8); - in += 8; - uint64_t w9 = util::SafeLoadAs(in); - w9 = arrow::bit_util::FromLittleEndian(w9); - in += 8; - uint64_t w10 = util::SafeLoadAs(in); - w10 = arrow::bit_util::FromLittleEndian(w10); - in += 8; - uint64_t w11 = util::SafeLoadAs(in); - w11 = arrow::bit_util::FromLittleEndian(w11); - in += 8; - uint64_t w12 = util::SafeLoadAs(in); - w12 = arrow::bit_util::FromLittleEndian(w12); - in += 8; - uint64_t w13 = util::SafeLoadAs(in); - w13 = arrow::bit_util::FromLittleEndian(w13); - in += 8; - uint64_t w14 = util::SafeLoadAs(in); - w14 = arrow::bit_util::FromLittleEndian(w14); - in += 8; - uint64_t w15 = util::SafeLoadAs(in); - w15 = arrow::bit_util::FromLittleEndian(w15); - in += 8; - uint64_t w16 = util::SafeLoadAs(in); - w16 = arrow::bit_util::FromLittleEndian(w16); - in += 8; - uint64_t w17 = util::SafeLoadAs(in); - w17 = arrow::bit_util::FromLittleEndian(w17); - in += 8; - uint64_t w18 = util::SafeLoadAs(in); - w18 = arrow::bit_util::FromLittleEndian(w18); - in += 8; - uint64_t w19 = util::SafeLoadAs(in); - w19 = arrow::bit_util::FromLittleEndian(w19); - in += 8; - uint64_t w20 = util::SafeLoadAs(in); - w20 = arrow::bit_util::FromLittleEndian(w20); - in += 8; - uint64_t w21 = util::SafeLoadAs(in); - w21 = arrow::bit_util::FromLittleEndian(w21); - in += 8; - out[0] = (w0)&mask; - out[1] = ((w0 >> 44) | (w1 << 20)) & mask; - out[2] = ((w1 >> 24) | (w2 << 40)) & mask; - out[3] = (w2 >> 4) & mask; - out[4] = ((w2 >> 48) | (w3 << 16)) & mask; - out[5] = ((w3 >> 28) | (w4 << 36)) & mask; - out[6] = (w4 >> 8) & mask; - out[7] = ((w4 >> 52) | (w5 << 12)) & mask; - out[8] = ((w5 >> 32) | (w6 << 32)) & mask; - out[9] = (w6 >> 12) & mask; - out[10] = ((w6 >> 56) | (w7 << 8)) & mask; - out[11] = ((w7 >> 36) | (w8 << 28)) & mask; - out[12] = (w8 >> 16) & mask; - out[13] = ((w8 >> 60) | (w9 << 4)) & mask; - out[14] = ((w9 >> 40) | (w10 << 24)) & mask; - out[15] = w10 >> 20; - out[16] = (w11)&mask; - out[17] = ((w11 >> 44) | (w12 << 20)) & mask; - out[18] = ((w12 >> 24) | (w13 << 40)) & mask; - out[19] = (w13 >> 4) & mask; - out[20] = ((w13 >> 48) | (w14 << 16)) & mask; - out[21] = ((w14 >> 28) | (w15 << 36)) & mask; - out[22] = (w15 >> 8) & mask; - out[23] = ((w15 >> 52) | (w16 << 12)) & mask; - out[24] = ((w16 >> 32) | (w17 << 32)) & mask; - out[25] = (w17 >> 12) & mask; - out[26] = ((w17 >> 56) | (w18 << 8)) & mask; - out[27] = ((w18 >> 36) | (w19 << 28)) & mask; - out[28] = (w19 >> 16) & mask; - out[29] = ((w19 >> 60) | (w20 << 4)) & mask; - out[30] = ((w20 >> 40) | (w21 << 24)) & mask; - out[31] = w21 >> 20; - - return in; -} - -inline const uint8_t* unpack45_64(const uint8_t* in, uint64_t* out) { - const uint64_t mask = 35184372088831ULL; - uint64_t w0 = util::SafeLoadAs(in); - w0 = arrow::bit_util::FromLittleEndian(w0); - in += 8; - uint64_t w1 = util::SafeLoadAs(in); - w1 = arrow::bit_util::FromLittleEndian(w1); - in += 8; - uint64_t w2 = util::SafeLoadAs(in); - w2 = arrow::bit_util::FromLittleEndian(w2); - in += 8; - uint64_t w3 = util::SafeLoadAs(in); - w3 = arrow::bit_util::FromLittleEndian(w3); - in += 8; - uint64_t w4 = util::SafeLoadAs(in); - w4 = arrow::bit_util::FromLittleEndian(w4); - in += 8; - uint64_t w5 = util::SafeLoadAs(in); - w5 = arrow::bit_util::FromLittleEndian(w5); - in += 8; - uint64_t w6 = util::SafeLoadAs(in); - w6 = arrow::bit_util::FromLittleEndian(w6); - in += 8; - uint64_t w7 = util::SafeLoadAs(in); - w7 = arrow::bit_util::FromLittleEndian(w7); - in += 8; - uint64_t w8 = util::SafeLoadAs(in); - w8 = arrow::bit_util::FromLittleEndian(w8); - in += 8; - uint64_t w9 = util::SafeLoadAs(in); - w9 = arrow::bit_util::FromLittleEndian(w9); - in += 8; - uint64_t w10 = util::SafeLoadAs(in); - w10 = arrow::bit_util::FromLittleEndian(w10); - in += 8; - uint64_t w11 = util::SafeLoadAs(in); - w11 = arrow::bit_util::FromLittleEndian(w11); - in += 8; - uint64_t w12 = util::SafeLoadAs(in); - w12 = arrow::bit_util::FromLittleEndian(w12); - in += 8; - uint64_t w13 = util::SafeLoadAs(in); - w13 = arrow::bit_util::FromLittleEndian(w13); - in += 8; - uint64_t w14 = util::SafeLoadAs(in); - w14 = arrow::bit_util::FromLittleEndian(w14); - in += 8; - uint64_t w15 = util::SafeLoadAs(in); - w15 = arrow::bit_util::FromLittleEndian(w15); - in += 8; - uint64_t w16 = util::SafeLoadAs(in); - w16 = arrow::bit_util::FromLittleEndian(w16); - in += 8; - uint64_t w17 = util::SafeLoadAs(in); - w17 = arrow::bit_util::FromLittleEndian(w17); - in += 8; - uint64_t w18 = util::SafeLoadAs(in); - w18 = arrow::bit_util::FromLittleEndian(w18); - in += 8; - uint64_t w19 = util::SafeLoadAs(in); - w19 = arrow::bit_util::FromLittleEndian(w19); - in += 8; - uint64_t w20 = util::SafeLoadAs(in); - w20 = arrow::bit_util::FromLittleEndian(w20); - in += 8; - uint64_t w21 = util::SafeLoadAs(in); - w21 = arrow::bit_util::FromLittleEndian(w21); - in += 8; - uint64_t w22 = util::SafeLoadAs(in); - w22 = arrow::bit_util::FromLittleEndian(w22); - in += 4; - out[0] = (w0)&mask; - out[1] = ((w0 >> 45) | (w1 << 19)) & mask; - out[2] = ((w1 >> 26) | (w2 << 38)) & mask; - out[3] = (w2 >> 7) & mask; - out[4] = ((w2 >> 52) | (w3 << 12)) & mask; - out[5] = ((w3 >> 33) | (w4 << 31)) & mask; - out[6] = (w4 >> 14) & mask; - out[7] = ((w4 >> 59) | (w5 << 5)) & mask; - out[8] = ((w5 >> 40) | (w6 << 24)) & mask; - out[9] = ((w6 >> 21) | (w7 << 43)) & mask; - out[10] = (w7 >> 2) & mask; - out[11] = ((w7 >> 47) | (w8 << 17)) & mask; - out[12] = ((w8 >> 28) | (w9 << 36)) & mask; - out[13] = (w9 >> 9) & mask; - out[14] = ((w9 >> 54) | (w10 << 10)) & mask; - out[15] = ((w10 >> 35) | (w11 << 29)) & mask; - out[16] = (w11 >> 16) & mask; - out[17] = ((w11 >> 61) | (w12 << 3)) & mask; - out[18] = ((w12 >> 42) | (w13 << 22)) & mask; - out[19] = ((w13 >> 23) | (w14 << 41)) & mask; - out[20] = (w14 >> 4) & mask; - out[21] = ((w14 >> 49) | (w15 << 15)) & mask; - out[22] = ((w15 >> 30) | (w16 << 34)) & mask; - out[23] = (w16 >> 11) & mask; - out[24] = ((w16 >> 56) | (w17 << 8)) & mask; - out[25] = ((w17 >> 37) | (w18 << 27)) & mask; - out[26] = (w18 >> 18) & mask; - out[27] = ((w18 >> 63) | (w19 << 1)) & mask; - out[28] = ((w19 >> 44) | (w20 << 20)) & mask; - out[29] = ((w20 >> 25) | (w21 << 39)) & mask; - out[30] = (w21 >> 6) & mask; - out[31] = ((w21 >> 51) | (w22 << 13)) & mask; - - return in; -} - -inline const uint8_t* unpack46_64(const uint8_t* in, uint64_t* out) { - const uint64_t mask = 70368744177663ULL; - uint64_t w0 = util::SafeLoadAs(in); - w0 = arrow::bit_util::FromLittleEndian(w0); - in += 8; - uint64_t w1 = util::SafeLoadAs(in); - w1 = arrow::bit_util::FromLittleEndian(w1); - in += 8; - uint64_t w2 = util::SafeLoadAs(in); - w2 = arrow::bit_util::FromLittleEndian(w2); - in += 8; - uint64_t w3 = util::SafeLoadAs(in); - w3 = arrow::bit_util::FromLittleEndian(w3); - in += 8; - uint64_t w4 = util::SafeLoadAs(in); - w4 = arrow::bit_util::FromLittleEndian(w4); - in += 8; - uint64_t w5 = util::SafeLoadAs(in); - w5 = arrow::bit_util::FromLittleEndian(w5); - in += 8; - uint64_t w6 = util::SafeLoadAs(in); - w6 = arrow::bit_util::FromLittleEndian(w6); - in += 8; - uint64_t w7 = util::SafeLoadAs(in); - w7 = arrow::bit_util::FromLittleEndian(w7); - in += 8; - uint64_t w8 = util::SafeLoadAs(in); - w8 = arrow::bit_util::FromLittleEndian(w8); - in += 8; - uint64_t w9 = util::SafeLoadAs(in); - w9 = arrow::bit_util::FromLittleEndian(w9); - in += 8; - uint64_t w10 = util::SafeLoadAs(in); - w10 = arrow::bit_util::FromLittleEndian(w10); - in += 8; - uint64_t w11 = util::SafeLoadAs(in); - w11 = arrow::bit_util::FromLittleEndian(w11); - in += 8; - uint64_t w12 = util::SafeLoadAs(in); - w12 = arrow::bit_util::FromLittleEndian(w12); - in += 8; - uint64_t w13 = util::SafeLoadAs(in); - w13 = arrow::bit_util::FromLittleEndian(w13); - in += 8; - uint64_t w14 = util::SafeLoadAs(in); - w14 = arrow::bit_util::FromLittleEndian(w14); - in += 8; - uint64_t w15 = util::SafeLoadAs(in); - w15 = arrow::bit_util::FromLittleEndian(w15); - in += 8; - uint64_t w16 = util::SafeLoadAs(in); - w16 = arrow::bit_util::FromLittleEndian(w16); - in += 8; - uint64_t w17 = util::SafeLoadAs(in); - w17 = arrow::bit_util::FromLittleEndian(w17); - in += 8; - uint64_t w18 = util::SafeLoadAs(in); - w18 = arrow::bit_util::FromLittleEndian(w18); - in += 8; - uint64_t w19 = util::SafeLoadAs(in); - w19 = arrow::bit_util::FromLittleEndian(w19); - in += 8; - uint64_t w20 = util::SafeLoadAs(in); - w20 = arrow::bit_util::FromLittleEndian(w20); - in += 8; - uint64_t w21 = util::SafeLoadAs(in); - w21 = arrow::bit_util::FromLittleEndian(w21); - in += 8; - uint64_t w22 = util::SafeLoadAs(in); - w22 = arrow::bit_util::FromLittleEndian(w22); - in += 8; - out[0] = (w0)&mask; - out[1] = ((w0 >> 46) | (w1 << 18)) & mask; - out[2] = ((w1 >> 28) | (w2 << 36)) & mask; - out[3] = (w2 >> 10) & mask; - out[4] = ((w2 >> 56) | (w3 << 8)) & mask; - out[5] = ((w3 >> 38) | (w4 << 26)) & mask; - out[6] = ((w4 >> 20) | (w5 << 44)) & mask; - out[7] = (w5 >> 2) & mask; - out[8] = ((w5 >> 48) | (w6 << 16)) & mask; - out[9] = ((w6 >> 30) | (w7 << 34)) & mask; - out[10] = (w7 >> 12) & mask; - out[11] = ((w7 >> 58) | (w8 << 6)) & mask; - out[12] = ((w8 >> 40) | (w9 << 24)) & mask; - out[13] = ((w9 >> 22) | (w10 << 42)) & mask; - out[14] = (w10 >> 4) & mask; - out[15] = ((w10 >> 50) | (w11 << 14)) & mask; - out[16] = ((w11 >> 32) | (w12 << 32)) & mask; - out[17] = (w12 >> 14) & mask; - out[18] = ((w12 >> 60) | (w13 << 4)) & mask; - out[19] = ((w13 >> 42) | (w14 << 22)) & mask; - out[20] = ((w14 >> 24) | (w15 << 40)) & mask; - out[21] = (w15 >> 6) & mask; - out[22] = ((w15 >> 52) | (w16 << 12)) & mask; - out[23] = ((w16 >> 34) | (w17 << 30)) & mask; - out[24] = (w17 >> 16) & mask; - out[25] = ((w17 >> 62) | (w18 << 2)) & mask; - out[26] = ((w18 >> 44) | (w19 << 20)) & mask; - out[27] = ((w19 >> 26) | (w20 << 38)) & mask; - out[28] = (w20 >> 8) & mask; - out[29] = ((w20 >> 54) | (w21 << 10)) & mask; - out[30] = ((w21 >> 36) | (w22 << 28)) & mask; - out[31] = w22 >> 18; - - return in; -} - -inline const uint8_t* unpack47_64(const uint8_t* in, uint64_t* out) { - const uint64_t mask = 140737488355327ULL; - uint64_t w0 = util::SafeLoadAs(in); - w0 = arrow::bit_util::FromLittleEndian(w0); - in += 8; - uint64_t w1 = util::SafeLoadAs(in); - w1 = arrow::bit_util::FromLittleEndian(w1); - in += 8; - uint64_t w2 = util::SafeLoadAs(in); - w2 = arrow::bit_util::FromLittleEndian(w2); - in += 8; - uint64_t w3 = util::SafeLoadAs(in); - w3 = arrow::bit_util::FromLittleEndian(w3); - in += 8; - uint64_t w4 = util::SafeLoadAs(in); - w4 = arrow::bit_util::FromLittleEndian(w4); - in += 8; - uint64_t w5 = util::SafeLoadAs(in); - w5 = arrow::bit_util::FromLittleEndian(w5); - in += 8; - uint64_t w6 = util::SafeLoadAs(in); - w6 = arrow::bit_util::FromLittleEndian(w6); - in += 8; - uint64_t w7 = util::SafeLoadAs(in); - w7 = arrow::bit_util::FromLittleEndian(w7); - in += 8; - uint64_t w8 = util::SafeLoadAs(in); - w8 = arrow::bit_util::FromLittleEndian(w8); - in += 8; - uint64_t w9 = util::SafeLoadAs(in); - w9 = arrow::bit_util::FromLittleEndian(w9); - in += 8; - uint64_t w10 = util::SafeLoadAs(in); - w10 = arrow::bit_util::FromLittleEndian(w10); - in += 8; - uint64_t w11 = util::SafeLoadAs(in); - w11 = arrow::bit_util::FromLittleEndian(w11); - in += 8; - uint64_t w12 = util::SafeLoadAs(in); - w12 = arrow::bit_util::FromLittleEndian(w12); - in += 8; - uint64_t w13 = util::SafeLoadAs(in); - w13 = arrow::bit_util::FromLittleEndian(w13); - in += 8; - uint64_t w14 = util::SafeLoadAs(in); - w14 = arrow::bit_util::FromLittleEndian(w14); - in += 8; - uint64_t w15 = util::SafeLoadAs(in); - w15 = arrow::bit_util::FromLittleEndian(w15); - in += 8; - uint64_t w16 = util::SafeLoadAs(in); - w16 = arrow::bit_util::FromLittleEndian(w16); - in += 8; - uint64_t w17 = util::SafeLoadAs(in); - w17 = arrow::bit_util::FromLittleEndian(w17); - in += 8; - uint64_t w18 = util::SafeLoadAs(in); - w18 = arrow::bit_util::FromLittleEndian(w18); - in += 8; - uint64_t w19 = util::SafeLoadAs(in); - w19 = arrow::bit_util::FromLittleEndian(w19); - in += 8; - uint64_t w20 = util::SafeLoadAs(in); - w20 = arrow::bit_util::FromLittleEndian(w20); - in += 8; - uint64_t w21 = util::SafeLoadAs(in); - w21 = arrow::bit_util::FromLittleEndian(w21); - in += 8; - uint64_t w22 = util::SafeLoadAs(in); - w22 = arrow::bit_util::FromLittleEndian(w22); - in += 8; - uint64_t w23 = util::SafeLoadAs(in); - w23 = arrow::bit_util::FromLittleEndian(w23); - in += 4; - out[0] = (w0)&mask; - out[1] = ((w0 >> 47) | (w1 << 17)) & mask; - out[2] = ((w1 >> 30) | (w2 << 34)) & mask; - out[3] = (w2 >> 13) & mask; - out[4] = ((w2 >> 60) | (w3 << 4)) & mask; - out[5] = ((w3 >> 43) | (w4 << 21)) & mask; - out[6] = ((w4 >> 26) | (w5 << 38)) & mask; - out[7] = (w5 >> 9) & mask; - out[8] = ((w5 >> 56) | (w6 << 8)) & mask; - out[9] = ((w6 >> 39) | (w7 << 25)) & mask; - out[10] = ((w7 >> 22) | (w8 << 42)) & mask; - out[11] = (w8 >> 5) & mask; - out[12] = ((w8 >> 52) | (w9 << 12)) & mask; - out[13] = ((w9 >> 35) | (w10 << 29)) & mask; - out[14] = ((w10 >> 18) | (w11 << 46)) & mask; - out[15] = (w11 >> 1) & mask; - out[16] = ((w11 >> 48) | (w12 << 16)) & mask; - out[17] = ((w12 >> 31) | (w13 << 33)) & mask; - out[18] = (w13 >> 14) & mask; - out[19] = ((w13 >> 61) | (w14 << 3)) & mask; - out[20] = ((w14 >> 44) | (w15 << 20)) & mask; - out[21] = ((w15 >> 27) | (w16 << 37)) & mask; - out[22] = (w16 >> 10) & mask; - out[23] = ((w16 >> 57) | (w17 << 7)) & mask; - out[24] = ((w17 >> 40) | (w18 << 24)) & mask; - out[25] = ((w18 >> 23) | (w19 << 41)) & mask; - out[26] = (w19 >> 6) & mask; - out[27] = ((w19 >> 53) | (w20 << 11)) & mask; - out[28] = ((w20 >> 36) | (w21 << 28)) & mask; - out[29] = ((w21 >> 19) | (w22 << 45)) & mask; - out[30] = (w22 >> 2) & mask; - out[31] = ((w22 >> 49) | (w23 << 15)) & mask; - - return in; -} - -inline const uint8_t* unpack48_64(const uint8_t* in, uint64_t* out) { - const uint64_t mask = 281474976710655ULL; - uint64_t w0 = util::SafeLoadAs(in); - w0 = arrow::bit_util::FromLittleEndian(w0); - in += 8; - uint64_t w1 = util::SafeLoadAs(in); - w1 = arrow::bit_util::FromLittleEndian(w1); - in += 8; - uint64_t w2 = util::SafeLoadAs(in); - w2 = arrow::bit_util::FromLittleEndian(w2); - in += 8; - uint64_t w3 = util::SafeLoadAs(in); - w3 = arrow::bit_util::FromLittleEndian(w3); - in += 8; - uint64_t w4 = util::SafeLoadAs(in); - w4 = arrow::bit_util::FromLittleEndian(w4); - in += 8; - uint64_t w5 = util::SafeLoadAs(in); - w5 = arrow::bit_util::FromLittleEndian(w5); - in += 8; - uint64_t w6 = util::SafeLoadAs(in); - w6 = arrow::bit_util::FromLittleEndian(w6); - in += 8; - uint64_t w7 = util::SafeLoadAs(in); - w7 = arrow::bit_util::FromLittleEndian(w7); - in += 8; - uint64_t w8 = util::SafeLoadAs(in); - w8 = arrow::bit_util::FromLittleEndian(w8); - in += 8; - uint64_t w9 = util::SafeLoadAs(in); - w9 = arrow::bit_util::FromLittleEndian(w9); - in += 8; - uint64_t w10 = util::SafeLoadAs(in); - w10 = arrow::bit_util::FromLittleEndian(w10); - in += 8; - uint64_t w11 = util::SafeLoadAs(in); - w11 = arrow::bit_util::FromLittleEndian(w11); - in += 8; - uint64_t w12 = util::SafeLoadAs(in); - w12 = arrow::bit_util::FromLittleEndian(w12); - in += 8; - uint64_t w13 = util::SafeLoadAs(in); - w13 = arrow::bit_util::FromLittleEndian(w13); - in += 8; - uint64_t w14 = util::SafeLoadAs(in); - w14 = arrow::bit_util::FromLittleEndian(w14); - in += 8; - uint64_t w15 = util::SafeLoadAs(in); - w15 = arrow::bit_util::FromLittleEndian(w15); - in += 8; - uint64_t w16 = util::SafeLoadAs(in); - w16 = arrow::bit_util::FromLittleEndian(w16); - in += 8; - uint64_t w17 = util::SafeLoadAs(in); - w17 = arrow::bit_util::FromLittleEndian(w17); - in += 8; - uint64_t w18 = util::SafeLoadAs(in); - w18 = arrow::bit_util::FromLittleEndian(w18); - in += 8; - uint64_t w19 = util::SafeLoadAs(in); - w19 = arrow::bit_util::FromLittleEndian(w19); - in += 8; - uint64_t w20 = util::SafeLoadAs(in); - w20 = arrow::bit_util::FromLittleEndian(w20); - in += 8; - uint64_t w21 = util::SafeLoadAs(in); - w21 = arrow::bit_util::FromLittleEndian(w21); - in += 8; - uint64_t w22 = util::SafeLoadAs(in); - w22 = arrow::bit_util::FromLittleEndian(w22); - in += 8; - uint64_t w23 = util::SafeLoadAs(in); - w23 = arrow::bit_util::FromLittleEndian(w23); - in += 8; - out[0] = (w0)&mask; - out[1] = ((w0 >> 48) | (w1 << 16)) & mask; - out[2] = ((w1 >> 32) | (w2 << 32)) & mask; - out[3] = w2 >> 16; - out[4] = (w3)&mask; - out[5] = ((w3 >> 48) | (w4 << 16)) & mask; - out[6] = ((w4 >> 32) | (w5 << 32)) & mask; - out[7] = w5 >> 16; - out[8] = (w6)&mask; - out[9] = ((w6 >> 48) | (w7 << 16)) & mask; - out[10] = ((w7 >> 32) | (w8 << 32)) & mask; - out[11] = w8 >> 16; - out[12] = (w9)&mask; - out[13] = ((w9 >> 48) | (w10 << 16)) & mask; - out[14] = ((w10 >> 32) | (w11 << 32)) & mask; - out[15] = w11 >> 16; - out[16] = (w12)&mask; - out[17] = ((w12 >> 48) | (w13 << 16)) & mask; - out[18] = ((w13 >> 32) | (w14 << 32)) & mask; - out[19] = w14 >> 16; - out[20] = (w15)&mask; - out[21] = ((w15 >> 48) | (w16 << 16)) & mask; - out[22] = ((w16 >> 32) | (w17 << 32)) & mask; - out[23] = w17 >> 16; - out[24] = (w18)&mask; - out[25] = ((w18 >> 48) | (w19 << 16)) & mask; - out[26] = ((w19 >> 32) | (w20 << 32)) & mask; - out[27] = w20 >> 16; - out[28] = (w21)&mask; - out[29] = ((w21 >> 48) | (w22 << 16)) & mask; - out[30] = ((w22 >> 32) | (w23 << 32)) & mask; - out[31] = w23 >> 16; - - return in; -} - -inline const uint8_t* unpack49_64(const uint8_t* in, uint64_t* out) { - const uint64_t mask = 562949953421311ULL; - uint64_t w0 = util::SafeLoadAs(in); - w0 = arrow::bit_util::FromLittleEndian(w0); - in += 8; - uint64_t w1 = util::SafeLoadAs(in); - w1 = arrow::bit_util::FromLittleEndian(w1); - in += 8; - uint64_t w2 = util::SafeLoadAs(in); - w2 = arrow::bit_util::FromLittleEndian(w2); - in += 8; - uint64_t w3 = util::SafeLoadAs(in); - w3 = arrow::bit_util::FromLittleEndian(w3); - in += 8; - uint64_t w4 = util::SafeLoadAs(in); - w4 = arrow::bit_util::FromLittleEndian(w4); - in += 8; - uint64_t w5 = util::SafeLoadAs(in); - w5 = arrow::bit_util::FromLittleEndian(w5); - in += 8; - uint64_t w6 = util::SafeLoadAs(in); - w6 = arrow::bit_util::FromLittleEndian(w6); - in += 8; - uint64_t w7 = util::SafeLoadAs(in); - w7 = arrow::bit_util::FromLittleEndian(w7); - in += 8; - uint64_t w8 = util::SafeLoadAs(in); - w8 = arrow::bit_util::FromLittleEndian(w8); - in += 8; - uint64_t w9 = util::SafeLoadAs(in); - w9 = arrow::bit_util::FromLittleEndian(w9); - in += 8; - uint64_t w10 = util::SafeLoadAs(in); - w10 = arrow::bit_util::FromLittleEndian(w10); - in += 8; - uint64_t w11 = util::SafeLoadAs(in); - w11 = arrow::bit_util::FromLittleEndian(w11); - in += 8; - uint64_t w12 = util::SafeLoadAs(in); - w12 = arrow::bit_util::FromLittleEndian(w12); - in += 8; - uint64_t w13 = util::SafeLoadAs(in); - w13 = arrow::bit_util::FromLittleEndian(w13); - in += 8; - uint64_t w14 = util::SafeLoadAs(in); - w14 = arrow::bit_util::FromLittleEndian(w14); - in += 8; - uint64_t w15 = util::SafeLoadAs(in); - w15 = arrow::bit_util::FromLittleEndian(w15); - in += 8; - uint64_t w16 = util::SafeLoadAs(in); - w16 = arrow::bit_util::FromLittleEndian(w16); - in += 8; - uint64_t w17 = util::SafeLoadAs(in); - w17 = arrow::bit_util::FromLittleEndian(w17); - in += 8; - uint64_t w18 = util::SafeLoadAs(in); - w18 = arrow::bit_util::FromLittleEndian(w18); - in += 8; - uint64_t w19 = util::SafeLoadAs(in); - w19 = arrow::bit_util::FromLittleEndian(w19); - in += 8; - uint64_t w20 = util::SafeLoadAs(in); - w20 = arrow::bit_util::FromLittleEndian(w20); - in += 8; - uint64_t w21 = util::SafeLoadAs(in); - w21 = arrow::bit_util::FromLittleEndian(w21); - in += 8; - uint64_t w22 = util::SafeLoadAs(in); - w22 = arrow::bit_util::FromLittleEndian(w22); - in += 8; - uint64_t w23 = util::SafeLoadAs(in); - w23 = arrow::bit_util::FromLittleEndian(w23); - in += 8; - uint64_t w24 = util::SafeLoadAs(in); - w24 = arrow::bit_util::FromLittleEndian(w24); - in += 4; - out[0] = (w0)&mask; - out[1] = ((w0 >> 49) | (w1 << 15)) & mask; - out[2] = ((w1 >> 34) | (w2 << 30)) & mask; - out[3] = ((w2 >> 19) | (w3 << 45)) & mask; - out[4] = (w3 >> 4) & mask; - out[5] = ((w3 >> 53) | (w4 << 11)) & mask; - out[6] = ((w4 >> 38) | (w5 << 26)) & mask; - out[7] = ((w5 >> 23) | (w6 << 41)) & mask; - out[8] = (w6 >> 8) & mask; - out[9] = ((w6 >> 57) | (w7 << 7)) & mask; - out[10] = ((w7 >> 42) | (w8 << 22)) & mask; - out[11] = ((w8 >> 27) | (w9 << 37)) & mask; - out[12] = (w9 >> 12) & mask; - out[13] = ((w9 >> 61) | (w10 << 3)) & mask; - out[14] = ((w10 >> 46) | (w11 << 18)) & mask; - out[15] = ((w11 >> 31) | (w12 << 33)) & mask; - out[16] = ((w12 >> 16) | (w13 << 48)) & mask; - out[17] = (w13 >> 1) & mask; - out[18] = ((w13 >> 50) | (w14 << 14)) & mask; - out[19] = ((w14 >> 35) | (w15 << 29)) & mask; - out[20] = ((w15 >> 20) | (w16 << 44)) & mask; - out[21] = (w16 >> 5) & mask; - out[22] = ((w16 >> 54) | (w17 << 10)) & mask; - out[23] = ((w17 >> 39) | (w18 << 25)) & mask; - out[24] = ((w18 >> 24) | (w19 << 40)) & mask; - out[25] = (w19 >> 9) & mask; - out[26] = ((w19 >> 58) | (w20 << 6)) & mask; - out[27] = ((w20 >> 43) | (w21 << 21)) & mask; - out[28] = ((w21 >> 28) | (w22 << 36)) & mask; - out[29] = (w22 >> 13) & mask; - out[30] = ((w22 >> 62) | (w23 << 2)) & mask; - out[31] = ((w23 >> 47) | (w24 << 17)) & mask; - - return in; -} - -inline const uint8_t* unpack50_64(const uint8_t* in, uint64_t* out) { - const uint64_t mask = 1125899906842623ULL; - uint64_t w0 = util::SafeLoadAs(in); - w0 = arrow::bit_util::FromLittleEndian(w0); - in += 8; - uint64_t w1 = util::SafeLoadAs(in); - w1 = arrow::bit_util::FromLittleEndian(w1); - in += 8; - uint64_t w2 = util::SafeLoadAs(in); - w2 = arrow::bit_util::FromLittleEndian(w2); - in += 8; - uint64_t w3 = util::SafeLoadAs(in); - w3 = arrow::bit_util::FromLittleEndian(w3); - in += 8; - uint64_t w4 = util::SafeLoadAs(in); - w4 = arrow::bit_util::FromLittleEndian(w4); - in += 8; - uint64_t w5 = util::SafeLoadAs(in); - w5 = arrow::bit_util::FromLittleEndian(w5); - in += 8; - uint64_t w6 = util::SafeLoadAs(in); - w6 = arrow::bit_util::FromLittleEndian(w6); - in += 8; - uint64_t w7 = util::SafeLoadAs(in); - w7 = arrow::bit_util::FromLittleEndian(w7); - in += 8; - uint64_t w8 = util::SafeLoadAs(in); - w8 = arrow::bit_util::FromLittleEndian(w8); - in += 8; - uint64_t w9 = util::SafeLoadAs(in); - w9 = arrow::bit_util::FromLittleEndian(w9); - in += 8; - uint64_t w10 = util::SafeLoadAs(in); - w10 = arrow::bit_util::FromLittleEndian(w10); - in += 8; - uint64_t w11 = util::SafeLoadAs(in); - w11 = arrow::bit_util::FromLittleEndian(w11); - in += 8; - uint64_t w12 = util::SafeLoadAs(in); - w12 = arrow::bit_util::FromLittleEndian(w12); - in += 8; - uint64_t w13 = util::SafeLoadAs(in); - w13 = arrow::bit_util::FromLittleEndian(w13); - in += 8; - uint64_t w14 = util::SafeLoadAs(in); - w14 = arrow::bit_util::FromLittleEndian(w14); - in += 8; - uint64_t w15 = util::SafeLoadAs(in); - w15 = arrow::bit_util::FromLittleEndian(w15); - in += 8; - uint64_t w16 = util::SafeLoadAs(in); - w16 = arrow::bit_util::FromLittleEndian(w16); - in += 8; - uint64_t w17 = util::SafeLoadAs(in); - w17 = arrow::bit_util::FromLittleEndian(w17); - in += 8; - uint64_t w18 = util::SafeLoadAs(in); - w18 = arrow::bit_util::FromLittleEndian(w18); - in += 8; - uint64_t w19 = util::SafeLoadAs(in); - w19 = arrow::bit_util::FromLittleEndian(w19); - in += 8; - uint64_t w20 = util::SafeLoadAs(in); - w20 = arrow::bit_util::FromLittleEndian(w20); - in += 8; - uint64_t w21 = util::SafeLoadAs(in); - w21 = arrow::bit_util::FromLittleEndian(w21); - in += 8; - uint64_t w22 = util::SafeLoadAs(in); - w22 = arrow::bit_util::FromLittleEndian(w22); - in += 8; - uint64_t w23 = util::SafeLoadAs(in); - w23 = arrow::bit_util::FromLittleEndian(w23); - in += 8; - uint64_t w24 = util::SafeLoadAs(in); - w24 = arrow::bit_util::FromLittleEndian(w24); - in += 8; - out[0] = (w0)&mask; - out[1] = ((w0 >> 50) | (w1 << 14)) & mask; - out[2] = ((w1 >> 36) | (w2 << 28)) & mask; - out[3] = ((w2 >> 22) | (w3 << 42)) & mask; - out[4] = (w3 >> 8) & mask; - out[5] = ((w3 >> 58) | (w4 << 6)) & mask; - out[6] = ((w4 >> 44) | (w5 << 20)) & mask; - out[7] = ((w5 >> 30) | (w6 << 34)) & mask; - out[8] = ((w6 >> 16) | (w7 << 48)) & mask; - out[9] = (w7 >> 2) & mask; - out[10] = ((w7 >> 52) | (w8 << 12)) & mask; - out[11] = ((w8 >> 38) | (w9 << 26)) & mask; - out[12] = ((w9 >> 24) | (w10 << 40)) & mask; - out[13] = (w10 >> 10) & mask; - out[14] = ((w10 >> 60) | (w11 << 4)) & mask; - out[15] = ((w11 >> 46) | (w12 << 18)) & mask; - out[16] = ((w12 >> 32) | (w13 << 32)) & mask; - out[17] = ((w13 >> 18) | (w14 << 46)) & mask; - out[18] = (w14 >> 4) & mask; - out[19] = ((w14 >> 54) | (w15 << 10)) & mask; - out[20] = ((w15 >> 40) | (w16 << 24)) & mask; - out[21] = ((w16 >> 26) | (w17 << 38)) & mask; - out[22] = (w17 >> 12) & mask; - out[23] = ((w17 >> 62) | (w18 << 2)) & mask; - out[24] = ((w18 >> 48) | (w19 << 16)) & mask; - out[25] = ((w19 >> 34) | (w20 << 30)) & mask; - out[26] = ((w20 >> 20) | (w21 << 44)) & mask; - out[27] = (w21 >> 6) & mask; - out[28] = ((w21 >> 56) | (w22 << 8)) & mask; - out[29] = ((w22 >> 42) | (w23 << 22)) & mask; - out[30] = ((w23 >> 28) | (w24 << 36)) & mask; - out[31] = w24 >> 14; - - return in; -} - -inline const uint8_t* unpack51_64(const uint8_t* in, uint64_t* out) { - const uint64_t mask = 2251799813685247ULL; - uint64_t w0 = util::SafeLoadAs(in); - w0 = arrow::bit_util::FromLittleEndian(w0); - in += 8; - uint64_t w1 = util::SafeLoadAs(in); - w1 = arrow::bit_util::FromLittleEndian(w1); - in += 8; - uint64_t w2 = util::SafeLoadAs(in); - w2 = arrow::bit_util::FromLittleEndian(w2); - in += 8; - uint64_t w3 = util::SafeLoadAs(in); - w3 = arrow::bit_util::FromLittleEndian(w3); - in += 8; - uint64_t w4 = util::SafeLoadAs(in); - w4 = arrow::bit_util::FromLittleEndian(w4); - in += 8; - uint64_t w5 = util::SafeLoadAs(in); - w5 = arrow::bit_util::FromLittleEndian(w5); - in += 8; - uint64_t w6 = util::SafeLoadAs(in); - w6 = arrow::bit_util::FromLittleEndian(w6); - in += 8; - uint64_t w7 = util::SafeLoadAs(in); - w7 = arrow::bit_util::FromLittleEndian(w7); - in += 8; - uint64_t w8 = util::SafeLoadAs(in); - w8 = arrow::bit_util::FromLittleEndian(w8); - in += 8; - uint64_t w9 = util::SafeLoadAs(in); - w9 = arrow::bit_util::FromLittleEndian(w9); - in += 8; - uint64_t w10 = util::SafeLoadAs(in); - w10 = arrow::bit_util::FromLittleEndian(w10); - in += 8; - uint64_t w11 = util::SafeLoadAs(in); - w11 = arrow::bit_util::FromLittleEndian(w11); - in += 8; - uint64_t w12 = util::SafeLoadAs(in); - w12 = arrow::bit_util::FromLittleEndian(w12); - in += 8; - uint64_t w13 = util::SafeLoadAs(in); - w13 = arrow::bit_util::FromLittleEndian(w13); - in += 8; - uint64_t w14 = util::SafeLoadAs(in); - w14 = arrow::bit_util::FromLittleEndian(w14); - in += 8; - uint64_t w15 = util::SafeLoadAs(in); - w15 = arrow::bit_util::FromLittleEndian(w15); - in += 8; - uint64_t w16 = util::SafeLoadAs(in); - w16 = arrow::bit_util::FromLittleEndian(w16); - in += 8; - uint64_t w17 = util::SafeLoadAs(in); - w17 = arrow::bit_util::FromLittleEndian(w17); - in += 8; - uint64_t w18 = util::SafeLoadAs(in); - w18 = arrow::bit_util::FromLittleEndian(w18); - in += 8; - uint64_t w19 = util::SafeLoadAs(in); - w19 = arrow::bit_util::FromLittleEndian(w19); - in += 8; - uint64_t w20 = util::SafeLoadAs(in); - w20 = arrow::bit_util::FromLittleEndian(w20); - in += 8; - uint64_t w21 = util::SafeLoadAs(in); - w21 = arrow::bit_util::FromLittleEndian(w21); - in += 8; - uint64_t w22 = util::SafeLoadAs(in); - w22 = arrow::bit_util::FromLittleEndian(w22); - in += 8; - uint64_t w23 = util::SafeLoadAs(in); - w23 = arrow::bit_util::FromLittleEndian(w23); - in += 8; - uint64_t w24 = util::SafeLoadAs(in); - w24 = arrow::bit_util::FromLittleEndian(w24); - in += 8; - uint64_t w25 = util::SafeLoadAs(in); - w25 = arrow::bit_util::FromLittleEndian(w25); - in += 4; - out[0] = (w0)&mask; - out[1] = ((w0 >> 51) | (w1 << 13)) & mask; - out[2] = ((w1 >> 38) | (w2 << 26)) & mask; - out[3] = ((w2 >> 25) | (w3 << 39)) & mask; - out[4] = (w3 >> 12) & mask; - out[5] = ((w3 >> 63) | (w4 << 1)) & mask; - out[6] = ((w4 >> 50) | (w5 << 14)) & mask; - out[7] = ((w5 >> 37) | (w6 << 27)) & mask; - out[8] = ((w6 >> 24) | (w7 << 40)) & mask; - out[9] = (w7 >> 11) & mask; - out[10] = ((w7 >> 62) | (w8 << 2)) & mask; - out[11] = ((w8 >> 49) | (w9 << 15)) & mask; - out[12] = ((w9 >> 36) | (w10 << 28)) & mask; - out[13] = ((w10 >> 23) | (w11 << 41)) & mask; - out[14] = (w11 >> 10) & mask; - out[15] = ((w11 >> 61) | (w12 << 3)) & mask; - out[16] = ((w12 >> 48) | (w13 << 16)) & mask; - out[17] = ((w13 >> 35) | (w14 << 29)) & mask; - out[18] = ((w14 >> 22) | (w15 << 42)) & mask; - out[19] = (w15 >> 9) & mask; - out[20] = ((w15 >> 60) | (w16 << 4)) & mask; - out[21] = ((w16 >> 47) | (w17 << 17)) & mask; - out[22] = ((w17 >> 34) | (w18 << 30)) & mask; - out[23] = ((w18 >> 21) | (w19 << 43)) & mask; - out[24] = (w19 >> 8) & mask; - out[25] = ((w19 >> 59) | (w20 << 5)) & mask; - out[26] = ((w20 >> 46) | (w21 << 18)) & mask; - out[27] = ((w21 >> 33) | (w22 << 31)) & mask; - out[28] = ((w22 >> 20) | (w23 << 44)) & mask; - out[29] = (w23 >> 7) & mask; - out[30] = ((w23 >> 58) | (w24 << 6)) & mask; - out[31] = ((w24 >> 45) | (w25 << 19)) & mask; - - return in; -} - -inline const uint8_t* unpack52_64(const uint8_t* in, uint64_t* out) { - const uint64_t mask = 4503599627370495ULL; - uint64_t w0 = util::SafeLoadAs(in); - w0 = arrow::bit_util::FromLittleEndian(w0); - in += 8; - uint64_t w1 = util::SafeLoadAs(in); - w1 = arrow::bit_util::FromLittleEndian(w1); - in += 8; - uint64_t w2 = util::SafeLoadAs(in); - w2 = arrow::bit_util::FromLittleEndian(w2); - in += 8; - uint64_t w3 = util::SafeLoadAs(in); - w3 = arrow::bit_util::FromLittleEndian(w3); - in += 8; - uint64_t w4 = util::SafeLoadAs(in); - w4 = arrow::bit_util::FromLittleEndian(w4); - in += 8; - uint64_t w5 = util::SafeLoadAs(in); - w5 = arrow::bit_util::FromLittleEndian(w5); - in += 8; - uint64_t w6 = util::SafeLoadAs(in); - w6 = arrow::bit_util::FromLittleEndian(w6); - in += 8; - uint64_t w7 = util::SafeLoadAs(in); - w7 = arrow::bit_util::FromLittleEndian(w7); - in += 8; - uint64_t w8 = util::SafeLoadAs(in); - w8 = arrow::bit_util::FromLittleEndian(w8); - in += 8; - uint64_t w9 = util::SafeLoadAs(in); - w9 = arrow::bit_util::FromLittleEndian(w9); - in += 8; - uint64_t w10 = util::SafeLoadAs(in); - w10 = arrow::bit_util::FromLittleEndian(w10); - in += 8; - uint64_t w11 = util::SafeLoadAs(in); - w11 = arrow::bit_util::FromLittleEndian(w11); - in += 8; - uint64_t w12 = util::SafeLoadAs(in); - w12 = arrow::bit_util::FromLittleEndian(w12); - in += 8; - uint64_t w13 = util::SafeLoadAs(in); - w13 = arrow::bit_util::FromLittleEndian(w13); - in += 8; - uint64_t w14 = util::SafeLoadAs(in); - w14 = arrow::bit_util::FromLittleEndian(w14); - in += 8; - uint64_t w15 = util::SafeLoadAs(in); - w15 = arrow::bit_util::FromLittleEndian(w15); - in += 8; - uint64_t w16 = util::SafeLoadAs(in); - w16 = arrow::bit_util::FromLittleEndian(w16); - in += 8; - uint64_t w17 = util::SafeLoadAs(in); - w17 = arrow::bit_util::FromLittleEndian(w17); - in += 8; - uint64_t w18 = util::SafeLoadAs(in); - w18 = arrow::bit_util::FromLittleEndian(w18); - in += 8; - uint64_t w19 = util::SafeLoadAs(in); - w19 = arrow::bit_util::FromLittleEndian(w19); - in += 8; - uint64_t w20 = util::SafeLoadAs(in); - w20 = arrow::bit_util::FromLittleEndian(w20); - in += 8; - uint64_t w21 = util::SafeLoadAs(in); - w21 = arrow::bit_util::FromLittleEndian(w21); - in += 8; - uint64_t w22 = util::SafeLoadAs(in); - w22 = arrow::bit_util::FromLittleEndian(w22); - in += 8; - uint64_t w23 = util::SafeLoadAs(in); - w23 = arrow::bit_util::FromLittleEndian(w23); - in += 8; - uint64_t w24 = util::SafeLoadAs(in); - w24 = arrow::bit_util::FromLittleEndian(w24); - in += 8; - uint64_t w25 = util::SafeLoadAs(in); - w25 = arrow::bit_util::FromLittleEndian(w25); - in += 8; - out[0] = (w0)&mask; - out[1] = ((w0 >> 52) | (w1 << 12)) & mask; - out[2] = ((w1 >> 40) | (w2 << 24)) & mask; - out[3] = ((w2 >> 28) | (w3 << 36)) & mask; - out[4] = ((w3 >> 16) | (w4 << 48)) & mask; - out[5] = (w4 >> 4) & mask; - out[6] = ((w4 >> 56) | (w5 << 8)) & mask; - out[7] = ((w5 >> 44) | (w6 << 20)) & mask; - out[8] = ((w6 >> 32) | (w7 << 32)) & mask; - out[9] = ((w7 >> 20) | (w8 << 44)) & mask; - out[10] = (w8 >> 8) & mask; - out[11] = ((w8 >> 60) | (w9 << 4)) & mask; - out[12] = ((w9 >> 48) | (w10 << 16)) & mask; - out[13] = ((w10 >> 36) | (w11 << 28)) & mask; - out[14] = ((w11 >> 24) | (w12 << 40)) & mask; - out[15] = w12 >> 12; - out[16] = (w13)&mask; - out[17] = ((w13 >> 52) | (w14 << 12)) & mask; - out[18] = ((w14 >> 40) | (w15 << 24)) & mask; - out[19] = ((w15 >> 28) | (w16 << 36)) & mask; - out[20] = ((w16 >> 16) | (w17 << 48)) & mask; - out[21] = (w17 >> 4) & mask; - out[22] = ((w17 >> 56) | (w18 << 8)) & mask; - out[23] = ((w18 >> 44) | (w19 << 20)) & mask; - out[24] = ((w19 >> 32) | (w20 << 32)) & mask; - out[25] = ((w20 >> 20) | (w21 << 44)) & mask; - out[26] = (w21 >> 8) & mask; - out[27] = ((w21 >> 60) | (w22 << 4)) & mask; - out[28] = ((w22 >> 48) | (w23 << 16)) & mask; - out[29] = ((w23 >> 36) | (w24 << 28)) & mask; - out[30] = ((w24 >> 24) | (w25 << 40)) & mask; - out[31] = w25 >> 12; - - return in; -} - -inline const uint8_t* unpack53_64(const uint8_t* in, uint64_t* out) { - const uint64_t mask = 9007199254740991ULL; - uint64_t w0 = util::SafeLoadAs(in); - w0 = arrow::bit_util::FromLittleEndian(w0); - in += 8; - uint64_t w1 = util::SafeLoadAs(in); - w1 = arrow::bit_util::FromLittleEndian(w1); - in += 8; - uint64_t w2 = util::SafeLoadAs(in); - w2 = arrow::bit_util::FromLittleEndian(w2); - in += 8; - uint64_t w3 = util::SafeLoadAs(in); - w3 = arrow::bit_util::FromLittleEndian(w3); - in += 8; - uint64_t w4 = util::SafeLoadAs(in); - w4 = arrow::bit_util::FromLittleEndian(w4); - in += 8; - uint64_t w5 = util::SafeLoadAs(in); - w5 = arrow::bit_util::FromLittleEndian(w5); - in += 8; - uint64_t w6 = util::SafeLoadAs(in); - w6 = arrow::bit_util::FromLittleEndian(w6); - in += 8; - uint64_t w7 = util::SafeLoadAs(in); - w7 = arrow::bit_util::FromLittleEndian(w7); - in += 8; - uint64_t w8 = util::SafeLoadAs(in); - w8 = arrow::bit_util::FromLittleEndian(w8); - in += 8; - uint64_t w9 = util::SafeLoadAs(in); - w9 = arrow::bit_util::FromLittleEndian(w9); - in += 8; - uint64_t w10 = util::SafeLoadAs(in); - w10 = arrow::bit_util::FromLittleEndian(w10); - in += 8; - uint64_t w11 = util::SafeLoadAs(in); - w11 = arrow::bit_util::FromLittleEndian(w11); - in += 8; - uint64_t w12 = util::SafeLoadAs(in); - w12 = arrow::bit_util::FromLittleEndian(w12); - in += 8; - uint64_t w13 = util::SafeLoadAs(in); - w13 = arrow::bit_util::FromLittleEndian(w13); - in += 8; - uint64_t w14 = util::SafeLoadAs(in); - w14 = arrow::bit_util::FromLittleEndian(w14); - in += 8; - uint64_t w15 = util::SafeLoadAs(in); - w15 = arrow::bit_util::FromLittleEndian(w15); - in += 8; - uint64_t w16 = util::SafeLoadAs(in); - w16 = arrow::bit_util::FromLittleEndian(w16); - in += 8; - uint64_t w17 = util::SafeLoadAs(in); - w17 = arrow::bit_util::FromLittleEndian(w17); - in += 8; - uint64_t w18 = util::SafeLoadAs(in); - w18 = arrow::bit_util::FromLittleEndian(w18); - in += 8; - uint64_t w19 = util::SafeLoadAs(in); - w19 = arrow::bit_util::FromLittleEndian(w19); - in += 8; - uint64_t w20 = util::SafeLoadAs(in); - w20 = arrow::bit_util::FromLittleEndian(w20); - in += 8; - uint64_t w21 = util::SafeLoadAs(in); - w21 = arrow::bit_util::FromLittleEndian(w21); - in += 8; - uint64_t w22 = util::SafeLoadAs(in); - w22 = arrow::bit_util::FromLittleEndian(w22); - in += 8; - uint64_t w23 = util::SafeLoadAs(in); - w23 = arrow::bit_util::FromLittleEndian(w23); - in += 8; - uint64_t w24 = util::SafeLoadAs(in); - w24 = arrow::bit_util::FromLittleEndian(w24); - in += 8; - uint64_t w25 = util::SafeLoadAs(in); - w25 = arrow::bit_util::FromLittleEndian(w25); - in += 8; - uint64_t w26 = util::SafeLoadAs(in); - w26 = arrow::bit_util::FromLittleEndian(w26); - in += 4; - out[0] = (w0)&mask; - out[1] = ((w0 >> 53) | (w1 << 11)) & mask; - out[2] = ((w1 >> 42) | (w2 << 22)) & mask; - out[3] = ((w2 >> 31) | (w3 << 33)) & mask; - out[4] = ((w3 >> 20) | (w4 << 44)) & mask; - out[5] = (w4 >> 9) & mask; - out[6] = ((w4 >> 62) | (w5 << 2)) & mask; - out[7] = ((w5 >> 51) | (w6 << 13)) & mask; - out[8] = ((w6 >> 40) | (w7 << 24)) & mask; - out[9] = ((w7 >> 29) | (w8 << 35)) & mask; - out[10] = ((w8 >> 18) | (w9 << 46)) & mask; - out[11] = (w9 >> 7) & mask; - out[12] = ((w9 >> 60) | (w10 << 4)) & mask; - out[13] = ((w10 >> 49) | (w11 << 15)) & mask; - out[14] = ((w11 >> 38) | (w12 << 26)) & mask; - out[15] = ((w12 >> 27) | (w13 << 37)) & mask; - out[16] = ((w13 >> 16) | (w14 << 48)) & mask; - out[17] = (w14 >> 5) & mask; - out[18] = ((w14 >> 58) | (w15 << 6)) & mask; - out[19] = ((w15 >> 47) | (w16 << 17)) & mask; - out[20] = ((w16 >> 36) | (w17 << 28)) & mask; - out[21] = ((w17 >> 25) | (w18 << 39)) & mask; - out[22] = ((w18 >> 14) | (w19 << 50)) & mask; - out[23] = (w19 >> 3) & mask; - out[24] = ((w19 >> 56) | (w20 << 8)) & mask; - out[25] = ((w20 >> 45) | (w21 << 19)) & mask; - out[26] = ((w21 >> 34) | (w22 << 30)) & mask; - out[27] = ((w22 >> 23) | (w23 << 41)) & mask; - out[28] = ((w23 >> 12) | (w24 << 52)) & mask; - out[29] = (w24 >> 1) & mask; - out[30] = ((w24 >> 54) | (w25 << 10)) & mask; - out[31] = ((w25 >> 43) | (w26 << 21)) & mask; - - return in; -} - -inline const uint8_t* unpack54_64(const uint8_t* in, uint64_t* out) { - const uint64_t mask = 18014398509481983ULL; - uint64_t w0 = util::SafeLoadAs(in); - w0 = arrow::bit_util::FromLittleEndian(w0); - in += 8; - uint64_t w1 = util::SafeLoadAs(in); - w1 = arrow::bit_util::FromLittleEndian(w1); - in += 8; - uint64_t w2 = util::SafeLoadAs(in); - w2 = arrow::bit_util::FromLittleEndian(w2); - in += 8; - uint64_t w3 = util::SafeLoadAs(in); - w3 = arrow::bit_util::FromLittleEndian(w3); - in += 8; - uint64_t w4 = util::SafeLoadAs(in); - w4 = arrow::bit_util::FromLittleEndian(w4); - in += 8; - uint64_t w5 = util::SafeLoadAs(in); - w5 = arrow::bit_util::FromLittleEndian(w5); - in += 8; - uint64_t w6 = util::SafeLoadAs(in); - w6 = arrow::bit_util::FromLittleEndian(w6); - in += 8; - uint64_t w7 = util::SafeLoadAs(in); - w7 = arrow::bit_util::FromLittleEndian(w7); - in += 8; - uint64_t w8 = util::SafeLoadAs(in); - w8 = arrow::bit_util::FromLittleEndian(w8); - in += 8; - uint64_t w9 = util::SafeLoadAs(in); - w9 = arrow::bit_util::FromLittleEndian(w9); - in += 8; - uint64_t w10 = util::SafeLoadAs(in); - w10 = arrow::bit_util::FromLittleEndian(w10); - in += 8; - uint64_t w11 = util::SafeLoadAs(in); - w11 = arrow::bit_util::FromLittleEndian(w11); - in += 8; - uint64_t w12 = util::SafeLoadAs(in); - w12 = arrow::bit_util::FromLittleEndian(w12); - in += 8; - uint64_t w13 = util::SafeLoadAs(in); - w13 = arrow::bit_util::FromLittleEndian(w13); - in += 8; - uint64_t w14 = util::SafeLoadAs(in); - w14 = arrow::bit_util::FromLittleEndian(w14); - in += 8; - uint64_t w15 = util::SafeLoadAs(in); - w15 = arrow::bit_util::FromLittleEndian(w15); - in += 8; - uint64_t w16 = util::SafeLoadAs(in); - w16 = arrow::bit_util::FromLittleEndian(w16); - in += 8; - uint64_t w17 = util::SafeLoadAs(in); - w17 = arrow::bit_util::FromLittleEndian(w17); - in += 8; - uint64_t w18 = util::SafeLoadAs(in); - w18 = arrow::bit_util::FromLittleEndian(w18); - in += 8; - uint64_t w19 = util::SafeLoadAs(in); - w19 = arrow::bit_util::FromLittleEndian(w19); - in += 8; - uint64_t w20 = util::SafeLoadAs(in); - w20 = arrow::bit_util::FromLittleEndian(w20); - in += 8; - uint64_t w21 = util::SafeLoadAs(in); - w21 = arrow::bit_util::FromLittleEndian(w21); - in += 8; - uint64_t w22 = util::SafeLoadAs(in); - w22 = arrow::bit_util::FromLittleEndian(w22); - in += 8; - uint64_t w23 = util::SafeLoadAs(in); - w23 = arrow::bit_util::FromLittleEndian(w23); - in += 8; - uint64_t w24 = util::SafeLoadAs(in); - w24 = arrow::bit_util::FromLittleEndian(w24); - in += 8; - uint64_t w25 = util::SafeLoadAs(in); - w25 = arrow::bit_util::FromLittleEndian(w25); - in += 8; - uint64_t w26 = util::SafeLoadAs(in); - w26 = arrow::bit_util::FromLittleEndian(w26); - in += 8; - out[0] = (w0)&mask; - out[1] = ((w0 >> 54) | (w1 << 10)) & mask; - out[2] = ((w1 >> 44) | (w2 << 20)) & mask; - out[3] = ((w2 >> 34) | (w3 << 30)) & mask; - out[4] = ((w3 >> 24) | (w4 << 40)) & mask; - out[5] = ((w4 >> 14) | (w5 << 50)) & mask; - out[6] = (w5 >> 4) & mask; - out[7] = ((w5 >> 58) | (w6 << 6)) & mask; - out[8] = ((w6 >> 48) | (w7 << 16)) & mask; - out[9] = ((w7 >> 38) | (w8 << 26)) & mask; - out[10] = ((w8 >> 28) | (w9 << 36)) & mask; - out[11] = ((w9 >> 18) | (w10 << 46)) & mask; - out[12] = (w10 >> 8) & mask; - out[13] = ((w10 >> 62) | (w11 << 2)) & mask; - out[14] = ((w11 >> 52) | (w12 << 12)) & mask; - out[15] = ((w12 >> 42) | (w13 << 22)) & mask; - out[16] = ((w13 >> 32) | (w14 << 32)) & mask; - out[17] = ((w14 >> 22) | (w15 << 42)) & mask; - out[18] = ((w15 >> 12) | (w16 << 52)) & mask; - out[19] = (w16 >> 2) & mask; - out[20] = ((w16 >> 56) | (w17 << 8)) & mask; - out[21] = ((w17 >> 46) | (w18 << 18)) & mask; - out[22] = ((w18 >> 36) | (w19 << 28)) & mask; - out[23] = ((w19 >> 26) | (w20 << 38)) & mask; - out[24] = ((w20 >> 16) | (w21 << 48)) & mask; - out[25] = (w21 >> 6) & mask; - out[26] = ((w21 >> 60) | (w22 << 4)) & mask; - out[27] = ((w22 >> 50) | (w23 << 14)) & mask; - out[28] = ((w23 >> 40) | (w24 << 24)) & mask; - out[29] = ((w24 >> 30) | (w25 << 34)) & mask; - out[30] = ((w25 >> 20) | (w26 << 44)) & mask; - out[31] = w26 >> 10; - - return in; -} - -inline const uint8_t* unpack55_64(const uint8_t* in, uint64_t* out) { - const uint64_t mask = 36028797018963967ULL; - uint64_t w0 = util::SafeLoadAs(in); - w0 = arrow::bit_util::FromLittleEndian(w0); - in += 8; - uint64_t w1 = util::SafeLoadAs(in); - w1 = arrow::bit_util::FromLittleEndian(w1); - in += 8; - uint64_t w2 = util::SafeLoadAs(in); - w2 = arrow::bit_util::FromLittleEndian(w2); - in += 8; - uint64_t w3 = util::SafeLoadAs(in); - w3 = arrow::bit_util::FromLittleEndian(w3); - in += 8; - uint64_t w4 = util::SafeLoadAs(in); - w4 = arrow::bit_util::FromLittleEndian(w4); - in += 8; - uint64_t w5 = util::SafeLoadAs(in); - w5 = arrow::bit_util::FromLittleEndian(w5); - in += 8; - uint64_t w6 = util::SafeLoadAs(in); - w6 = arrow::bit_util::FromLittleEndian(w6); - in += 8; - uint64_t w7 = util::SafeLoadAs(in); - w7 = arrow::bit_util::FromLittleEndian(w7); - in += 8; - uint64_t w8 = util::SafeLoadAs(in); - w8 = arrow::bit_util::FromLittleEndian(w8); - in += 8; - uint64_t w9 = util::SafeLoadAs(in); - w9 = arrow::bit_util::FromLittleEndian(w9); - in += 8; - uint64_t w10 = util::SafeLoadAs(in); - w10 = arrow::bit_util::FromLittleEndian(w10); - in += 8; - uint64_t w11 = util::SafeLoadAs(in); - w11 = arrow::bit_util::FromLittleEndian(w11); - in += 8; - uint64_t w12 = util::SafeLoadAs(in); - w12 = arrow::bit_util::FromLittleEndian(w12); - in += 8; - uint64_t w13 = util::SafeLoadAs(in); - w13 = arrow::bit_util::FromLittleEndian(w13); - in += 8; - uint64_t w14 = util::SafeLoadAs(in); - w14 = arrow::bit_util::FromLittleEndian(w14); - in += 8; - uint64_t w15 = util::SafeLoadAs(in); - w15 = arrow::bit_util::FromLittleEndian(w15); - in += 8; - uint64_t w16 = util::SafeLoadAs(in); - w16 = arrow::bit_util::FromLittleEndian(w16); - in += 8; - uint64_t w17 = util::SafeLoadAs(in); - w17 = arrow::bit_util::FromLittleEndian(w17); - in += 8; - uint64_t w18 = util::SafeLoadAs(in); - w18 = arrow::bit_util::FromLittleEndian(w18); - in += 8; - uint64_t w19 = util::SafeLoadAs(in); - w19 = arrow::bit_util::FromLittleEndian(w19); - in += 8; - uint64_t w20 = util::SafeLoadAs(in); - w20 = arrow::bit_util::FromLittleEndian(w20); - in += 8; - uint64_t w21 = util::SafeLoadAs(in); - w21 = arrow::bit_util::FromLittleEndian(w21); - in += 8; - uint64_t w22 = util::SafeLoadAs(in); - w22 = arrow::bit_util::FromLittleEndian(w22); - in += 8; - uint64_t w23 = util::SafeLoadAs(in); - w23 = arrow::bit_util::FromLittleEndian(w23); - in += 8; - uint64_t w24 = util::SafeLoadAs(in); - w24 = arrow::bit_util::FromLittleEndian(w24); - in += 8; - uint64_t w25 = util::SafeLoadAs(in); - w25 = arrow::bit_util::FromLittleEndian(w25); - in += 8; - uint64_t w26 = util::SafeLoadAs(in); - w26 = arrow::bit_util::FromLittleEndian(w26); - in += 8; - uint64_t w27 = util::SafeLoadAs(in); - w27 = arrow::bit_util::FromLittleEndian(w27); - in += 4; - out[0] = (w0)&mask; - out[1] = ((w0 >> 55) | (w1 << 9)) & mask; - out[2] = ((w1 >> 46) | (w2 << 18)) & mask; - out[3] = ((w2 >> 37) | (w3 << 27)) & mask; - out[4] = ((w3 >> 28) | (w4 << 36)) & mask; - out[5] = ((w4 >> 19) | (w5 << 45)) & mask; - out[6] = ((w5 >> 10) | (w6 << 54)) & mask; - out[7] = (w6 >> 1) & mask; - out[8] = ((w6 >> 56) | (w7 << 8)) & mask; - out[9] = ((w7 >> 47) | (w8 << 17)) & mask; - out[10] = ((w8 >> 38) | (w9 << 26)) & mask; - out[11] = ((w9 >> 29) | (w10 << 35)) & mask; - out[12] = ((w10 >> 20) | (w11 << 44)) & mask; - out[13] = ((w11 >> 11) | (w12 << 53)) & mask; - out[14] = (w12 >> 2) & mask; - out[15] = ((w12 >> 57) | (w13 << 7)) & mask; - out[16] = ((w13 >> 48) | (w14 << 16)) & mask; - out[17] = ((w14 >> 39) | (w15 << 25)) & mask; - out[18] = ((w15 >> 30) | (w16 << 34)) & mask; - out[19] = ((w16 >> 21) | (w17 << 43)) & mask; - out[20] = ((w17 >> 12) | (w18 << 52)) & mask; - out[21] = (w18 >> 3) & mask; - out[22] = ((w18 >> 58) | (w19 << 6)) & mask; - out[23] = ((w19 >> 49) | (w20 << 15)) & mask; - out[24] = ((w20 >> 40) | (w21 << 24)) & mask; - out[25] = ((w21 >> 31) | (w22 << 33)) & mask; - out[26] = ((w22 >> 22) | (w23 << 42)) & mask; - out[27] = ((w23 >> 13) | (w24 << 51)) & mask; - out[28] = (w24 >> 4) & mask; - out[29] = ((w24 >> 59) | (w25 << 5)) & mask; - out[30] = ((w25 >> 50) | (w26 << 14)) & mask; - out[31] = ((w26 >> 41) | (w27 << 23)) & mask; - - return in; -} - -inline const uint8_t* unpack56_64(const uint8_t* in, uint64_t* out) { - const uint64_t mask = 72057594037927935ULL; - uint64_t w0 = util::SafeLoadAs(in); - w0 = arrow::bit_util::FromLittleEndian(w0); - in += 8; - uint64_t w1 = util::SafeLoadAs(in); - w1 = arrow::bit_util::FromLittleEndian(w1); - in += 8; - uint64_t w2 = util::SafeLoadAs(in); - w2 = arrow::bit_util::FromLittleEndian(w2); - in += 8; - uint64_t w3 = util::SafeLoadAs(in); - w3 = arrow::bit_util::FromLittleEndian(w3); - in += 8; - uint64_t w4 = util::SafeLoadAs(in); - w4 = arrow::bit_util::FromLittleEndian(w4); - in += 8; - uint64_t w5 = util::SafeLoadAs(in); - w5 = arrow::bit_util::FromLittleEndian(w5); - in += 8; - uint64_t w6 = util::SafeLoadAs(in); - w6 = arrow::bit_util::FromLittleEndian(w6); - in += 8; - uint64_t w7 = util::SafeLoadAs(in); - w7 = arrow::bit_util::FromLittleEndian(w7); - in += 8; - uint64_t w8 = util::SafeLoadAs(in); - w8 = arrow::bit_util::FromLittleEndian(w8); - in += 8; - uint64_t w9 = util::SafeLoadAs(in); - w9 = arrow::bit_util::FromLittleEndian(w9); - in += 8; - uint64_t w10 = util::SafeLoadAs(in); - w10 = arrow::bit_util::FromLittleEndian(w10); - in += 8; - uint64_t w11 = util::SafeLoadAs(in); - w11 = arrow::bit_util::FromLittleEndian(w11); - in += 8; - uint64_t w12 = util::SafeLoadAs(in); - w12 = arrow::bit_util::FromLittleEndian(w12); - in += 8; - uint64_t w13 = util::SafeLoadAs(in); - w13 = arrow::bit_util::FromLittleEndian(w13); - in += 8; - uint64_t w14 = util::SafeLoadAs(in); - w14 = arrow::bit_util::FromLittleEndian(w14); - in += 8; - uint64_t w15 = util::SafeLoadAs(in); - w15 = arrow::bit_util::FromLittleEndian(w15); - in += 8; - uint64_t w16 = util::SafeLoadAs(in); - w16 = arrow::bit_util::FromLittleEndian(w16); - in += 8; - uint64_t w17 = util::SafeLoadAs(in); - w17 = arrow::bit_util::FromLittleEndian(w17); - in += 8; - uint64_t w18 = util::SafeLoadAs(in); - w18 = arrow::bit_util::FromLittleEndian(w18); - in += 8; - uint64_t w19 = util::SafeLoadAs(in); - w19 = arrow::bit_util::FromLittleEndian(w19); - in += 8; - uint64_t w20 = util::SafeLoadAs(in); - w20 = arrow::bit_util::FromLittleEndian(w20); - in += 8; - uint64_t w21 = util::SafeLoadAs(in); - w21 = arrow::bit_util::FromLittleEndian(w21); - in += 8; - uint64_t w22 = util::SafeLoadAs(in); - w22 = arrow::bit_util::FromLittleEndian(w22); - in += 8; - uint64_t w23 = util::SafeLoadAs(in); - w23 = arrow::bit_util::FromLittleEndian(w23); - in += 8; - uint64_t w24 = util::SafeLoadAs(in); - w24 = arrow::bit_util::FromLittleEndian(w24); - in += 8; - uint64_t w25 = util::SafeLoadAs(in); - w25 = arrow::bit_util::FromLittleEndian(w25); - in += 8; - uint64_t w26 = util::SafeLoadAs(in); - w26 = arrow::bit_util::FromLittleEndian(w26); - in += 8; - uint64_t w27 = util::SafeLoadAs(in); - w27 = arrow::bit_util::FromLittleEndian(w27); - in += 8; - out[0] = (w0)&mask; - out[1] = ((w0 >> 56) | (w1 << 8)) & mask; - out[2] = ((w1 >> 48) | (w2 << 16)) & mask; - out[3] = ((w2 >> 40) | (w3 << 24)) & mask; - out[4] = ((w3 >> 32) | (w4 << 32)) & mask; - out[5] = ((w4 >> 24) | (w5 << 40)) & mask; - out[6] = ((w5 >> 16) | (w6 << 48)) & mask; - out[7] = w6 >> 8; - out[8] = (w7)&mask; - out[9] = ((w7 >> 56) | (w8 << 8)) & mask; - out[10] = ((w8 >> 48) | (w9 << 16)) & mask; - out[11] = ((w9 >> 40) | (w10 << 24)) & mask; - out[12] = ((w10 >> 32) | (w11 << 32)) & mask; - out[13] = ((w11 >> 24) | (w12 << 40)) & mask; - out[14] = ((w12 >> 16) | (w13 << 48)) & mask; - out[15] = w13 >> 8; - out[16] = (w14)&mask; - out[17] = ((w14 >> 56) | (w15 << 8)) & mask; - out[18] = ((w15 >> 48) | (w16 << 16)) & mask; - out[19] = ((w16 >> 40) | (w17 << 24)) & mask; - out[20] = ((w17 >> 32) | (w18 << 32)) & mask; - out[21] = ((w18 >> 24) | (w19 << 40)) & mask; - out[22] = ((w19 >> 16) | (w20 << 48)) & mask; - out[23] = w20 >> 8; - out[24] = (w21)&mask; - out[25] = ((w21 >> 56) | (w22 << 8)) & mask; - out[26] = ((w22 >> 48) | (w23 << 16)) & mask; - out[27] = ((w23 >> 40) | (w24 << 24)) & mask; - out[28] = ((w24 >> 32) | (w25 << 32)) & mask; - out[29] = ((w25 >> 24) | (w26 << 40)) & mask; - out[30] = ((w26 >> 16) | (w27 << 48)) & mask; - out[31] = w27 >> 8; - - return in; -} - -inline const uint8_t* unpack57_64(const uint8_t* in, uint64_t* out) { - const uint64_t mask = 144115188075855871ULL; - uint64_t w0 = util::SafeLoadAs(in); - w0 = arrow::bit_util::FromLittleEndian(w0); - in += 8; - uint64_t w1 = util::SafeLoadAs(in); - w1 = arrow::bit_util::FromLittleEndian(w1); - in += 8; - uint64_t w2 = util::SafeLoadAs(in); - w2 = arrow::bit_util::FromLittleEndian(w2); - in += 8; - uint64_t w3 = util::SafeLoadAs(in); - w3 = arrow::bit_util::FromLittleEndian(w3); - in += 8; - uint64_t w4 = util::SafeLoadAs(in); - w4 = arrow::bit_util::FromLittleEndian(w4); - in += 8; - uint64_t w5 = util::SafeLoadAs(in); - w5 = arrow::bit_util::FromLittleEndian(w5); - in += 8; - uint64_t w6 = util::SafeLoadAs(in); - w6 = arrow::bit_util::FromLittleEndian(w6); - in += 8; - uint64_t w7 = util::SafeLoadAs(in); - w7 = arrow::bit_util::FromLittleEndian(w7); - in += 8; - uint64_t w8 = util::SafeLoadAs(in); - w8 = arrow::bit_util::FromLittleEndian(w8); - in += 8; - uint64_t w9 = util::SafeLoadAs(in); - w9 = arrow::bit_util::FromLittleEndian(w9); - in += 8; - uint64_t w10 = util::SafeLoadAs(in); - w10 = arrow::bit_util::FromLittleEndian(w10); - in += 8; - uint64_t w11 = util::SafeLoadAs(in); - w11 = arrow::bit_util::FromLittleEndian(w11); - in += 8; - uint64_t w12 = util::SafeLoadAs(in); - w12 = arrow::bit_util::FromLittleEndian(w12); - in += 8; - uint64_t w13 = util::SafeLoadAs(in); - w13 = arrow::bit_util::FromLittleEndian(w13); - in += 8; - uint64_t w14 = util::SafeLoadAs(in); - w14 = arrow::bit_util::FromLittleEndian(w14); - in += 8; - uint64_t w15 = util::SafeLoadAs(in); - w15 = arrow::bit_util::FromLittleEndian(w15); - in += 8; - uint64_t w16 = util::SafeLoadAs(in); - w16 = arrow::bit_util::FromLittleEndian(w16); - in += 8; - uint64_t w17 = util::SafeLoadAs(in); - w17 = arrow::bit_util::FromLittleEndian(w17); - in += 8; - uint64_t w18 = util::SafeLoadAs(in); - w18 = arrow::bit_util::FromLittleEndian(w18); - in += 8; - uint64_t w19 = util::SafeLoadAs(in); - w19 = arrow::bit_util::FromLittleEndian(w19); - in += 8; - uint64_t w20 = util::SafeLoadAs(in); - w20 = arrow::bit_util::FromLittleEndian(w20); - in += 8; - uint64_t w21 = util::SafeLoadAs(in); - w21 = arrow::bit_util::FromLittleEndian(w21); - in += 8; - uint64_t w22 = util::SafeLoadAs(in); - w22 = arrow::bit_util::FromLittleEndian(w22); - in += 8; - uint64_t w23 = util::SafeLoadAs(in); - w23 = arrow::bit_util::FromLittleEndian(w23); - in += 8; - uint64_t w24 = util::SafeLoadAs(in); - w24 = arrow::bit_util::FromLittleEndian(w24); - in += 8; - uint64_t w25 = util::SafeLoadAs(in); - w25 = arrow::bit_util::FromLittleEndian(w25); - in += 8; - uint64_t w26 = util::SafeLoadAs(in); - w26 = arrow::bit_util::FromLittleEndian(w26); - in += 8; - uint64_t w27 = util::SafeLoadAs(in); - w27 = arrow::bit_util::FromLittleEndian(w27); - in += 8; - uint64_t w28 = util::SafeLoadAs(in); - w28 = arrow::bit_util::FromLittleEndian(w28); - in += 4; - out[0] = (w0)&mask; - out[1] = ((w0 >> 57) | (w1 << 7)) & mask; - out[2] = ((w1 >> 50) | (w2 << 14)) & mask; - out[3] = ((w2 >> 43) | (w3 << 21)) & mask; - out[4] = ((w3 >> 36) | (w4 << 28)) & mask; - out[5] = ((w4 >> 29) | (w5 << 35)) & mask; - out[6] = ((w5 >> 22) | (w6 << 42)) & mask; - out[7] = ((w6 >> 15) | (w7 << 49)) & mask; - out[8] = ((w7 >> 8) | (w8 << 56)) & mask; - out[9] = (w8 >> 1) & mask; - out[10] = ((w8 >> 58) | (w9 << 6)) & mask; - out[11] = ((w9 >> 51) | (w10 << 13)) & mask; - out[12] = ((w10 >> 44) | (w11 << 20)) & mask; - out[13] = ((w11 >> 37) | (w12 << 27)) & mask; - out[14] = ((w12 >> 30) | (w13 << 34)) & mask; - out[15] = ((w13 >> 23) | (w14 << 41)) & mask; - out[16] = ((w14 >> 16) | (w15 << 48)) & mask; - out[17] = ((w15 >> 9) | (w16 << 55)) & mask; - out[18] = (w16 >> 2) & mask; - out[19] = ((w16 >> 59) | (w17 << 5)) & mask; - out[20] = ((w17 >> 52) | (w18 << 12)) & mask; - out[21] = ((w18 >> 45) | (w19 << 19)) & mask; - out[22] = ((w19 >> 38) | (w20 << 26)) & mask; - out[23] = ((w20 >> 31) | (w21 << 33)) & mask; - out[24] = ((w21 >> 24) | (w22 << 40)) & mask; - out[25] = ((w22 >> 17) | (w23 << 47)) & mask; - out[26] = ((w23 >> 10) | (w24 << 54)) & mask; - out[27] = (w24 >> 3) & mask; - out[28] = ((w24 >> 60) | (w25 << 4)) & mask; - out[29] = ((w25 >> 53) | (w26 << 11)) & mask; - out[30] = ((w26 >> 46) | (w27 << 18)) & mask; - out[31] = ((w27 >> 39) | (w28 << 25)) & mask; - - return in; -} - -inline const uint8_t* unpack58_64(const uint8_t* in, uint64_t* out) { - const uint64_t mask = 288230376151711743ULL; - uint64_t w0 = util::SafeLoadAs(in); - w0 = arrow::bit_util::FromLittleEndian(w0); - in += 8; - uint64_t w1 = util::SafeLoadAs(in); - w1 = arrow::bit_util::FromLittleEndian(w1); - in += 8; - uint64_t w2 = util::SafeLoadAs(in); - w2 = arrow::bit_util::FromLittleEndian(w2); - in += 8; - uint64_t w3 = util::SafeLoadAs(in); - w3 = arrow::bit_util::FromLittleEndian(w3); - in += 8; - uint64_t w4 = util::SafeLoadAs(in); - w4 = arrow::bit_util::FromLittleEndian(w4); - in += 8; - uint64_t w5 = util::SafeLoadAs(in); - w5 = arrow::bit_util::FromLittleEndian(w5); - in += 8; - uint64_t w6 = util::SafeLoadAs(in); - w6 = arrow::bit_util::FromLittleEndian(w6); - in += 8; - uint64_t w7 = util::SafeLoadAs(in); - w7 = arrow::bit_util::FromLittleEndian(w7); - in += 8; - uint64_t w8 = util::SafeLoadAs(in); - w8 = arrow::bit_util::FromLittleEndian(w8); - in += 8; - uint64_t w9 = util::SafeLoadAs(in); - w9 = arrow::bit_util::FromLittleEndian(w9); - in += 8; - uint64_t w10 = util::SafeLoadAs(in); - w10 = arrow::bit_util::FromLittleEndian(w10); - in += 8; - uint64_t w11 = util::SafeLoadAs(in); - w11 = arrow::bit_util::FromLittleEndian(w11); - in += 8; - uint64_t w12 = util::SafeLoadAs(in); - w12 = arrow::bit_util::FromLittleEndian(w12); - in += 8; - uint64_t w13 = util::SafeLoadAs(in); - w13 = arrow::bit_util::FromLittleEndian(w13); - in += 8; - uint64_t w14 = util::SafeLoadAs(in); - w14 = arrow::bit_util::FromLittleEndian(w14); - in += 8; - uint64_t w15 = util::SafeLoadAs(in); - w15 = arrow::bit_util::FromLittleEndian(w15); - in += 8; - uint64_t w16 = util::SafeLoadAs(in); - w16 = arrow::bit_util::FromLittleEndian(w16); - in += 8; - uint64_t w17 = util::SafeLoadAs(in); - w17 = arrow::bit_util::FromLittleEndian(w17); - in += 8; - uint64_t w18 = util::SafeLoadAs(in); - w18 = arrow::bit_util::FromLittleEndian(w18); - in += 8; - uint64_t w19 = util::SafeLoadAs(in); - w19 = arrow::bit_util::FromLittleEndian(w19); - in += 8; - uint64_t w20 = util::SafeLoadAs(in); - w20 = arrow::bit_util::FromLittleEndian(w20); - in += 8; - uint64_t w21 = util::SafeLoadAs(in); - w21 = arrow::bit_util::FromLittleEndian(w21); - in += 8; - uint64_t w22 = util::SafeLoadAs(in); - w22 = arrow::bit_util::FromLittleEndian(w22); - in += 8; - uint64_t w23 = util::SafeLoadAs(in); - w23 = arrow::bit_util::FromLittleEndian(w23); - in += 8; - uint64_t w24 = util::SafeLoadAs(in); - w24 = arrow::bit_util::FromLittleEndian(w24); - in += 8; - uint64_t w25 = util::SafeLoadAs(in); - w25 = arrow::bit_util::FromLittleEndian(w25); - in += 8; - uint64_t w26 = util::SafeLoadAs(in); - w26 = arrow::bit_util::FromLittleEndian(w26); - in += 8; - uint64_t w27 = util::SafeLoadAs(in); - w27 = arrow::bit_util::FromLittleEndian(w27); - in += 8; - uint64_t w28 = util::SafeLoadAs(in); - w28 = arrow::bit_util::FromLittleEndian(w28); - in += 8; - out[0] = (w0)&mask; - out[1] = ((w0 >> 58) | (w1 << 6)) & mask; - out[2] = ((w1 >> 52) | (w2 << 12)) & mask; - out[3] = ((w2 >> 46) | (w3 << 18)) & mask; - out[4] = ((w3 >> 40) | (w4 << 24)) & mask; - out[5] = ((w4 >> 34) | (w5 << 30)) & mask; - out[6] = ((w5 >> 28) | (w6 << 36)) & mask; - out[7] = ((w6 >> 22) | (w7 << 42)) & mask; - out[8] = ((w7 >> 16) | (w8 << 48)) & mask; - out[9] = ((w8 >> 10) | (w9 << 54)) & mask; - out[10] = (w9 >> 4) & mask; - out[11] = ((w9 >> 62) | (w10 << 2)) & mask; - out[12] = ((w10 >> 56) | (w11 << 8)) & mask; - out[13] = ((w11 >> 50) | (w12 << 14)) & mask; - out[14] = ((w12 >> 44) | (w13 << 20)) & mask; - out[15] = ((w13 >> 38) | (w14 << 26)) & mask; - out[16] = ((w14 >> 32) | (w15 << 32)) & mask; - out[17] = ((w15 >> 26) | (w16 << 38)) & mask; - out[18] = ((w16 >> 20) | (w17 << 44)) & mask; - out[19] = ((w17 >> 14) | (w18 << 50)) & mask; - out[20] = ((w18 >> 8) | (w19 << 56)) & mask; - out[21] = (w19 >> 2) & mask; - out[22] = ((w19 >> 60) | (w20 << 4)) & mask; - out[23] = ((w20 >> 54) | (w21 << 10)) & mask; - out[24] = ((w21 >> 48) | (w22 << 16)) & mask; - out[25] = ((w22 >> 42) | (w23 << 22)) & mask; - out[26] = ((w23 >> 36) | (w24 << 28)) & mask; - out[27] = ((w24 >> 30) | (w25 << 34)) & mask; - out[28] = ((w25 >> 24) | (w26 << 40)) & mask; - out[29] = ((w26 >> 18) | (w27 << 46)) & mask; - out[30] = ((w27 >> 12) | (w28 << 52)) & mask; - out[31] = w28 >> 6; - - return in; -} - -inline const uint8_t* unpack59_64(const uint8_t* in, uint64_t* out) { - const uint64_t mask = 576460752303423487ULL; - uint64_t w0 = util::SafeLoadAs(in); - w0 = arrow::bit_util::FromLittleEndian(w0); - in += 8; - uint64_t w1 = util::SafeLoadAs(in); - w1 = arrow::bit_util::FromLittleEndian(w1); - in += 8; - uint64_t w2 = util::SafeLoadAs(in); - w2 = arrow::bit_util::FromLittleEndian(w2); - in += 8; - uint64_t w3 = util::SafeLoadAs(in); - w3 = arrow::bit_util::FromLittleEndian(w3); - in += 8; - uint64_t w4 = util::SafeLoadAs(in); - w4 = arrow::bit_util::FromLittleEndian(w4); - in += 8; - uint64_t w5 = util::SafeLoadAs(in); - w5 = arrow::bit_util::FromLittleEndian(w5); - in += 8; - uint64_t w6 = util::SafeLoadAs(in); - w6 = arrow::bit_util::FromLittleEndian(w6); - in += 8; - uint64_t w7 = util::SafeLoadAs(in); - w7 = arrow::bit_util::FromLittleEndian(w7); - in += 8; - uint64_t w8 = util::SafeLoadAs(in); - w8 = arrow::bit_util::FromLittleEndian(w8); - in += 8; - uint64_t w9 = util::SafeLoadAs(in); - w9 = arrow::bit_util::FromLittleEndian(w9); - in += 8; - uint64_t w10 = util::SafeLoadAs(in); - w10 = arrow::bit_util::FromLittleEndian(w10); - in += 8; - uint64_t w11 = util::SafeLoadAs(in); - w11 = arrow::bit_util::FromLittleEndian(w11); - in += 8; - uint64_t w12 = util::SafeLoadAs(in); - w12 = arrow::bit_util::FromLittleEndian(w12); - in += 8; - uint64_t w13 = util::SafeLoadAs(in); - w13 = arrow::bit_util::FromLittleEndian(w13); - in += 8; - uint64_t w14 = util::SafeLoadAs(in); - w14 = arrow::bit_util::FromLittleEndian(w14); - in += 8; - uint64_t w15 = util::SafeLoadAs(in); - w15 = arrow::bit_util::FromLittleEndian(w15); - in += 8; - uint64_t w16 = util::SafeLoadAs(in); - w16 = arrow::bit_util::FromLittleEndian(w16); - in += 8; - uint64_t w17 = util::SafeLoadAs(in); - w17 = arrow::bit_util::FromLittleEndian(w17); - in += 8; - uint64_t w18 = util::SafeLoadAs(in); - w18 = arrow::bit_util::FromLittleEndian(w18); - in += 8; - uint64_t w19 = util::SafeLoadAs(in); - w19 = arrow::bit_util::FromLittleEndian(w19); - in += 8; - uint64_t w20 = util::SafeLoadAs(in); - w20 = arrow::bit_util::FromLittleEndian(w20); - in += 8; - uint64_t w21 = util::SafeLoadAs(in); - w21 = arrow::bit_util::FromLittleEndian(w21); - in += 8; - uint64_t w22 = util::SafeLoadAs(in); - w22 = arrow::bit_util::FromLittleEndian(w22); - in += 8; - uint64_t w23 = util::SafeLoadAs(in); - w23 = arrow::bit_util::FromLittleEndian(w23); - in += 8; - uint64_t w24 = util::SafeLoadAs(in); - w24 = arrow::bit_util::FromLittleEndian(w24); - in += 8; - uint64_t w25 = util::SafeLoadAs(in); - w25 = arrow::bit_util::FromLittleEndian(w25); - in += 8; - uint64_t w26 = util::SafeLoadAs(in); - w26 = arrow::bit_util::FromLittleEndian(w26); - in += 8; - uint64_t w27 = util::SafeLoadAs(in); - w27 = arrow::bit_util::FromLittleEndian(w27); - in += 8; - uint64_t w28 = util::SafeLoadAs(in); - w28 = arrow::bit_util::FromLittleEndian(w28); - in += 8; - uint64_t w29 = util::SafeLoadAs(in); - w29 = arrow::bit_util::FromLittleEndian(w29); - in += 4; - out[0] = (w0)&mask; - out[1] = ((w0 >> 59) | (w1 << 5)) & mask; - out[2] = ((w1 >> 54) | (w2 << 10)) & mask; - out[3] = ((w2 >> 49) | (w3 << 15)) & mask; - out[4] = ((w3 >> 44) | (w4 << 20)) & mask; - out[5] = ((w4 >> 39) | (w5 << 25)) & mask; - out[6] = ((w5 >> 34) | (w6 << 30)) & mask; - out[7] = ((w6 >> 29) | (w7 << 35)) & mask; - out[8] = ((w7 >> 24) | (w8 << 40)) & mask; - out[9] = ((w8 >> 19) | (w9 << 45)) & mask; - out[10] = ((w9 >> 14) | (w10 << 50)) & mask; - out[11] = ((w10 >> 9) | (w11 << 55)) & mask; - out[12] = (w11 >> 4) & mask; - out[13] = ((w11 >> 63) | (w12 << 1)) & mask; - out[14] = ((w12 >> 58) | (w13 << 6)) & mask; - out[15] = ((w13 >> 53) | (w14 << 11)) & mask; - out[16] = ((w14 >> 48) | (w15 << 16)) & mask; - out[17] = ((w15 >> 43) | (w16 << 21)) & mask; - out[18] = ((w16 >> 38) | (w17 << 26)) & mask; - out[19] = ((w17 >> 33) | (w18 << 31)) & mask; - out[20] = ((w18 >> 28) | (w19 << 36)) & mask; - out[21] = ((w19 >> 23) | (w20 << 41)) & mask; - out[22] = ((w20 >> 18) | (w21 << 46)) & mask; - out[23] = ((w21 >> 13) | (w22 << 51)) & mask; - out[24] = ((w22 >> 8) | (w23 << 56)) & mask; - out[25] = (w23 >> 3) & mask; - out[26] = ((w23 >> 62) | (w24 << 2)) & mask; - out[27] = ((w24 >> 57) | (w25 << 7)) & mask; - out[28] = ((w25 >> 52) | (w26 << 12)) & mask; - out[29] = ((w26 >> 47) | (w27 << 17)) & mask; - out[30] = ((w27 >> 42) | (w28 << 22)) & mask; - out[31] = ((w28 >> 37) | (w29 << 27)) & mask; - - return in; -} - -inline const uint8_t* unpack60_64(const uint8_t* in, uint64_t* out) { - const uint64_t mask = 1152921504606846975ULL; - uint64_t w0 = util::SafeLoadAs(in); - w0 = arrow::bit_util::FromLittleEndian(w0); - in += 8; - uint64_t w1 = util::SafeLoadAs(in); - w1 = arrow::bit_util::FromLittleEndian(w1); - in += 8; - uint64_t w2 = util::SafeLoadAs(in); - w2 = arrow::bit_util::FromLittleEndian(w2); - in += 8; - uint64_t w3 = util::SafeLoadAs(in); - w3 = arrow::bit_util::FromLittleEndian(w3); - in += 8; - uint64_t w4 = util::SafeLoadAs(in); - w4 = arrow::bit_util::FromLittleEndian(w4); - in += 8; - uint64_t w5 = util::SafeLoadAs(in); - w5 = arrow::bit_util::FromLittleEndian(w5); - in += 8; - uint64_t w6 = util::SafeLoadAs(in); - w6 = arrow::bit_util::FromLittleEndian(w6); - in += 8; - uint64_t w7 = util::SafeLoadAs(in); - w7 = arrow::bit_util::FromLittleEndian(w7); - in += 8; - uint64_t w8 = util::SafeLoadAs(in); - w8 = arrow::bit_util::FromLittleEndian(w8); - in += 8; - uint64_t w9 = util::SafeLoadAs(in); - w9 = arrow::bit_util::FromLittleEndian(w9); - in += 8; - uint64_t w10 = util::SafeLoadAs(in); - w10 = arrow::bit_util::FromLittleEndian(w10); - in += 8; - uint64_t w11 = util::SafeLoadAs(in); - w11 = arrow::bit_util::FromLittleEndian(w11); - in += 8; - uint64_t w12 = util::SafeLoadAs(in); - w12 = arrow::bit_util::FromLittleEndian(w12); - in += 8; - uint64_t w13 = util::SafeLoadAs(in); - w13 = arrow::bit_util::FromLittleEndian(w13); - in += 8; - uint64_t w14 = util::SafeLoadAs(in); - w14 = arrow::bit_util::FromLittleEndian(w14); - in += 8; - uint64_t w15 = util::SafeLoadAs(in); - w15 = arrow::bit_util::FromLittleEndian(w15); - in += 8; - uint64_t w16 = util::SafeLoadAs(in); - w16 = arrow::bit_util::FromLittleEndian(w16); - in += 8; - uint64_t w17 = util::SafeLoadAs(in); - w17 = arrow::bit_util::FromLittleEndian(w17); - in += 8; - uint64_t w18 = util::SafeLoadAs(in); - w18 = arrow::bit_util::FromLittleEndian(w18); - in += 8; - uint64_t w19 = util::SafeLoadAs(in); - w19 = arrow::bit_util::FromLittleEndian(w19); - in += 8; - uint64_t w20 = util::SafeLoadAs(in); - w20 = arrow::bit_util::FromLittleEndian(w20); - in += 8; - uint64_t w21 = util::SafeLoadAs(in); - w21 = arrow::bit_util::FromLittleEndian(w21); - in += 8; - uint64_t w22 = util::SafeLoadAs(in); - w22 = arrow::bit_util::FromLittleEndian(w22); - in += 8; - uint64_t w23 = util::SafeLoadAs(in); - w23 = arrow::bit_util::FromLittleEndian(w23); - in += 8; - uint64_t w24 = util::SafeLoadAs(in); - w24 = arrow::bit_util::FromLittleEndian(w24); - in += 8; - uint64_t w25 = util::SafeLoadAs(in); - w25 = arrow::bit_util::FromLittleEndian(w25); - in += 8; - uint64_t w26 = util::SafeLoadAs(in); - w26 = arrow::bit_util::FromLittleEndian(w26); - in += 8; - uint64_t w27 = util::SafeLoadAs(in); - w27 = arrow::bit_util::FromLittleEndian(w27); - in += 8; - uint64_t w28 = util::SafeLoadAs(in); - w28 = arrow::bit_util::FromLittleEndian(w28); - in += 8; - uint64_t w29 = util::SafeLoadAs(in); - w29 = arrow::bit_util::FromLittleEndian(w29); - in += 8; - out[0] = (w0)&mask; - out[1] = ((w0 >> 60) | (w1 << 4)) & mask; - out[2] = ((w1 >> 56) | (w2 << 8)) & mask; - out[3] = ((w2 >> 52) | (w3 << 12)) & mask; - out[4] = ((w3 >> 48) | (w4 << 16)) & mask; - out[5] = ((w4 >> 44) | (w5 << 20)) & mask; - out[6] = ((w5 >> 40) | (w6 << 24)) & mask; - out[7] = ((w6 >> 36) | (w7 << 28)) & mask; - out[8] = ((w7 >> 32) | (w8 << 32)) & mask; - out[9] = ((w8 >> 28) | (w9 << 36)) & mask; - out[10] = ((w9 >> 24) | (w10 << 40)) & mask; - out[11] = ((w10 >> 20) | (w11 << 44)) & mask; - out[12] = ((w11 >> 16) | (w12 << 48)) & mask; - out[13] = ((w12 >> 12) | (w13 << 52)) & mask; - out[14] = ((w13 >> 8) | (w14 << 56)) & mask; - out[15] = w14 >> 4; - out[16] = (w15)&mask; - out[17] = ((w15 >> 60) | (w16 << 4)) & mask; - out[18] = ((w16 >> 56) | (w17 << 8)) & mask; - out[19] = ((w17 >> 52) | (w18 << 12)) & mask; - out[20] = ((w18 >> 48) | (w19 << 16)) & mask; - out[21] = ((w19 >> 44) | (w20 << 20)) & mask; - out[22] = ((w20 >> 40) | (w21 << 24)) & mask; - out[23] = ((w21 >> 36) | (w22 << 28)) & mask; - out[24] = ((w22 >> 32) | (w23 << 32)) & mask; - out[25] = ((w23 >> 28) | (w24 << 36)) & mask; - out[26] = ((w24 >> 24) | (w25 << 40)) & mask; - out[27] = ((w25 >> 20) | (w26 << 44)) & mask; - out[28] = ((w26 >> 16) | (w27 << 48)) & mask; - out[29] = ((w27 >> 12) | (w28 << 52)) & mask; - out[30] = ((w28 >> 8) | (w29 << 56)) & mask; - out[31] = w29 >> 4; - - return in; -} - -inline const uint8_t* unpack61_64(const uint8_t* in, uint64_t* out) { - const uint64_t mask = 2305843009213693951ULL; - uint64_t w0 = util::SafeLoadAs(in); - w0 = arrow::bit_util::FromLittleEndian(w0); - in += 8; - uint64_t w1 = util::SafeLoadAs(in); - w1 = arrow::bit_util::FromLittleEndian(w1); - in += 8; - uint64_t w2 = util::SafeLoadAs(in); - w2 = arrow::bit_util::FromLittleEndian(w2); - in += 8; - uint64_t w3 = util::SafeLoadAs(in); - w3 = arrow::bit_util::FromLittleEndian(w3); - in += 8; - uint64_t w4 = util::SafeLoadAs(in); - w4 = arrow::bit_util::FromLittleEndian(w4); - in += 8; - uint64_t w5 = util::SafeLoadAs(in); - w5 = arrow::bit_util::FromLittleEndian(w5); - in += 8; - uint64_t w6 = util::SafeLoadAs(in); - w6 = arrow::bit_util::FromLittleEndian(w6); - in += 8; - uint64_t w7 = util::SafeLoadAs(in); - w7 = arrow::bit_util::FromLittleEndian(w7); - in += 8; - uint64_t w8 = util::SafeLoadAs(in); - w8 = arrow::bit_util::FromLittleEndian(w8); - in += 8; - uint64_t w9 = util::SafeLoadAs(in); - w9 = arrow::bit_util::FromLittleEndian(w9); - in += 8; - uint64_t w10 = util::SafeLoadAs(in); - w10 = arrow::bit_util::FromLittleEndian(w10); - in += 8; - uint64_t w11 = util::SafeLoadAs(in); - w11 = arrow::bit_util::FromLittleEndian(w11); - in += 8; - uint64_t w12 = util::SafeLoadAs(in); - w12 = arrow::bit_util::FromLittleEndian(w12); - in += 8; - uint64_t w13 = util::SafeLoadAs(in); - w13 = arrow::bit_util::FromLittleEndian(w13); - in += 8; - uint64_t w14 = util::SafeLoadAs(in); - w14 = arrow::bit_util::FromLittleEndian(w14); - in += 8; - uint64_t w15 = util::SafeLoadAs(in); - w15 = arrow::bit_util::FromLittleEndian(w15); - in += 8; - uint64_t w16 = util::SafeLoadAs(in); - w16 = arrow::bit_util::FromLittleEndian(w16); - in += 8; - uint64_t w17 = util::SafeLoadAs(in); - w17 = arrow::bit_util::FromLittleEndian(w17); - in += 8; - uint64_t w18 = util::SafeLoadAs(in); - w18 = arrow::bit_util::FromLittleEndian(w18); - in += 8; - uint64_t w19 = util::SafeLoadAs(in); - w19 = arrow::bit_util::FromLittleEndian(w19); - in += 8; - uint64_t w20 = util::SafeLoadAs(in); - w20 = arrow::bit_util::FromLittleEndian(w20); - in += 8; - uint64_t w21 = util::SafeLoadAs(in); - w21 = arrow::bit_util::FromLittleEndian(w21); - in += 8; - uint64_t w22 = util::SafeLoadAs(in); - w22 = arrow::bit_util::FromLittleEndian(w22); - in += 8; - uint64_t w23 = util::SafeLoadAs(in); - w23 = arrow::bit_util::FromLittleEndian(w23); - in += 8; - uint64_t w24 = util::SafeLoadAs(in); - w24 = arrow::bit_util::FromLittleEndian(w24); - in += 8; - uint64_t w25 = util::SafeLoadAs(in); - w25 = arrow::bit_util::FromLittleEndian(w25); - in += 8; - uint64_t w26 = util::SafeLoadAs(in); - w26 = arrow::bit_util::FromLittleEndian(w26); - in += 8; - uint64_t w27 = util::SafeLoadAs(in); - w27 = arrow::bit_util::FromLittleEndian(w27); - in += 8; - uint64_t w28 = util::SafeLoadAs(in); - w28 = arrow::bit_util::FromLittleEndian(w28); - in += 8; - uint64_t w29 = util::SafeLoadAs(in); - w29 = arrow::bit_util::FromLittleEndian(w29); - in += 8; - uint64_t w30 = util::SafeLoadAs(in); - w30 = arrow::bit_util::FromLittleEndian(w30); - in += 4; - out[0] = (w0)&mask; - out[1] = ((w0 >> 61) | (w1 << 3)) & mask; - out[2] = ((w1 >> 58) | (w2 << 6)) & mask; - out[3] = ((w2 >> 55) | (w3 << 9)) & mask; - out[4] = ((w3 >> 52) | (w4 << 12)) & mask; - out[5] = ((w4 >> 49) | (w5 << 15)) & mask; - out[6] = ((w5 >> 46) | (w6 << 18)) & mask; - out[7] = ((w6 >> 43) | (w7 << 21)) & mask; - out[8] = ((w7 >> 40) | (w8 << 24)) & mask; - out[9] = ((w8 >> 37) | (w9 << 27)) & mask; - out[10] = ((w9 >> 34) | (w10 << 30)) & mask; - out[11] = ((w10 >> 31) | (w11 << 33)) & mask; - out[12] = ((w11 >> 28) | (w12 << 36)) & mask; - out[13] = ((w12 >> 25) | (w13 << 39)) & mask; - out[14] = ((w13 >> 22) | (w14 << 42)) & mask; - out[15] = ((w14 >> 19) | (w15 << 45)) & mask; - out[16] = ((w15 >> 16) | (w16 << 48)) & mask; - out[17] = ((w16 >> 13) | (w17 << 51)) & mask; - out[18] = ((w17 >> 10) | (w18 << 54)) & mask; - out[19] = ((w18 >> 7) | (w19 << 57)) & mask; - out[20] = ((w19 >> 4) | (w20 << 60)) & mask; - out[21] = (w20 >> 1) & mask; - out[22] = ((w20 >> 62) | (w21 << 2)) & mask; - out[23] = ((w21 >> 59) | (w22 << 5)) & mask; - out[24] = ((w22 >> 56) | (w23 << 8)) & mask; - out[25] = ((w23 >> 53) | (w24 << 11)) & mask; - out[26] = ((w24 >> 50) | (w25 << 14)) & mask; - out[27] = ((w25 >> 47) | (w26 << 17)) & mask; - out[28] = ((w26 >> 44) | (w27 << 20)) & mask; - out[29] = ((w27 >> 41) | (w28 << 23)) & mask; - out[30] = ((w28 >> 38) | (w29 << 26)) & mask; - out[31] = ((w29 >> 35) | (w30 << 29)) & mask; - - return in; -} - -inline const uint8_t* unpack62_64(const uint8_t* in, uint64_t* out) { - const uint64_t mask = 4611686018427387903ULL; - uint64_t w0 = util::SafeLoadAs(in); - w0 = arrow::bit_util::FromLittleEndian(w0); - in += 8; - uint64_t w1 = util::SafeLoadAs(in); - w1 = arrow::bit_util::FromLittleEndian(w1); - in += 8; - uint64_t w2 = util::SafeLoadAs(in); - w2 = arrow::bit_util::FromLittleEndian(w2); - in += 8; - uint64_t w3 = util::SafeLoadAs(in); - w3 = arrow::bit_util::FromLittleEndian(w3); - in += 8; - uint64_t w4 = util::SafeLoadAs(in); - w4 = arrow::bit_util::FromLittleEndian(w4); - in += 8; - uint64_t w5 = util::SafeLoadAs(in); - w5 = arrow::bit_util::FromLittleEndian(w5); - in += 8; - uint64_t w6 = util::SafeLoadAs(in); - w6 = arrow::bit_util::FromLittleEndian(w6); - in += 8; - uint64_t w7 = util::SafeLoadAs(in); - w7 = arrow::bit_util::FromLittleEndian(w7); - in += 8; - uint64_t w8 = util::SafeLoadAs(in); - w8 = arrow::bit_util::FromLittleEndian(w8); - in += 8; - uint64_t w9 = util::SafeLoadAs(in); - w9 = arrow::bit_util::FromLittleEndian(w9); - in += 8; - uint64_t w10 = util::SafeLoadAs(in); - w10 = arrow::bit_util::FromLittleEndian(w10); - in += 8; - uint64_t w11 = util::SafeLoadAs(in); - w11 = arrow::bit_util::FromLittleEndian(w11); - in += 8; - uint64_t w12 = util::SafeLoadAs(in); - w12 = arrow::bit_util::FromLittleEndian(w12); - in += 8; - uint64_t w13 = util::SafeLoadAs(in); - w13 = arrow::bit_util::FromLittleEndian(w13); - in += 8; - uint64_t w14 = util::SafeLoadAs(in); - w14 = arrow::bit_util::FromLittleEndian(w14); - in += 8; - uint64_t w15 = util::SafeLoadAs(in); - w15 = arrow::bit_util::FromLittleEndian(w15); - in += 8; - uint64_t w16 = util::SafeLoadAs(in); - w16 = arrow::bit_util::FromLittleEndian(w16); - in += 8; - uint64_t w17 = util::SafeLoadAs(in); - w17 = arrow::bit_util::FromLittleEndian(w17); - in += 8; - uint64_t w18 = util::SafeLoadAs(in); - w18 = arrow::bit_util::FromLittleEndian(w18); - in += 8; - uint64_t w19 = util::SafeLoadAs(in); - w19 = arrow::bit_util::FromLittleEndian(w19); - in += 8; - uint64_t w20 = util::SafeLoadAs(in); - w20 = arrow::bit_util::FromLittleEndian(w20); - in += 8; - uint64_t w21 = util::SafeLoadAs(in); - w21 = arrow::bit_util::FromLittleEndian(w21); - in += 8; - uint64_t w22 = util::SafeLoadAs(in); - w22 = arrow::bit_util::FromLittleEndian(w22); - in += 8; - uint64_t w23 = util::SafeLoadAs(in); - w23 = arrow::bit_util::FromLittleEndian(w23); - in += 8; - uint64_t w24 = util::SafeLoadAs(in); - w24 = arrow::bit_util::FromLittleEndian(w24); - in += 8; - uint64_t w25 = util::SafeLoadAs(in); - w25 = arrow::bit_util::FromLittleEndian(w25); - in += 8; - uint64_t w26 = util::SafeLoadAs(in); - w26 = arrow::bit_util::FromLittleEndian(w26); - in += 8; - uint64_t w27 = util::SafeLoadAs(in); - w27 = arrow::bit_util::FromLittleEndian(w27); - in += 8; - uint64_t w28 = util::SafeLoadAs(in); - w28 = arrow::bit_util::FromLittleEndian(w28); - in += 8; - uint64_t w29 = util::SafeLoadAs(in); - w29 = arrow::bit_util::FromLittleEndian(w29); - in += 8; - uint64_t w30 = util::SafeLoadAs(in); - w30 = arrow::bit_util::FromLittleEndian(w30); - in += 8; - out[0] = (w0)&mask; - out[1] = ((w0 >> 62) | (w1 << 2)) & mask; - out[2] = ((w1 >> 60) | (w2 << 4)) & mask; - out[3] = ((w2 >> 58) | (w3 << 6)) & mask; - out[4] = ((w3 >> 56) | (w4 << 8)) & mask; - out[5] = ((w4 >> 54) | (w5 << 10)) & mask; - out[6] = ((w5 >> 52) | (w6 << 12)) & mask; - out[7] = ((w6 >> 50) | (w7 << 14)) & mask; - out[8] = ((w7 >> 48) | (w8 << 16)) & mask; - out[9] = ((w8 >> 46) | (w9 << 18)) & mask; - out[10] = ((w9 >> 44) | (w10 << 20)) & mask; - out[11] = ((w10 >> 42) | (w11 << 22)) & mask; - out[12] = ((w11 >> 40) | (w12 << 24)) & mask; - out[13] = ((w12 >> 38) | (w13 << 26)) & mask; - out[14] = ((w13 >> 36) | (w14 << 28)) & mask; - out[15] = ((w14 >> 34) | (w15 << 30)) & mask; - out[16] = ((w15 >> 32) | (w16 << 32)) & mask; - out[17] = ((w16 >> 30) | (w17 << 34)) & mask; - out[18] = ((w17 >> 28) | (w18 << 36)) & mask; - out[19] = ((w18 >> 26) | (w19 << 38)) & mask; - out[20] = ((w19 >> 24) | (w20 << 40)) & mask; - out[21] = ((w20 >> 22) | (w21 << 42)) & mask; - out[22] = ((w21 >> 20) | (w22 << 44)) & mask; - out[23] = ((w22 >> 18) | (w23 << 46)) & mask; - out[24] = ((w23 >> 16) | (w24 << 48)) & mask; - out[25] = ((w24 >> 14) | (w25 << 50)) & mask; - out[26] = ((w25 >> 12) | (w26 << 52)) & mask; - out[27] = ((w26 >> 10) | (w27 << 54)) & mask; - out[28] = ((w27 >> 8) | (w28 << 56)) & mask; - out[29] = ((w28 >> 6) | (w29 << 58)) & mask; - out[30] = ((w29 >> 4) | (w30 << 60)) & mask; - out[31] = w30 >> 2; - - return in; -} - -inline const uint8_t* unpack63_64(const uint8_t* in, uint64_t* out) { - const uint64_t mask = 9223372036854775807ULL; - uint64_t w0 = util::SafeLoadAs(in); - w0 = arrow::bit_util::FromLittleEndian(w0); - in += 8; - uint64_t w1 = util::SafeLoadAs(in); - w1 = arrow::bit_util::FromLittleEndian(w1); - in += 8; - uint64_t w2 = util::SafeLoadAs(in); - w2 = arrow::bit_util::FromLittleEndian(w2); - in += 8; - uint64_t w3 = util::SafeLoadAs(in); - w3 = arrow::bit_util::FromLittleEndian(w3); - in += 8; - uint64_t w4 = util::SafeLoadAs(in); - w4 = arrow::bit_util::FromLittleEndian(w4); - in += 8; - uint64_t w5 = util::SafeLoadAs(in); - w5 = arrow::bit_util::FromLittleEndian(w5); - in += 8; - uint64_t w6 = util::SafeLoadAs(in); - w6 = arrow::bit_util::FromLittleEndian(w6); - in += 8; - uint64_t w7 = util::SafeLoadAs(in); - w7 = arrow::bit_util::FromLittleEndian(w7); - in += 8; - uint64_t w8 = util::SafeLoadAs(in); - w8 = arrow::bit_util::FromLittleEndian(w8); - in += 8; - uint64_t w9 = util::SafeLoadAs(in); - w9 = arrow::bit_util::FromLittleEndian(w9); - in += 8; - uint64_t w10 = util::SafeLoadAs(in); - w10 = arrow::bit_util::FromLittleEndian(w10); - in += 8; - uint64_t w11 = util::SafeLoadAs(in); - w11 = arrow::bit_util::FromLittleEndian(w11); - in += 8; - uint64_t w12 = util::SafeLoadAs(in); - w12 = arrow::bit_util::FromLittleEndian(w12); - in += 8; - uint64_t w13 = util::SafeLoadAs(in); - w13 = arrow::bit_util::FromLittleEndian(w13); - in += 8; - uint64_t w14 = util::SafeLoadAs(in); - w14 = arrow::bit_util::FromLittleEndian(w14); - in += 8; - uint64_t w15 = util::SafeLoadAs(in); - w15 = arrow::bit_util::FromLittleEndian(w15); - in += 8; - uint64_t w16 = util::SafeLoadAs(in); - w16 = arrow::bit_util::FromLittleEndian(w16); - in += 8; - uint64_t w17 = util::SafeLoadAs(in); - w17 = arrow::bit_util::FromLittleEndian(w17); - in += 8; - uint64_t w18 = util::SafeLoadAs(in); - w18 = arrow::bit_util::FromLittleEndian(w18); - in += 8; - uint64_t w19 = util::SafeLoadAs(in); - w19 = arrow::bit_util::FromLittleEndian(w19); - in += 8; - uint64_t w20 = util::SafeLoadAs(in); - w20 = arrow::bit_util::FromLittleEndian(w20); - in += 8; - uint64_t w21 = util::SafeLoadAs(in); - w21 = arrow::bit_util::FromLittleEndian(w21); - in += 8; - uint64_t w22 = util::SafeLoadAs(in); - w22 = arrow::bit_util::FromLittleEndian(w22); - in += 8; - uint64_t w23 = util::SafeLoadAs(in); - w23 = arrow::bit_util::FromLittleEndian(w23); - in += 8; - uint64_t w24 = util::SafeLoadAs(in); - w24 = arrow::bit_util::FromLittleEndian(w24); - in += 8; - uint64_t w25 = util::SafeLoadAs(in); - w25 = arrow::bit_util::FromLittleEndian(w25); - in += 8; - uint64_t w26 = util::SafeLoadAs(in); - w26 = arrow::bit_util::FromLittleEndian(w26); - in += 8; - uint64_t w27 = util::SafeLoadAs(in); - w27 = arrow::bit_util::FromLittleEndian(w27); - in += 8; - uint64_t w28 = util::SafeLoadAs(in); - w28 = arrow::bit_util::FromLittleEndian(w28); - in += 8; - uint64_t w29 = util::SafeLoadAs(in); - w29 = arrow::bit_util::FromLittleEndian(w29); - in += 8; - uint64_t w30 = util::SafeLoadAs(in); - w30 = arrow::bit_util::FromLittleEndian(w30); - in += 8; - uint64_t w31 = util::SafeLoadAs(in); - w31 = arrow::bit_util::FromLittleEndian(w31); - in += 4; - out[0] = (w0)&mask; - out[1] = ((w0 >> 63) | (w1 << 1)) & mask; - out[2] = ((w1 >> 62) | (w2 << 2)) & mask; - out[3] = ((w2 >> 61) | (w3 << 3)) & mask; - out[4] = ((w3 >> 60) | (w4 << 4)) & mask; - out[5] = ((w4 >> 59) | (w5 << 5)) & mask; - out[6] = ((w5 >> 58) | (w6 << 6)) & mask; - out[7] = ((w6 >> 57) | (w7 << 7)) & mask; - out[8] = ((w7 >> 56) | (w8 << 8)) & mask; - out[9] = ((w8 >> 55) | (w9 << 9)) & mask; - out[10] = ((w9 >> 54) | (w10 << 10)) & mask; - out[11] = ((w10 >> 53) | (w11 << 11)) & mask; - out[12] = ((w11 >> 52) | (w12 << 12)) & mask; - out[13] = ((w12 >> 51) | (w13 << 13)) & mask; - out[14] = ((w13 >> 50) | (w14 << 14)) & mask; - out[15] = ((w14 >> 49) | (w15 << 15)) & mask; - out[16] = ((w15 >> 48) | (w16 << 16)) & mask; - out[17] = ((w16 >> 47) | (w17 << 17)) & mask; - out[18] = ((w17 >> 46) | (w18 << 18)) & mask; - out[19] = ((w18 >> 45) | (w19 << 19)) & mask; - out[20] = ((w19 >> 44) | (w20 << 20)) & mask; - out[21] = ((w20 >> 43) | (w21 << 21)) & mask; - out[22] = ((w21 >> 42) | (w22 << 22)) & mask; - out[23] = ((w22 >> 41) | (w23 << 23)) & mask; - out[24] = ((w23 >> 40) | (w24 << 24)) & mask; - out[25] = ((w24 >> 39) | (w25 << 25)) & mask; - out[26] = ((w25 >> 38) | (w26 << 26)) & mask; - out[27] = ((w26 >> 37) | (w27 << 27)) & mask; - out[28] = ((w27 >> 36) | (w28 << 28)) & mask; - out[29] = ((w28 >> 35) | (w29 << 29)) & mask; - out[30] = ((w29 >> 34) | (w30 << 30)) & mask; - out[31] = ((w30 >> 33) | (w31 << 31)) & mask; - - return in; -} - -inline const uint8_t* unpack64_64(const uint8_t* in, uint64_t* out) { - uint64_t w0 = util::SafeLoadAs(in); - w0 = arrow::bit_util::FromLittleEndian(w0); - in += 8; - uint64_t w1 = util::SafeLoadAs(in); - w1 = arrow::bit_util::FromLittleEndian(w1); - in += 8; - uint64_t w2 = util::SafeLoadAs(in); - w2 = arrow::bit_util::FromLittleEndian(w2); - in += 8; - uint64_t w3 = util::SafeLoadAs(in); - w3 = arrow::bit_util::FromLittleEndian(w3); - in += 8; - uint64_t w4 = util::SafeLoadAs(in); - w4 = arrow::bit_util::FromLittleEndian(w4); - in += 8; - uint64_t w5 = util::SafeLoadAs(in); - w5 = arrow::bit_util::FromLittleEndian(w5); - in += 8; - uint64_t w6 = util::SafeLoadAs(in); - w6 = arrow::bit_util::FromLittleEndian(w6); - in += 8; - uint64_t w7 = util::SafeLoadAs(in); - w7 = arrow::bit_util::FromLittleEndian(w7); - in += 8; - uint64_t w8 = util::SafeLoadAs(in); - w8 = arrow::bit_util::FromLittleEndian(w8); - in += 8; - uint64_t w9 = util::SafeLoadAs(in); - w9 = arrow::bit_util::FromLittleEndian(w9); - in += 8; - uint64_t w10 = util::SafeLoadAs(in); - w10 = arrow::bit_util::FromLittleEndian(w10); - in += 8; - uint64_t w11 = util::SafeLoadAs(in); - w11 = arrow::bit_util::FromLittleEndian(w11); - in += 8; - uint64_t w12 = util::SafeLoadAs(in); - w12 = arrow::bit_util::FromLittleEndian(w12); - in += 8; - uint64_t w13 = util::SafeLoadAs(in); - w13 = arrow::bit_util::FromLittleEndian(w13); - in += 8; - uint64_t w14 = util::SafeLoadAs(in); - w14 = arrow::bit_util::FromLittleEndian(w14); - in += 8; - uint64_t w15 = util::SafeLoadAs(in); - w15 = arrow::bit_util::FromLittleEndian(w15); - in += 8; - uint64_t w16 = util::SafeLoadAs(in); - w16 = arrow::bit_util::FromLittleEndian(w16); - in += 8; - uint64_t w17 = util::SafeLoadAs(in); - w17 = arrow::bit_util::FromLittleEndian(w17); - in += 8; - uint64_t w18 = util::SafeLoadAs(in); - w18 = arrow::bit_util::FromLittleEndian(w18); - in += 8; - uint64_t w19 = util::SafeLoadAs(in); - w19 = arrow::bit_util::FromLittleEndian(w19); - in += 8; - uint64_t w20 = util::SafeLoadAs(in); - w20 = arrow::bit_util::FromLittleEndian(w20); - in += 8; - uint64_t w21 = util::SafeLoadAs(in); - w21 = arrow::bit_util::FromLittleEndian(w21); - in += 8; - uint64_t w22 = util::SafeLoadAs(in); - w22 = arrow::bit_util::FromLittleEndian(w22); - in += 8; - uint64_t w23 = util::SafeLoadAs(in); - w23 = arrow::bit_util::FromLittleEndian(w23); - in += 8; - uint64_t w24 = util::SafeLoadAs(in); - w24 = arrow::bit_util::FromLittleEndian(w24); - in += 8; - uint64_t w25 = util::SafeLoadAs(in); - w25 = arrow::bit_util::FromLittleEndian(w25); - in += 8; - uint64_t w26 = util::SafeLoadAs(in); - w26 = arrow::bit_util::FromLittleEndian(w26); - in += 8; - uint64_t w27 = util::SafeLoadAs(in); - w27 = arrow::bit_util::FromLittleEndian(w27); - in += 8; - uint64_t w28 = util::SafeLoadAs(in); - w28 = arrow::bit_util::FromLittleEndian(w28); - in += 8; - uint64_t w29 = util::SafeLoadAs(in); - w29 = arrow::bit_util::FromLittleEndian(w29); - in += 8; - uint64_t w30 = util::SafeLoadAs(in); - w30 = arrow::bit_util::FromLittleEndian(w30); - in += 8; - uint64_t w31 = util::SafeLoadAs(in); - w31 = arrow::bit_util::FromLittleEndian(w31); - in += 8; - out[0] = w0; - out[1] = w1; - out[2] = w2; - out[3] = w3; - out[4] = w4; - out[5] = w5; - out[6] = w6; - out[7] = w7; - out[8] = w8; - out[9] = w9; - out[10] = w10; - out[11] = w11; - out[12] = w12; - out[13] = w13; - out[14] = w14; - out[15] = w15; - out[16] = w16; - out[17] = w17; - out[18] = w18; - out[19] = w19; - out[20] = w20; - out[21] = w21; - out[22] = w22; - out[23] = w23; - out[24] = w24; - out[25] = w25; - out[26] = w26; - out[27] = w27; - out[28] = w28; - out[29] = w29; - out[30] = w30; - out[31] = w31; - - return in; -} - -} // namespace arrow::internal diff --git a/cpp/src/arrow/util/bpacking_default_internal.h b/cpp/src/arrow/util/bpacking_default_internal.h deleted file mode 100644 index 4c661dcce37..00000000000 --- a/cpp/src/arrow/util/bpacking_default_internal.h +++ /dev/null @@ -1,4251 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -// This file was modified from its original version for inclusion in parquet-cpp. -// Original source: -// https://github.com/lemire/FrameOfReference/blob/6ccaf9e97160f9a3b299e23a8ef739e711ef0c71/src/bpacking.cpp -// The original copyright notice follows. - -// This code is released under the -// Apache License Version 2.0 http://www.apache.org/licenses/. -// (c) Daniel Lemire 2013 - -#pragma once - -#include "arrow/util/bit_util.h" -#include "arrow/util/ubsan.h" - -namespace arrow { -namespace internal { - -inline const uint32_t* unpack1_32(const uint32_t* in, uint32_t* out) { - uint32_t inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out = (inl >> 0) & 1; - out++; - *out = (inl >> 1) & 1; - out++; - *out = (inl >> 2) & 1; - out++; - *out = (inl >> 3) & 1; - out++; - *out = (inl >> 4) & 1; - out++; - *out = (inl >> 5) & 1; - out++; - *out = (inl >> 6) & 1; - out++; - *out = (inl >> 7) & 1; - out++; - *out = (inl >> 8) & 1; - out++; - *out = (inl >> 9) & 1; - out++; - *out = (inl >> 10) & 1; - out++; - *out = (inl >> 11) & 1; - out++; - *out = (inl >> 12) & 1; - out++; - *out = (inl >> 13) & 1; - out++; - *out = (inl >> 14) & 1; - out++; - *out = (inl >> 15) & 1; - out++; - *out = (inl >> 16) & 1; - out++; - *out = (inl >> 17) & 1; - out++; - *out = (inl >> 18) & 1; - out++; - *out = (inl >> 19) & 1; - out++; - *out = (inl >> 20) & 1; - out++; - *out = (inl >> 21) & 1; - out++; - *out = (inl >> 22) & 1; - out++; - *out = (inl >> 23) & 1; - out++; - *out = (inl >> 24) & 1; - out++; - *out = (inl >> 25) & 1; - out++; - *out = (inl >> 26) & 1; - out++; - *out = (inl >> 27) & 1; - out++; - *out = (inl >> 28) & 1; - out++; - *out = (inl >> 29) & 1; - out++; - *out = (inl >> 30) & 1; - out++; - *out = (inl >> 31); - ++in; - out++; - - return in; -} - -inline const uint32_t* unpack2_32(const uint32_t* in, uint32_t* out) { - uint32_t inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out = (inl >> 0) % (1U << 2); - out++; - *out = (inl >> 2) % (1U << 2); - out++; - *out = (inl >> 4) % (1U << 2); - out++; - *out = (inl >> 6) % (1U << 2); - out++; - *out = (inl >> 8) % (1U << 2); - out++; - *out = (inl >> 10) % (1U << 2); - out++; - *out = (inl >> 12) % (1U << 2); - out++; - *out = (inl >> 14) % (1U << 2); - out++; - *out = (inl >> 16) % (1U << 2); - out++; - *out = (inl >> 18) % (1U << 2); - out++; - *out = (inl >> 20) % (1U << 2); - out++; - *out = (inl >> 22) % (1U << 2); - out++; - *out = (inl >> 24) % (1U << 2); - out++; - *out = (inl >> 26) % (1U << 2); - out++; - *out = (inl >> 28) % (1U << 2); - out++; - *out = (inl >> 30); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - out++; - *out = (inl >> 0) % (1U << 2); - out++; - *out = (inl >> 2) % (1U << 2); - out++; - *out = (inl >> 4) % (1U << 2); - out++; - *out = (inl >> 6) % (1U << 2); - out++; - *out = (inl >> 8) % (1U << 2); - out++; - *out = (inl >> 10) % (1U << 2); - out++; - *out = (inl >> 12) % (1U << 2); - out++; - *out = (inl >> 14) % (1U << 2); - out++; - *out = (inl >> 16) % (1U << 2); - out++; - *out = (inl >> 18) % (1U << 2); - out++; - *out = (inl >> 20) % (1U << 2); - out++; - *out = (inl >> 22) % (1U << 2); - out++; - *out = (inl >> 24) % (1U << 2); - out++; - *out = (inl >> 26) % (1U << 2); - out++; - *out = (inl >> 28) % (1U << 2); - out++; - *out = (inl >> 30); - ++in; - out++; - - return in; -} - -inline const uint32_t* unpack3_32(const uint32_t* in, uint32_t* out) { - uint32_t inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out = (inl >> 0) % (1U << 3); - out++; - *out = (inl >> 3) % (1U << 3); - out++; - *out = (inl >> 6) % (1U << 3); - out++; - *out = (inl >> 9) % (1U << 3); - out++; - *out = (inl >> 12) % (1U << 3); - out++; - *out = (inl >> 15) % (1U << 3); - out++; - *out = (inl >> 18) % (1U << 3); - out++; - *out = (inl >> 21) % (1U << 3); - out++; - *out = (inl >> 24) % (1U << 3); - out++; - *out = (inl >> 27) % (1U << 3); - out++; - *out = (inl >> 30); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 1)) << (3 - 1); - out++; - *out = (inl >> 1) % (1U << 3); - out++; - *out = (inl >> 4) % (1U << 3); - out++; - *out = (inl >> 7) % (1U << 3); - out++; - *out = (inl >> 10) % (1U << 3); - out++; - *out = (inl >> 13) % (1U << 3); - out++; - *out = (inl >> 16) % (1U << 3); - out++; - *out = (inl >> 19) % (1U << 3); - out++; - *out = (inl >> 22) % (1U << 3); - out++; - *out = (inl >> 25) % (1U << 3); - out++; - *out = (inl >> 28) % (1U << 3); - out++; - *out = (inl >> 31); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 2)) << (3 - 2); - out++; - *out = (inl >> 2) % (1U << 3); - out++; - *out = (inl >> 5) % (1U << 3); - out++; - *out = (inl >> 8) % (1U << 3); - out++; - *out = (inl >> 11) % (1U << 3); - out++; - *out = (inl >> 14) % (1U << 3); - out++; - *out = (inl >> 17) % (1U << 3); - out++; - *out = (inl >> 20) % (1U << 3); - out++; - *out = (inl >> 23) % (1U << 3); - out++; - *out = (inl >> 26) % (1U << 3); - out++; - *out = (inl >> 29); - ++in; - out++; - - return in; -} - -inline const uint32_t* unpack4_32(const uint32_t* in, uint32_t* out) { - uint32_t inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out = (inl >> 0) % (1U << 4); - out++; - *out = (inl >> 4) % (1U << 4); - out++; - *out = (inl >> 8) % (1U << 4); - out++; - *out = (inl >> 12) % (1U << 4); - out++; - *out = (inl >> 16) % (1U << 4); - out++; - *out = (inl >> 20) % (1U << 4); - out++; - *out = (inl >> 24) % (1U << 4); - out++; - *out = (inl >> 28); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - out++; - *out = (inl >> 0) % (1U << 4); - out++; - *out = (inl >> 4) % (1U << 4); - out++; - *out = (inl >> 8) % (1U << 4); - out++; - *out = (inl >> 12) % (1U << 4); - out++; - *out = (inl >> 16) % (1U << 4); - out++; - *out = (inl >> 20) % (1U << 4); - out++; - *out = (inl >> 24) % (1U << 4); - out++; - *out = (inl >> 28); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - out++; - *out = (inl >> 0) % (1U << 4); - out++; - *out = (inl >> 4) % (1U << 4); - out++; - *out = (inl >> 8) % (1U << 4); - out++; - *out = (inl >> 12) % (1U << 4); - out++; - *out = (inl >> 16) % (1U << 4); - out++; - *out = (inl >> 20) % (1U << 4); - out++; - *out = (inl >> 24) % (1U << 4); - out++; - *out = (inl >> 28); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - out++; - *out = (inl >> 0) % (1U << 4); - out++; - *out = (inl >> 4) % (1U << 4); - out++; - *out = (inl >> 8) % (1U << 4); - out++; - *out = (inl >> 12) % (1U << 4); - out++; - *out = (inl >> 16) % (1U << 4); - out++; - *out = (inl >> 20) % (1U << 4); - out++; - *out = (inl >> 24) % (1U << 4); - out++; - *out = (inl >> 28); - ++in; - out++; - - return in; -} - -inline const uint32_t* unpack5_32(const uint32_t* in, uint32_t* out) { - uint32_t inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out = (inl >> 0) % (1U << 5); - out++; - *out = (inl >> 5) % (1U << 5); - out++; - *out = (inl >> 10) % (1U << 5); - out++; - *out = (inl >> 15) % (1U << 5); - out++; - *out = (inl >> 20) % (1U << 5); - out++; - *out = (inl >> 25) % (1U << 5); - out++; - *out = (inl >> 30); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 3)) << (5 - 3); - out++; - *out = (inl >> 3) % (1U << 5); - out++; - *out = (inl >> 8) % (1U << 5); - out++; - *out = (inl >> 13) % (1U << 5); - out++; - *out = (inl >> 18) % (1U << 5); - out++; - *out = (inl >> 23) % (1U << 5); - out++; - *out = (inl >> 28); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 1)) << (5 - 1); - out++; - *out = (inl >> 1) % (1U << 5); - out++; - *out = (inl >> 6) % (1U << 5); - out++; - *out = (inl >> 11) % (1U << 5); - out++; - *out = (inl >> 16) % (1U << 5); - out++; - *out = (inl >> 21) % (1U << 5); - out++; - *out = (inl >> 26) % (1U << 5); - out++; - *out = (inl >> 31); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 4)) << (5 - 4); - out++; - *out = (inl >> 4) % (1U << 5); - out++; - *out = (inl >> 9) % (1U << 5); - out++; - *out = (inl >> 14) % (1U << 5); - out++; - *out = (inl >> 19) % (1U << 5); - out++; - *out = (inl >> 24) % (1U << 5); - out++; - *out = (inl >> 29); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 2)) << (5 - 2); - out++; - *out = (inl >> 2) % (1U << 5); - out++; - *out = (inl >> 7) % (1U << 5); - out++; - *out = (inl >> 12) % (1U << 5); - out++; - *out = (inl >> 17) % (1U << 5); - out++; - *out = (inl >> 22) % (1U << 5); - out++; - *out = (inl >> 27); - ++in; - out++; - - return in; -} - -inline const uint32_t* unpack6_32(const uint32_t* in, uint32_t* out) { - uint32_t inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out = (inl >> 0) % (1U << 6); - out++; - *out = (inl >> 6) % (1U << 6); - out++; - *out = (inl >> 12) % (1U << 6); - out++; - *out = (inl >> 18) % (1U << 6); - out++; - *out = (inl >> 24) % (1U << 6); - out++; - *out = (inl >> 30); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 4)) << (6 - 4); - out++; - *out = (inl >> 4) % (1U << 6); - out++; - *out = (inl >> 10) % (1U << 6); - out++; - *out = (inl >> 16) % (1U << 6); - out++; - *out = (inl >> 22) % (1U << 6); - out++; - *out = (inl >> 28); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 2)) << (6 - 2); - out++; - *out = (inl >> 2) % (1U << 6); - out++; - *out = (inl >> 8) % (1U << 6); - out++; - *out = (inl >> 14) % (1U << 6); - out++; - *out = (inl >> 20) % (1U << 6); - out++; - *out = (inl >> 26); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - out++; - *out = (inl >> 0) % (1U << 6); - out++; - *out = (inl >> 6) % (1U << 6); - out++; - *out = (inl >> 12) % (1U << 6); - out++; - *out = (inl >> 18) % (1U << 6); - out++; - *out = (inl >> 24) % (1U << 6); - out++; - *out = (inl >> 30); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 4)) << (6 - 4); - out++; - *out = (inl >> 4) % (1U << 6); - out++; - *out = (inl >> 10) % (1U << 6); - out++; - *out = (inl >> 16) % (1U << 6); - out++; - *out = (inl >> 22) % (1U << 6); - out++; - *out = (inl >> 28); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 2)) << (6 - 2); - out++; - *out = (inl >> 2) % (1U << 6); - out++; - *out = (inl >> 8) % (1U << 6); - out++; - *out = (inl >> 14) % (1U << 6); - out++; - *out = (inl >> 20) % (1U << 6); - out++; - *out = (inl >> 26); - ++in; - out++; - - return in; -} - -inline const uint32_t* unpack7_32(const uint32_t* in, uint32_t* out) { - uint32_t inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out = (inl >> 0) % (1U << 7); - out++; - *out = (inl >> 7) % (1U << 7); - out++; - *out = (inl >> 14) % (1U << 7); - out++; - *out = (inl >> 21) % (1U << 7); - out++; - *out = (inl >> 28); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 3)) << (7 - 3); - out++; - *out = (inl >> 3) % (1U << 7); - out++; - *out = (inl >> 10) % (1U << 7); - out++; - *out = (inl >> 17) % (1U << 7); - out++; - *out = (inl >> 24) % (1U << 7); - out++; - *out = (inl >> 31); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 6)) << (7 - 6); - out++; - *out = (inl >> 6) % (1U << 7); - out++; - *out = (inl >> 13) % (1U << 7); - out++; - *out = (inl >> 20) % (1U << 7); - out++; - *out = (inl >> 27); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 2)) << (7 - 2); - out++; - *out = (inl >> 2) % (1U << 7); - out++; - *out = (inl >> 9) % (1U << 7); - out++; - *out = (inl >> 16) % (1U << 7); - out++; - *out = (inl >> 23) % (1U << 7); - out++; - *out = (inl >> 30); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 5)) << (7 - 5); - out++; - *out = (inl >> 5) % (1U << 7); - out++; - *out = (inl >> 12) % (1U << 7); - out++; - *out = (inl >> 19) % (1U << 7); - out++; - *out = (inl >> 26); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 1)) << (7 - 1); - out++; - *out = (inl >> 1) % (1U << 7); - out++; - *out = (inl >> 8) % (1U << 7); - out++; - *out = (inl >> 15) % (1U << 7); - out++; - *out = (inl >> 22) % (1U << 7); - out++; - *out = (inl >> 29); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 4)) << (7 - 4); - out++; - *out = (inl >> 4) % (1U << 7); - out++; - *out = (inl >> 11) % (1U << 7); - out++; - *out = (inl >> 18) % (1U << 7); - out++; - *out = (inl >> 25); - ++in; - out++; - - return in; -} - -inline const uint32_t* unpack8_32(const uint32_t* in, uint32_t* out) { - uint32_t inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out = (inl >> 0) % (1U << 8); - out++; - *out = (inl >> 8) % (1U << 8); - out++; - *out = (inl >> 16) % (1U << 8); - out++; - *out = (inl >> 24); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - out++; - *out = (inl >> 0) % (1U << 8); - out++; - *out = (inl >> 8) % (1U << 8); - out++; - *out = (inl >> 16) % (1U << 8); - out++; - *out = (inl >> 24); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - out++; - *out = (inl >> 0) % (1U << 8); - out++; - *out = (inl >> 8) % (1U << 8); - out++; - *out = (inl >> 16) % (1U << 8); - out++; - *out = (inl >> 24); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - out++; - *out = (inl >> 0) % (1U << 8); - out++; - *out = (inl >> 8) % (1U << 8); - out++; - *out = (inl >> 16) % (1U << 8); - out++; - *out = (inl >> 24); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - out++; - *out = (inl >> 0) % (1U << 8); - out++; - *out = (inl >> 8) % (1U << 8); - out++; - *out = (inl >> 16) % (1U << 8); - out++; - *out = (inl >> 24); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - out++; - *out = (inl >> 0) % (1U << 8); - out++; - *out = (inl >> 8) % (1U << 8); - out++; - *out = (inl >> 16) % (1U << 8); - out++; - *out = (inl >> 24); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - out++; - *out = (inl >> 0) % (1U << 8); - out++; - *out = (inl >> 8) % (1U << 8); - out++; - *out = (inl >> 16) % (1U << 8); - out++; - *out = (inl >> 24); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - out++; - *out = (inl >> 0) % (1U << 8); - out++; - *out = (inl >> 8) % (1U << 8); - out++; - *out = (inl >> 16) % (1U << 8); - out++; - *out = (inl >> 24); - ++in; - out++; - - return in; -} - -inline const uint32_t* unpack9_32(const uint32_t* in, uint32_t* out) { - uint32_t inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out = (inl >> 0) % (1U << 9); - out++; - *out = (inl >> 9) % (1U << 9); - out++; - *out = (inl >> 18) % (1U << 9); - out++; - *out = (inl >> 27); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 4)) << (9 - 4); - out++; - *out = (inl >> 4) % (1U << 9); - out++; - *out = (inl >> 13) % (1U << 9); - out++; - *out = (inl >> 22) % (1U << 9); - out++; - *out = (inl >> 31); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 8)) << (9 - 8); - out++; - *out = (inl >> 8) % (1U << 9); - out++; - *out = (inl >> 17) % (1U << 9); - out++; - *out = (inl >> 26); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 3)) << (9 - 3); - out++; - *out = (inl >> 3) % (1U << 9); - out++; - *out = (inl >> 12) % (1U << 9); - out++; - *out = (inl >> 21) % (1U << 9); - out++; - *out = (inl >> 30); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 7)) << (9 - 7); - out++; - *out = (inl >> 7) % (1U << 9); - out++; - *out = (inl >> 16) % (1U << 9); - out++; - *out = (inl >> 25); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 2)) << (9 - 2); - out++; - *out = (inl >> 2) % (1U << 9); - out++; - *out = (inl >> 11) % (1U << 9); - out++; - *out = (inl >> 20) % (1U << 9); - out++; - *out = (inl >> 29); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 6)) << (9 - 6); - out++; - *out = (inl >> 6) % (1U << 9); - out++; - *out = (inl >> 15) % (1U << 9); - out++; - *out = (inl >> 24); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 1)) << (9 - 1); - out++; - *out = (inl >> 1) % (1U << 9); - out++; - *out = (inl >> 10) % (1U << 9); - out++; - *out = (inl >> 19) % (1U << 9); - out++; - *out = (inl >> 28); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 5)) << (9 - 5); - out++; - *out = (inl >> 5) % (1U << 9); - out++; - *out = (inl >> 14) % (1U << 9); - out++; - *out = (inl >> 23); - ++in; - out++; - - return in; -} - -inline const uint32_t* unpack10_32(const uint32_t* in, uint32_t* out) { - uint32_t inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out = (inl >> 0) % (1U << 10); - out++; - *out = (inl >> 10) % (1U << 10); - out++; - *out = (inl >> 20) % (1U << 10); - out++; - *out = (inl >> 30); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 8)) << (10 - 8); - out++; - *out = (inl >> 8) % (1U << 10); - out++; - *out = (inl >> 18) % (1U << 10); - out++; - *out = (inl >> 28); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 6)) << (10 - 6); - out++; - *out = (inl >> 6) % (1U << 10); - out++; - *out = (inl >> 16) % (1U << 10); - out++; - *out = (inl >> 26); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 4)) << (10 - 4); - out++; - *out = (inl >> 4) % (1U << 10); - out++; - *out = (inl >> 14) % (1U << 10); - out++; - *out = (inl >> 24); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 2)) << (10 - 2); - out++; - *out = (inl >> 2) % (1U << 10); - out++; - *out = (inl >> 12) % (1U << 10); - out++; - *out = (inl >> 22); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - out++; - *out = (inl >> 0) % (1U << 10); - out++; - *out = (inl >> 10) % (1U << 10); - out++; - *out = (inl >> 20) % (1U << 10); - out++; - *out = (inl >> 30); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 8)) << (10 - 8); - out++; - *out = (inl >> 8) % (1U << 10); - out++; - *out = (inl >> 18) % (1U << 10); - out++; - *out = (inl >> 28); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 6)) << (10 - 6); - out++; - *out = (inl >> 6) % (1U << 10); - out++; - *out = (inl >> 16) % (1U << 10); - out++; - *out = (inl >> 26); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 4)) << (10 - 4); - out++; - *out = (inl >> 4) % (1U << 10); - out++; - *out = (inl >> 14) % (1U << 10); - out++; - *out = (inl >> 24); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 2)) << (10 - 2); - out++; - *out = (inl >> 2) % (1U << 10); - out++; - *out = (inl >> 12) % (1U << 10); - out++; - *out = (inl >> 22); - ++in; - out++; - - return in; -} - -inline const uint32_t* unpack11_32(const uint32_t* in, uint32_t* out) { - uint32_t inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out = (inl >> 0) % (1U << 11); - out++; - *out = (inl >> 11) % (1U << 11); - out++; - *out = (inl >> 22); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 1)) << (11 - 1); - out++; - *out = (inl >> 1) % (1U << 11); - out++; - *out = (inl >> 12) % (1U << 11); - out++; - *out = (inl >> 23); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 2)) << (11 - 2); - out++; - *out = (inl >> 2) % (1U << 11); - out++; - *out = (inl >> 13) % (1U << 11); - out++; - *out = (inl >> 24); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 3)) << (11 - 3); - out++; - *out = (inl >> 3) % (1U << 11); - out++; - *out = (inl >> 14) % (1U << 11); - out++; - *out = (inl >> 25); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 4)) << (11 - 4); - out++; - *out = (inl >> 4) % (1U << 11); - out++; - *out = (inl >> 15) % (1U << 11); - out++; - *out = (inl >> 26); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 5)) << (11 - 5); - out++; - *out = (inl >> 5) % (1U << 11); - out++; - *out = (inl >> 16) % (1U << 11); - out++; - *out = (inl >> 27); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 6)) << (11 - 6); - out++; - *out = (inl >> 6) % (1U << 11); - out++; - *out = (inl >> 17) % (1U << 11); - out++; - *out = (inl >> 28); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 7)) << (11 - 7); - out++; - *out = (inl >> 7) % (1U << 11); - out++; - *out = (inl >> 18) % (1U << 11); - out++; - *out = (inl >> 29); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 8)) << (11 - 8); - out++; - *out = (inl >> 8) % (1U << 11); - out++; - *out = (inl >> 19) % (1U << 11); - out++; - *out = (inl >> 30); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 9)) << (11 - 9); - out++; - *out = (inl >> 9) % (1U << 11); - out++; - *out = (inl >> 20) % (1U << 11); - out++; - *out = (inl >> 31); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 10)) << (11 - 10); - out++; - *out = (inl >> 10) % (1U << 11); - out++; - *out = (inl >> 21); - ++in; - out++; - - return in; -} - -inline const uint32_t* unpack12_32(const uint32_t* in, uint32_t* out) { - uint32_t inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out = (inl >> 0) % (1U << 12); - out++; - *out = (inl >> 12) % (1U << 12); - out++; - *out = (inl >> 24); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 4)) << (12 - 4); - out++; - *out = (inl >> 4) % (1U << 12); - out++; - *out = (inl >> 16) % (1U << 12); - out++; - *out = (inl >> 28); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 8)) << (12 - 8); - out++; - *out = (inl >> 8) % (1U << 12); - out++; - *out = (inl >> 20); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - out++; - *out = (inl >> 0) % (1U << 12); - out++; - *out = (inl >> 12) % (1U << 12); - out++; - *out = (inl >> 24); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 4)) << (12 - 4); - out++; - *out = (inl >> 4) % (1U << 12); - out++; - *out = (inl >> 16) % (1U << 12); - out++; - *out = (inl >> 28); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 8)) << (12 - 8); - out++; - *out = (inl >> 8) % (1U << 12); - out++; - *out = (inl >> 20); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - out++; - *out = (inl >> 0) % (1U << 12); - out++; - *out = (inl >> 12) % (1U << 12); - out++; - *out = (inl >> 24); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 4)) << (12 - 4); - out++; - *out = (inl >> 4) % (1U << 12); - out++; - *out = (inl >> 16) % (1U << 12); - out++; - *out = (inl >> 28); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 8)) << (12 - 8); - out++; - *out = (inl >> 8) % (1U << 12); - out++; - *out = (inl >> 20); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - out++; - *out = (inl >> 0) % (1U << 12); - out++; - *out = (inl >> 12) % (1U << 12); - out++; - *out = (inl >> 24); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 4)) << (12 - 4); - out++; - *out = (inl >> 4) % (1U << 12); - out++; - *out = (inl >> 16) % (1U << 12); - out++; - *out = (inl >> 28); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 8)) << (12 - 8); - out++; - *out = (inl >> 8) % (1U << 12); - out++; - *out = (inl >> 20); - ++in; - out++; - - return in; -} - -inline const uint32_t* unpack13_32(const uint32_t* in, uint32_t* out) { - uint32_t inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out = (inl >> 0) % (1U << 13); - out++; - *out = (inl >> 13) % (1U << 13); - out++; - *out = (inl >> 26); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 7)) << (13 - 7); - out++; - *out = (inl >> 7) % (1U << 13); - out++; - *out = (inl >> 20); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 1)) << (13 - 1); - out++; - *out = (inl >> 1) % (1U << 13); - out++; - *out = (inl >> 14) % (1U << 13); - out++; - *out = (inl >> 27); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 8)) << (13 - 8); - out++; - *out = (inl >> 8) % (1U << 13); - out++; - *out = (inl >> 21); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 2)) << (13 - 2); - out++; - *out = (inl >> 2) % (1U << 13); - out++; - *out = (inl >> 15) % (1U << 13); - out++; - *out = (inl >> 28); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 9)) << (13 - 9); - out++; - *out = (inl >> 9) % (1U << 13); - out++; - *out = (inl >> 22); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 3)) << (13 - 3); - out++; - *out = (inl >> 3) % (1U << 13); - out++; - *out = (inl >> 16) % (1U << 13); - out++; - *out = (inl >> 29); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 10)) << (13 - 10); - out++; - *out = (inl >> 10) % (1U << 13); - out++; - *out = (inl >> 23); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 4)) << (13 - 4); - out++; - *out = (inl >> 4) % (1U << 13); - out++; - *out = (inl >> 17) % (1U << 13); - out++; - *out = (inl >> 30); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 11)) << (13 - 11); - out++; - *out = (inl >> 11) % (1U << 13); - out++; - *out = (inl >> 24); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 5)) << (13 - 5); - out++; - *out = (inl >> 5) % (1U << 13); - out++; - *out = (inl >> 18) % (1U << 13); - out++; - *out = (inl >> 31); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 12)) << (13 - 12); - out++; - *out = (inl >> 12) % (1U << 13); - out++; - *out = (inl >> 25); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 6)) << (13 - 6); - out++; - *out = (inl >> 6) % (1U << 13); - out++; - *out = (inl >> 19); - ++in; - out++; - - return in; -} - -inline const uint32_t* unpack14_32(const uint32_t* in, uint32_t* out) { - uint32_t inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out = (inl >> 0) % (1U << 14); - out++; - *out = (inl >> 14) % (1U << 14); - out++; - *out = (inl >> 28); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 10)) << (14 - 10); - out++; - *out = (inl >> 10) % (1U << 14); - out++; - *out = (inl >> 24); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 6)) << (14 - 6); - out++; - *out = (inl >> 6) % (1U << 14); - out++; - *out = (inl >> 20); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 2)) << (14 - 2); - out++; - *out = (inl >> 2) % (1U << 14); - out++; - *out = (inl >> 16) % (1U << 14); - out++; - *out = (inl >> 30); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 12)) << (14 - 12); - out++; - *out = (inl >> 12) % (1U << 14); - out++; - *out = (inl >> 26); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 8)) << (14 - 8); - out++; - *out = (inl >> 8) % (1U << 14); - out++; - *out = (inl >> 22); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 4)) << (14 - 4); - out++; - *out = (inl >> 4) % (1U << 14); - out++; - *out = (inl >> 18); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - out++; - *out = (inl >> 0) % (1U << 14); - out++; - *out = (inl >> 14) % (1U << 14); - out++; - *out = (inl >> 28); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 10)) << (14 - 10); - out++; - *out = (inl >> 10) % (1U << 14); - out++; - *out = (inl >> 24); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 6)) << (14 - 6); - out++; - *out = (inl >> 6) % (1U << 14); - out++; - *out = (inl >> 20); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 2)) << (14 - 2); - out++; - *out = (inl >> 2) % (1U << 14); - out++; - *out = (inl >> 16) % (1U << 14); - out++; - *out = (inl >> 30); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 12)) << (14 - 12); - out++; - *out = (inl >> 12) % (1U << 14); - out++; - *out = (inl >> 26); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 8)) << (14 - 8); - out++; - *out = (inl >> 8) % (1U << 14); - out++; - *out = (inl >> 22); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 4)) << (14 - 4); - out++; - *out = (inl >> 4) % (1U << 14); - out++; - *out = (inl >> 18); - ++in; - out++; - - return in; -} - -inline const uint32_t* unpack15_32(const uint32_t* in, uint32_t* out) { - uint32_t inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out = (inl >> 0) % (1U << 15); - out++; - *out = (inl >> 15) % (1U << 15); - out++; - *out = (inl >> 30); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 13)) << (15 - 13); - out++; - *out = (inl >> 13) % (1U << 15); - out++; - *out = (inl >> 28); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 11)) << (15 - 11); - out++; - *out = (inl >> 11) % (1U << 15); - out++; - *out = (inl >> 26); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 9)) << (15 - 9); - out++; - *out = (inl >> 9) % (1U << 15); - out++; - *out = (inl >> 24); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 7)) << (15 - 7); - out++; - *out = (inl >> 7) % (1U << 15); - out++; - *out = (inl >> 22); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 5)) << (15 - 5); - out++; - *out = (inl >> 5) % (1U << 15); - out++; - *out = (inl >> 20); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 3)) << (15 - 3); - out++; - *out = (inl >> 3) % (1U << 15); - out++; - *out = (inl >> 18); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 1)) << (15 - 1); - out++; - *out = (inl >> 1) % (1U << 15); - out++; - *out = (inl >> 16) % (1U << 15); - out++; - *out = (inl >> 31); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 14)) << (15 - 14); - out++; - *out = (inl >> 14) % (1U << 15); - out++; - *out = (inl >> 29); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 12)) << (15 - 12); - out++; - *out = (inl >> 12) % (1U << 15); - out++; - *out = (inl >> 27); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 10)) << (15 - 10); - out++; - *out = (inl >> 10) % (1U << 15); - out++; - *out = (inl >> 25); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 8)) << (15 - 8); - out++; - *out = (inl >> 8) % (1U << 15); - out++; - *out = (inl >> 23); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 6)) << (15 - 6); - out++; - *out = (inl >> 6) % (1U << 15); - out++; - *out = (inl >> 21); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 4)) << (15 - 4); - out++; - *out = (inl >> 4) % (1U << 15); - out++; - *out = (inl >> 19); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 2)) << (15 - 2); - out++; - *out = (inl >> 2) % (1U << 15); - out++; - *out = (inl >> 17); - ++in; - out++; - - return in; -} - -inline const uint32_t* unpack16_32(const uint32_t* in, uint32_t* out) { - uint32_t inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out = (inl >> 0) % (1U << 16); - out++; - *out = (inl >> 16); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - out++; - *out = (inl >> 0) % (1U << 16); - out++; - *out = (inl >> 16); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - out++; - *out = (inl >> 0) % (1U << 16); - out++; - *out = (inl >> 16); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - out++; - *out = (inl >> 0) % (1U << 16); - out++; - *out = (inl >> 16); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - out++; - *out = (inl >> 0) % (1U << 16); - out++; - *out = (inl >> 16); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - out++; - *out = (inl >> 0) % (1U << 16); - out++; - *out = (inl >> 16); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - out++; - *out = (inl >> 0) % (1U << 16); - out++; - *out = (inl >> 16); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - out++; - *out = (inl >> 0) % (1U << 16); - out++; - *out = (inl >> 16); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - out++; - *out = (inl >> 0) % (1U << 16); - out++; - *out = (inl >> 16); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - out++; - *out = (inl >> 0) % (1U << 16); - out++; - *out = (inl >> 16); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - out++; - *out = (inl >> 0) % (1U << 16); - out++; - *out = (inl >> 16); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - out++; - *out = (inl >> 0) % (1U << 16); - out++; - *out = (inl >> 16); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - out++; - *out = (inl >> 0) % (1U << 16); - out++; - *out = (inl >> 16); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - out++; - *out = (inl >> 0) % (1U << 16); - out++; - *out = (inl >> 16); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - out++; - *out = (inl >> 0) % (1U << 16); - out++; - *out = (inl >> 16); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - out++; - *out = (inl >> 0) % (1U << 16); - out++; - *out = (inl >> 16); - ++in; - out++; - - return in; -} - -inline const uint32_t* unpack17_32(const uint32_t* in, uint32_t* out) { - uint32_t inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out = (inl >> 0) % (1U << 17); - out++; - *out = (inl >> 17); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 2)) << (17 - 2); - out++; - *out = (inl >> 2) % (1U << 17); - out++; - *out = (inl >> 19); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 4)) << (17 - 4); - out++; - *out = (inl >> 4) % (1U << 17); - out++; - *out = (inl >> 21); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 6)) << (17 - 6); - out++; - *out = (inl >> 6) % (1U << 17); - out++; - *out = (inl >> 23); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 8)) << (17 - 8); - out++; - *out = (inl >> 8) % (1U << 17); - out++; - *out = (inl >> 25); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 10)) << (17 - 10); - out++; - *out = (inl >> 10) % (1U << 17); - out++; - *out = (inl >> 27); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 12)) << (17 - 12); - out++; - *out = (inl >> 12) % (1U << 17); - out++; - *out = (inl >> 29); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 14)) << (17 - 14); - out++; - *out = (inl >> 14) % (1U << 17); - out++; - *out = (inl >> 31); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 16)) << (17 - 16); - out++; - *out = (inl >> 16); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 1)) << (17 - 1); - out++; - *out = (inl >> 1) % (1U << 17); - out++; - *out = (inl >> 18); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 3)) << (17 - 3); - out++; - *out = (inl >> 3) % (1U << 17); - out++; - *out = (inl >> 20); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 5)) << (17 - 5); - out++; - *out = (inl >> 5) % (1U << 17); - out++; - *out = (inl >> 22); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 7)) << (17 - 7); - out++; - *out = (inl >> 7) % (1U << 17); - out++; - *out = (inl >> 24); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 9)) << (17 - 9); - out++; - *out = (inl >> 9) % (1U << 17); - out++; - *out = (inl >> 26); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 11)) << (17 - 11); - out++; - *out = (inl >> 11) % (1U << 17); - out++; - *out = (inl >> 28); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 13)) << (17 - 13); - out++; - *out = (inl >> 13) % (1U << 17); - out++; - *out = (inl >> 30); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 15)) << (17 - 15); - out++; - *out = (inl >> 15); - ++in; - out++; - - return in; -} - -inline const uint32_t* unpack18_32(const uint32_t* in, uint32_t* out) { - uint32_t inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out = (inl >> 0) % (1U << 18); - out++; - *out = (inl >> 18); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 4)) << (18 - 4); - out++; - *out = (inl >> 4) % (1U << 18); - out++; - *out = (inl >> 22); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 8)) << (18 - 8); - out++; - *out = (inl >> 8) % (1U << 18); - out++; - *out = (inl >> 26); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 12)) << (18 - 12); - out++; - *out = (inl >> 12) % (1U << 18); - out++; - *out = (inl >> 30); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 16)) << (18 - 16); - out++; - *out = (inl >> 16); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 2)) << (18 - 2); - out++; - *out = (inl >> 2) % (1U << 18); - out++; - *out = (inl >> 20); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 6)) << (18 - 6); - out++; - *out = (inl >> 6) % (1U << 18); - out++; - *out = (inl >> 24); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 10)) << (18 - 10); - out++; - *out = (inl >> 10) % (1U << 18); - out++; - *out = (inl >> 28); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 14)) << (18 - 14); - out++; - *out = (inl >> 14); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - out++; - *out = (inl >> 0) % (1U << 18); - out++; - *out = (inl >> 18); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 4)) << (18 - 4); - out++; - *out = (inl >> 4) % (1U << 18); - out++; - *out = (inl >> 22); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 8)) << (18 - 8); - out++; - *out = (inl >> 8) % (1U << 18); - out++; - *out = (inl >> 26); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 12)) << (18 - 12); - out++; - *out = (inl >> 12) % (1U << 18); - out++; - *out = (inl >> 30); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 16)) << (18 - 16); - out++; - *out = (inl >> 16); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 2)) << (18 - 2); - out++; - *out = (inl >> 2) % (1U << 18); - out++; - *out = (inl >> 20); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 6)) << (18 - 6); - out++; - *out = (inl >> 6) % (1U << 18); - out++; - *out = (inl >> 24); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 10)) << (18 - 10); - out++; - *out = (inl >> 10) % (1U << 18); - out++; - *out = (inl >> 28); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 14)) << (18 - 14); - out++; - *out = (inl >> 14); - ++in; - out++; - - return in; -} - -inline const uint32_t* unpack19_32(const uint32_t* in, uint32_t* out) { - uint32_t inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out = (inl >> 0) % (1U << 19); - out++; - *out = (inl >> 19); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 6)) << (19 - 6); - out++; - *out = (inl >> 6) % (1U << 19); - out++; - *out = (inl >> 25); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 12)) << (19 - 12); - out++; - *out = (inl >> 12) % (1U << 19); - out++; - *out = (inl >> 31); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 18)) << (19 - 18); - out++; - *out = (inl >> 18); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 5)) << (19 - 5); - out++; - *out = (inl >> 5) % (1U << 19); - out++; - *out = (inl >> 24); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 11)) << (19 - 11); - out++; - *out = (inl >> 11) % (1U << 19); - out++; - *out = (inl >> 30); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 17)) << (19 - 17); - out++; - *out = (inl >> 17); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 4)) << (19 - 4); - out++; - *out = (inl >> 4) % (1U << 19); - out++; - *out = (inl >> 23); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 10)) << (19 - 10); - out++; - *out = (inl >> 10) % (1U << 19); - out++; - *out = (inl >> 29); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 16)) << (19 - 16); - out++; - *out = (inl >> 16); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 3)) << (19 - 3); - out++; - *out = (inl >> 3) % (1U << 19); - out++; - *out = (inl >> 22); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 9)) << (19 - 9); - out++; - *out = (inl >> 9) % (1U << 19); - out++; - *out = (inl >> 28); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 15)) << (19 - 15); - out++; - *out = (inl >> 15); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 2)) << (19 - 2); - out++; - *out = (inl >> 2) % (1U << 19); - out++; - *out = (inl >> 21); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 8)) << (19 - 8); - out++; - *out = (inl >> 8) % (1U << 19); - out++; - *out = (inl >> 27); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 14)) << (19 - 14); - out++; - *out = (inl >> 14); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 1)) << (19 - 1); - out++; - *out = (inl >> 1) % (1U << 19); - out++; - *out = (inl >> 20); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 7)) << (19 - 7); - out++; - *out = (inl >> 7) % (1U << 19); - out++; - *out = (inl >> 26); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 13)) << (19 - 13); - out++; - *out = (inl >> 13); - ++in; - out++; - - return in; -} - -inline const uint32_t* unpack20_32(const uint32_t* in, uint32_t* out) { - uint32_t inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out = (inl >> 0) % (1U << 20); - out++; - *out = (inl >> 20); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 8)) << (20 - 8); - out++; - *out = (inl >> 8) % (1U << 20); - out++; - *out = (inl >> 28); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 16)) << (20 - 16); - out++; - *out = (inl >> 16); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 4)) << (20 - 4); - out++; - *out = (inl >> 4) % (1U << 20); - out++; - *out = (inl >> 24); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 12)) << (20 - 12); - out++; - *out = (inl >> 12); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - out++; - *out = (inl >> 0) % (1U << 20); - out++; - *out = (inl >> 20); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 8)) << (20 - 8); - out++; - *out = (inl >> 8) % (1U << 20); - out++; - *out = (inl >> 28); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 16)) << (20 - 16); - out++; - *out = (inl >> 16); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 4)) << (20 - 4); - out++; - *out = (inl >> 4) % (1U << 20); - out++; - *out = (inl >> 24); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 12)) << (20 - 12); - out++; - *out = (inl >> 12); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - out++; - *out = (inl >> 0) % (1U << 20); - out++; - *out = (inl >> 20); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 8)) << (20 - 8); - out++; - *out = (inl >> 8) % (1U << 20); - out++; - *out = (inl >> 28); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 16)) << (20 - 16); - out++; - *out = (inl >> 16); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 4)) << (20 - 4); - out++; - *out = (inl >> 4) % (1U << 20); - out++; - *out = (inl >> 24); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 12)) << (20 - 12); - out++; - *out = (inl >> 12); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - out++; - *out = (inl >> 0) % (1U << 20); - out++; - *out = (inl >> 20); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 8)) << (20 - 8); - out++; - *out = (inl >> 8) % (1U << 20); - out++; - *out = (inl >> 28); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 16)) << (20 - 16); - out++; - *out = (inl >> 16); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 4)) << (20 - 4); - out++; - *out = (inl >> 4) % (1U << 20); - out++; - *out = (inl >> 24); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 12)) << (20 - 12); - out++; - *out = (inl >> 12); - ++in; - out++; - - return in; -} - -inline const uint32_t* unpack21_32(const uint32_t* in, uint32_t* out) { - uint32_t inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out = (inl >> 0) % (1U << 21); - out++; - *out = (inl >> 21); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 10)) << (21 - 10); - out++; - *out = (inl >> 10) % (1U << 21); - out++; - *out = (inl >> 31); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 20)) << (21 - 20); - out++; - *out = (inl >> 20); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 9)) << (21 - 9); - out++; - *out = (inl >> 9) % (1U << 21); - out++; - *out = (inl >> 30); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 19)) << (21 - 19); - out++; - *out = (inl >> 19); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 8)) << (21 - 8); - out++; - *out = (inl >> 8) % (1U << 21); - out++; - *out = (inl >> 29); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 18)) << (21 - 18); - out++; - *out = (inl >> 18); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 7)) << (21 - 7); - out++; - *out = (inl >> 7) % (1U << 21); - out++; - *out = (inl >> 28); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 17)) << (21 - 17); - out++; - *out = (inl >> 17); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 6)) << (21 - 6); - out++; - *out = (inl >> 6) % (1U << 21); - out++; - *out = (inl >> 27); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 16)) << (21 - 16); - out++; - *out = (inl >> 16); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 5)) << (21 - 5); - out++; - *out = (inl >> 5) % (1U << 21); - out++; - *out = (inl >> 26); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 15)) << (21 - 15); - out++; - *out = (inl >> 15); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 4)) << (21 - 4); - out++; - *out = (inl >> 4) % (1U << 21); - out++; - *out = (inl >> 25); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 14)) << (21 - 14); - out++; - *out = (inl >> 14); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 3)) << (21 - 3); - out++; - *out = (inl >> 3) % (1U << 21); - out++; - *out = (inl >> 24); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 13)) << (21 - 13); - out++; - *out = (inl >> 13); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 2)) << (21 - 2); - out++; - *out = (inl >> 2) % (1U << 21); - out++; - *out = (inl >> 23); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 12)) << (21 - 12); - out++; - *out = (inl >> 12); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 1)) << (21 - 1); - out++; - *out = (inl >> 1) % (1U << 21); - out++; - *out = (inl >> 22); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 11)) << (21 - 11); - out++; - *out = (inl >> 11); - ++in; - out++; - - return in; -} - -inline const uint32_t* unpack22_32(const uint32_t* in, uint32_t* out) { - uint32_t inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out = (inl >> 0) % (1U << 22); - out++; - *out = (inl >> 22); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 12)) << (22 - 12); - out++; - *out = (inl >> 12); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 2)) << (22 - 2); - out++; - *out = (inl >> 2) % (1U << 22); - out++; - *out = (inl >> 24); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 14)) << (22 - 14); - out++; - *out = (inl >> 14); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 4)) << (22 - 4); - out++; - *out = (inl >> 4) % (1U << 22); - out++; - *out = (inl >> 26); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 16)) << (22 - 16); - out++; - *out = (inl >> 16); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 6)) << (22 - 6); - out++; - *out = (inl >> 6) % (1U << 22); - out++; - *out = (inl >> 28); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 18)) << (22 - 18); - out++; - *out = (inl >> 18); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 8)) << (22 - 8); - out++; - *out = (inl >> 8) % (1U << 22); - out++; - *out = (inl >> 30); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 20)) << (22 - 20); - out++; - *out = (inl >> 20); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 10)) << (22 - 10); - out++; - *out = (inl >> 10); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - out++; - *out = (inl >> 0) % (1U << 22); - out++; - *out = (inl >> 22); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 12)) << (22 - 12); - out++; - *out = (inl >> 12); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 2)) << (22 - 2); - out++; - *out = (inl >> 2) % (1U << 22); - out++; - *out = (inl >> 24); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 14)) << (22 - 14); - out++; - *out = (inl >> 14); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 4)) << (22 - 4); - out++; - *out = (inl >> 4) % (1U << 22); - out++; - *out = (inl >> 26); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 16)) << (22 - 16); - out++; - *out = (inl >> 16); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 6)) << (22 - 6); - out++; - *out = (inl >> 6) % (1U << 22); - out++; - *out = (inl >> 28); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 18)) << (22 - 18); - out++; - *out = (inl >> 18); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 8)) << (22 - 8); - out++; - *out = (inl >> 8) % (1U << 22); - out++; - *out = (inl >> 30); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 20)) << (22 - 20); - out++; - *out = (inl >> 20); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 10)) << (22 - 10); - out++; - *out = (inl >> 10); - ++in; - out++; - - return in; -} - -inline const uint32_t* unpack23_32(const uint32_t* in, uint32_t* out) { - uint32_t inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out = (inl >> 0) % (1U << 23); - out++; - *out = (inl >> 23); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 14)) << (23 - 14); - out++; - *out = (inl >> 14); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 5)) << (23 - 5); - out++; - *out = (inl >> 5) % (1U << 23); - out++; - *out = (inl >> 28); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 19)) << (23 - 19); - out++; - *out = (inl >> 19); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 10)) << (23 - 10); - out++; - *out = (inl >> 10); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 1)) << (23 - 1); - out++; - *out = (inl >> 1) % (1U << 23); - out++; - *out = (inl >> 24); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 15)) << (23 - 15); - out++; - *out = (inl >> 15); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 6)) << (23 - 6); - out++; - *out = (inl >> 6) % (1U << 23); - out++; - *out = (inl >> 29); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 20)) << (23 - 20); - out++; - *out = (inl >> 20); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 11)) << (23 - 11); - out++; - *out = (inl >> 11); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 2)) << (23 - 2); - out++; - *out = (inl >> 2) % (1U << 23); - out++; - *out = (inl >> 25); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 16)) << (23 - 16); - out++; - *out = (inl >> 16); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 7)) << (23 - 7); - out++; - *out = (inl >> 7) % (1U << 23); - out++; - *out = (inl >> 30); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 21)) << (23 - 21); - out++; - *out = (inl >> 21); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 12)) << (23 - 12); - out++; - *out = (inl >> 12); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 3)) << (23 - 3); - out++; - *out = (inl >> 3) % (1U << 23); - out++; - *out = (inl >> 26); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 17)) << (23 - 17); - out++; - *out = (inl >> 17); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 8)) << (23 - 8); - out++; - *out = (inl >> 8) % (1U << 23); - out++; - *out = (inl >> 31); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 22)) << (23 - 22); - out++; - *out = (inl >> 22); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 13)) << (23 - 13); - out++; - *out = (inl >> 13); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 4)) << (23 - 4); - out++; - *out = (inl >> 4) % (1U << 23); - out++; - *out = (inl >> 27); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 18)) << (23 - 18); - out++; - *out = (inl >> 18); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 9)) << (23 - 9); - out++; - *out = (inl >> 9); - ++in; - out++; - - return in; -} - -inline const uint32_t* unpack24_32(const uint32_t* in, uint32_t* out) { - uint32_t inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out = (inl >> 0) % (1U << 24); - out++; - *out = (inl >> 24); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 16)) << (24 - 16); - out++; - *out = (inl >> 16); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 8)) << (24 - 8); - out++; - *out = (inl >> 8); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - out++; - *out = (inl >> 0) % (1U << 24); - out++; - *out = (inl >> 24); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 16)) << (24 - 16); - out++; - *out = (inl >> 16); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 8)) << (24 - 8); - out++; - *out = (inl >> 8); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - out++; - *out = (inl >> 0) % (1U << 24); - out++; - *out = (inl >> 24); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 16)) << (24 - 16); - out++; - *out = (inl >> 16); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 8)) << (24 - 8); - out++; - *out = (inl >> 8); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - out++; - *out = (inl >> 0) % (1U << 24); - out++; - *out = (inl >> 24); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 16)) << (24 - 16); - out++; - *out = (inl >> 16); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 8)) << (24 - 8); - out++; - *out = (inl >> 8); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - out++; - *out = (inl >> 0) % (1U << 24); - out++; - *out = (inl >> 24); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 16)) << (24 - 16); - out++; - *out = (inl >> 16); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 8)) << (24 - 8); - out++; - *out = (inl >> 8); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - out++; - *out = (inl >> 0) % (1U << 24); - out++; - *out = (inl >> 24); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 16)) << (24 - 16); - out++; - *out = (inl >> 16); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 8)) << (24 - 8); - out++; - *out = (inl >> 8); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - out++; - *out = (inl >> 0) % (1U << 24); - out++; - *out = (inl >> 24); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 16)) << (24 - 16); - out++; - *out = (inl >> 16); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 8)) << (24 - 8); - out++; - *out = (inl >> 8); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - out++; - *out = (inl >> 0) % (1U << 24); - out++; - *out = (inl >> 24); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 16)) << (24 - 16); - out++; - *out = (inl >> 16); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 8)) << (24 - 8); - out++; - *out = (inl >> 8); - ++in; - out++; - - return in; -} - -inline const uint32_t* unpack25_32(const uint32_t* in, uint32_t* out) { - uint32_t inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out = (inl >> 0) % (1U << 25); - out++; - *out = (inl >> 25); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 18)) << (25 - 18); - out++; - *out = (inl >> 18); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 11)) << (25 - 11); - out++; - *out = (inl >> 11); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 4)) << (25 - 4); - out++; - *out = (inl >> 4) % (1U << 25); - out++; - *out = (inl >> 29); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 22)) << (25 - 22); - out++; - *out = (inl >> 22); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 15)) << (25 - 15); - out++; - *out = (inl >> 15); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 8)) << (25 - 8); - out++; - *out = (inl >> 8); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 1)) << (25 - 1); - out++; - *out = (inl >> 1) % (1U << 25); - out++; - *out = (inl >> 26); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 19)) << (25 - 19); - out++; - *out = (inl >> 19); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 12)) << (25 - 12); - out++; - *out = (inl >> 12); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 5)) << (25 - 5); - out++; - *out = (inl >> 5) % (1U << 25); - out++; - *out = (inl >> 30); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 23)) << (25 - 23); - out++; - *out = (inl >> 23); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 16)) << (25 - 16); - out++; - *out = (inl >> 16); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 9)) << (25 - 9); - out++; - *out = (inl >> 9); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 2)) << (25 - 2); - out++; - *out = (inl >> 2) % (1U << 25); - out++; - *out = (inl >> 27); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 20)) << (25 - 20); - out++; - *out = (inl >> 20); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 13)) << (25 - 13); - out++; - *out = (inl >> 13); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 6)) << (25 - 6); - out++; - *out = (inl >> 6) % (1U << 25); - out++; - *out = (inl >> 31); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 24)) << (25 - 24); - out++; - *out = (inl >> 24); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 17)) << (25 - 17); - out++; - *out = (inl >> 17); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 10)) << (25 - 10); - out++; - *out = (inl >> 10); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 3)) << (25 - 3); - out++; - *out = (inl >> 3) % (1U << 25); - out++; - *out = (inl >> 28); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 21)) << (25 - 21); - out++; - *out = (inl >> 21); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 14)) << (25 - 14); - out++; - *out = (inl >> 14); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 7)) << (25 - 7); - out++; - *out = (inl >> 7); - ++in; - out++; - - return in; -} - -inline const uint32_t* unpack26_32(const uint32_t* in, uint32_t* out) { - uint32_t inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out = (inl >> 0) % (1U << 26); - out++; - *out = (inl >> 26); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 20)) << (26 - 20); - out++; - *out = (inl >> 20); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 14)) << (26 - 14); - out++; - *out = (inl >> 14); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 8)) << (26 - 8); - out++; - *out = (inl >> 8); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 2)) << (26 - 2); - out++; - *out = (inl >> 2) % (1U << 26); - out++; - *out = (inl >> 28); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 22)) << (26 - 22); - out++; - *out = (inl >> 22); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 16)) << (26 - 16); - out++; - *out = (inl >> 16); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 10)) << (26 - 10); - out++; - *out = (inl >> 10); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 4)) << (26 - 4); - out++; - *out = (inl >> 4) % (1U << 26); - out++; - *out = (inl >> 30); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 24)) << (26 - 24); - out++; - *out = (inl >> 24); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 18)) << (26 - 18); - out++; - *out = (inl >> 18); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 12)) << (26 - 12); - out++; - *out = (inl >> 12); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 6)) << (26 - 6); - out++; - *out = (inl >> 6); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - out++; - *out = (inl >> 0) % (1U << 26); - out++; - *out = (inl >> 26); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 20)) << (26 - 20); - out++; - *out = (inl >> 20); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 14)) << (26 - 14); - out++; - *out = (inl >> 14); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 8)) << (26 - 8); - out++; - *out = (inl >> 8); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 2)) << (26 - 2); - out++; - *out = (inl >> 2) % (1U << 26); - out++; - *out = (inl >> 28); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 22)) << (26 - 22); - out++; - *out = (inl >> 22); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 16)) << (26 - 16); - out++; - *out = (inl >> 16); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 10)) << (26 - 10); - out++; - *out = (inl >> 10); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 4)) << (26 - 4); - out++; - *out = (inl >> 4) % (1U << 26); - out++; - *out = (inl >> 30); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 24)) << (26 - 24); - out++; - *out = (inl >> 24); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 18)) << (26 - 18); - out++; - *out = (inl >> 18); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 12)) << (26 - 12); - out++; - *out = (inl >> 12); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 6)) << (26 - 6); - out++; - *out = (inl >> 6); - ++in; - out++; - - return in; -} - -inline const uint32_t* unpack27_32(const uint32_t* in, uint32_t* out) { - uint32_t inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out = (inl >> 0) % (1U << 27); - out++; - *out = (inl >> 27); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 22)) << (27 - 22); - out++; - *out = (inl >> 22); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 17)) << (27 - 17); - out++; - *out = (inl >> 17); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 12)) << (27 - 12); - out++; - *out = (inl >> 12); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 7)) << (27 - 7); - out++; - *out = (inl >> 7); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 2)) << (27 - 2); - out++; - *out = (inl >> 2) % (1U << 27); - out++; - *out = (inl >> 29); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 24)) << (27 - 24); - out++; - *out = (inl >> 24); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 19)) << (27 - 19); - out++; - *out = (inl >> 19); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 14)) << (27 - 14); - out++; - *out = (inl >> 14); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 9)) << (27 - 9); - out++; - *out = (inl >> 9); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 4)) << (27 - 4); - out++; - *out = (inl >> 4) % (1U << 27); - out++; - *out = (inl >> 31); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 26)) << (27 - 26); - out++; - *out = (inl >> 26); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 21)) << (27 - 21); - out++; - *out = (inl >> 21); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 16)) << (27 - 16); - out++; - *out = (inl >> 16); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 11)) << (27 - 11); - out++; - *out = (inl >> 11); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 6)) << (27 - 6); - out++; - *out = (inl >> 6); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 1)) << (27 - 1); - out++; - *out = (inl >> 1) % (1U << 27); - out++; - *out = (inl >> 28); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 23)) << (27 - 23); - out++; - *out = (inl >> 23); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 18)) << (27 - 18); - out++; - *out = (inl >> 18); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 13)) << (27 - 13); - out++; - *out = (inl >> 13); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 8)) << (27 - 8); - out++; - *out = (inl >> 8); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 3)) << (27 - 3); - out++; - *out = (inl >> 3) % (1U << 27); - out++; - *out = (inl >> 30); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 25)) << (27 - 25); - out++; - *out = (inl >> 25); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 20)) << (27 - 20); - out++; - *out = (inl >> 20); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 15)) << (27 - 15); - out++; - *out = (inl >> 15); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 10)) << (27 - 10); - out++; - *out = (inl >> 10); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 5)) << (27 - 5); - out++; - *out = (inl >> 5); - ++in; - out++; - - return in; -} - -inline const uint32_t* unpack28_32(const uint32_t* in, uint32_t* out) { - uint32_t inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out = (inl >> 0) % (1U << 28); - out++; - *out = (inl >> 28); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 24)) << (28 - 24); - out++; - *out = (inl >> 24); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 20)) << (28 - 20); - out++; - *out = (inl >> 20); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 16)) << (28 - 16); - out++; - *out = (inl >> 16); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 12)) << (28 - 12); - out++; - *out = (inl >> 12); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 8)) << (28 - 8); - out++; - *out = (inl >> 8); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 4)) << (28 - 4); - out++; - *out = (inl >> 4); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - out++; - *out = (inl >> 0) % (1U << 28); - out++; - *out = (inl >> 28); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 24)) << (28 - 24); - out++; - *out = (inl >> 24); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 20)) << (28 - 20); - out++; - *out = (inl >> 20); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 16)) << (28 - 16); - out++; - *out = (inl >> 16); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 12)) << (28 - 12); - out++; - *out = (inl >> 12); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 8)) << (28 - 8); - out++; - *out = (inl >> 8); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 4)) << (28 - 4); - out++; - *out = (inl >> 4); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - out++; - *out = (inl >> 0) % (1U << 28); - out++; - *out = (inl >> 28); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 24)) << (28 - 24); - out++; - *out = (inl >> 24); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 20)) << (28 - 20); - out++; - *out = (inl >> 20); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 16)) << (28 - 16); - out++; - *out = (inl >> 16); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 12)) << (28 - 12); - out++; - *out = (inl >> 12); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 8)) << (28 - 8); - out++; - *out = (inl >> 8); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 4)) << (28 - 4); - out++; - *out = (inl >> 4); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - out++; - *out = (inl >> 0) % (1U << 28); - out++; - *out = (inl >> 28); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 24)) << (28 - 24); - out++; - *out = (inl >> 24); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 20)) << (28 - 20); - out++; - *out = (inl >> 20); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 16)) << (28 - 16); - out++; - *out = (inl >> 16); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 12)) << (28 - 12); - out++; - *out = (inl >> 12); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 8)) << (28 - 8); - out++; - *out = (inl >> 8); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 4)) << (28 - 4); - out++; - *out = (inl >> 4); - ++in; - out++; - - return in; -} - -inline const uint32_t* unpack29_32(const uint32_t* in, uint32_t* out) { - uint32_t inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out = (inl >> 0) % (1U << 29); - out++; - *out = (inl >> 29); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 26)) << (29 - 26); - out++; - *out = (inl >> 26); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 23)) << (29 - 23); - out++; - *out = (inl >> 23); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 20)) << (29 - 20); - out++; - *out = (inl >> 20); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 17)) << (29 - 17); - out++; - *out = (inl >> 17); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 14)) << (29 - 14); - out++; - *out = (inl >> 14); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 11)) << (29 - 11); - out++; - *out = (inl >> 11); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 8)) << (29 - 8); - out++; - *out = (inl >> 8); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 5)) << (29 - 5); - out++; - *out = (inl >> 5); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 2)) << (29 - 2); - out++; - *out = (inl >> 2) % (1U << 29); - out++; - *out = (inl >> 31); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 28)) << (29 - 28); - out++; - *out = (inl >> 28); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 25)) << (29 - 25); - out++; - *out = (inl >> 25); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 22)) << (29 - 22); - out++; - *out = (inl >> 22); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 19)) << (29 - 19); - out++; - *out = (inl >> 19); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 16)) << (29 - 16); - out++; - *out = (inl >> 16); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 13)) << (29 - 13); - out++; - *out = (inl >> 13); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 10)) << (29 - 10); - out++; - *out = (inl >> 10); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 7)) << (29 - 7); - out++; - *out = (inl >> 7); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 4)) << (29 - 4); - out++; - *out = (inl >> 4); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 1)) << (29 - 1); - out++; - *out = (inl >> 1) % (1U << 29); - out++; - *out = (inl >> 30); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 27)) << (29 - 27); - out++; - *out = (inl >> 27); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 24)) << (29 - 24); - out++; - *out = (inl >> 24); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 21)) << (29 - 21); - out++; - *out = (inl >> 21); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 18)) << (29 - 18); - out++; - *out = (inl >> 18); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 15)) << (29 - 15); - out++; - *out = (inl >> 15); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 12)) << (29 - 12); - out++; - *out = (inl >> 12); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 9)) << (29 - 9); - out++; - *out = (inl >> 9); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 6)) << (29 - 6); - out++; - *out = (inl >> 6); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 3)) << (29 - 3); - out++; - *out = (inl >> 3); - ++in; - out++; - - return in; -} - -inline const uint32_t* unpack30_32(const uint32_t* in, uint32_t* out) { - uint32_t inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out = (inl >> 0) % (1U << 30); - out++; - *out = (inl >> 30); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 28)) << (30 - 28); - out++; - *out = (inl >> 28); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 26)) << (30 - 26); - out++; - *out = (inl >> 26); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 24)) << (30 - 24); - out++; - *out = (inl >> 24); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 22)) << (30 - 22); - out++; - *out = (inl >> 22); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 20)) << (30 - 20); - out++; - *out = (inl >> 20); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 18)) << (30 - 18); - out++; - *out = (inl >> 18); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 16)) << (30 - 16); - out++; - *out = (inl >> 16); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 14)) << (30 - 14); - out++; - *out = (inl >> 14); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 12)) << (30 - 12); - out++; - *out = (inl >> 12); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 10)) << (30 - 10); - out++; - *out = (inl >> 10); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 8)) << (30 - 8); - out++; - *out = (inl >> 8); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 6)) << (30 - 6); - out++; - *out = (inl >> 6); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 4)) << (30 - 4); - out++; - *out = (inl >> 4); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 2)) << (30 - 2); - out++; - *out = (inl >> 2); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - out++; - *out = (inl >> 0) % (1U << 30); - out++; - *out = (inl >> 30); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 28)) << (30 - 28); - out++; - *out = (inl >> 28); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 26)) << (30 - 26); - out++; - *out = (inl >> 26); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 24)) << (30 - 24); - out++; - *out = (inl >> 24); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 22)) << (30 - 22); - out++; - *out = (inl >> 22); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 20)) << (30 - 20); - out++; - *out = (inl >> 20); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 18)) << (30 - 18); - out++; - *out = (inl >> 18); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 16)) << (30 - 16); - out++; - *out = (inl >> 16); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 14)) << (30 - 14); - out++; - *out = (inl >> 14); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 12)) << (30 - 12); - out++; - *out = (inl >> 12); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 10)) << (30 - 10); - out++; - *out = (inl >> 10); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 8)) << (30 - 8); - out++; - *out = (inl >> 8); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 6)) << (30 - 6); - out++; - *out = (inl >> 6); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 4)) << (30 - 4); - out++; - *out = (inl >> 4); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 2)) << (30 - 2); - out++; - *out = (inl >> 2); - ++in; - out++; - - return in; -} - -inline const uint32_t* unpack31_32(const uint32_t* in, uint32_t* out) { - uint32_t inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out = (inl >> 0) % (1U << 31); - out++; - *out = (inl >> 31); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 30)) << (31 - 30); - out++; - *out = (inl >> 30); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 29)) << (31 - 29); - out++; - *out = (inl >> 29); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 28)) << (31 - 28); - out++; - *out = (inl >> 28); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 27)) << (31 - 27); - out++; - *out = (inl >> 27); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 26)) << (31 - 26); - out++; - *out = (inl >> 26); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 25)) << (31 - 25); - out++; - *out = (inl >> 25); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 24)) << (31 - 24); - out++; - *out = (inl >> 24); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 23)) << (31 - 23); - out++; - *out = (inl >> 23); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 22)) << (31 - 22); - out++; - *out = (inl >> 22); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 21)) << (31 - 21); - out++; - *out = (inl >> 21); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 20)) << (31 - 20); - out++; - *out = (inl >> 20); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 19)) << (31 - 19); - out++; - *out = (inl >> 19); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 18)) << (31 - 18); - out++; - *out = (inl >> 18); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 17)) << (31 - 17); - out++; - *out = (inl >> 17); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 16)) << (31 - 16); - out++; - *out = (inl >> 16); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 15)) << (31 - 15); - out++; - *out = (inl >> 15); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 14)) << (31 - 14); - out++; - *out = (inl >> 14); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 13)) << (31 - 13); - out++; - *out = (inl >> 13); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 12)) << (31 - 12); - out++; - *out = (inl >> 12); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 11)) << (31 - 11); - out++; - *out = (inl >> 11); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 10)) << (31 - 10); - out++; - *out = (inl >> 10); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 9)) << (31 - 9); - out++; - *out = (inl >> 9); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 8)) << (31 - 8); - out++; - *out = (inl >> 8); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 7)) << (31 - 7); - out++; - *out = (inl >> 7); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 6)) << (31 - 6); - out++; - *out = (inl >> 6); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 5)) << (31 - 5); - out++; - *out = (inl >> 5); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 4)) << (31 - 4); - out++; - *out = (inl >> 4); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 3)) << (31 - 3); - out++; - *out = (inl >> 3); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 2)) << (31 - 2); - out++; - *out = (inl >> 2); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out |= (inl % (1U << 1)) << (31 - 1); - out++; - *out = (inl >> 1); - ++in; - out++; - - return in; -} - -inline const uint32_t* unpack32_32(const uint32_t* in, uint32_t* out) { - uint32_t inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - *out = (inl >> 0); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - out++; - *out = (inl >> 0); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - out++; - *out = (inl >> 0); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - out++; - *out = (inl >> 0); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - out++; - *out = (inl >> 0); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - out++; - *out = (inl >> 0); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - out++; - *out = (inl >> 0); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - out++; - *out = (inl >> 0); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - out++; - *out = (inl >> 0); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - out++; - *out = (inl >> 0); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - out++; - *out = (inl >> 0); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - out++; - *out = (inl >> 0); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - out++; - *out = (inl >> 0); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - out++; - *out = (inl >> 0); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - out++; - *out = (inl >> 0); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - out++; - *out = (inl >> 0); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - out++; - *out = (inl >> 0); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - out++; - *out = (inl >> 0); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - out++; - *out = (inl >> 0); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - out++; - *out = (inl >> 0); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - out++; - *out = (inl >> 0); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - out++; - *out = (inl >> 0); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - out++; - *out = (inl >> 0); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - out++; - *out = (inl >> 0); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - out++; - *out = (inl >> 0); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - out++; - *out = (inl >> 0); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - out++; - *out = (inl >> 0); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - out++; - *out = (inl >> 0); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - out++; - *out = (inl >> 0); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - out++; - *out = (inl >> 0); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - out++; - *out = (inl >> 0); - ++in; - inl = util::SafeLoad(in); - inl = arrow::bit_util::FromLittleEndian(inl); - out++; - *out = (inl >> 0); - ++in; - out++; - - return in; -} - -inline const uint32_t* nullunpacker32(const uint32_t* in, uint32_t* out) { - for (int k = 0; k < 32; ++k) { - out[k] = 0; - } - return in; -} - -} // namespace internal -} // namespace arrow diff --git a/cpp/src/arrow/util/bpacking_scalar_generated_internal.h b/cpp/src/arrow/util/bpacking_scalar_generated_internal.h new file mode 100644 index 00000000000..74c25885402 --- /dev/null +++ b/cpp/src/arrow/util/bpacking_scalar_generated_internal.h @@ -0,0 +1,6808 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +// This file was generated by script which is modified from its original version in GitHub. +// Original source: +// https://github.com/lemire/FrameOfReference/blob/master/scripts/turbopacking64.py +// The original copyright notice follows. + +// This code is released under the +// Apache License Version 2.0 http://www.apache.org/licenses/. +// (c) Daniel Lemire 2013 + +// WARNING: this file is generated, DO NOT EDIT. +// Usage: +// python cpp/src/arrow/util/bpacking_scalar_codegen.py + +#pragma once + +#include +#include + +#include "arrow/util/endian.h" +#include "arrow/util/ubsan.h" + +namespace arrow::internal { + +inline const uint8_t* unpack0_32(const uint8_t* in, uint32_t* out){ + std::memset(out, 0, 32 * 4); + return in; +} + +inline const uint8_t* unpack1_32(const uint8_t* in, uint32_t* out){ + constexpr uint32_t mask = ((uint32_t{1} << 1) - uint32_t{1}); + + const auto w0 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + out[0] = (w0) & mask; + out[1] = (w0 >> 1) & mask; + out[2] = (w0 >> 2) & mask; + out[3] = (w0 >> 3) & mask; + out[4] = (w0 >> 4) & mask; + out[5] = (w0 >> 5) & mask; + out[6] = (w0 >> 6) & mask; + out[7] = (w0 >> 7) & mask; + out[8] = (w0 >> 8) & mask; + out[9] = (w0 >> 9) & mask; + out[10] = (w0 >> 10) & mask; + out[11] = (w0 >> 11) & mask; + out[12] = (w0 >> 12) & mask; + out[13] = (w0 >> 13) & mask; + out[14] = (w0 >> 14) & mask; + out[15] = (w0 >> 15) & mask; + out[16] = (w0 >> 16) & mask; + out[17] = (w0 >> 17) & mask; + out[18] = (w0 >> 18) & mask; + out[19] = (w0 >> 19) & mask; + out[20] = (w0 >> 20) & mask; + out[21] = (w0 >> 21) & mask; + out[22] = (w0 >> 22) & mask; + out[23] = (w0 >> 23) & mask; + out[24] = (w0 >> 24) & mask; + out[25] = (w0 >> 25) & mask; + out[26] = (w0 >> 26) & mask; + out[27] = (w0 >> 27) & mask; + out[28] = (w0 >> 28) & mask; + out[29] = (w0 >> 29) & mask; + out[30] = (w0 >> 30) & mask; + out[31] = w0 >> 31; + + return in; +} + +inline const uint8_t* unpack2_32(const uint8_t* in, uint32_t* out){ + constexpr uint32_t mask = ((uint32_t{1} << 2) - uint32_t{1}); + + const auto w0 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w1 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + out[0] = (w0) & mask; + out[1] = (w0 >> 2) & mask; + out[2] = (w0 >> 4) & mask; + out[3] = (w0 >> 6) & mask; + out[4] = (w0 >> 8) & mask; + out[5] = (w0 >> 10) & mask; + out[6] = (w0 >> 12) & mask; + out[7] = (w0 >> 14) & mask; + out[8] = (w0 >> 16) & mask; + out[9] = (w0 >> 18) & mask; + out[10] = (w0 >> 20) & mask; + out[11] = (w0 >> 22) & mask; + out[12] = (w0 >> 24) & mask; + out[13] = (w0 >> 26) & mask; + out[14] = (w0 >> 28) & mask; + out[15] = w0 >> 30; + out[16] = (w1) & mask; + out[17] = (w1 >> 2) & mask; + out[18] = (w1 >> 4) & mask; + out[19] = (w1 >> 6) & mask; + out[20] = (w1 >> 8) & mask; + out[21] = (w1 >> 10) & mask; + out[22] = (w1 >> 12) & mask; + out[23] = (w1 >> 14) & mask; + out[24] = (w1 >> 16) & mask; + out[25] = (w1 >> 18) & mask; + out[26] = (w1 >> 20) & mask; + out[27] = (w1 >> 22) & mask; + out[28] = (w1 >> 24) & mask; + out[29] = (w1 >> 26) & mask; + out[30] = (w1 >> 28) & mask; + out[31] = w1 >> 30; + + return in; +} + +inline const uint8_t* unpack3_32(const uint8_t* in, uint32_t* out){ + constexpr uint32_t mask = ((uint32_t{1} << 3) - uint32_t{1}); + + const auto w0 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w1 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w2 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + out[0] = (w0) & mask; + out[1] = (w0 >> 3) & mask; + out[2] = (w0 >> 6) & mask; + out[3] = (w0 >> 9) & mask; + out[4] = (w0 >> 12) & mask; + out[5] = (w0 >> 15) & mask; + out[6] = (w0 >> 18) & mask; + out[7] = (w0 >> 21) & mask; + out[8] = (w0 >> 24) & mask; + out[9] = (w0 >> 27) & mask; + out[10] = ((w0 >> 30) | (w1 << 2)) & mask; + out[11] = (w1 >> 1) & mask; + out[12] = (w1 >> 4) & mask; + out[13] = (w1 >> 7) & mask; + out[14] = (w1 >> 10) & mask; + out[15] = (w1 >> 13) & mask; + out[16] = (w1 >> 16) & mask; + out[17] = (w1 >> 19) & mask; + out[18] = (w1 >> 22) & mask; + out[19] = (w1 >> 25) & mask; + out[20] = (w1 >> 28) & mask; + out[21] = ((w1 >> 31) | (w2 << 1)) & mask; + out[22] = (w2 >> 2) & mask; + out[23] = (w2 >> 5) & mask; + out[24] = (w2 >> 8) & mask; + out[25] = (w2 >> 11) & mask; + out[26] = (w2 >> 14) & mask; + out[27] = (w2 >> 17) & mask; + out[28] = (w2 >> 20) & mask; + out[29] = (w2 >> 23) & mask; + out[30] = (w2 >> 26) & mask; + out[31] = w2 >> 29; + + return in; +} + +inline const uint8_t* unpack4_32(const uint8_t* in, uint32_t* out){ + constexpr uint32_t mask = ((uint32_t{1} << 4) - uint32_t{1}); + + const auto w0 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w1 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w2 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w3 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + out[0] = (w0) & mask; + out[1] = (w0 >> 4) & mask; + out[2] = (w0 >> 8) & mask; + out[3] = (w0 >> 12) & mask; + out[4] = (w0 >> 16) & mask; + out[5] = (w0 >> 20) & mask; + out[6] = (w0 >> 24) & mask; + out[7] = w0 >> 28; + out[8] = (w1) & mask; + out[9] = (w1 >> 4) & mask; + out[10] = (w1 >> 8) & mask; + out[11] = (w1 >> 12) & mask; + out[12] = (w1 >> 16) & mask; + out[13] = (w1 >> 20) & mask; + out[14] = (w1 >> 24) & mask; + out[15] = w1 >> 28; + out[16] = (w2) & mask; + out[17] = (w2 >> 4) & mask; + out[18] = (w2 >> 8) & mask; + out[19] = (w2 >> 12) & mask; + out[20] = (w2 >> 16) & mask; + out[21] = (w2 >> 20) & mask; + out[22] = (w2 >> 24) & mask; + out[23] = w2 >> 28; + out[24] = (w3) & mask; + out[25] = (w3 >> 4) & mask; + out[26] = (w3 >> 8) & mask; + out[27] = (w3 >> 12) & mask; + out[28] = (w3 >> 16) & mask; + out[29] = (w3 >> 20) & mask; + out[30] = (w3 >> 24) & mask; + out[31] = w3 >> 28; + + return in; +} + +inline const uint8_t* unpack5_32(const uint8_t* in, uint32_t* out){ + constexpr uint32_t mask = ((uint32_t{1} << 5) - uint32_t{1}); + + const auto w0 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w1 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w2 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w3 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w4 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + out[0] = (w0) & mask; + out[1] = (w0 >> 5) & mask; + out[2] = (w0 >> 10) & mask; + out[3] = (w0 >> 15) & mask; + out[4] = (w0 >> 20) & mask; + out[5] = (w0 >> 25) & mask; + out[6] = ((w0 >> 30) | (w1 << 2)) & mask; + out[7] = (w1 >> 3) & mask; + out[8] = (w1 >> 8) & mask; + out[9] = (w1 >> 13) & mask; + out[10] = (w1 >> 18) & mask; + out[11] = (w1 >> 23) & mask; + out[12] = ((w1 >> 28) | (w2 << 4)) & mask; + out[13] = (w2 >> 1) & mask; + out[14] = (w2 >> 6) & mask; + out[15] = (w2 >> 11) & mask; + out[16] = (w2 >> 16) & mask; + out[17] = (w2 >> 21) & mask; + out[18] = (w2 >> 26) & mask; + out[19] = ((w2 >> 31) | (w3 << 1)) & mask; + out[20] = (w3 >> 4) & mask; + out[21] = (w3 >> 9) & mask; + out[22] = (w3 >> 14) & mask; + out[23] = (w3 >> 19) & mask; + out[24] = (w3 >> 24) & mask; + out[25] = ((w3 >> 29) | (w4 << 3)) & mask; + out[26] = (w4 >> 2) & mask; + out[27] = (w4 >> 7) & mask; + out[28] = (w4 >> 12) & mask; + out[29] = (w4 >> 17) & mask; + out[30] = (w4 >> 22) & mask; + out[31] = w4 >> 27; + + return in; +} + +inline const uint8_t* unpack6_32(const uint8_t* in, uint32_t* out){ + constexpr uint32_t mask = ((uint32_t{1} << 6) - uint32_t{1}); + + const auto w0 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w1 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w2 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w3 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w4 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w5 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + out[0] = (w0) & mask; + out[1] = (w0 >> 6) & mask; + out[2] = (w0 >> 12) & mask; + out[3] = (w0 >> 18) & mask; + out[4] = (w0 >> 24) & mask; + out[5] = ((w0 >> 30) | (w1 << 2)) & mask; + out[6] = (w1 >> 4) & mask; + out[7] = (w1 >> 10) & mask; + out[8] = (w1 >> 16) & mask; + out[9] = (w1 >> 22) & mask; + out[10] = ((w1 >> 28) | (w2 << 4)) & mask; + out[11] = (w2 >> 2) & mask; + out[12] = (w2 >> 8) & mask; + out[13] = (w2 >> 14) & mask; + out[14] = (w2 >> 20) & mask; + out[15] = w2 >> 26; + out[16] = (w3) & mask; + out[17] = (w3 >> 6) & mask; + out[18] = (w3 >> 12) & mask; + out[19] = (w3 >> 18) & mask; + out[20] = (w3 >> 24) & mask; + out[21] = ((w3 >> 30) | (w4 << 2)) & mask; + out[22] = (w4 >> 4) & mask; + out[23] = (w4 >> 10) & mask; + out[24] = (w4 >> 16) & mask; + out[25] = (w4 >> 22) & mask; + out[26] = ((w4 >> 28) | (w5 << 4)) & mask; + out[27] = (w5 >> 2) & mask; + out[28] = (w5 >> 8) & mask; + out[29] = (w5 >> 14) & mask; + out[30] = (w5 >> 20) & mask; + out[31] = w5 >> 26; + + return in; +} + +inline const uint8_t* unpack7_32(const uint8_t* in, uint32_t* out){ + constexpr uint32_t mask = ((uint32_t{1} << 7) - uint32_t{1}); + + const auto w0 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w1 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w2 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w3 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w4 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w5 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w6 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + out[0] = (w0) & mask; + out[1] = (w0 >> 7) & mask; + out[2] = (w0 >> 14) & mask; + out[3] = (w0 >> 21) & mask; + out[4] = ((w0 >> 28) | (w1 << 4)) & mask; + out[5] = (w1 >> 3) & mask; + out[6] = (w1 >> 10) & mask; + out[7] = (w1 >> 17) & mask; + out[8] = (w1 >> 24) & mask; + out[9] = ((w1 >> 31) | (w2 << 1)) & mask; + out[10] = (w2 >> 6) & mask; + out[11] = (w2 >> 13) & mask; + out[12] = (w2 >> 20) & mask; + out[13] = ((w2 >> 27) | (w3 << 5)) & mask; + out[14] = (w3 >> 2) & mask; + out[15] = (w3 >> 9) & mask; + out[16] = (w3 >> 16) & mask; + out[17] = (w3 >> 23) & mask; + out[18] = ((w3 >> 30) | (w4 << 2)) & mask; + out[19] = (w4 >> 5) & mask; + out[20] = (w4 >> 12) & mask; + out[21] = (w4 >> 19) & mask; + out[22] = ((w4 >> 26) | (w5 << 6)) & mask; + out[23] = (w5 >> 1) & mask; + out[24] = (w5 >> 8) & mask; + out[25] = (w5 >> 15) & mask; + out[26] = (w5 >> 22) & mask; + out[27] = ((w5 >> 29) | (w6 << 3)) & mask; + out[28] = (w6 >> 4) & mask; + out[29] = (w6 >> 11) & mask; + out[30] = (w6 >> 18) & mask; + out[31] = w6 >> 25; + + return in; +} + +inline const uint8_t* unpack8_32(const uint8_t* in, uint32_t* out){ + constexpr uint32_t mask = ((uint32_t{1} << 8) - uint32_t{1}); + + const auto w0 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w1 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w2 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w3 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w4 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w5 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w6 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w7 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + out[0] = (w0) & mask; + out[1] = (w0 >> 8) & mask; + out[2] = (w0 >> 16) & mask; + out[3] = w0 >> 24; + out[4] = (w1) & mask; + out[5] = (w1 >> 8) & mask; + out[6] = (w1 >> 16) & mask; + out[7] = w1 >> 24; + out[8] = (w2) & mask; + out[9] = (w2 >> 8) & mask; + out[10] = (w2 >> 16) & mask; + out[11] = w2 >> 24; + out[12] = (w3) & mask; + out[13] = (w3 >> 8) & mask; + out[14] = (w3 >> 16) & mask; + out[15] = w3 >> 24; + out[16] = (w4) & mask; + out[17] = (w4 >> 8) & mask; + out[18] = (w4 >> 16) & mask; + out[19] = w4 >> 24; + out[20] = (w5) & mask; + out[21] = (w5 >> 8) & mask; + out[22] = (w5 >> 16) & mask; + out[23] = w5 >> 24; + out[24] = (w6) & mask; + out[25] = (w6 >> 8) & mask; + out[26] = (w6 >> 16) & mask; + out[27] = w6 >> 24; + out[28] = (w7) & mask; + out[29] = (w7 >> 8) & mask; + out[30] = (w7 >> 16) & mask; + out[31] = w7 >> 24; + + return in; +} + +inline const uint8_t* unpack9_32(const uint8_t* in, uint32_t* out){ + constexpr uint32_t mask = ((uint32_t{1} << 9) - uint32_t{1}); + + const auto w0 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w1 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w2 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w3 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w4 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w5 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w6 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w7 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w8 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + out[0] = (w0) & mask; + out[1] = (w0 >> 9) & mask; + out[2] = (w0 >> 18) & mask; + out[3] = ((w0 >> 27) | (w1 << 5)) & mask; + out[4] = (w1 >> 4) & mask; + out[5] = (w1 >> 13) & mask; + out[6] = (w1 >> 22) & mask; + out[7] = ((w1 >> 31) | (w2 << 1)) & mask; + out[8] = (w2 >> 8) & mask; + out[9] = (w2 >> 17) & mask; + out[10] = ((w2 >> 26) | (w3 << 6)) & mask; + out[11] = (w3 >> 3) & mask; + out[12] = (w3 >> 12) & mask; + out[13] = (w3 >> 21) & mask; + out[14] = ((w3 >> 30) | (w4 << 2)) & mask; + out[15] = (w4 >> 7) & mask; + out[16] = (w4 >> 16) & mask; + out[17] = ((w4 >> 25) | (w5 << 7)) & mask; + out[18] = (w5 >> 2) & mask; + out[19] = (w5 >> 11) & mask; + out[20] = (w5 >> 20) & mask; + out[21] = ((w5 >> 29) | (w6 << 3)) & mask; + out[22] = (w6 >> 6) & mask; + out[23] = (w6 >> 15) & mask; + out[24] = ((w6 >> 24) | (w7 << 8)) & mask; + out[25] = (w7 >> 1) & mask; + out[26] = (w7 >> 10) & mask; + out[27] = (w7 >> 19) & mask; + out[28] = ((w7 >> 28) | (w8 << 4)) & mask; + out[29] = (w8 >> 5) & mask; + out[30] = (w8 >> 14) & mask; + out[31] = w8 >> 23; + + return in; +} + +inline const uint8_t* unpack10_32(const uint8_t* in, uint32_t* out){ + constexpr uint32_t mask = ((uint32_t{1} << 10) - uint32_t{1}); + + const auto w0 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w1 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w2 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w3 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w4 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w5 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w6 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w7 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w8 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w9 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + out[0] = (w0) & mask; + out[1] = (w0 >> 10) & mask; + out[2] = (w0 >> 20) & mask; + out[3] = ((w0 >> 30) | (w1 << 2)) & mask; + out[4] = (w1 >> 8) & mask; + out[5] = (w1 >> 18) & mask; + out[6] = ((w1 >> 28) | (w2 << 4)) & mask; + out[7] = (w2 >> 6) & mask; + out[8] = (w2 >> 16) & mask; + out[9] = ((w2 >> 26) | (w3 << 6)) & mask; + out[10] = (w3 >> 4) & mask; + out[11] = (w3 >> 14) & mask; + out[12] = ((w3 >> 24) | (w4 << 8)) & mask; + out[13] = (w4 >> 2) & mask; + out[14] = (w4 >> 12) & mask; + out[15] = w4 >> 22; + out[16] = (w5) & mask; + out[17] = (w5 >> 10) & mask; + out[18] = (w5 >> 20) & mask; + out[19] = ((w5 >> 30) | (w6 << 2)) & mask; + out[20] = (w6 >> 8) & mask; + out[21] = (w6 >> 18) & mask; + out[22] = ((w6 >> 28) | (w7 << 4)) & mask; + out[23] = (w7 >> 6) & mask; + out[24] = (w7 >> 16) & mask; + out[25] = ((w7 >> 26) | (w8 << 6)) & mask; + out[26] = (w8 >> 4) & mask; + out[27] = (w8 >> 14) & mask; + out[28] = ((w8 >> 24) | (w9 << 8)) & mask; + out[29] = (w9 >> 2) & mask; + out[30] = (w9 >> 12) & mask; + out[31] = w9 >> 22; + + return in; +} + +inline const uint8_t* unpack11_32(const uint8_t* in, uint32_t* out){ + constexpr uint32_t mask = ((uint32_t{1} << 11) - uint32_t{1}); + + const auto w0 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w1 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w2 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w3 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w4 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w5 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w6 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w7 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w8 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w9 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w10 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + out[0] = (w0) & mask; + out[1] = (w0 >> 11) & mask; + out[2] = ((w0 >> 22) | (w1 << 10)) & mask; + out[3] = (w1 >> 1) & mask; + out[4] = (w1 >> 12) & mask; + out[5] = ((w1 >> 23) | (w2 << 9)) & mask; + out[6] = (w2 >> 2) & mask; + out[7] = (w2 >> 13) & mask; + out[8] = ((w2 >> 24) | (w3 << 8)) & mask; + out[9] = (w3 >> 3) & mask; + out[10] = (w3 >> 14) & mask; + out[11] = ((w3 >> 25) | (w4 << 7)) & mask; + out[12] = (w4 >> 4) & mask; + out[13] = (w4 >> 15) & mask; + out[14] = ((w4 >> 26) | (w5 << 6)) & mask; + out[15] = (w5 >> 5) & mask; + out[16] = (w5 >> 16) & mask; + out[17] = ((w5 >> 27) | (w6 << 5)) & mask; + out[18] = (w6 >> 6) & mask; + out[19] = (w6 >> 17) & mask; + out[20] = ((w6 >> 28) | (w7 << 4)) & mask; + out[21] = (w7 >> 7) & mask; + out[22] = (w7 >> 18) & mask; + out[23] = ((w7 >> 29) | (w8 << 3)) & mask; + out[24] = (w8 >> 8) & mask; + out[25] = (w8 >> 19) & mask; + out[26] = ((w8 >> 30) | (w9 << 2)) & mask; + out[27] = (w9 >> 9) & mask; + out[28] = (w9 >> 20) & mask; + out[29] = ((w9 >> 31) | (w10 << 1)) & mask; + out[30] = (w10 >> 10) & mask; + out[31] = w10 >> 21; + + return in; +} + +inline const uint8_t* unpack12_32(const uint8_t* in, uint32_t* out){ + constexpr uint32_t mask = ((uint32_t{1} << 12) - uint32_t{1}); + + const auto w0 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w1 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w2 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w3 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w4 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w5 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w6 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w7 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w8 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w9 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w10 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w11 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + out[0] = (w0) & mask; + out[1] = (w0 >> 12) & mask; + out[2] = ((w0 >> 24) | (w1 << 8)) & mask; + out[3] = (w1 >> 4) & mask; + out[4] = (w1 >> 16) & mask; + out[5] = ((w1 >> 28) | (w2 << 4)) & mask; + out[6] = (w2 >> 8) & mask; + out[7] = w2 >> 20; + out[8] = (w3) & mask; + out[9] = (w3 >> 12) & mask; + out[10] = ((w3 >> 24) | (w4 << 8)) & mask; + out[11] = (w4 >> 4) & mask; + out[12] = (w4 >> 16) & mask; + out[13] = ((w4 >> 28) | (w5 << 4)) & mask; + out[14] = (w5 >> 8) & mask; + out[15] = w5 >> 20; + out[16] = (w6) & mask; + out[17] = (w6 >> 12) & mask; + out[18] = ((w6 >> 24) | (w7 << 8)) & mask; + out[19] = (w7 >> 4) & mask; + out[20] = (w7 >> 16) & mask; + out[21] = ((w7 >> 28) | (w8 << 4)) & mask; + out[22] = (w8 >> 8) & mask; + out[23] = w8 >> 20; + out[24] = (w9) & mask; + out[25] = (w9 >> 12) & mask; + out[26] = ((w9 >> 24) | (w10 << 8)) & mask; + out[27] = (w10 >> 4) & mask; + out[28] = (w10 >> 16) & mask; + out[29] = ((w10 >> 28) | (w11 << 4)) & mask; + out[30] = (w11 >> 8) & mask; + out[31] = w11 >> 20; + + return in; +} + +inline const uint8_t* unpack13_32(const uint8_t* in, uint32_t* out){ + constexpr uint32_t mask = ((uint32_t{1} << 13) - uint32_t{1}); + + const auto w0 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w1 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w2 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w3 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w4 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w5 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w6 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w7 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w8 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w9 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w10 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w11 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w12 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + out[0] = (w0) & mask; + out[1] = (w0 >> 13) & mask; + out[2] = ((w0 >> 26) | (w1 << 6)) & mask; + out[3] = (w1 >> 7) & mask; + out[4] = ((w1 >> 20) | (w2 << 12)) & mask; + out[5] = (w2 >> 1) & mask; + out[6] = (w2 >> 14) & mask; + out[7] = ((w2 >> 27) | (w3 << 5)) & mask; + out[8] = (w3 >> 8) & mask; + out[9] = ((w3 >> 21) | (w4 << 11)) & mask; + out[10] = (w4 >> 2) & mask; + out[11] = (w4 >> 15) & mask; + out[12] = ((w4 >> 28) | (w5 << 4)) & mask; + out[13] = (w5 >> 9) & mask; + out[14] = ((w5 >> 22) | (w6 << 10)) & mask; + out[15] = (w6 >> 3) & mask; + out[16] = (w6 >> 16) & mask; + out[17] = ((w6 >> 29) | (w7 << 3)) & mask; + out[18] = (w7 >> 10) & mask; + out[19] = ((w7 >> 23) | (w8 << 9)) & mask; + out[20] = (w8 >> 4) & mask; + out[21] = (w8 >> 17) & mask; + out[22] = ((w8 >> 30) | (w9 << 2)) & mask; + out[23] = (w9 >> 11) & mask; + out[24] = ((w9 >> 24) | (w10 << 8)) & mask; + out[25] = (w10 >> 5) & mask; + out[26] = (w10 >> 18) & mask; + out[27] = ((w10 >> 31) | (w11 << 1)) & mask; + out[28] = (w11 >> 12) & mask; + out[29] = ((w11 >> 25) | (w12 << 7)) & mask; + out[30] = (w12 >> 6) & mask; + out[31] = w12 >> 19; + + return in; +} + +inline const uint8_t* unpack14_32(const uint8_t* in, uint32_t* out){ + constexpr uint32_t mask = ((uint32_t{1} << 14) - uint32_t{1}); + + const auto w0 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w1 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w2 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w3 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w4 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w5 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w6 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w7 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w8 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w9 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w10 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w11 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w12 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w13 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + out[0] = (w0) & mask; + out[1] = (w0 >> 14) & mask; + out[2] = ((w0 >> 28) | (w1 << 4)) & mask; + out[3] = (w1 >> 10) & mask; + out[4] = ((w1 >> 24) | (w2 << 8)) & mask; + out[5] = (w2 >> 6) & mask; + out[6] = ((w2 >> 20) | (w3 << 12)) & mask; + out[7] = (w3 >> 2) & mask; + out[8] = (w3 >> 16) & mask; + out[9] = ((w3 >> 30) | (w4 << 2)) & mask; + out[10] = (w4 >> 12) & mask; + out[11] = ((w4 >> 26) | (w5 << 6)) & mask; + out[12] = (w5 >> 8) & mask; + out[13] = ((w5 >> 22) | (w6 << 10)) & mask; + out[14] = (w6 >> 4) & mask; + out[15] = w6 >> 18; + out[16] = (w7) & mask; + out[17] = (w7 >> 14) & mask; + out[18] = ((w7 >> 28) | (w8 << 4)) & mask; + out[19] = (w8 >> 10) & mask; + out[20] = ((w8 >> 24) | (w9 << 8)) & mask; + out[21] = (w9 >> 6) & mask; + out[22] = ((w9 >> 20) | (w10 << 12)) & mask; + out[23] = (w10 >> 2) & mask; + out[24] = (w10 >> 16) & mask; + out[25] = ((w10 >> 30) | (w11 << 2)) & mask; + out[26] = (w11 >> 12) & mask; + out[27] = ((w11 >> 26) | (w12 << 6)) & mask; + out[28] = (w12 >> 8) & mask; + out[29] = ((w12 >> 22) | (w13 << 10)) & mask; + out[30] = (w13 >> 4) & mask; + out[31] = w13 >> 18; + + return in; +} + +inline const uint8_t* unpack15_32(const uint8_t* in, uint32_t* out){ + constexpr uint32_t mask = ((uint32_t{1} << 15) - uint32_t{1}); + + const auto w0 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w1 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w2 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w3 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w4 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w5 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w6 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w7 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w8 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w9 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w10 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w11 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w12 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w13 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w14 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + out[0] = (w0) & mask; + out[1] = (w0 >> 15) & mask; + out[2] = ((w0 >> 30) | (w1 << 2)) & mask; + out[3] = (w1 >> 13) & mask; + out[4] = ((w1 >> 28) | (w2 << 4)) & mask; + out[5] = (w2 >> 11) & mask; + out[6] = ((w2 >> 26) | (w3 << 6)) & mask; + out[7] = (w3 >> 9) & mask; + out[8] = ((w3 >> 24) | (w4 << 8)) & mask; + out[9] = (w4 >> 7) & mask; + out[10] = ((w4 >> 22) | (w5 << 10)) & mask; + out[11] = (w5 >> 5) & mask; + out[12] = ((w5 >> 20) | (w6 << 12)) & mask; + out[13] = (w6 >> 3) & mask; + out[14] = ((w6 >> 18) | (w7 << 14)) & mask; + out[15] = (w7 >> 1) & mask; + out[16] = (w7 >> 16) & mask; + out[17] = ((w7 >> 31) | (w8 << 1)) & mask; + out[18] = (w8 >> 14) & mask; + out[19] = ((w8 >> 29) | (w9 << 3)) & mask; + out[20] = (w9 >> 12) & mask; + out[21] = ((w9 >> 27) | (w10 << 5)) & mask; + out[22] = (w10 >> 10) & mask; + out[23] = ((w10 >> 25) | (w11 << 7)) & mask; + out[24] = (w11 >> 8) & mask; + out[25] = ((w11 >> 23) | (w12 << 9)) & mask; + out[26] = (w12 >> 6) & mask; + out[27] = ((w12 >> 21) | (w13 << 11)) & mask; + out[28] = (w13 >> 4) & mask; + out[29] = ((w13 >> 19) | (w14 << 13)) & mask; + out[30] = (w14 >> 2) & mask; + out[31] = w14 >> 17; + + return in; +} + +inline const uint8_t* unpack16_32(const uint8_t* in, uint32_t* out){ + constexpr uint32_t mask = ((uint32_t{1} << 16) - uint32_t{1}); + + const auto w0 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w1 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w2 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w3 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w4 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w5 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w6 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w7 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w8 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w9 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w10 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w11 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w12 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w13 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w14 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w15 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + out[0] = (w0) & mask; + out[1] = w0 >> 16; + out[2] = (w1) & mask; + out[3] = w1 >> 16; + out[4] = (w2) & mask; + out[5] = w2 >> 16; + out[6] = (w3) & mask; + out[7] = w3 >> 16; + out[8] = (w4) & mask; + out[9] = w4 >> 16; + out[10] = (w5) & mask; + out[11] = w5 >> 16; + out[12] = (w6) & mask; + out[13] = w6 >> 16; + out[14] = (w7) & mask; + out[15] = w7 >> 16; + out[16] = (w8) & mask; + out[17] = w8 >> 16; + out[18] = (w9) & mask; + out[19] = w9 >> 16; + out[20] = (w10) & mask; + out[21] = w10 >> 16; + out[22] = (w11) & mask; + out[23] = w11 >> 16; + out[24] = (w12) & mask; + out[25] = w12 >> 16; + out[26] = (w13) & mask; + out[27] = w13 >> 16; + out[28] = (w14) & mask; + out[29] = w14 >> 16; + out[30] = (w15) & mask; + out[31] = w15 >> 16; + + return in; +} + +inline const uint8_t* unpack17_32(const uint8_t* in, uint32_t* out){ + constexpr uint32_t mask = ((uint32_t{1} << 17) - uint32_t{1}); + + const auto w0 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w1 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w2 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w3 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w4 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w5 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w6 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w7 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w8 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w9 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w10 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w11 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w12 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w13 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w14 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w15 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w16 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + out[0] = (w0) & mask; + out[1] = ((w0 >> 17) | (w1 << 15)) & mask; + out[2] = (w1 >> 2) & mask; + out[3] = ((w1 >> 19) | (w2 << 13)) & mask; + out[4] = (w2 >> 4) & mask; + out[5] = ((w2 >> 21) | (w3 << 11)) & mask; + out[6] = (w3 >> 6) & mask; + out[7] = ((w3 >> 23) | (w4 << 9)) & mask; + out[8] = (w4 >> 8) & mask; + out[9] = ((w4 >> 25) | (w5 << 7)) & mask; + out[10] = (w5 >> 10) & mask; + out[11] = ((w5 >> 27) | (w6 << 5)) & mask; + out[12] = (w6 >> 12) & mask; + out[13] = ((w6 >> 29) | (w7 << 3)) & mask; + out[14] = (w7 >> 14) & mask; + out[15] = ((w7 >> 31) | (w8 << 1)) & mask; + out[16] = ((w8 >> 16) | (w9 << 16)) & mask; + out[17] = (w9 >> 1) & mask; + out[18] = ((w9 >> 18) | (w10 << 14)) & mask; + out[19] = (w10 >> 3) & mask; + out[20] = ((w10 >> 20) | (w11 << 12)) & mask; + out[21] = (w11 >> 5) & mask; + out[22] = ((w11 >> 22) | (w12 << 10)) & mask; + out[23] = (w12 >> 7) & mask; + out[24] = ((w12 >> 24) | (w13 << 8)) & mask; + out[25] = (w13 >> 9) & mask; + out[26] = ((w13 >> 26) | (w14 << 6)) & mask; + out[27] = (w14 >> 11) & mask; + out[28] = ((w14 >> 28) | (w15 << 4)) & mask; + out[29] = (w15 >> 13) & mask; + out[30] = ((w15 >> 30) | (w16 << 2)) & mask; + out[31] = w16 >> 15; + + return in; +} + +inline const uint8_t* unpack18_32(const uint8_t* in, uint32_t* out){ + constexpr uint32_t mask = ((uint32_t{1} << 18) - uint32_t{1}); + + const auto w0 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w1 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w2 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w3 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w4 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w5 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w6 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w7 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w8 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w9 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w10 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w11 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w12 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w13 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w14 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w15 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w16 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w17 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + out[0] = (w0) & mask; + out[1] = ((w0 >> 18) | (w1 << 14)) & mask; + out[2] = (w1 >> 4) & mask; + out[3] = ((w1 >> 22) | (w2 << 10)) & mask; + out[4] = (w2 >> 8) & mask; + out[5] = ((w2 >> 26) | (w3 << 6)) & mask; + out[6] = (w3 >> 12) & mask; + out[7] = ((w3 >> 30) | (w4 << 2)) & mask; + out[8] = ((w4 >> 16) | (w5 << 16)) & mask; + out[9] = (w5 >> 2) & mask; + out[10] = ((w5 >> 20) | (w6 << 12)) & mask; + out[11] = (w6 >> 6) & mask; + out[12] = ((w6 >> 24) | (w7 << 8)) & mask; + out[13] = (w7 >> 10) & mask; + out[14] = ((w7 >> 28) | (w8 << 4)) & mask; + out[15] = w8 >> 14; + out[16] = (w9) & mask; + out[17] = ((w9 >> 18) | (w10 << 14)) & mask; + out[18] = (w10 >> 4) & mask; + out[19] = ((w10 >> 22) | (w11 << 10)) & mask; + out[20] = (w11 >> 8) & mask; + out[21] = ((w11 >> 26) | (w12 << 6)) & mask; + out[22] = (w12 >> 12) & mask; + out[23] = ((w12 >> 30) | (w13 << 2)) & mask; + out[24] = ((w13 >> 16) | (w14 << 16)) & mask; + out[25] = (w14 >> 2) & mask; + out[26] = ((w14 >> 20) | (w15 << 12)) & mask; + out[27] = (w15 >> 6) & mask; + out[28] = ((w15 >> 24) | (w16 << 8)) & mask; + out[29] = (w16 >> 10) & mask; + out[30] = ((w16 >> 28) | (w17 << 4)) & mask; + out[31] = w17 >> 14; + + return in; +} + +inline const uint8_t* unpack19_32(const uint8_t* in, uint32_t* out){ + constexpr uint32_t mask = ((uint32_t{1} << 19) - uint32_t{1}); + + const auto w0 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w1 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w2 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w3 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w4 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w5 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w6 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w7 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w8 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w9 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w10 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w11 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w12 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w13 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w14 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w15 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w16 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w17 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w18 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + out[0] = (w0) & mask; + out[1] = ((w0 >> 19) | (w1 << 13)) & mask; + out[2] = (w1 >> 6) & mask; + out[3] = ((w1 >> 25) | (w2 << 7)) & mask; + out[4] = (w2 >> 12) & mask; + out[5] = ((w2 >> 31) | (w3 << 1)) & mask; + out[6] = ((w3 >> 18) | (w4 << 14)) & mask; + out[7] = (w4 >> 5) & mask; + out[8] = ((w4 >> 24) | (w5 << 8)) & mask; + out[9] = (w5 >> 11) & mask; + out[10] = ((w5 >> 30) | (w6 << 2)) & mask; + out[11] = ((w6 >> 17) | (w7 << 15)) & mask; + out[12] = (w7 >> 4) & mask; + out[13] = ((w7 >> 23) | (w8 << 9)) & mask; + out[14] = (w8 >> 10) & mask; + out[15] = ((w8 >> 29) | (w9 << 3)) & mask; + out[16] = ((w9 >> 16) | (w10 << 16)) & mask; + out[17] = (w10 >> 3) & mask; + out[18] = ((w10 >> 22) | (w11 << 10)) & mask; + out[19] = (w11 >> 9) & mask; + out[20] = ((w11 >> 28) | (w12 << 4)) & mask; + out[21] = ((w12 >> 15) | (w13 << 17)) & mask; + out[22] = (w13 >> 2) & mask; + out[23] = ((w13 >> 21) | (w14 << 11)) & mask; + out[24] = (w14 >> 8) & mask; + out[25] = ((w14 >> 27) | (w15 << 5)) & mask; + out[26] = ((w15 >> 14) | (w16 << 18)) & mask; + out[27] = (w16 >> 1) & mask; + out[28] = ((w16 >> 20) | (w17 << 12)) & mask; + out[29] = (w17 >> 7) & mask; + out[30] = ((w17 >> 26) | (w18 << 6)) & mask; + out[31] = w18 >> 13; + + return in; +} + +inline const uint8_t* unpack20_32(const uint8_t* in, uint32_t* out){ + constexpr uint32_t mask = ((uint32_t{1} << 20) - uint32_t{1}); + + const auto w0 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w1 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w2 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w3 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w4 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w5 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w6 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w7 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w8 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w9 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w10 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w11 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w12 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w13 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w14 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w15 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w16 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w17 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w18 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w19 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + out[0] = (w0) & mask; + out[1] = ((w0 >> 20) | (w1 << 12)) & mask; + out[2] = (w1 >> 8) & mask; + out[3] = ((w1 >> 28) | (w2 << 4)) & mask; + out[4] = ((w2 >> 16) | (w3 << 16)) & mask; + out[5] = (w3 >> 4) & mask; + out[6] = ((w3 >> 24) | (w4 << 8)) & mask; + out[7] = w4 >> 12; + out[8] = (w5) & mask; + out[9] = ((w5 >> 20) | (w6 << 12)) & mask; + out[10] = (w6 >> 8) & mask; + out[11] = ((w6 >> 28) | (w7 << 4)) & mask; + out[12] = ((w7 >> 16) | (w8 << 16)) & mask; + out[13] = (w8 >> 4) & mask; + out[14] = ((w8 >> 24) | (w9 << 8)) & mask; + out[15] = w9 >> 12; + out[16] = (w10) & mask; + out[17] = ((w10 >> 20) | (w11 << 12)) & mask; + out[18] = (w11 >> 8) & mask; + out[19] = ((w11 >> 28) | (w12 << 4)) & mask; + out[20] = ((w12 >> 16) | (w13 << 16)) & mask; + out[21] = (w13 >> 4) & mask; + out[22] = ((w13 >> 24) | (w14 << 8)) & mask; + out[23] = w14 >> 12; + out[24] = (w15) & mask; + out[25] = ((w15 >> 20) | (w16 << 12)) & mask; + out[26] = (w16 >> 8) & mask; + out[27] = ((w16 >> 28) | (w17 << 4)) & mask; + out[28] = ((w17 >> 16) | (w18 << 16)) & mask; + out[29] = (w18 >> 4) & mask; + out[30] = ((w18 >> 24) | (w19 << 8)) & mask; + out[31] = w19 >> 12; + + return in; +} + +inline const uint8_t* unpack21_32(const uint8_t* in, uint32_t* out){ + constexpr uint32_t mask = ((uint32_t{1} << 21) - uint32_t{1}); + + const auto w0 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w1 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w2 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w3 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w4 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w5 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w6 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w7 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w8 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w9 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w10 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w11 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w12 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w13 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w14 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w15 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w16 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w17 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w18 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w19 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w20 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + out[0] = (w0) & mask; + out[1] = ((w0 >> 21) | (w1 << 11)) & mask; + out[2] = (w1 >> 10) & mask; + out[3] = ((w1 >> 31) | (w2 << 1)) & mask; + out[4] = ((w2 >> 20) | (w3 << 12)) & mask; + out[5] = (w3 >> 9) & mask; + out[6] = ((w3 >> 30) | (w4 << 2)) & mask; + out[7] = ((w4 >> 19) | (w5 << 13)) & mask; + out[8] = (w5 >> 8) & mask; + out[9] = ((w5 >> 29) | (w6 << 3)) & mask; + out[10] = ((w6 >> 18) | (w7 << 14)) & mask; + out[11] = (w7 >> 7) & mask; + out[12] = ((w7 >> 28) | (w8 << 4)) & mask; + out[13] = ((w8 >> 17) | (w9 << 15)) & mask; + out[14] = (w9 >> 6) & mask; + out[15] = ((w9 >> 27) | (w10 << 5)) & mask; + out[16] = ((w10 >> 16) | (w11 << 16)) & mask; + out[17] = (w11 >> 5) & mask; + out[18] = ((w11 >> 26) | (w12 << 6)) & mask; + out[19] = ((w12 >> 15) | (w13 << 17)) & mask; + out[20] = (w13 >> 4) & mask; + out[21] = ((w13 >> 25) | (w14 << 7)) & mask; + out[22] = ((w14 >> 14) | (w15 << 18)) & mask; + out[23] = (w15 >> 3) & mask; + out[24] = ((w15 >> 24) | (w16 << 8)) & mask; + out[25] = ((w16 >> 13) | (w17 << 19)) & mask; + out[26] = (w17 >> 2) & mask; + out[27] = ((w17 >> 23) | (w18 << 9)) & mask; + out[28] = ((w18 >> 12) | (w19 << 20)) & mask; + out[29] = (w19 >> 1) & mask; + out[30] = ((w19 >> 22) | (w20 << 10)) & mask; + out[31] = w20 >> 11; + + return in; +} + +inline const uint8_t* unpack22_32(const uint8_t* in, uint32_t* out){ + constexpr uint32_t mask = ((uint32_t{1} << 22) - uint32_t{1}); + + const auto w0 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w1 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w2 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w3 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w4 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w5 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w6 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w7 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w8 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w9 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w10 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w11 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w12 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w13 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w14 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w15 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w16 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w17 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w18 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w19 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w20 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w21 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + out[0] = (w0) & mask; + out[1] = ((w0 >> 22) | (w1 << 10)) & mask; + out[2] = ((w1 >> 12) | (w2 << 20)) & mask; + out[3] = (w2 >> 2) & mask; + out[4] = ((w2 >> 24) | (w3 << 8)) & mask; + out[5] = ((w3 >> 14) | (w4 << 18)) & mask; + out[6] = (w4 >> 4) & mask; + out[7] = ((w4 >> 26) | (w5 << 6)) & mask; + out[8] = ((w5 >> 16) | (w6 << 16)) & mask; + out[9] = (w6 >> 6) & mask; + out[10] = ((w6 >> 28) | (w7 << 4)) & mask; + out[11] = ((w7 >> 18) | (w8 << 14)) & mask; + out[12] = (w8 >> 8) & mask; + out[13] = ((w8 >> 30) | (w9 << 2)) & mask; + out[14] = ((w9 >> 20) | (w10 << 12)) & mask; + out[15] = w10 >> 10; + out[16] = (w11) & mask; + out[17] = ((w11 >> 22) | (w12 << 10)) & mask; + out[18] = ((w12 >> 12) | (w13 << 20)) & mask; + out[19] = (w13 >> 2) & mask; + out[20] = ((w13 >> 24) | (w14 << 8)) & mask; + out[21] = ((w14 >> 14) | (w15 << 18)) & mask; + out[22] = (w15 >> 4) & mask; + out[23] = ((w15 >> 26) | (w16 << 6)) & mask; + out[24] = ((w16 >> 16) | (w17 << 16)) & mask; + out[25] = (w17 >> 6) & mask; + out[26] = ((w17 >> 28) | (w18 << 4)) & mask; + out[27] = ((w18 >> 18) | (w19 << 14)) & mask; + out[28] = (w19 >> 8) & mask; + out[29] = ((w19 >> 30) | (w20 << 2)) & mask; + out[30] = ((w20 >> 20) | (w21 << 12)) & mask; + out[31] = w21 >> 10; + + return in; +} + +inline const uint8_t* unpack23_32(const uint8_t* in, uint32_t* out){ + constexpr uint32_t mask = ((uint32_t{1} << 23) - uint32_t{1}); + + const auto w0 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w1 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w2 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w3 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w4 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w5 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w6 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w7 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w8 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w9 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w10 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w11 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w12 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w13 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w14 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w15 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w16 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w17 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w18 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w19 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w20 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w21 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w22 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + out[0] = (w0) & mask; + out[1] = ((w0 >> 23) | (w1 << 9)) & mask; + out[2] = ((w1 >> 14) | (w2 << 18)) & mask; + out[3] = (w2 >> 5) & mask; + out[4] = ((w2 >> 28) | (w3 << 4)) & mask; + out[5] = ((w3 >> 19) | (w4 << 13)) & mask; + out[6] = ((w4 >> 10) | (w5 << 22)) & mask; + out[7] = (w5 >> 1) & mask; + out[8] = ((w5 >> 24) | (w6 << 8)) & mask; + out[9] = ((w6 >> 15) | (w7 << 17)) & mask; + out[10] = (w7 >> 6) & mask; + out[11] = ((w7 >> 29) | (w8 << 3)) & mask; + out[12] = ((w8 >> 20) | (w9 << 12)) & mask; + out[13] = ((w9 >> 11) | (w10 << 21)) & mask; + out[14] = (w10 >> 2) & mask; + out[15] = ((w10 >> 25) | (w11 << 7)) & mask; + out[16] = ((w11 >> 16) | (w12 << 16)) & mask; + out[17] = (w12 >> 7) & mask; + out[18] = ((w12 >> 30) | (w13 << 2)) & mask; + out[19] = ((w13 >> 21) | (w14 << 11)) & mask; + out[20] = ((w14 >> 12) | (w15 << 20)) & mask; + out[21] = (w15 >> 3) & mask; + out[22] = ((w15 >> 26) | (w16 << 6)) & mask; + out[23] = ((w16 >> 17) | (w17 << 15)) & mask; + out[24] = (w17 >> 8) & mask; + out[25] = ((w17 >> 31) | (w18 << 1)) & mask; + out[26] = ((w18 >> 22) | (w19 << 10)) & mask; + out[27] = ((w19 >> 13) | (w20 << 19)) & mask; + out[28] = (w20 >> 4) & mask; + out[29] = ((w20 >> 27) | (w21 << 5)) & mask; + out[30] = ((w21 >> 18) | (w22 << 14)) & mask; + out[31] = w22 >> 9; + + return in; +} + +inline const uint8_t* unpack24_32(const uint8_t* in, uint32_t* out){ + constexpr uint32_t mask = ((uint32_t{1} << 24) - uint32_t{1}); + + const auto w0 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w1 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w2 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w3 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w4 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w5 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w6 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w7 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w8 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w9 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w10 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w11 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w12 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w13 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w14 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w15 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w16 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w17 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w18 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w19 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w20 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w21 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w22 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w23 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + out[0] = (w0) & mask; + out[1] = ((w0 >> 24) | (w1 << 8)) & mask; + out[2] = ((w1 >> 16) | (w2 << 16)) & mask; + out[3] = w2 >> 8; + out[4] = (w3) & mask; + out[5] = ((w3 >> 24) | (w4 << 8)) & mask; + out[6] = ((w4 >> 16) | (w5 << 16)) & mask; + out[7] = w5 >> 8; + out[8] = (w6) & mask; + out[9] = ((w6 >> 24) | (w7 << 8)) & mask; + out[10] = ((w7 >> 16) | (w8 << 16)) & mask; + out[11] = w8 >> 8; + out[12] = (w9) & mask; + out[13] = ((w9 >> 24) | (w10 << 8)) & mask; + out[14] = ((w10 >> 16) | (w11 << 16)) & mask; + out[15] = w11 >> 8; + out[16] = (w12) & mask; + out[17] = ((w12 >> 24) | (w13 << 8)) & mask; + out[18] = ((w13 >> 16) | (w14 << 16)) & mask; + out[19] = w14 >> 8; + out[20] = (w15) & mask; + out[21] = ((w15 >> 24) | (w16 << 8)) & mask; + out[22] = ((w16 >> 16) | (w17 << 16)) & mask; + out[23] = w17 >> 8; + out[24] = (w18) & mask; + out[25] = ((w18 >> 24) | (w19 << 8)) & mask; + out[26] = ((w19 >> 16) | (w20 << 16)) & mask; + out[27] = w20 >> 8; + out[28] = (w21) & mask; + out[29] = ((w21 >> 24) | (w22 << 8)) & mask; + out[30] = ((w22 >> 16) | (w23 << 16)) & mask; + out[31] = w23 >> 8; + + return in; +} + +inline const uint8_t* unpack25_32(const uint8_t* in, uint32_t* out){ + constexpr uint32_t mask = ((uint32_t{1} << 25) - uint32_t{1}); + + const auto w0 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w1 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w2 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w3 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w4 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w5 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w6 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w7 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w8 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w9 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w10 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w11 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w12 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w13 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w14 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w15 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w16 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w17 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w18 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w19 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w20 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w21 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w22 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w23 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w24 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + out[0] = (w0) & mask; + out[1] = ((w0 >> 25) | (w1 << 7)) & mask; + out[2] = ((w1 >> 18) | (w2 << 14)) & mask; + out[3] = ((w2 >> 11) | (w3 << 21)) & mask; + out[4] = (w3 >> 4) & mask; + out[5] = ((w3 >> 29) | (w4 << 3)) & mask; + out[6] = ((w4 >> 22) | (w5 << 10)) & mask; + out[7] = ((w5 >> 15) | (w6 << 17)) & mask; + out[8] = ((w6 >> 8) | (w7 << 24)) & mask; + out[9] = (w7 >> 1) & mask; + out[10] = ((w7 >> 26) | (w8 << 6)) & mask; + out[11] = ((w8 >> 19) | (w9 << 13)) & mask; + out[12] = ((w9 >> 12) | (w10 << 20)) & mask; + out[13] = (w10 >> 5) & mask; + out[14] = ((w10 >> 30) | (w11 << 2)) & mask; + out[15] = ((w11 >> 23) | (w12 << 9)) & mask; + out[16] = ((w12 >> 16) | (w13 << 16)) & mask; + out[17] = ((w13 >> 9) | (w14 << 23)) & mask; + out[18] = (w14 >> 2) & mask; + out[19] = ((w14 >> 27) | (w15 << 5)) & mask; + out[20] = ((w15 >> 20) | (w16 << 12)) & mask; + out[21] = ((w16 >> 13) | (w17 << 19)) & mask; + out[22] = (w17 >> 6) & mask; + out[23] = ((w17 >> 31) | (w18 << 1)) & mask; + out[24] = ((w18 >> 24) | (w19 << 8)) & mask; + out[25] = ((w19 >> 17) | (w20 << 15)) & mask; + out[26] = ((w20 >> 10) | (w21 << 22)) & mask; + out[27] = (w21 >> 3) & mask; + out[28] = ((w21 >> 28) | (w22 << 4)) & mask; + out[29] = ((w22 >> 21) | (w23 << 11)) & mask; + out[30] = ((w23 >> 14) | (w24 << 18)) & mask; + out[31] = w24 >> 7; + + return in; +} + +inline const uint8_t* unpack26_32(const uint8_t* in, uint32_t* out){ + constexpr uint32_t mask = ((uint32_t{1} << 26) - uint32_t{1}); + + const auto w0 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w1 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w2 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w3 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w4 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w5 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w6 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w7 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w8 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w9 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w10 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w11 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w12 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w13 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w14 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w15 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w16 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w17 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w18 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w19 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w20 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w21 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w22 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w23 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w24 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w25 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + out[0] = (w0) & mask; + out[1] = ((w0 >> 26) | (w1 << 6)) & mask; + out[2] = ((w1 >> 20) | (w2 << 12)) & mask; + out[3] = ((w2 >> 14) | (w3 << 18)) & mask; + out[4] = ((w3 >> 8) | (w4 << 24)) & mask; + out[5] = (w4 >> 2) & mask; + out[6] = ((w4 >> 28) | (w5 << 4)) & mask; + out[7] = ((w5 >> 22) | (w6 << 10)) & mask; + out[8] = ((w6 >> 16) | (w7 << 16)) & mask; + out[9] = ((w7 >> 10) | (w8 << 22)) & mask; + out[10] = (w8 >> 4) & mask; + out[11] = ((w8 >> 30) | (w9 << 2)) & mask; + out[12] = ((w9 >> 24) | (w10 << 8)) & mask; + out[13] = ((w10 >> 18) | (w11 << 14)) & mask; + out[14] = ((w11 >> 12) | (w12 << 20)) & mask; + out[15] = w12 >> 6; + out[16] = (w13) & mask; + out[17] = ((w13 >> 26) | (w14 << 6)) & mask; + out[18] = ((w14 >> 20) | (w15 << 12)) & mask; + out[19] = ((w15 >> 14) | (w16 << 18)) & mask; + out[20] = ((w16 >> 8) | (w17 << 24)) & mask; + out[21] = (w17 >> 2) & mask; + out[22] = ((w17 >> 28) | (w18 << 4)) & mask; + out[23] = ((w18 >> 22) | (w19 << 10)) & mask; + out[24] = ((w19 >> 16) | (w20 << 16)) & mask; + out[25] = ((w20 >> 10) | (w21 << 22)) & mask; + out[26] = (w21 >> 4) & mask; + out[27] = ((w21 >> 30) | (w22 << 2)) & mask; + out[28] = ((w22 >> 24) | (w23 << 8)) & mask; + out[29] = ((w23 >> 18) | (w24 << 14)) & mask; + out[30] = ((w24 >> 12) | (w25 << 20)) & mask; + out[31] = w25 >> 6; + + return in; +} + +inline const uint8_t* unpack27_32(const uint8_t* in, uint32_t* out){ + constexpr uint32_t mask = ((uint32_t{1} << 27) - uint32_t{1}); + + const auto w0 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w1 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w2 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w3 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w4 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w5 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w6 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w7 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w8 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w9 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w10 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w11 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w12 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w13 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w14 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w15 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w16 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w17 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w18 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w19 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w20 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w21 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w22 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w23 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w24 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w25 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w26 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + out[0] = (w0) & mask; + out[1] = ((w0 >> 27) | (w1 << 5)) & mask; + out[2] = ((w1 >> 22) | (w2 << 10)) & mask; + out[3] = ((w2 >> 17) | (w3 << 15)) & mask; + out[4] = ((w3 >> 12) | (w4 << 20)) & mask; + out[5] = ((w4 >> 7) | (w5 << 25)) & mask; + out[6] = (w5 >> 2) & mask; + out[7] = ((w5 >> 29) | (w6 << 3)) & mask; + out[8] = ((w6 >> 24) | (w7 << 8)) & mask; + out[9] = ((w7 >> 19) | (w8 << 13)) & mask; + out[10] = ((w8 >> 14) | (w9 << 18)) & mask; + out[11] = ((w9 >> 9) | (w10 << 23)) & mask; + out[12] = (w10 >> 4) & mask; + out[13] = ((w10 >> 31) | (w11 << 1)) & mask; + out[14] = ((w11 >> 26) | (w12 << 6)) & mask; + out[15] = ((w12 >> 21) | (w13 << 11)) & mask; + out[16] = ((w13 >> 16) | (w14 << 16)) & mask; + out[17] = ((w14 >> 11) | (w15 << 21)) & mask; + out[18] = ((w15 >> 6) | (w16 << 26)) & mask; + out[19] = (w16 >> 1) & mask; + out[20] = ((w16 >> 28) | (w17 << 4)) & mask; + out[21] = ((w17 >> 23) | (w18 << 9)) & mask; + out[22] = ((w18 >> 18) | (w19 << 14)) & mask; + out[23] = ((w19 >> 13) | (w20 << 19)) & mask; + out[24] = ((w20 >> 8) | (w21 << 24)) & mask; + out[25] = (w21 >> 3) & mask; + out[26] = ((w21 >> 30) | (w22 << 2)) & mask; + out[27] = ((w22 >> 25) | (w23 << 7)) & mask; + out[28] = ((w23 >> 20) | (w24 << 12)) & mask; + out[29] = ((w24 >> 15) | (w25 << 17)) & mask; + out[30] = ((w25 >> 10) | (w26 << 22)) & mask; + out[31] = w26 >> 5; + + return in; +} + +inline const uint8_t* unpack28_32(const uint8_t* in, uint32_t* out){ + constexpr uint32_t mask = ((uint32_t{1} << 28) - uint32_t{1}); + + const auto w0 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w1 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w2 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w3 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w4 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w5 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w6 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w7 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w8 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w9 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w10 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w11 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w12 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w13 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w14 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w15 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w16 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w17 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w18 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w19 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w20 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w21 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w22 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w23 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w24 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w25 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w26 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w27 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + out[0] = (w0) & mask; + out[1] = ((w0 >> 28) | (w1 << 4)) & mask; + out[2] = ((w1 >> 24) | (w2 << 8)) & mask; + out[3] = ((w2 >> 20) | (w3 << 12)) & mask; + out[4] = ((w3 >> 16) | (w4 << 16)) & mask; + out[5] = ((w4 >> 12) | (w5 << 20)) & mask; + out[6] = ((w5 >> 8) | (w6 << 24)) & mask; + out[7] = w6 >> 4; + out[8] = (w7) & mask; + out[9] = ((w7 >> 28) | (w8 << 4)) & mask; + out[10] = ((w8 >> 24) | (w9 << 8)) & mask; + out[11] = ((w9 >> 20) | (w10 << 12)) & mask; + out[12] = ((w10 >> 16) | (w11 << 16)) & mask; + out[13] = ((w11 >> 12) | (w12 << 20)) & mask; + out[14] = ((w12 >> 8) | (w13 << 24)) & mask; + out[15] = w13 >> 4; + out[16] = (w14) & mask; + out[17] = ((w14 >> 28) | (w15 << 4)) & mask; + out[18] = ((w15 >> 24) | (w16 << 8)) & mask; + out[19] = ((w16 >> 20) | (w17 << 12)) & mask; + out[20] = ((w17 >> 16) | (w18 << 16)) & mask; + out[21] = ((w18 >> 12) | (w19 << 20)) & mask; + out[22] = ((w19 >> 8) | (w20 << 24)) & mask; + out[23] = w20 >> 4; + out[24] = (w21) & mask; + out[25] = ((w21 >> 28) | (w22 << 4)) & mask; + out[26] = ((w22 >> 24) | (w23 << 8)) & mask; + out[27] = ((w23 >> 20) | (w24 << 12)) & mask; + out[28] = ((w24 >> 16) | (w25 << 16)) & mask; + out[29] = ((w25 >> 12) | (w26 << 20)) & mask; + out[30] = ((w26 >> 8) | (w27 << 24)) & mask; + out[31] = w27 >> 4; + + return in; +} + +inline const uint8_t* unpack29_32(const uint8_t* in, uint32_t* out){ + constexpr uint32_t mask = ((uint32_t{1} << 29) - uint32_t{1}); + + const auto w0 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w1 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w2 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w3 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w4 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w5 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w6 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w7 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w8 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w9 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w10 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w11 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w12 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w13 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w14 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w15 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w16 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w17 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w18 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w19 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w20 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w21 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w22 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w23 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w24 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w25 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w26 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w27 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w28 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + out[0] = (w0) & mask; + out[1] = ((w0 >> 29) | (w1 << 3)) & mask; + out[2] = ((w1 >> 26) | (w2 << 6)) & mask; + out[3] = ((w2 >> 23) | (w3 << 9)) & mask; + out[4] = ((w3 >> 20) | (w4 << 12)) & mask; + out[5] = ((w4 >> 17) | (w5 << 15)) & mask; + out[6] = ((w5 >> 14) | (w6 << 18)) & mask; + out[7] = ((w6 >> 11) | (w7 << 21)) & mask; + out[8] = ((w7 >> 8) | (w8 << 24)) & mask; + out[9] = ((w8 >> 5) | (w9 << 27)) & mask; + out[10] = (w9 >> 2) & mask; + out[11] = ((w9 >> 31) | (w10 << 1)) & mask; + out[12] = ((w10 >> 28) | (w11 << 4)) & mask; + out[13] = ((w11 >> 25) | (w12 << 7)) & mask; + out[14] = ((w12 >> 22) | (w13 << 10)) & mask; + out[15] = ((w13 >> 19) | (w14 << 13)) & mask; + out[16] = ((w14 >> 16) | (w15 << 16)) & mask; + out[17] = ((w15 >> 13) | (w16 << 19)) & mask; + out[18] = ((w16 >> 10) | (w17 << 22)) & mask; + out[19] = ((w17 >> 7) | (w18 << 25)) & mask; + out[20] = ((w18 >> 4) | (w19 << 28)) & mask; + out[21] = (w19 >> 1) & mask; + out[22] = ((w19 >> 30) | (w20 << 2)) & mask; + out[23] = ((w20 >> 27) | (w21 << 5)) & mask; + out[24] = ((w21 >> 24) | (w22 << 8)) & mask; + out[25] = ((w22 >> 21) | (w23 << 11)) & mask; + out[26] = ((w23 >> 18) | (w24 << 14)) & mask; + out[27] = ((w24 >> 15) | (w25 << 17)) & mask; + out[28] = ((w25 >> 12) | (w26 << 20)) & mask; + out[29] = ((w26 >> 9) | (w27 << 23)) & mask; + out[30] = ((w27 >> 6) | (w28 << 26)) & mask; + out[31] = w28 >> 3; + + return in; +} + +inline const uint8_t* unpack30_32(const uint8_t* in, uint32_t* out){ + constexpr uint32_t mask = ((uint32_t{1} << 30) - uint32_t{1}); + + const auto w0 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w1 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w2 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w3 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w4 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w5 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w6 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w7 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w8 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w9 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w10 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w11 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w12 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w13 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w14 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w15 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w16 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w17 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w18 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w19 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w20 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w21 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w22 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w23 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w24 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w25 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w26 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w27 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w28 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w29 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + out[0] = (w0) & mask; + out[1] = ((w0 >> 30) | (w1 << 2)) & mask; + out[2] = ((w1 >> 28) | (w2 << 4)) & mask; + out[3] = ((w2 >> 26) | (w3 << 6)) & mask; + out[4] = ((w3 >> 24) | (w4 << 8)) & mask; + out[5] = ((w4 >> 22) | (w5 << 10)) & mask; + out[6] = ((w5 >> 20) | (w6 << 12)) & mask; + out[7] = ((w6 >> 18) | (w7 << 14)) & mask; + out[8] = ((w7 >> 16) | (w8 << 16)) & mask; + out[9] = ((w8 >> 14) | (w9 << 18)) & mask; + out[10] = ((w9 >> 12) | (w10 << 20)) & mask; + out[11] = ((w10 >> 10) | (w11 << 22)) & mask; + out[12] = ((w11 >> 8) | (w12 << 24)) & mask; + out[13] = ((w12 >> 6) | (w13 << 26)) & mask; + out[14] = ((w13 >> 4) | (w14 << 28)) & mask; + out[15] = w14 >> 2; + out[16] = (w15) & mask; + out[17] = ((w15 >> 30) | (w16 << 2)) & mask; + out[18] = ((w16 >> 28) | (w17 << 4)) & mask; + out[19] = ((w17 >> 26) | (w18 << 6)) & mask; + out[20] = ((w18 >> 24) | (w19 << 8)) & mask; + out[21] = ((w19 >> 22) | (w20 << 10)) & mask; + out[22] = ((w20 >> 20) | (w21 << 12)) & mask; + out[23] = ((w21 >> 18) | (w22 << 14)) & mask; + out[24] = ((w22 >> 16) | (w23 << 16)) & mask; + out[25] = ((w23 >> 14) | (w24 << 18)) & mask; + out[26] = ((w24 >> 12) | (w25 << 20)) & mask; + out[27] = ((w25 >> 10) | (w26 << 22)) & mask; + out[28] = ((w26 >> 8) | (w27 << 24)) & mask; + out[29] = ((w27 >> 6) | (w28 << 26)) & mask; + out[30] = ((w28 >> 4) | (w29 << 28)) & mask; + out[31] = w29 >> 2; + + return in; +} + +inline const uint8_t* unpack31_32(const uint8_t* in, uint32_t* out){ + constexpr uint32_t mask = ((uint32_t{1} << 31) - uint32_t{1}); + + const auto w0 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w1 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w2 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w3 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w4 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w5 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w6 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w7 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w8 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w9 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w10 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w11 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w12 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w13 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w14 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w15 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w16 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w17 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w18 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w19 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w20 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w21 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w22 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w23 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w24 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w25 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w26 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w27 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w28 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w29 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + const auto w30 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 4; + out[0] = (w0) & mask; + out[1] = ((w0 >> 31) | (w1 << 1)) & mask; + out[2] = ((w1 >> 30) | (w2 << 2)) & mask; + out[3] = ((w2 >> 29) | (w3 << 3)) & mask; + out[4] = ((w3 >> 28) | (w4 << 4)) & mask; + out[5] = ((w4 >> 27) | (w5 << 5)) & mask; + out[6] = ((w5 >> 26) | (w6 << 6)) & mask; + out[7] = ((w6 >> 25) | (w7 << 7)) & mask; + out[8] = ((w7 >> 24) | (w8 << 8)) & mask; + out[9] = ((w8 >> 23) | (w9 << 9)) & mask; + out[10] = ((w9 >> 22) | (w10 << 10)) & mask; + out[11] = ((w10 >> 21) | (w11 << 11)) & mask; + out[12] = ((w11 >> 20) | (w12 << 12)) & mask; + out[13] = ((w12 >> 19) | (w13 << 13)) & mask; + out[14] = ((w13 >> 18) | (w14 << 14)) & mask; + out[15] = ((w14 >> 17) | (w15 << 15)) & mask; + out[16] = ((w15 >> 16) | (w16 << 16)) & mask; + out[17] = ((w16 >> 15) | (w17 << 17)) & mask; + out[18] = ((w17 >> 14) | (w18 << 18)) & mask; + out[19] = ((w18 >> 13) | (w19 << 19)) & mask; + out[20] = ((w19 >> 12) | (w20 << 20)) & mask; + out[21] = ((w20 >> 11) | (w21 << 21)) & mask; + out[22] = ((w21 >> 10) | (w22 << 22)) & mask; + out[23] = ((w22 >> 9) | (w23 << 23)) & mask; + out[24] = ((w23 >> 8) | (w24 << 24)) & mask; + out[25] = ((w24 >> 7) | (w25 << 25)) & mask; + out[26] = ((w25 >> 6) | (w26 << 26)) & mask; + out[27] = ((w26 >> 5) | (w27 << 27)) & mask; + out[28] = ((w27 >> 4) | (w28 << 28)) & mask; + out[29] = ((w28 >> 3) | (w29 << 29)) & mask; + out[30] = ((w29 >> 2) | (w30 << 30)) & mask; + out[31] = w30 >> 1; + + return in; +} + +inline const uint8_t* unpack32_32(const uint8_t* in, uint32_t* out){ + for(int k = 0; k < 32; k += 1) { + auto w = util::SafeLoadAs(in); + out[k] = bit_util::FromLittleEndian(w); + in += 4; + } + return in; +} + +inline const uint8_t* unpack0_64(const uint8_t* in, uint64_t* out){ + std::memset(out, 0, 32 * 8); + return in; +} + +inline const uint8_t* unpack1_64(const uint8_t* in, uint64_t* out){ + constexpr uint64_t mask = ((uint64_t{1} << 1) - uint64_t{1}); + + auto w0 = static_cast(util::SafeLoadAs(in)); + w0 = bit_util::FromLittleEndian(w0); + in += 4; + out[0] = (w0) & mask; + out[1] = (w0 >> 1) & mask; + out[2] = (w0 >> 2) & mask; + out[3] = (w0 >> 3) & mask; + out[4] = (w0 >> 4) & mask; + out[5] = (w0 >> 5) & mask; + out[6] = (w0 >> 6) & mask; + out[7] = (w0 >> 7) & mask; + out[8] = (w0 >> 8) & mask; + out[9] = (w0 >> 9) & mask; + out[10] = (w0 >> 10) & mask; + out[11] = (w0 >> 11) & mask; + out[12] = (w0 >> 12) & mask; + out[13] = (w0 >> 13) & mask; + out[14] = (w0 >> 14) & mask; + out[15] = (w0 >> 15) & mask; + out[16] = (w0 >> 16) & mask; + out[17] = (w0 >> 17) & mask; + out[18] = (w0 >> 18) & mask; + out[19] = (w0 >> 19) & mask; + out[20] = (w0 >> 20) & mask; + out[21] = (w0 >> 21) & mask; + out[22] = (w0 >> 22) & mask; + out[23] = (w0 >> 23) & mask; + out[24] = (w0 >> 24) & mask; + out[25] = (w0 >> 25) & mask; + out[26] = (w0 >> 26) & mask; + out[27] = (w0 >> 27) & mask; + out[28] = (w0 >> 28) & mask; + out[29] = (w0 >> 29) & mask; + out[30] = (w0 >> 30) & mask; + out[31] = (w0 >> 31) & mask; + + return in; +} + +inline const uint8_t* unpack2_64(const uint8_t* in, uint64_t* out){ + constexpr uint64_t mask = ((uint64_t{1} << 2) - uint64_t{1}); + + const auto w0 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + out[0] = (w0) & mask; + out[1] = (w0 >> 2) & mask; + out[2] = (w0 >> 4) & mask; + out[3] = (w0 >> 6) & mask; + out[4] = (w0 >> 8) & mask; + out[5] = (w0 >> 10) & mask; + out[6] = (w0 >> 12) & mask; + out[7] = (w0 >> 14) & mask; + out[8] = (w0 >> 16) & mask; + out[9] = (w0 >> 18) & mask; + out[10] = (w0 >> 20) & mask; + out[11] = (w0 >> 22) & mask; + out[12] = (w0 >> 24) & mask; + out[13] = (w0 >> 26) & mask; + out[14] = (w0 >> 28) & mask; + out[15] = (w0 >> 30) & mask; + out[16] = (w0 >> 32) & mask; + out[17] = (w0 >> 34) & mask; + out[18] = (w0 >> 36) & mask; + out[19] = (w0 >> 38) & mask; + out[20] = (w0 >> 40) & mask; + out[21] = (w0 >> 42) & mask; + out[22] = (w0 >> 44) & mask; + out[23] = (w0 >> 46) & mask; + out[24] = (w0 >> 48) & mask; + out[25] = (w0 >> 50) & mask; + out[26] = (w0 >> 52) & mask; + out[27] = (w0 >> 54) & mask; + out[28] = (w0 >> 56) & mask; + out[29] = (w0 >> 58) & mask; + out[30] = (w0 >> 60) & mask; + out[31] = w0 >> 62; + + return in; +} + +inline const uint8_t* unpack3_64(const uint8_t* in, uint64_t* out){ + constexpr uint64_t mask = ((uint64_t{1} << 3) - uint64_t{1}); + + const auto w0 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + auto w1 = static_cast(util::SafeLoadAs(in)); + w1 = bit_util::FromLittleEndian(w1); + in += 4; + out[0] = (w0) & mask; + out[1] = (w0 >> 3) & mask; + out[2] = (w0 >> 6) & mask; + out[3] = (w0 >> 9) & mask; + out[4] = (w0 >> 12) & mask; + out[5] = (w0 >> 15) & mask; + out[6] = (w0 >> 18) & mask; + out[7] = (w0 >> 21) & mask; + out[8] = (w0 >> 24) & mask; + out[9] = (w0 >> 27) & mask; + out[10] = (w0 >> 30) & mask; + out[11] = (w0 >> 33) & mask; + out[12] = (w0 >> 36) & mask; + out[13] = (w0 >> 39) & mask; + out[14] = (w0 >> 42) & mask; + out[15] = (w0 >> 45) & mask; + out[16] = (w0 >> 48) & mask; + out[17] = (w0 >> 51) & mask; + out[18] = (w0 >> 54) & mask; + out[19] = (w0 >> 57) & mask; + out[20] = (w0 >> 60) & mask; + out[21] = ((w0 >> 63) | (w1 << 1)) & mask; + out[22] = (w1 >> 2) & mask; + out[23] = (w1 >> 5) & mask; + out[24] = (w1 >> 8) & mask; + out[25] = (w1 >> 11) & mask; + out[26] = (w1 >> 14) & mask; + out[27] = (w1 >> 17) & mask; + out[28] = (w1 >> 20) & mask; + out[29] = (w1 >> 23) & mask; + out[30] = (w1 >> 26) & mask; + out[31] = (w1 >> 29) & mask; + + return in; +} + +inline const uint8_t* unpack4_64(const uint8_t* in, uint64_t* out){ + constexpr uint64_t mask = ((uint64_t{1} << 4) - uint64_t{1}); + + const auto w0 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w1 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + out[0] = (w0) & mask; + out[1] = (w0 >> 4) & mask; + out[2] = (w0 >> 8) & mask; + out[3] = (w0 >> 12) & mask; + out[4] = (w0 >> 16) & mask; + out[5] = (w0 >> 20) & mask; + out[6] = (w0 >> 24) & mask; + out[7] = (w0 >> 28) & mask; + out[8] = (w0 >> 32) & mask; + out[9] = (w0 >> 36) & mask; + out[10] = (w0 >> 40) & mask; + out[11] = (w0 >> 44) & mask; + out[12] = (w0 >> 48) & mask; + out[13] = (w0 >> 52) & mask; + out[14] = (w0 >> 56) & mask; + out[15] = w0 >> 60; + out[16] = (w1) & mask; + out[17] = (w1 >> 4) & mask; + out[18] = (w1 >> 8) & mask; + out[19] = (w1 >> 12) & mask; + out[20] = (w1 >> 16) & mask; + out[21] = (w1 >> 20) & mask; + out[22] = (w1 >> 24) & mask; + out[23] = (w1 >> 28) & mask; + out[24] = (w1 >> 32) & mask; + out[25] = (w1 >> 36) & mask; + out[26] = (w1 >> 40) & mask; + out[27] = (w1 >> 44) & mask; + out[28] = (w1 >> 48) & mask; + out[29] = (w1 >> 52) & mask; + out[30] = (w1 >> 56) & mask; + out[31] = w1 >> 60; + + return in; +} + +inline const uint8_t* unpack5_64(const uint8_t* in, uint64_t* out){ + constexpr uint64_t mask = ((uint64_t{1} << 5) - uint64_t{1}); + + const auto w0 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w1 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + auto w2 = static_cast(util::SafeLoadAs(in)); + w2 = bit_util::FromLittleEndian(w2); + in += 4; + out[0] = (w0) & mask; + out[1] = (w0 >> 5) & mask; + out[2] = (w0 >> 10) & mask; + out[3] = (w0 >> 15) & mask; + out[4] = (w0 >> 20) & mask; + out[5] = (w0 >> 25) & mask; + out[6] = (w0 >> 30) & mask; + out[7] = (w0 >> 35) & mask; + out[8] = (w0 >> 40) & mask; + out[9] = (w0 >> 45) & mask; + out[10] = (w0 >> 50) & mask; + out[11] = (w0 >> 55) & mask; + out[12] = ((w0 >> 60) | (w1 << 4)) & mask; + out[13] = (w1 >> 1) & mask; + out[14] = (w1 >> 6) & mask; + out[15] = (w1 >> 11) & mask; + out[16] = (w1 >> 16) & mask; + out[17] = (w1 >> 21) & mask; + out[18] = (w1 >> 26) & mask; + out[19] = (w1 >> 31) & mask; + out[20] = (w1 >> 36) & mask; + out[21] = (w1 >> 41) & mask; + out[22] = (w1 >> 46) & mask; + out[23] = (w1 >> 51) & mask; + out[24] = (w1 >> 56) & mask; + out[25] = ((w1 >> 61) | (w2 << 3)) & mask; + out[26] = (w2 >> 2) & mask; + out[27] = (w2 >> 7) & mask; + out[28] = (w2 >> 12) & mask; + out[29] = (w2 >> 17) & mask; + out[30] = (w2 >> 22) & mask; + out[31] = (w2 >> 27) & mask; + + return in; +} + +inline const uint8_t* unpack6_64(const uint8_t* in, uint64_t* out){ + constexpr uint64_t mask = ((uint64_t{1} << 6) - uint64_t{1}); + + const auto w0 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w1 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w2 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + out[0] = (w0) & mask; + out[1] = (w0 >> 6) & mask; + out[2] = (w0 >> 12) & mask; + out[3] = (w0 >> 18) & mask; + out[4] = (w0 >> 24) & mask; + out[5] = (w0 >> 30) & mask; + out[6] = (w0 >> 36) & mask; + out[7] = (w0 >> 42) & mask; + out[8] = (w0 >> 48) & mask; + out[9] = (w0 >> 54) & mask; + out[10] = ((w0 >> 60) | (w1 << 4)) & mask; + out[11] = (w1 >> 2) & mask; + out[12] = (w1 >> 8) & mask; + out[13] = (w1 >> 14) & mask; + out[14] = (w1 >> 20) & mask; + out[15] = (w1 >> 26) & mask; + out[16] = (w1 >> 32) & mask; + out[17] = (w1 >> 38) & mask; + out[18] = (w1 >> 44) & mask; + out[19] = (w1 >> 50) & mask; + out[20] = (w1 >> 56) & mask; + out[21] = ((w1 >> 62) | (w2 << 2)) & mask; + out[22] = (w2 >> 4) & mask; + out[23] = (w2 >> 10) & mask; + out[24] = (w2 >> 16) & mask; + out[25] = (w2 >> 22) & mask; + out[26] = (w2 >> 28) & mask; + out[27] = (w2 >> 34) & mask; + out[28] = (w2 >> 40) & mask; + out[29] = (w2 >> 46) & mask; + out[30] = (w2 >> 52) & mask; + out[31] = w2 >> 58; + + return in; +} + +inline const uint8_t* unpack7_64(const uint8_t* in, uint64_t* out){ + constexpr uint64_t mask = ((uint64_t{1} << 7) - uint64_t{1}); + + const auto w0 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w1 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w2 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + auto w3 = static_cast(util::SafeLoadAs(in)); + w3 = bit_util::FromLittleEndian(w3); + in += 4; + out[0] = (w0) & mask; + out[1] = (w0 >> 7) & mask; + out[2] = (w0 >> 14) & mask; + out[3] = (w0 >> 21) & mask; + out[4] = (w0 >> 28) & mask; + out[5] = (w0 >> 35) & mask; + out[6] = (w0 >> 42) & mask; + out[7] = (w0 >> 49) & mask; + out[8] = (w0 >> 56) & mask; + out[9] = ((w0 >> 63) | (w1 << 1)) & mask; + out[10] = (w1 >> 6) & mask; + out[11] = (w1 >> 13) & mask; + out[12] = (w1 >> 20) & mask; + out[13] = (w1 >> 27) & mask; + out[14] = (w1 >> 34) & mask; + out[15] = (w1 >> 41) & mask; + out[16] = (w1 >> 48) & mask; + out[17] = (w1 >> 55) & mask; + out[18] = ((w1 >> 62) | (w2 << 2)) & mask; + out[19] = (w2 >> 5) & mask; + out[20] = (w2 >> 12) & mask; + out[21] = (w2 >> 19) & mask; + out[22] = (w2 >> 26) & mask; + out[23] = (w2 >> 33) & mask; + out[24] = (w2 >> 40) & mask; + out[25] = (w2 >> 47) & mask; + out[26] = (w2 >> 54) & mask; + out[27] = ((w2 >> 61) | (w3 << 3)) & mask; + out[28] = (w3 >> 4) & mask; + out[29] = (w3 >> 11) & mask; + out[30] = (w3 >> 18) & mask; + out[31] = (w3 >> 25) & mask; + + return in; +} + +inline const uint8_t* unpack8_64(const uint8_t* in, uint64_t* out){ + constexpr uint64_t mask = ((uint64_t{1} << 8) - uint64_t{1}); + + const auto w0 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w1 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w2 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w3 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + out[0] = (w0) & mask; + out[1] = (w0 >> 8) & mask; + out[2] = (w0 >> 16) & mask; + out[3] = (w0 >> 24) & mask; + out[4] = (w0 >> 32) & mask; + out[5] = (w0 >> 40) & mask; + out[6] = (w0 >> 48) & mask; + out[7] = w0 >> 56; + out[8] = (w1) & mask; + out[9] = (w1 >> 8) & mask; + out[10] = (w1 >> 16) & mask; + out[11] = (w1 >> 24) & mask; + out[12] = (w1 >> 32) & mask; + out[13] = (w1 >> 40) & mask; + out[14] = (w1 >> 48) & mask; + out[15] = w1 >> 56; + out[16] = (w2) & mask; + out[17] = (w2 >> 8) & mask; + out[18] = (w2 >> 16) & mask; + out[19] = (w2 >> 24) & mask; + out[20] = (w2 >> 32) & mask; + out[21] = (w2 >> 40) & mask; + out[22] = (w2 >> 48) & mask; + out[23] = w2 >> 56; + out[24] = (w3) & mask; + out[25] = (w3 >> 8) & mask; + out[26] = (w3 >> 16) & mask; + out[27] = (w3 >> 24) & mask; + out[28] = (w3 >> 32) & mask; + out[29] = (w3 >> 40) & mask; + out[30] = (w3 >> 48) & mask; + out[31] = w3 >> 56; + + return in; +} + +inline const uint8_t* unpack9_64(const uint8_t* in, uint64_t* out){ + constexpr uint64_t mask = ((uint64_t{1} << 9) - uint64_t{1}); + + const auto w0 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w1 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w2 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w3 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + auto w4 = static_cast(util::SafeLoadAs(in)); + w4 = bit_util::FromLittleEndian(w4); + in += 4; + out[0] = (w0) & mask; + out[1] = (w0 >> 9) & mask; + out[2] = (w0 >> 18) & mask; + out[3] = (w0 >> 27) & mask; + out[4] = (w0 >> 36) & mask; + out[5] = (w0 >> 45) & mask; + out[6] = (w0 >> 54) & mask; + out[7] = ((w0 >> 63) | (w1 << 1)) & mask; + out[8] = (w1 >> 8) & mask; + out[9] = (w1 >> 17) & mask; + out[10] = (w1 >> 26) & mask; + out[11] = (w1 >> 35) & mask; + out[12] = (w1 >> 44) & mask; + out[13] = (w1 >> 53) & mask; + out[14] = ((w1 >> 62) | (w2 << 2)) & mask; + out[15] = (w2 >> 7) & mask; + out[16] = (w2 >> 16) & mask; + out[17] = (w2 >> 25) & mask; + out[18] = (w2 >> 34) & mask; + out[19] = (w2 >> 43) & mask; + out[20] = (w2 >> 52) & mask; + out[21] = ((w2 >> 61) | (w3 << 3)) & mask; + out[22] = (w3 >> 6) & mask; + out[23] = (w3 >> 15) & mask; + out[24] = (w3 >> 24) & mask; + out[25] = (w3 >> 33) & mask; + out[26] = (w3 >> 42) & mask; + out[27] = (w3 >> 51) & mask; + out[28] = ((w3 >> 60) | (w4 << 4)) & mask; + out[29] = (w4 >> 5) & mask; + out[30] = (w4 >> 14) & mask; + out[31] = (w4 >> 23) & mask; + + return in; +} + +inline const uint8_t* unpack10_64(const uint8_t* in, uint64_t* out){ + constexpr uint64_t mask = ((uint64_t{1} << 10) - uint64_t{1}); + + const auto w0 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w1 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w2 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w3 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w4 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + out[0] = (w0) & mask; + out[1] = (w0 >> 10) & mask; + out[2] = (w0 >> 20) & mask; + out[3] = (w0 >> 30) & mask; + out[4] = (w0 >> 40) & mask; + out[5] = (w0 >> 50) & mask; + out[6] = ((w0 >> 60) | (w1 << 4)) & mask; + out[7] = (w1 >> 6) & mask; + out[8] = (w1 >> 16) & mask; + out[9] = (w1 >> 26) & mask; + out[10] = (w1 >> 36) & mask; + out[11] = (w1 >> 46) & mask; + out[12] = ((w1 >> 56) | (w2 << 8)) & mask; + out[13] = (w2 >> 2) & mask; + out[14] = (w2 >> 12) & mask; + out[15] = (w2 >> 22) & mask; + out[16] = (w2 >> 32) & mask; + out[17] = (w2 >> 42) & mask; + out[18] = (w2 >> 52) & mask; + out[19] = ((w2 >> 62) | (w3 << 2)) & mask; + out[20] = (w3 >> 8) & mask; + out[21] = (w3 >> 18) & mask; + out[22] = (w3 >> 28) & mask; + out[23] = (w3 >> 38) & mask; + out[24] = (w3 >> 48) & mask; + out[25] = ((w3 >> 58) | (w4 << 6)) & mask; + out[26] = (w4 >> 4) & mask; + out[27] = (w4 >> 14) & mask; + out[28] = (w4 >> 24) & mask; + out[29] = (w4 >> 34) & mask; + out[30] = (w4 >> 44) & mask; + out[31] = w4 >> 54; + + return in; +} + +inline const uint8_t* unpack11_64(const uint8_t* in, uint64_t* out){ + constexpr uint64_t mask = ((uint64_t{1} << 11) - uint64_t{1}); + + const auto w0 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w1 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w2 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w3 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w4 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + auto w5 = static_cast(util::SafeLoadAs(in)); + w5 = bit_util::FromLittleEndian(w5); + in += 4; + out[0] = (w0) & mask; + out[1] = (w0 >> 11) & mask; + out[2] = (w0 >> 22) & mask; + out[3] = (w0 >> 33) & mask; + out[4] = (w0 >> 44) & mask; + out[5] = ((w0 >> 55) | (w1 << 9)) & mask; + out[6] = (w1 >> 2) & mask; + out[7] = (w1 >> 13) & mask; + out[8] = (w1 >> 24) & mask; + out[9] = (w1 >> 35) & mask; + out[10] = (w1 >> 46) & mask; + out[11] = ((w1 >> 57) | (w2 << 7)) & mask; + out[12] = (w2 >> 4) & mask; + out[13] = (w2 >> 15) & mask; + out[14] = (w2 >> 26) & mask; + out[15] = (w2 >> 37) & mask; + out[16] = (w2 >> 48) & mask; + out[17] = ((w2 >> 59) | (w3 << 5)) & mask; + out[18] = (w3 >> 6) & mask; + out[19] = (w3 >> 17) & mask; + out[20] = (w3 >> 28) & mask; + out[21] = (w3 >> 39) & mask; + out[22] = (w3 >> 50) & mask; + out[23] = ((w3 >> 61) | (w4 << 3)) & mask; + out[24] = (w4 >> 8) & mask; + out[25] = (w4 >> 19) & mask; + out[26] = (w4 >> 30) & mask; + out[27] = (w4 >> 41) & mask; + out[28] = (w4 >> 52) & mask; + out[29] = ((w4 >> 63) | (w5 << 1)) & mask; + out[30] = (w5 >> 10) & mask; + out[31] = (w5 >> 21) & mask; + + return in; +} + +inline const uint8_t* unpack12_64(const uint8_t* in, uint64_t* out){ + constexpr uint64_t mask = ((uint64_t{1} << 12) - uint64_t{1}); + + const auto w0 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w1 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w2 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w3 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w4 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w5 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + out[0] = (w0) & mask; + out[1] = (w0 >> 12) & mask; + out[2] = (w0 >> 24) & mask; + out[3] = (w0 >> 36) & mask; + out[4] = (w0 >> 48) & mask; + out[5] = ((w0 >> 60) | (w1 << 4)) & mask; + out[6] = (w1 >> 8) & mask; + out[7] = (w1 >> 20) & mask; + out[8] = (w1 >> 32) & mask; + out[9] = (w1 >> 44) & mask; + out[10] = ((w1 >> 56) | (w2 << 8)) & mask; + out[11] = (w2 >> 4) & mask; + out[12] = (w2 >> 16) & mask; + out[13] = (w2 >> 28) & mask; + out[14] = (w2 >> 40) & mask; + out[15] = w2 >> 52; + out[16] = (w3) & mask; + out[17] = (w3 >> 12) & mask; + out[18] = (w3 >> 24) & mask; + out[19] = (w3 >> 36) & mask; + out[20] = (w3 >> 48) & mask; + out[21] = ((w3 >> 60) | (w4 << 4)) & mask; + out[22] = (w4 >> 8) & mask; + out[23] = (w4 >> 20) & mask; + out[24] = (w4 >> 32) & mask; + out[25] = (w4 >> 44) & mask; + out[26] = ((w4 >> 56) | (w5 << 8)) & mask; + out[27] = (w5 >> 4) & mask; + out[28] = (w5 >> 16) & mask; + out[29] = (w5 >> 28) & mask; + out[30] = (w5 >> 40) & mask; + out[31] = w5 >> 52; + + return in; +} + +inline const uint8_t* unpack13_64(const uint8_t* in, uint64_t* out){ + constexpr uint64_t mask = ((uint64_t{1} << 13) - uint64_t{1}); + + const auto w0 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w1 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w2 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w3 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w4 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w5 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + auto w6 = static_cast(util::SafeLoadAs(in)); + w6 = bit_util::FromLittleEndian(w6); + in += 4; + out[0] = (w0) & mask; + out[1] = (w0 >> 13) & mask; + out[2] = (w0 >> 26) & mask; + out[3] = (w0 >> 39) & mask; + out[4] = ((w0 >> 52) | (w1 << 12)) & mask; + out[5] = (w1 >> 1) & mask; + out[6] = (w1 >> 14) & mask; + out[7] = (w1 >> 27) & mask; + out[8] = (w1 >> 40) & mask; + out[9] = ((w1 >> 53) | (w2 << 11)) & mask; + out[10] = (w2 >> 2) & mask; + out[11] = (w2 >> 15) & mask; + out[12] = (w2 >> 28) & mask; + out[13] = (w2 >> 41) & mask; + out[14] = ((w2 >> 54) | (w3 << 10)) & mask; + out[15] = (w3 >> 3) & mask; + out[16] = (w3 >> 16) & mask; + out[17] = (w3 >> 29) & mask; + out[18] = (w3 >> 42) & mask; + out[19] = ((w3 >> 55) | (w4 << 9)) & mask; + out[20] = (w4 >> 4) & mask; + out[21] = (w4 >> 17) & mask; + out[22] = (w4 >> 30) & mask; + out[23] = (w4 >> 43) & mask; + out[24] = ((w4 >> 56) | (w5 << 8)) & mask; + out[25] = (w5 >> 5) & mask; + out[26] = (w5 >> 18) & mask; + out[27] = (w5 >> 31) & mask; + out[28] = (w5 >> 44) & mask; + out[29] = ((w5 >> 57) | (w6 << 7)) & mask; + out[30] = (w6 >> 6) & mask; + out[31] = (w6 >> 19) & mask; + + return in; +} + +inline const uint8_t* unpack14_64(const uint8_t* in, uint64_t* out){ + constexpr uint64_t mask = ((uint64_t{1} << 14) - uint64_t{1}); + + const auto w0 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w1 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w2 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w3 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w4 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w5 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w6 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + out[0] = (w0) & mask; + out[1] = (w0 >> 14) & mask; + out[2] = (w0 >> 28) & mask; + out[3] = (w0 >> 42) & mask; + out[4] = ((w0 >> 56) | (w1 << 8)) & mask; + out[5] = (w1 >> 6) & mask; + out[6] = (w1 >> 20) & mask; + out[7] = (w1 >> 34) & mask; + out[8] = (w1 >> 48) & mask; + out[9] = ((w1 >> 62) | (w2 << 2)) & mask; + out[10] = (w2 >> 12) & mask; + out[11] = (w2 >> 26) & mask; + out[12] = (w2 >> 40) & mask; + out[13] = ((w2 >> 54) | (w3 << 10)) & mask; + out[14] = (w3 >> 4) & mask; + out[15] = (w3 >> 18) & mask; + out[16] = (w3 >> 32) & mask; + out[17] = (w3 >> 46) & mask; + out[18] = ((w3 >> 60) | (w4 << 4)) & mask; + out[19] = (w4 >> 10) & mask; + out[20] = (w4 >> 24) & mask; + out[21] = (w4 >> 38) & mask; + out[22] = ((w4 >> 52) | (w5 << 12)) & mask; + out[23] = (w5 >> 2) & mask; + out[24] = (w5 >> 16) & mask; + out[25] = (w5 >> 30) & mask; + out[26] = (w5 >> 44) & mask; + out[27] = ((w5 >> 58) | (w6 << 6)) & mask; + out[28] = (w6 >> 8) & mask; + out[29] = (w6 >> 22) & mask; + out[30] = (w6 >> 36) & mask; + out[31] = w6 >> 50; + + return in; +} + +inline const uint8_t* unpack15_64(const uint8_t* in, uint64_t* out){ + constexpr uint64_t mask = ((uint64_t{1} << 15) - uint64_t{1}); + + const auto w0 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w1 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w2 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w3 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w4 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w5 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w6 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + auto w7 = static_cast(util::SafeLoadAs(in)); + w7 = bit_util::FromLittleEndian(w7); + in += 4; + out[0] = (w0) & mask; + out[1] = (w0 >> 15) & mask; + out[2] = (w0 >> 30) & mask; + out[3] = (w0 >> 45) & mask; + out[4] = ((w0 >> 60) | (w1 << 4)) & mask; + out[5] = (w1 >> 11) & mask; + out[6] = (w1 >> 26) & mask; + out[7] = (w1 >> 41) & mask; + out[8] = ((w1 >> 56) | (w2 << 8)) & mask; + out[9] = (w2 >> 7) & mask; + out[10] = (w2 >> 22) & mask; + out[11] = (w2 >> 37) & mask; + out[12] = ((w2 >> 52) | (w3 << 12)) & mask; + out[13] = (w3 >> 3) & mask; + out[14] = (w3 >> 18) & mask; + out[15] = (w3 >> 33) & mask; + out[16] = (w3 >> 48) & mask; + out[17] = ((w3 >> 63) | (w4 << 1)) & mask; + out[18] = (w4 >> 14) & mask; + out[19] = (w4 >> 29) & mask; + out[20] = (w4 >> 44) & mask; + out[21] = ((w4 >> 59) | (w5 << 5)) & mask; + out[22] = (w5 >> 10) & mask; + out[23] = (w5 >> 25) & mask; + out[24] = (w5 >> 40) & mask; + out[25] = ((w5 >> 55) | (w6 << 9)) & mask; + out[26] = (w6 >> 6) & mask; + out[27] = (w6 >> 21) & mask; + out[28] = (w6 >> 36) & mask; + out[29] = ((w6 >> 51) | (w7 << 13)) & mask; + out[30] = (w7 >> 2) & mask; + out[31] = (w7 >> 17) & mask; + + return in; +} + +inline const uint8_t* unpack16_64(const uint8_t* in, uint64_t* out){ + constexpr uint64_t mask = ((uint64_t{1} << 16) - uint64_t{1}); + + const auto w0 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w1 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w2 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w3 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w4 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w5 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w6 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w7 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + out[0] = (w0) & mask; + out[1] = (w0 >> 16) & mask; + out[2] = (w0 >> 32) & mask; + out[3] = w0 >> 48; + out[4] = (w1) & mask; + out[5] = (w1 >> 16) & mask; + out[6] = (w1 >> 32) & mask; + out[7] = w1 >> 48; + out[8] = (w2) & mask; + out[9] = (w2 >> 16) & mask; + out[10] = (w2 >> 32) & mask; + out[11] = w2 >> 48; + out[12] = (w3) & mask; + out[13] = (w3 >> 16) & mask; + out[14] = (w3 >> 32) & mask; + out[15] = w3 >> 48; + out[16] = (w4) & mask; + out[17] = (w4 >> 16) & mask; + out[18] = (w4 >> 32) & mask; + out[19] = w4 >> 48; + out[20] = (w5) & mask; + out[21] = (w5 >> 16) & mask; + out[22] = (w5 >> 32) & mask; + out[23] = w5 >> 48; + out[24] = (w6) & mask; + out[25] = (w6 >> 16) & mask; + out[26] = (w6 >> 32) & mask; + out[27] = w6 >> 48; + out[28] = (w7) & mask; + out[29] = (w7 >> 16) & mask; + out[30] = (w7 >> 32) & mask; + out[31] = w7 >> 48; + + return in; +} + +inline const uint8_t* unpack17_64(const uint8_t* in, uint64_t* out){ + constexpr uint64_t mask = ((uint64_t{1} << 17) - uint64_t{1}); + + const auto w0 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w1 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w2 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w3 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w4 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w5 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w6 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w7 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + auto w8 = static_cast(util::SafeLoadAs(in)); + w8 = bit_util::FromLittleEndian(w8); + in += 4; + out[0] = (w0) & mask; + out[1] = (w0 >> 17) & mask; + out[2] = (w0 >> 34) & mask; + out[3] = ((w0 >> 51) | (w1 << 13)) & mask; + out[4] = (w1 >> 4) & mask; + out[5] = (w1 >> 21) & mask; + out[6] = (w1 >> 38) & mask; + out[7] = ((w1 >> 55) | (w2 << 9)) & mask; + out[8] = (w2 >> 8) & mask; + out[9] = (w2 >> 25) & mask; + out[10] = (w2 >> 42) & mask; + out[11] = ((w2 >> 59) | (w3 << 5)) & mask; + out[12] = (w3 >> 12) & mask; + out[13] = (w3 >> 29) & mask; + out[14] = (w3 >> 46) & mask; + out[15] = ((w3 >> 63) | (w4 << 1)) & mask; + out[16] = (w4 >> 16) & mask; + out[17] = (w4 >> 33) & mask; + out[18] = ((w4 >> 50) | (w5 << 14)) & mask; + out[19] = (w5 >> 3) & mask; + out[20] = (w5 >> 20) & mask; + out[21] = (w5 >> 37) & mask; + out[22] = ((w5 >> 54) | (w6 << 10)) & mask; + out[23] = (w6 >> 7) & mask; + out[24] = (w6 >> 24) & mask; + out[25] = (w6 >> 41) & mask; + out[26] = ((w6 >> 58) | (w7 << 6)) & mask; + out[27] = (w7 >> 11) & mask; + out[28] = (w7 >> 28) & mask; + out[29] = (w7 >> 45) & mask; + out[30] = ((w7 >> 62) | (w8 << 2)) & mask; + out[31] = (w8 >> 15) & mask; + + return in; +} + +inline const uint8_t* unpack18_64(const uint8_t* in, uint64_t* out){ + constexpr uint64_t mask = ((uint64_t{1} << 18) - uint64_t{1}); + + const auto w0 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w1 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w2 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w3 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w4 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w5 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w6 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w7 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w8 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + out[0] = (w0) & mask; + out[1] = (w0 >> 18) & mask; + out[2] = (w0 >> 36) & mask; + out[3] = ((w0 >> 54) | (w1 << 10)) & mask; + out[4] = (w1 >> 8) & mask; + out[5] = (w1 >> 26) & mask; + out[6] = (w1 >> 44) & mask; + out[7] = ((w1 >> 62) | (w2 << 2)) & mask; + out[8] = (w2 >> 16) & mask; + out[9] = (w2 >> 34) & mask; + out[10] = ((w2 >> 52) | (w3 << 12)) & mask; + out[11] = (w3 >> 6) & mask; + out[12] = (w3 >> 24) & mask; + out[13] = (w3 >> 42) & mask; + out[14] = ((w3 >> 60) | (w4 << 4)) & mask; + out[15] = (w4 >> 14) & mask; + out[16] = (w4 >> 32) & mask; + out[17] = ((w4 >> 50) | (w5 << 14)) & mask; + out[18] = (w5 >> 4) & mask; + out[19] = (w5 >> 22) & mask; + out[20] = (w5 >> 40) & mask; + out[21] = ((w5 >> 58) | (w6 << 6)) & mask; + out[22] = (w6 >> 12) & mask; + out[23] = (w6 >> 30) & mask; + out[24] = ((w6 >> 48) | (w7 << 16)) & mask; + out[25] = (w7 >> 2) & mask; + out[26] = (w7 >> 20) & mask; + out[27] = (w7 >> 38) & mask; + out[28] = ((w7 >> 56) | (w8 << 8)) & mask; + out[29] = (w8 >> 10) & mask; + out[30] = (w8 >> 28) & mask; + out[31] = w8 >> 46; + + return in; +} + +inline const uint8_t* unpack19_64(const uint8_t* in, uint64_t* out){ + constexpr uint64_t mask = ((uint64_t{1} << 19) - uint64_t{1}); + + const auto w0 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w1 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w2 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w3 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w4 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w5 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w6 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w7 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w8 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + auto w9 = static_cast(util::SafeLoadAs(in)); + w9 = bit_util::FromLittleEndian(w9); + in += 4; + out[0] = (w0) & mask; + out[1] = (w0 >> 19) & mask; + out[2] = (w0 >> 38) & mask; + out[3] = ((w0 >> 57) | (w1 << 7)) & mask; + out[4] = (w1 >> 12) & mask; + out[5] = (w1 >> 31) & mask; + out[6] = ((w1 >> 50) | (w2 << 14)) & mask; + out[7] = (w2 >> 5) & mask; + out[8] = (w2 >> 24) & mask; + out[9] = (w2 >> 43) & mask; + out[10] = ((w2 >> 62) | (w3 << 2)) & mask; + out[11] = (w3 >> 17) & mask; + out[12] = (w3 >> 36) & mask; + out[13] = ((w3 >> 55) | (w4 << 9)) & mask; + out[14] = (w4 >> 10) & mask; + out[15] = (w4 >> 29) & mask; + out[16] = ((w4 >> 48) | (w5 << 16)) & mask; + out[17] = (w5 >> 3) & mask; + out[18] = (w5 >> 22) & mask; + out[19] = (w5 >> 41) & mask; + out[20] = ((w5 >> 60) | (w6 << 4)) & mask; + out[21] = (w6 >> 15) & mask; + out[22] = (w6 >> 34) & mask; + out[23] = ((w6 >> 53) | (w7 << 11)) & mask; + out[24] = (w7 >> 8) & mask; + out[25] = (w7 >> 27) & mask; + out[26] = ((w7 >> 46) | (w8 << 18)) & mask; + out[27] = (w8 >> 1) & mask; + out[28] = (w8 >> 20) & mask; + out[29] = (w8 >> 39) & mask; + out[30] = ((w8 >> 58) | (w9 << 6)) & mask; + out[31] = (w9 >> 13) & mask; + + return in; +} + +inline const uint8_t* unpack20_64(const uint8_t* in, uint64_t* out){ + constexpr uint64_t mask = ((uint64_t{1} << 20) - uint64_t{1}); + + const auto w0 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w1 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w2 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w3 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w4 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w5 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w6 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w7 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w8 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w9 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + out[0] = (w0) & mask; + out[1] = (w0 >> 20) & mask; + out[2] = (w0 >> 40) & mask; + out[3] = ((w0 >> 60) | (w1 << 4)) & mask; + out[4] = (w1 >> 16) & mask; + out[5] = (w1 >> 36) & mask; + out[6] = ((w1 >> 56) | (w2 << 8)) & mask; + out[7] = (w2 >> 12) & mask; + out[8] = (w2 >> 32) & mask; + out[9] = ((w2 >> 52) | (w3 << 12)) & mask; + out[10] = (w3 >> 8) & mask; + out[11] = (w3 >> 28) & mask; + out[12] = ((w3 >> 48) | (w4 << 16)) & mask; + out[13] = (w4 >> 4) & mask; + out[14] = (w4 >> 24) & mask; + out[15] = w4 >> 44; + out[16] = (w5) & mask; + out[17] = (w5 >> 20) & mask; + out[18] = (w5 >> 40) & mask; + out[19] = ((w5 >> 60) | (w6 << 4)) & mask; + out[20] = (w6 >> 16) & mask; + out[21] = (w6 >> 36) & mask; + out[22] = ((w6 >> 56) | (w7 << 8)) & mask; + out[23] = (w7 >> 12) & mask; + out[24] = (w7 >> 32) & mask; + out[25] = ((w7 >> 52) | (w8 << 12)) & mask; + out[26] = (w8 >> 8) & mask; + out[27] = (w8 >> 28) & mask; + out[28] = ((w8 >> 48) | (w9 << 16)) & mask; + out[29] = (w9 >> 4) & mask; + out[30] = (w9 >> 24) & mask; + out[31] = w9 >> 44; + + return in; +} + +inline const uint8_t* unpack21_64(const uint8_t* in, uint64_t* out){ + constexpr uint64_t mask = ((uint64_t{1} << 21) - uint64_t{1}); + + const auto w0 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w1 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w2 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w3 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w4 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w5 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w6 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w7 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w8 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w9 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + auto w10 = static_cast(util::SafeLoadAs(in)); + w10 = bit_util::FromLittleEndian(w10); + in += 4; + out[0] = (w0) & mask; + out[1] = (w0 >> 21) & mask; + out[2] = (w0 >> 42) & mask; + out[3] = ((w0 >> 63) | (w1 << 1)) & mask; + out[4] = (w1 >> 20) & mask; + out[5] = (w1 >> 41) & mask; + out[6] = ((w1 >> 62) | (w2 << 2)) & mask; + out[7] = (w2 >> 19) & mask; + out[8] = (w2 >> 40) & mask; + out[9] = ((w2 >> 61) | (w3 << 3)) & mask; + out[10] = (w3 >> 18) & mask; + out[11] = (w3 >> 39) & mask; + out[12] = ((w3 >> 60) | (w4 << 4)) & mask; + out[13] = (w4 >> 17) & mask; + out[14] = (w4 >> 38) & mask; + out[15] = ((w4 >> 59) | (w5 << 5)) & mask; + out[16] = (w5 >> 16) & mask; + out[17] = (w5 >> 37) & mask; + out[18] = ((w5 >> 58) | (w6 << 6)) & mask; + out[19] = (w6 >> 15) & mask; + out[20] = (w6 >> 36) & mask; + out[21] = ((w6 >> 57) | (w7 << 7)) & mask; + out[22] = (w7 >> 14) & mask; + out[23] = (w7 >> 35) & mask; + out[24] = ((w7 >> 56) | (w8 << 8)) & mask; + out[25] = (w8 >> 13) & mask; + out[26] = (w8 >> 34) & mask; + out[27] = ((w8 >> 55) | (w9 << 9)) & mask; + out[28] = (w9 >> 12) & mask; + out[29] = (w9 >> 33) & mask; + out[30] = ((w9 >> 54) | (w10 << 10)) & mask; + out[31] = (w10 >> 11) & mask; + + return in; +} + +inline const uint8_t* unpack22_64(const uint8_t* in, uint64_t* out){ + constexpr uint64_t mask = ((uint64_t{1} << 22) - uint64_t{1}); + + const auto w0 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w1 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w2 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w3 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w4 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w5 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w6 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w7 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w8 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w9 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w10 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + out[0] = (w0) & mask; + out[1] = (w0 >> 22) & mask; + out[2] = ((w0 >> 44) | (w1 << 20)) & mask; + out[3] = (w1 >> 2) & mask; + out[4] = (w1 >> 24) & mask; + out[5] = ((w1 >> 46) | (w2 << 18)) & mask; + out[6] = (w2 >> 4) & mask; + out[7] = (w2 >> 26) & mask; + out[8] = ((w2 >> 48) | (w3 << 16)) & mask; + out[9] = (w3 >> 6) & mask; + out[10] = (w3 >> 28) & mask; + out[11] = ((w3 >> 50) | (w4 << 14)) & mask; + out[12] = (w4 >> 8) & mask; + out[13] = (w4 >> 30) & mask; + out[14] = ((w4 >> 52) | (w5 << 12)) & mask; + out[15] = (w5 >> 10) & mask; + out[16] = (w5 >> 32) & mask; + out[17] = ((w5 >> 54) | (w6 << 10)) & mask; + out[18] = (w6 >> 12) & mask; + out[19] = (w6 >> 34) & mask; + out[20] = ((w6 >> 56) | (w7 << 8)) & mask; + out[21] = (w7 >> 14) & mask; + out[22] = (w7 >> 36) & mask; + out[23] = ((w7 >> 58) | (w8 << 6)) & mask; + out[24] = (w8 >> 16) & mask; + out[25] = (w8 >> 38) & mask; + out[26] = ((w8 >> 60) | (w9 << 4)) & mask; + out[27] = (w9 >> 18) & mask; + out[28] = (w9 >> 40) & mask; + out[29] = ((w9 >> 62) | (w10 << 2)) & mask; + out[30] = (w10 >> 20) & mask; + out[31] = w10 >> 42; + + return in; +} + +inline const uint8_t* unpack23_64(const uint8_t* in, uint64_t* out){ + constexpr uint64_t mask = ((uint64_t{1} << 23) - uint64_t{1}); + + const auto w0 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w1 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w2 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w3 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w4 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w5 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w6 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w7 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w8 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w9 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w10 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + auto w11 = static_cast(util::SafeLoadAs(in)); + w11 = bit_util::FromLittleEndian(w11); + in += 4; + out[0] = (w0) & mask; + out[1] = (w0 >> 23) & mask; + out[2] = ((w0 >> 46) | (w1 << 18)) & mask; + out[3] = (w1 >> 5) & mask; + out[4] = (w1 >> 28) & mask; + out[5] = ((w1 >> 51) | (w2 << 13)) & mask; + out[6] = (w2 >> 10) & mask; + out[7] = (w2 >> 33) & mask; + out[8] = ((w2 >> 56) | (w3 << 8)) & mask; + out[9] = (w3 >> 15) & mask; + out[10] = (w3 >> 38) & mask; + out[11] = ((w3 >> 61) | (w4 << 3)) & mask; + out[12] = (w4 >> 20) & mask; + out[13] = ((w4 >> 43) | (w5 << 21)) & mask; + out[14] = (w5 >> 2) & mask; + out[15] = (w5 >> 25) & mask; + out[16] = ((w5 >> 48) | (w6 << 16)) & mask; + out[17] = (w6 >> 7) & mask; + out[18] = (w6 >> 30) & mask; + out[19] = ((w6 >> 53) | (w7 << 11)) & mask; + out[20] = (w7 >> 12) & mask; + out[21] = (w7 >> 35) & mask; + out[22] = ((w7 >> 58) | (w8 << 6)) & mask; + out[23] = (w8 >> 17) & mask; + out[24] = (w8 >> 40) & mask; + out[25] = ((w8 >> 63) | (w9 << 1)) & mask; + out[26] = (w9 >> 22) & mask; + out[27] = ((w9 >> 45) | (w10 << 19)) & mask; + out[28] = (w10 >> 4) & mask; + out[29] = (w10 >> 27) & mask; + out[30] = ((w10 >> 50) | (w11 << 14)) & mask; + out[31] = (w11 >> 9) & mask; + + return in; +} + +inline const uint8_t* unpack24_64(const uint8_t* in, uint64_t* out){ + constexpr uint64_t mask = ((uint64_t{1} << 24) - uint64_t{1}); + + const auto w0 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w1 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w2 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w3 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w4 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w5 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w6 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w7 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w8 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w9 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w10 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w11 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + out[0] = (w0) & mask; + out[1] = (w0 >> 24) & mask; + out[2] = ((w0 >> 48) | (w1 << 16)) & mask; + out[3] = (w1 >> 8) & mask; + out[4] = (w1 >> 32) & mask; + out[5] = ((w1 >> 56) | (w2 << 8)) & mask; + out[6] = (w2 >> 16) & mask; + out[7] = w2 >> 40; + out[8] = (w3) & mask; + out[9] = (w3 >> 24) & mask; + out[10] = ((w3 >> 48) | (w4 << 16)) & mask; + out[11] = (w4 >> 8) & mask; + out[12] = (w4 >> 32) & mask; + out[13] = ((w4 >> 56) | (w5 << 8)) & mask; + out[14] = (w5 >> 16) & mask; + out[15] = w5 >> 40; + out[16] = (w6) & mask; + out[17] = (w6 >> 24) & mask; + out[18] = ((w6 >> 48) | (w7 << 16)) & mask; + out[19] = (w7 >> 8) & mask; + out[20] = (w7 >> 32) & mask; + out[21] = ((w7 >> 56) | (w8 << 8)) & mask; + out[22] = (w8 >> 16) & mask; + out[23] = w8 >> 40; + out[24] = (w9) & mask; + out[25] = (w9 >> 24) & mask; + out[26] = ((w9 >> 48) | (w10 << 16)) & mask; + out[27] = (w10 >> 8) & mask; + out[28] = (w10 >> 32) & mask; + out[29] = ((w10 >> 56) | (w11 << 8)) & mask; + out[30] = (w11 >> 16) & mask; + out[31] = w11 >> 40; + + return in; +} + +inline const uint8_t* unpack25_64(const uint8_t* in, uint64_t* out){ + constexpr uint64_t mask = ((uint64_t{1} << 25) - uint64_t{1}); + + const auto w0 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w1 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w2 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w3 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w4 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w5 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w6 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w7 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w8 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w9 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w10 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w11 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + auto w12 = static_cast(util::SafeLoadAs(in)); + w12 = bit_util::FromLittleEndian(w12); + in += 4; + out[0] = (w0) & mask; + out[1] = (w0 >> 25) & mask; + out[2] = ((w0 >> 50) | (w1 << 14)) & mask; + out[3] = (w1 >> 11) & mask; + out[4] = (w1 >> 36) & mask; + out[5] = ((w1 >> 61) | (w2 << 3)) & mask; + out[6] = (w2 >> 22) & mask; + out[7] = ((w2 >> 47) | (w3 << 17)) & mask; + out[8] = (w3 >> 8) & mask; + out[9] = (w3 >> 33) & mask; + out[10] = ((w3 >> 58) | (w4 << 6)) & mask; + out[11] = (w4 >> 19) & mask; + out[12] = ((w4 >> 44) | (w5 << 20)) & mask; + out[13] = (w5 >> 5) & mask; + out[14] = (w5 >> 30) & mask; + out[15] = ((w5 >> 55) | (w6 << 9)) & mask; + out[16] = (w6 >> 16) & mask; + out[17] = ((w6 >> 41) | (w7 << 23)) & mask; + out[18] = (w7 >> 2) & mask; + out[19] = (w7 >> 27) & mask; + out[20] = ((w7 >> 52) | (w8 << 12)) & mask; + out[21] = (w8 >> 13) & mask; + out[22] = (w8 >> 38) & mask; + out[23] = ((w8 >> 63) | (w9 << 1)) & mask; + out[24] = (w9 >> 24) & mask; + out[25] = ((w9 >> 49) | (w10 << 15)) & mask; + out[26] = (w10 >> 10) & mask; + out[27] = (w10 >> 35) & mask; + out[28] = ((w10 >> 60) | (w11 << 4)) & mask; + out[29] = (w11 >> 21) & mask; + out[30] = ((w11 >> 46) | (w12 << 18)) & mask; + out[31] = (w12 >> 7) & mask; + + return in; +} + +inline const uint8_t* unpack26_64(const uint8_t* in, uint64_t* out){ + constexpr uint64_t mask = ((uint64_t{1} << 26) - uint64_t{1}); + + const auto w0 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w1 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w2 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w3 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w4 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w5 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w6 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w7 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w8 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w9 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w10 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w11 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w12 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + out[0] = (w0) & mask; + out[1] = (w0 >> 26) & mask; + out[2] = ((w0 >> 52) | (w1 << 12)) & mask; + out[3] = (w1 >> 14) & mask; + out[4] = ((w1 >> 40) | (w2 << 24)) & mask; + out[5] = (w2 >> 2) & mask; + out[6] = (w2 >> 28) & mask; + out[7] = ((w2 >> 54) | (w3 << 10)) & mask; + out[8] = (w3 >> 16) & mask; + out[9] = ((w3 >> 42) | (w4 << 22)) & mask; + out[10] = (w4 >> 4) & mask; + out[11] = (w4 >> 30) & mask; + out[12] = ((w4 >> 56) | (w5 << 8)) & mask; + out[13] = (w5 >> 18) & mask; + out[14] = ((w5 >> 44) | (w6 << 20)) & mask; + out[15] = (w6 >> 6) & mask; + out[16] = (w6 >> 32) & mask; + out[17] = ((w6 >> 58) | (w7 << 6)) & mask; + out[18] = (w7 >> 20) & mask; + out[19] = ((w7 >> 46) | (w8 << 18)) & mask; + out[20] = (w8 >> 8) & mask; + out[21] = (w8 >> 34) & mask; + out[22] = ((w8 >> 60) | (w9 << 4)) & mask; + out[23] = (w9 >> 22) & mask; + out[24] = ((w9 >> 48) | (w10 << 16)) & mask; + out[25] = (w10 >> 10) & mask; + out[26] = (w10 >> 36) & mask; + out[27] = ((w10 >> 62) | (w11 << 2)) & mask; + out[28] = (w11 >> 24) & mask; + out[29] = ((w11 >> 50) | (w12 << 14)) & mask; + out[30] = (w12 >> 12) & mask; + out[31] = w12 >> 38; + + return in; +} + +inline const uint8_t* unpack27_64(const uint8_t* in, uint64_t* out){ + constexpr uint64_t mask = ((uint64_t{1} << 27) - uint64_t{1}); + + const auto w0 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w1 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w2 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w3 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w4 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w5 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w6 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w7 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w8 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w9 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w10 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w11 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w12 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + auto w13 = static_cast(util::SafeLoadAs(in)); + w13 = bit_util::FromLittleEndian(w13); + in += 4; + out[0] = (w0) & mask; + out[1] = (w0 >> 27) & mask; + out[2] = ((w0 >> 54) | (w1 << 10)) & mask; + out[3] = (w1 >> 17) & mask; + out[4] = ((w1 >> 44) | (w2 << 20)) & mask; + out[5] = (w2 >> 7) & mask; + out[6] = (w2 >> 34) & mask; + out[7] = ((w2 >> 61) | (w3 << 3)) & mask; + out[8] = (w3 >> 24) & mask; + out[9] = ((w3 >> 51) | (w4 << 13)) & mask; + out[10] = (w4 >> 14) & mask; + out[11] = ((w4 >> 41) | (w5 << 23)) & mask; + out[12] = (w5 >> 4) & mask; + out[13] = (w5 >> 31) & mask; + out[14] = ((w5 >> 58) | (w6 << 6)) & mask; + out[15] = (w6 >> 21) & mask; + out[16] = ((w6 >> 48) | (w7 << 16)) & mask; + out[17] = (w7 >> 11) & mask; + out[18] = ((w7 >> 38) | (w8 << 26)) & mask; + out[19] = (w8 >> 1) & mask; + out[20] = (w8 >> 28) & mask; + out[21] = ((w8 >> 55) | (w9 << 9)) & mask; + out[22] = (w9 >> 18) & mask; + out[23] = ((w9 >> 45) | (w10 << 19)) & mask; + out[24] = (w10 >> 8) & mask; + out[25] = (w10 >> 35) & mask; + out[26] = ((w10 >> 62) | (w11 << 2)) & mask; + out[27] = (w11 >> 25) & mask; + out[28] = ((w11 >> 52) | (w12 << 12)) & mask; + out[29] = (w12 >> 15) & mask; + out[30] = ((w12 >> 42) | (w13 << 22)) & mask; + out[31] = (w13 >> 5) & mask; + + return in; +} + +inline const uint8_t* unpack28_64(const uint8_t* in, uint64_t* out){ + constexpr uint64_t mask = ((uint64_t{1} << 28) - uint64_t{1}); + + const auto w0 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w1 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w2 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w3 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w4 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w5 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w6 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w7 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w8 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w9 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w10 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w11 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w12 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w13 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + out[0] = (w0) & mask; + out[1] = (w0 >> 28) & mask; + out[2] = ((w0 >> 56) | (w1 << 8)) & mask; + out[3] = (w1 >> 20) & mask; + out[4] = ((w1 >> 48) | (w2 << 16)) & mask; + out[5] = (w2 >> 12) & mask; + out[6] = ((w2 >> 40) | (w3 << 24)) & mask; + out[7] = (w3 >> 4) & mask; + out[8] = (w3 >> 32) & mask; + out[9] = ((w3 >> 60) | (w4 << 4)) & mask; + out[10] = (w4 >> 24) & mask; + out[11] = ((w4 >> 52) | (w5 << 12)) & mask; + out[12] = (w5 >> 16) & mask; + out[13] = ((w5 >> 44) | (w6 << 20)) & mask; + out[14] = (w6 >> 8) & mask; + out[15] = w6 >> 36; + out[16] = (w7) & mask; + out[17] = (w7 >> 28) & mask; + out[18] = ((w7 >> 56) | (w8 << 8)) & mask; + out[19] = (w8 >> 20) & mask; + out[20] = ((w8 >> 48) | (w9 << 16)) & mask; + out[21] = (w9 >> 12) & mask; + out[22] = ((w9 >> 40) | (w10 << 24)) & mask; + out[23] = (w10 >> 4) & mask; + out[24] = (w10 >> 32) & mask; + out[25] = ((w10 >> 60) | (w11 << 4)) & mask; + out[26] = (w11 >> 24) & mask; + out[27] = ((w11 >> 52) | (w12 << 12)) & mask; + out[28] = (w12 >> 16) & mask; + out[29] = ((w12 >> 44) | (w13 << 20)) & mask; + out[30] = (w13 >> 8) & mask; + out[31] = w13 >> 36; + + return in; +} + +inline const uint8_t* unpack29_64(const uint8_t* in, uint64_t* out){ + constexpr uint64_t mask = ((uint64_t{1} << 29) - uint64_t{1}); + + const auto w0 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w1 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w2 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w3 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w4 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w5 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w6 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w7 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w8 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w9 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w10 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w11 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w12 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w13 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + auto w14 = static_cast(util::SafeLoadAs(in)); + w14 = bit_util::FromLittleEndian(w14); + in += 4; + out[0] = (w0) & mask; + out[1] = (w0 >> 29) & mask; + out[2] = ((w0 >> 58) | (w1 << 6)) & mask; + out[3] = (w1 >> 23) & mask; + out[4] = ((w1 >> 52) | (w2 << 12)) & mask; + out[5] = (w2 >> 17) & mask; + out[6] = ((w2 >> 46) | (w3 << 18)) & mask; + out[7] = (w3 >> 11) & mask; + out[8] = ((w3 >> 40) | (w4 << 24)) & mask; + out[9] = (w4 >> 5) & mask; + out[10] = (w4 >> 34) & mask; + out[11] = ((w4 >> 63) | (w5 << 1)) & mask; + out[12] = (w5 >> 28) & mask; + out[13] = ((w5 >> 57) | (w6 << 7)) & mask; + out[14] = (w6 >> 22) & mask; + out[15] = ((w6 >> 51) | (w7 << 13)) & mask; + out[16] = (w7 >> 16) & mask; + out[17] = ((w7 >> 45) | (w8 << 19)) & mask; + out[18] = (w8 >> 10) & mask; + out[19] = ((w8 >> 39) | (w9 << 25)) & mask; + out[20] = (w9 >> 4) & mask; + out[21] = (w9 >> 33) & mask; + out[22] = ((w9 >> 62) | (w10 << 2)) & mask; + out[23] = (w10 >> 27) & mask; + out[24] = ((w10 >> 56) | (w11 << 8)) & mask; + out[25] = (w11 >> 21) & mask; + out[26] = ((w11 >> 50) | (w12 << 14)) & mask; + out[27] = (w12 >> 15) & mask; + out[28] = ((w12 >> 44) | (w13 << 20)) & mask; + out[29] = (w13 >> 9) & mask; + out[30] = ((w13 >> 38) | (w14 << 26)) & mask; + out[31] = (w14 >> 3) & mask; + + return in; +} + +inline const uint8_t* unpack30_64(const uint8_t* in, uint64_t* out){ + constexpr uint64_t mask = ((uint64_t{1} << 30) - uint64_t{1}); + + const auto w0 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w1 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w2 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w3 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w4 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w5 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w6 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w7 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w8 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w9 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w10 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w11 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w12 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w13 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w14 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + out[0] = (w0) & mask; + out[1] = (w0 >> 30) & mask; + out[2] = ((w0 >> 60) | (w1 << 4)) & mask; + out[3] = (w1 >> 26) & mask; + out[4] = ((w1 >> 56) | (w2 << 8)) & mask; + out[5] = (w2 >> 22) & mask; + out[6] = ((w2 >> 52) | (w3 << 12)) & mask; + out[7] = (w3 >> 18) & mask; + out[8] = ((w3 >> 48) | (w4 << 16)) & mask; + out[9] = (w4 >> 14) & mask; + out[10] = ((w4 >> 44) | (w5 << 20)) & mask; + out[11] = (w5 >> 10) & mask; + out[12] = ((w5 >> 40) | (w6 << 24)) & mask; + out[13] = (w6 >> 6) & mask; + out[14] = ((w6 >> 36) | (w7 << 28)) & mask; + out[15] = (w7 >> 2) & mask; + out[16] = (w7 >> 32) & mask; + out[17] = ((w7 >> 62) | (w8 << 2)) & mask; + out[18] = (w8 >> 28) & mask; + out[19] = ((w8 >> 58) | (w9 << 6)) & mask; + out[20] = (w9 >> 24) & mask; + out[21] = ((w9 >> 54) | (w10 << 10)) & mask; + out[22] = (w10 >> 20) & mask; + out[23] = ((w10 >> 50) | (w11 << 14)) & mask; + out[24] = (w11 >> 16) & mask; + out[25] = ((w11 >> 46) | (w12 << 18)) & mask; + out[26] = (w12 >> 12) & mask; + out[27] = ((w12 >> 42) | (w13 << 22)) & mask; + out[28] = (w13 >> 8) & mask; + out[29] = ((w13 >> 38) | (w14 << 26)) & mask; + out[30] = (w14 >> 4) & mask; + out[31] = w14 >> 34; + + return in; +} + +inline const uint8_t* unpack31_64(const uint8_t* in, uint64_t* out){ + constexpr uint64_t mask = ((uint64_t{1} << 31) - uint64_t{1}); + + const auto w0 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w1 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w2 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w3 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w4 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w5 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w6 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w7 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w8 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w9 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w10 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w11 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w12 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w13 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w14 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + auto w15 = static_cast(util::SafeLoadAs(in)); + w15 = bit_util::FromLittleEndian(w15); + in += 4; + out[0] = (w0) & mask; + out[1] = (w0 >> 31) & mask; + out[2] = ((w0 >> 62) | (w1 << 2)) & mask; + out[3] = (w1 >> 29) & mask; + out[4] = ((w1 >> 60) | (w2 << 4)) & mask; + out[5] = (w2 >> 27) & mask; + out[6] = ((w2 >> 58) | (w3 << 6)) & mask; + out[7] = (w3 >> 25) & mask; + out[8] = ((w3 >> 56) | (w4 << 8)) & mask; + out[9] = (w4 >> 23) & mask; + out[10] = ((w4 >> 54) | (w5 << 10)) & mask; + out[11] = (w5 >> 21) & mask; + out[12] = ((w5 >> 52) | (w6 << 12)) & mask; + out[13] = (w6 >> 19) & mask; + out[14] = ((w6 >> 50) | (w7 << 14)) & mask; + out[15] = (w7 >> 17) & mask; + out[16] = ((w7 >> 48) | (w8 << 16)) & mask; + out[17] = (w8 >> 15) & mask; + out[18] = ((w8 >> 46) | (w9 << 18)) & mask; + out[19] = (w9 >> 13) & mask; + out[20] = ((w9 >> 44) | (w10 << 20)) & mask; + out[21] = (w10 >> 11) & mask; + out[22] = ((w10 >> 42) | (w11 << 22)) & mask; + out[23] = (w11 >> 9) & mask; + out[24] = ((w11 >> 40) | (w12 << 24)) & mask; + out[25] = (w12 >> 7) & mask; + out[26] = ((w12 >> 38) | (w13 << 26)) & mask; + out[27] = (w13 >> 5) & mask; + out[28] = ((w13 >> 36) | (w14 << 28)) & mask; + out[29] = (w14 >> 3) & mask; + out[30] = ((w14 >> 34) | (w15 << 30)) & mask; + out[31] = (w15 >> 1) & mask; + + return in; +} + +inline const uint8_t* unpack32_64(const uint8_t* in, uint64_t* out){ + constexpr uint64_t mask = ((uint64_t{1} << 32) - uint64_t{1}); + + const auto w0 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w1 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w2 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w3 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w4 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w5 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w6 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w7 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w8 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w9 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w10 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w11 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w12 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w13 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w14 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w15 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + out[0] = (w0) & mask; + out[1] = w0 >> 32; + out[2] = (w1) & mask; + out[3] = w1 >> 32; + out[4] = (w2) & mask; + out[5] = w2 >> 32; + out[6] = (w3) & mask; + out[7] = w3 >> 32; + out[8] = (w4) & mask; + out[9] = w4 >> 32; + out[10] = (w5) & mask; + out[11] = w5 >> 32; + out[12] = (w6) & mask; + out[13] = w6 >> 32; + out[14] = (w7) & mask; + out[15] = w7 >> 32; + out[16] = (w8) & mask; + out[17] = w8 >> 32; + out[18] = (w9) & mask; + out[19] = w9 >> 32; + out[20] = (w10) & mask; + out[21] = w10 >> 32; + out[22] = (w11) & mask; + out[23] = w11 >> 32; + out[24] = (w12) & mask; + out[25] = w12 >> 32; + out[26] = (w13) & mask; + out[27] = w13 >> 32; + out[28] = (w14) & mask; + out[29] = w14 >> 32; + out[30] = (w15) & mask; + out[31] = w15 >> 32; + + return in; +} + +inline const uint8_t* unpack33_64(const uint8_t* in, uint64_t* out){ + constexpr uint64_t mask = ((uint64_t{1} << 33) - uint64_t{1}); + + const auto w0 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w1 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w2 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w3 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w4 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w5 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w6 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w7 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w8 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w9 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w10 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w11 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w12 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w13 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w14 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w15 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + auto w16 = static_cast(util::SafeLoadAs(in)); + w16 = bit_util::FromLittleEndian(w16); + in += 4; + out[0] = (w0) & mask; + out[1] = ((w0 >> 33) | (w1 << 31)) & mask; + out[2] = (w1 >> 2) & mask; + out[3] = ((w1 >> 35) | (w2 << 29)) & mask; + out[4] = (w2 >> 4) & mask; + out[5] = ((w2 >> 37) | (w3 << 27)) & mask; + out[6] = (w3 >> 6) & mask; + out[7] = ((w3 >> 39) | (w4 << 25)) & mask; + out[8] = (w4 >> 8) & mask; + out[9] = ((w4 >> 41) | (w5 << 23)) & mask; + out[10] = (w5 >> 10) & mask; + out[11] = ((w5 >> 43) | (w6 << 21)) & mask; + out[12] = (w6 >> 12) & mask; + out[13] = ((w6 >> 45) | (w7 << 19)) & mask; + out[14] = (w7 >> 14) & mask; + out[15] = ((w7 >> 47) | (w8 << 17)) & mask; + out[16] = (w8 >> 16) & mask; + out[17] = ((w8 >> 49) | (w9 << 15)) & mask; + out[18] = (w9 >> 18) & mask; + out[19] = ((w9 >> 51) | (w10 << 13)) & mask; + out[20] = (w10 >> 20) & mask; + out[21] = ((w10 >> 53) | (w11 << 11)) & mask; + out[22] = (w11 >> 22) & mask; + out[23] = ((w11 >> 55) | (w12 << 9)) & mask; + out[24] = (w12 >> 24) & mask; + out[25] = ((w12 >> 57) | (w13 << 7)) & mask; + out[26] = (w13 >> 26) & mask; + out[27] = ((w13 >> 59) | (w14 << 5)) & mask; + out[28] = (w14 >> 28) & mask; + out[29] = ((w14 >> 61) | (w15 << 3)) & mask; + out[30] = (w15 >> 30) & mask; + out[31] = ((w15 >> 63) | (w16 << 1)) & mask; + + return in; +} + +inline const uint8_t* unpack34_64(const uint8_t* in, uint64_t* out){ + constexpr uint64_t mask = ((uint64_t{1} << 34) - uint64_t{1}); + + const auto w0 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w1 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w2 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w3 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w4 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w5 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w6 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w7 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w8 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w9 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w10 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w11 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w12 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w13 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w14 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w15 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w16 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + out[0] = (w0) & mask; + out[1] = ((w0 >> 34) | (w1 << 30)) & mask; + out[2] = (w1 >> 4) & mask; + out[3] = ((w1 >> 38) | (w2 << 26)) & mask; + out[4] = (w2 >> 8) & mask; + out[5] = ((w2 >> 42) | (w3 << 22)) & mask; + out[6] = (w3 >> 12) & mask; + out[7] = ((w3 >> 46) | (w4 << 18)) & mask; + out[8] = (w4 >> 16) & mask; + out[9] = ((w4 >> 50) | (w5 << 14)) & mask; + out[10] = (w5 >> 20) & mask; + out[11] = ((w5 >> 54) | (w6 << 10)) & mask; + out[12] = (w6 >> 24) & mask; + out[13] = ((w6 >> 58) | (w7 << 6)) & mask; + out[14] = (w7 >> 28) & mask; + out[15] = ((w7 >> 62) | (w8 << 2)) & mask; + out[16] = ((w8 >> 32) | (w9 << 32)) & mask; + out[17] = (w9 >> 2) & mask; + out[18] = ((w9 >> 36) | (w10 << 28)) & mask; + out[19] = (w10 >> 6) & mask; + out[20] = ((w10 >> 40) | (w11 << 24)) & mask; + out[21] = (w11 >> 10) & mask; + out[22] = ((w11 >> 44) | (w12 << 20)) & mask; + out[23] = (w12 >> 14) & mask; + out[24] = ((w12 >> 48) | (w13 << 16)) & mask; + out[25] = (w13 >> 18) & mask; + out[26] = ((w13 >> 52) | (w14 << 12)) & mask; + out[27] = (w14 >> 22) & mask; + out[28] = ((w14 >> 56) | (w15 << 8)) & mask; + out[29] = (w15 >> 26) & mask; + out[30] = ((w15 >> 60) | (w16 << 4)) & mask; + out[31] = w16 >> 30; + + return in; +} + +inline const uint8_t* unpack35_64(const uint8_t* in, uint64_t* out){ + constexpr uint64_t mask = ((uint64_t{1} << 35) - uint64_t{1}); + + const auto w0 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w1 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w2 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w3 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w4 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w5 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w6 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w7 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w8 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w9 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w10 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w11 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w12 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w13 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w14 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w15 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w16 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + auto w17 = static_cast(util::SafeLoadAs(in)); + w17 = bit_util::FromLittleEndian(w17); + in += 4; + out[0] = (w0) & mask; + out[1] = ((w0 >> 35) | (w1 << 29)) & mask; + out[2] = (w1 >> 6) & mask; + out[3] = ((w1 >> 41) | (w2 << 23)) & mask; + out[4] = (w2 >> 12) & mask; + out[5] = ((w2 >> 47) | (w3 << 17)) & mask; + out[6] = (w3 >> 18) & mask; + out[7] = ((w3 >> 53) | (w4 << 11)) & mask; + out[8] = (w4 >> 24) & mask; + out[9] = ((w4 >> 59) | (w5 << 5)) & mask; + out[10] = ((w5 >> 30) | (w6 << 34)) & mask; + out[11] = (w6 >> 1) & mask; + out[12] = ((w6 >> 36) | (w7 << 28)) & mask; + out[13] = (w7 >> 7) & mask; + out[14] = ((w7 >> 42) | (w8 << 22)) & mask; + out[15] = (w8 >> 13) & mask; + out[16] = ((w8 >> 48) | (w9 << 16)) & mask; + out[17] = (w9 >> 19) & mask; + out[18] = ((w9 >> 54) | (w10 << 10)) & mask; + out[19] = (w10 >> 25) & mask; + out[20] = ((w10 >> 60) | (w11 << 4)) & mask; + out[21] = ((w11 >> 31) | (w12 << 33)) & mask; + out[22] = (w12 >> 2) & mask; + out[23] = ((w12 >> 37) | (w13 << 27)) & mask; + out[24] = (w13 >> 8) & mask; + out[25] = ((w13 >> 43) | (w14 << 21)) & mask; + out[26] = (w14 >> 14) & mask; + out[27] = ((w14 >> 49) | (w15 << 15)) & mask; + out[28] = (w15 >> 20) & mask; + out[29] = ((w15 >> 55) | (w16 << 9)) & mask; + out[30] = (w16 >> 26) & mask; + out[31] = ((w16 >> 61) | (w17 << 3)) & mask; + + return in; +} + +inline const uint8_t* unpack36_64(const uint8_t* in, uint64_t* out){ + constexpr uint64_t mask = ((uint64_t{1} << 36) - uint64_t{1}); + + const auto w0 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w1 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w2 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w3 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w4 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w5 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w6 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w7 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w8 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w9 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w10 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w11 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w12 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w13 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w14 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w15 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w16 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w17 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + out[0] = (w0) & mask; + out[1] = ((w0 >> 36) | (w1 << 28)) & mask; + out[2] = (w1 >> 8) & mask; + out[3] = ((w1 >> 44) | (w2 << 20)) & mask; + out[4] = (w2 >> 16) & mask; + out[5] = ((w2 >> 52) | (w3 << 12)) & mask; + out[6] = (w3 >> 24) & mask; + out[7] = ((w3 >> 60) | (w4 << 4)) & mask; + out[8] = ((w4 >> 32) | (w5 << 32)) & mask; + out[9] = (w5 >> 4) & mask; + out[10] = ((w5 >> 40) | (w6 << 24)) & mask; + out[11] = (w6 >> 12) & mask; + out[12] = ((w6 >> 48) | (w7 << 16)) & mask; + out[13] = (w7 >> 20) & mask; + out[14] = ((w7 >> 56) | (w8 << 8)) & mask; + out[15] = w8 >> 28; + out[16] = (w9) & mask; + out[17] = ((w9 >> 36) | (w10 << 28)) & mask; + out[18] = (w10 >> 8) & mask; + out[19] = ((w10 >> 44) | (w11 << 20)) & mask; + out[20] = (w11 >> 16) & mask; + out[21] = ((w11 >> 52) | (w12 << 12)) & mask; + out[22] = (w12 >> 24) & mask; + out[23] = ((w12 >> 60) | (w13 << 4)) & mask; + out[24] = ((w13 >> 32) | (w14 << 32)) & mask; + out[25] = (w14 >> 4) & mask; + out[26] = ((w14 >> 40) | (w15 << 24)) & mask; + out[27] = (w15 >> 12) & mask; + out[28] = ((w15 >> 48) | (w16 << 16)) & mask; + out[29] = (w16 >> 20) & mask; + out[30] = ((w16 >> 56) | (w17 << 8)) & mask; + out[31] = w17 >> 28; + + return in; +} + +inline const uint8_t* unpack37_64(const uint8_t* in, uint64_t* out){ + constexpr uint64_t mask = ((uint64_t{1} << 37) - uint64_t{1}); + + const auto w0 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w1 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w2 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w3 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w4 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w5 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w6 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w7 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w8 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w9 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w10 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w11 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w12 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w13 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w14 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w15 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w16 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w17 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + auto w18 = static_cast(util::SafeLoadAs(in)); + w18 = bit_util::FromLittleEndian(w18); + in += 4; + out[0] = (w0) & mask; + out[1] = ((w0 >> 37) | (w1 << 27)) & mask; + out[2] = (w1 >> 10) & mask; + out[3] = ((w1 >> 47) | (w2 << 17)) & mask; + out[4] = (w2 >> 20) & mask; + out[5] = ((w2 >> 57) | (w3 << 7)) & mask; + out[6] = ((w3 >> 30) | (w4 << 34)) & mask; + out[7] = (w4 >> 3) & mask; + out[8] = ((w4 >> 40) | (w5 << 24)) & mask; + out[9] = (w5 >> 13) & mask; + out[10] = ((w5 >> 50) | (w6 << 14)) & mask; + out[11] = (w6 >> 23) & mask; + out[12] = ((w6 >> 60) | (w7 << 4)) & mask; + out[13] = ((w7 >> 33) | (w8 << 31)) & mask; + out[14] = (w8 >> 6) & mask; + out[15] = ((w8 >> 43) | (w9 << 21)) & mask; + out[16] = (w9 >> 16) & mask; + out[17] = ((w9 >> 53) | (w10 << 11)) & mask; + out[18] = (w10 >> 26) & mask; + out[19] = ((w10 >> 63) | (w11 << 1)) & mask; + out[20] = ((w11 >> 36) | (w12 << 28)) & mask; + out[21] = (w12 >> 9) & mask; + out[22] = ((w12 >> 46) | (w13 << 18)) & mask; + out[23] = (w13 >> 19) & mask; + out[24] = ((w13 >> 56) | (w14 << 8)) & mask; + out[25] = ((w14 >> 29) | (w15 << 35)) & mask; + out[26] = (w15 >> 2) & mask; + out[27] = ((w15 >> 39) | (w16 << 25)) & mask; + out[28] = (w16 >> 12) & mask; + out[29] = ((w16 >> 49) | (w17 << 15)) & mask; + out[30] = (w17 >> 22) & mask; + out[31] = ((w17 >> 59) | (w18 << 5)) & mask; + + return in; +} + +inline const uint8_t* unpack38_64(const uint8_t* in, uint64_t* out){ + constexpr uint64_t mask = ((uint64_t{1} << 38) - uint64_t{1}); + + const auto w0 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w1 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w2 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w3 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w4 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w5 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w6 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w7 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w8 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w9 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w10 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w11 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w12 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w13 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w14 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w15 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w16 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w17 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w18 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + out[0] = (w0) & mask; + out[1] = ((w0 >> 38) | (w1 << 26)) & mask; + out[2] = (w1 >> 12) & mask; + out[3] = ((w1 >> 50) | (w2 << 14)) & mask; + out[4] = (w2 >> 24) & mask; + out[5] = ((w2 >> 62) | (w3 << 2)) & mask; + out[6] = ((w3 >> 36) | (w4 << 28)) & mask; + out[7] = (w4 >> 10) & mask; + out[8] = ((w4 >> 48) | (w5 << 16)) & mask; + out[9] = (w5 >> 22) & mask; + out[10] = ((w5 >> 60) | (w6 << 4)) & mask; + out[11] = ((w6 >> 34) | (w7 << 30)) & mask; + out[12] = (w7 >> 8) & mask; + out[13] = ((w7 >> 46) | (w8 << 18)) & mask; + out[14] = (w8 >> 20) & mask; + out[15] = ((w8 >> 58) | (w9 << 6)) & mask; + out[16] = ((w9 >> 32) | (w10 << 32)) & mask; + out[17] = (w10 >> 6) & mask; + out[18] = ((w10 >> 44) | (w11 << 20)) & mask; + out[19] = (w11 >> 18) & mask; + out[20] = ((w11 >> 56) | (w12 << 8)) & mask; + out[21] = ((w12 >> 30) | (w13 << 34)) & mask; + out[22] = (w13 >> 4) & mask; + out[23] = ((w13 >> 42) | (w14 << 22)) & mask; + out[24] = (w14 >> 16) & mask; + out[25] = ((w14 >> 54) | (w15 << 10)) & mask; + out[26] = ((w15 >> 28) | (w16 << 36)) & mask; + out[27] = (w16 >> 2) & mask; + out[28] = ((w16 >> 40) | (w17 << 24)) & mask; + out[29] = (w17 >> 14) & mask; + out[30] = ((w17 >> 52) | (w18 << 12)) & mask; + out[31] = w18 >> 26; + + return in; +} + +inline const uint8_t* unpack39_64(const uint8_t* in, uint64_t* out){ + constexpr uint64_t mask = ((uint64_t{1} << 39) - uint64_t{1}); + + const auto w0 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w1 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w2 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w3 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w4 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w5 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w6 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w7 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w8 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w9 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w10 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w11 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w12 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w13 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w14 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w15 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w16 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w17 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w18 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + auto w19 = static_cast(util::SafeLoadAs(in)); + w19 = bit_util::FromLittleEndian(w19); + in += 4; + out[0] = (w0) & mask; + out[1] = ((w0 >> 39) | (w1 << 25)) & mask; + out[2] = (w1 >> 14) & mask; + out[3] = ((w1 >> 53) | (w2 << 11)) & mask; + out[4] = ((w2 >> 28) | (w3 << 36)) & mask; + out[5] = (w3 >> 3) & mask; + out[6] = ((w3 >> 42) | (w4 << 22)) & mask; + out[7] = (w4 >> 17) & mask; + out[8] = ((w4 >> 56) | (w5 << 8)) & mask; + out[9] = ((w5 >> 31) | (w6 << 33)) & mask; + out[10] = (w6 >> 6) & mask; + out[11] = ((w6 >> 45) | (w7 << 19)) & mask; + out[12] = (w7 >> 20) & mask; + out[13] = ((w7 >> 59) | (w8 << 5)) & mask; + out[14] = ((w8 >> 34) | (w9 << 30)) & mask; + out[15] = (w9 >> 9) & mask; + out[16] = ((w9 >> 48) | (w10 << 16)) & mask; + out[17] = (w10 >> 23) & mask; + out[18] = ((w10 >> 62) | (w11 << 2)) & mask; + out[19] = ((w11 >> 37) | (w12 << 27)) & mask; + out[20] = (w12 >> 12) & mask; + out[21] = ((w12 >> 51) | (w13 << 13)) & mask; + out[22] = ((w13 >> 26) | (w14 << 38)) & mask; + out[23] = (w14 >> 1) & mask; + out[24] = ((w14 >> 40) | (w15 << 24)) & mask; + out[25] = (w15 >> 15) & mask; + out[26] = ((w15 >> 54) | (w16 << 10)) & mask; + out[27] = ((w16 >> 29) | (w17 << 35)) & mask; + out[28] = (w17 >> 4) & mask; + out[29] = ((w17 >> 43) | (w18 << 21)) & mask; + out[30] = (w18 >> 18) & mask; + out[31] = ((w18 >> 57) | (w19 << 7)) & mask; + + return in; +} + +inline const uint8_t* unpack40_64(const uint8_t* in, uint64_t* out){ + constexpr uint64_t mask = ((uint64_t{1} << 40) - uint64_t{1}); + + const auto w0 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w1 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w2 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w3 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w4 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w5 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w6 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w7 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w8 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w9 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w10 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w11 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w12 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w13 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w14 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w15 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w16 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w17 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w18 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w19 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + out[0] = (w0) & mask; + out[1] = ((w0 >> 40) | (w1 << 24)) & mask; + out[2] = (w1 >> 16) & mask; + out[3] = ((w1 >> 56) | (w2 << 8)) & mask; + out[4] = ((w2 >> 32) | (w3 << 32)) & mask; + out[5] = (w3 >> 8) & mask; + out[6] = ((w3 >> 48) | (w4 << 16)) & mask; + out[7] = w4 >> 24; + out[8] = (w5) & mask; + out[9] = ((w5 >> 40) | (w6 << 24)) & mask; + out[10] = (w6 >> 16) & mask; + out[11] = ((w6 >> 56) | (w7 << 8)) & mask; + out[12] = ((w7 >> 32) | (w8 << 32)) & mask; + out[13] = (w8 >> 8) & mask; + out[14] = ((w8 >> 48) | (w9 << 16)) & mask; + out[15] = w9 >> 24; + out[16] = (w10) & mask; + out[17] = ((w10 >> 40) | (w11 << 24)) & mask; + out[18] = (w11 >> 16) & mask; + out[19] = ((w11 >> 56) | (w12 << 8)) & mask; + out[20] = ((w12 >> 32) | (w13 << 32)) & mask; + out[21] = (w13 >> 8) & mask; + out[22] = ((w13 >> 48) | (w14 << 16)) & mask; + out[23] = w14 >> 24; + out[24] = (w15) & mask; + out[25] = ((w15 >> 40) | (w16 << 24)) & mask; + out[26] = (w16 >> 16) & mask; + out[27] = ((w16 >> 56) | (w17 << 8)) & mask; + out[28] = ((w17 >> 32) | (w18 << 32)) & mask; + out[29] = (w18 >> 8) & mask; + out[30] = ((w18 >> 48) | (w19 << 16)) & mask; + out[31] = w19 >> 24; + + return in; +} + +inline const uint8_t* unpack41_64(const uint8_t* in, uint64_t* out){ + constexpr uint64_t mask = ((uint64_t{1} << 41) - uint64_t{1}); + + const auto w0 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w1 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w2 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w3 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w4 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w5 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w6 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w7 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w8 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w9 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w10 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w11 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w12 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w13 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w14 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w15 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w16 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w17 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w18 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w19 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + auto w20 = static_cast(util::SafeLoadAs(in)); + w20 = bit_util::FromLittleEndian(w20); + in += 4; + out[0] = (w0) & mask; + out[1] = ((w0 >> 41) | (w1 << 23)) & mask; + out[2] = (w1 >> 18) & mask; + out[3] = ((w1 >> 59) | (w2 << 5)) & mask; + out[4] = ((w2 >> 36) | (w3 << 28)) & mask; + out[5] = (w3 >> 13) & mask; + out[6] = ((w3 >> 54) | (w4 << 10)) & mask; + out[7] = ((w4 >> 31) | (w5 << 33)) & mask; + out[8] = (w5 >> 8) & mask; + out[9] = ((w5 >> 49) | (w6 << 15)) & mask; + out[10] = ((w6 >> 26) | (w7 << 38)) & mask; + out[11] = (w7 >> 3) & mask; + out[12] = ((w7 >> 44) | (w8 << 20)) & mask; + out[13] = (w8 >> 21) & mask; + out[14] = ((w8 >> 62) | (w9 << 2)) & mask; + out[15] = ((w9 >> 39) | (w10 << 25)) & mask; + out[16] = (w10 >> 16) & mask; + out[17] = ((w10 >> 57) | (w11 << 7)) & mask; + out[18] = ((w11 >> 34) | (w12 << 30)) & mask; + out[19] = (w12 >> 11) & mask; + out[20] = ((w12 >> 52) | (w13 << 12)) & mask; + out[21] = ((w13 >> 29) | (w14 << 35)) & mask; + out[22] = (w14 >> 6) & mask; + out[23] = ((w14 >> 47) | (w15 << 17)) & mask; + out[24] = ((w15 >> 24) | (w16 << 40)) & mask; + out[25] = (w16 >> 1) & mask; + out[26] = ((w16 >> 42) | (w17 << 22)) & mask; + out[27] = (w17 >> 19) & mask; + out[28] = ((w17 >> 60) | (w18 << 4)) & mask; + out[29] = ((w18 >> 37) | (w19 << 27)) & mask; + out[30] = (w19 >> 14) & mask; + out[31] = ((w19 >> 55) | (w20 << 9)) & mask; + + return in; +} + +inline const uint8_t* unpack42_64(const uint8_t* in, uint64_t* out){ + constexpr uint64_t mask = ((uint64_t{1} << 42) - uint64_t{1}); + + const auto w0 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w1 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w2 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w3 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w4 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w5 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w6 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w7 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w8 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w9 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w10 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w11 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w12 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w13 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w14 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w15 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w16 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w17 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w18 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w19 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w20 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + out[0] = (w0) & mask; + out[1] = ((w0 >> 42) | (w1 << 22)) & mask; + out[2] = (w1 >> 20) & mask; + out[3] = ((w1 >> 62) | (w2 << 2)) & mask; + out[4] = ((w2 >> 40) | (w3 << 24)) & mask; + out[5] = (w3 >> 18) & mask; + out[6] = ((w3 >> 60) | (w4 << 4)) & mask; + out[7] = ((w4 >> 38) | (w5 << 26)) & mask; + out[8] = (w5 >> 16) & mask; + out[9] = ((w5 >> 58) | (w6 << 6)) & mask; + out[10] = ((w6 >> 36) | (w7 << 28)) & mask; + out[11] = (w7 >> 14) & mask; + out[12] = ((w7 >> 56) | (w8 << 8)) & mask; + out[13] = ((w8 >> 34) | (w9 << 30)) & mask; + out[14] = (w9 >> 12) & mask; + out[15] = ((w9 >> 54) | (w10 << 10)) & mask; + out[16] = ((w10 >> 32) | (w11 << 32)) & mask; + out[17] = (w11 >> 10) & mask; + out[18] = ((w11 >> 52) | (w12 << 12)) & mask; + out[19] = ((w12 >> 30) | (w13 << 34)) & mask; + out[20] = (w13 >> 8) & mask; + out[21] = ((w13 >> 50) | (w14 << 14)) & mask; + out[22] = ((w14 >> 28) | (w15 << 36)) & mask; + out[23] = (w15 >> 6) & mask; + out[24] = ((w15 >> 48) | (w16 << 16)) & mask; + out[25] = ((w16 >> 26) | (w17 << 38)) & mask; + out[26] = (w17 >> 4) & mask; + out[27] = ((w17 >> 46) | (w18 << 18)) & mask; + out[28] = ((w18 >> 24) | (w19 << 40)) & mask; + out[29] = (w19 >> 2) & mask; + out[30] = ((w19 >> 44) | (w20 << 20)) & mask; + out[31] = w20 >> 22; + + return in; +} + +inline const uint8_t* unpack43_64(const uint8_t* in, uint64_t* out){ + constexpr uint64_t mask = ((uint64_t{1} << 43) - uint64_t{1}); + + const auto w0 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w1 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w2 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w3 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w4 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w5 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w6 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w7 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w8 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w9 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w10 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w11 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w12 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w13 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w14 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w15 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w16 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w17 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w18 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w19 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w20 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + auto w21 = static_cast(util::SafeLoadAs(in)); + w21 = bit_util::FromLittleEndian(w21); + in += 4; + out[0] = (w0) & mask; + out[1] = ((w0 >> 43) | (w1 << 21)) & mask; + out[2] = ((w1 >> 22) | (w2 << 42)) & mask; + out[3] = (w2 >> 1) & mask; + out[4] = ((w2 >> 44) | (w3 << 20)) & mask; + out[5] = ((w3 >> 23) | (w4 << 41)) & mask; + out[6] = (w4 >> 2) & mask; + out[7] = ((w4 >> 45) | (w5 << 19)) & mask; + out[8] = ((w5 >> 24) | (w6 << 40)) & mask; + out[9] = (w6 >> 3) & mask; + out[10] = ((w6 >> 46) | (w7 << 18)) & mask; + out[11] = ((w7 >> 25) | (w8 << 39)) & mask; + out[12] = (w8 >> 4) & mask; + out[13] = ((w8 >> 47) | (w9 << 17)) & mask; + out[14] = ((w9 >> 26) | (w10 << 38)) & mask; + out[15] = (w10 >> 5) & mask; + out[16] = ((w10 >> 48) | (w11 << 16)) & mask; + out[17] = ((w11 >> 27) | (w12 << 37)) & mask; + out[18] = (w12 >> 6) & mask; + out[19] = ((w12 >> 49) | (w13 << 15)) & mask; + out[20] = ((w13 >> 28) | (w14 << 36)) & mask; + out[21] = (w14 >> 7) & mask; + out[22] = ((w14 >> 50) | (w15 << 14)) & mask; + out[23] = ((w15 >> 29) | (w16 << 35)) & mask; + out[24] = (w16 >> 8) & mask; + out[25] = ((w16 >> 51) | (w17 << 13)) & mask; + out[26] = ((w17 >> 30) | (w18 << 34)) & mask; + out[27] = (w18 >> 9) & mask; + out[28] = ((w18 >> 52) | (w19 << 12)) & mask; + out[29] = ((w19 >> 31) | (w20 << 33)) & mask; + out[30] = (w20 >> 10) & mask; + out[31] = ((w20 >> 53) | (w21 << 11)) & mask; + + return in; +} + +inline const uint8_t* unpack44_64(const uint8_t* in, uint64_t* out){ + constexpr uint64_t mask = ((uint64_t{1} << 44) - uint64_t{1}); + + const auto w0 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w1 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w2 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w3 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w4 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w5 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w6 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w7 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w8 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w9 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w10 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w11 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w12 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w13 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w14 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w15 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w16 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w17 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w18 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w19 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w20 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w21 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + out[0] = (w0) & mask; + out[1] = ((w0 >> 44) | (w1 << 20)) & mask; + out[2] = ((w1 >> 24) | (w2 << 40)) & mask; + out[3] = (w2 >> 4) & mask; + out[4] = ((w2 >> 48) | (w3 << 16)) & mask; + out[5] = ((w3 >> 28) | (w4 << 36)) & mask; + out[6] = (w4 >> 8) & mask; + out[7] = ((w4 >> 52) | (w5 << 12)) & mask; + out[8] = ((w5 >> 32) | (w6 << 32)) & mask; + out[9] = (w6 >> 12) & mask; + out[10] = ((w6 >> 56) | (w7 << 8)) & mask; + out[11] = ((w7 >> 36) | (w8 << 28)) & mask; + out[12] = (w8 >> 16) & mask; + out[13] = ((w8 >> 60) | (w9 << 4)) & mask; + out[14] = ((w9 >> 40) | (w10 << 24)) & mask; + out[15] = w10 >> 20; + out[16] = (w11) & mask; + out[17] = ((w11 >> 44) | (w12 << 20)) & mask; + out[18] = ((w12 >> 24) | (w13 << 40)) & mask; + out[19] = (w13 >> 4) & mask; + out[20] = ((w13 >> 48) | (w14 << 16)) & mask; + out[21] = ((w14 >> 28) | (w15 << 36)) & mask; + out[22] = (w15 >> 8) & mask; + out[23] = ((w15 >> 52) | (w16 << 12)) & mask; + out[24] = ((w16 >> 32) | (w17 << 32)) & mask; + out[25] = (w17 >> 12) & mask; + out[26] = ((w17 >> 56) | (w18 << 8)) & mask; + out[27] = ((w18 >> 36) | (w19 << 28)) & mask; + out[28] = (w19 >> 16) & mask; + out[29] = ((w19 >> 60) | (w20 << 4)) & mask; + out[30] = ((w20 >> 40) | (w21 << 24)) & mask; + out[31] = w21 >> 20; + + return in; +} + +inline const uint8_t* unpack45_64(const uint8_t* in, uint64_t* out){ + constexpr uint64_t mask = ((uint64_t{1} << 45) - uint64_t{1}); + + const auto w0 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w1 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w2 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w3 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w4 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w5 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w6 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w7 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w8 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w9 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w10 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w11 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w12 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w13 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w14 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w15 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w16 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w17 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w18 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w19 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w20 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w21 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + auto w22 = static_cast(util::SafeLoadAs(in)); + w22 = bit_util::FromLittleEndian(w22); + in += 4; + out[0] = (w0) & mask; + out[1] = ((w0 >> 45) | (w1 << 19)) & mask; + out[2] = ((w1 >> 26) | (w2 << 38)) & mask; + out[3] = (w2 >> 7) & mask; + out[4] = ((w2 >> 52) | (w3 << 12)) & mask; + out[5] = ((w3 >> 33) | (w4 << 31)) & mask; + out[6] = (w4 >> 14) & mask; + out[7] = ((w4 >> 59) | (w5 << 5)) & mask; + out[8] = ((w5 >> 40) | (w6 << 24)) & mask; + out[9] = ((w6 >> 21) | (w7 << 43)) & mask; + out[10] = (w7 >> 2) & mask; + out[11] = ((w7 >> 47) | (w8 << 17)) & mask; + out[12] = ((w8 >> 28) | (w9 << 36)) & mask; + out[13] = (w9 >> 9) & mask; + out[14] = ((w9 >> 54) | (w10 << 10)) & mask; + out[15] = ((w10 >> 35) | (w11 << 29)) & mask; + out[16] = (w11 >> 16) & mask; + out[17] = ((w11 >> 61) | (w12 << 3)) & mask; + out[18] = ((w12 >> 42) | (w13 << 22)) & mask; + out[19] = ((w13 >> 23) | (w14 << 41)) & mask; + out[20] = (w14 >> 4) & mask; + out[21] = ((w14 >> 49) | (w15 << 15)) & mask; + out[22] = ((w15 >> 30) | (w16 << 34)) & mask; + out[23] = (w16 >> 11) & mask; + out[24] = ((w16 >> 56) | (w17 << 8)) & mask; + out[25] = ((w17 >> 37) | (w18 << 27)) & mask; + out[26] = (w18 >> 18) & mask; + out[27] = ((w18 >> 63) | (w19 << 1)) & mask; + out[28] = ((w19 >> 44) | (w20 << 20)) & mask; + out[29] = ((w20 >> 25) | (w21 << 39)) & mask; + out[30] = (w21 >> 6) & mask; + out[31] = ((w21 >> 51) | (w22 << 13)) & mask; + + return in; +} + +inline const uint8_t* unpack46_64(const uint8_t* in, uint64_t* out){ + constexpr uint64_t mask = ((uint64_t{1} << 46) - uint64_t{1}); + + const auto w0 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w1 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w2 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w3 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w4 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w5 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w6 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w7 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w8 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w9 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w10 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w11 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w12 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w13 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w14 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w15 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w16 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w17 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w18 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w19 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w20 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w21 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w22 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + out[0] = (w0) & mask; + out[1] = ((w0 >> 46) | (w1 << 18)) & mask; + out[2] = ((w1 >> 28) | (w2 << 36)) & mask; + out[3] = (w2 >> 10) & mask; + out[4] = ((w2 >> 56) | (w3 << 8)) & mask; + out[5] = ((w3 >> 38) | (w4 << 26)) & mask; + out[6] = ((w4 >> 20) | (w5 << 44)) & mask; + out[7] = (w5 >> 2) & mask; + out[8] = ((w5 >> 48) | (w6 << 16)) & mask; + out[9] = ((w6 >> 30) | (w7 << 34)) & mask; + out[10] = (w7 >> 12) & mask; + out[11] = ((w7 >> 58) | (w8 << 6)) & mask; + out[12] = ((w8 >> 40) | (w9 << 24)) & mask; + out[13] = ((w9 >> 22) | (w10 << 42)) & mask; + out[14] = (w10 >> 4) & mask; + out[15] = ((w10 >> 50) | (w11 << 14)) & mask; + out[16] = ((w11 >> 32) | (w12 << 32)) & mask; + out[17] = (w12 >> 14) & mask; + out[18] = ((w12 >> 60) | (w13 << 4)) & mask; + out[19] = ((w13 >> 42) | (w14 << 22)) & mask; + out[20] = ((w14 >> 24) | (w15 << 40)) & mask; + out[21] = (w15 >> 6) & mask; + out[22] = ((w15 >> 52) | (w16 << 12)) & mask; + out[23] = ((w16 >> 34) | (w17 << 30)) & mask; + out[24] = (w17 >> 16) & mask; + out[25] = ((w17 >> 62) | (w18 << 2)) & mask; + out[26] = ((w18 >> 44) | (w19 << 20)) & mask; + out[27] = ((w19 >> 26) | (w20 << 38)) & mask; + out[28] = (w20 >> 8) & mask; + out[29] = ((w20 >> 54) | (w21 << 10)) & mask; + out[30] = ((w21 >> 36) | (w22 << 28)) & mask; + out[31] = w22 >> 18; + + return in; +} + +inline const uint8_t* unpack47_64(const uint8_t* in, uint64_t* out){ + constexpr uint64_t mask = ((uint64_t{1} << 47) - uint64_t{1}); + + const auto w0 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w1 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w2 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w3 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w4 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w5 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w6 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w7 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w8 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w9 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w10 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w11 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w12 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w13 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w14 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w15 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w16 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w17 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w18 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w19 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w20 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w21 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w22 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + auto w23 = static_cast(util::SafeLoadAs(in)); + w23 = bit_util::FromLittleEndian(w23); + in += 4; + out[0] = (w0) & mask; + out[1] = ((w0 >> 47) | (w1 << 17)) & mask; + out[2] = ((w1 >> 30) | (w2 << 34)) & mask; + out[3] = (w2 >> 13) & mask; + out[4] = ((w2 >> 60) | (w3 << 4)) & mask; + out[5] = ((w3 >> 43) | (w4 << 21)) & mask; + out[6] = ((w4 >> 26) | (w5 << 38)) & mask; + out[7] = (w5 >> 9) & mask; + out[8] = ((w5 >> 56) | (w6 << 8)) & mask; + out[9] = ((w6 >> 39) | (w7 << 25)) & mask; + out[10] = ((w7 >> 22) | (w8 << 42)) & mask; + out[11] = (w8 >> 5) & mask; + out[12] = ((w8 >> 52) | (w9 << 12)) & mask; + out[13] = ((w9 >> 35) | (w10 << 29)) & mask; + out[14] = ((w10 >> 18) | (w11 << 46)) & mask; + out[15] = (w11 >> 1) & mask; + out[16] = ((w11 >> 48) | (w12 << 16)) & mask; + out[17] = ((w12 >> 31) | (w13 << 33)) & mask; + out[18] = (w13 >> 14) & mask; + out[19] = ((w13 >> 61) | (w14 << 3)) & mask; + out[20] = ((w14 >> 44) | (w15 << 20)) & mask; + out[21] = ((w15 >> 27) | (w16 << 37)) & mask; + out[22] = (w16 >> 10) & mask; + out[23] = ((w16 >> 57) | (w17 << 7)) & mask; + out[24] = ((w17 >> 40) | (w18 << 24)) & mask; + out[25] = ((w18 >> 23) | (w19 << 41)) & mask; + out[26] = (w19 >> 6) & mask; + out[27] = ((w19 >> 53) | (w20 << 11)) & mask; + out[28] = ((w20 >> 36) | (w21 << 28)) & mask; + out[29] = ((w21 >> 19) | (w22 << 45)) & mask; + out[30] = (w22 >> 2) & mask; + out[31] = ((w22 >> 49) | (w23 << 15)) & mask; + + return in; +} + +inline const uint8_t* unpack48_64(const uint8_t* in, uint64_t* out){ + constexpr uint64_t mask = ((uint64_t{1} << 48) - uint64_t{1}); + + const auto w0 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w1 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w2 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w3 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w4 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w5 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w6 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w7 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w8 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w9 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w10 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w11 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w12 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w13 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w14 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w15 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w16 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w17 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w18 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w19 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w20 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w21 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w22 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w23 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + out[0] = (w0) & mask; + out[1] = ((w0 >> 48) | (w1 << 16)) & mask; + out[2] = ((w1 >> 32) | (w2 << 32)) & mask; + out[3] = w2 >> 16; + out[4] = (w3) & mask; + out[5] = ((w3 >> 48) | (w4 << 16)) & mask; + out[6] = ((w4 >> 32) | (w5 << 32)) & mask; + out[7] = w5 >> 16; + out[8] = (w6) & mask; + out[9] = ((w6 >> 48) | (w7 << 16)) & mask; + out[10] = ((w7 >> 32) | (w8 << 32)) & mask; + out[11] = w8 >> 16; + out[12] = (w9) & mask; + out[13] = ((w9 >> 48) | (w10 << 16)) & mask; + out[14] = ((w10 >> 32) | (w11 << 32)) & mask; + out[15] = w11 >> 16; + out[16] = (w12) & mask; + out[17] = ((w12 >> 48) | (w13 << 16)) & mask; + out[18] = ((w13 >> 32) | (w14 << 32)) & mask; + out[19] = w14 >> 16; + out[20] = (w15) & mask; + out[21] = ((w15 >> 48) | (w16 << 16)) & mask; + out[22] = ((w16 >> 32) | (w17 << 32)) & mask; + out[23] = w17 >> 16; + out[24] = (w18) & mask; + out[25] = ((w18 >> 48) | (w19 << 16)) & mask; + out[26] = ((w19 >> 32) | (w20 << 32)) & mask; + out[27] = w20 >> 16; + out[28] = (w21) & mask; + out[29] = ((w21 >> 48) | (w22 << 16)) & mask; + out[30] = ((w22 >> 32) | (w23 << 32)) & mask; + out[31] = w23 >> 16; + + return in; +} + +inline const uint8_t* unpack49_64(const uint8_t* in, uint64_t* out){ + constexpr uint64_t mask = ((uint64_t{1} << 49) - uint64_t{1}); + + const auto w0 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w1 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w2 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w3 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w4 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w5 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w6 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w7 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w8 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w9 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w10 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w11 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w12 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w13 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w14 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w15 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w16 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w17 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w18 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w19 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w20 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w21 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w22 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w23 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + auto w24 = static_cast(util::SafeLoadAs(in)); + w24 = bit_util::FromLittleEndian(w24); + in += 4; + out[0] = (w0) & mask; + out[1] = ((w0 >> 49) | (w1 << 15)) & mask; + out[2] = ((w1 >> 34) | (w2 << 30)) & mask; + out[3] = ((w2 >> 19) | (w3 << 45)) & mask; + out[4] = (w3 >> 4) & mask; + out[5] = ((w3 >> 53) | (w4 << 11)) & mask; + out[6] = ((w4 >> 38) | (w5 << 26)) & mask; + out[7] = ((w5 >> 23) | (w6 << 41)) & mask; + out[8] = (w6 >> 8) & mask; + out[9] = ((w6 >> 57) | (w7 << 7)) & mask; + out[10] = ((w7 >> 42) | (w8 << 22)) & mask; + out[11] = ((w8 >> 27) | (w9 << 37)) & mask; + out[12] = (w9 >> 12) & mask; + out[13] = ((w9 >> 61) | (w10 << 3)) & mask; + out[14] = ((w10 >> 46) | (w11 << 18)) & mask; + out[15] = ((w11 >> 31) | (w12 << 33)) & mask; + out[16] = ((w12 >> 16) | (w13 << 48)) & mask; + out[17] = (w13 >> 1) & mask; + out[18] = ((w13 >> 50) | (w14 << 14)) & mask; + out[19] = ((w14 >> 35) | (w15 << 29)) & mask; + out[20] = ((w15 >> 20) | (w16 << 44)) & mask; + out[21] = (w16 >> 5) & mask; + out[22] = ((w16 >> 54) | (w17 << 10)) & mask; + out[23] = ((w17 >> 39) | (w18 << 25)) & mask; + out[24] = ((w18 >> 24) | (w19 << 40)) & mask; + out[25] = (w19 >> 9) & mask; + out[26] = ((w19 >> 58) | (w20 << 6)) & mask; + out[27] = ((w20 >> 43) | (w21 << 21)) & mask; + out[28] = ((w21 >> 28) | (w22 << 36)) & mask; + out[29] = (w22 >> 13) & mask; + out[30] = ((w22 >> 62) | (w23 << 2)) & mask; + out[31] = ((w23 >> 47) | (w24 << 17)) & mask; + + return in; +} + +inline const uint8_t* unpack50_64(const uint8_t* in, uint64_t* out){ + constexpr uint64_t mask = ((uint64_t{1} << 50) - uint64_t{1}); + + const auto w0 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w1 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w2 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w3 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w4 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w5 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w6 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w7 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w8 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w9 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w10 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w11 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w12 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w13 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w14 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w15 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w16 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w17 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w18 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w19 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w20 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w21 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w22 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w23 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w24 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + out[0] = (w0) & mask; + out[1] = ((w0 >> 50) | (w1 << 14)) & mask; + out[2] = ((w1 >> 36) | (w2 << 28)) & mask; + out[3] = ((w2 >> 22) | (w3 << 42)) & mask; + out[4] = (w3 >> 8) & mask; + out[5] = ((w3 >> 58) | (w4 << 6)) & mask; + out[6] = ((w4 >> 44) | (w5 << 20)) & mask; + out[7] = ((w5 >> 30) | (w6 << 34)) & mask; + out[8] = ((w6 >> 16) | (w7 << 48)) & mask; + out[9] = (w7 >> 2) & mask; + out[10] = ((w7 >> 52) | (w8 << 12)) & mask; + out[11] = ((w8 >> 38) | (w9 << 26)) & mask; + out[12] = ((w9 >> 24) | (w10 << 40)) & mask; + out[13] = (w10 >> 10) & mask; + out[14] = ((w10 >> 60) | (w11 << 4)) & mask; + out[15] = ((w11 >> 46) | (w12 << 18)) & mask; + out[16] = ((w12 >> 32) | (w13 << 32)) & mask; + out[17] = ((w13 >> 18) | (w14 << 46)) & mask; + out[18] = (w14 >> 4) & mask; + out[19] = ((w14 >> 54) | (w15 << 10)) & mask; + out[20] = ((w15 >> 40) | (w16 << 24)) & mask; + out[21] = ((w16 >> 26) | (w17 << 38)) & mask; + out[22] = (w17 >> 12) & mask; + out[23] = ((w17 >> 62) | (w18 << 2)) & mask; + out[24] = ((w18 >> 48) | (w19 << 16)) & mask; + out[25] = ((w19 >> 34) | (w20 << 30)) & mask; + out[26] = ((w20 >> 20) | (w21 << 44)) & mask; + out[27] = (w21 >> 6) & mask; + out[28] = ((w21 >> 56) | (w22 << 8)) & mask; + out[29] = ((w22 >> 42) | (w23 << 22)) & mask; + out[30] = ((w23 >> 28) | (w24 << 36)) & mask; + out[31] = w24 >> 14; + + return in; +} + +inline const uint8_t* unpack51_64(const uint8_t* in, uint64_t* out){ + constexpr uint64_t mask = ((uint64_t{1} << 51) - uint64_t{1}); + + const auto w0 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w1 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w2 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w3 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w4 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w5 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w6 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w7 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w8 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w9 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w10 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w11 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w12 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w13 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w14 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w15 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w16 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w17 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w18 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w19 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w20 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w21 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w22 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w23 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w24 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + auto w25 = static_cast(util::SafeLoadAs(in)); + w25 = bit_util::FromLittleEndian(w25); + in += 4; + out[0] = (w0) & mask; + out[1] = ((w0 >> 51) | (w1 << 13)) & mask; + out[2] = ((w1 >> 38) | (w2 << 26)) & mask; + out[3] = ((w2 >> 25) | (w3 << 39)) & mask; + out[4] = (w3 >> 12) & mask; + out[5] = ((w3 >> 63) | (w4 << 1)) & mask; + out[6] = ((w4 >> 50) | (w5 << 14)) & mask; + out[7] = ((w5 >> 37) | (w6 << 27)) & mask; + out[8] = ((w6 >> 24) | (w7 << 40)) & mask; + out[9] = (w7 >> 11) & mask; + out[10] = ((w7 >> 62) | (w8 << 2)) & mask; + out[11] = ((w8 >> 49) | (w9 << 15)) & mask; + out[12] = ((w9 >> 36) | (w10 << 28)) & mask; + out[13] = ((w10 >> 23) | (w11 << 41)) & mask; + out[14] = (w11 >> 10) & mask; + out[15] = ((w11 >> 61) | (w12 << 3)) & mask; + out[16] = ((w12 >> 48) | (w13 << 16)) & mask; + out[17] = ((w13 >> 35) | (w14 << 29)) & mask; + out[18] = ((w14 >> 22) | (w15 << 42)) & mask; + out[19] = (w15 >> 9) & mask; + out[20] = ((w15 >> 60) | (w16 << 4)) & mask; + out[21] = ((w16 >> 47) | (w17 << 17)) & mask; + out[22] = ((w17 >> 34) | (w18 << 30)) & mask; + out[23] = ((w18 >> 21) | (w19 << 43)) & mask; + out[24] = (w19 >> 8) & mask; + out[25] = ((w19 >> 59) | (w20 << 5)) & mask; + out[26] = ((w20 >> 46) | (w21 << 18)) & mask; + out[27] = ((w21 >> 33) | (w22 << 31)) & mask; + out[28] = ((w22 >> 20) | (w23 << 44)) & mask; + out[29] = (w23 >> 7) & mask; + out[30] = ((w23 >> 58) | (w24 << 6)) & mask; + out[31] = ((w24 >> 45) | (w25 << 19)) & mask; + + return in; +} + +inline const uint8_t* unpack52_64(const uint8_t* in, uint64_t* out){ + constexpr uint64_t mask = ((uint64_t{1} << 52) - uint64_t{1}); + + const auto w0 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w1 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w2 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w3 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w4 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w5 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w6 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w7 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w8 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w9 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w10 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w11 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w12 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w13 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w14 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w15 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w16 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w17 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w18 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w19 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w20 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w21 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w22 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w23 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w24 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w25 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + out[0] = (w0) & mask; + out[1] = ((w0 >> 52) | (w1 << 12)) & mask; + out[2] = ((w1 >> 40) | (w2 << 24)) & mask; + out[3] = ((w2 >> 28) | (w3 << 36)) & mask; + out[4] = ((w3 >> 16) | (w4 << 48)) & mask; + out[5] = (w4 >> 4) & mask; + out[6] = ((w4 >> 56) | (w5 << 8)) & mask; + out[7] = ((w5 >> 44) | (w6 << 20)) & mask; + out[8] = ((w6 >> 32) | (w7 << 32)) & mask; + out[9] = ((w7 >> 20) | (w8 << 44)) & mask; + out[10] = (w8 >> 8) & mask; + out[11] = ((w8 >> 60) | (w9 << 4)) & mask; + out[12] = ((w9 >> 48) | (w10 << 16)) & mask; + out[13] = ((w10 >> 36) | (w11 << 28)) & mask; + out[14] = ((w11 >> 24) | (w12 << 40)) & mask; + out[15] = w12 >> 12; + out[16] = (w13) & mask; + out[17] = ((w13 >> 52) | (w14 << 12)) & mask; + out[18] = ((w14 >> 40) | (w15 << 24)) & mask; + out[19] = ((w15 >> 28) | (w16 << 36)) & mask; + out[20] = ((w16 >> 16) | (w17 << 48)) & mask; + out[21] = (w17 >> 4) & mask; + out[22] = ((w17 >> 56) | (w18 << 8)) & mask; + out[23] = ((w18 >> 44) | (w19 << 20)) & mask; + out[24] = ((w19 >> 32) | (w20 << 32)) & mask; + out[25] = ((w20 >> 20) | (w21 << 44)) & mask; + out[26] = (w21 >> 8) & mask; + out[27] = ((w21 >> 60) | (w22 << 4)) & mask; + out[28] = ((w22 >> 48) | (w23 << 16)) & mask; + out[29] = ((w23 >> 36) | (w24 << 28)) & mask; + out[30] = ((w24 >> 24) | (w25 << 40)) & mask; + out[31] = w25 >> 12; + + return in; +} + +inline const uint8_t* unpack53_64(const uint8_t* in, uint64_t* out){ + constexpr uint64_t mask = ((uint64_t{1} << 53) - uint64_t{1}); + + const auto w0 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w1 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w2 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w3 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w4 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w5 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w6 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w7 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w8 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w9 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w10 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w11 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w12 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w13 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w14 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w15 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w16 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w17 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w18 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w19 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w20 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w21 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w22 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w23 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w24 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w25 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + auto w26 = static_cast(util::SafeLoadAs(in)); + w26 = bit_util::FromLittleEndian(w26); + in += 4; + out[0] = (w0) & mask; + out[1] = ((w0 >> 53) | (w1 << 11)) & mask; + out[2] = ((w1 >> 42) | (w2 << 22)) & mask; + out[3] = ((w2 >> 31) | (w3 << 33)) & mask; + out[4] = ((w3 >> 20) | (w4 << 44)) & mask; + out[5] = (w4 >> 9) & mask; + out[6] = ((w4 >> 62) | (w5 << 2)) & mask; + out[7] = ((w5 >> 51) | (w6 << 13)) & mask; + out[8] = ((w6 >> 40) | (w7 << 24)) & mask; + out[9] = ((w7 >> 29) | (w8 << 35)) & mask; + out[10] = ((w8 >> 18) | (w9 << 46)) & mask; + out[11] = (w9 >> 7) & mask; + out[12] = ((w9 >> 60) | (w10 << 4)) & mask; + out[13] = ((w10 >> 49) | (w11 << 15)) & mask; + out[14] = ((w11 >> 38) | (w12 << 26)) & mask; + out[15] = ((w12 >> 27) | (w13 << 37)) & mask; + out[16] = ((w13 >> 16) | (w14 << 48)) & mask; + out[17] = (w14 >> 5) & mask; + out[18] = ((w14 >> 58) | (w15 << 6)) & mask; + out[19] = ((w15 >> 47) | (w16 << 17)) & mask; + out[20] = ((w16 >> 36) | (w17 << 28)) & mask; + out[21] = ((w17 >> 25) | (w18 << 39)) & mask; + out[22] = ((w18 >> 14) | (w19 << 50)) & mask; + out[23] = (w19 >> 3) & mask; + out[24] = ((w19 >> 56) | (w20 << 8)) & mask; + out[25] = ((w20 >> 45) | (w21 << 19)) & mask; + out[26] = ((w21 >> 34) | (w22 << 30)) & mask; + out[27] = ((w22 >> 23) | (w23 << 41)) & mask; + out[28] = ((w23 >> 12) | (w24 << 52)) & mask; + out[29] = (w24 >> 1) & mask; + out[30] = ((w24 >> 54) | (w25 << 10)) & mask; + out[31] = ((w25 >> 43) | (w26 << 21)) & mask; + + return in; +} + +inline const uint8_t* unpack54_64(const uint8_t* in, uint64_t* out){ + constexpr uint64_t mask = ((uint64_t{1} << 54) - uint64_t{1}); + + const auto w0 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w1 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w2 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w3 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w4 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w5 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w6 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w7 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w8 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w9 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w10 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w11 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w12 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w13 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w14 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w15 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w16 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w17 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w18 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w19 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w20 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w21 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w22 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w23 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w24 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w25 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w26 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + out[0] = (w0) & mask; + out[1] = ((w0 >> 54) | (w1 << 10)) & mask; + out[2] = ((w1 >> 44) | (w2 << 20)) & mask; + out[3] = ((w2 >> 34) | (w3 << 30)) & mask; + out[4] = ((w3 >> 24) | (w4 << 40)) & mask; + out[5] = ((w4 >> 14) | (w5 << 50)) & mask; + out[6] = (w5 >> 4) & mask; + out[7] = ((w5 >> 58) | (w6 << 6)) & mask; + out[8] = ((w6 >> 48) | (w7 << 16)) & mask; + out[9] = ((w7 >> 38) | (w8 << 26)) & mask; + out[10] = ((w8 >> 28) | (w9 << 36)) & mask; + out[11] = ((w9 >> 18) | (w10 << 46)) & mask; + out[12] = (w10 >> 8) & mask; + out[13] = ((w10 >> 62) | (w11 << 2)) & mask; + out[14] = ((w11 >> 52) | (w12 << 12)) & mask; + out[15] = ((w12 >> 42) | (w13 << 22)) & mask; + out[16] = ((w13 >> 32) | (w14 << 32)) & mask; + out[17] = ((w14 >> 22) | (w15 << 42)) & mask; + out[18] = ((w15 >> 12) | (w16 << 52)) & mask; + out[19] = (w16 >> 2) & mask; + out[20] = ((w16 >> 56) | (w17 << 8)) & mask; + out[21] = ((w17 >> 46) | (w18 << 18)) & mask; + out[22] = ((w18 >> 36) | (w19 << 28)) & mask; + out[23] = ((w19 >> 26) | (w20 << 38)) & mask; + out[24] = ((w20 >> 16) | (w21 << 48)) & mask; + out[25] = (w21 >> 6) & mask; + out[26] = ((w21 >> 60) | (w22 << 4)) & mask; + out[27] = ((w22 >> 50) | (w23 << 14)) & mask; + out[28] = ((w23 >> 40) | (w24 << 24)) & mask; + out[29] = ((w24 >> 30) | (w25 << 34)) & mask; + out[30] = ((w25 >> 20) | (w26 << 44)) & mask; + out[31] = w26 >> 10; + + return in; +} + +inline const uint8_t* unpack55_64(const uint8_t* in, uint64_t* out){ + constexpr uint64_t mask = ((uint64_t{1} << 55) - uint64_t{1}); + + const auto w0 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w1 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w2 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w3 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w4 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w5 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w6 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w7 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w8 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w9 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w10 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w11 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w12 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w13 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w14 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w15 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w16 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w17 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w18 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w19 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w20 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w21 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w22 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w23 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w24 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w25 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w26 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + auto w27 = static_cast(util::SafeLoadAs(in)); + w27 = bit_util::FromLittleEndian(w27); + in += 4; + out[0] = (w0) & mask; + out[1] = ((w0 >> 55) | (w1 << 9)) & mask; + out[2] = ((w1 >> 46) | (w2 << 18)) & mask; + out[3] = ((w2 >> 37) | (w3 << 27)) & mask; + out[4] = ((w3 >> 28) | (w4 << 36)) & mask; + out[5] = ((w4 >> 19) | (w5 << 45)) & mask; + out[6] = ((w5 >> 10) | (w6 << 54)) & mask; + out[7] = (w6 >> 1) & mask; + out[8] = ((w6 >> 56) | (w7 << 8)) & mask; + out[9] = ((w7 >> 47) | (w8 << 17)) & mask; + out[10] = ((w8 >> 38) | (w9 << 26)) & mask; + out[11] = ((w9 >> 29) | (w10 << 35)) & mask; + out[12] = ((w10 >> 20) | (w11 << 44)) & mask; + out[13] = ((w11 >> 11) | (w12 << 53)) & mask; + out[14] = (w12 >> 2) & mask; + out[15] = ((w12 >> 57) | (w13 << 7)) & mask; + out[16] = ((w13 >> 48) | (w14 << 16)) & mask; + out[17] = ((w14 >> 39) | (w15 << 25)) & mask; + out[18] = ((w15 >> 30) | (w16 << 34)) & mask; + out[19] = ((w16 >> 21) | (w17 << 43)) & mask; + out[20] = ((w17 >> 12) | (w18 << 52)) & mask; + out[21] = (w18 >> 3) & mask; + out[22] = ((w18 >> 58) | (w19 << 6)) & mask; + out[23] = ((w19 >> 49) | (w20 << 15)) & mask; + out[24] = ((w20 >> 40) | (w21 << 24)) & mask; + out[25] = ((w21 >> 31) | (w22 << 33)) & mask; + out[26] = ((w22 >> 22) | (w23 << 42)) & mask; + out[27] = ((w23 >> 13) | (w24 << 51)) & mask; + out[28] = (w24 >> 4) & mask; + out[29] = ((w24 >> 59) | (w25 << 5)) & mask; + out[30] = ((w25 >> 50) | (w26 << 14)) & mask; + out[31] = ((w26 >> 41) | (w27 << 23)) & mask; + + return in; +} + +inline const uint8_t* unpack56_64(const uint8_t* in, uint64_t* out){ + constexpr uint64_t mask = ((uint64_t{1} << 56) - uint64_t{1}); + + const auto w0 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w1 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w2 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w3 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w4 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w5 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w6 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w7 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w8 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w9 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w10 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w11 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w12 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w13 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w14 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w15 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w16 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w17 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w18 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w19 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w20 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w21 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w22 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w23 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w24 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w25 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w26 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w27 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + out[0] = (w0) & mask; + out[1] = ((w0 >> 56) | (w1 << 8)) & mask; + out[2] = ((w1 >> 48) | (w2 << 16)) & mask; + out[3] = ((w2 >> 40) | (w3 << 24)) & mask; + out[4] = ((w3 >> 32) | (w4 << 32)) & mask; + out[5] = ((w4 >> 24) | (w5 << 40)) & mask; + out[6] = ((w5 >> 16) | (w6 << 48)) & mask; + out[7] = w6 >> 8; + out[8] = (w7) & mask; + out[9] = ((w7 >> 56) | (w8 << 8)) & mask; + out[10] = ((w8 >> 48) | (w9 << 16)) & mask; + out[11] = ((w9 >> 40) | (w10 << 24)) & mask; + out[12] = ((w10 >> 32) | (w11 << 32)) & mask; + out[13] = ((w11 >> 24) | (w12 << 40)) & mask; + out[14] = ((w12 >> 16) | (w13 << 48)) & mask; + out[15] = w13 >> 8; + out[16] = (w14) & mask; + out[17] = ((w14 >> 56) | (w15 << 8)) & mask; + out[18] = ((w15 >> 48) | (w16 << 16)) & mask; + out[19] = ((w16 >> 40) | (w17 << 24)) & mask; + out[20] = ((w17 >> 32) | (w18 << 32)) & mask; + out[21] = ((w18 >> 24) | (w19 << 40)) & mask; + out[22] = ((w19 >> 16) | (w20 << 48)) & mask; + out[23] = w20 >> 8; + out[24] = (w21) & mask; + out[25] = ((w21 >> 56) | (w22 << 8)) & mask; + out[26] = ((w22 >> 48) | (w23 << 16)) & mask; + out[27] = ((w23 >> 40) | (w24 << 24)) & mask; + out[28] = ((w24 >> 32) | (w25 << 32)) & mask; + out[29] = ((w25 >> 24) | (w26 << 40)) & mask; + out[30] = ((w26 >> 16) | (w27 << 48)) & mask; + out[31] = w27 >> 8; + + return in; +} + +inline const uint8_t* unpack57_64(const uint8_t* in, uint64_t* out){ + constexpr uint64_t mask = ((uint64_t{1} << 57) - uint64_t{1}); + + const auto w0 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w1 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w2 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w3 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w4 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w5 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w6 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w7 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w8 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w9 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w10 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w11 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w12 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w13 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w14 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w15 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w16 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w17 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w18 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w19 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w20 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w21 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w22 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w23 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w24 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w25 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w26 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w27 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + auto w28 = static_cast(util::SafeLoadAs(in)); + w28 = bit_util::FromLittleEndian(w28); + in += 4; + out[0] = (w0) & mask; + out[1] = ((w0 >> 57) | (w1 << 7)) & mask; + out[2] = ((w1 >> 50) | (w2 << 14)) & mask; + out[3] = ((w2 >> 43) | (w3 << 21)) & mask; + out[4] = ((w3 >> 36) | (w4 << 28)) & mask; + out[5] = ((w4 >> 29) | (w5 << 35)) & mask; + out[6] = ((w5 >> 22) | (w6 << 42)) & mask; + out[7] = ((w6 >> 15) | (w7 << 49)) & mask; + out[8] = ((w7 >> 8) | (w8 << 56)) & mask; + out[9] = (w8 >> 1) & mask; + out[10] = ((w8 >> 58) | (w9 << 6)) & mask; + out[11] = ((w9 >> 51) | (w10 << 13)) & mask; + out[12] = ((w10 >> 44) | (w11 << 20)) & mask; + out[13] = ((w11 >> 37) | (w12 << 27)) & mask; + out[14] = ((w12 >> 30) | (w13 << 34)) & mask; + out[15] = ((w13 >> 23) | (w14 << 41)) & mask; + out[16] = ((w14 >> 16) | (w15 << 48)) & mask; + out[17] = ((w15 >> 9) | (w16 << 55)) & mask; + out[18] = (w16 >> 2) & mask; + out[19] = ((w16 >> 59) | (w17 << 5)) & mask; + out[20] = ((w17 >> 52) | (w18 << 12)) & mask; + out[21] = ((w18 >> 45) | (w19 << 19)) & mask; + out[22] = ((w19 >> 38) | (w20 << 26)) & mask; + out[23] = ((w20 >> 31) | (w21 << 33)) & mask; + out[24] = ((w21 >> 24) | (w22 << 40)) & mask; + out[25] = ((w22 >> 17) | (w23 << 47)) & mask; + out[26] = ((w23 >> 10) | (w24 << 54)) & mask; + out[27] = (w24 >> 3) & mask; + out[28] = ((w24 >> 60) | (w25 << 4)) & mask; + out[29] = ((w25 >> 53) | (w26 << 11)) & mask; + out[30] = ((w26 >> 46) | (w27 << 18)) & mask; + out[31] = ((w27 >> 39) | (w28 << 25)) & mask; + + return in; +} + +inline const uint8_t* unpack58_64(const uint8_t* in, uint64_t* out){ + constexpr uint64_t mask = ((uint64_t{1} << 58) - uint64_t{1}); + + const auto w0 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w1 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w2 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w3 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w4 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w5 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w6 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w7 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w8 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w9 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w10 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w11 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w12 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w13 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w14 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w15 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w16 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w17 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w18 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w19 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w20 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w21 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w22 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w23 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w24 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w25 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w26 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w27 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w28 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + out[0] = (w0) & mask; + out[1] = ((w0 >> 58) | (w1 << 6)) & mask; + out[2] = ((w1 >> 52) | (w2 << 12)) & mask; + out[3] = ((w2 >> 46) | (w3 << 18)) & mask; + out[4] = ((w3 >> 40) | (w4 << 24)) & mask; + out[5] = ((w4 >> 34) | (w5 << 30)) & mask; + out[6] = ((w5 >> 28) | (w6 << 36)) & mask; + out[7] = ((w6 >> 22) | (w7 << 42)) & mask; + out[8] = ((w7 >> 16) | (w8 << 48)) & mask; + out[9] = ((w8 >> 10) | (w9 << 54)) & mask; + out[10] = (w9 >> 4) & mask; + out[11] = ((w9 >> 62) | (w10 << 2)) & mask; + out[12] = ((w10 >> 56) | (w11 << 8)) & mask; + out[13] = ((w11 >> 50) | (w12 << 14)) & mask; + out[14] = ((w12 >> 44) | (w13 << 20)) & mask; + out[15] = ((w13 >> 38) | (w14 << 26)) & mask; + out[16] = ((w14 >> 32) | (w15 << 32)) & mask; + out[17] = ((w15 >> 26) | (w16 << 38)) & mask; + out[18] = ((w16 >> 20) | (w17 << 44)) & mask; + out[19] = ((w17 >> 14) | (w18 << 50)) & mask; + out[20] = ((w18 >> 8) | (w19 << 56)) & mask; + out[21] = (w19 >> 2) & mask; + out[22] = ((w19 >> 60) | (w20 << 4)) & mask; + out[23] = ((w20 >> 54) | (w21 << 10)) & mask; + out[24] = ((w21 >> 48) | (w22 << 16)) & mask; + out[25] = ((w22 >> 42) | (w23 << 22)) & mask; + out[26] = ((w23 >> 36) | (w24 << 28)) & mask; + out[27] = ((w24 >> 30) | (w25 << 34)) & mask; + out[28] = ((w25 >> 24) | (w26 << 40)) & mask; + out[29] = ((w26 >> 18) | (w27 << 46)) & mask; + out[30] = ((w27 >> 12) | (w28 << 52)) & mask; + out[31] = w28 >> 6; + + return in; +} + +inline const uint8_t* unpack59_64(const uint8_t* in, uint64_t* out){ + constexpr uint64_t mask = ((uint64_t{1} << 59) - uint64_t{1}); + + const auto w0 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w1 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w2 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w3 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w4 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w5 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w6 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w7 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w8 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w9 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w10 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w11 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w12 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w13 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w14 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w15 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w16 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w17 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w18 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w19 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w20 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w21 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w22 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w23 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w24 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w25 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w26 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w27 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w28 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + auto w29 = static_cast(util::SafeLoadAs(in)); + w29 = bit_util::FromLittleEndian(w29); + in += 4; + out[0] = (w0) & mask; + out[1] = ((w0 >> 59) | (w1 << 5)) & mask; + out[2] = ((w1 >> 54) | (w2 << 10)) & mask; + out[3] = ((w2 >> 49) | (w3 << 15)) & mask; + out[4] = ((w3 >> 44) | (w4 << 20)) & mask; + out[5] = ((w4 >> 39) | (w5 << 25)) & mask; + out[6] = ((w5 >> 34) | (w6 << 30)) & mask; + out[7] = ((w6 >> 29) | (w7 << 35)) & mask; + out[8] = ((w7 >> 24) | (w8 << 40)) & mask; + out[9] = ((w8 >> 19) | (w9 << 45)) & mask; + out[10] = ((w9 >> 14) | (w10 << 50)) & mask; + out[11] = ((w10 >> 9) | (w11 << 55)) & mask; + out[12] = (w11 >> 4) & mask; + out[13] = ((w11 >> 63) | (w12 << 1)) & mask; + out[14] = ((w12 >> 58) | (w13 << 6)) & mask; + out[15] = ((w13 >> 53) | (w14 << 11)) & mask; + out[16] = ((w14 >> 48) | (w15 << 16)) & mask; + out[17] = ((w15 >> 43) | (w16 << 21)) & mask; + out[18] = ((w16 >> 38) | (w17 << 26)) & mask; + out[19] = ((w17 >> 33) | (w18 << 31)) & mask; + out[20] = ((w18 >> 28) | (w19 << 36)) & mask; + out[21] = ((w19 >> 23) | (w20 << 41)) & mask; + out[22] = ((w20 >> 18) | (w21 << 46)) & mask; + out[23] = ((w21 >> 13) | (w22 << 51)) & mask; + out[24] = ((w22 >> 8) | (w23 << 56)) & mask; + out[25] = (w23 >> 3) & mask; + out[26] = ((w23 >> 62) | (w24 << 2)) & mask; + out[27] = ((w24 >> 57) | (w25 << 7)) & mask; + out[28] = ((w25 >> 52) | (w26 << 12)) & mask; + out[29] = ((w26 >> 47) | (w27 << 17)) & mask; + out[30] = ((w27 >> 42) | (w28 << 22)) & mask; + out[31] = ((w28 >> 37) | (w29 << 27)) & mask; + + return in; +} + +inline const uint8_t* unpack60_64(const uint8_t* in, uint64_t* out){ + constexpr uint64_t mask = ((uint64_t{1} << 60) - uint64_t{1}); + + const auto w0 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w1 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w2 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w3 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w4 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w5 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w6 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w7 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w8 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w9 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w10 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w11 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w12 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w13 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w14 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w15 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w16 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w17 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w18 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w19 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w20 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w21 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w22 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w23 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w24 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w25 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w26 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w27 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w28 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w29 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + out[0] = (w0) & mask; + out[1] = ((w0 >> 60) | (w1 << 4)) & mask; + out[2] = ((w1 >> 56) | (w2 << 8)) & mask; + out[3] = ((w2 >> 52) | (w3 << 12)) & mask; + out[4] = ((w3 >> 48) | (w4 << 16)) & mask; + out[5] = ((w4 >> 44) | (w5 << 20)) & mask; + out[6] = ((w5 >> 40) | (w6 << 24)) & mask; + out[7] = ((w6 >> 36) | (w7 << 28)) & mask; + out[8] = ((w7 >> 32) | (w8 << 32)) & mask; + out[9] = ((w8 >> 28) | (w9 << 36)) & mask; + out[10] = ((w9 >> 24) | (w10 << 40)) & mask; + out[11] = ((w10 >> 20) | (w11 << 44)) & mask; + out[12] = ((w11 >> 16) | (w12 << 48)) & mask; + out[13] = ((w12 >> 12) | (w13 << 52)) & mask; + out[14] = ((w13 >> 8) | (w14 << 56)) & mask; + out[15] = w14 >> 4; + out[16] = (w15) & mask; + out[17] = ((w15 >> 60) | (w16 << 4)) & mask; + out[18] = ((w16 >> 56) | (w17 << 8)) & mask; + out[19] = ((w17 >> 52) | (w18 << 12)) & mask; + out[20] = ((w18 >> 48) | (w19 << 16)) & mask; + out[21] = ((w19 >> 44) | (w20 << 20)) & mask; + out[22] = ((w20 >> 40) | (w21 << 24)) & mask; + out[23] = ((w21 >> 36) | (w22 << 28)) & mask; + out[24] = ((w22 >> 32) | (w23 << 32)) & mask; + out[25] = ((w23 >> 28) | (w24 << 36)) & mask; + out[26] = ((w24 >> 24) | (w25 << 40)) & mask; + out[27] = ((w25 >> 20) | (w26 << 44)) & mask; + out[28] = ((w26 >> 16) | (w27 << 48)) & mask; + out[29] = ((w27 >> 12) | (w28 << 52)) & mask; + out[30] = ((w28 >> 8) | (w29 << 56)) & mask; + out[31] = w29 >> 4; + + return in; +} + +inline const uint8_t* unpack61_64(const uint8_t* in, uint64_t* out){ + constexpr uint64_t mask = ((uint64_t{1} << 61) - uint64_t{1}); + + const auto w0 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w1 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w2 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w3 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w4 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w5 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w6 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w7 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w8 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w9 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w10 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w11 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w12 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w13 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w14 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w15 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w16 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w17 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w18 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w19 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w20 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w21 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w22 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w23 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w24 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w25 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w26 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w27 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w28 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w29 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + auto w30 = static_cast(util::SafeLoadAs(in)); + w30 = bit_util::FromLittleEndian(w30); + in += 4; + out[0] = (w0) & mask; + out[1] = ((w0 >> 61) | (w1 << 3)) & mask; + out[2] = ((w1 >> 58) | (w2 << 6)) & mask; + out[3] = ((w2 >> 55) | (w3 << 9)) & mask; + out[4] = ((w3 >> 52) | (w4 << 12)) & mask; + out[5] = ((w4 >> 49) | (w5 << 15)) & mask; + out[6] = ((w5 >> 46) | (w6 << 18)) & mask; + out[7] = ((w6 >> 43) | (w7 << 21)) & mask; + out[8] = ((w7 >> 40) | (w8 << 24)) & mask; + out[9] = ((w8 >> 37) | (w9 << 27)) & mask; + out[10] = ((w9 >> 34) | (w10 << 30)) & mask; + out[11] = ((w10 >> 31) | (w11 << 33)) & mask; + out[12] = ((w11 >> 28) | (w12 << 36)) & mask; + out[13] = ((w12 >> 25) | (w13 << 39)) & mask; + out[14] = ((w13 >> 22) | (w14 << 42)) & mask; + out[15] = ((w14 >> 19) | (w15 << 45)) & mask; + out[16] = ((w15 >> 16) | (w16 << 48)) & mask; + out[17] = ((w16 >> 13) | (w17 << 51)) & mask; + out[18] = ((w17 >> 10) | (w18 << 54)) & mask; + out[19] = ((w18 >> 7) | (w19 << 57)) & mask; + out[20] = ((w19 >> 4) | (w20 << 60)) & mask; + out[21] = (w20 >> 1) & mask; + out[22] = ((w20 >> 62) | (w21 << 2)) & mask; + out[23] = ((w21 >> 59) | (w22 << 5)) & mask; + out[24] = ((w22 >> 56) | (w23 << 8)) & mask; + out[25] = ((w23 >> 53) | (w24 << 11)) & mask; + out[26] = ((w24 >> 50) | (w25 << 14)) & mask; + out[27] = ((w25 >> 47) | (w26 << 17)) & mask; + out[28] = ((w26 >> 44) | (w27 << 20)) & mask; + out[29] = ((w27 >> 41) | (w28 << 23)) & mask; + out[30] = ((w28 >> 38) | (w29 << 26)) & mask; + out[31] = ((w29 >> 35) | (w30 << 29)) & mask; + + return in; +} + +inline const uint8_t* unpack62_64(const uint8_t* in, uint64_t* out){ + constexpr uint64_t mask = ((uint64_t{1} << 62) - uint64_t{1}); + + const auto w0 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w1 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w2 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w3 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w4 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w5 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w6 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w7 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w8 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w9 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w10 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w11 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w12 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w13 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w14 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w15 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w16 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w17 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w18 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w19 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w20 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w21 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w22 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w23 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w24 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w25 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w26 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w27 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w28 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w29 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w30 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + out[0] = (w0) & mask; + out[1] = ((w0 >> 62) | (w1 << 2)) & mask; + out[2] = ((w1 >> 60) | (w2 << 4)) & mask; + out[3] = ((w2 >> 58) | (w3 << 6)) & mask; + out[4] = ((w3 >> 56) | (w4 << 8)) & mask; + out[5] = ((w4 >> 54) | (w5 << 10)) & mask; + out[6] = ((w5 >> 52) | (w6 << 12)) & mask; + out[7] = ((w6 >> 50) | (w7 << 14)) & mask; + out[8] = ((w7 >> 48) | (w8 << 16)) & mask; + out[9] = ((w8 >> 46) | (w9 << 18)) & mask; + out[10] = ((w9 >> 44) | (w10 << 20)) & mask; + out[11] = ((w10 >> 42) | (w11 << 22)) & mask; + out[12] = ((w11 >> 40) | (w12 << 24)) & mask; + out[13] = ((w12 >> 38) | (w13 << 26)) & mask; + out[14] = ((w13 >> 36) | (w14 << 28)) & mask; + out[15] = ((w14 >> 34) | (w15 << 30)) & mask; + out[16] = ((w15 >> 32) | (w16 << 32)) & mask; + out[17] = ((w16 >> 30) | (w17 << 34)) & mask; + out[18] = ((w17 >> 28) | (w18 << 36)) & mask; + out[19] = ((w18 >> 26) | (w19 << 38)) & mask; + out[20] = ((w19 >> 24) | (w20 << 40)) & mask; + out[21] = ((w20 >> 22) | (w21 << 42)) & mask; + out[22] = ((w21 >> 20) | (w22 << 44)) & mask; + out[23] = ((w22 >> 18) | (w23 << 46)) & mask; + out[24] = ((w23 >> 16) | (w24 << 48)) & mask; + out[25] = ((w24 >> 14) | (w25 << 50)) & mask; + out[26] = ((w25 >> 12) | (w26 << 52)) & mask; + out[27] = ((w26 >> 10) | (w27 << 54)) & mask; + out[28] = ((w27 >> 8) | (w28 << 56)) & mask; + out[29] = ((w28 >> 6) | (w29 << 58)) & mask; + out[30] = ((w29 >> 4) | (w30 << 60)) & mask; + out[31] = w30 >> 2; + + return in; +} + +inline const uint8_t* unpack63_64(const uint8_t* in, uint64_t* out){ + constexpr uint64_t mask = ((uint64_t{1} << 63) - uint64_t{1}); + + const auto w0 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w1 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w2 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w3 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w4 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w5 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w6 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w7 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w8 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w9 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w10 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w11 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w12 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w13 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w14 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w15 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w16 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w17 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w18 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w19 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w20 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w21 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w22 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w23 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w24 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w25 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w26 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w27 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w28 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w29 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + const auto w30 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); + in += 8; + auto w31 = static_cast(util::SafeLoadAs(in)); + w31 = bit_util::FromLittleEndian(w31); + in += 4; + out[0] = (w0) & mask; + out[1] = ((w0 >> 63) | (w1 << 1)) & mask; + out[2] = ((w1 >> 62) | (w2 << 2)) & mask; + out[3] = ((w2 >> 61) | (w3 << 3)) & mask; + out[4] = ((w3 >> 60) | (w4 << 4)) & mask; + out[5] = ((w4 >> 59) | (w5 << 5)) & mask; + out[6] = ((w5 >> 58) | (w6 << 6)) & mask; + out[7] = ((w6 >> 57) | (w7 << 7)) & mask; + out[8] = ((w7 >> 56) | (w8 << 8)) & mask; + out[9] = ((w8 >> 55) | (w9 << 9)) & mask; + out[10] = ((w9 >> 54) | (w10 << 10)) & mask; + out[11] = ((w10 >> 53) | (w11 << 11)) & mask; + out[12] = ((w11 >> 52) | (w12 << 12)) & mask; + out[13] = ((w12 >> 51) | (w13 << 13)) & mask; + out[14] = ((w13 >> 50) | (w14 << 14)) & mask; + out[15] = ((w14 >> 49) | (w15 << 15)) & mask; + out[16] = ((w15 >> 48) | (w16 << 16)) & mask; + out[17] = ((w16 >> 47) | (w17 << 17)) & mask; + out[18] = ((w17 >> 46) | (w18 << 18)) & mask; + out[19] = ((w18 >> 45) | (w19 << 19)) & mask; + out[20] = ((w19 >> 44) | (w20 << 20)) & mask; + out[21] = ((w20 >> 43) | (w21 << 21)) & mask; + out[22] = ((w21 >> 42) | (w22 << 22)) & mask; + out[23] = ((w22 >> 41) | (w23 << 23)) & mask; + out[24] = ((w23 >> 40) | (w24 << 24)) & mask; + out[25] = ((w24 >> 39) | (w25 << 25)) & mask; + out[26] = ((w25 >> 38) | (w26 << 26)) & mask; + out[27] = ((w26 >> 37) | (w27 << 27)) & mask; + out[28] = ((w27 >> 36) | (w28 << 28)) & mask; + out[29] = ((w28 >> 35) | (w29 << 29)) & mask; + out[30] = ((w29 >> 34) | (w30 << 30)) & mask; + out[31] = ((w30 >> 33) | (w31 << 31)) & mask; + + return in; +} + +inline const uint8_t* unpack64_64(const uint8_t* in, uint64_t* out){ + for(int k = 0; k < 32; k += 1) { + auto w = util::SafeLoadAs(in); + out[k] = bit_util::FromLittleEndian(w); + in += 8; + } + return in; +} + +} // namespace arrow::internal + From 23e72689ac141fa51d470e71b7812beaa79d00d5 Mon Sep 17 00:00:00 2001 From: AntoinePrv Date: Tue, 16 Sep 2025 17:49:49 +0200 Subject: [PATCH 08/76] Simplify scalar_unpack increments --- cpp/src/arrow/util/bpacking_scalar_codegen.py | 41 ++++++++++++------- 1 file changed, 26 insertions(+), 15 deletions(-) diff --git a/cpp/src/arrow/util/bpacking_scalar_codegen.py b/cpp/src/arrow/util/bpacking_scalar_codegen.py index 0a373ab2748..8882e9a7c83 100644 --- a/cpp/src/arrow/util/bpacking_scalar_codegen.py +++ b/cpp/src/arrow/util/bpacking_scalar_codegen.py @@ -65,6 +65,11 @@ #include "arrow/util/ubsan.h" namespace arrow::internal { + +template +Int LoadInt(const uint8_t* in) { + return bit_util::FromLittleEndian(util::SafeLoadAs(in)); +} """ FOOTER = """ @@ -118,11 +123,12 @@ def print_unpack_0(self) -> None: def print_unpack_last(self) -> None: print(self.unpack_signature(self.out_bit_width)) print(f" for(int k = 0; k < {self.howmany}; k += 1) {{") - print(f" auto w = util::SafeLoadAs<{self.unsigned_type}>(in);") - print(" out[k] = bit_util::FromLittleEndian(w);") - print(f" in += {self.out_byte_width};") + print( + f" out[k] = LoadInt<{self.unsigned_type}>(" + f"in + (k * {self.out_byte_width}));" + ) print(" }") - print(" return in;") + print(f" return in + ({self.out_byte_width} * {self.howmany});") print("}") def print_unpack_k(self, bit: int) -> None: @@ -136,24 +142,22 @@ def print_unpack_k(self, bit: int) -> None: for k in range(self.howmanywords(bit) - 1): print( - f" const auto w{k} = " - f"bit_util::FromLittleEndian(util::SafeLoadAs<{self.unsigned_type}>(in));" + f" const auto w{k} = LoadInt<{self.unsigned_type}>(" + f"in + {k} * {self.out_byte_width});" ) - print(f" in += {self.out_byte_width};") k = self.howmanywords(bit) - 1 - if self.smart_halve and bit % 2 == 1: + use_smart_halving = self.smart_halve and bit % 2 == 1 + if use_smart_halving: print( - f" auto w{k} = static_cast<{self.unsigned_type}>(util::SafeLoadAs<{self.unsigned_type_half}>(in));" + f" const auto w{k} = static_cast<{self.unsigned_type}>(LoadInt<{self.unsigned_type_half}>(" + f"in + {k} * {self.out_byte_width}));" ) - print(f" w{k} = bit_util::FromLittleEndian(w{k});") - print(f" in += {self.out_byte_width // 2};") else: print( - f" const auto w{k} = " - f"bit_util::FromLittleEndian(util::SafeLoadAs<{self.unsigned_type}>(in));" + f" const auto w{k} = LoadInt<{self.unsigned_type}>(" + f"in + {k} * {self.out_byte_width});" ) - print(f" in += {self.out_byte_width};") for j in range(self.howmany): firstword = j * bit // self.out_bit_width @@ -174,7 +178,14 @@ def print_unpack_k(self, bit: int) -> None: f"(w{firstword + 1} << {secondshift})){maskstr};" ) print("") - print(" return in;") + + if use_smart_halving: + print( + f" return in + ({self.howmanywords(bit) - 1} * {self.out_byte_width}" + f" + {self.out_byte_width // 2});" + ) + else: + print(f" return in + ({self.howmanywords(bit)} * {self.out_byte_width});") print("}") def print_all(self) -> None: From cffc9d447e1c2a744663ee0a59a8ea961259ebcf Mon Sep 17 00:00:00 2001 From: AntoinePrv Date: Tue, 16 Sep 2025 17:50:52 +0200 Subject: [PATCH 09/76] Gen: regenerate bpacking_scalar --- .../util/bpacking_scalar_generated_internal.h | 4797 ++++++----------- 1 file changed, 1623 insertions(+), 3174 deletions(-) diff --git a/cpp/src/arrow/util/bpacking_scalar_generated_internal.h b/cpp/src/arrow/util/bpacking_scalar_generated_internal.h index 74c25885402..d215fcfbc46 100644 --- a/cpp/src/arrow/util/bpacking_scalar_generated_internal.h +++ b/cpp/src/arrow/util/bpacking_scalar_generated_internal.h @@ -38,6 +38,11 @@ namespace arrow::internal { +template +Int LoadInt(const uint8_t* in) { + return bit_util::FromLittleEndian(util::SafeLoadAs(in)); +} + inline const uint8_t* unpack0_32(const uint8_t* in, uint32_t* out){ std::memset(out, 0, 32 * 4); return in; @@ -46,8 +51,7 @@ inline const uint8_t* unpack0_32(const uint8_t* in, uint32_t* out){ inline const uint8_t* unpack1_32(const uint8_t* in, uint32_t* out){ constexpr uint32_t mask = ((uint32_t{1} << 1) - uint32_t{1}); - const auto w0 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; + const auto w0 = LoadInt(in + 0 * 4); out[0] = (w0) & mask; out[1] = (w0 >> 1) & mask; out[2] = (w0 >> 2) & mask; @@ -81,16 +85,14 @@ inline const uint8_t* unpack1_32(const uint8_t* in, uint32_t* out){ out[30] = (w0 >> 30) & mask; out[31] = w0 >> 31; - return in; + return in + (1 * 4); } inline const uint8_t* unpack2_32(const uint8_t* in, uint32_t* out){ constexpr uint32_t mask = ((uint32_t{1} << 2) - uint32_t{1}); - const auto w0 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w1 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; + const auto w0 = LoadInt(in + 0 * 4); + const auto w1 = LoadInt(in + 1 * 4); out[0] = (w0) & mask; out[1] = (w0 >> 2) & mask; out[2] = (w0 >> 4) & mask; @@ -124,18 +126,15 @@ inline const uint8_t* unpack2_32(const uint8_t* in, uint32_t* out){ out[30] = (w1 >> 28) & mask; out[31] = w1 >> 30; - return in; + return in + (2 * 4); } inline const uint8_t* unpack3_32(const uint8_t* in, uint32_t* out){ constexpr uint32_t mask = ((uint32_t{1} << 3) - uint32_t{1}); - const auto w0 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w1 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w2 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; + const auto w0 = LoadInt(in + 0 * 4); + const auto w1 = LoadInt(in + 1 * 4); + const auto w2 = LoadInt(in + 2 * 4); out[0] = (w0) & mask; out[1] = (w0 >> 3) & mask; out[2] = (w0 >> 6) & mask; @@ -169,20 +168,16 @@ inline const uint8_t* unpack3_32(const uint8_t* in, uint32_t* out){ out[30] = (w2 >> 26) & mask; out[31] = w2 >> 29; - return in; + return in + (3 * 4); } inline const uint8_t* unpack4_32(const uint8_t* in, uint32_t* out){ constexpr uint32_t mask = ((uint32_t{1} << 4) - uint32_t{1}); - const auto w0 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w1 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w2 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w3 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; + const auto w0 = LoadInt(in + 0 * 4); + const auto w1 = LoadInt(in + 1 * 4); + const auto w2 = LoadInt(in + 2 * 4); + const auto w3 = LoadInt(in + 3 * 4); out[0] = (w0) & mask; out[1] = (w0 >> 4) & mask; out[2] = (w0 >> 8) & mask; @@ -216,22 +211,17 @@ inline const uint8_t* unpack4_32(const uint8_t* in, uint32_t* out){ out[30] = (w3 >> 24) & mask; out[31] = w3 >> 28; - return in; + return in + (4 * 4); } inline const uint8_t* unpack5_32(const uint8_t* in, uint32_t* out){ constexpr uint32_t mask = ((uint32_t{1} << 5) - uint32_t{1}); - const auto w0 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w1 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w2 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w3 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w4 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; + const auto w0 = LoadInt(in + 0 * 4); + const auto w1 = LoadInt(in + 1 * 4); + const auto w2 = LoadInt(in + 2 * 4); + const auto w3 = LoadInt(in + 3 * 4); + const auto w4 = LoadInt(in + 4 * 4); out[0] = (w0) & mask; out[1] = (w0 >> 5) & mask; out[2] = (w0 >> 10) & mask; @@ -265,24 +255,18 @@ inline const uint8_t* unpack5_32(const uint8_t* in, uint32_t* out){ out[30] = (w4 >> 22) & mask; out[31] = w4 >> 27; - return in; + return in + (5 * 4); } inline const uint8_t* unpack6_32(const uint8_t* in, uint32_t* out){ constexpr uint32_t mask = ((uint32_t{1} << 6) - uint32_t{1}); - const auto w0 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w1 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w2 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w3 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w4 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w5 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; + const auto w0 = LoadInt(in + 0 * 4); + const auto w1 = LoadInt(in + 1 * 4); + const auto w2 = LoadInt(in + 2 * 4); + const auto w3 = LoadInt(in + 3 * 4); + const auto w4 = LoadInt(in + 4 * 4); + const auto w5 = LoadInt(in + 5 * 4); out[0] = (w0) & mask; out[1] = (w0 >> 6) & mask; out[2] = (w0 >> 12) & mask; @@ -316,26 +300,19 @@ inline const uint8_t* unpack6_32(const uint8_t* in, uint32_t* out){ out[30] = (w5 >> 20) & mask; out[31] = w5 >> 26; - return in; + return in + (6 * 4); } inline const uint8_t* unpack7_32(const uint8_t* in, uint32_t* out){ constexpr uint32_t mask = ((uint32_t{1} << 7) - uint32_t{1}); - const auto w0 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w1 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w2 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w3 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w4 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w5 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w6 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; + const auto w0 = LoadInt(in + 0 * 4); + const auto w1 = LoadInt(in + 1 * 4); + const auto w2 = LoadInt(in + 2 * 4); + const auto w3 = LoadInt(in + 3 * 4); + const auto w4 = LoadInt(in + 4 * 4); + const auto w5 = LoadInt(in + 5 * 4); + const auto w6 = LoadInt(in + 6 * 4); out[0] = (w0) & mask; out[1] = (w0 >> 7) & mask; out[2] = (w0 >> 14) & mask; @@ -369,28 +346,20 @@ inline const uint8_t* unpack7_32(const uint8_t* in, uint32_t* out){ out[30] = (w6 >> 18) & mask; out[31] = w6 >> 25; - return in; + return in + (7 * 4); } inline const uint8_t* unpack8_32(const uint8_t* in, uint32_t* out){ constexpr uint32_t mask = ((uint32_t{1} << 8) - uint32_t{1}); - const auto w0 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w1 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w2 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w3 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w4 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w5 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w6 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w7 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; + const auto w0 = LoadInt(in + 0 * 4); + const auto w1 = LoadInt(in + 1 * 4); + const auto w2 = LoadInt(in + 2 * 4); + const auto w3 = LoadInt(in + 3 * 4); + const auto w4 = LoadInt(in + 4 * 4); + const auto w5 = LoadInt(in + 5 * 4); + const auto w6 = LoadInt(in + 6 * 4); + const auto w7 = LoadInt(in + 7 * 4); out[0] = (w0) & mask; out[1] = (w0 >> 8) & mask; out[2] = (w0 >> 16) & mask; @@ -424,30 +393,21 @@ inline const uint8_t* unpack8_32(const uint8_t* in, uint32_t* out){ out[30] = (w7 >> 16) & mask; out[31] = w7 >> 24; - return in; + return in + (8 * 4); } inline const uint8_t* unpack9_32(const uint8_t* in, uint32_t* out){ constexpr uint32_t mask = ((uint32_t{1} << 9) - uint32_t{1}); - const auto w0 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w1 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w2 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w3 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w4 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w5 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w6 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w7 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w8 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; + const auto w0 = LoadInt(in + 0 * 4); + const auto w1 = LoadInt(in + 1 * 4); + const auto w2 = LoadInt(in + 2 * 4); + const auto w3 = LoadInt(in + 3 * 4); + const auto w4 = LoadInt(in + 4 * 4); + const auto w5 = LoadInt(in + 5 * 4); + const auto w6 = LoadInt(in + 6 * 4); + const auto w7 = LoadInt(in + 7 * 4); + const auto w8 = LoadInt(in + 8 * 4); out[0] = (w0) & mask; out[1] = (w0 >> 9) & mask; out[2] = (w0 >> 18) & mask; @@ -481,32 +441,22 @@ inline const uint8_t* unpack9_32(const uint8_t* in, uint32_t* out){ out[30] = (w8 >> 14) & mask; out[31] = w8 >> 23; - return in; + return in + (9 * 4); } inline const uint8_t* unpack10_32(const uint8_t* in, uint32_t* out){ constexpr uint32_t mask = ((uint32_t{1} << 10) - uint32_t{1}); - const auto w0 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w1 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w2 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w3 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w4 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w5 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w6 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w7 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w8 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w9 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; + const auto w0 = LoadInt(in + 0 * 4); + const auto w1 = LoadInt(in + 1 * 4); + const auto w2 = LoadInt(in + 2 * 4); + const auto w3 = LoadInt(in + 3 * 4); + const auto w4 = LoadInt(in + 4 * 4); + const auto w5 = LoadInt(in + 5 * 4); + const auto w6 = LoadInt(in + 6 * 4); + const auto w7 = LoadInt(in + 7 * 4); + const auto w8 = LoadInt(in + 8 * 4); + const auto w9 = LoadInt(in + 9 * 4); out[0] = (w0) & mask; out[1] = (w0 >> 10) & mask; out[2] = (w0 >> 20) & mask; @@ -540,34 +490,23 @@ inline const uint8_t* unpack10_32(const uint8_t* in, uint32_t* out){ out[30] = (w9 >> 12) & mask; out[31] = w9 >> 22; - return in; + return in + (10 * 4); } inline const uint8_t* unpack11_32(const uint8_t* in, uint32_t* out){ constexpr uint32_t mask = ((uint32_t{1} << 11) - uint32_t{1}); - const auto w0 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w1 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w2 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w3 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w4 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w5 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w6 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w7 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w8 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w9 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w10 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; + const auto w0 = LoadInt(in + 0 * 4); + const auto w1 = LoadInt(in + 1 * 4); + const auto w2 = LoadInt(in + 2 * 4); + const auto w3 = LoadInt(in + 3 * 4); + const auto w4 = LoadInt(in + 4 * 4); + const auto w5 = LoadInt(in + 5 * 4); + const auto w6 = LoadInt(in + 6 * 4); + const auto w7 = LoadInt(in + 7 * 4); + const auto w8 = LoadInt(in + 8 * 4); + const auto w9 = LoadInt(in + 9 * 4); + const auto w10 = LoadInt(in + 10 * 4); out[0] = (w0) & mask; out[1] = (w0 >> 11) & mask; out[2] = ((w0 >> 22) | (w1 << 10)) & mask; @@ -601,36 +540,24 @@ inline const uint8_t* unpack11_32(const uint8_t* in, uint32_t* out){ out[30] = (w10 >> 10) & mask; out[31] = w10 >> 21; - return in; + return in + (11 * 4); } inline const uint8_t* unpack12_32(const uint8_t* in, uint32_t* out){ constexpr uint32_t mask = ((uint32_t{1} << 12) - uint32_t{1}); - const auto w0 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w1 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w2 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w3 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w4 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w5 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w6 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w7 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w8 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w9 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w10 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w11 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; + const auto w0 = LoadInt(in + 0 * 4); + const auto w1 = LoadInt(in + 1 * 4); + const auto w2 = LoadInt(in + 2 * 4); + const auto w3 = LoadInt(in + 3 * 4); + const auto w4 = LoadInt(in + 4 * 4); + const auto w5 = LoadInt(in + 5 * 4); + const auto w6 = LoadInt(in + 6 * 4); + const auto w7 = LoadInt(in + 7 * 4); + const auto w8 = LoadInt(in + 8 * 4); + const auto w9 = LoadInt(in + 9 * 4); + const auto w10 = LoadInt(in + 10 * 4); + const auto w11 = LoadInt(in + 11 * 4); out[0] = (w0) & mask; out[1] = (w0 >> 12) & mask; out[2] = ((w0 >> 24) | (w1 << 8)) & mask; @@ -664,38 +591,25 @@ inline const uint8_t* unpack12_32(const uint8_t* in, uint32_t* out){ out[30] = (w11 >> 8) & mask; out[31] = w11 >> 20; - return in; + return in + (12 * 4); } inline const uint8_t* unpack13_32(const uint8_t* in, uint32_t* out){ constexpr uint32_t mask = ((uint32_t{1} << 13) - uint32_t{1}); - const auto w0 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w1 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w2 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w3 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w4 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w5 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w6 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w7 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w8 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w9 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w10 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w11 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w12 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; + const auto w0 = LoadInt(in + 0 * 4); + const auto w1 = LoadInt(in + 1 * 4); + const auto w2 = LoadInt(in + 2 * 4); + const auto w3 = LoadInt(in + 3 * 4); + const auto w4 = LoadInt(in + 4 * 4); + const auto w5 = LoadInt(in + 5 * 4); + const auto w6 = LoadInt(in + 6 * 4); + const auto w7 = LoadInt(in + 7 * 4); + const auto w8 = LoadInt(in + 8 * 4); + const auto w9 = LoadInt(in + 9 * 4); + const auto w10 = LoadInt(in + 10 * 4); + const auto w11 = LoadInt(in + 11 * 4); + const auto w12 = LoadInt(in + 12 * 4); out[0] = (w0) & mask; out[1] = (w0 >> 13) & mask; out[2] = ((w0 >> 26) | (w1 << 6)) & mask; @@ -729,40 +643,26 @@ inline const uint8_t* unpack13_32(const uint8_t* in, uint32_t* out){ out[30] = (w12 >> 6) & mask; out[31] = w12 >> 19; - return in; + return in + (13 * 4); } inline const uint8_t* unpack14_32(const uint8_t* in, uint32_t* out){ constexpr uint32_t mask = ((uint32_t{1} << 14) - uint32_t{1}); - const auto w0 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w1 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w2 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w3 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w4 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w5 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w6 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w7 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w8 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w9 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w10 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w11 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w12 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w13 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; + const auto w0 = LoadInt(in + 0 * 4); + const auto w1 = LoadInt(in + 1 * 4); + const auto w2 = LoadInt(in + 2 * 4); + const auto w3 = LoadInt(in + 3 * 4); + const auto w4 = LoadInt(in + 4 * 4); + const auto w5 = LoadInt(in + 5 * 4); + const auto w6 = LoadInt(in + 6 * 4); + const auto w7 = LoadInt(in + 7 * 4); + const auto w8 = LoadInt(in + 8 * 4); + const auto w9 = LoadInt(in + 9 * 4); + const auto w10 = LoadInt(in + 10 * 4); + const auto w11 = LoadInt(in + 11 * 4); + const auto w12 = LoadInt(in + 12 * 4); + const auto w13 = LoadInt(in + 13 * 4); out[0] = (w0) & mask; out[1] = (w0 >> 14) & mask; out[2] = ((w0 >> 28) | (w1 << 4)) & mask; @@ -796,42 +696,27 @@ inline const uint8_t* unpack14_32(const uint8_t* in, uint32_t* out){ out[30] = (w13 >> 4) & mask; out[31] = w13 >> 18; - return in; + return in + (14 * 4); } inline const uint8_t* unpack15_32(const uint8_t* in, uint32_t* out){ constexpr uint32_t mask = ((uint32_t{1} << 15) - uint32_t{1}); - const auto w0 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w1 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w2 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w3 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w4 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w5 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w6 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w7 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w8 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w9 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w10 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w11 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w12 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w13 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w14 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; + const auto w0 = LoadInt(in + 0 * 4); + const auto w1 = LoadInt(in + 1 * 4); + const auto w2 = LoadInt(in + 2 * 4); + const auto w3 = LoadInt(in + 3 * 4); + const auto w4 = LoadInt(in + 4 * 4); + const auto w5 = LoadInt(in + 5 * 4); + const auto w6 = LoadInt(in + 6 * 4); + const auto w7 = LoadInt(in + 7 * 4); + const auto w8 = LoadInt(in + 8 * 4); + const auto w9 = LoadInt(in + 9 * 4); + const auto w10 = LoadInt(in + 10 * 4); + const auto w11 = LoadInt(in + 11 * 4); + const auto w12 = LoadInt(in + 12 * 4); + const auto w13 = LoadInt(in + 13 * 4); + const auto w14 = LoadInt(in + 14 * 4); out[0] = (w0) & mask; out[1] = (w0 >> 15) & mask; out[2] = ((w0 >> 30) | (w1 << 2)) & mask; @@ -865,44 +750,28 @@ inline const uint8_t* unpack15_32(const uint8_t* in, uint32_t* out){ out[30] = (w14 >> 2) & mask; out[31] = w14 >> 17; - return in; + return in + (15 * 4); } inline const uint8_t* unpack16_32(const uint8_t* in, uint32_t* out){ constexpr uint32_t mask = ((uint32_t{1} << 16) - uint32_t{1}); - const auto w0 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w1 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w2 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w3 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w4 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w5 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w6 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w7 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w8 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w9 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w10 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w11 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w12 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w13 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w14 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w15 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; + const auto w0 = LoadInt(in + 0 * 4); + const auto w1 = LoadInt(in + 1 * 4); + const auto w2 = LoadInt(in + 2 * 4); + const auto w3 = LoadInt(in + 3 * 4); + const auto w4 = LoadInt(in + 4 * 4); + const auto w5 = LoadInt(in + 5 * 4); + const auto w6 = LoadInt(in + 6 * 4); + const auto w7 = LoadInt(in + 7 * 4); + const auto w8 = LoadInt(in + 8 * 4); + const auto w9 = LoadInt(in + 9 * 4); + const auto w10 = LoadInt(in + 10 * 4); + const auto w11 = LoadInt(in + 11 * 4); + const auto w12 = LoadInt(in + 12 * 4); + const auto w13 = LoadInt(in + 13 * 4); + const auto w14 = LoadInt(in + 14 * 4); + const auto w15 = LoadInt(in + 15 * 4); out[0] = (w0) & mask; out[1] = w0 >> 16; out[2] = (w1) & mask; @@ -936,46 +805,29 @@ inline const uint8_t* unpack16_32(const uint8_t* in, uint32_t* out){ out[30] = (w15) & mask; out[31] = w15 >> 16; - return in; + return in + (16 * 4); } inline const uint8_t* unpack17_32(const uint8_t* in, uint32_t* out){ constexpr uint32_t mask = ((uint32_t{1} << 17) - uint32_t{1}); - const auto w0 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w1 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w2 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w3 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w4 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w5 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w6 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w7 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w8 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w9 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w10 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w11 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w12 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w13 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w14 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w15 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w16 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; + const auto w0 = LoadInt(in + 0 * 4); + const auto w1 = LoadInt(in + 1 * 4); + const auto w2 = LoadInt(in + 2 * 4); + const auto w3 = LoadInt(in + 3 * 4); + const auto w4 = LoadInt(in + 4 * 4); + const auto w5 = LoadInt(in + 5 * 4); + const auto w6 = LoadInt(in + 6 * 4); + const auto w7 = LoadInt(in + 7 * 4); + const auto w8 = LoadInt(in + 8 * 4); + const auto w9 = LoadInt(in + 9 * 4); + const auto w10 = LoadInt(in + 10 * 4); + const auto w11 = LoadInt(in + 11 * 4); + const auto w12 = LoadInt(in + 12 * 4); + const auto w13 = LoadInt(in + 13 * 4); + const auto w14 = LoadInt(in + 14 * 4); + const auto w15 = LoadInt(in + 15 * 4); + const auto w16 = LoadInt(in + 16 * 4); out[0] = (w0) & mask; out[1] = ((w0 >> 17) | (w1 << 15)) & mask; out[2] = (w1 >> 2) & mask; @@ -1009,48 +861,30 @@ inline const uint8_t* unpack17_32(const uint8_t* in, uint32_t* out){ out[30] = ((w15 >> 30) | (w16 << 2)) & mask; out[31] = w16 >> 15; - return in; + return in + (17 * 4); } inline const uint8_t* unpack18_32(const uint8_t* in, uint32_t* out){ constexpr uint32_t mask = ((uint32_t{1} << 18) - uint32_t{1}); - const auto w0 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w1 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w2 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w3 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w4 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w5 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w6 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w7 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w8 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w9 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w10 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w11 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w12 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w13 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w14 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w15 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w16 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w17 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; + const auto w0 = LoadInt(in + 0 * 4); + const auto w1 = LoadInt(in + 1 * 4); + const auto w2 = LoadInt(in + 2 * 4); + const auto w3 = LoadInt(in + 3 * 4); + const auto w4 = LoadInt(in + 4 * 4); + const auto w5 = LoadInt(in + 5 * 4); + const auto w6 = LoadInt(in + 6 * 4); + const auto w7 = LoadInt(in + 7 * 4); + const auto w8 = LoadInt(in + 8 * 4); + const auto w9 = LoadInt(in + 9 * 4); + const auto w10 = LoadInt(in + 10 * 4); + const auto w11 = LoadInt(in + 11 * 4); + const auto w12 = LoadInt(in + 12 * 4); + const auto w13 = LoadInt(in + 13 * 4); + const auto w14 = LoadInt(in + 14 * 4); + const auto w15 = LoadInt(in + 15 * 4); + const auto w16 = LoadInt(in + 16 * 4); + const auto w17 = LoadInt(in + 17 * 4); out[0] = (w0) & mask; out[1] = ((w0 >> 18) | (w1 << 14)) & mask; out[2] = (w1 >> 4) & mask; @@ -1084,50 +918,31 @@ inline const uint8_t* unpack18_32(const uint8_t* in, uint32_t* out){ out[30] = ((w16 >> 28) | (w17 << 4)) & mask; out[31] = w17 >> 14; - return in; + return in + (18 * 4); } inline const uint8_t* unpack19_32(const uint8_t* in, uint32_t* out){ constexpr uint32_t mask = ((uint32_t{1} << 19) - uint32_t{1}); - const auto w0 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w1 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w2 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w3 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w4 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w5 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w6 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w7 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w8 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w9 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w10 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w11 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w12 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w13 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w14 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w15 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w16 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w17 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w18 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; + const auto w0 = LoadInt(in + 0 * 4); + const auto w1 = LoadInt(in + 1 * 4); + const auto w2 = LoadInt(in + 2 * 4); + const auto w3 = LoadInt(in + 3 * 4); + const auto w4 = LoadInt(in + 4 * 4); + const auto w5 = LoadInt(in + 5 * 4); + const auto w6 = LoadInt(in + 6 * 4); + const auto w7 = LoadInt(in + 7 * 4); + const auto w8 = LoadInt(in + 8 * 4); + const auto w9 = LoadInt(in + 9 * 4); + const auto w10 = LoadInt(in + 10 * 4); + const auto w11 = LoadInt(in + 11 * 4); + const auto w12 = LoadInt(in + 12 * 4); + const auto w13 = LoadInt(in + 13 * 4); + const auto w14 = LoadInt(in + 14 * 4); + const auto w15 = LoadInt(in + 15 * 4); + const auto w16 = LoadInt(in + 16 * 4); + const auto w17 = LoadInt(in + 17 * 4); + const auto w18 = LoadInt(in + 18 * 4); out[0] = (w0) & mask; out[1] = ((w0 >> 19) | (w1 << 13)) & mask; out[2] = (w1 >> 6) & mask; @@ -1161,52 +976,32 @@ inline const uint8_t* unpack19_32(const uint8_t* in, uint32_t* out){ out[30] = ((w17 >> 26) | (w18 << 6)) & mask; out[31] = w18 >> 13; - return in; + return in + (19 * 4); } inline const uint8_t* unpack20_32(const uint8_t* in, uint32_t* out){ constexpr uint32_t mask = ((uint32_t{1} << 20) - uint32_t{1}); - const auto w0 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w1 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w2 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w3 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w4 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w5 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w6 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w7 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w8 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w9 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w10 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w11 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w12 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w13 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w14 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w15 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w16 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w17 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w18 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w19 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; + const auto w0 = LoadInt(in + 0 * 4); + const auto w1 = LoadInt(in + 1 * 4); + const auto w2 = LoadInt(in + 2 * 4); + const auto w3 = LoadInt(in + 3 * 4); + const auto w4 = LoadInt(in + 4 * 4); + const auto w5 = LoadInt(in + 5 * 4); + const auto w6 = LoadInt(in + 6 * 4); + const auto w7 = LoadInt(in + 7 * 4); + const auto w8 = LoadInt(in + 8 * 4); + const auto w9 = LoadInt(in + 9 * 4); + const auto w10 = LoadInt(in + 10 * 4); + const auto w11 = LoadInt(in + 11 * 4); + const auto w12 = LoadInt(in + 12 * 4); + const auto w13 = LoadInt(in + 13 * 4); + const auto w14 = LoadInt(in + 14 * 4); + const auto w15 = LoadInt(in + 15 * 4); + const auto w16 = LoadInt(in + 16 * 4); + const auto w17 = LoadInt(in + 17 * 4); + const auto w18 = LoadInt(in + 18 * 4); + const auto w19 = LoadInt(in + 19 * 4); out[0] = (w0) & mask; out[1] = ((w0 >> 20) | (w1 << 12)) & mask; out[2] = (w1 >> 8) & mask; @@ -1240,54 +1035,33 @@ inline const uint8_t* unpack20_32(const uint8_t* in, uint32_t* out){ out[30] = ((w18 >> 24) | (w19 << 8)) & mask; out[31] = w19 >> 12; - return in; + return in + (20 * 4); } inline const uint8_t* unpack21_32(const uint8_t* in, uint32_t* out){ constexpr uint32_t mask = ((uint32_t{1} << 21) - uint32_t{1}); - const auto w0 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w1 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w2 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w3 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w4 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w5 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w6 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w7 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w8 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w9 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w10 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w11 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w12 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w13 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w14 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w15 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w16 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w17 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w18 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w19 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w20 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; + const auto w0 = LoadInt(in + 0 * 4); + const auto w1 = LoadInt(in + 1 * 4); + const auto w2 = LoadInt(in + 2 * 4); + const auto w3 = LoadInt(in + 3 * 4); + const auto w4 = LoadInt(in + 4 * 4); + const auto w5 = LoadInt(in + 5 * 4); + const auto w6 = LoadInt(in + 6 * 4); + const auto w7 = LoadInt(in + 7 * 4); + const auto w8 = LoadInt(in + 8 * 4); + const auto w9 = LoadInt(in + 9 * 4); + const auto w10 = LoadInt(in + 10 * 4); + const auto w11 = LoadInt(in + 11 * 4); + const auto w12 = LoadInt(in + 12 * 4); + const auto w13 = LoadInt(in + 13 * 4); + const auto w14 = LoadInt(in + 14 * 4); + const auto w15 = LoadInt(in + 15 * 4); + const auto w16 = LoadInt(in + 16 * 4); + const auto w17 = LoadInt(in + 17 * 4); + const auto w18 = LoadInt(in + 18 * 4); + const auto w19 = LoadInt(in + 19 * 4); + const auto w20 = LoadInt(in + 20 * 4); out[0] = (w0) & mask; out[1] = ((w0 >> 21) | (w1 << 11)) & mask; out[2] = (w1 >> 10) & mask; @@ -1321,56 +1095,34 @@ inline const uint8_t* unpack21_32(const uint8_t* in, uint32_t* out){ out[30] = ((w19 >> 22) | (w20 << 10)) & mask; out[31] = w20 >> 11; - return in; + return in + (21 * 4); } inline const uint8_t* unpack22_32(const uint8_t* in, uint32_t* out){ constexpr uint32_t mask = ((uint32_t{1} << 22) - uint32_t{1}); - const auto w0 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w1 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w2 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w3 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w4 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w5 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w6 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w7 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w8 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w9 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w10 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w11 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w12 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w13 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w14 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w15 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w16 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w17 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w18 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w19 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w20 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w21 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; + const auto w0 = LoadInt(in + 0 * 4); + const auto w1 = LoadInt(in + 1 * 4); + const auto w2 = LoadInt(in + 2 * 4); + const auto w3 = LoadInt(in + 3 * 4); + const auto w4 = LoadInt(in + 4 * 4); + const auto w5 = LoadInt(in + 5 * 4); + const auto w6 = LoadInt(in + 6 * 4); + const auto w7 = LoadInt(in + 7 * 4); + const auto w8 = LoadInt(in + 8 * 4); + const auto w9 = LoadInt(in + 9 * 4); + const auto w10 = LoadInt(in + 10 * 4); + const auto w11 = LoadInt(in + 11 * 4); + const auto w12 = LoadInt(in + 12 * 4); + const auto w13 = LoadInt(in + 13 * 4); + const auto w14 = LoadInt(in + 14 * 4); + const auto w15 = LoadInt(in + 15 * 4); + const auto w16 = LoadInt(in + 16 * 4); + const auto w17 = LoadInt(in + 17 * 4); + const auto w18 = LoadInt(in + 18 * 4); + const auto w19 = LoadInt(in + 19 * 4); + const auto w20 = LoadInt(in + 20 * 4); + const auto w21 = LoadInt(in + 21 * 4); out[0] = (w0) & mask; out[1] = ((w0 >> 22) | (w1 << 10)) & mask; out[2] = ((w1 >> 12) | (w2 << 20)) & mask; @@ -1404,58 +1156,35 @@ inline const uint8_t* unpack22_32(const uint8_t* in, uint32_t* out){ out[30] = ((w20 >> 20) | (w21 << 12)) & mask; out[31] = w21 >> 10; - return in; + return in + (22 * 4); } inline const uint8_t* unpack23_32(const uint8_t* in, uint32_t* out){ constexpr uint32_t mask = ((uint32_t{1} << 23) - uint32_t{1}); - const auto w0 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w1 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w2 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w3 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w4 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w5 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w6 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w7 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w8 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w9 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w10 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w11 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w12 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w13 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w14 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w15 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w16 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w17 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w18 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w19 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w20 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w21 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w22 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; + const auto w0 = LoadInt(in + 0 * 4); + const auto w1 = LoadInt(in + 1 * 4); + const auto w2 = LoadInt(in + 2 * 4); + const auto w3 = LoadInt(in + 3 * 4); + const auto w4 = LoadInt(in + 4 * 4); + const auto w5 = LoadInt(in + 5 * 4); + const auto w6 = LoadInt(in + 6 * 4); + const auto w7 = LoadInt(in + 7 * 4); + const auto w8 = LoadInt(in + 8 * 4); + const auto w9 = LoadInt(in + 9 * 4); + const auto w10 = LoadInt(in + 10 * 4); + const auto w11 = LoadInt(in + 11 * 4); + const auto w12 = LoadInt(in + 12 * 4); + const auto w13 = LoadInt(in + 13 * 4); + const auto w14 = LoadInt(in + 14 * 4); + const auto w15 = LoadInt(in + 15 * 4); + const auto w16 = LoadInt(in + 16 * 4); + const auto w17 = LoadInt(in + 17 * 4); + const auto w18 = LoadInt(in + 18 * 4); + const auto w19 = LoadInt(in + 19 * 4); + const auto w20 = LoadInt(in + 20 * 4); + const auto w21 = LoadInt(in + 21 * 4); + const auto w22 = LoadInt(in + 22 * 4); out[0] = (w0) & mask; out[1] = ((w0 >> 23) | (w1 << 9)) & mask; out[2] = ((w1 >> 14) | (w2 << 18)) & mask; @@ -1489,60 +1218,36 @@ inline const uint8_t* unpack23_32(const uint8_t* in, uint32_t* out){ out[30] = ((w21 >> 18) | (w22 << 14)) & mask; out[31] = w22 >> 9; - return in; + return in + (23 * 4); } inline const uint8_t* unpack24_32(const uint8_t* in, uint32_t* out){ constexpr uint32_t mask = ((uint32_t{1} << 24) - uint32_t{1}); - const auto w0 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w1 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w2 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w3 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w4 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w5 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w6 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w7 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w8 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w9 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w10 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w11 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w12 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w13 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w14 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w15 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w16 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w17 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w18 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w19 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w20 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w21 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w22 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w23 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; + const auto w0 = LoadInt(in + 0 * 4); + const auto w1 = LoadInt(in + 1 * 4); + const auto w2 = LoadInt(in + 2 * 4); + const auto w3 = LoadInt(in + 3 * 4); + const auto w4 = LoadInt(in + 4 * 4); + const auto w5 = LoadInt(in + 5 * 4); + const auto w6 = LoadInt(in + 6 * 4); + const auto w7 = LoadInt(in + 7 * 4); + const auto w8 = LoadInt(in + 8 * 4); + const auto w9 = LoadInt(in + 9 * 4); + const auto w10 = LoadInt(in + 10 * 4); + const auto w11 = LoadInt(in + 11 * 4); + const auto w12 = LoadInt(in + 12 * 4); + const auto w13 = LoadInt(in + 13 * 4); + const auto w14 = LoadInt(in + 14 * 4); + const auto w15 = LoadInt(in + 15 * 4); + const auto w16 = LoadInt(in + 16 * 4); + const auto w17 = LoadInt(in + 17 * 4); + const auto w18 = LoadInt(in + 18 * 4); + const auto w19 = LoadInt(in + 19 * 4); + const auto w20 = LoadInt(in + 20 * 4); + const auto w21 = LoadInt(in + 21 * 4); + const auto w22 = LoadInt(in + 22 * 4); + const auto w23 = LoadInt(in + 23 * 4); out[0] = (w0) & mask; out[1] = ((w0 >> 24) | (w1 << 8)) & mask; out[2] = ((w1 >> 16) | (w2 << 16)) & mask; @@ -1576,62 +1281,37 @@ inline const uint8_t* unpack24_32(const uint8_t* in, uint32_t* out){ out[30] = ((w22 >> 16) | (w23 << 16)) & mask; out[31] = w23 >> 8; - return in; + return in + (24 * 4); } inline const uint8_t* unpack25_32(const uint8_t* in, uint32_t* out){ constexpr uint32_t mask = ((uint32_t{1} << 25) - uint32_t{1}); - const auto w0 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w1 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w2 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w3 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w4 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w5 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w6 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w7 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w8 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w9 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w10 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w11 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w12 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w13 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w14 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w15 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w16 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w17 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w18 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w19 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w20 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w21 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w22 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w23 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w24 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; + const auto w0 = LoadInt(in + 0 * 4); + const auto w1 = LoadInt(in + 1 * 4); + const auto w2 = LoadInt(in + 2 * 4); + const auto w3 = LoadInt(in + 3 * 4); + const auto w4 = LoadInt(in + 4 * 4); + const auto w5 = LoadInt(in + 5 * 4); + const auto w6 = LoadInt(in + 6 * 4); + const auto w7 = LoadInt(in + 7 * 4); + const auto w8 = LoadInt(in + 8 * 4); + const auto w9 = LoadInt(in + 9 * 4); + const auto w10 = LoadInt(in + 10 * 4); + const auto w11 = LoadInt(in + 11 * 4); + const auto w12 = LoadInt(in + 12 * 4); + const auto w13 = LoadInt(in + 13 * 4); + const auto w14 = LoadInt(in + 14 * 4); + const auto w15 = LoadInt(in + 15 * 4); + const auto w16 = LoadInt(in + 16 * 4); + const auto w17 = LoadInt(in + 17 * 4); + const auto w18 = LoadInt(in + 18 * 4); + const auto w19 = LoadInt(in + 19 * 4); + const auto w20 = LoadInt(in + 20 * 4); + const auto w21 = LoadInt(in + 21 * 4); + const auto w22 = LoadInt(in + 22 * 4); + const auto w23 = LoadInt(in + 23 * 4); + const auto w24 = LoadInt(in + 24 * 4); out[0] = (w0) & mask; out[1] = ((w0 >> 25) | (w1 << 7)) & mask; out[2] = ((w1 >> 18) | (w2 << 14)) & mask; @@ -1665,64 +1345,38 @@ inline const uint8_t* unpack25_32(const uint8_t* in, uint32_t* out){ out[30] = ((w23 >> 14) | (w24 << 18)) & mask; out[31] = w24 >> 7; - return in; + return in + (25 * 4); } inline const uint8_t* unpack26_32(const uint8_t* in, uint32_t* out){ constexpr uint32_t mask = ((uint32_t{1} << 26) - uint32_t{1}); - const auto w0 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w1 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w2 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w3 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w4 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w5 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w6 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w7 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w8 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w9 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w10 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w11 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w12 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w13 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w14 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w15 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w16 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w17 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w18 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w19 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w20 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w21 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w22 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w23 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w24 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w25 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; + const auto w0 = LoadInt(in + 0 * 4); + const auto w1 = LoadInt(in + 1 * 4); + const auto w2 = LoadInt(in + 2 * 4); + const auto w3 = LoadInt(in + 3 * 4); + const auto w4 = LoadInt(in + 4 * 4); + const auto w5 = LoadInt(in + 5 * 4); + const auto w6 = LoadInt(in + 6 * 4); + const auto w7 = LoadInt(in + 7 * 4); + const auto w8 = LoadInt(in + 8 * 4); + const auto w9 = LoadInt(in + 9 * 4); + const auto w10 = LoadInt(in + 10 * 4); + const auto w11 = LoadInt(in + 11 * 4); + const auto w12 = LoadInt(in + 12 * 4); + const auto w13 = LoadInt(in + 13 * 4); + const auto w14 = LoadInt(in + 14 * 4); + const auto w15 = LoadInt(in + 15 * 4); + const auto w16 = LoadInt(in + 16 * 4); + const auto w17 = LoadInt(in + 17 * 4); + const auto w18 = LoadInt(in + 18 * 4); + const auto w19 = LoadInt(in + 19 * 4); + const auto w20 = LoadInt(in + 20 * 4); + const auto w21 = LoadInt(in + 21 * 4); + const auto w22 = LoadInt(in + 22 * 4); + const auto w23 = LoadInt(in + 23 * 4); + const auto w24 = LoadInt(in + 24 * 4); + const auto w25 = LoadInt(in + 25 * 4); out[0] = (w0) & mask; out[1] = ((w0 >> 26) | (w1 << 6)) & mask; out[2] = ((w1 >> 20) | (w2 << 12)) & mask; @@ -1756,66 +1410,39 @@ inline const uint8_t* unpack26_32(const uint8_t* in, uint32_t* out){ out[30] = ((w24 >> 12) | (w25 << 20)) & mask; out[31] = w25 >> 6; - return in; + return in + (26 * 4); } inline const uint8_t* unpack27_32(const uint8_t* in, uint32_t* out){ constexpr uint32_t mask = ((uint32_t{1} << 27) - uint32_t{1}); - const auto w0 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w1 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w2 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w3 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w4 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w5 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w6 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w7 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w8 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w9 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w10 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w11 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w12 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w13 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w14 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w15 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w16 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w17 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w18 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w19 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w20 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w21 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w22 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w23 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w24 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w25 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w26 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; + const auto w0 = LoadInt(in + 0 * 4); + const auto w1 = LoadInt(in + 1 * 4); + const auto w2 = LoadInt(in + 2 * 4); + const auto w3 = LoadInt(in + 3 * 4); + const auto w4 = LoadInt(in + 4 * 4); + const auto w5 = LoadInt(in + 5 * 4); + const auto w6 = LoadInt(in + 6 * 4); + const auto w7 = LoadInt(in + 7 * 4); + const auto w8 = LoadInt(in + 8 * 4); + const auto w9 = LoadInt(in + 9 * 4); + const auto w10 = LoadInt(in + 10 * 4); + const auto w11 = LoadInt(in + 11 * 4); + const auto w12 = LoadInt(in + 12 * 4); + const auto w13 = LoadInt(in + 13 * 4); + const auto w14 = LoadInt(in + 14 * 4); + const auto w15 = LoadInt(in + 15 * 4); + const auto w16 = LoadInt(in + 16 * 4); + const auto w17 = LoadInt(in + 17 * 4); + const auto w18 = LoadInt(in + 18 * 4); + const auto w19 = LoadInt(in + 19 * 4); + const auto w20 = LoadInt(in + 20 * 4); + const auto w21 = LoadInt(in + 21 * 4); + const auto w22 = LoadInt(in + 22 * 4); + const auto w23 = LoadInt(in + 23 * 4); + const auto w24 = LoadInt(in + 24 * 4); + const auto w25 = LoadInt(in + 25 * 4); + const auto w26 = LoadInt(in + 26 * 4); out[0] = (w0) & mask; out[1] = ((w0 >> 27) | (w1 << 5)) & mask; out[2] = ((w1 >> 22) | (w2 << 10)) & mask; @@ -1849,68 +1476,40 @@ inline const uint8_t* unpack27_32(const uint8_t* in, uint32_t* out){ out[30] = ((w25 >> 10) | (w26 << 22)) & mask; out[31] = w26 >> 5; - return in; + return in + (27 * 4); } inline const uint8_t* unpack28_32(const uint8_t* in, uint32_t* out){ constexpr uint32_t mask = ((uint32_t{1} << 28) - uint32_t{1}); - const auto w0 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w1 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w2 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w3 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w4 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w5 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w6 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w7 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w8 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w9 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w10 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w11 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w12 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w13 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w14 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w15 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w16 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w17 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w18 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w19 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w20 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w21 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w22 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w23 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w24 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w25 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w26 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w27 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; + const auto w0 = LoadInt(in + 0 * 4); + const auto w1 = LoadInt(in + 1 * 4); + const auto w2 = LoadInt(in + 2 * 4); + const auto w3 = LoadInt(in + 3 * 4); + const auto w4 = LoadInt(in + 4 * 4); + const auto w5 = LoadInt(in + 5 * 4); + const auto w6 = LoadInt(in + 6 * 4); + const auto w7 = LoadInt(in + 7 * 4); + const auto w8 = LoadInt(in + 8 * 4); + const auto w9 = LoadInt(in + 9 * 4); + const auto w10 = LoadInt(in + 10 * 4); + const auto w11 = LoadInt(in + 11 * 4); + const auto w12 = LoadInt(in + 12 * 4); + const auto w13 = LoadInt(in + 13 * 4); + const auto w14 = LoadInt(in + 14 * 4); + const auto w15 = LoadInt(in + 15 * 4); + const auto w16 = LoadInt(in + 16 * 4); + const auto w17 = LoadInt(in + 17 * 4); + const auto w18 = LoadInt(in + 18 * 4); + const auto w19 = LoadInt(in + 19 * 4); + const auto w20 = LoadInt(in + 20 * 4); + const auto w21 = LoadInt(in + 21 * 4); + const auto w22 = LoadInt(in + 22 * 4); + const auto w23 = LoadInt(in + 23 * 4); + const auto w24 = LoadInt(in + 24 * 4); + const auto w25 = LoadInt(in + 25 * 4); + const auto w26 = LoadInt(in + 26 * 4); + const auto w27 = LoadInt(in + 27 * 4); out[0] = (w0) & mask; out[1] = ((w0 >> 28) | (w1 << 4)) & mask; out[2] = ((w1 >> 24) | (w2 << 8)) & mask; @@ -1944,70 +1543,41 @@ inline const uint8_t* unpack28_32(const uint8_t* in, uint32_t* out){ out[30] = ((w26 >> 8) | (w27 << 24)) & mask; out[31] = w27 >> 4; - return in; + return in + (28 * 4); } inline const uint8_t* unpack29_32(const uint8_t* in, uint32_t* out){ constexpr uint32_t mask = ((uint32_t{1} << 29) - uint32_t{1}); - const auto w0 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w1 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w2 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w3 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w4 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w5 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w6 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w7 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w8 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w9 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w10 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w11 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w12 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w13 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w14 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w15 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w16 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w17 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w18 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w19 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w20 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w21 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w22 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w23 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w24 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w25 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w26 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w27 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w28 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; + const auto w0 = LoadInt(in + 0 * 4); + const auto w1 = LoadInt(in + 1 * 4); + const auto w2 = LoadInt(in + 2 * 4); + const auto w3 = LoadInt(in + 3 * 4); + const auto w4 = LoadInt(in + 4 * 4); + const auto w5 = LoadInt(in + 5 * 4); + const auto w6 = LoadInt(in + 6 * 4); + const auto w7 = LoadInt(in + 7 * 4); + const auto w8 = LoadInt(in + 8 * 4); + const auto w9 = LoadInt(in + 9 * 4); + const auto w10 = LoadInt(in + 10 * 4); + const auto w11 = LoadInt(in + 11 * 4); + const auto w12 = LoadInt(in + 12 * 4); + const auto w13 = LoadInt(in + 13 * 4); + const auto w14 = LoadInt(in + 14 * 4); + const auto w15 = LoadInt(in + 15 * 4); + const auto w16 = LoadInt(in + 16 * 4); + const auto w17 = LoadInt(in + 17 * 4); + const auto w18 = LoadInt(in + 18 * 4); + const auto w19 = LoadInt(in + 19 * 4); + const auto w20 = LoadInt(in + 20 * 4); + const auto w21 = LoadInt(in + 21 * 4); + const auto w22 = LoadInt(in + 22 * 4); + const auto w23 = LoadInt(in + 23 * 4); + const auto w24 = LoadInt(in + 24 * 4); + const auto w25 = LoadInt(in + 25 * 4); + const auto w26 = LoadInt(in + 26 * 4); + const auto w27 = LoadInt(in + 27 * 4); + const auto w28 = LoadInt(in + 28 * 4); out[0] = (w0) & mask; out[1] = ((w0 >> 29) | (w1 << 3)) & mask; out[2] = ((w1 >> 26) | (w2 << 6)) & mask; @@ -2041,72 +1611,42 @@ inline const uint8_t* unpack29_32(const uint8_t* in, uint32_t* out){ out[30] = ((w27 >> 6) | (w28 << 26)) & mask; out[31] = w28 >> 3; - return in; + return in + (29 * 4); } inline const uint8_t* unpack30_32(const uint8_t* in, uint32_t* out){ constexpr uint32_t mask = ((uint32_t{1} << 30) - uint32_t{1}); - const auto w0 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w1 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w2 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w3 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w4 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w5 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w6 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w7 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w8 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w9 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w10 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w11 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w12 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w13 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w14 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w15 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w16 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w17 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w18 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w19 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w20 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w21 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w22 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w23 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w24 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w25 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w26 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w27 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w28 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w29 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; + const auto w0 = LoadInt(in + 0 * 4); + const auto w1 = LoadInt(in + 1 * 4); + const auto w2 = LoadInt(in + 2 * 4); + const auto w3 = LoadInt(in + 3 * 4); + const auto w4 = LoadInt(in + 4 * 4); + const auto w5 = LoadInt(in + 5 * 4); + const auto w6 = LoadInt(in + 6 * 4); + const auto w7 = LoadInt(in + 7 * 4); + const auto w8 = LoadInt(in + 8 * 4); + const auto w9 = LoadInt(in + 9 * 4); + const auto w10 = LoadInt(in + 10 * 4); + const auto w11 = LoadInt(in + 11 * 4); + const auto w12 = LoadInt(in + 12 * 4); + const auto w13 = LoadInt(in + 13 * 4); + const auto w14 = LoadInt(in + 14 * 4); + const auto w15 = LoadInt(in + 15 * 4); + const auto w16 = LoadInt(in + 16 * 4); + const auto w17 = LoadInt(in + 17 * 4); + const auto w18 = LoadInt(in + 18 * 4); + const auto w19 = LoadInt(in + 19 * 4); + const auto w20 = LoadInt(in + 20 * 4); + const auto w21 = LoadInt(in + 21 * 4); + const auto w22 = LoadInt(in + 22 * 4); + const auto w23 = LoadInt(in + 23 * 4); + const auto w24 = LoadInt(in + 24 * 4); + const auto w25 = LoadInt(in + 25 * 4); + const auto w26 = LoadInt(in + 26 * 4); + const auto w27 = LoadInt(in + 27 * 4); + const auto w28 = LoadInt(in + 28 * 4); + const auto w29 = LoadInt(in + 29 * 4); out[0] = (w0) & mask; out[1] = ((w0 >> 30) | (w1 << 2)) & mask; out[2] = ((w1 >> 28) | (w2 << 4)) & mask; @@ -2140,74 +1680,43 @@ inline const uint8_t* unpack30_32(const uint8_t* in, uint32_t* out){ out[30] = ((w28 >> 4) | (w29 << 28)) & mask; out[31] = w29 >> 2; - return in; + return in + (30 * 4); } inline const uint8_t* unpack31_32(const uint8_t* in, uint32_t* out){ constexpr uint32_t mask = ((uint32_t{1} << 31) - uint32_t{1}); - const auto w0 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w1 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w2 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w3 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w4 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w5 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w6 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w7 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w8 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w9 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w10 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w11 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w12 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w13 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w14 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w15 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w16 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w17 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w18 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w19 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w20 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w21 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w22 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w23 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w24 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w25 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w26 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w27 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w28 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w29 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; - const auto w30 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 4; + const auto w0 = LoadInt(in + 0 * 4); + const auto w1 = LoadInt(in + 1 * 4); + const auto w2 = LoadInt(in + 2 * 4); + const auto w3 = LoadInt(in + 3 * 4); + const auto w4 = LoadInt(in + 4 * 4); + const auto w5 = LoadInt(in + 5 * 4); + const auto w6 = LoadInt(in + 6 * 4); + const auto w7 = LoadInt(in + 7 * 4); + const auto w8 = LoadInt(in + 8 * 4); + const auto w9 = LoadInt(in + 9 * 4); + const auto w10 = LoadInt(in + 10 * 4); + const auto w11 = LoadInt(in + 11 * 4); + const auto w12 = LoadInt(in + 12 * 4); + const auto w13 = LoadInt(in + 13 * 4); + const auto w14 = LoadInt(in + 14 * 4); + const auto w15 = LoadInt(in + 15 * 4); + const auto w16 = LoadInt(in + 16 * 4); + const auto w17 = LoadInt(in + 17 * 4); + const auto w18 = LoadInt(in + 18 * 4); + const auto w19 = LoadInt(in + 19 * 4); + const auto w20 = LoadInt(in + 20 * 4); + const auto w21 = LoadInt(in + 21 * 4); + const auto w22 = LoadInt(in + 22 * 4); + const auto w23 = LoadInt(in + 23 * 4); + const auto w24 = LoadInt(in + 24 * 4); + const auto w25 = LoadInt(in + 25 * 4); + const auto w26 = LoadInt(in + 26 * 4); + const auto w27 = LoadInt(in + 27 * 4); + const auto w28 = LoadInt(in + 28 * 4); + const auto w29 = LoadInt(in + 29 * 4); + const auto w30 = LoadInt(in + 30 * 4); out[0] = (w0) & mask; out[1] = ((w0 >> 31) | (w1 << 1)) & mask; out[2] = ((w1 >> 30) | (w2 << 2)) & mask; @@ -2241,16 +1750,14 @@ inline const uint8_t* unpack31_32(const uint8_t* in, uint32_t* out){ out[30] = ((w29 >> 2) | (w30 << 30)) & mask; out[31] = w30 >> 1; - return in; + return in + (31 * 4); } inline const uint8_t* unpack32_32(const uint8_t* in, uint32_t* out){ for(int k = 0; k < 32; k += 1) { - auto w = util::SafeLoadAs(in); - out[k] = bit_util::FromLittleEndian(w); - in += 4; + out[k] = LoadInt(in + (k * 4)); } - return in; + return in + (4 * 32); } inline const uint8_t* unpack0_64(const uint8_t* in, uint64_t* out){ @@ -2261,9 +1768,7 @@ inline const uint8_t* unpack0_64(const uint8_t* in, uint64_t* out){ inline const uint8_t* unpack1_64(const uint8_t* in, uint64_t* out){ constexpr uint64_t mask = ((uint64_t{1} << 1) - uint64_t{1}); - auto w0 = static_cast(util::SafeLoadAs(in)); - w0 = bit_util::FromLittleEndian(w0); - in += 4; + const auto w0 = static_cast(LoadInt(in + 0 * 8)); out[0] = (w0) & mask; out[1] = (w0 >> 1) & mask; out[2] = (w0 >> 2) & mask; @@ -2297,14 +1802,13 @@ inline const uint8_t* unpack1_64(const uint8_t* in, uint64_t* out){ out[30] = (w0 >> 30) & mask; out[31] = (w0 >> 31) & mask; - return in; + return in + (0 * 8 + 4); } inline const uint8_t* unpack2_64(const uint8_t* in, uint64_t* out){ constexpr uint64_t mask = ((uint64_t{1} << 2) - uint64_t{1}); - const auto w0 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; + const auto w0 = LoadInt(in + 0 * 8); out[0] = (w0) & mask; out[1] = (w0 >> 2) & mask; out[2] = (w0 >> 4) & mask; @@ -2338,17 +1842,14 @@ inline const uint8_t* unpack2_64(const uint8_t* in, uint64_t* out){ out[30] = (w0 >> 60) & mask; out[31] = w0 >> 62; - return in; + return in + (1 * 8); } inline const uint8_t* unpack3_64(const uint8_t* in, uint64_t* out){ constexpr uint64_t mask = ((uint64_t{1} << 3) - uint64_t{1}); - const auto w0 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - auto w1 = static_cast(util::SafeLoadAs(in)); - w1 = bit_util::FromLittleEndian(w1); - in += 4; + const auto w0 = LoadInt(in + 0 * 8); + const auto w1 = static_cast(LoadInt(in + 1 * 8)); out[0] = (w0) & mask; out[1] = (w0 >> 3) & mask; out[2] = (w0 >> 6) & mask; @@ -2382,16 +1883,14 @@ inline const uint8_t* unpack3_64(const uint8_t* in, uint64_t* out){ out[30] = (w1 >> 26) & mask; out[31] = (w1 >> 29) & mask; - return in; + return in + (1 * 8 + 4); } inline const uint8_t* unpack4_64(const uint8_t* in, uint64_t* out){ constexpr uint64_t mask = ((uint64_t{1} << 4) - uint64_t{1}); - const auto w0 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w1 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; + const auto w0 = LoadInt(in + 0 * 8); + const auto w1 = LoadInt(in + 1 * 8); out[0] = (w0) & mask; out[1] = (w0 >> 4) & mask; out[2] = (w0 >> 8) & mask; @@ -2425,19 +1924,15 @@ inline const uint8_t* unpack4_64(const uint8_t* in, uint64_t* out){ out[30] = (w1 >> 56) & mask; out[31] = w1 >> 60; - return in; + return in + (2 * 8); } inline const uint8_t* unpack5_64(const uint8_t* in, uint64_t* out){ constexpr uint64_t mask = ((uint64_t{1} << 5) - uint64_t{1}); - const auto w0 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w1 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - auto w2 = static_cast(util::SafeLoadAs(in)); - w2 = bit_util::FromLittleEndian(w2); - in += 4; + const auto w0 = LoadInt(in + 0 * 8); + const auto w1 = LoadInt(in + 1 * 8); + const auto w2 = static_cast(LoadInt(in + 2 * 8)); out[0] = (w0) & mask; out[1] = (w0 >> 5) & mask; out[2] = (w0 >> 10) & mask; @@ -2471,18 +1966,15 @@ inline const uint8_t* unpack5_64(const uint8_t* in, uint64_t* out){ out[30] = (w2 >> 22) & mask; out[31] = (w2 >> 27) & mask; - return in; + return in + (2 * 8 + 4); } inline const uint8_t* unpack6_64(const uint8_t* in, uint64_t* out){ constexpr uint64_t mask = ((uint64_t{1} << 6) - uint64_t{1}); - const auto w0 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w1 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w2 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; + const auto w0 = LoadInt(in + 0 * 8); + const auto w1 = LoadInt(in + 1 * 8); + const auto w2 = LoadInt(in + 2 * 8); out[0] = (w0) & mask; out[1] = (w0 >> 6) & mask; out[2] = (w0 >> 12) & mask; @@ -2516,21 +2008,16 @@ inline const uint8_t* unpack6_64(const uint8_t* in, uint64_t* out){ out[30] = (w2 >> 52) & mask; out[31] = w2 >> 58; - return in; + return in + (3 * 8); } inline const uint8_t* unpack7_64(const uint8_t* in, uint64_t* out){ constexpr uint64_t mask = ((uint64_t{1} << 7) - uint64_t{1}); - const auto w0 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w1 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w2 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - auto w3 = static_cast(util::SafeLoadAs(in)); - w3 = bit_util::FromLittleEndian(w3); - in += 4; + const auto w0 = LoadInt(in + 0 * 8); + const auto w1 = LoadInt(in + 1 * 8); + const auto w2 = LoadInt(in + 2 * 8); + const auto w3 = static_cast(LoadInt(in + 3 * 8)); out[0] = (w0) & mask; out[1] = (w0 >> 7) & mask; out[2] = (w0 >> 14) & mask; @@ -2564,20 +2051,16 @@ inline const uint8_t* unpack7_64(const uint8_t* in, uint64_t* out){ out[30] = (w3 >> 18) & mask; out[31] = (w3 >> 25) & mask; - return in; + return in + (3 * 8 + 4); } inline const uint8_t* unpack8_64(const uint8_t* in, uint64_t* out){ constexpr uint64_t mask = ((uint64_t{1} << 8) - uint64_t{1}); - const auto w0 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w1 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w2 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w3 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; + const auto w0 = LoadInt(in + 0 * 8); + const auto w1 = LoadInt(in + 1 * 8); + const auto w2 = LoadInt(in + 2 * 8); + const auto w3 = LoadInt(in + 3 * 8); out[0] = (w0) & mask; out[1] = (w0 >> 8) & mask; out[2] = (w0 >> 16) & mask; @@ -2611,23 +2094,17 @@ inline const uint8_t* unpack8_64(const uint8_t* in, uint64_t* out){ out[30] = (w3 >> 48) & mask; out[31] = w3 >> 56; - return in; + return in + (4 * 8); } inline const uint8_t* unpack9_64(const uint8_t* in, uint64_t* out){ constexpr uint64_t mask = ((uint64_t{1} << 9) - uint64_t{1}); - const auto w0 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w1 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w2 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w3 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - auto w4 = static_cast(util::SafeLoadAs(in)); - w4 = bit_util::FromLittleEndian(w4); - in += 4; + const auto w0 = LoadInt(in + 0 * 8); + const auto w1 = LoadInt(in + 1 * 8); + const auto w2 = LoadInt(in + 2 * 8); + const auto w3 = LoadInt(in + 3 * 8); + const auto w4 = static_cast(LoadInt(in + 4 * 8)); out[0] = (w0) & mask; out[1] = (w0 >> 9) & mask; out[2] = (w0 >> 18) & mask; @@ -2661,22 +2138,17 @@ inline const uint8_t* unpack9_64(const uint8_t* in, uint64_t* out){ out[30] = (w4 >> 14) & mask; out[31] = (w4 >> 23) & mask; - return in; + return in + (4 * 8 + 4); } inline const uint8_t* unpack10_64(const uint8_t* in, uint64_t* out){ constexpr uint64_t mask = ((uint64_t{1} << 10) - uint64_t{1}); - const auto w0 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w1 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w2 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w3 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w4 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; + const auto w0 = LoadInt(in + 0 * 8); + const auto w1 = LoadInt(in + 1 * 8); + const auto w2 = LoadInt(in + 2 * 8); + const auto w3 = LoadInt(in + 3 * 8); + const auto w4 = LoadInt(in + 4 * 8); out[0] = (w0) & mask; out[1] = (w0 >> 10) & mask; out[2] = (w0 >> 20) & mask; @@ -2710,25 +2182,18 @@ inline const uint8_t* unpack10_64(const uint8_t* in, uint64_t* out){ out[30] = (w4 >> 44) & mask; out[31] = w4 >> 54; - return in; + return in + (5 * 8); } inline const uint8_t* unpack11_64(const uint8_t* in, uint64_t* out){ constexpr uint64_t mask = ((uint64_t{1} << 11) - uint64_t{1}); - const auto w0 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w1 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w2 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w3 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w4 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - auto w5 = static_cast(util::SafeLoadAs(in)); - w5 = bit_util::FromLittleEndian(w5); - in += 4; + const auto w0 = LoadInt(in + 0 * 8); + const auto w1 = LoadInt(in + 1 * 8); + const auto w2 = LoadInt(in + 2 * 8); + const auto w3 = LoadInt(in + 3 * 8); + const auto w4 = LoadInt(in + 4 * 8); + const auto w5 = static_cast(LoadInt(in + 5 * 8)); out[0] = (w0) & mask; out[1] = (w0 >> 11) & mask; out[2] = (w0 >> 22) & mask; @@ -2762,24 +2227,18 @@ inline const uint8_t* unpack11_64(const uint8_t* in, uint64_t* out){ out[30] = (w5 >> 10) & mask; out[31] = (w5 >> 21) & mask; - return in; + return in + (5 * 8 + 4); } inline const uint8_t* unpack12_64(const uint8_t* in, uint64_t* out){ constexpr uint64_t mask = ((uint64_t{1} << 12) - uint64_t{1}); - const auto w0 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w1 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w2 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w3 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w4 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w5 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; + const auto w0 = LoadInt(in + 0 * 8); + const auto w1 = LoadInt(in + 1 * 8); + const auto w2 = LoadInt(in + 2 * 8); + const auto w3 = LoadInt(in + 3 * 8); + const auto w4 = LoadInt(in + 4 * 8); + const auto w5 = LoadInt(in + 5 * 8); out[0] = (w0) & mask; out[1] = (w0 >> 12) & mask; out[2] = (w0 >> 24) & mask; @@ -2813,27 +2272,19 @@ inline const uint8_t* unpack12_64(const uint8_t* in, uint64_t* out){ out[30] = (w5 >> 40) & mask; out[31] = w5 >> 52; - return in; + return in + (6 * 8); } inline const uint8_t* unpack13_64(const uint8_t* in, uint64_t* out){ constexpr uint64_t mask = ((uint64_t{1} << 13) - uint64_t{1}); - const auto w0 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w1 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w2 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w3 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w4 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w5 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - auto w6 = static_cast(util::SafeLoadAs(in)); - w6 = bit_util::FromLittleEndian(w6); - in += 4; + const auto w0 = LoadInt(in + 0 * 8); + const auto w1 = LoadInt(in + 1 * 8); + const auto w2 = LoadInt(in + 2 * 8); + const auto w3 = LoadInt(in + 3 * 8); + const auto w4 = LoadInt(in + 4 * 8); + const auto w5 = LoadInt(in + 5 * 8); + const auto w6 = static_cast(LoadInt(in + 6 * 8)); out[0] = (w0) & mask; out[1] = (w0 >> 13) & mask; out[2] = (w0 >> 26) & mask; @@ -2867,26 +2318,19 @@ inline const uint8_t* unpack13_64(const uint8_t* in, uint64_t* out){ out[30] = (w6 >> 6) & mask; out[31] = (w6 >> 19) & mask; - return in; + return in + (6 * 8 + 4); } inline const uint8_t* unpack14_64(const uint8_t* in, uint64_t* out){ constexpr uint64_t mask = ((uint64_t{1} << 14) - uint64_t{1}); - const auto w0 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w1 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w2 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w3 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w4 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w5 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w6 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; + const auto w0 = LoadInt(in + 0 * 8); + const auto w1 = LoadInt(in + 1 * 8); + const auto w2 = LoadInt(in + 2 * 8); + const auto w3 = LoadInt(in + 3 * 8); + const auto w4 = LoadInt(in + 4 * 8); + const auto w5 = LoadInt(in + 5 * 8); + const auto w6 = LoadInt(in + 6 * 8); out[0] = (w0) & mask; out[1] = (w0 >> 14) & mask; out[2] = (w0 >> 28) & mask; @@ -2920,29 +2364,20 @@ inline const uint8_t* unpack14_64(const uint8_t* in, uint64_t* out){ out[30] = (w6 >> 36) & mask; out[31] = w6 >> 50; - return in; + return in + (7 * 8); } inline const uint8_t* unpack15_64(const uint8_t* in, uint64_t* out){ constexpr uint64_t mask = ((uint64_t{1} << 15) - uint64_t{1}); - const auto w0 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w1 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w2 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w3 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w4 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w5 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w6 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - auto w7 = static_cast(util::SafeLoadAs(in)); - w7 = bit_util::FromLittleEndian(w7); - in += 4; + const auto w0 = LoadInt(in + 0 * 8); + const auto w1 = LoadInt(in + 1 * 8); + const auto w2 = LoadInt(in + 2 * 8); + const auto w3 = LoadInt(in + 3 * 8); + const auto w4 = LoadInt(in + 4 * 8); + const auto w5 = LoadInt(in + 5 * 8); + const auto w6 = LoadInt(in + 6 * 8); + const auto w7 = static_cast(LoadInt(in + 7 * 8)); out[0] = (w0) & mask; out[1] = (w0 >> 15) & mask; out[2] = (w0 >> 30) & mask; @@ -2976,28 +2411,20 @@ inline const uint8_t* unpack15_64(const uint8_t* in, uint64_t* out){ out[30] = (w7 >> 2) & mask; out[31] = (w7 >> 17) & mask; - return in; + return in + (7 * 8 + 4); } inline const uint8_t* unpack16_64(const uint8_t* in, uint64_t* out){ constexpr uint64_t mask = ((uint64_t{1} << 16) - uint64_t{1}); - const auto w0 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w1 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w2 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w3 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w4 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w5 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w6 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w7 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; + const auto w0 = LoadInt(in + 0 * 8); + const auto w1 = LoadInt(in + 1 * 8); + const auto w2 = LoadInt(in + 2 * 8); + const auto w3 = LoadInt(in + 3 * 8); + const auto w4 = LoadInt(in + 4 * 8); + const auto w5 = LoadInt(in + 5 * 8); + const auto w6 = LoadInt(in + 6 * 8); + const auto w7 = LoadInt(in + 7 * 8); out[0] = (w0) & mask; out[1] = (w0 >> 16) & mask; out[2] = (w0 >> 32) & mask; @@ -3031,31 +2458,21 @@ inline const uint8_t* unpack16_64(const uint8_t* in, uint64_t* out){ out[30] = (w7 >> 32) & mask; out[31] = w7 >> 48; - return in; + return in + (8 * 8); } inline const uint8_t* unpack17_64(const uint8_t* in, uint64_t* out){ constexpr uint64_t mask = ((uint64_t{1} << 17) - uint64_t{1}); - const auto w0 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w1 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w2 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w3 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w4 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w5 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w6 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w7 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - auto w8 = static_cast(util::SafeLoadAs(in)); - w8 = bit_util::FromLittleEndian(w8); - in += 4; + const auto w0 = LoadInt(in + 0 * 8); + const auto w1 = LoadInt(in + 1 * 8); + const auto w2 = LoadInt(in + 2 * 8); + const auto w3 = LoadInt(in + 3 * 8); + const auto w4 = LoadInt(in + 4 * 8); + const auto w5 = LoadInt(in + 5 * 8); + const auto w6 = LoadInt(in + 6 * 8); + const auto w7 = LoadInt(in + 7 * 8); + const auto w8 = static_cast(LoadInt(in + 8 * 8)); out[0] = (w0) & mask; out[1] = (w0 >> 17) & mask; out[2] = (w0 >> 34) & mask; @@ -3089,30 +2506,21 @@ inline const uint8_t* unpack17_64(const uint8_t* in, uint64_t* out){ out[30] = ((w7 >> 62) | (w8 << 2)) & mask; out[31] = (w8 >> 15) & mask; - return in; + return in + (8 * 8 + 4); } inline const uint8_t* unpack18_64(const uint8_t* in, uint64_t* out){ constexpr uint64_t mask = ((uint64_t{1} << 18) - uint64_t{1}); - const auto w0 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w1 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w2 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w3 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w4 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w5 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w6 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w7 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w8 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; + const auto w0 = LoadInt(in + 0 * 8); + const auto w1 = LoadInt(in + 1 * 8); + const auto w2 = LoadInt(in + 2 * 8); + const auto w3 = LoadInt(in + 3 * 8); + const auto w4 = LoadInt(in + 4 * 8); + const auto w5 = LoadInt(in + 5 * 8); + const auto w6 = LoadInt(in + 6 * 8); + const auto w7 = LoadInt(in + 7 * 8); + const auto w8 = LoadInt(in + 8 * 8); out[0] = (w0) & mask; out[1] = (w0 >> 18) & mask; out[2] = (w0 >> 36) & mask; @@ -3146,33 +2554,22 @@ inline const uint8_t* unpack18_64(const uint8_t* in, uint64_t* out){ out[30] = (w8 >> 28) & mask; out[31] = w8 >> 46; - return in; + return in + (9 * 8); } inline const uint8_t* unpack19_64(const uint8_t* in, uint64_t* out){ constexpr uint64_t mask = ((uint64_t{1} << 19) - uint64_t{1}); - const auto w0 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w1 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w2 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w3 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w4 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w5 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w6 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w7 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w8 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - auto w9 = static_cast(util::SafeLoadAs(in)); - w9 = bit_util::FromLittleEndian(w9); - in += 4; + const auto w0 = LoadInt(in + 0 * 8); + const auto w1 = LoadInt(in + 1 * 8); + const auto w2 = LoadInt(in + 2 * 8); + const auto w3 = LoadInt(in + 3 * 8); + const auto w4 = LoadInt(in + 4 * 8); + const auto w5 = LoadInt(in + 5 * 8); + const auto w6 = LoadInt(in + 6 * 8); + const auto w7 = LoadInt(in + 7 * 8); + const auto w8 = LoadInt(in + 8 * 8); + const auto w9 = static_cast(LoadInt(in + 9 * 8)); out[0] = (w0) & mask; out[1] = (w0 >> 19) & mask; out[2] = (w0 >> 38) & mask; @@ -3206,32 +2603,22 @@ inline const uint8_t* unpack19_64(const uint8_t* in, uint64_t* out){ out[30] = ((w8 >> 58) | (w9 << 6)) & mask; out[31] = (w9 >> 13) & mask; - return in; + return in + (9 * 8 + 4); } inline const uint8_t* unpack20_64(const uint8_t* in, uint64_t* out){ constexpr uint64_t mask = ((uint64_t{1} << 20) - uint64_t{1}); - const auto w0 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w1 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w2 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w3 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w4 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w5 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w6 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w7 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w8 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w9 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; + const auto w0 = LoadInt(in + 0 * 8); + const auto w1 = LoadInt(in + 1 * 8); + const auto w2 = LoadInt(in + 2 * 8); + const auto w3 = LoadInt(in + 3 * 8); + const auto w4 = LoadInt(in + 4 * 8); + const auto w5 = LoadInt(in + 5 * 8); + const auto w6 = LoadInt(in + 6 * 8); + const auto w7 = LoadInt(in + 7 * 8); + const auto w8 = LoadInt(in + 8 * 8); + const auto w9 = LoadInt(in + 9 * 8); out[0] = (w0) & mask; out[1] = (w0 >> 20) & mask; out[2] = (w0 >> 40) & mask; @@ -3265,35 +2652,23 @@ inline const uint8_t* unpack20_64(const uint8_t* in, uint64_t* out){ out[30] = (w9 >> 24) & mask; out[31] = w9 >> 44; - return in; + return in + (10 * 8); } inline const uint8_t* unpack21_64(const uint8_t* in, uint64_t* out){ constexpr uint64_t mask = ((uint64_t{1} << 21) - uint64_t{1}); - const auto w0 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w1 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w2 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w3 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w4 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w5 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w6 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w7 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w8 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w9 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - auto w10 = static_cast(util::SafeLoadAs(in)); - w10 = bit_util::FromLittleEndian(w10); - in += 4; + const auto w0 = LoadInt(in + 0 * 8); + const auto w1 = LoadInt(in + 1 * 8); + const auto w2 = LoadInt(in + 2 * 8); + const auto w3 = LoadInt(in + 3 * 8); + const auto w4 = LoadInt(in + 4 * 8); + const auto w5 = LoadInt(in + 5 * 8); + const auto w6 = LoadInt(in + 6 * 8); + const auto w7 = LoadInt(in + 7 * 8); + const auto w8 = LoadInt(in + 8 * 8); + const auto w9 = LoadInt(in + 9 * 8); + const auto w10 = static_cast(LoadInt(in + 10 * 8)); out[0] = (w0) & mask; out[1] = (w0 >> 21) & mask; out[2] = (w0 >> 42) & mask; @@ -3327,34 +2702,23 @@ inline const uint8_t* unpack21_64(const uint8_t* in, uint64_t* out){ out[30] = ((w9 >> 54) | (w10 << 10)) & mask; out[31] = (w10 >> 11) & mask; - return in; + return in + (10 * 8 + 4); } inline const uint8_t* unpack22_64(const uint8_t* in, uint64_t* out){ constexpr uint64_t mask = ((uint64_t{1} << 22) - uint64_t{1}); - const auto w0 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w1 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w2 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w3 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w4 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w5 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w6 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w7 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w8 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w9 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w10 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; + const auto w0 = LoadInt(in + 0 * 8); + const auto w1 = LoadInt(in + 1 * 8); + const auto w2 = LoadInt(in + 2 * 8); + const auto w3 = LoadInt(in + 3 * 8); + const auto w4 = LoadInt(in + 4 * 8); + const auto w5 = LoadInt(in + 5 * 8); + const auto w6 = LoadInt(in + 6 * 8); + const auto w7 = LoadInt(in + 7 * 8); + const auto w8 = LoadInt(in + 8 * 8); + const auto w9 = LoadInt(in + 9 * 8); + const auto w10 = LoadInt(in + 10 * 8); out[0] = (w0) & mask; out[1] = (w0 >> 22) & mask; out[2] = ((w0 >> 44) | (w1 << 20)) & mask; @@ -3388,37 +2752,24 @@ inline const uint8_t* unpack22_64(const uint8_t* in, uint64_t* out){ out[30] = (w10 >> 20) & mask; out[31] = w10 >> 42; - return in; + return in + (11 * 8); } inline const uint8_t* unpack23_64(const uint8_t* in, uint64_t* out){ constexpr uint64_t mask = ((uint64_t{1} << 23) - uint64_t{1}); - const auto w0 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w1 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w2 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w3 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w4 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w5 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w6 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w7 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w8 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w9 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w10 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - auto w11 = static_cast(util::SafeLoadAs(in)); - w11 = bit_util::FromLittleEndian(w11); - in += 4; + const auto w0 = LoadInt(in + 0 * 8); + const auto w1 = LoadInt(in + 1 * 8); + const auto w2 = LoadInt(in + 2 * 8); + const auto w3 = LoadInt(in + 3 * 8); + const auto w4 = LoadInt(in + 4 * 8); + const auto w5 = LoadInt(in + 5 * 8); + const auto w6 = LoadInt(in + 6 * 8); + const auto w7 = LoadInt(in + 7 * 8); + const auto w8 = LoadInt(in + 8 * 8); + const auto w9 = LoadInt(in + 9 * 8); + const auto w10 = LoadInt(in + 10 * 8); + const auto w11 = static_cast(LoadInt(in + 11 * 8)); out[0] = (w0) & mask; out[1] = (w0 >> 23) & mask; out[2] = ((w0 >> 46) | (w1 << 18)) & mask; @@ -3452,36 +2803,24 @@ inline const uint8_t* unpack23_64(const uint8_t* in, uint64_t* out){ out[30] = ((w10 >> 50) | (w11 << 14)) & mask; out[31] = (w11 >> 9) & mask; - return in; + return in + (11 * 8 + 4); } inline const uint8_t* unpack24_64(const uint8_t* in, uint64_t* out){ constexpr uint64_t mask = ((uint64_t{1} << 24) - uint64_t{1}); - const auto w0 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w1 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w2 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w3 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w4 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w5 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w6 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w7 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w8 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w9 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w10 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w11 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; + const auto w0 = LoadInt(in + 0 * 8); + const auto w1 = LoadInt(in + 1 * 8); + const auto w2 = LoadInt(in + 2 * 8); + const auto w3 = LoadInt(in + 3 * 8); + const auto w4 = LoadInt(in + 4 * 8); + const auto w5 = LoadInt(in + 5 * 8); + const auto w6 = LoadInt(in + 6 * 8); + const auto w7 = LoadInt(in + 7 * 8); + const auto w8 = LoadInt(in + 8 * 8); + const auto w9 = LoadInt(in + 9 * 8); + const auto w10 = LoadInt(in + 10 * 8); + const auto w11 = LoadInt(in + 11 * 8); out[0] = (w0) & mask; out[1] = (w0 >> 24) & mask; out[2] = ((w0 >> 48) | (w1 << 16)) & mask; @@ -3515,39 +2854,25 @@ inline const uint8_t* unpack24_64(const uint8_t* in, uint64_t* out){ out[30] = (w11 >> 16) & mask; out[31] = w11 >> 40; - return in; + return in + (12 * 8); } inline const uint8_t* unpack25_64(const uint8_t* in, uint64_t* out){ constexpr uint64_t mask = ((uint64_t{1} << 25) - uint64_t{1}); - const auto w0 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w1 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w2 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w3 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w4 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w5 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w6 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w7 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w8 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w9 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w10 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w11 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - auto w12 = static_cast(util::SafeLoadAs(in)); - w12 = bit_util::FromLittleEndian(w12); - in += 4; + const auto w0 = LoadInt(in + 0 * 8); + const auto w1 = LoadInt(in + 1 * 8); + const auto w2 = LoadInt(in + 2 * 8); + const auto w3 = LoadInt(in + 3 * 8); + const auto w4 = LoadInt(in + 4 * 8); + const auto w5 = LoadInt(in + 5 * 8); + const auto w6 = LoadInt(in + 6 * 8); + const auto w7 = LoadInt(in + 7 * 8); + const auto w8 = LoadInt(in + 8 * 8); + const auto w9 = LoadInt(in + 9 * 8); + const auto w10 = LoadInt(in + 10 * 8); + const auto w11 = LoadInt(in + 11 * 8); + const auto w12 = static_cast(LoadInt(in + 12 * 8)); out[0] = (w0) & mask; out[1] = (w0 >> 25) & mask; out[2] = ((w0 >> 50) | (w1 << 14)) & mask; @@ -3581,38 +2906,25 @@ inline const uint8_t* unpack25_64(const uint8_t* in, uint64_t* out){ out[30] = ((w11 >> 46) | (w12 << 18)) & mask; out[31] = (w12 >> 7) & mask; - return in; + return in + (12 * 8 + 4); } inline const uint8_t* unpack26_64(const uint8_t* in, uint64_t* out){ constexpr uint64_t mask = ((uint64_t{1} << 26) - uint64_t{1}); - const auto w0 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w1 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w2 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w3 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w4 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w5 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w6 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w7 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w8 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w9 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w10 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w11 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w12 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; + const auto w0 = LoadInt(in + 0 * 8); + const auto w1 = LoadInt(in + 1 * 8); + const auto w2 = LoadInt(in + 2 * 8); + const auto w3 = LoadInt(in + 3 * 8); + const auto w4 = LoadInt(in + 4 * 8); + const auto w5 = LoadInt(in + 5 * 8); + const auto w6 = LoadInt(in + 6 * 8); + const auto w7 = LoadInt(in + 7 * 8); + const auto w8 = LoadInt(in + 8 * 8); + const auto w9 = LoadInt(in + 9 * 8); + const auto w10 = LoadInt(in + 10 * 8); + const auto w11 = LoadInt(in + 11 * 8); + const auto w12 = LoadInt(in + 12 * 8); out[0] = (w0) & mask; out[1] = (w0 >> 26) & mask; out[2] = ((w0 >> 52) | (w1 << 12)) & mask; @@ -3646,41 +2958,26 @@ inline const uint8_t* unpack26_64(const uint8_t* in, uint64_t* out){ out[30] = (w12 >> 12) & mask; out[31] = w12 >> 38; - return in; + return in + (13 * 8); } inline const uint8_t* unpack27_64(const uint8_t* in, uint64_t* out){ constexpr uint64_t mask = ((uint64_t{1} << 27) - uint64_t{1}); - const auto w0 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w1 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w2 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w3 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w4 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w5 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w6 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w7 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w8 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w9 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w10 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w11 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w12 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - auto w13 = static_cast(util::SafeLoadAs(in)); - w13 = bit_util::FromLittleEndian(w13); - in += 4; + const auto w0 = LoadInt(in + 0 * 8); + const auto w1 = LoadInt(in + 1 * 8); + const auto w2 = LoadInt(in + 2 * 8); + const auto w3 = LoadInt(in + 3 * 8); + const auto w4 = LoadInt(in + 4 * 8); + const auto w5 = LoadInt(in + 5 * 8); + const auto w6 = LoadInt(in + 6 * 8); + const auto w7 = LoadInt(in + 7 * 8); + const auto w8 = LoadInt(in + 8 * 8); + const auto w9 = LoadInt(in + 9 * 8); + const auto w10 = LoadInt(in + 10 * 8); + const auto w11 = LoadInt(in + 11 * 8); + const auto w12 = LoadInt(in + 12 * 8); + const auto w13 = static_cast(LoadInt(in + 13 * 8)); out[0] = (w0) & mask; out[1] = (w0 >> 27) & mask; out[2] = ((w0 >> 54) | (w1 << 10)) & mask; @@ -3714,40 +3011,26 @@ inline const uint8_t* unpack27_64(const uint8_t* in, uint64_t* out){ out[30] = ((w12 >> 42) | (w13 << 22)) & mask; out[31] = (w13 >> 5) & mask; - return in; + return in + (13 * 8 + 4); } inline const uint8_t* unpack28_64(const uint8_t* in, uint64_t* out){ constexpr uint64_t mask = ((uint64_t{1} << 28) - uint64_t{1}); - const auto w0 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w1 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w2 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w3 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w4 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w5 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w6 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w7 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w8 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w9 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w10 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w11 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w12 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w13 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; + const auto w0 = LoadInt(in + 0 * 8); + const auto w1 = LoadInt(in + 1 * 8); + const auto w2 = LoadInt(in + 2 * 8); + const auto w3 = LoadInt(in + 3 * 8); + const auto w4 = LoadInt(in + 4 * 8); + const auto w5 = LoadInt(in + 5 * 8); + const auto w6 = LoadInt(in + 6 * 8); + const auto w7 = LoadInt(in + 7 * 8); + const auto w8 = LoadInt(in + 8 * 8); + const auto w9 = LoadInt(in + 9 * 8); + const auto w10 = LoadInt(in + 10 * 8); + const auto w11 = LoadInt(in + 11 * 8); + const auto w12 = LoadInt(in + 12 * 8); + const auto w13 = LoadInt(in + 13 * 8); out[0] = (w0) & mask; out[1] = (w0 >> 28) & mask; out[2] = ((w0 >> 56) | (w1 << 8)) & mask; @@ -3781,43 +3064,27 @@ inline const uint8_t* unpack28_64(const uint8_t* in, uint64_t* out){ out[30] = (w13 >> 8) & mask; out[31] = w13 >> 36; - return in; + return in + (14 * 8); } inline const uint8_t* unpack29_64(const uint8_t* in, uint64_t* out){ constexpr uint64_t mask = ((uint64_t{1} << 29) - uint64_t{1}); - const auto w0 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w1 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w2 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w3 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w4 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w5 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w6 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w7 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w8 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w9 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w10 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w11 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w12 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w13 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - auto w14 = static_cast(util::SafeLoadAs(in)); - w14 = bit_util::FromLittleEndian(w14); - in += 4; + const auto w0 = LoadInt(in + 0 * 8); + const auto w1 = LoadInt(in + 1 * 8); + const auto w2 = LoadInt(in + 2 * 8); + const auto w3 = LoadInt(in + 3 * 8); + const auto w4 = LoadInt(in + 4 * 8); + const auto w5 = LoadInt(in + 5 * 8); + const auto w6 = LoadInt(in + 6 * 8); + const auto w7 = LoadInt(in + 7 * 8); + const auto w8 = LoadInt(in + 8 * 8); + const auto w9 = LoadInt(in + 9 * 8); + const auto w10 = LoadInt(in + 10 * 8); + const auto w11 = LoadInt(in + 11 * 8); + const auto w12 = LoadInt(in + 12 * 8); + const auto w13 = LoadInt(in + 13 * 8); + const auto w14 = static_cast(LoadInt(in + 14 * 8)); out[0] = (w0) & mask; out[1] = (w0 >> 29) & mask; out[2] = ((w0 >> 58) | (w1 << 6)) & mask; @@ -3851,42 +3118,27 @@ inline const uint8_t* unpack29_64(const uint8_t* in, uint64_t* out){ out[30] = ((w13 >> 38) | (w14 << 26)) & mask; out[31] = (w14 >> 3) & mask; - return in; + return in + (14 * 8 + 4); } inline const uint8_t* unpack30_64(const uint8_t* in, uint64_t* out){ constexpr uint64_t mask = ((uint64_t{1} << 30) - uint64_t{1}); - const auto w0 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w1 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w2 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w3 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w4 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w5 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w6 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w7 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w8 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w9 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w10 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w11 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w12 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w13 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w14 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; + const auto w0 = LoadInt(in + 0 * 8); + const auto w1 = LoadInt(in + 1 * 8); + const auto w2 = LoadInt(in + 2 * 8); + const auto w3 = LoadInt(in + 3 * 8); + const auto w4 = LoadInt(in + 4 * 8); + const auto w5 = LoadInt(in + 5 * 8); + const auto w6 = LoadInt(in + 6 * 8); + const auto w7 = LoadInt(in + 7 * 8); + const auto w8 = LoadInt(in + 8 * 8); + const auto w9 = LoadInt(in + 9 * 8); + const auto w10 = LoadInt(in + 10 * 8); + const auto w11 = LoadInt(in + 11 * 8); + const auto w12 = LoadInt(in + 12 * 8); + const auto w13 = LoadInt(in + 13 * 8); + const auto w14 = LoadInt(in + 14 * 8); out[0] = (w0) & mask; out[1] = (w0 >> 30) & mask; out[2] = ((w0 >> 60) | (w1 << 4)) & mask; @@ -3920,45 +3172,28 @@ inline const uint8_t* unpack30_64(const uint8_t* in, uint64_t* out){ out[30] = (w14 >> 4) & mask; out[31] = w14 >> 34; - return in; + return in + (15 * 8); } inline const uint8_t* unpack31_64(const uint8_t* in, uint64_t* out){ constexpr uint64_t mask = ((uint64_t{1} << 31) - uint64_t{1}); - const auto w0 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w1 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w2 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w3 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w4 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w5 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w6 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w7 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w8 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w9 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w10 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w11 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w12 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w13 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w14 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - auto w15 = static_cast(util::SafeLoadAs(in)); - w15 = bit_util::FromLittleEndian(w15); - in += 4; + const auto w0 = LoadInt(in + 0 * 8); + const auto w1 = LoadInt(in + 1 * 8); + const auto w2 = LoadInt(in + 2 * 8); + const auto w3 = LoadInt(in + 3 * 8); + const auto w4 = LoadInt(in + 4 * 8); + const auto w5 = LoadInt(in + 5 * 8); + const auto w6 = LoadInt(in + 6 * 8); + const auto w7 = LoadInt(in + 7 * 8); + const auto w8 = LoadInt(in + 8 * 8); + const auto w9 = LoadInt(in + 9 * 8); + const auto w10 = LoadInt(in + 10 * 8); + const auto w11 = LoadInt(in + 11 * 8); + const auto w12 = LoadInt(in + 12 * 8); + const auto w13 = LoadInt(in + 13 * 8); + const auto w14 = LoadInt(in + 14 * 8); + const auto w15 = static_cast(LoadInt(in + 15 * 8)); out[0] = (w0) & mask; out[1] = (w0 >> 31) & mask; out[2] = ((w0 >> 62) | (w1 << 2)) & mask; @@ -3992,44 +3227,28 @@ inline const uint8_t* unpack31_64(const uint8_t* in, uint64_t* out){ out[30] = ((w14 >> 34) | (w15 << 30)) & mask; out[31] = (w15 >> 1) & mask; - return in; + return in + (15 * 8 + 4); } inline const uint8_t* unpack32_64(const uint8_t* in, uint64_t* out){ constexpr uint64_t mask = ((uint64_t{1} << 32) - uint64_t{1}); - const auto w0 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w1 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w2 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w3 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w4 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w5 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w6 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w7 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w8 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w9 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w10 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w11 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w12 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w13 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w14 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w15 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; + const auto w0 = LoadInt(in + 0 * 8); + const auto w1 = LoadInt(in + 1 * 8); + const auto w2 = LoadInt(in + 2 * 8); + const auto w3 = LoadInt(in + 3 * 8); + const auto w4 = LoadInt(in + 4 * 8); + const auto w5 = LoadInt(in + 5 * 8); + const auto w6 = LoadInt(in + 6 * 8); + const auto w7 = LoadInt(in + 7 * 8); + const auto w8 = LoadInt(in + 8 * 8); + const auto w9 = LoadInt(in + 9 * 8); + const auto w10 = LoadInt(in + 10 * 8); + const auto w11 = LoadInt(in + 11 * 8); + const auto w12 = LoadInt(in + 12 * 8); + const auto w13 = LoadInt(in + 13 * 8); + const auto w14 = LoadInt(in + 14 * 8); + const auto w15 = LoadInt(in + 15 * 8); out[0] = (w0) & mask; out[1] = w0 >> 32; out[2] = (w1) & mask; @@ -4063,47 +3282,29 @@ inline const uint8_t* unpack32_64(const uint8_t* in, uint64_t* out){ out[30] = (w15) & mask; out[31] = w15 >> 32; - return in; + return in + (16 * 8); } inline const uint8_t* unpack33_64(const uint8_t* in, uint64_t* out){ constexpr uint64_t mask = ((uint64_t{1} << 33) - uint64_t{1}); - const auto w0 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w1 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w2 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w3 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w4 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w5 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w6 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w7 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w8 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w9 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w10 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w11 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w12 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w13 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w14 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w15 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - auto w16 = static_cast(util::SafeLoadAs(in)); - w16 = bit_util::FromLittleEndian(w16); - in += 4; + const auto w0 = LoadInt(in + 0 * 8); + const auto w1 = LoadInt(in + 1 * 8); + const auto w2 = LoadInt(in + 2 * 8); + const auto w3 = LoadInt(in + 3 * 8); + const auto w4 = LoadInt(in + 4 * 8); + const auto w5 = LoadInt(in + 5 * 8); + const auto w6 = LoadInt(in + 6 * 8); + const auto w7 = LoadInt(in + 7 * 8); + const auto w8 = LoadInt(in + 8 * 8); + const auto w9 = LoadInt(in + 9 * 8); + const auto w10 = LoadInt(in + 10 * 8); + const auto w11 = LoadInt(in + 11 * 8); + const auto w12 = LoadInt(in + 12 * 8); + const auto w13 = LoadInt(in + 13 * 8); + const auto w14 = LoadInt(in + 14 * 8); + const auto w15 = LoadInt(in + 15 * 8); + const auto w16 = static_cast(LoadInt(in + 16 * 8)); out[0] = (w0) & mask; out[1] = ((w0 >> 33) | (w1 << 31)) & mask; out[2] = (w1 >> 2) & mask; @@ -4137,46 +3338,29 @@ inline const uint8_t* unpack33_64(const uint8_t* in, uint64_t* out){ out[30] = (w15 >> 30) & mask; out[31] = ((w15 >> 63) | (w16 << 1)) & mask; - return in; + return in + (16 * 8 + 4); } inline const uint8_t* unpack34_64(const uint8_t* in, uint64_t* out){ constexpr uint64_t mask = ((uint64_t{1} << 34) - uint64_t{1}); - const auto w0 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w1 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w2 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w3 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w4 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w5 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w6 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w7 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w8 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w9 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w10 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w11 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w12 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w13 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w14 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w15 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w16 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; + const auto w0 = LoadInt(in + 0 * 8); + const auto w1 = LoadInt(in + 1 * 8); + const auto w2 = LoadInt(in + 2 * 8); + const auto w3 = LoadInt(in + 3 * 8); + const auto w4 = LoadInt(in + 4 * 8); + const auto w5 = LoadInt(in + 5 * 8); + const auto w6 = LoadInt(in + 6 * 8); + const auto w7 = LoadInt(in + 7 * 8); + const auto w8 = LoadInt(in + 8 * 8); + const auto w9 = LoadInt(in + 9 * 8); + const auto w10 = LoadInt(in + 10 * 8); + const auto w11 = LoadInt(in + 11 * 8); + const auto w12 = LoadInt(in + 12 * 8); + const auto w13 = LoadInt(in + 13 * 8); + const auto w14 = LoadInt(in + 14 * 8); + const auto w15 = LoadInt(in + 15 * 8); + const auto w16 = LoadInt(in + 16 * 8); out[0] = (w0) & mask; out[1] = ((w0 >> 34) | (w1 << 30)) & mask; out[2] = (w1 >> 4) & mask; @@ -4210,49 +3394,30 @@ inline const uint8_t* unpack34_64(const uint8_t* in, uint64_t* out){ out[30] = ((w15 >> 60) | (w16 << 4)) & mask; out[31] = w16 >> 30; - return in; + return in + (17 * 8); } inline const uint8_t* unpack35_64(const uint8_t* in, uint64_t* out){ constexpr uint64_t mask = ((uint64_t{1} << 35) - uint64_t{1}); - const auto w0 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w1 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w2 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w3 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w4 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w5 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w6 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w7 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w8 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w9 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w10 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w11 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w12 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w13 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w14 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w15 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w16 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - auto w17 = static_cast(util::SafeLoadAs(in)); - w17 = bit_util::FromLittleEndian(w17); - in += 4; + const auto w0 = LoadInt(in + 0 * 8); + const auto w1 = LoadInt(in + 1 * 8); + const auto w2 = LoadInt(in + 2 * 8); + const auto w3 = LoadInt(in + 3 * 8); + const auto w4 = LoadInt(in + 4 * 8); + const auto w5 = LoadInt(in + 5 * 8); + const auto w6 = LoadInt(in + 6 * 8); + const auto w7 = LoadInt(in + 7 * 8); + const auto w8 = LoadInt(in + 8 * 8); + const auto w9 = LoadInt(in + 9 * 8); + const auto w10 = LoadInt(in + 10 * 8); + const auto w11 = LoadInt(in + 11 * 8); + const auto w12 = LoadInt(in + 12 * 8); + const auto w13 = LoadInt(in + 13 * 8); + const auto w14 = LoadInt(in + 14 * 8); + const auto w15 = LoadInt(in + 15 * 8); + const auto w16 = LoadInt(in + 16 * 8); + const auto w17 = static_cast(LoadInt(in + 17 * 8)); out[0] = (w0) & mask; out[1] = ((w0 >> 35) | (w1 << 29)) & mask; out[2] = (w1 >> 6) & mask; @@ -4286,48 +3451,30 @@ inline const uint8_t* unpack35_64(const uint8_t* in, uint64_t* out){ out[30] = (w16 >> 26) & mask; out[31] = ((w16 >> 61) | (w17 << 3)) & mask; - return in; + return in + (17 * 8 + 4); } inline const uint8_t* unpack36_64(const uint8_t* in, uint64_t* out){ constexpr uint64_t mask = ((uint64_t{1} << 36) - uint64_t{1}); - const auto w0 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w1 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w2 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w3 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w4 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w5 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w6 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w7 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w8 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w9 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w10 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w11 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w12 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w13 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w14 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w15 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w16 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w17 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; + const auto w0 = LoadInt(in + 0 * 8); + const auto w1 = LoadInt(in + 1 * 8); + const auto w2 = LoadInt(in + 2 * 8); + const auto w3 = LoadInt(in + 3 * 8); + const auto w4 = LoadInt(in + 4 * 8); + const auto w5 = LoadInt(in + 5 * 8); + const auto w6 = LoadInt(in + 6 * 8); + const auto w7 = LoadInt(in + 7 * 8); + const auto w8 = LoadInt(in + 8 * 8); + const auto w9 = LoadInt(in + 9 * 8); + const auto w10 = LoadInt(in + 10 * 8); + const auto w11 = LoadInt(in + 11 * 8); + const auto w12 = LoadInt(in + 12 * 8); + const auto w13 = LoadInt(in + 13 * 8); + const auto w14 = LoadInt(in + 14 * 8); + const auto w15 = LoadInt(in + 15 * 8); + const auto w16 = LoadInt(in + 16 * 8); + const auto w17 = LoadInt(in + 17 * 8); out[0] = (w0) & mask; out[1] = ((w0 >> 36) | (w1 << 28)) & mask; out[2] = (w1 >> 8) & mask; @@ -4361,51 +3508,31 @@ inline const uint8_t* unpack36_64(const uint8_t* in, uint64_t* out){ out[30] = ((w16 >> 56) | (w17 << 8)) & mask; out[31] = w17 >> 28; - return in; + return in + (18 * 8); } inline const uint8_t* unpack37_64(const uint8_t* in, uint64_t* out){ constexpr uint64_t mask = ((uint64_t{1} << 37) - uint64_t{1}); - const auto w0 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w1 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w2 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w3 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w4 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w5 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w6 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w7 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w8 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w9 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w10 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w11 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w12 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w13 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w14 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w15 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w16 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w17 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - auto w18 = static_cast(util::SafeLoadAs(in)); - w18 = bit_util::FromLittleEndian(w18); - in += 4; + const auto w0 = LoadInt(in + 0 * 8); + const auto w1 = LoadInt(in + 1 * 8); + const auto w2 = LoadInt(in + 2 * 8); + const auto w3 = LoadInt(in + 3 * 8); + const auto w4 = LoadInt(in + 4 * 8); + const auto w5 = LoadInt(in + 5 * 8); + const auto w6 = LoadInt(in + 6 * 8); + const auto w7 = LoadInt(in + 7 * 8); + const auto w8 = LoadInt(in + 8 * 8); + const auto w9 = LoadInt(in + 9 * 8); + const auto w10 = LoadInt(in + 10 * 8); + const auto w11 = LoadInt(in + 11 * 8); + const auto w12 = LoadInt(in + 12 * 8); + const auto w13 = LoadInt(in + 13 * 8); + const auto w14 = LoadInt(in + 14 * 8); + const auto w15 = LoadInt(in + 15 * 8); + const auto w16 = LoadInt(in + 16 * 8); + const auto w17 = LoadInt(in + 17 * 8); + const auto w18 = static_cast(LoadInt(in + 18 * 8)); out[0] = (w0) & mask; out[1] = ((w0 >> 37) | (w1 << 27)) & mask; out[2] = (w1 >> 10) & mask; @@ -4439,50 +3566,31 @@ inline const uint8_t* unpack37_64(const uint8_t* in, uint64_t* out){ out[30] = (w17 >> 22) & mask; out[31] = ((w17 >> 59) | (w18 << 5)) & mask; - return in; + return in + (18 * 8 + 4); } inline const uint8_t* unpack38_64(const uint8_t* in, uint64_t* out){ constexpr uint64_t mask = ((uint64_t{1} << 38) - uint64_t{1}); - const auto w0 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w1 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w2 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w3 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w4 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w5 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w6 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w7 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w8 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w9 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w10 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w11 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w12 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w13 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w14 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w15 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w16 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w17 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w18 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; + const auto w0 = LoadInt(in + 0 * 8); + const auto w1 = LoadInt(in + 1 * 8); + const auto w2 = LoadInt(in + 2 * 8); + const auto w3 = LoadInt(in + 3 * 8); + const auto w4 = LoadInt(in + 4 * 8); + const auto w5 = LoadInt(in + 5 * 8); + const auto w6 = LoadInt(in + 6 * 8); + const auto w7 = LoadInt(in + 7 * 8); + const auto w8 = LoadInt(in + 8 * 8); + const auto w9 = LoadInt(in + 9 * 8); + const auto w10 = LoadInt(in + 10 * 8); + const auto w11 = LoadInt(in + 11 * 8); + const auto w12 = LoadInt(in + 12 * 8); + const auto w13 = LoadInt(in + 13 * 8); + const auto w14 = LoadInt(in + 14 * 8); + const auto w15 = LoadInt(in + 15 * 8); + const auto w16 = LoadInt(in + 16 * 8); + const auto w17 = LoadInt(in + 17 * 8); + const auto w18 = LoadInt(in + 18 * 8); out[0] = (w0) & mask; out[1] = ((w0 >> 38) | (w1 << 26)) & mask; out[2] = (w1 >> 12) & mask; @@ -4516,53 +3624,32 @@ inline const uint8_t* unpack38_64(const uint8_t* in, uint64_t* out){ out[30] = ((w17 >> 52) | (w18 << 12)) & mask; out[31] = w18 >> 26; - return in; + return in + (19 * 8); } inline const uint8_t* unpack39_64(const uint8_t* in, uint64_t* out){ constexpr uint64_t mask = ((uint64_t{1} << 39) - uint64_t{1}); - const auto w0 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w1 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w2 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w3 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w4 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w5 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w6 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w7 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w8 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w9 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w10 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w11 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w12 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w13 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w14 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w15 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w16 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w17 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w18 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - auto w19 = static_cast(util::SafeLoadAs(in)); - w19 = bit_util::FromLittleEndian(w19); - in += 4; + const auto w0 = LoadInt(in + 0 * 8); + const auto w1 = LoadInt(in + 1 * 8); + const auto w2 = LoadInt(in + 2 * 8); + const auto w3 = LoadInt(in + 3 * 8); + const auto w4 = LoadInt(in + 4 * 8); + const auto w5 = LoadInt(in + 5 * 8); + const auto w6 = LoadInt(in + 6 * 8); + const auto w7 = LoadInt(in + 7 * 8); + const auto w8 = LoadInt(in + 8 * 8); + const auto w9 = LoadInt(in + 9 * 8); + const auto w10 = LoadInt(in + 10 * 8); + const auto w11 = LoadInt(in + 11 * 8); + const auto w12 = LoadInt(in + 12 * 8); + const auto w13 = LoadInt(in + 13 * 8); + const auto w14 = LoadInt(in + 14 * 8); + const auto w15 = LoadInt(in + 15 * 8); + const auto w16 = LoadInt(in + 16 * 8); + const auto w17 = LoadInt(in + 17 * 8); + const auto w18 = LoadInt(in + 18 * 8); + const auto w19 = static_cast(LoadInt(in + 19 * 8)); out[0] = (w0) & mask; out[1] = ((w0 >> 39) | (w1 << 25)) & mask; out[2] = (w1 >> 14) & mask; @@ -4596,52 +3683,32 @@ inline const uint8_t* unpack39_64(const uint8_t* in, uint64_t* out){ out[30] = (w18 >> 18) & mask; out[31] = ((w18 >> 57) | (w19 << 7)) & mask; - return in; + return in + (19 * 8 + 4); } inline const uint8_t* unpack40_64(const uint8_t* in, uint64_t* out){ constexpr uint64_t mask = ((uint64_t{1} << 40) - uint64_t{1}); - const auto w0 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w1 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w2 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w3 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w4 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w5 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w6 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w7 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w8 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w9 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w10 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w11 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w12 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w13 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w14 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w15 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w16 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w17 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w18 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w19 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; + const auto w0 = LoadInt(in + 0 * 8); + const auto w1 = LoadInt(in + 1 * 8); + const auto w2 = LoadInt(in + 2 * 8); + const auto w3 = LoadInt(in + 3 * 8); + const auto w4 = LoadInt(in + 4 * 8); + const auto w5 = LoadInt(in + 5 * 8); + const auto w6 = LoadInt(in + 6 * 8); + const auto w7 = LoadInt(in + 7 * 8); + const auto w8 = LoadInt(in + 8 * 8); + const auto w9 = LoadInt(in + 9 * 8); + const auto w10 = LoadInt(in + 10 * 8); + const auto w11 = LoadInt(in + 11 * 8); + const auto w12 = LoadInt(in + 12 * 8); + const auto w13 = LoadInt(in + 13 * 8); + const auto w14 = LoadInt(in + 14 * 8); + const auto w15 = LoadInt(in + 15 * 8); + const auto w16 = LoadInt(in + 16 * 8); + const auto w17 = LoadInt(in + 17 * 8); + const auto w18 = LoadInt(in + 18 * 8); + const auto w19 = LoadInt(in + 19 * 8); out[0] = (w0) & mask; out[1] = ((w0 >> 40) | (w1 << 24)) & mask; out[2] = (w1 >> 16) & mask; @@ -4675,55 +3742,33 @@ inline const uint8_t* unpack40_64(const uint8_t* in, uint64_t* out){ out[30] = ((w18 >> 48) | (w19 << 16)) & mask; out[31] = w19 >> 24; - return in; + return in + (20 * 8); } inline const uint8_t* unpack41_64(const uint8_t* in, uint64_t* out){ constexpr uint64_t mask = ((uint64_t{1} << 41) - uint64_t{1}); - const auto w0 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w1 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w2 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w3 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w4 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w5 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w6 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w7 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w8 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w9 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w10 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w11 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w12 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w13 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w14 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w15 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w16 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w17 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w18 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w19 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - auto w20 = static_cast(util::SafeLoadAs(in)); - w20 = bit_util::FromLittleEndian(w20); - in += 4; + const auto w0 = LoadInt(in + 0 * 8); + const auto w1 = LoadInt(in + 1 * 8); + const auto w2 = LoadInt(in + 2 * 8); + const auto w3 = LoadInt(in + 3 * 8); + const auto w4 = LoadInt(in + 4 * 8); + const auto w5 = LoadInt(in + 5 * 8); + const auto w6 = LoadInt(in + 6 * 8); + const auto w7 = LoadInt(in + 7 * 8); + const auto w8 = LoadInt(in + 8 * 8); + const auto w9 = LoadInt(in + 9 * 8); + const auto w10 = LoadInt(in + 10 * 8); + const auto w11 = LoadInt(in + 11 * 8); + const auto w12 = LoadInt(in + 12 * 8); + const auto w13 = LoadInt(in + 13 * 8); + const auto w14 = LoadInt(in + 14 * 8); + const auto w15 = LoadInt(in + 15 * 8); + const auto w16 = LoadInt(in + 16 * 8); + const auto w17 = LoadInt(in + 17 * 8); + const auto w18 = LoadInt(in + 18 * 8); + const auto w19 = LoadInt(in + 19 * 8); + const auto w20 = static_cast(LoadInt(in + 20 * 8)); out[0] = (w0) & mask; out[1] = ((w0 >> 41) | (w1 << 23)) & mask; out[2] = (w1 >> 18) & mask; @@ -4757,54 +3802,33 @@ inline const uint8_t* unpack41_64(const uint8_t* in, uint64_t* out){ out[30] = (w19 >> 14) & mask; out[31] = ((w19 >> 55) | (w20 << 9)) & mask; - return in; + return in + (20 * 8 + 4); } inline const uint8_t* unpack42_64(const uint8_t* in, uint64_t* out){ constexpr uint64_t mask = ((uint64_t{1} << 42) - uint64_t{1}); - const auto w0 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w1 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w2 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w3 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w4 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w5 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w6 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w7 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w8 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w9 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w10 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w11 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w12 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w13 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w14 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w15 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w16 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w17 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w18 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w19 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w20 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; + const auto w0 = LoadInt(in + 0 * 8); + const auto w1 = LoadInt(in + 1 * 8); + const auto w2 = LoadInt(in + 2 * 8); + const auto w3 = LoadInt(in + 3 * 8); + const auto w4 = LoadInt(in + 4 * 8); + const auto w5 = LoadInt(in + 5 * 8); + const auto w6 = LoadInt(in + 6 * 8); + const auto w7 = LoadInt(in + 7 * 8); + const auto w8 = LoadInt(in + 8 * 8); + const auto w9 = LoadInt(in + 9 * 8); + const auto w10 = LoadInt(in + 10 * 8); + const auto w11 = LoadInt(in + 11 * 8); + const auto w12 = LoadInt(in + 12 * 8); + const auto w13 = LoadInt(in + 13 * 8); + const auto w14 = LoadInt(in + 14 * 8); + const auto w15 = LoadInt(in + 15 * 8); + const auto w16 = LoadInt(in + 16 * 8); + const auto w17 = LoadInt(in + 17 * 8); + const auto w18 = LoadInt(in + 18 * 8); + const auto w19 = LoadInt(in + 19 * 8); + const auto w20 = LoadInt(in + 20 * 8); out[0] = (w0) & mask; out[1] = ((w0 >> 42) | (w1 << 22)) & mask; out[2] = (w1 >> 20) & mask; @@ -4838,57 +3862,34 @@ inline const uint8_t* unpack42_64(const uint8_t* in, uint64_t* out){ out[30] = ((w19 >> 44) | (w20 << 20)) & mask; out[31] = w20 >> 22; - return in; + return in + (21 * 8); } inline const uint8_t* unpack43_64(const uint8_t* in, uint64_t* out){ constexpr uint64_t mask = ((uint64_t{1} << 43) - uint64_t{1}); - const auto w0 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w1 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w2 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w3 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w4 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w5 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w6 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w7 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w8 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w9 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w10 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w11 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w12 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w13 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w14 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w15 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w16 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w17 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w18 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w19 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w20 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - auto w21 = static_cast(util::SafeLoadAs(in)); - w21 = bit_util::FromLittleEndian(w21); - in += 4; + const auto w0 = LoadInt(in + 0 * 8); + const auto w1 = LoadInt(in + 1 * 8); + const auto w2 = LoadInt(in + 2 * 8); + const auto w3 = LoadInt(in + 3 * 8); + const auto w4 = LoadInt(in + 4 * 8); + const auto w5 = LoadInt(in + 5 * 8); + const auto w6 = LoadInt(in + 6 * 8); + const auto w7 = LoadInt(in + 7 * 8); + const auto w8 = LoadInt(in + 8 * 8); + const auto w9 = LoadInt(in + 9 * 8); + const auto w10 = LoadInt(in + 10 * 8); + const auto w11 = LoadInt(in + 11 * 8); + const auto w12 = LoadInt(in + 12 * 8); + const auto w13 = LoadInt(in + 13 * 8); + const auto w14 = LoadInt(in + 14 * 8); + const auto w15 = LoadInt(in + 15 * 8); + const auto w16 = LoadInt(in + 16 * 8); + const auto w17 = LoadInt(in + 17 * 8); + const auto w18 = LoadInt(in + 18 * 8); + const auto w19 = LoadInt(in + 19 * 8); + const auto w20 = LoadInt(in + 20 * 8); + const auto w21 = static_cast(LoadInt(in + 21 * 8)); out[0] = (w0) & mask; out[1] = ((w0 >> 43) | (w1 << 21)) & mask; out[2] = ((w1 >> 22) | (w2 << 42)) & mask; @@ -4922,56 +3923,34 @@ inline const uint8_t* unpack43_64(const uint8_t* in, uint64_t* out){ out[30] = (w20 >> 10) & mask; out[31] = ((w20 >> 53) | (w21 << 11)) & mask; - return in; + return in + (21 * 8 + 4); } inline const uint8_t* unpack44_64(const uint8_t* in, uint64_t* out){ constexpr uint64_t mask = ((uint64_t{1} << 44) - uint64_t{1}); - const auto w0 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w1 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w2 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w3 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w4 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w5 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w6 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w7 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w8 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w9 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w10 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w11 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w12 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w13 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w14 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w15 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w16 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w17 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w18 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w19 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w20 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w21 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; + const auto w0 = LoadInt(in + 0 * 8); + const auto w1 = LoadInt(in + 1 * 8); + const auto w2 = LoadInt(in + 2 * 8); + const auto w3 = LoadInt(in + 3 * 8); + const auto w4 = LoadInt(in + 4 * 8); + const auto w5 = LoadInt(in + 5 * 8); + const auto w6 = LoadInt(in + 6 * 8); + const auto w7 = LoadInt(in + 7 * 8); + const auto w8 = LoadInt(in + 8 * 8); + const auto w9 = LoadInt(in + 9 * 8); + const auto w10 = LoadInt(in + 10 * 8); + const auto w11 = LoadInt(in + 11 * 8); + const auto w12 = LoadInt(in + 12 * 8); + const auto w13 = LoadInt(in + 13 * 8); + const auto w14 = LoadInt(in + 14 * 8); + const auto w15 = LoadInt(in + 15 * 8); + const auto w16 = LoadInt(in + 16 * 8); + const auto w17 = LoadInt(in + 17 * 8); + const auto w18 = LoadInt(in + 18 * 8); + const auto w19 = LoadInt(in + 19 * 8); + const auto w20 = LoadInt(in + 20 * 8); + const auto w21 = LoadInt(in + 21 * 8); out[0] = (w0) & mask; out[1] = ((w0 >> 44) | (w1 << 20)) & mask; out[2] = ((w1 >> 24) | (w2 << 40)) & mask; @@ -5005,59 +3984,35 @@ inline const uint8_t* unpack44_64(const uint8_t* in, uint64_t* out){ out[30] = ((w20 >> 40) | (w21 << 24)) & mask; out[31] = w21 >> 20; - return in; + return in + (22 * 8); } inline const uint8_t* unpack45_64(const uint8_t* in, uint64_t* out){ constexpr uint64_t mask = ((uint64_t{1} << 45) - uint64_t{1}); - const auto w0 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w1 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w2 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w3 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w4 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w5 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w6 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w7 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w8 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w9 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w10 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w11 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w12 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w13 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w14 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w15 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w16 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w17 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w18 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w19 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w20 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w21 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - auto w22 = static_cast(util::SafeLoadAs(in)); - w22 = bit_util::FromLittleEndian(w22); - in += 4; + const auto w0 = LoadInt(in + 0 * 8); + const auto w1 = LoadInt(in + 1 * 8); + const auto w2 = LoadInt(in + 2 * 8); + const auto w3 = LoadInt(in + 3 * 8); + const auto w4 = LoadInt(in + 4 * 8); + const auto w5 = LoadInt(in + 5 * 8); + const auto w6 = LoadInt(in + 6 * 8); + const auto w7 = LoadInt(in + 7 * 8); + const auto w8 = LoadInt(in + 8 * 8); + const auto w9 = LoadInt(in + 9 * 8); + const auto w10 = LoadInt(in + 10 * 8); + const auto w11 = LoadInt(in + 11 * 8); + const auto w12 = LoadInt(in + 12 * 8); + const auto w13 = LoadInt(in + 13 * 8); + const auto w14 = LoadInt(in + 14 * 8); + const auto w15 = LoadInt(in + 15 * 8); + const auto w16 = LoadInt(in + 16 * 8); + const auto w17 = LoadInt(in + 17 * 8); + const auto w18 = LoadInt(in + 18 * 8); + const auto w19 = LoadInt(in + 19 * 8); + const auto w20 = LoadInt(in + 20 * 8); + const auto w21 = LoadInt(in + 21 * 8); + const auto w22 = static_cast(LoadInt(in + 22 * 8)); out[0] = (w0) & mask; out[1] = ((w0 >> 45) | (w1 << 19)) & mask; out[2] = ((w1 >> 26) | (w2 << 38)) & mask; @@ -5091,58 +4046,35 @@ inline const uint8_t* unpack45_64(const uint8_t* in, uint64_t* out){ out[30] = (w21 >> 6) & mask; out[31] = ((w21 >> 51) | (w22 << 13)) & mask; - return in; + return in + (22 * 8 + 4); } inline const uint8_t* unpack46_64(const uint8_t* in, uint64_t* out){ constexpr uint64_t mask = ((uint64_t{1} << 46) - uint64_t{1}); - const auto w0 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w1 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w2 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w3 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w4 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w5 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w6 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w7 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w8 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w9 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w10 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w11 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w12 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w13 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w14 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w15 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w16 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w17 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w18 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w19 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w20 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w21 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w22 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; + const auto w0 = LoadInt(in + 0 * 8); + const auto w1 = LoadInt(in + 1 * 8); + const auto w2 = LoadInt(in + 2 * 8); + const auto w3 = LoadInt(in + 3 * 8); + const auto w4 = LoadInt(in + 4 * 8); + const auto w5 = LoadInt(in + 5 * 8); + const auto w6 = LoadInt(in + 6 * 8); + const auto w7 = LoadInt(in + 7 * 8); + const auto w8 = LoadInt(in + 8 * 8); + const auto w9 = LoadInt(in + 9 * 8); + const auto w10 = LoadInt(in + 10 * 8); + const auto w11 = LoadInt(in + 11 * 8); + const auto w12 = LoadInt(in + 12 * 8); + const auto w13 = LoadInt(in + 13 * 8); + const auto w14 = LoadInt(in + 14 * 8); + const auto w15 = LoadInt(in + 15 * 8); + const auto w16 = LoadInt(in + 16 * 8); + const auto w17 = LoadInt(in + 17 * 8); + const auto w18 = LoadInt(in + 18 * 8); + const auto w19 = LoadInt(in + 19 * 8); + const auto w20 = LoadInt(in + 20 * 8); + const auto w21 = LoadInt(in + 21 * 8); + const auto w22 = LoadInt(in + 22 * 8); out[0] = (w0) & mask; out[1] = ((w0 >> 46) | (w1 << 18)) & mask; out[2] = ((w1 >> 28) | (w2 << 36)) & mask; @@ -5176,61 +4108,36 @@ inline const uint8_t* unpack46_64(const uint8_t* in, uint64_t* out){ out[30] = ((w21 >> 36) | (w22 << 28)) & mask; out[31] = w22 >> 18; - return in; + return in + (23 * 8); } inline const uint8_t* unpack47_64(const uint8_t* in, uint64_t* out){ constexpr uint64_t mask = ((uint64_t{1} << 47) - uint64_t{1}); - const auto w0 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w1 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w2 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w3 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w4 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w5 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w6 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w7 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w8 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w9 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w10 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w11 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w12 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w13 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w14 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w15 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w16 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w17 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w18 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w19 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w20 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w21 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w22 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - auto w23 = static_cast(util::SafeLoadAs(in)); - w23 = bit_util::FromLittleEndian(w23); - in += 4; + const auto w0 = LoadInt(in + 0 * 8); + const auto w1 = LoadInt(in + 1 * 8); + const auto w2 = LoadInt(in + 2 * 8); + const auto w3 = LoadInt(in + 3 * 8); + const auto w4 = LoadInt(in + 4 * 8); + const auto w5 = LoadInt(in + 5 * 8); + const auto w6 = LoadInt(in + 6 * 8); + const auto w7 = LoadInt(in + 7 * 8); + const auto w8 = LoadInt(in + 8 * 8); + const auto w9 = LoadInt(in + 9 * 8); + const auto w10 = LoadInt(in + 10 * 8); + const auto w11 = LoadInt(in + 11 * 8); + const auto w12 = LoadInt(in + 12 * 8); + const auto w13 = LoadInt(in + 13 * 8); + const auto w14 = LoadInt(in + 14 * 8); + const auto w15 = LoadInt(in + 15 * 8); + const auto w16 = LoadInt(in + 16 * 8); + const auto w17 = LoadInt(in + 17 * 8); + const auto w18 = LoadInt(in + 18 * 8); + const auto w19 = LoadInt(in + 19 * 8); + const auto w20 = LoadInt(in + 20 * 8); + const auto w21 = LoadInt(in + 21 * 8); + const auto w22 = LoadInt(in + 22 * 8); + const auto w23 = static_cast(LoadInt(in + 23 * 8)); out[0] = (w0) & mask; out[1] = ((w0 >> 47) | (w1 << 17)) & mask; out[2] = ((w1 >> 30) | (w2 << 34)) & mask; @@ -5264,60 +4171,36 @@ inline const uint8_t* unpack47_64(const uint8_t* in, uint64_t* out){ out[30] = (w22 >> 2) & mask; out[31] = ((w22 >> 49) | (w23 << 15)) & mask; - return in; + return in + (23 * 8 + 4); } inline const uint8_t* unpack48_64(const uint8_t* in, uint64_t* out){ constexpr uint64_t mask = ((uint64_t{1} << 48) - uint64_t{1}); - const auto w0 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w1 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w2 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w3 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w4 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w5 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w6 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w7 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w8 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w9 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w10 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w11 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w12 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w13 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w14 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w15 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w16 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w17 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w18 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w19 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w20 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w21 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w22 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w23 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; + const auto w0 = LoadInt(in + 0 * 8); + const auto w1 = LoadInt(in + 1 * 8); + const auto w2 = LoadInt(in + 2 * 8); + const auto w3 = LoadInt(in + 3 * 8); + const auto w4 = LoadInt(in + 4 * 8); + const auto w5 = LoadInt(in + 5 * 8); + const auto w6 = LoadInt(in + 6 * 8); + const auto w7 = LoadInt(in + 7 * 8); + const auto w8 = LoadInt(in + 8 * 8); + const auto w9 = LoadInt(in + 9 * 8); + const auto w10 = LoadInt(in + 10 * 8); + const auto w11 = LoadInt(in + 11 * 8); + const auto w12 = LoadInt(in + 12 * 8); + const auto w13 = LoadInt(in + 13 * 8); + const auto w14 = LoadInt(in + 14 * 8); + const auto w15 = LoadInt(in + 15 * 8); + const auto w16 = LoadInt(in + 16 * 8); + const auto w17 = LoadInt(in + 17 * 8); + const auto w18 = LoadInt(in + 18 * 8); + const auto w19 = LoadInt(in + 19 * 8); + const auto w20 = LoadInt(in + 20 * 8); + const auto w21 = LoadInt(in + 21 * 8); + const auto w22 = LoadInt(in + 22 * 8); + const auto w23 = LoadInt(in + 23 * 8); out[0] = (w0) & mask; out[1] = ((w0 >> 48) | (w1 << 16)) & mask; out[2] = ((w1 >> 32) | (w2 << 32)) & mask; @@ -5351,63 +4234,37 @@ inline const uint8_t* unpack48_64(const uint8_t* in, uint64_t* out){ out[30] = ((w22 >> 32) | (w23 << 32)) & mask; out[31] = w23 >> 16; - return in; + return in + (24 * 8); } inline const uint8_t* unpack49_64(const uint8_t* in, uint64_t* out){ constexpr uint64_t mask = ((uint64_t{1} << 49) - uint64_t{1}); - const auto w0 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w1 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w2 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w3 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w4 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w5 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w6 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w7 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w8 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w9 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w10 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w11 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w12 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w13 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w14 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w15 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w16 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w17 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w18 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w19 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w20 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w21 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w22 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w23 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - auto w24 = static_cast(util::SafeLoadAs(in)); - w24 = bit_util::FromLittleEndian(w24); - in += 4; + const auto w0 = LoadInt(in + 0 * 8); + const auto w1 = LoadInt(in + 1 * 8); + const auto w2 = LoadInt(in + 2 * 8); + const auto w3 = LoadInt(in + 3 * 8); + const auto w4 = LoadInt(in + 4 * 8); + const auto w5 = LoadInt(in + 5 * 8); + const auto w6 = LoadInt(in + 6 * 8); + const auto w7 = LoadInt(in + 7 * 8); + const auto w8 = LoadInt(in + 8 * 8); + const auto w9 = LoadInt(in + 9 * 8); + const auto w10 = LoadInt(in + 10 * 8); + const auto w11 = LoadInt(in + 11 * 8); + const auto w12 = LoadInt(in + 12 * 8); + const auto w13 = LoadInt(in + 13 * 8); + const auto w14 = LoadInt(in + 14 * 8); + const auto w15 = LoadInt(in + 15 * 8); + const auto w16 = LoadInt(in + 16 * 8); + const auto w17 = LoadInt(in + 17 * 8); + const auto w18 = LoadInt(in + 18 * 8); + const auto w19 = LoadInt(in + 19 * 8); + const auto w20 = LoadInt(in + 20 * 8); + const auto w21 = LoadInt(in + 21 * 8); + const auto w22 = LoadInt(in + 22 * 8); + const auto w23 = LoadInt(in + 23 * 8); + const auto w24 = static_cast(LoadInt(in + 24 * 8)); out[0] = (w0) & mask; out[1] = ((w0 >> 49) | (w1 << 15)) & mask; out[2] = ((w1 >> 34) | (w2 << 30)) & mask; @@ -5441,62 +4298,37 @@ inline const uint8_t* unpack49_64(const uint8_t* in, uint64_t* out){ out[30] = ((w22 >> 62) | (w23 << 2)) & mask; out[31] = ((w23 >> 47) | (w24 << 17)) & mask; - return in; + return in + (24 * 8 + 4); } inline const uint8_t* unpack50_64(const uint8_t* in, uint64_t* out){ constexpr uint64_t mask = ((uint64_t{1} << 50) - uint64_t{1}); - const auto w0 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w1 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w2 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w3 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w4 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w5 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w6 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w7 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w8 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w9 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w10 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w11 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w12 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w13 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w14 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w15 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w16 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w17 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w18 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w19 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w20 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w21 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w22 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w23 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w24 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; + const auto w0 = LoadInt(in + 0 * 8); + const auto w1 = LoadInt(in + 1 * 8); + const auto w2 = LoadInt(in + 2 * 8); + const auto w3 = LoadInt(in + 3 * 8); + const auto w4 = LoadInt(in + 4 * 8); + const auto w5 = LoadInt(in + 5 * 8); + const auto w6 = LoadInt(in + 6 * 8); + const auto w7 = LoadInt(in + 7 * 8); + const auto w8 = LoadInt(in + 8 * 8); + const auto w9 = LoadInt(in + 9 * 8); + const auto w10 = LoadInt(in + 10 * 8); + const auto w11 = LoadInt(in + 11 * 8); + const auto w12 = LoadInt(in + 12 * 8); + const auto w13 = LoadInt(in + 13 * 8); + const auto w14 = LoadInt(in + 14 * 8); + const auto w15 = LoadInt(in + 15 * 8); + const auto w16 = LoadInt(in + 16 * 8); + const auto w17 = LoadInt(in + 17 * 8); + const auto w18 = LoadInt(in + 18 * 8); + const auto w19 = LoadInt(in + 19 * 8); + const auto w20 = LoadInt(in + 20 * 8); + const auto w21 = LoadInt(in + 21 * 8); + const auto w22 = LoadInt(in + 22 * 8); + const auto w23 = LoadInt(in + 23 * 8); + const auto w24 = LoadInt(in + 24 * 8); out[0] = (w0) & mask; out[1] = ((w0 >> 50) | (w1 << 14)) & mask; out[2] = ((w1 >> 36) | (w2 << 28)) & mask; @@ -5530,65 +4362,38 @@ inline const uint8_t* unpack50_64(const uint8_t* in, uint64_t* out){ out[30] = ((w23 >> 28) | (w24 << 36)) & mask; out[31] = w24 >> 14; - return in; + return in + (25 * 8); } inline const uint8_t* unpack51_64(const uint8_t* in, uint64_t* out){ constexpr uint64_t mask = ((uint64_t{1} << 51) - uint64_t{1}); - const auto w0 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w1 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w2 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w3 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w4 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w5 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w6 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w7 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w8 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w9 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w10 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w11 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w12 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w13 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w14 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w15 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w16 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w17 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w18 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w19 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w20 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w21 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w22 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w23 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w24 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - auto w25 = static_cast(util::SafeLoadAs(in)); - w25 = bit_util::FromLittleEndian(w25); - in += 4; + const auto w0 = LoadInt(in + 0 * 8); + const auto w1 = LoadInt(in + 1 * 8); + const auto w2 = LoadInt(in + 2 * 8); + const auto w3 = LoadInt(in + 3 * 8); + const auto w4 = LoadInt(in + 4 * 8); + const auto w5 = LoadInt(in + 5 * 8); + const auto w6 = LoadInt(in + 6 * 8); + const auto w7 = LoadInt(in + 7 * 8); + const auto w8 = LoadInt(in + 8 * 8); + const auto w9 = LoadInt(in + 9 * 8); + const auto w10 = LoadInt(in + 10 * 8); + const auto w11 = LoadInt(in + 11 * 8); + const auto w12 = LoadInt(in + 12 * 8); + const auto w13 = LoadInt(in + 13 * 8); + const auto w14 = LoadInt(in + 14 * 8); + const auto w15 = LoadInt(in + 15 * 8); + const auto w16 = LoadInt(in + 16 * 8); + const auto w17 = LoadInt(in + 17 * 8); + const auto w18 = LoadInt(in + 18 * 8); + const auto w19 = LoadInt(in + 19 * 8); + const auto w20 = LoadInt(in + 20 * 8); + const auto w21 = LoadInt(in + 21 * 8); + const auto w22 = LoadInt(in + 22 * 8); + const auto w23 = LoadInt(in + 23 * 8); + const auto w24 = LoadInt(in + 24 * 8); + const auto w25 = static_cast(LoadInt(in + 25 * 8)); out[0] = (w0) & mask; out[1] = ((w0 >> 51) | (w1 << 13)) & mask; out[2] = ((w1 >> 38) | (w2 << 26)) & mask; @@ -5622,64 +4427,38 @@ inline const uint8_t* unpack51_64(const uint8_t* in, uint64_t* out){ out[30] = ((w23 >> 58) | (w24 << 6)) & mask; out[31] = ((w24 >> 45) | (w25 << 19)) & mask; - return in; + return in + (25 * 8 + 4); } inline const uint8_t* unpack52_64(const uint8_t* in, uint64_t* out){ constexpr uint64_t mask = ((uint64_t{1} << 52) - uint64_t{1}); - const auto w0 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w1 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w2 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w3 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w4 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w5 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w6 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w7 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w8 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w9 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w10 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w11 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w12 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w13 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w14 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w15 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w16 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w17 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w18 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w19 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w20 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w21 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w22 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w23 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w24 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w25 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; + const auto w0 = LoadInt(in + 0 * 8); + const auto w1 = LoadInt(in + 1 * 8); + const auto w2 = LoadInt(in + 2 * 8); + const auto w3 = LoadInt(in + 3 * 8); + const auto w4 = LoadInt(in + 4 * 8); + const auto w5 = LoadInt(in + 5 * 8); + const auto w6 = LoadInt(in + 6 * 8); + const auto w7 = LoadInt(in + 7 * 8); + const auto w8 = LoadInt(in + 8 * 8); + const auto w9 = LoadInt(in + 9 * 8); + const auto w10 = LoadInt(in + 10 * 8); + const auto w11 = LoadInt(in + 11 * 8); + const auto w12 = LoadInt(in + 12 * 8); + const auto w13 = LoadInt(in + 13 * 8); + const auto w14 = LoadInt(in + 14 * 8); + const auto w15 = LoadInt(in + 15 * 8); + const auto w16 = LoadInt(in + 16 * 8); + const auto w17 = LoadInt(in + 17 * 8); + const auto w18 = LoadInt(in + 18 * 8); + const auto w19 = LoadInt(in + 19 * 8); + const auto w20 = LoadInt(in + 20 * 8); + const auto w21 = LoadInt(in + 21 * 8); + const auto w22 = LoadInt(in + 22 * 8); + const auto w23 = LoadInt(in + 23 * 8); + const auto w24 = LoadInt(in + 24 * 8); + const auto w25 = LoadInt(in + 25 * 8); out[0] = (w0) & mask; out[1] = ((w0 >> 52) | (w1 << 12)) & mask; out[2] = ((w1 >> 40) | (w2 << 24)) & mask; @@ -5713,67 +4492,39 @@ inline const uint8_t* unpack52_64(const uint8_t* in, uint64_t* out){ out[30] = ((w24 >> 24) | (w25 << 40)) & mask; out[31] = w25 >> 12; - return in; + return in + (26 * 8); } inline const uint8_t* unpack53_64(const uint8_t* in, uint64_t* out){ constexpr uint64_t mask = ((uint64_t{1} << 53) - uint64_t{1}); - const auto w0 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w1 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w2 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w3 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w4 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w5 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w6 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w7 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w8 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w9 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w10 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w11 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w12 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w13 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w14 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w15 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w16 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w17 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w18 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w19 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w20 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w21 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w22 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w23 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w24 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w25 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - auto w26 = static_cast(util::SafeLoadAs(in)); - w26 = bit_util::FromLittleEndian(w26); - in += 4; + const auto w0 = LoadInt(in + 0 * 8); + const auto w1 = LoadInt(in + 1 * 8); + const auto w2 = LoadInt(in + 2 * 8); + const auto w3 = LoadInt(in + 3 * 8); + const auto w4 = LoadInt(in + 4 * 8); + const auto w5 = LoadInt(in + 5 * 8); + const auto w6 = LoadInt(in + 6 * 8); + const auto w7 = LoadInt(in + 7 * 8); + const auto w8 = LoadInt(in + 8 * 8); + const auto w9 = LoadInt(in + 9 * 8); + const auto w10 = LoadInt(in + 10 * 8); + const auto w11 = LoadInt(in + 11 * 8); + const auto w12 = LoadInt(in + 12 * 8); + const auto w13 = LoadInt(in + 13 * 8); + const auto w14 = LoadInt(in + 14 * 8); + const auto w15 = LoadInt(in + 15 * 8); + const auto w16 = LoadInt(in + 16 * 8); + const auto w17 = LoadInt(in + 17 * 8); + const auto w18 = LoadInt(in + 18 * 8); + const auto w19 = LoadInt(in + 19 * 8); + const auto w20 = LoadInt(in + 20 * 8); + const auto w21 = LoadInt(in + 21 * 8); + const auto w22 = LoadInt(in + 22 * 8); + const auto w23 = LoadInt(in + 23 * 8); + const auto w24 = LoadInt(in + 24 * 8); + const auto w25 = LoadInt(in + 25 * 8); + const auto w26 = static_cast(LoadInt(in + 26 * 8)); out[0] = (w0) & mask; out[1] = ((w0 >> 53) | (w1 << 11)) & mask; out[2] = ((w1 >> 42) | (w2 << 22)) & mask; @@ -5807,66 +4558,39 @@ inline const uint8_t* unpack53_64(const uint8_t* in, uint64_t* out){ out[30] = ((w24 >> 54) | (w25 << 10)) & mask; out[31] = ((w25 >> 43) | (w26 << 21)) & mask; - return in; + return in + (26 * 8 + 4); } inline const uint8_t* unpack54_64(const uint8_t* in, uint64_t* out){ constexpr uint64_t mask = ((uint64_t{1} << 54) - uint64_t{1}); - const auto w0 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w1 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w2 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w3 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w4 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w5 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w6 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w7 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w8 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w9 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w10 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w11 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w12 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w13 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w14 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w15 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w16 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w17 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w18 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w19 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w20 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w21 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w22 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w23 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w24 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w25 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w26 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; + const auto w0 = LoadInt(in + 0 * 8); + const auto w1 = LoadInt(in + 1 * 8); + const auto w2 = LoadInt(in + 2 * 8); + const auto w3 = LoadInt(in + 3 * 8); + const auto w4 = LoadInt(in + 4 * 8); + const auto w5 = LoadInt(in + 5 * 8); + const auto w6 = LoadInt(in + 6 * 8); + const auto w7 = LoadInt(in + 7 * 8); + const auto w8 = LoadInt(in + 8 * 8); + const auto w9 = LoadInt(in + 9 * 8); + const auto w10 = LoadInt(in + 10 * 8); + const auto w11 = LoadInt(in + 11 * 8); + const auto w12 = LoadInt(in + 12 * 8); + const auto w13 = LoadInt(in + 13 * 8); + const auto w14 = LoadInt(in + 14 * 8); + const auto w15 = LoadInt(in + 15 * 8); + const auto w16 = LoadInt(in + 16 * 8); + const auto w17 = LoadInt(in + 17 * 8); + const auto w18 = LoadInt(in + 18 * 8); + const auto w19 = LoadInt(in + 19 * 8); + const auto w20 = LoadInt(in + 20 * 8); + const auto w21 = LoadInt(in + 21 * 8); + const auto w22 = LoadInt(in + 22 * 8); + const auto w23 = LoadInt(in + 23 * 8); + const auto w24 = LoadInt(in + 24 * 8); + const auto w25 = LoadInt(in + 25 * 8); + const auto w26 = LoadInt(in + 26 * 8); out[0] = (w0) & mask; out[1] = ((w0 >> 54) | (w1 << 10)) & mask; out[2] = ((w1 >> 44) | (w2 << 20)) & mask; @@ -5900,69 +4624,40 @@ inline const uint8_t* unpack54_64(const uint8_t* in, uint64_t* out){ out[30] = ((w25 >> 20) | (w26 << 44)) & mask; out[31] = w26 >> 10; - return in; + return in + (27 * 8); } inline const uint8_t* unpack55_64(const uint8_t* in, uint64_t* out){ constexpr uint64_t mask = ((uint64_t{1} << 55) - uint64_t{1}); - const auto w0 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w1 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w2 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w3 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w4 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w5 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w6 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w7 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w8 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w9 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w10 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w11 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w12 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w13 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w14 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w15 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w16 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w17 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w18 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w19 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w20 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w21 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w22 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w23 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w24 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w25 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w26 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - auto w27 = static_cast(util::SafeLoadAs(in)); - w27 = bit_util::FromLittleEndian(w27); - in += 4; + const auto w0 = LoadInt(in + 0 * 8); + const auto w1 = LoadInt(in + 1 * 8); + const auto w2 = LoadInt(in + 2 * 8); + const auto w3 = LoadInt(in + 3 * 8); + const auto w4 = LoadInt(in + 4 * 8); + const auto w5 = LoadInt(in + 5 * 8); + const auto w6 = LoadInt(in + 6 * 8); + const auto w7 = LoadInt(in + 7 * 8); + const auto w8 = LoadInt(in + 8 * 8); + const auto w9 = LoadInt(in + 9 * 8); + const auto w10 = LoadInt(in + 10 * 8); + const auto w11 = LoadInt(in + 11 * 8); + const auto w12 = LoadInt(in + 12 * 8); + const auto w13 = LoadInt(in + 13 * 8); + const auto w14 = LoadInt(in + 14 * 8); + const auto w15 = LoadInt(in + 15 * 8); + const auto w16 = LoadInt(in + 16 * 8); + const auto w17 = LoadInt(in + 17 * 8); + const auto w18 = LoadInt(in + 18 * 8); + const auto w19 = LoadInt(in + 19 * 8); + const auto w20 = LoadInt(in + 20 * 8); + const auto w21 = LoadInt(in + 21 * 8); + const auto w22 = LoadInt(in + 22 * 8); + const auto w23 = LoadInt(in + 23 * 8); + const auto w24 = LoadInt(in + 24 * 8); + const auto w25 = LoadInt(in + 25 * 8); + const auto w26 = LoadInt(in + 26 * 8); + const auto w27 = static_cast(LoadInt(in + 27 * 8)); out[0] = (w0) & mask; out[1] = ((w0 >> 55) | (w1 << 9)) & mask; out[2] = ((w1 >> 46) | (w2 << 18)) & mask; @@ -5996,68 +4691,40 @@ inline const uint8_t* unpack55_64(const uint8_t* in, uint64_t* out){ out[30] = ((w25 >> 50) | (w26 << 14)) & mask; out[31] = ((w26 >> 41) | (w27 << 23)) & mask; - return in; + return in + (27 * 8 + 4); } inline const uint8_t* unpack56_64(const uint8_t* in, uint64_t* out){ constexpr uint64_t mask = ((uint64_t{1} << 56) - uint64_t{1}); - const auto w0 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w1 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w2 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w3 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w4 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w5 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w6 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w7 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w8 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w9 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w10 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w11 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w12 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w13 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w14 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w15 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w16 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w17 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w18 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w19 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w20 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w21 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w22 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w23 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w24 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w25 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w26 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w27 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; + const auto w0 = LoadInt(in + 0 * 8); + const auto w1 = LoadInt(in + 1 * 8); + const auto w2 = LoadInt(in + 2 * 8); + const auto w3 = LoadInt(in + 3 * 8); + const auto w4 = LoadInt(in + 4 * 8); + const auto w5 = LoadInt(in + 5 * 8); + const auto w6 = LoadInt(in + 6 * 8); + const auto w7 = LoadInt(in + 7 * 8); + const auto w8 = LoadInt(in + 8 * 8); + const auto w9 = LoadInt(in + 9 * 8); + const auto w10 = LoadInt(in + 10 * 8); + const auto w11 = LoadInt(in + 11 * 8); + const auto w12 = LoadInt(in + 12 * 8); + const auto w13 = LoadInt(in + 13 * 8); + const auto w14 = LoadInt(in + 14 * 8); + const auto w15 = LoadInt(in + 15 * 8); + const auto w16 = LoadInt(in + 16 * 8); + const auto w17 = LoadInt(in + 17 * 8); + const auto w18 = LoadInt(in + 18 * 8); + const auto w19 = LoadInt(in + 19 * 8); + const auto w20 = LoadInt(in + 20 * 8); + const auto w21 = LoadInt(in + 21 * 8); + const auto w22 = LoadInt(in + 22 * 8); + const auto w23 = LoadInt(in + 23 * 8); + const auto w24 = LoadInt(in + 24 * 8); + const auto w25 = LoadInt(in + 25 * 8); + const auto w26 = LoadInt(in + 26 * 8); + const auto w27 = LoadInt(in + 27 * 8); out[0] = (w0) & mask; out[1] = ((w0 >> 56) | (w1 << 8)) & mask; out[2] = ((w1 >> 48) | (w2 << 16)) & mask; @@ -6091,71 +4758,41 @@ inline const uint8_t* unpack56_64(const uint8_t* in, uint64_t* out){ out[30] = ((w26 >> 16) | (w27 << 48)) & mask; out[31] = w27 >> 8; - return in; + return in + (28 * 8); } inline const uint8_t* unpack57_64(const uint8_t* in, uint64_t* out){ constexpr uint64_t mask = ((uint64_t{1} << 57) - uint64_t{1}); - const auto w0 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w1 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w2 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w3 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w4 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w5 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w6 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w7 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w8 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w9 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w10 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w11 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w12 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w13 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w14 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w15 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w16 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w17 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w18 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w19 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w20 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w21 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w22 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w23 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w24 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w25 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w26 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w27 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - auto w28 = static_cast(util::SafeLoadAs(in)); - w28 = bit_util::FromLittleEndian(w28); - in += 4; + const auto w0 = LoadInt(in + 0 * 8); + const auto w1 = LoadInt(in + 1 * 8); + const auto w2 = LoadInt(in + 2 * 8); + const auto w3 = LoadInt(in + 3 * 8); + const auto w4 = LoadInt(in + 4 * 8); + const auto w5 = LoadInt(in + 5 * 8); + const auto w6 = LoadInt(in + 6 * 8); + const auto w7 = LoadInt(in + 7 * 8); + const auto w8 = LoadInt(in + 8 * 8); + const auto w9 = LoadInt(in + 9 * 8); + const auto w10 = LoadInt(in + 10 * 8); + const auto w11 = LoadInt(in + 11 * 8); + const auto w12 = LoadInt(in + 12 * 8); + const auto w13 = LoadInt(in + 13 * 8); + const auto w14 = LoadInt(in + 14 * 8); + const auto w15 = LoadInt(in + 15 * 8); + const auto w16 = LoadInt(in + 16 * 8); + const auto w17 = LoadInt(in + 17 * 8); + const auto w18 = LoadInt(in + 18 * 8); + const auto w19 = LoadInt(in + 19 * 8); + const auto w20 = LoadInt(in + 20 * 8); + const auto w21 = LoadInt(in + 21 * 8); + const auto w22 = LoadInt(in + 22 * 8); + const auto w23 = LoadInt(in + 23 * 8); + const auto w24 = LoadInt(in + 24 * 8); + const auto w25 = LoadInt(in + 25 * 8); + const auto w26 = LoadInt(in + 26 * 8); + const auto w27 = LoadInt(in + 27 * 8); + const auto w28 = static_cast(LoadInt(in + 28 * 8)); out[0] = (w0) & mask; out[1] = ((w0 >> 57) | (w1 << 7)) & mask; out[2] = ((w1 >> 50) | (w2 << 14)) & mask; @@ -6189,70 +4826,41 @@ inline const uint8_t* unpack57_64(const uint8_t* in, uint64_t* out){ out[30] = ((w26 >> 46) | (w27 << 18)) & mask; out[31] = ((w27 >> 39) | (w28 << 25)) & mask; - return in; + return in + (28 * 8 + 4); } inline const uint8_t* unpack58_64(const uint8_t* in, uint64_t* out){ constexpr uint64_t mask = ((uint64_t{1} << 58) - uint64_t{1}); - const auto w0 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w1 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w2 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w3 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w4 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w5 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w6 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w7 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w8 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w9 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w10 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w11 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w12 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w13 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w14 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w15 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w16 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w17 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w18 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w19 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w20 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w21 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w22 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w23 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w24 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w25 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w26 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w27 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w28 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; + const auto w0 = LoadInt(in + 0 * 8); + const auto w1 = LoadInt(in + 1 * 8); + const auto w2 = LoadInt(in + 2 * 8); + const auto w3 = LoadInt(in + 3 * 8); + const auto w4 = LoadInt(in + 4 * 8); + const auto w5 = LoadInt(in + 5 * 8); + const auto w6 = LoadInt(in + 6 * 8); + const auto w7 = LoadInt(in + 7 * 8); + const auto w8 = LoadInt(in + 8 * 8); + const auto w9 = LoadInt(in + 9 * 8); + const auto w10 = LoadInt(in + 10 * 8); + const auto w11 = LoadInt(in + 11 * 8); + const auto w12 = LoadInt(in + 12 * 8); + const auto w13 = LoadInt(in + 13 * 8); + const auto w14 = LoadInt(in + 14 * 8); + const auto w15 = LoadInt(in + 15 * 8); + const auto w16 = LoadInt(in + 16 * 8); + const auto w17 = LoadInt(in + 17 * 8); + const auto w18 = LoadInt(in + 18 * 8); + const auto w19 = LoadInt(in + 19 * 8); + const auto w20 = LoadInt(in + 20 * 8); + const auto w21 = LoadInt(in + 21 * 8); + const auto w22 = LoadInt(in + 22 * 8); + const auto w23 = LoadInt(in + 23 * 8); + const auto w24 = LoadInt(in + 24 * 8); + const auto w25 = LoadInt(in + 25 * 8); + const auto w26 = LoadInt(in + 26 * 8); + const auto w27 = LoadInt(in + 27 * 8); + const auto w28 = LoadInt(in + 28 * 8); out[0] = (w0) & mask; out[1] = ((w0 >> 58) | (w1 << 6)) & mask; out[2] = ((w1 >> 52) | (w2 << 12)) & mask; @@ -6286,73 +4894,42 @@ inline const uint8_t* unpack58_64(const uint8_t* in, uint64_t* out){ out[30] = ((w27 >> 12) | (w28 << 52)) & mask; out[31] = w28 >> 6; - return in; + return in + (29 * 8); } inline const uint8_t* unpack59_64(const uint8_t* in, uint64_t* out){ constexpr uint64_t mask = ((uint64_t{1} << 59) - uint64_t{1}); - const auto w0 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w1 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w2 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w3 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w4 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w5 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w6 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w7 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w8 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w9 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w10 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w11 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w12 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w13 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w14 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w15 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w16 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w17 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w18 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w19 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w20 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w21 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w22 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w23 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w24 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w25 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w26 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w27 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w28 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - auto w29 = static_cast(util::SafeLoadAs(in)); - w29 = bit_util::FromLittleEndian(w29); - in += 4; + const auto w0 = LoadInt(in + 0 * 8); + const auto w1 = LoadInt(in + 1 * 8); + const auto w2 = LoadInt(in + 2 * 8); + const auto w3 = LoadInt(in + 3 * 8); + const auto w4 = LoadInt(in + 4 * 8); + const auto w5 = LoadInt(in + 5 * 8); + const auto w6 = LoadInt(in + 6 * 8); + const auto w7 = LoadInt(in + 7 * 8); + const auto w8 = LoadInt(in + 8 * 8); + const auto w9 = LoadInt(in + 9 * 8); + const auto w10 = LoadInt(in + 10 * 8); + const auto w11 = LoadInt(in + 11 * 8); + const auto w12 = LoadInt(in + 12 * 8); + const auto w13 = LoadInt(in + 13 * 8); + const auto w14 = LoadInt(in + 14 * 8); + const auto w15 = LoadInt(in + 15 * 8); + const auto w16 = LoadInt(in + 16 * 8); + const auto w17 = LoadInt(in + 17 * 8); + const auto w18 = LoadInt(in + 18 * 8); + const auto w19 = LoadInt(in + 19 * 8); + const auto w20 = LoadInt(in + 20 * 8); + const auto w21 = LoadInt(in + 21 * 8); + const auto w22 = LoadInt(in + 22 * 8); + const auto w23 = LoadInt(in + 23 * 8); + const auto w24 = LoadInt(in + 24 * 8); + const auto w25 = LoadInt(in + 25 * 8); + const auto w26 = LoadInt(in + 26 * 8); + const auto w27 = LoadInt(in + 27 * 8); + const auto w28 = LoadInt(in + 28 * 8); + const auto w29 = static_cast(LoadInt(in + 29 * 8)); out[0] = (w0) & mask; out[1] = ((w0 >> 59) | (w1 << 5)) & mask; out[2] = ((w1 >> 54) | (w2 << 10)) & mask; @@ -6386,72 +4963,42 @@ inline const uint8_t* unpack59_64(const uint8_t* in, uint64_t* out){ out[30] = ((w27 >> 42) | (w28 << 22)) & mask; out[31] = ((w28 >> 37) | (w29 << 27)) & mask; - return in; + return in + (29 * 8 + 4); } inline const uint8_t* unpack60_64(const uint8_t* in, uint64_t* out){ constexpr uint64_t mask = ((uint64_t{1} << 60) - uint64_t{1}); - const auto w0 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w1 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w2 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w3 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w4 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w5 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w6 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w7 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w8 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w9 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w10 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w11 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w12 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w13 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w14 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w15 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w16 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w17 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w18 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w19 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w20 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w21 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w22 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w23 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w24 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w25 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w26 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w27 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w28 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w29 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; + const auto w0 = LoadInt(in + 0 * 8); + const auto w1 = LoadInt(in + 1 * 8); + const auto w2 = LoadInt(in + 2 * 8); + const auto w3 = LoadInt(in + 3 * 8); + const auto w4 = LoadInt(in + 4 * 8); + const auto w5 = LoadInt(in + 5 * 8); + const auto w6 = LoadInt(in + 6 * 8); + const auto w7 = LoadInt(in + 7 * 8); + const auto w8 = LoadInt(in + 8 * 8); + const auto w9 = LoadInt(in + 9 * 8); + const auto w10 = LoadInt(in + 10 * 8); + const auto w11 = LoadInt(in + 11 * 8); + const auto w12 = LoadInt(in + 12 * 8); + const auto w13 = LoadInt(in + 13 * 8); + const auto w14 = LoadInt(in + 14 * 8); + const auto w15 = LoadInt(in + 15 * 8); + const auto w16 = LoadInt(in + 16 * 8); + const auto w17 = LoadInt(in + 17 * 8); + const auto w18 = LoadInt(in + 18 * 8); + const auto w19 = LoadInt(in + 19 * 8); + const auto w20 = LoadInt(in + 20 * 8); + const auto w21 = LoadInt(in + 21 * 8); + const auto w22 = LoadInt(in + 22 * 8); + const auto w23 = LoadInt(in + 23 * 8); + const auto w24 = LoadInt(in + 24 * 8); + const auto w25 = LoadInt(in + 25 * 8); + const auto w26 = LoadInt(in + 26 * 8); + const auto w27 = LoadInt(in + 27 * 8); + const auto w28 = LoadInt(in + 28 * 8); + const auto w29 = LoadInt(in + 29 * 8); out[0] = (w0) & mask; out[1] = ((w0 >> 60) | (w1 << 4)) & mask; out[2] = ((w1 >> 56) | (w2 << 8)) & mask; @@ -6485,75 +5032,43 @@ inline const uint8_t* unpack60_64(const uint8_t* in, uint64_t* out){ out[30] = ((w28 >> 8) | (w29 << 56)) & mask; out[31] = w29 >> 4; - return in; + return in + (30 * 8); } inline const uint8_t* unpack61_64(const uint8_t* in, uint64_t* out){ constexpr uint64_t mask = ((uint64_t{1} << 61) - uint64_t{1}); - const auto w0 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w1 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w2 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w3 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w4 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w5 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w6 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w7 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w8 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w9 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w10 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w11 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w12 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w13 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w14 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w15 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w16 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w17 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w18 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w19 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w20 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w21 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w22 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w23 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w24 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w25 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w26 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w27 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w28 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w29 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - auto w30 = static_cast(util::SafeLoadAs(in)); - w30 = bit_util::FromLittleEndian(w30); - in += 4; + const auto w0 = LoadInt(in + 0 * 8); + const auto w1 = LoadInt(in + 1 * 8); + const auto w2 = LoadInt(in + 2 * 8); + const auto w3 = LoadInt(in + 3 * 8); + const auto w4 = LoadInt(in + 4 * 8); + const auto w5 = LoadInt(in + 5 * 8); + const auto w6 = LoadInt(in + 6 * 8); + const auto w7 = LoadInt(in + 7 * 8); + const auto w8 = LoadInt(in + 8 * 8); + const auto w9 = LoadInt(in + 9 * 8); + const auto w10 = LoadInt(in + 10 * 8); + const auto w11 = LoadInt(in + 11 * 8); + const auto w12 = LoadInt(in + 12 * 8); + const auto w13 = LoadInt(in + 13 * 8); + const auto w14 = LoadInt(in + 14 * 8); + const auto w15 = LoadInt(in + 15 * 8); + const auto w16 = LoadInt(in + 16 * 8); + const auto w17 = LoadInt(in + 17 * 8); + const auto w18 = LoadInt(in + 18 * 8); + const auto w19 = LoadInt(in + 19 * 8); + const auto w20 = LoadInt(in + 20 * 8); + const auto w21 = LoadInt(in + 21 * 8); + const auto w22 = LoadInt(in + 22 * 8); + const auto w23 = LoadInt(in + 23 * 8); + const auto w24 = LoadInt(in + 24 * 8); + const auto w25 = LoadInt(in + 25 * 8); + const auto w26 = LoadInt(in + 26 * 8); + const auto w27 = LoadInt(in + 27 * 8); + const auto w28 = LoadInt(in + 28 * 8); + const auto w29 = LoadInt(in + 29 * 8); + const auto w30 = static_cast(LoadInt(in + 30 * 8)); out[0] = (w0) & mask; out[1] = ((w0 >> 61) | (w1 << 3)) & mask; out[2] = ((w1 >> 58) | (w2 << 6)) & mask; @@ -6587,74 +5102,43 @@ inline const uint8_t* unpack61_64(const uint8_t* in, uint64_t* out){ out[30] = ((w28 >> 38) | (w29 << 26)) & mask; out[31] = ((w29 >> 35) | (w30 << 29)) & mask; - return in; + return in + (30 * 8 + 4); } inline const uint8_t* unpack62_64(const uint8_t* in, uint64_t* out){ constexpr uint64_t mask = ((uint64_t{1} << 62) - uint64_t{1}); - const auto w0 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w1 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w2 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w3 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w4 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w5 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w6 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w7 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w8 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w9 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w10 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w11 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w12 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w13 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w14 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w15 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w16 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w17 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w18 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w19 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w20 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w21 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w22 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w23 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w24 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w25 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w26 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w27 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w28 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w29 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w30 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; + const auto w0 = LoadInt(in + 0 * 8); + const auto w1 = LoadInt(in + 1 * 8); + const auto w2 = LoadInt(in + 2 * 8); + const auto w3 = LoadInt(in + 3 * 8); + const auto w4 = LoadInt(in + 4 * 8); + const auto w5 = LoadInt(in + 5 * 8); + const auto w6 = LoadInt(in + 6 * 8); + const auto w7 = LoadInt(in + 7 * 8); + const auto w8 = LoadInt(in + 8 * 8); + const auto w9 = LoadInt(in + 9 * 8); + const auto w10 = LoadInt(in + 10 * 8); + const auto w11 = LoadInt(in + 11 * 8); + const auto w12 = LoadInt(in + 12 * 8); + const auto w13 = LoadInt(in + 13 * 8); + const auto w14 = LoadInt(in + 14 * 8); + const auto w15 = LoadInt(in + 15 * 8); + const auto w16 = LoadInt(in + 16 * 8); + const auto w17 = LoadInt(in + 17 * 8); + const auto w18 = LoadInt(in + 18 * 8); + const auto w19 = LoadInt(in + 19 * 8); + const auto w20 = LoadInt(in + 20 * 8); + const auto w21 = LoadInt(in + 21 * 8); + const auto w22 = LoadInt(in + 22 * 8); + const auto w23 = LoadInt(in + 23 * 8); + const auto w24 = LoadInt(in + 24 * 8); + const auto w25 = LoadInt(in + 25 * 8); + const auto w26 = LoadInt(in + 26 * 8); + const auto w27 = LoadInt(in + 27 * 8); + const auto w28 = LoadInt(in + 28 * 8); + const auto w29 = LoadInt(in + 29 * 8); + const auto w30 = LoadInt(in + 30 * 8); out[0] = (w0) & mask; out[1] = ((w0 >> 62) | (w1 << 2)) & mask; out[2] = ((w1 >> 60) | (w2 << 4)) & mask; @@ -6688,77 +5172,44 @@ inline const uint8_t* unpack62_64(const uint8_t* in, uint64_t* out){ out[30] = ((w29 >> 4) | (w30 << 60)) & mask; out[31] = w30 >> 2; - return in; + return in + (31 * 8); } inline const uint8_t* unpack63_64(const uint8_t* in, uint64_t* out){ constexpr uint64_t mask = ((uint64_t{1} << 63) - uint64_t{1}); - const auto w0 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w1 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w2 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w3 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w4 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w5 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w6 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w7 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w8 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w9 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w10 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w11 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w12 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w13 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w14 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w15 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w16 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w17 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w18 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w19 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w20 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w21 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w22 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w23 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w24 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w25 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w26 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w27 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w28 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w29 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - const auto w30 = bit_util::FromLittleEndian(util::SafeLoadAs(in)); - in += 8; - auto w31 = static_cast(util::SafeLoadAs(in)); - w31 = bit_util::FromLittleEndian(w31); - in += 4; + const auto w0 = LoadInt(in + 0 * 8); + const auto w1 = LoadInt(in + 1 * 8); + const auto w2 = LoadInt(in + 2 * 8); + const auto w3 = LoadInt(in + 3 * 8); + const auto w4 = LoadInt(in + 4 * 8); + const auto w5 = LoadInt(in + 5 * 8); + const auto w6 = LoadInt(in + 6 * 8); + const auto w7 = LoadInt(in + 7 * 8); + const auto w8 = LoadInt(in + 8 * 8); + const auto w9 = LoadInt(in + 9 * 8); + const auto w10 = LoadInt(in + 10 * 8); + const auto w11 = LoadInt(in + 11 * 8); + const auto w12 = LoadInt(in + 12 * 8); + const auto w13 = LoadInt(in + 13 * 8); + const auto w14 = LoadInt(in + 14 * 8); + const auto w15 = LoadInt(in + 15 * 8); + const auto w16 = LoadInt(in + 16 * 8); + const auto w17 = LoadInt(in + 17 * 8); + const auto w18 = LoadInt(in + 18 * 8); + const auto w19 = LoadInt(in + 19 * 8); + const auto w20 = LoadInt(in + 20 * 8); + const auto w21 = LoadInt(in + 21 * 8); + const auto w22 = LoadInt(in + 22 * 8); + const auto w23 = LoadInt(in + 23 * 8); + const auto w24 = LoadInt(in + 24 * 8); + const auto w25 = LoadInt(in + 25 * 8); + const auto w26 = LoadInt(in + 26 * 8); + const auto w27 = LoadInt(in + 27 * 8); + const auto w28 = LoadInt(in + 28 * 8); + const auto w29 = LoadInt(in + 29 * 8); + const auto w30 = LoadInt(in + 30 * 8); + const auto w31 = static_cast(LoadInt(in + 31 * 8)); out[0] = (w0) & mask; out[1] = ((w0 >> 63) | (w1 << 1)) & mask; out[2] = ((w1 >> 62) | (w2 << 2)) & mask; @@ -6792,16 +5243,14 @@ inline const uint8_t* unpack63_64(const uint8_t* in, uint64_t* out){ out[30] = ((w29 >> 34) | (w30 << 30)) & mask; out[31] = ((w30 >> 33) | (w31 << 31)) & mask; - return in; + return in + (31 * 8 + 4); } inline const uint8_t* unpack64_64(const uint8_t* in, uint64_t* out){ for(int k = 0; k < 32; k += 1) { - auto w = util::SafeLoadAs(in); - out[k] = bit_util::FromLittleEndian(w); - in += 8; + out[k] = LoadInt(in + (k * 8)); } - return in; + return in + (8 * 32); } } // namespace arrow::internal From 3abbc15346fabceac05906751b291772d70f339d Mon Sep 17 00:00:00 2001 From: AntoinePrv Date: Thu, 18 Sep 2025 10:36:58 +0200 Subject: [PATCH 10/76] Try: reinterpret cast --- cpp/src/arrow/util/bpacking.cc | 6 +- cpp/src/arrow/util/bpacking_scalar_codegen.py | 25 +- .../util/bpacking_scalar_generated_internal.h | 3436 ++++++++--------- 3 files changed, 1734 insertions(+), 1733 deletions(-) diff --git a/cpp/src/arrow/util/bpacking.cc b/cpp/src/arrow/util/bpacking.cc index fefca194518..73aa7939b20 100644 --- a/cpp/src/arrow/util/bpacking.cc +++ b/cpp/src/arrow/util/bpacking.cc @@ -35,7 +35,8 @@ namespace arrow { namespace internal { -int unpack32_scalar(const uint8_t* in, uint32_t* out, int batch_size, int num_bits) { +int unpack32_scalar(const uint8_t* in_, uint32_t* out, int batch_size, int num_bits) { + auto in = reinterpret_cast(in_); batch_size = batch_size / 32 * 32; int num_loops = batch_size / 32; @@ -176,7 +177,8 @@ int unpack32(const uint8_t* in, uint32_t* out, int batch_size, int num_bits) { #endif } -int unpack64_scalar(const uint8_t* in, uint64_t* out, int batch_size, int num_bits) { +int unpack64_scalar(const uint8_t* in_, uint64_t* out, int batch_size, int num_bits) { + auto in = reinterpret_cast(in_); batch_size = batch_size / 32 * 32; int num_loops = batch_size / 32; diff --git a/cpp/src/arrow/util/bpacking_scalar_codegen.py b/cpp/src/arrow/util/bpacking_scalar_codegen.py index 8882e9a7c83..7cec6b9ac42 100644 --- a/cpp/src/arrow/util/bpacking_scalar_codegen.py +++ b/cpp/src/arrow/util/bpacking_scalar_codegen.py @@ -67,8 +67,8 @@ namespace arrow::internal { template -Int LoadInt(const uint8_t* in) { - return bit_util::FromLittleEndian(util::SafeLoadAs(in)); +Int LoadInt(const Int* in) { + return bit_util::FromLittleEndian(util::SafeLoad(in)); } """ @@ -109,8 +109,8 @@ def howmanybytes(self, bit: int) -> int: def unpack_signature(self, bit: int) -> str: return ( - f"inline const uint8_t* unpack{bit}_{self.out_bit_width}" - f"(const uint8_t* in, {self.unsigned_type}* out)" + f"inline const {self.unsigned_type}* unpack{bit}_{self.out_bit_width}" + f"(const {self.unsigned_type}* in, {self.unsigned_type}* out)" "{" ) @@ -125,10 +125,10 @@ def print_unpack_last(self) -> None: print(f" for(int k = 0; k < {self.howmany}; k += 1) {{") print( f" out[k] = LoadInt<{self.unsigned_type}>(" - f"in + (k * {self.out_byte_width}));" + f"in + (k ));" ) print(" }") - print(f" return in + ({self.out_byte_width} * {self.howmany});") + print(f" return in + ( {self.howmany});") print("}") def print_unpack_k(self, bit: int) -> None: @@ -143,20 +143,20 @@ def print_unpack_k(self, bit: int) -> None: for k in range(self.howmanywords(bit) - 1): print( f" const auto w{k} = LoadInt<{self.unsigned_type}>(" - f"in + {k} * {self.out_byte_width});" + f"in + {k});" ) k = self.howmanywords(bit) - 1 use_smart_halving = self.smart_halve and bit % 2 == 1 if use_smart_halving: print( - f" const auto w{k} = static_cast<{self.unsigned_type}>(LoadInt<{self.unsigned_type_half}>(" - f"in + {k} * {self.out_byte_width}));" + f" const auto w{k} = static_cast<{self.unsigned_type}>(LoadInt(" + f"reinterpret_cast(in + {k})));" ) else: print( f" const auto w{k} = LoadInt<{self.unsigned_type}>(" - f"in + {k} * {self.out_byte_width});" + f"in + {k} );" ) for j in range(self.howmany): @@ -181,11 +181,10 @@ def print_unpack_k(self, bit: int) -> None: if use_smart_halving: print( - f" return in + ({self.howmanywords(bit) - 1} * {self.out_byte_width}" - f" + {self.out_byte_width // 2});" + f" return in + ({self.howmanywords(bit) - 1});" ) else: - print(f" return in + ({self.howmanywords(bit)} * {self.out_byte_width});") + print(f" return in + ({self.howmanywords(bit)} );") print("}") def print_all(self) -> None: diff --git a/cpp/src/arrow/util/bpacking_scalar_generated_internal.h b/cpp/src/arrow/util/bpacking_scalar_generated_internal.h index d215fcfbc46..c1787fc53b7 100644 --- a/cpp/src/arrow/util/bpacking_scalar_generated_internal.h +++ b/cpp/src/arrow/util/bpacking_scalar_generated_internal.h @@ -39,19 +39,19 @@ namespace arrow::internal { template -Int LoadInt(const uint8_t* in) { - return bit_util::FromLittleEndian(util::SafeLoadAs(in)); +Int LoadInt(const Int* in) { + return bit_util::FromLittleEndian(util::SafeLoad(in)); } -inline const uint8_t* unpack0_32(const uint8_t* in, uint32_t* out){ +inline const uint32_t* unpack0_32(const uint32_t* in, uint32_t* out){ std::memset(out, 0, 32 * 4); return in; } -inline const uint8_t* unpack1_32(const uint8_t* in, uint32_t* out){ +inline const uint32_t* unpack1_32(const uint32_t* in, uint32_t* out){ constexpr uint32_t mask = ((uint32_t{1} << 1) - uint32_t{1}); - const auto w0 = LoadInt(in + 0 * 4); + const auto w0 = LoadInt(in + 0 ); out[0] = (w0) & mask; out[1] = (w0 >> 1) & mask; out[2] = (w0 >> 2) & mask; @@ -85,14 +85,14 @@ inline const uint8_t* unpack1_32(const uint8_t* in, uint32_t* out){ out[30] = (w0 >> 30) & mask; out[31] = w0 >> 31; - return in + (1 * 4); + return in + (1 ); } -inline const uint8_t* unpack2_32(const uint8_t* in, uint32_t* out){ +inline const uint32_t* unpack2_32(const uint32_t* in, uint32_t* out){ constexpr uint32_t mask = ((uint32_t{1} << 2) - uint32_t{1}); - const auto w0 = LoadInt(in + 0 * 4); - const auto w1 = LoadInt(in + 1 * 4); + const auto w0 = LoadInt(in + 0); + const auto w1 = LoadInt(in + 1 ); out[0] = (w0) & mask; out[1] = (w0 >> 2) & mask; out[2] = (w0 >> 4) & mask; @@ -126,15 +126,15 @@ inline const uint8_t* unpack2_32(const uint8_t* in, uint32_t* out){ out[30] = (w1 >> 28) & mask; out[31] = w1 >> 30; - return in + (2 * 4); + return in + (2 ); } -inline const uint8_t* unpack3_32(const uint8_t* in, uint32_t* out){ +inline const uint32_t* unpack3_32(const uint32_t* in, uint32_t* out){ constexpr uint32_t mask = ((uint32_t{1} << 3) - uint32_t{1}); - const auto w0 = LoadInt(in + 0 * 4); - const auto w1 = LoadInt(in + 1 * 4); - const auto w2 = LoadInt(in + 2 * 4); + const auto w0 = LoadInt(in + 0); + const auto w1 = LoadInt(in + 1); + const auto w2 = LoadInt(in + 2 ); out[0] = (w0) & mask; out[1] = (w0 >> 3) & mask; out[2] = (w0 >> 6) & mask; @@ -168,16 +168,16 @@ inline const uint8_t* unpack3_32(const uint8_t* in, uint32_t* out){ out[30] = (w2 >> 26) & mask; out[31] = w2 >> 29; - return in + (3 * 4); + return in + (3 ); } -inline const uint8_t* unpack4_32(const uint8_t* in, uint32_t* out){ +inline const uint32_t* unpack4_32(const uint32_t* in, uint32_t* out){ constexpr uint32_t mask = ((uint32_t{1} << 4) - uint32_t{1}); - const auto w0 = LoadInt(in + 0 * 4); - const auto w1 = LoadInt(in + 1 * 4); - const auto w2 = LoadInt(in + 2 * 4); - const auto w3 = LoadInt(in + 3 * 4); + const auto w0 = LoadInt(in + 0); + const auto w1 = LoadInt(in + 1); + const auto w2 = LoadInt(in + 2); + const auto w3 = LoadInt(in + 3 ); out[0] = (w0) & mask; out[1] = (w0 >> 4) & mask; out[2] = (w0 >> 8) & mask; @@ -211,17 +211,17 @@ inline const uint8_t* unpack4_32(const uint8_t* in, uint32_t* out){ out[30] = (w3 >> 24) & mask; out[31] = w3 >> 28; - return in + (4 * 4); + return in + (4 ); } -inline const uint8_t* unpack5_32(const uint8_t* in, uint32_t* out){ +inline const uint32_t* unpack5_32(const uint32_t* in, uint32_t* out){ constexpr uint32_t mask = ((uint32_t{1} << 5) - uint32_t{1}); - const auto w0 = LoadInt(in + 0 * 4); - const auto w1 = LoadInt(in + 1 * 4); - const auto w2 = LoadInt(in + 2 * 4); - const auto w3 = LoadInt(in + 3 * 4); - const auto w4 = LoadInt(in + 4 * 4); + const auto w0 = LoadInt(in + 0); + const auto w1 = LoadInt(in + 1); + const auto w2 = LoadInt(in + 2); + const auto w3 = LoadInt(in + 3); + const auto w4 = LoadInt(in + 4 ); out[0] = (w0) & mask; out[1] = (w0 >> 5) & mask; out[2] = (w0 >> 10) & mask; @@ -255,18 +255,18 @@ inline const uint8_t* unpack5_32(const uint8_t* in, uint32_t* out){ out[30] = (w4 >> 22) & mask; out[31] = w4 >> 27; - return in + (5 * 4); + return in + (5 ); } -inline const uint8_t* unpack6_32(const uint8_t* in, uint32_t* out){ +inline const uint32_t* unpack6_32(const uint32_t* in, uint32_t* out){ constexpr uint32_t mask = ((uint32_t{1} << 6) - uint32_t{1}); - const auto w0 = LoadInt(in + 0 * 4); - const auto w1 = LoadInt(in + 1 * 4); - const auto w2 = LoadInt(in + 2 * 4); - const auto w3 = LoadInt(in + 3 * 4); - const auto w4 = LoadInt(in + 4 * 4); - const auto w5 = LoadInt(in + 5 * 4); + const auto w0 = LoadInt(in + 0); + const auto w1 = LoadInt(in + 1); + const auto w2 = LoadInt(in + 2); + const auto w3 = LoadInt(in + 3); + const auto w4 = LoadInt(in + 4); + const auto w5 = LoadInt(in + 5 ); out[0] = (w0) & mask; out[1] = (w0 >> 6) & mask; out[2] = (w0 >> 12) & mask; @@ -300,19 +300,19 @@ inline const uint8_t* unpack6_32(const uint8_t* in, uint32_t* out){ out[30] = (w5 >> 20) & mask; out[31] = w5 >> 26; - return in + (6 * 4); + return in + (6 ); } -inline const uint8_t* unpack7_32(const uint8_t* in, uint32_t* out){ +inline const uint32_t* unpack7_32(const uint32_t* in, uint32_t* out){ constexpr uint32_t mask = ((uint32_t{1} << 7) - uint32_t{1}); - const auto w0 = LoadInt(in + 0 * 4); - const auto w1 = LoadInt(in + 1 * 4); - const auto w2 = LoadInt(in + 2 * 4); - const auto w3 = LoadInt(in + 3 * 4); - const auto w4 = LoadInt(in + 4 * 4); - const auto w5 = LoadInt(in + 5 * 4); - const auto w6 = LoadInt(in + 6 * 4); + const auto w0 = LoadInt(in + 0); + const auto w1 = LoadInt(in + 1); + const auto w2 = LoadInt(in + 2); + const auto w3 = LoadInt(in + 3); + const auto w4 = LoadInt(in + 4); + const auto w5 = LoadInt(in + 5); + const auto w6 = LoadInt(in + 6 ); out[0] = (w0) & mask; out[1] = (w0 >> 7) & mask; out[2] = (w0 >> 14) & mask; @@ -346,20 +346,20 @@ inline const uint8_t* unpack7_32(const uint8_t* in, uint32_t* out){ out[30] = (w6 >> 18) & mask; out[31] = w6 >> 25; - return in + (7 * 4); + return in + (7 ); } -inline const uint8_t* unpack8_32(const uint8_t* in, uint32_t* out){ +inline const uint32_t* unpack8_32(const uint32_t* in, uint32_t* out){ constexpr uint32_t mask = ((uint32_t{1} << 8) - uint32_t{1}); - const auto w0 = LoadInt(in + 0 * 4); - const auto w1 = LoadInt(in + 1 * 4); - const auto w2 = LoadInt(in + 2 * 4); - const auto w3 = LoadInt(in + 3 * 4); - const auto w4 = LoadInt(in + 4 * 4); - const auto w5 = LoadInt(in + 5 * 4); - const auto w6 = LoadInt(in + 6 * 4); - const auto w7 = LoadInt(in + 7 * 4); + const auto w0 = LoadInt(in + 0); + const auto w1 = LoadInt(in + 1); + const auto w2 = LoadInt(in + 2); + const auto w3 = LoadInt(in + 3); + const auto w4 = LoadInt(in + 4); + const auto w5 = LoadInt(in + 5); + const auto w6 = LoadInt(in + 6); + const auto w7 = LoadInt(in + 7 ); out[0] = (w0) & mask; out[1] = (w0 >> 8) & mask; out[2] = (w0 >> 16) & mask; @@ -393,21 +393,21 @@ inline const uint8_t* unpack8_32(const uint8_t* in, uint32_t* out){ out[30] = (w7 >> 16) & mask; out[31] = w7 >> 24; - return in + (8 * 4); + return in + (8 ); } -inline const uint8_t* unpack9_32(const uint8_t* in, uint32_t* out){ +inline const uint32_t* unpack9_32(const uint32_t* in, uint32_t* out){ constexpr uint32_t mask = ((uint32_t{1} << 9) - uint32_t{1}); - const auto w0 = LoadInt(in + 0 * 4); - const auto w1 = LoadInt(in + 1 * 4); - const auto w2 = LoadInt(in + 2 * 4); - const auto w3 = LoadInt(in + 3 * 4); - const auto w4 = LoadInt(in + 4 * 4); - const auto w5 = LoadInt(in + 5 * 4); - const auto w6 = LoadInt(in + 6 * 4); - const auto w7 = LoadInt(in + 7 * 4); - const auto w8 = LoadInt(in + 8 * 4); + const auto w0 = LoadInt(in + 0); + const auto w1 = LoadInt(in + 1); + const auto w2 = LoadInt(in + 2); + const auto w3 = LoadInt(in + 3); + const auto w4 = LoadInt(in + 4); + const auto w5 = LoadInt(in + 5); + const auto w6 = LoadInt(in + 6); + const auto w7 = LoadInt(in + 7); + const auto w8 = LoadInt(in + 8 ); out[0] = (w0) & mask; out[1] = (w0 >> 9) & mask; out[2] = (w0 >> 18) & mask; @@ -441,22 +441,22 @@ inline const uint8_t* unpack9_32(const uint8_t* in, uint32_t* out){ out[30] = (w8 >> 14) & mask; out[31] = w8 >> 23; - return in + (9 * 4); + return in + (9 ); } -inline const uint8_t* unpack10_32(const uint8_t* in, uint32_t* out){ +inline const uint32_t* unpack10_32(const uint32_t* in, uint32_t* out){ constexpr uint32_t mask = ((uint32_t{1} << 10) - uint32_t{1}); - const auto w0 = LoadInt(in + 0 * 4); - const auto w1 = LoadInt(in + 1 * 4); - const auto w2 = LoadInt(in + 2 * 4); - const auto w3 = LoadInt(in + 3 * 4); - const auto w4 = LoadInt(in + 4 * 4); - const auto w5 = LoadInt(in + 5 * 4); - const auto w6 = LoadInt(in + 6 * 4); - const auto w7 = LoadInt(in + 7 * 4); - const auto w8 = LoadInt(in + 8 * 4); - const auto w9 = LoadInt(in + 9 * 4); + const auto w0 = LoadInt(in + 0); + const auto w1 = LoadInt(in + 1); + const auto w2 = LoadInt(in + 2); + const auto w3 = LoadInt(in + 3); + const auto w4 = LoadInt(in + 4); + const auto w5 = LoadInt(in + 5); + const auto w6 = LoadInt(in + 6); + const auto w7 = LoadInt(in + 7); + const auto w8 = LoadInt(in + 8); + const auto w9 = LoadInt(in + 9 ); out[0] = (w0) & mask; out[1] = (w0 >> 10) & mask; out[2] = (w0 >> 20) & mask; @@ -490,23 +490,23 @@ inline const uint8_t* unpack10_32(const uint8_t* in, uint32_t* out){ out[30] = (w9 >> 12) & mask; out[31] = w9 >> 22; - return in + (10 * 4); + return in + (10 ); } -inline const uint8_t* unpack11_32(const uint8_t* in, uint32_t* out){ +inline const uint32_t* unpack11_32(const uint32_t* in, uint32_t* out){ constexpr uint32_t mask = ((uint32_t{1} << 11) - uint32_t{1}); - const auto w0 = LoadInt(in + 0 * 4); - const auto w1 = LoadInt(in + 1 * 4); - const auto w2 = LoadInt(in + 2 * 4); - const auto w3 = LoadInt(in + 3 * 4); - const auto w4 = LoadInt(in + 4 * 4); - const auto w5 = LoadInt(in + 5 * 4); - const auto w6 = LoadInt(in + 6 * 4); - const auto w7 = LoadInt(in + 7 * 4); - const auto w8 = LoadInt(in + 8 * 4); - const auto w9 = LoadInt(in + 9 * 4); - const auto w10 = LoadInt(in + 10 * 4); + const auto w0 = LoadInt(in + 0); + const auto w1 = LoadInt(in + 1); + const auto w2 = LoadInt(in + 2); + const auto w3 = LoadInt(in + 3); + const auto w4 = LoadInt(in + 4); + const auto w5 = LoadInt(in + 5); + const auto w6 = LoadInt(in + 6); + const auto w7 = LoadInt(in + 7); + const auto w8 = LoadInt(in + 8); + const auto w9 = LoadInt(in + 9); + const auto w10 = LoadInt(in + 10 ); out[0] = (w0) & mask; out[1] = (w0 >> 11) & mask; out[2] = ((w0 >> 22) | (w1 << 10)) & mask; @@ -540,24 +540,24 @@ inline const uint8_t* unpack11_32(const uint8_t* in, uint32_t* out){ out[30] = (w10 >> 10) & mask; out[31] = w10 >> 21; - return in + (11 * 4); + return in + (11 ); } -inline const uint8_t* unpack12_32(const uint8_t* in, uint32_t* out){ +inline const uint32_t* unpack12_32(const uint32_t* in, uint32_t* out){ constexpr uint32_t mask = ((uint32_t{1} << 12) - uint32_t{1}); - const auto w0 = LoadInt(in + 0 * 4); - const auto w1 = LoadInt(in + 1 * 4); - const auto w2 = LoadInt(in + 2 * 4); - const auto w3 = LoadInt(in + 3 * 4); - const auto w4 = LoadInt(in + 4 * 4); - const auto w5 = LoadInt(in + 5 * 4); - const auto w6 = LoadInt(in + 6 * 4); - const auto w7 = LoadInt(in + 7 * 4); - const auto w8 = LoadInt(in + 8 * 4); - const auto w9 = LoadInt(in + 9 * 4); - const auto w10 = LoadInt(in + 10 * 4); - const auto w11 = LoadInt(in + 11 * 4); + const auto w0 = LoadInt(in + 0); + const auto w1 = LoadInt(in + 1); + const auto w2 = LoadInt(in + 2); + const auto w3 = LoadInt(in + 3); + const auto w4 = LoadInt(in + 4); + const auto w5 = LoadInt(in + 5); + const auto w6 = LoadInt(in + 6); + const auto w7 = LoadInt(in + 7); + const auto w8 = LoadInt(in + 8); + const auto w9 = LoadInt(in + 9); + const auto w10 = LoadInt(in + 10); + const auto w11 = LoadInt(in + 11 ); out[0] = (w0) & mask; out[1] = (w0 >> 12) & mask; out[2] = ((w0 >> 24) | (w1 << 8)) & mask; @@ -591,25 +591,25 @@ inline const uint8_t* unpack12_32(const uint8_t* in, uint32_t* out){ out[30] = (w11 >> 8) & mask; out[31] = w11 >> 20; - return in + (12 * 4); + return in + (12 ); } -inline const uint8_t* unpack13_32(const uint8_t* in, uint32_t* out){ +inline const uint32_t* unpack13_32(const uint32_t* in, uint32_t* out){ constexpr uint32_t mask = ((uint32_t{1} << 13) - uint32_t{1}); - const auto w0 = LoadInt(in + 0 * 4); - const auto w1 = LoadInt(in + 1 * 4); - const auto w2 = LoadInt(in + 2 * 4); - const auto w3 = LoadInt(in + 3 * 4); - const auto w4 = LoadInt(in + 4 * 4); - const auto w5 = LoadInt(in + 5 * 4); - const auto w6 = LoadInt(in + 6 * 4); - const auto w7 = LoadInt(in + 7 * 4); - const auto w8 = LoadInt(in + 8 * 4); - const auto w9 = LoadInt(in + 9 * 4); - const auto w10 = LoadInt(in + 10 * 4); - const auto w11 = LoadInt(in + 11 * 4); - const auto w12 = LoadInt(in + 12 * 4); + const auto w0 = LoadInt(in + 0); + const auto w1 = LoadInt(in + 1); + const auto w2 = LoadInt(in + 2); + const auto w3 = LoadInt(in + 3); + const auto w4 = LoadInt(in + 4); + const auto w5 = LoadInt(in + 5); + const auto w6 = LoadInt(in + 6); + const auto w7 = LoadInt(in + 7); + const auto w8 = LoadInt(in + 8); + const auto w9 = LoadInt(in + 9); + const auto w10 = LoadInt(in + 10); + const auto w11 = LoadInt(in + 11); + const auto w12 = LoadInt(in + 12 ); out[0] = (w0) & mask; out[1] = (w0 >> 13) & mask; out[2] = ((w0 >> 26) | (w1 << 6)) & mask; @@ -643,26 +643,26 @@ inline const uint8_t* unpack13_32(const uint8_t* in, uint32_t* out){ out[30] = (w12 >> 6) & mask; out[31] = w12 >> 19; - return in + (13 * 4); + return in + (13 ); } -inline const uint8_t* unpack14_32(const uint8_t* in, uint32_t* out){ +inline const uint32_t* unpack14_32(const uint32_t* in, uint32_t* out){ constexpr uint32_t mask = ((uint32_t{1} << 14) - uint32_t{1}); - const auto w0 = LoadInt(in + 0 * 4); - const auto w1 = LoadInt(in + 1 * 4); - const auto w2 = LoadInt(in + 2 * 4); - const auto w3 = LoadInt(in + 3 * 4); - const auto w4 = LoadInt(in + 4 * 4); - const auto w5 = LoadInt(in + 5 * 4); - const auto w6 = LoadInt(in + 6 * 4); - const auto w7 = LoadInt(in + 7 * 4); - const auto w8 = LoadInt(in + 8 * 4); - const auto w9 = LoadInt(in + 9 * 4); - const auto w10 = LoadInt(in + 10 * 4); - const auto w11 = LoadInt(in + 11 * 4); - const auto w12 = LoadInt(in + 12 * 4); - const auto w13 = LoadInt(in + 13 * 4); + const auto w0 = LoadInt(in + 0); + const auto w1 = LoadInt(in + 1); + const auto w2 = LoadInt(in + 2); + const auto w3 = LoadInt(in + 3); + const auto w4 = LoadInt(in + 4); + const auto w5 = LoadInt(in + 5); + const auto w6 = LoadInt(in + 6); + const auto w7 = LoadInt(in + 7); + const auto w8 = LoadInt(in + 8); + const auto w9 = LoadInt(in + 9); + const auto w10 = LoadInt(in + 10); + const auto w11 = LoadInt(in + 11); + const auto w12 = LoadInt(in + 12); + const auto w13 = LoadInt(in + 13 ); out[0] = (w0) & mask; out[1] = (w0 >> 14) & mask; out[2] = ((w0 >> 28) | (w1 << 4)) & mask; @@ -696,27 +696,27 @@ inline const uint8_t* unpack14_32(const uint8_t* in, uint32_t* out){ out[30] = (w13 >> 4) & mask; out[31] = w13 >> 18; - return in + (14 * 4); + return in + (14 ); } -inline const uint8_t* unpack15_32(const uint8_t* in, uint32_t* out){ +inline const uint32_t* unpack15_32(const uint32_t* in, uint32_t* out){ constexpr uint32_t mask = ((uint32_t{1} << 15) - uint32_t{1}); - const auto w0 = LoadInt(in + 0 * 4); - const auto w1 = LoadInt(in + 1 * 4); - const auto w2 = LoadInt(in + 2 * 4); - const auto w3 = LoadInt(in + 3 * 4); - const auto w4 = LoadInt(in + 4 * 4); - const auto w5 = LoadInt(in + 5 * 4); - const auto w6 = LoadInt(in + 6 * 4); - const auto w7 = LoadInt(in + 7 * 4); - const auto w8 = LoadInt(in + 8 * 4); - const auto w9 = LoadInt(in + 9 * 4); - const auto w10 = LoadInt(in + 10 * 4); - const auto w11 = LoadInt(in + 11 * 4); - const auto w12 = LoadInt(in + 12 * 4); - const auto w13 = LoadInt(in + 13 * 4); - const auto w14 = LoadInt(in + 14 * 4); + const auto w0 = LoadInt(in + 0); + const auto w1 = LoadInt(in + 1); + const auto w2 = LoadInt(in + 2); + const auto w3 = LoadInt(in + 3); + const auto w4 = LoadInt(in + 4); + const auto w5 = LoadInt(in + 5); + const auto w6 = LoadInt(in + 6); + const auto w7 = LoadInt(in + 7); + const auto w8 = LoadInt(in + 8); + const auto w9 = LoadInt(in + 9); + const auto w10 = LoadInt(in + 10); + const auto w11 = LoadInt(in + 11); + const auto w12 = LoadInt(in + 12); + const auto w13 = LoadInt(in + 13); + const auto w14 = LoadInt(in + 14 ); out[0] = (w0) & mask; out[1] = (w0 >> 15) & mask; out[2] = ((w0 >> 30) | (w1 << 2)) & mask; @@ -750,28 +750,28 @@ inline const uint8_t* unpack15_32(const uint8_t* in, uint32_t* out){ out[30] = (w14 >> 2) & mask; out[31] = w14 >> 17; - return in + (15 * 4); + return in + (15 ); } -inline const uint8_t* unpack16_32(const uint8_t* in, uint32_t* out){ +inline const uint32_t* unpack16_32(const uint32_t* in, uint32_t* out){ constexpr uint32_t mask = ((uint32_t{1} << 16) - uint32_t{1}); - const auto w0 = LoadInt(in + 0 * 4); - const auto w1 = LoadInt(in + 1 * 4); - const auto w2 = LoadInt(in + 2 * 4); - const auto w3 = LoadInt(in + 3 * 4); - const auto w4 = LoadInt(in + 4 * 4); - const auto w5 = LoadInt(in + 5 * 4); - const auto w6 = LoadInt(in + 6 * 4); - const auto w7 = LoadInt(in + 7 * 4); - const auto w8 = LoadInt(in + 8 * 4); - const auto w9 = LoadInt(in + 9 * 4); - const auto w10 = LoadInt(in + 10 * 4); - const auto w11 = LoadInt(in + 11 * 4); - const auto w12 = LoadInt(in + 12 * 4); - const auto w13 = LoadInt(in + 13 * 4); - const auto w14 = LoadInt(in + 14 * 4); - const auto w15 = LoadInt(in + 15 * 4); + const auto w0 = LoadInt(in + 0); + const auto w1 = LoadInt(in + 1); + const auto w2 = LoadInt(in + 2); + const auto w3 = LoadInt(in + 3); + const auto w4 = LoadInt(in + 4); + const auto w5 = LoadInt(in + 5); + const auto w6 = LoadInt(in + 6); + const auto w7 = LoadInt(in + 7); + const auto w8 = LoadInt(in + 8); + const auto w9 = LoadInt(in + 9); + const auto w10 = LoadInt(in + 10); + const auto w11 = LoadInt(in + 11); + const auto w12 = LoadInt(in + 12); + const auto w13 = LoadInt(in + 13); + const auto w14 = LoadInt(in + 14); + const auto w15 = LoadInt(in + 15 ); out[0] = (w0) & mask; out[1] = w0 >> 16; out[2] = (w1) & mask; @@ -805,29 +805,29 @@ inline const uint8_t* unpack16_32(const uint8_t* in, uint32_t* out){ out[30] = (w15) & mask; out[31] = w15 >> 16; - return in + (16 * 4); + return in + (16 ); } -inline const uint8_t* unpack17_32(const uint8_t* in, uint32_t* out){ +inline const uint32_t* unpack17_32(const uint32_t* in, uint32_t* out){ constexpr uint32_t mask = ((uint32_t{1} << 17) - uint32_t{1}); - const auto w0 = LoadInt(in + 0 * 4); - const auto w1 = LoadInt(in + 1 * 4); - const auto w2 = LoadInt(in + 2 * 4); - const auto w3 = LoadInt(in + 3 * 4); - const auto w4 = LoadInt(in + 4 * 4); - const auto w5 = LoadInt(in + 5 * 4); - const auto w6 = LoadInt(in + 6 * 4); - const auto w7 = LoadInt(in + 7 * 4); - const auto w8 = LoadInt(in + 8 * 4); - const auto w9 = LoadInt(in + 9 * 4); - const auto w10 = LoadInt(in + 10 * 4); - const auto w11 = LoadInt(in + 11 * 4); - const auto w12 = LoadInt(in + 12 * 4); - const auto w13 = LoadInt(in + 13 * 4); - const auto w14 = LoadInt(in + 14 * 4); - const auto w15 = LoadInt(in + 15 * 4); - const auto w16 = LoadInt(in + 16 * 4); + const auto w0 = LoadInt(in + 0); + const auto w1 = LoadInt(in + 1); + const auto w2 = LoadInt(in + 2); + const auto w3 = LoadInt(in + 3); + const auto w4 = LoadInt(in + 4); + const auto w5 = LoadInt(in + 5); + const auto w6 = LoadInt(in + 6); + const auto w7 = LoadInt(in + 7); + const auto w8 = LoadInt(in + 8); + const auto w9 = LoadInt(in + 9); + const auto w10 = LoadInt(in + 10); + const auto w11 = LoadInt(in + 11); + const auto w12 = LoadInt(in + 12); + const auto w13 = LoadInt(in + 13); + const auto w14 = LoadInt(in + 14); + const auto w15 = LoadInt(in + 15); + const auto w16 = LoadInt(in + 16 ); out[0] = (w0) & mask; out[1] = ((w0 >> 17) | (w1 << 15)) & mask; out[2] = (w1 >> 2) & mask; @@ -861,30 +861,30 @@ inline const uint8_t* unpack17_32(const uint8_t* in, uint32_t* out){ out[30] = ((w15 >> 30) | (w16 << 2)) & mask; out[31] = w16 >> 15; - return in + (17 * 4); + return in + (17 ); } -inline const uint8_t* unpack18_32(const uint8_t* in, uint32_t* out){ +inline const uint32_t* unpack18_32(const uint32_t* in, uint32_t* out){ constexpr uint32_t mask = ((uint32_t{1} << 18) - uint32_t{1}); - const auto w0 = LoadInt(in + 0 * 4); - const auto w1 = LoadInt(in + 1 * 4); - const auto w2 = LoadInt(in + 2 * 4); - const auto w3 = LoadInt(in + 3 * 4); - const auto w4 = LoadInt(in + 4 * 4); - const auto w5 = LoadInt(in + 5 * 4); - const auto w6 = LoadInt(in + 6 * 4); - const auto w7 = LoadInt(in + 7 * 4); - const auto w8 = LoadInt(in + 8 * 4); - const auto w9 = LoadInt(in + 9 * 4); - const auto w10 = LoadInt(in + 10 * 4); - const auto w11 = LoadInt(in + 11 * 4); - const auto w12 = LoadInt(in + 12 * 4); - const auto w13 = LoadInt(in + 13 * 4); - const auto w14 = LoadInt(in + 14 * 4); - const auto w15 = LoadInt(in + 15 * 4); - const auto w16 = LoadInt(in + 16 * 4); - const auto w17 = LoadInt(in + 17 * 4); + const auto w0 = LoadInt(in + 0); + const auto w1 = LoadInt(in + 1); + const auto w2 = LoadInt(in + 2); + const auto w3 = LoadInt(in + 3); + const auto w4 = LoadInt(in + 4); + const auto w5 = LoadInt(in + 5); + const auto w6 = LoadInt(in + 6); + const auto w7 = LoadInt(in + 7); + const auto w8 = LoadInt(in + 8); + const auto w9 = LoadInt(in + 9); + const auto w10 = LoadInt(in + 10); + const auto w11 = LoadInt(in + 11); + const auto w12 = LoadInt(in + 12); + const auto w13 = LoadInt(in + 13); + const auto w14 = LoadInt(in + 14); + const auto w15 = LoadInt(in + 15); + const auto w16 = LoadInt(in + 16); + const auto w17 = LoadInt(in + 17 ); out[0] = (w0) & mask; out[1] = ((w0 >> 18) | (w1 << 14)) & mask; out[2] = (w1 >> 4) & mask; @@ -918,31 +918,31 @@ inline const uint8_t* unpack18_32(const uint8_t* in, uint32_t* out){ out[30] = ((w16 >> 28) | (w17 << 4)) & mask; out[31] = w17 >> 14; - return in + (18 * 4); + return in + (18 ); } -inline const uint8_t* unpack19_32(const uint8_t* in, uint32_t* out){ +inline const uint32_t* unpack19_32(const uint32_t* in, uint32_t* out){ constexpr uint32_t mask = ((uint32_t{1} << 19) - uint32_t{1}); - const auto w0 = LoadInt(in + 0 * 4); - const auto w1 = LoadInt(in + 1 * 4); - const auto w2 = LoadInt(in + 2 * 4); - const auto w3 = LoadInt(in + 3 * 4); - const auto w4 = LoadInt(in + 4 * 4); - const auto w5 = LoadInt(in + 5 * 4); - const auto w6 = LoadInt(in + 6 * 4); - const auto w7 = LoadInt(in + 7 * 4); - const auto w8 = LoadInt(in + 8 * 4); - const auto w9 = LoadInt(in + 9 * 4); - const auto w10 = LoadInt(in + 10 * 4); - const auto w11 = LoadInt(in + 11 * 4); - const auto w12 = LoadInt(in + 12 * 4); - const auto w13 = LoadInt(in + 13 * 4); - const auto w14 = LoadInt(in + 14 * 4); - const auto w15 = LoadInt(in + 15 * 4); - const auto w16 = LoadInt(in + 16 * 4); - const auto w17 = LoadInt(in + 17 * 4); - const auto w18 = LoadInt(in + 18 * 4); + const auto w0 = LoadInt(in + 0); + const auto w1 = LoadInt(in + 1); + const auto w2 = LoadInt(in + 2); + const auto w3 = LoadInt(in + 3); + const auto w4 = LoadInt(in + 4); + const auto w5 = LoadInt(in + 5); + const auto w6 = LoadInt(in + 6); + const auto w7 = LoadInt(in + 7); + const auto w8 = LoadInt(in + 8); + const auto w9 = LoadInt(in + 9); + const auto w10 = LoadInt(in + 10); + const auto w11 = LoadInt(in + 11); + const auto w12 = LoadInt(in + 12); + const auto w13 = LoadInt(in + 13); + const auto w14 = LoadInt(in + 14); + const auto w15 = LoadInt(in + 15); + const auto w16 = LoadInt(in + 16); + const auto w17 = LoadInt(in + 17); + const auto w18 = LoadInt(in + 18 ); out[0] = (w0) & mask; out[1] = ((w0 >> 19) | (w1 << 13)) & mask; out[2] = (w1 >> 6) & mask; @@ -976,32 +976,32 @@ inline const uint8_t* unpack19_32(const uint8_t* in, uint32_t* out){ out[30] = ((w17 >> 26) | (w18 << 6)) & mask; out[31] = w18 >> 13; - return in + (19 * 4); + return in + (19 ); } -inline const uint8_t* unpack20_32(const uint8_t* in, uint32_t* out){ +inline const uint32_t* unpack20_32(const uint32_t* in, uint32_t* out){ constexpr uint32_t mask = ((uint32_t{1} << 20) - uint32_t{1}); - const auto w0 = LoadInt(in + 0 * 4); - const auto w1 = LoadInt(in + 1 * 4); - const auto w2 = LoadInt(in + 2 * 4); - const auto w3 = LoadInt(in + 3 * 4); - const auto w4 = LoadInt(in + 4 * 4); - const auto w5 = LoadInt(in + 5 * 4); - const auto w6 = LoadInt(in + 6 * 4); - const auto w7 = LoadInt(in + 7 * 4); - const auto w8 = LoadInt(in + 8 * 4); - const auto w9 = LoadInt(in + 9 * 4); - const auto w10 = LoadInt(in + 10 * 4); - const auto w11 = LoadInt(in + 11 * 4); - const auto w12 = LoadInt(in + 12 * 4); - const auto w13 = LoadInt(in + 13 * 4); - const auto w14 = LoadInt(in + 14 * 4); - const auto w15 = LoadInt(in + 15 * 4); - const auto w16 = LoadInt(in + 16 * 4); - const auto w17 = LoadInt(in + 17 * 4); - const auto w18 = LoadInt(in + 18 * 4); - const auto w19 = LoadInt(in + 19 * 4); + const auto w0 = LoadInt(in + 0); + const auto w1 = LoadInt(in + 1); + const auto w2 = LoadInt(in + 2); + const auto w3 = LoadInt(in + 3); + const auto w4 = LoadInt(in + 4); + const auto w5 = LoadInt(in + 5); + const auto w6 = LoadInt(in + 6); + const auto w7 = LoadInt(in + 7); + const auto w8 = LoadInt(in + 8); + const auto w9 = LoadInt(in + 9); + const auto w10 = LoadInt(in + 10); + const auto w11 = LoadInt(in + 11); + const auto w12 = LoadInt(in + 12); + const auto w13 = LoadInt(in + 13); + const auto w14 = LoadInt(in + 14); + const auto w15 = LoadInt(in + 15); + const auto w16 = LoadInt(in + 16); + const auto w17 = LoadInt(in + 17); + const auto w18 = LoadInt(in + 18); + const auto w19 = LoadInt(in + 19 ); out[0] = (w0) & mask; out[1] = ((w0 >> 20) | (w1 << 12)) & mask; out[2] = (w1 >> 8) & mask; @@ -1035,33 +1035,33 @@ inline const uint8_t* unpack20_32(const uint8_t* in, uint32_t* out){ out[30] = ((w18 >> 24) | (w19 << 8)) & mask; out[31] = w19 >> 12; - return in + (20 * 4); + return in + (20 ); } -inline const uint8_t* unpack21_32(const uint8_t* in, uint32_t* out){ +inline const uint32_t* unpack21_32(const uint32_t* in, uint32_t* out){ constexpr uint32_t mask = ((uint32_t{1} << 21) - uint32_t{1}); - const auto w0 = LoadInt(in + 0 * 4); - const auto w1 = LoadInt(in + 1 * 4); - const auto w2 = LoadInt(in + 2 * 4); - const auto w3 = LoadInt(in + 3 * 4); - const auto w4 = LoadInt(in + 4 * 4); - const auto w5 = LoadInt(in + 5 * 4); - const auto w6 = LoadInt(in + 6 * 4); - const auto w7 = LoadInt(in + 7 * 4); - const auto w8 = LoadInt(in + 8 * 4); - const auto w9 = LoadInt(in + 9 * 4); - const auto w10 = LoadInt(in + 10 * 4); - const auto w11 = LoadInt(in + 11 * 4); - const auto w12 = LoadInt(in + 12 * 4); - const auto w13 = LoadInt(in + 13 * 4); - const auto w14 = LoadInt(in + 14 * 4); - const auto w15 = LoadInt(in + 15 * 4); - const auto w16 = LoadInt(in + 16 * 4); - const auto w17 = LoadInt(in + 17 * 4); - const auto w18 = LoadInt(in + 18 * 4); - const auto w19 = LoadInt(in + 19 * 4); - const auto w20 = LoadInt(in + 20 * 4); + const auto w0 = LoadInt(in + 0); + const auto w1 = LoadInt(in + 1); + const auto w2 = LoadInt(in + 2); + const auto w3 = LoadInt(in + 3); + const auto w4 = LoadInt(in + 4); + const auto w5 = LoadInt(in + 5); + const auto w6 = LoadInt(in + 6); + const auto w7 = LoadInt(in + 7); + const auto w8 = LoadInt(in + 8); + const auto w9 = LoadInt(in + 9); + const auto w10 = LoadInt(in + 10); + const auto w11 = LoadInt(in + 11); + const auto w12 = LoadInt(in + 12); + const auto w13 = LoadInt(in + 13); + const auto w14 = LoadInt(in + 14); + const auto w15 = LoadInt(in + 15); + const auto w16 = LoadInt(in + 16); + const auto w17 = LoadInt(in + 17); + const auto w18 = LoadInt(in + 18); + const auto w19 = LoadInt(in + 19); + const auto w20 = LoadInt(in + 20 ); out[0] = (w0) & mask; out[1] = ((w0 >> 21) | (w1 << 11)) & mask; out[2] = (w1 >> 10) & mask; @@ -1095,34 +1095,34 @@ inline const uint8_t* unpack21_32(const uint8_t* in, uint32_t* out){ out[30] = ((w19 >> 22) | (w20 << 10)) & mask; out[31] = w20 >> 11; - return in + (21 * 4); + return in + (21 ); } -inline const uint8_t* unpack22_32(const uint8_t* in, uint32_t* out){ +inline const uint32_t* unpack22_32(const uint32_t* in, uint32_t* out){ constexpr uint32_t mask = ((uint32_t{1} << 22) - uint32_t{1}); - const auto w0 = LoadInt(in + 0 * 4); - const auto w1 = LoadInt(in + 1 * 4); - const auto w2 = LoadInt(in + 2 * 4); - const auto w3 = LoadInt(in + 3 * 4); - const auto w4 = LoadInt(in + 4 * 4); - const auto w5 = LoadInt(in + 5 * 4); - const auto w6 = LoadInt(in + 6 * 4); - const auto w7 = LoadInt(in + 7 * 4); - const auto w8 = LoadInt(in + 8 * 4); - const auto w9 = LoadInt(in + 9 * 4); - const auto w10 = LoadInt(in + 10 * 4); - const auto w11 = LoadInt(in + 11 * 4); - const auto w12 = LoadInt(in + 12 * 4); - const auto w13 = LoadInt(in + 13 * 4); - const auto w14 = LoadInt(in + 14 * 4); - const auto w15 = LoadInt(in + 15 * 4); - const auto w16 = LoadInt(in + 16 * 4); - const auto w17 = LoadInt(in + 17 * 4); - const auto w18 = LoadInt(in + 18 * 4); - const auto w19 = LoadInt(in + 19 * 4); - const auto w20 = LoadInt(in + 20 * 4); - const auto w21 = LoadInt(in + 21 * 4); + const auto w0 = LoadInt(in + 0); + const auto w1 = LoadInt(in + 1); + const auto w2 = LoadInt(in + 2); + const auto w3 = LoadInt(in + 3); + const auto w4 = LoadInt(in + 4); + const auto w5 = LoadInt(in + 5); + const auto w6 = LoadInt(in + 6); + const auto w7 = LoadInt(in + 7); + const auto w8 = LoadInt(in + 8); + const auto w9 = LoadInt(in + 9); + const auto w10 = LoadInt(in + 10); + const auto w11 = LoadInt(in + 11); + const auto w12 = LoadInt(in + 12); + const auto w13 = LoadInt(in + 13); + const auto w14 = LoadInt(in + 14); + const auto w15 = LoadInt(in + 15); + const auto w16 = LoadInt(in + 16); + const auto w17 = LoadInt(in + 17); + const auto w18 = LoadInt(in + 18); + const auto w19 = LoadInt(in + 19); + const auto w20 = LoadInt(in + 20); + const auto w21 = LoadInt(in + 21 ); out[0] = (w0) & mask; out[1] = ((w0 >> 22) | (w1 << 10)) & mask; out[2] = ((w1 >> 12) | (w2 << 20)) & mask; @@ -1156,35 +1156,35 @@ inline const uint8_t* unpack22_32(const uint8_t* in, uint32_t* out){ out[30] = ((w20 >> 20) | (w21 << 12)) & mask; out[31] = w21 >> 10; - return in + (22 * 4); + return in + (22 ); } -inline const uint8_t* unpack23_32(const uint8_t* in, uint32_t* out){ +inline const uint32_t* unpack23_32(const uint32_t* in, uint32_t* out){ constexpr uint32_t mask = ((uint32_t{1} << 23) - uint32_t{1}); - const auto w0 = LoadInt(in + 0 * 4); - const auto w1 = LoadInt(in + 1 * 4); - const auto w2 = LoadInt(in + 2 * 4); - const auto w3 = LoadInt(in + 3 * 4); - const auto w4 = LoadInt(in + 4 * 4); - const auto w5 = LoadInt(in + 5 * 4); - const auto w6 = LoadInt(in + 6 * 4); - const auto w7 = LoadInt(in + 7 * 4); - const auto w8 = LoadInt(in + 8 * 4); - const auto w9 = LoadInt(in + 9 * 4); - const auto w10 = LoadInt(in + 10 * 4); - const auto w11 = LoadInt(in + 11 * 4); - const auto w12 = LoadInt(in + 12 * 4); - const auto w13 = LoadInt(in + 13 * 4); - const auto w14 = LoadInt(in + 14 * 4); - const auto w15 = LoadInt(in + 15 * 4); - const auto w16 = LoadInt(in + 16 * 4); - const auto w17 = LoadInt(in + 17 * 4); - const auto w18 = LoadInt(in + 18 * 4); - const auto w19 = LoadInt(in + 19 * 4); - const auto w20 = LoadInt(in + 20 * 4); - const auto w21 = LoadInt(in + 21 * 4); - const auto w22 = LoadInt(in + 22 * 4); + const auto w0 = LoadInt(in + 0); + const auto w1 = LoadInt(in + 1); + const auto w2 = LoadInt(in + 2); + const auto w3 = LoadInt(in + 3); + const auto w4 = LoadInt(in + 4); + const auto w5 = LoadInt(in + 5); + const auto w6 = LoadInt(in + 6); + const auto w7 = LoadInt(in + 7); + const auto w8 = LoadInt(in + 8); + const auto w9 = LoadInt(in + 9); + const auto w10 = LoadInt(in + 10); + const auto w11 = LoadInt(in + 11); + const auto w12 = LoadInt(in + 12); + const auto w13 = LoadInt(in + 13); + const auto w14 = LoadInt(in + 14); + const auto w15 = LoadInt(in + 15); + const auto w16 = LoadInt(in + 16); + const auto w17 = LoadInt(in + 17); + const auto w18 = LoadInt(in + 18); + const auto w19 = LoadInt(in + 19); + const auto w20 = LoadInt(in + 20); + const auto w21 = LoadInt(in + 21); + const auto w22 = LoadInt(in + 22 ); out[0] = (w0) & mask; out[1] = ((w0 >> 23) | (w1 << 9)) & mask; out[2] = ((w1 >> 14) | (w2 << 18)) & mask; @@ -1218,36 +1218,36 @@ inline const uint8_t* unpack23_32(const uint8_t* in, uint32_t* out){ out[30] = ((w21 >> 18) | (w22 << 14)) & mask; out[31] = w22 >> 9; - return in + (23 * 4); + return in + (23 ); } -inline const uint8_t* unpack24_32(const uint8_t* in, uint32_t* out){ +inline const uint32_t* unpack24_32(const uint32_t* in, uint32_t* out){ constexpr uint32_t mask = ((uint32_t{1} << 24) - uint32_t{1}); - const auto w0 = LoadInt(in + 0 * 4); - const auto w1 = LoadInt(in + 1 * 4); - const auto w2 = LoadInt(in + 2 * 4); - const auto w3 = LoadInt(in + 3 * 4); - const auto w4 = LoadInt(in + 4 * 4); - const auto w5 = LoadInt(in + 5 * 4); - const auto w6 = LoadInt(in + 6 * 4); - const auto w7 = LoadInt(in + 7 * 4); - const auto w8 = LoadInt(in + 8 * 4); - const auto w9 = LoadInt(in + 9 * 4); - const auto w10 = LoadInt(in + 10 * 4); - const auto w11 = LoadInt(in + 11 * 4); - const auto w12 = LoadInt(in + 12 * 4); - const auto w13 = LoadInt(in + 13 * 4); - const auto w14 = LoadInt(in + 14 * 4); - const auto w15 = LoadInt(in + 15 * 4); - const auto w16 = LoadInt(in + 16 * 4); - const auto w17 = LoadInt(in + 17 * 4); - const auto w18 = LoadInt(in + 18 * 4); - const auto w19 = LoadInt(in + 19 * 4); - const auto w20 = LoadInt(in + 20 * 4); - const auto w21 = LoadInt(in + 21 * 4); - const auto w22 = LoadInt(in + 22 * 4); - const auto w23 = LoadInt(in + 23 * 4); + const auto w0 = LoadInt(in + 0); + const auto w1 = LoadInt(in + 1); + const auto w2 = LoadInt(in + 2); + const auto w3 = LoadInt(in + 3); + const auto w4 = LoadInt(in + 4); + const auto w5 = LoadInt(in + 5); + const auto w6 = LoadInt(in + 6); + const auto w7 = LoadInt(in + 7); + const auto w8 = LoadInt(in + 8); + const auto w9 = LoadInt(in + 9); + const auto w10 = LoadInt(in + 10); + const auto w11 = LoadInt(in + 11); + const auto w12 = LoadInt(in + 12); + const auto w13 = LoadInt(in + 13); + const auto w14 = LoadInt(in + 14); + const auto w15 = LoadInt(in + 15); + const auto w16 = LoadInt(in + 16); + const auto w17 = LoadInt(in + 17); + const auto w18 = LoadInt(in + 18); + const auto w19 = LoadInt(in + 19); + const auto w20 = LoadInt(in + 20); + const auto w21 = LoadInt(in + 21); + const auto w22 = LoadInt(in + 22); + const auto w23 = LoadInt(in + 23 ); out[0] = (w0) & mask; out[1] = ((w0 >> 24) | (w1 << 8)) & mask; out[2] = ((w1 >> 16) | (w2 << 16)) & mask; @@ -1281,37 +1281,37 @@ inline const uint8_t* unpack24_32(const uint8_t* in, uint32_t* out){ out[30] = ((w22 >> 16) | (w23 << 16)) & mask; out[31] = w23 >> 8; - return in + (24 * 4); + return in + (24 ); } -inline const uint8_t* unpack25_32(const uint8_t* in, uint32_t* out){ +inline const uint32_t* unpack25_32(const uint32_t* in, uint32_t* out){ constexpr uint32_t mask = ((uint32_t{1} << 25) - uint32_t{1}); - const auto w0 = LoadInt(in + 0 * 4); - const auto w1 = LoadInt(in + 1 * 4); - const auto w2 = LoadInt(in + 2 * 4); - const auto w3 = LoadInt(in + 3 * 4); - const auto w4 = LoadInt(in + 4 * 4); - const auto w5 = LoadInt(in + 5 * 4); - const auto w6 = LoadInt(in + 6 * 4); - const auto w7 = LoadInt(in + 7 * 4); - const auto w8 = LoadInt(in + 8 * 4); - const auto w9 = LoadInt(in + 9 * 4); - const auto w10 = LoadInt(in + 10 * 4); - const auto w11 = LoadInt(in + 11 * 4); - const auto w12 = LoadInt(in + 12 * 4); - const auto w13 = LoadInt(in + 13 * 4); - const auto w14 = LoadInt(in + 14 * 4); - const auto w15 = LoadInt(in + 15 * 4); - const auto w16 = LoadInt(in + 16 * 4); - const auto w17 = LoadInt(in + 17 * 4); - const auto w18 = LoadInt(in + 18 * 4); - const auto w19 = LoadInt(in + 19 * 4); - const auto w20 = LoadInt(in + 20 * 4); - const auto w21 = LoadInt(in + 21 * 4); - const auto w22 = LoadInt(in + 22 * 4); - const auto w23 = LoadInt(in + 23 * 4); - const auto w24 = LoadInt(in + 24 * 4); + const auto w0 = LoadInt(in + 0); + const auto w1 = LoadInt(in + 1); + const auto w2 = LoadInt(in + 2); + const auto w3 = LoadInt(in + 3); + const auto w4 = LoadInt(in + 4); + const auto w5 = LoadInt(in + 5); + const auto w6 = LoadInt(in + 6); + const auto w7 = LoadInt(in + 7); + const auto w8 = LoadInt(in + 8); + const auto w9 = LoadInt(in + 9); + const auto w10 = LoadInt(in + 10); + const auto w11 = LoadInt(in + 11); + const auto w12 = LoadInt(in + 12); + const auto w13 = LoadInt(in + 13); + const auto w14 = LoadInt(in + 14); + const auto w15 = LoadInt(in + 15); + const auto w16 = LoadInt(in + 16); + const auto w17 = LoadInt(in + 17); + const auto w18 = LoadInt(in + 18); + const auto w19 = LoadInt(in + 19); + const auto w20 = LoadInt(in + 20); + const auto w21 = LoadInt(in + 21); + const auto w22 = LoadInt(in + 22); + const auto w23 = LoadInt(in + 23); + const auto w24 = LoadInt(in + 24 ); out[0] = (w0) & mask; out[1] = ((w0 >> 25) | (w1 << 7)) & mask; out[2] = ((w1 >> 18) | (w2 << 14)) & mask; @@ -1345,38 +1345,38 @@ inline const uint8_t* unpack25_32(const uint8_t* in, uint32_t* out){ out[30] = ((w23 >> 14) | (w24 << 18)) & mask; out[31] = w24 >> 7; - return in + (25 * 4); + return in + (25 ); } -inline const uint8_t* unpack26_32(const uint8_t* in, uint32_t* out){ +inline const uint32_t* unpack26_32(const uint32_t* in, uint32_t* out){ constexpr uint32_t mask = ((uint32_t{1} << 26) - uint32_t{1}); - const auto w0 = LoadInt(in + 0 * 4); - const auto w1 = LoadInt(in + 1 * 4); - const auto w2 = LoadInt(in + 2 * 4); - const auto w3 = LoadInt(in + 3 * 4); - const auto w4 = LoadInt(in + 4 * 4); - const auto w5 = LoadInt(in + 5 * 4); - const auto w6 = LoadInt(in + 6 * 4); - const auto w7 = LoadInt(in + 7 * 4); - const auto w8 = LoadInt(in + 8 * 4); - const auto w9 = LoadInt(in + 9 * 4); - const auto w10 = LoadInt(in + 10 * 4); - const auto w11 = LoadInt(in + 11 * 4); - const auto w12 = LoadInt(in + 12 * 4); - const auto w13 = LoadInt(in + 13 * 4); - const auto w14 = LoadInt(in + 14 * 4); - const auto w15 = LoadInt(in + 15 * 4); - const auto w16 = LoadInt(in + 16 * 4); - const auto w17 = LoadInt(in + 17 * 4); - const auto w18 = LoadInt(in + 18 * 4); - const auto w19 = LoadInt(in + 19 * 4); - const auto w20 = LoadInt(in + 20 * 4); - const auto w21 = LoadInt(in + 21 * 4); - const auto w22 = LoadInt(in + 22 * 4); - const auto w23 = LoadInt(in + 23 * 4); - const auto w24 = LoadInt(in + 24 * 4); - const auto w25 = LoadInt(in + 25 * 4); + const auto w0 = LoadInt(in + 0); + const auto w1 = LoadInt(in + 1); + const auto w2 = LoadInt(in + 2); + const auto w3 = LoadInt(in + 3); + const auto w4 = LoadInt(in + 4); + const auto w5 = LoadInt(in + 5); + const auto w6 = LoadInt(in + 6); + const auto w7 = LoadInt(in + 7); + const auto w8 = LoadInt(in + 8); + const auto w9 = LoadInt(in + 9); + const auto w10 = LoadInt(in + 10); + const auto w11 = LoadInt(in + 11); + const auto w12 = LoadInt(in + 12); + const auto w13 = LoadInt(in + 13); + const auto w14 = LoadInt(in + 14); + const auto w15 = LoadInt(in + 15); + const auto w16 = LoadInt(in + 16); + const auto w17 = LoadInt(in + 17); + const auto w18 = LoadInt(in + 18); + const auto w19 = LoadInt(in + 19); + const auto w20 = LoadInt(in + 20); + const auto w21 = LoadInt(in + 21); + const auto w22 = LoadInt(in + 22); + const auto w23 = LoadInt(in + 23); + const auto w24 = LoadInt(in + 24); + const auto w25 = LoadInt(in + 25 ); out[0] = (w0) & mask; out[1] = ((w0 >> 26) | (w1 << 6)) & mask; out[2] = ((w1 >> 20) | (w2 << 12)) & mask; @@ -1410,39 +1410,39 @@ inline const uint8_t* unpack26_32(const uint8_t* in, uint32_t* out){ out[30] = ((w24 >> 12) | (w25 << 20)) & mask; out[31] = w25 >> 6; - return in + (26 * 4); + return in + (26 ); } -inline const uint8_t* unpack27_32(const uint8_t* in, uint32_t* out){ +inline const uint32_t* unpack27_32(const uint32_t* in, uint32_t* out){ constexpr uint32_t mask = ((uint32_t{1} << 27) - uint32_t{1}); - const auto w0 = LoadInt(in + 0 * 4); - const auto w1 = LoadInt(in + 1 * 4); - const auto w2 = LoadInt(in + 2 * 4); - const auto w3 = LoadInt(in + 3 * 4); - const auto w4 = LoadInt(in + 4 * 4); - const auto w5 = LoadInt(in + 5 * 4); - const auto w6 = LoadInt(in + 6 * 4); - const auto w7 = LoadInt(in + 7 * 4); - const auto w8 = LoadInt(in + 8 * 4); - const auto w9 = LoadInt(in + 9 * 4); - const auto w10 = LoadInt(in + 10 * 4); - const auto w11 = LoadInt(in + 11 * 4); - const auto w12 = LoadInt(in + 12 * 4); - const auto w13 = LoadInt(in + 13 * 4); - const auto w14 = LoadInt(in + 14 * 4); - const auto w15 = LoadInt(in + 15 * 4); - const auto w16 = LoadInt(in + 16 * 4); - const auto w17 = LoadInt(in + 17 * 4); - const auto w18 = LoadInt(in + 18 * 4); - const auto w19 = LoadInt(in + 19 * 4); - const auto w20 = LoadInt(in + 20 * 4); - const auto w21 = LoadInt(in + 21 * 4); - const auto w22 = LoadInt(in + 22 * 4); - const auto w23 = LoadInt(in + 23 * 4); - const auto w24 = LoadInt(in + 24 * 4); - const auto w25 = LoadInt(in + 25 * 4); - const auto w26 = LoadInt(in + 26 * 4); + const auto w0 = LoadInt(in + 0); + const auto w1 = LoadInt(in + 1); + const auto w2 = LoadInt(in + 2); + const auto w3 = LoadInt(in + 3); + const auto w4 = LoadInt(in + 4); + const auto w5 = LoadInt(in + 5); + const auto w6 = LoadInt(in + 6); + const auto w7 = LoadInt(in + 7); + const auto w8 = LoadInt(in + 8); + const auto w9 = LoadInt(in + 9); + const auto w10 = LoadInt(in + 10); + const auto w11 = LoadInt(in + 11); + const auto w12 = LoadInt(in + 12); + const auto w13 = LoadInt(in + 13); + const auto w14 = LoadInt(in + 14); + const auto w15 = LoadInt(in + 15); + const auto w16 = LoadInt(in + 16); + const auto w17 = LoadInt(in + 17); + const auto w18 = LoadInt(in + 18); + const auto w19 = LoadInt(in + 19); + const auto w20 = LoadInt(in + 20); + const auto w21 = LoadInt(in + 21); + const auto w22 = LoadInt(in + 22); + const auto w23 = LoadInt(in + 23); + const auto w24 = LoadInt(in + 24); + const auto w25 = LoadInt(in + 25); + const auto w26 = LoadInt(in + 26 ); out[0] = (w0) & mask; out[1] = ((w0 >> 27) | (w1 << 5)) & mask; out[2] = ((w1 >> 22) | (w2 << 10)) & mask; @@ -1476,40 +1476,40 @@ inline const uint8_t* unpack27_32(const uint8_t* in, uint32_t* out){ out[30] = ((w25 >> 10) | (w26 << 22)) & mask; out[31] = w26 >> 5; - return in + (27 * 4); + return in + (27 ); } -inline const uint8_t* unpack28_32(const uint8_t* in, uint32_t* out){ +inline const uint32_t* unpack28_32(const uint32_t* in, uint32_t* out){ constexpr uint32_t mask = ((uint32_t{1} << 28) - uint32_t{1}); - const auto w0 = LoadInt(in + 0 * 4); - const auto w1 = LoadInt(in + 1 * 4); - const auto w2 = LoadInt(in + 2 * 4); - const auto w3 = LoadInt(in + 3 * 4); - const auto w4 = LoadInt(in + 4 * 4); - const auto w5 = LoadInt(in + 5 * 4); - const auto w6 = LoadInt(in + 6 * 4); - const auto w7 = LoadInt(in + 7 * 4); - const auto w8 = LoadInt(in + 8 * 4); - const auto w9 = LoadInt(in + 9 * 4); - const auto w10 = LoadInt(in + 10 * 4); - const auto w11 = LoadInt(in + 11 * 4); - const auto w12 = LoadInt(in + 12 * 4); - const auto w13 = LoadInt(in + 13 * 4); - const auto w14 = LoadInt(in + 14 * 4); - const auto w15 = LoadInt(in + 15 * 4); - const auto w16 = LoadInt(in + 16 * 4); - const auto w17 = LoadInt(in + 17 * 4); - const auto w18 = LoadInt(in + 18 * 4); - const auto w19 = LoadInt(in + 19 * 4); - const auto w20 = LoadInt(in + 20 * 4); - const auto w21 = LoadInt(in + 21 * 4); - const auto w22 = LoadInt(in + 22 * 4); - const auto w23 = LoadInt(in + 23 * 4); - const auto w24 = LoadInt(in + 24 * 4); - const auto w25 = LoadInt(in + 25 * 4); - const auto w26 = LoadInt(in + 26 * 4); - const auto w27 = LoadInt(in + 27 * 4); + const auto w0 = LoadInt(in + 0); + const auto w1 = LoadInt(in + 1); + const auto w2 = LoadInt(in + 2); + const auto w3 = LoadInt(in + 3); + const auto w4 = LoadInt(in + 4); + const auto w5 = LoadInt(in + 5); + const auto w6 = LoadInt(in + 6); + const auto w7 = LoadInt(in + 7); + const auto w8 = LoadInt(in + 8); + const auto w9 = LoadInt(in + 9); + const auto w10 = LoadInt(in + 10); + const auto w11 = LoadInt(in + 11); + const auto w12 = LoadInt(in + 12); + const auto w13 = LoadInt(in + 13); + const auto w14 = LoadInt(in + 14); + const auto w15 = LoadInt(in + 15); + const auto w16 = LoadInt(in + 16); + const auto w17 = LoadInt(in + 17); + const auto w18 = LoadInt(in + 18); + const auto w19 = LoadInt(in + 19); + const auto w20 = LoadInt(in + 20); + const auto w21 = LoadInt(in + 21); + const auto w22 = LoadInt(in + 22); + const auto w23 = LoadInt(in + 23); + const auto w24 = LoadInt(in + 24); + const auto w25 = LoadInt(in + 25); + const auto w26 = LoadInt(in + 26); + const auto w27 = LoadInt(in + 27 ); out[0] = (w0) & mask; out[1] = ((w0 >> 28) | (w1 << 4)) & mask; out[2] = ((w1 >> 24) | (w2 << 8)) & mask; @@ -1543,41 +1543,41 @@ inline const uint8_t* unpack28_32(const uint8_t* in, uint32_t* out){ out[30] = ((w26 >> 8) | (w27 << 24)) & mask; out[31] = w27 >> 4; - return in + (28 * 4); + return in + (28 ); } -inline const uint8_t* unpack29_32(const uint8_t* in, uint32_t* out){ +inline const uint32_t* unpack29_32(const uint32_t* in, uint32_t* out){ constexpr uint32_t mask = ((uint32_t{1} << 29) - uint32_t{1}); - const auto w0 = LoadInt(in + 0 * 4); - const auto w1 = LoadInt(in + 1 * 4); - const auto w2 = LoadInt(in + 2 * 4); - const auto w3 = LoadInt(in + 3 * 4); - const auto w4 = LoadInt(in + 4 * 4); - const auto w5 = LoadInt(in + 5 * 4); - const auto w6 = LoadInt(in + 6 * 4); - const auto w7 = LoadInt(in + 7 * 4); - const auto w8 = LoadInt(in + 8 * 4); - const auto w9 = LoadInt(in + 9 * 4); - const auto w10 = LoadInt(in + 10 * 4); - const auto w11 = LoadInt(in + 11 * 4); - const auto w12 = LoadInt(in + 12 * 4); - const auto w13 = LoadInt(in + 13 * 4); - const auto w14 = LoadInt(in + 14 * 4); - const auto w15 = LoadInt(in + 15 * 4); - const auto w16 = LoadInt(in + 16 * 4); - const auto w17 = LoadInt(in + 17 * 4); - const auto w18 = LoadInt(in + 18 * 4); - const auto w19 = LoadInt(in + 19 * 4); - const auto w20 = LoadInt(in + 20 * 4); - const auto w21 = LoadInt(in + 21 * 4); - const auto w22 = LoadInt(in + 22 * 4); - const auto w23 = LoadInt(in + 23 * 4); - const auto w24 = LoadInt(in + 24 * 4); - const auto w25 = LoadInt(in + 25 * 4); - const auto w26 = LoadInt(in + 26 * 4); - const auto w27 = LoadInt(in + 27 * 4); - const auto w28 = LoadInt(in + 28 * 4); + const auto w0 = LoadInt(in + 0); + const auto w1 = LoadInt(in + 1); + const auto w2 = LoadInt(in + 2); + const auto w3 = LoadInt(in + 3); + const auto w4 = LoadInt(in + 4); + const auto w5 = LoadInt(in + 5); + const auto w6 = LoadInt(in + 6); + const auto w7 = LoadInt(in + 7); + const auto w8 = LoadInt(in + 8); + const auto w9 = LoadInt(in + 9); + const auto w10 = LoadInt(in + 10); + const auto w11 = LoadInt(in + 11); + const auto w12 = LoadInt(in + 12); + const auto w13 = LoadInt(in + 13); + const auto w14 = LoadInt(in + 14); + const auto w15 = LoadInt(in + 15); + const auto w16 = LoadInt(in + 16); + const auto w17 = LoadInt(in + 17); + const auto w18 = LoadInt(in + 18); + const auto w19 = LoadInt(in + 19); + const auto w20 = LoadInt(in + 20); + const auto w21 = LoadInt(in + 21); + const auto w22 = LoadInt(in + 22); + const auto w23 = LoadInt(in + 23); + const auto w24 = LoadInt(in + 24); + const auto w25 = LoadInt(in + 25); + const auto w26 = LoadInt(in + 26); + const auto w27 = LoadInt(in + 27); + const auto w28 = LoadInt(in + 28 ); out[0] = (w0) & mask; out[1] = ((w0 >> 29) | (w1 << 3)) & mask; out[2] = ((w1 >> 26) | (w2 << 6)) & mask; @@ -1611,42 +1611,42 @@ inline const uint8_t* unpack29_32(const uint8_t* in, uint32_t* out){ out[30] = ((w27 >> 6) | (w28 << 26)) & mask; out[31] = w28 >> 3; - return in + (29 * 4); + return in + (29 ); } -inline const uint8_t* unpack30_32(const uint8_t* in, uint32_t* out){ +inline const uint32_t* unpack30_32(const uint32_t* in, uint32_t* out){ constexpr uint32_t mask = ((uint32_t{1} << 30) - uint32_t{1}); - const auto w0 = LoadInt(in + 0 * 4); - const auto w1 = LoadInt(in + 1 * 4); - const auto w2 = LoadInt(in + 2 * 4); - const auto w3 = LoadInt(in + 3 * 4); - const auto w4 = LoadInt(in + 4 * 4); - const auto w5 = LoadInt(in + 5 * 4); - const auto w6 = LoadInt(in + 6 * 4); - const auto w7 = LoadInt(in + 7 * 4); - const auto w8 = LoadInt(in + 8 * 4); - const auto w9 = LoadInt(in + 9 * 4); - const auto w10 = LoadInt(in + 10 * 4); - const auto w11 = LoadInt(in + 11 * 4); - const auto w12 = LoadInt(in + 12 * 4); - const auto w13 = LoadInt(in + 13 * 4); - const auto w14 = LoadInt(in + 14 * 4); - const auto w15 = LoadInt(in + 15 * 4); - const auto w16 = LoadInt(in + 16 * 4); - const auto w17 = LoadInt(in + 17 * 4); - const auto w18 = LoadInt(in + 18 * 4); - const auto w19 = LoadInt(in + 19 * 4); - const auto w20 = LoadInt(in + 20 * 4); - const auto w21 = LoadInt(in + 21 * 4); - const auto w22 = LoadInt(in + 22 * 4); - const auto w23 = LoadInt(in + 23 * 4); - const auto w24 = LoadInt(in + 24 * 4); - const auto w25 = LoadInt(in + 25 * 4); - const auto w26 = LoadInt(in + 26 * 4); - const auto w27 = LoadInt(in + 27 * 4); - const auto w28 = LoadInt(in + 28 * 4); - const auto w29 = LoadInt(in + 29 * 4); + const auto w0 = LoadInt(in + 0); + const auto w1 = LoadInt(in + 1); + const auto w2 = LoadInt(in + 2); + const auto w3 = LoadInt(in + 3); + const auto w4 = LoadInt(in + 4); + const auto w5 = LoadInt(in + 5); + const auto w6 = LoadInt(in + 6); + const auto w7 = LoadInt(in + 7); + const auto w8 = LoadInt(in + 8); + const auto w9 = LoadInt(in + 9); + const auto w10 = LoadInt(in + 10); + const auto w11 = LoadInt(in + 11); + const auto w12 = LoadInt(in + 12); + const auto w13 = LoadInt(in + 13); + const auto w14 = LoadInt(in + 14); + const auto w15 = LoadInt(in + 15); + const auto w16 = LoadInt(in + 16); + const auto w17 = LoadInt(in + 17); + const auto w18 = LoadInt(in + 18); + const auto w19 = LoadInt(in + 19); + const auto w20 = LoadInt(in + 20); + const auto w21 = LoadInt(in + 21); + const auto w22 = LoadInt(in + 22); + const auto w23 = LoadInt(in + 23); + const auto w24 = LoadInt(in + 24); + const auto w25 = LoadInt(in + 25); + const auto w26 = LoadInt(in + 26); + const auto w27 = LoadInt(in + 27); + const auto w28 = LoadInt(in + 28); + const auto w29 = LoadInt(in + 29 ); out[0] = (w0) & mask; out[1] = ((w0 >> 30) | (w1 << 2)) & mask; out[2] = ((w1 >> 28) | (w2 << 4)) & mask; @@ -1680,43 +1680,43 @@ inline const uint8_t* unpack30_32(const uint8_t* in, uint32_t* out){ out[30] = ((w28 >> 4) | (w29 << 28)) & mask; out[31] = w29 >> 2; - return in + (30 * 4); + return in + (30 ); } -inline const uint8_t* unpack31_32(const uint8_t* in, uint32_t* out){ +inline const uint32_t* unpack31_32(const uint32_t* in, uint32_t* out){ constexpr uint32_t mask = ((uint32_t{1} << 31) - uint32_t{1}); - const auto w0 = LoadInt(in + 0 * 4); - const auto w1 = LoadInt(in + 1 * 4); - const auto w2 = LoadInt(in + 2 * 4); - const auto w3 = LoadInt(in + 3 * 4); - const auto w4 = LoadInt(in + 4 * 4); - const auto w5 = LoadInt(in + 5 * 4); - const auto w6 = LoadInt(in + 6 * 4); - const auto w7 = LoadInt(in + 7 * 4); - const auto w8 = LoadInt(in + 8 * 4); - const auto w9 = LoadInt(in + 9 * 4); - const auto w10 = LoadInt(in + 10 * 4); - const auto w11 = LoadInt(in + 11 * 4); - const auto w12 = LoadInt(in + 12 * 4); - const auto w13 = LoadInt(in + 13 * 4); - const auto w14 = LoadInt(in + 14 * 4); - const auto w15 = LoadInt(in + 15 * 4); - const auto w16 = LoadInt(in + 16 * 4); - const auto w17 = LoadInt(in + 17 * 4); - const auto w18 = LoadInt(in + 18 * 4); - const auto w19 = LoadInt(in + 19 * 4); - const auto w20 = LoadInt(in + 20 * 4); - const auto w21 = LoadInt(in + 21 * 4); - const auto w22 = LoadInt(in + 22 * 4); - const auto w23 = LoadInt(in + 23 * 4); - const auto w24 = LoadInt(in + 24 * 4); - const auto w25 = LoadInt(in + 25 * 4); - const auto w26 = LoadInt(in + 26 * 4); - const auto w27 = LoadInt(in + 27 * 4); - const auto w28 = LoadInt(in + 28 * 4); - const auto w29 = LoadInt(in + 29 * 4); - const auto w30 = LoadInt(in + 30 * 4); + const auto w0 = LoadInt(in + 0); + const auto w1 = LoadInt(in + 1); + const auto w2 = LoadInt(in + 2); + const auto w3 = LoadInt(in + 3); + const auto w4 = LoadInt(in + 4); + const auto w5 = LoadInt(in + 5); + const auto w6 = LoadInt(in + 6); + const auto w7 = LoadInt(in + 7); + const auto w8 = LoadInt(in + 8); + const auto w9 = LoadInt(in + 9); + const auto w10 = LoadInt(in + 10); + const auto w11 = LoadInt(in + 11); + const auto w12 = LoadInt(in + 12); + const auto w13 = LoadInt(in + 13); + const auto w14 = LoadInt(in + 14); + const auto w15 = LoadInt(in + 15); + const auto w16 = LoadInt(in + 16); + const auto w17 = LoadInt(in + 17); + const auto w18 = LoadInt(in + 18); + const auto w19 = LoadInt(in + 19); + const auto w20 = LoadInt(in + 20); + const auto w21 = LoadInt(in + 21); + const auto w22 = LoadInt(in + 22); + const auto w23 = LoadInt(in + 23); + const auto w24 = LoadInt(in + 24); + const auto w25 = LoadInt(in + 25); + const auto w26 = LoadInt(in + 26); + const auto w27 = LoadInt(in + 27); + const auto w28 = LoadInt(in + 28); + const auto w29 = LoadInt(in + 29); + const auto w30 = LoadInt(in + 30 ); out[0] = (w0) & mask; out[1] = ((w0 >> 31) | (w1 << 1)) & mask; out[2] = ((w1 >> 30) | (w2 << 2)) & mask; @@ -1750,25 +1750,25 @@ inline const uint8_t* unpack31_32(const uint8_t* in, uint32_t* out){ out[30] = ((w29 >> 2) | (w30 << 30)) & mask; out[31] = w30 >> 1; - return in + (31 * 4); + return in + (31 ); } -inline const uint8_t* unpack32_32(const uint8_t* in, uint32_t* out){ +inline const uint32_t* unpack32_32(const uint32_t* in, uint32_t* out){ for(int k = 0; k < 32; k += 1) { - out[k] = LoadInt(in + (k * 4)); + out[k] = LoadInt(in + (k )); } - return in + (4 * 32); + return in + ( 32); } -inline const uint8_t* unpack0_64(const uint8_t* in, uint64_t* out){ +inline const uint64_t* unpack0_64(const uint64_t* in, uint64_t* out){ std::memset(out, 0, 32 * 8); return in; } -inline const uint8_t* unpack1_64(const uint8_t* in, uint64_t* out){ +inline const uint64_t* unpack1_64(const uint64_t* in, uint64_t* out){ constexpr uint64_t mask = ((uint64_t{1} << 1) - uint64_t{1}); - const auto w0 = static_cast(LoadInt(in + 0 * 8)); + const auto w0 = static_cast(LoadInt(reinterpret_cast(in + 0))); out[0] = (w0) & mask; out[1] = (w0 >> 1) & mask; out[2] = (w0 >> 2) & mask; @@ -1802,13 +1802,13 @@ inline const uint8_t* unpack1_64(const uint8_t* in, uint64_t* out){ out[30] = (w0 >> 30) & mask; out[31] = (w0 >> 31) & mask; - return in + (0 * 8 + 4); + return in + (0); } -inline const uint8_t* unpack2_64(const uint8_t* in, uint64_t* out){ +inline const uint64_t* unpack2_64(const uint64_t* in, uint64_t* out){ constexpr uint64_t mask = ((uint64_t{1} << 2) - uint64_t{1}); - const auto w0 = LoadInt(in + 0 * 8); + const auto w0 = LoadInt(in + 0 ); out[0] = (w0) & mask; out[1] = (w0 >> 2) & mask; out[2] = (w0 >> 4) & mask; @@ -1842,14 +1842,14 @@ inline const uint8_t* unpack2_64(const uint8_t* in, uint64_t* out){ out[30] = (w0 >> 60) & mask; out[31] = w0 >> 62; - return in + (1 * 8); + return in + (1 ); } -inline const uint8_t* unpack3_64(const uint8_t* in, uint64_t* out){ +inline const uint64_t* unpack3_64(const uint64_t* in, uint64_t* out){ constexpr uint64_t mask = ((uint64_t{1} << 3) - uint64_t{1}); - const auto w0 = LoadInt(in + 0 * 8); - const auto w1 = static_cast(LoadInt(in + 1 * 8)); + const auto w0 = LoadInt(in + 0); + const auto w1 = static_cast(LoadInt(reinterpret_cast(in + 1))); out[0] = (w0) & mask; out[1] = (w0 >> 3) & mask; out[2] = (w0 >> 6) & mask; @@ -1883,14 +1883,14 @@ inline const uint8_t* unpack3_64(const uint8_t* in, uint64_t* out){ out[30] = (w1 >> 26) & mask; out[31] = (w1 >> 29) & mask; - return in + (1 * 8 + 4); + return in + (1); } -inline const uint8_t* unpack4_64(const uint8_t* in, uint64_t* out){ +inline const uint64_t* unpack4_64(const uint64_t* in, uint64_t* out){ constexpr uint64_t mask = ((uint64_t{1} << 4) - uint64_t{1}); - const auto w0 = LoadInt(in + 0 * 8); - const auto w1 = LoadInt(in + 1 * 8); + const auto w0 = LoadInt(in + 0); + const auto w1 = LoadInt(in + 1 ); out[0] = (w0) & mask; out[1] = (w0 >> 4) & mask; out[2] = (w0 >> 8) & mask; @@ -1924,15 +1924,15 @@ inline const uint8_t* unpack4_64(const uint8_t* in, uint64_t* out){ out[30] = (w1 >> 56) & mask; out[31] = w1 >> 60; - return in + (2 * 8); + return in + (2 ); } -inline const uint8_t* unpack5_64(const uint8_t* in, uint64_t* out){ +inline const uint64_t* unpack5_64(const uint64_t* in, uint64_t* out){ constexpr uint64_t mask = ((uint64_t{1} << 5) - uint64_t{1}); - const auto w0 = LoadInt(in + 0 * 8); - const auto w1 = LoadInt(in + 1 * 8); - const auto w2 = static_cast(LoadInt(in + 2 * 8)); + const auto w0 = LoadInt(in + 0); + const auto w1 = LoadInt(in + 1); + const auto w2 = static_cast(LoadInt(reinterpret_cast(in + 2))); out[0] = (w0) & mask; out[1] = (w0 >> 5) & mask; out[2] = (w0 >> 10) & mask; @@ -1966,15 +1966,15 @@ inline const uint8_t* unpack5_64(const uint8_t* in, uint64_t* out){ out[30] = (w2 >> 22) & mask; out[31] = (w2 >> 27) & mask; - return in + (2 * 8 + 4); + return in + (2); } -inline const uint8_t* unpack6_64(const uint8_t* in, uint64_t* out){ +inline const uint64_t* unpack6_64(const uint64_t* in, uint64_t* out){ constexpr uint64_t mask = ((uint64_t{1} << 6) - uint64_t{1}); - const auto w0 = LoadInt(in + 0 * 8); - const auto w1 = LoadInt(in + 1 * 8); - const auto w2 = LoadInt(in + 2 * 8); + const auto w0 = LoadInt(in + 0); + const auto w1 = LoadInt(in + 1); + const auto w2 = LoadInt(in + 2 ); out[0] = (w0) & mask; out[1] = (w0 >> 6) & mask; out[2] = (w0 >> 12) & mask; @@ -2008,16 +2008,16 @@ inline const uint8_t* unpack6_64(const uint8_t* in, uint64_t* out){ out[30] = (w2 >> 52) & mask; out[31] = w2 >> 58; - return in + (3 * 8); + return in + (3 ); } -inline const uint8_t* unpack7_64(const uint8_t* in, uint64_t* out){ +inline const uint64_t* unpack7_64(const uint64_t* in, uint64_t* out){ constexpr uint64_t mask = ((uint64_t{1} << 7) - uint64_t{1}); - const auto w0 = LoadInt(in + 0 * 8); - const auto w1 = LoadInt(in + 1 * 8); - const auto w2 = LoadInt(in + 2 * 8); - const auto w3 = static_cast(LoadInt(in + 3 * 8)); + const auto w0 = LoadInt(in + 0); + const auto w1 = LoadInt(in + 1); + const auto w2 = LoadInt(in + 2); + const auto w3 = static_cast(LoadInt(reinterpret_cast(in + 3))); out[0] = (w0) & mask; out[1] = (w0 >> 7) & mask; out[2] = (w0 >> 14) & mask; @@ -2051,16 +2051,16 @@ inline const uint8_t* unpack7_64(const uint8_t* in, uint64_t* out){ out[30] = (w3 >> 18) & mask; out[31] = (w3 >> 25) & mask; - return in + (3 * 8 + 4); + return in + (3); } -inline const uint8_t* unpack8_64(const uint8_t* in, uint64_t* out){ +inline const uint64_t* unpack8_64(const uint64_t* in, uint64_t* out){ constexpr uint64_t mask = ((uint64_t{1} << 8) - uint64_t{1}); - const auto w0 = LoadInt(in + 0 * 8); - const auto w1 = LoadInt(in + 1 * 8); - const auto w2 = LoadInt(in + 2 * 8); - const auto w3 = LoadInt(in + 3 * 8); + const auto w0 = LoadInt(in + 0); + const auto w1 = LoadInt(in + 1); + const auto w2 = LoadInt(in + 2); + const auto w3 = LoadInt(in + 3 ); out[0] = (w0) & mask; out[1] = (w0 >> 8) & mask; out[2] = (w0 >> 16) & mask; @@ -2094,17 +2094,17 @@ inline const uint8_t* unpack8_64(const uint8_t* in, uint64_t* out){ out[30] = (w3 >> 48) & mask; out[31] = w3 >> 56; - return in + (4 * 8); + return in + (4 ); } -inline const uint8_t* unpack9_64(const uint8_t* in, uint64_t* out){ +inline const uint64_t* unpack9_64(const uint64_t* in, uint64_t* out){ constexpr uint64_t mask = ((uint64_t{1} << 9) - uint64_t{1}); - const auto w0 = LoadInt(in + 0 * 8); - const auto w1 = LoadInt(in + 1 * 8); - const auto w2 = LoadInt(in + 2 * 8); - const auto w3 = LoadInt(in + 3 * 8); - const auto w4 = static_cast(LoadInt(in + 4 * 8)); + const auto w0 = LoadInt(in + 0); + const auto w1 = LoadInt(in + 1); + const auto w2 = LoadInt(in + 2); + const auto w3 = LoadInt(in + 3); + const auto w4 = static_cast(LoadInt(reinterpret_cast(in + 4))); out[0] = (w0) & mask; out[1] = (w0 >> 9) & mask; out[2] = (w0 >> 18) & mask; @@ -2138,17 +2138,17 @@ inline const uint8_t* unpack9_64(const uint8_t* in, uint64_t* out){ out[30] = (w4 >> 14) & mask; out[31] = (w4 >> 23) & mask; - return in + (4 * 8 + 4); + return in + (4); } -inline const uint8_t* unpack10_64(const uint8_t* in, uint64_t* out){ +inline const uint64_t* unpack10_64(const uint64_t* in, uint64_t* out){ constexpr uint64_t mask = ((uint64_t{1} << 10) - uint64_t{1}); - const auto w0 = LoadInt(in + 0 * 8); - const auto w1 = LoadInt(in + 1 * 8); - const auto w2 = LoadInt(in + 2 * 8); - const auto w3 = LoadInt(in + 3 * 8); - const auto w4 = LoadInt(in + 4 * 8); + const auto w0 = LoadInt(in + 0); + const auto w1 = LoadInt(in + 1); + const auto w2 = LoadInt(in + 2); + const auto w3 = LoadInt(in + 3); + const auto w4 = LoadInt(in + 4 ); out[0] = (w0) & mask; out[1] = (w0 >> 10) & mask; out[2] = (w0 >> 20) & mask; @@ -2182,18 +2182,18 @@ inline const uint8_t* unpack10_64(const uint8_t* in, uint64_t* out){ out[30] = (w4 >> 44) & mask; out[31] = w4 >> 54; - return in + (5 * 8); + return in + (5 ); } -inline const uint8_t* unpack11_64(const uint8_t* in, uint64_t* out){ +inline const uint64_t* unpack11_64(const uint64_t* in, uint64_t* out){ constexpr uint64_t mask = ((uint64_t{1} << 11) - uint64_t{1}); - const auto w0 = LoadInt(in + 0 * 8); - const auto w1 = LoadInt(in + 1 * 8); - const auto w2 = LoadInt(in + 2 * 8); - const auto w3 = LoadInt(in + 3 * 8); - const auto w4 = LoadInt(in + 4 * 8); - const auto w5 = static_cast(LoadInt(in + 5 * 8)); + const auto w0 = LoadInt(in + 0); + const auto w1 = LoadInt(in + 1); + const auto w2 = LoadInt(in + 2); + const auto w3 = LoadInt(in + 3); + const auto w4 = LoadInt(in + 4); + const auto w5 = static_cast(LoadInt(reinterpret_cast(in + 5))); out[0] = (w0) & mask; out[1] = (w0 >> 11) & mask; out[2] = (w0 >> 22) & mask; @@ -2227,18 +2227,18 @@ inline const uint8_t* unpack11_64(const uint8_t* in, uint64_t* out){ out[30] = (w5 >> 10) & mask; out[31] = (w5 >> 21) & mask; - return in + (5 * 8 + 4); + return in + (5); } -inline const uint8_t* unpack12_64(const uint8_t* in, uint64_t* out){ +inline const uint64_t* unpack12_64(const uint64_t* in, uint64_t* out){ constexpr uint64_t mask = ((uint64_t{1} << 12) - uint64_t{1}); - const auto w0 = LoadInt(in + 0 * 8); - const auto w1 = LoadInt(in + 1 * 8); - const auto w2 = LoadInt(in + 2 * 8); - const auto w3 = LoadInt(in + 3 * 8); - const auto w4 = LoadInt(in + 4 * 8); - const auto w5 = LoadInt(in + 5 * 8); + const auto w0 = LoadInt(in + 0); + const auto w1 = LoadInt(in + 1); + const auto w2 = LoadInt(in + 2); + const auto w3 = LoadInt(in + 3); + const auto w4 = LoadInt(in + 4); + const auto w5 = LoadInt(in + 5 ); out[0] = (w0) & mask; out[1] = (w0 >> 12) & mask; out[2] = (w0 >> 24) & mask; @@ -2272,19 +2272,19 @@ inline const uint8_t* unpack12_64(const uint8_t* in, uint64_t* out){ out[30] = (w5 >> 40) & mask; out[31] = w5 >> 52; - return in + (6 * 8); + return in + (6 ); } -inline const uint8_t* unpack13_64(const uint8_t* in, uint64_t* out){ +inline const uint64_t* unpack13_64(const uint64_t* in, uint64_t* out){ constexpr uint64_t mask = ((uint64_t{1} << 13) - uint64_t{1}); - const auto w0 = LoadInt(in + 0 * 8); - const auto w1 = LoadInt(in + 1 * 8); - const auto w2 = LoadInt(in + 2 * 8); - const auto w3 = LoadInt(in + 3 * 8); - const auto w4 = LoadInt(in + 4 * 8); - const auto w5 = LoadInt(in + 5 * 8); - const auto w6 = static_cast(LoadInt(in + 6 * 8)); + const auto w0 = LoadInt(in + 0); + const auto w1 = LoadInt(in + 1); + const auto w2 = LoadInt(in + 2); + const auto w3 = LoadInt(in + 3); + const auto w4 = LoadInt(in + 4); + const auto w5 = LoadInt(in + 5); + const auto w6 = static_cast(LoadInt(reinterpret_cast(in + 6))); out[0] = (w0) & mask; out[1] = (w0 >> 13) & mask; out[2] = (w0 >> 26) & mask; @@ -2318,19 +2318,19 @@ inline const uint8_t* unpack13_64(const uint8_t* in, uint64_t* out){ out[30] = (w6 >> 6) & mask; out[31] = (w6 >> 19) & mask; - return in + (6 * 8 + 4); + return in + (6); } -inline const uint8_t* unpack14_64(const uint8_t* in, uint64_t* out){ +inline const uint64_t* unpack14_64(const uint64_t* in, uint64_t* out){ constexpr uint64_t mask = ((uint64_t{1} << 14) - uint64_t{1}); - const auto w0 = LoadInt(in + 0 * 8); - const auto w1 = LoadInt(in + 1 * 8); - const auto w2 = LoadInt(in + 2 * 8); - const auto w3 = LoadInt(in + 3 * 8); - const auto w4 = LoadInt(in + 4 * 8); - const auto w5 = LoadInt(in + 5 * 8); - const auto w6 = LoadInt(in + 6 * 8); + const auto w0 = LoadInt(in + 0); + const auto w1 = LoadInt(in + 1); + const auto w2 = LoadInt(in + 2); + const auto w3 = LoadInt(in + 3); + const auto w4 = LoadInt(in + 4); + const auto w5 = LoadInt(in + 5); + const auto w6 = LoadInt(in + 6 ); out[0] = (w0) & mask; out[1] = (w0 >> 14) & mask; out[2] = (w0 >> 28) & mask; @@ -2364,20 +2364,20 @@ inline const uint8_t* unpack14_64(const uint8_t* in, uint64_t* out){ out[30] = (w6 >> 36) & mask; out[31] = w6 >> 50; - return in + (7 * 8); + return in + (7 ); } -inline const uint8_t* unpack15_64(const uint8_t* in, uint64_t* out){ +inline const uint64_t* unpack15_64(const uint64_t* in, uint64_t* out){ constexpr uint64_t mask = ((uint64_t{1} << 15) - uint64_t{1}); - const auto w0 = LoadInt(in + 0 * 8); - const auto w1 = LoadInt(in + 1 * 8); - const auto w2 = LoadInt(in + 2 * 8); - const auto w3 = LoadInt(in + 3 * 8); - const auto w4 = LoadInt(in + 4 * 8); - const auto w5 = LoadInt(in + 5 * 8); - const auto w6 = LoadInt(in + 6 * 8); - const auto w7 = static_cast(LoadInt(in + 7 * 8)); + const auto w0 = LoadInt(in + 0); + const auto w1 = LoadInt(in + 1); + const auto w2 = LoadInt(in + 2); + const auto w3 = LoadInt(in + 3); + const auto w4 = LoadInt(in + 4); + const auto w5 = LoadInt(in + 5); + const auto w6 = LoadInt(in + 6); + const auto w7 = static_cast(LoadInt(reinterpret_cast(in + 7))); out[0] = (w0) & mask; out[1] = (w0 >> 15) & mask; out[2] = (w0 >> 30) & mask; @@ -2411,20 +2411,20 @@ inline const uint8_t* unpack15_64(const uint8_t* in, uint64_t* out){ out[30] = (w7 >> 2) & mask; out[31] = (w7 >> 17) & mask; - return in + (7 * 8 + 4); + return in + (7); } -inline const uint8_t* unpack16_64(const uint8_t* in, uint64_t* out){ +inline const uint64_t* unpack16_64(const uint64_t* in, uint64_t* out){ constexpr uint64_t mask = ((uint64_t{1} << 16) - uint64_t{1}); - const auto w0 = LoadInt(in + 0 * 8); - const auto w1 = LoadInt(in + 1 * 8); - const auto w2 = LoadInt(in + 2 * 8); - const auto w3 = LoadInt(in + 3 * 8); - const auto w4 = LoadInt(in + 4 * 8); - const auto w5 = LoadInt(in + 5 * 8); - const auto w6 = LoadInt(in + 6 * 8); - const auto w7 = LoadInt(in + 7 * 8); + const auto w0 = LoadInt(in + 0); + const auto w1 = LoadInt(in + 1); + const auto w2 = LoadInt(in + 2); + const auto w3 = LoadInt(in + 3); + const auto w4 = LoadInt(in + 4); + const auto w5 = LoadInt(in + 5); + const auto w6 = LoadInt(in + 6); + const auto w7 = LoadInt(in + 7 ); out[0] = (w0) & mask; out[1] = (w0 >> 16) & mask; out[2] = (w0 >> 32) & mask; @@ -2458,21 +2458,21 @@ inline const uint8_t* unpack16_64(const uint8_t* in, uint64_t* out){ out[30] = (w7 >> 32) & mask; out[31] = w7 >> 48; - return in + (8 * 8); + return in + (8 ); } -inline const uint8_t* unpack17_64(const uint8_t* in, uint64_t* out){ +inline const uint64_t* unpack17_64(const uint64_t* in, uint64_t* out){ constexpr uint64_t mask = ((uint64_t{1} << 17) - uint64_t{1}); - const auto w0 = LoadInt(in + 0 * 8); - const auto w1 = LoadInt(in + 1 * 8); - const auto w2 = LoadInt(in + 2 * 8); - const auto w3 = LoadInt(in + 3 * 8); - const auto w4 = LoadInt(in + 4 * 8); - const auto w5 = LoadInt(in + 5 * 8); - const auto w6 = LoadInt(in + 6 * 8); - const auto w7 = LoadInt(in + 7 * 8); - const auto w8 = static_cast(LoadInt(in + 8 * 8)); + const auto w0 = LoadInt(in + 0); + const auto w1 = LoadInt(in + 1); + const auto w2 = LoadInt(in + 2); + const auto w3 = LoadInt(in + 3); + const auto w4 = LoadInt(in + 4); + const auto w5 = LoadInt(in + 5); + const auto w6 = LoadInt(in + 6); + const auto w7 = LoadInt(in + 7); + const auto w8 = static_cast(LoadInt(reinterpret_cast(in + 8))); out[0] = (w0) & mask; out[1] = (w0 >> 17) & mask; out[2] = (w0 >> 34) & mask; @@ -2506,21 +2506,21 @@ inline const uint8_t* unpack17_64(const uint8_t* in, uint64_t* out){ out[30] = ((w7 >> 62) | (w8 << 2)) & mask; out[31] = (w8 >> 15) & mask; - return in + (8 * 8 + 4); + return in + (8); } -inline const uint8_t* unpack18_64(const uint8_t* in, uint64_t* out){ +inline const uint64_t* unpack18_64(const uint64_t* in, uint64_t* out){ constexpr uint64_t mask = ((uint64_t{1} << 18) - uint64_t{1}); - const auto w0 = LoadInt(in + 0 * 8); - const auto w1 = LoadInt(in + 1 * 8); - const auto w2 = LoadInt(in + 2 * 8); - const auto w3 = LoadInt(in + 3 * 8); - const auto w4 = LoadInt(in + 4 * 8); - const auto w5 = LoadInt(in + 5 * 8); - const auto w6 = LoadInt(in + 6 * 8); - const auto w7 = LoadInt(in + 7 * 8); - const auto w8 = LoadInt(in + 8 * 8); + const auto w0 = LoadInt(in + 0); + const auto w1 = LoadInt(in + 1); + const auto w2 = LoadInt(in + 2); + const auto w3 = LoadInt(in + 3); + const auto w4 = LoadInt(in + 4); + const auto w5 = LoadInt(in + 5); + const auto w6 = LoadInt(in + 6); + const auto w7 = LoadInt(in + 7); + const auto w8 = LoadInt(in + 8 ); out[0] = (w0) & mask; out[1] = (w0 >> 18) & mask; out[2] = (w0 >> 36) & mask; @@ -2554,22 +2554,22 @@ inline const uint8_t* unpack18_64(const uint8_t* in, uint64_t* out){ out[30] = (w8 >> 28) & mask; out[31] = w8 >> 46; - return in + (9 * 8); + return in + (9 ); } -inline const uint8_t* unpack19_64(const uint8_t* in, uint64_t* out){ +inline const uint64_t* unpack19_64(const uint64_t* in, uint64_t* out){ constexpr uint64_t mask = ((uint64_t{1} << 19) - uint64_t{1}); - const auto w0 = LoadInt(in + 0 * 8); - const auto w1 = LoadInt(in + 1 * 8); - const auto w2 = LoadInt(in + 2 * 8); - const auto w3 = LoadInt(in + 3 * 8); - const auto w4 = LoadInt(in + 4 * 8); - const auto w5 = LoadInt(in + 5 * 8); - const auto w6 = LoadInt(in + 6 * 8); - const auto w7 = LoadInt(in + 7 * 8); - const auto w8 = LoadInt(in + 8 * 8); - const auto w9 = static_cast(LoadInt(in + 9 * 8)); + const auto w0 = LoadInt(in + 0); + const auto w1 = LoadInt(in + 1); + const auto w2 = LoadInt(in + 2); + const auto w3 = LoadInt(in + 3); + const auto w4 = LoadInt(in + 4); + const auto w5 = LoadInt(in + 5); + const auto w6 = LoadInt(in + 6); + const auto w7 = LoadInt(in + 7); + const auto w8 = LoadInt(in + 8); + const auto w9 = static_cast(LoadInt(reinterpret_cast(in + 9))); out[0] = (w0) & mask; out[1] = (w0 >> 19) & mask; out[2] = (w0 >> 38) & mask; @@ -2603,22 +2603,22 @@ inline const uint8_t* unpack19_64(const uint8_t* in, uint64_t* out){ out[30] = ((w8 >> 58) | (w9 << 6)) & mask; out[31] = (w9 >> 13) & mask; - return in + (9 * 8 + 4); + return in + (9); } -inline const uint8_t* unpack20_64(const uint8_t* in, uint64_t* out){ +inline const uint64_t* unpack20_64(const uint64_t* in, uint64_t* out){ constexpr uint64_t mask = ((uint64_t{1} << 20) - uint64_t{1}); - const auto w0 = LoadInt(in + 0 * 8); - const auto w1 = LoadInt(in + 1 * 8); - const auto w2 = LoadInt(in + 2 * 8); - const auto w3 = LoadInt(in + 3 * 8); - const auto w4 = LoadInt(in + 4 * 8); - const auto w5 = LoadInt(in + 5 * 8); - const auto w6 = LoadInt(in + 6 * 8); - const auto w7 = LoadInt(in + 7 * 8); - const auto w8 = LoadInt(in + 8 * 8); - const auto w9 = LoadInt(in + 9 * 8); + const auto w0 = LoadInt(in + 0); + const auto w1 = LoadInt(in + 1); + const auto w2 = LoadInt(in + 2); + const auto w3 = LoadInt(in + 3); + const auto w4 = LoadInt(in + 4); + const auto w5 = LoadInt(in + 5); + const auto w6 = LoadInt(in + 6); + const auto w7 = LoadInt(in + 7); + const auto w8 = LoadInt(in + 8); + const auto w9 = LoadInt(in + 9 ); out[0] = (w0) & mask; out[1] = (w0 >> 20) & mask; out[2] = (w0 >> 40) & mask; @@ -2652,23 +2652,23 @@ inline const uint8_t* unpack20_64(const uint8_t* in, uint64_t* out){ out[30] = (w9 >> 24) & mask; out[31] = w9 >> 44; - return in + (10 * 8); + return in + (10 ); } -inline const uint8_t* unpack21_64(const uint8_t* in, uint64_t* out){ +inline const uint64_t* unpack21_64(const uint64_t* in, uint64_t* out){ constexpr uint64_t mask = ((uint64_t{1} << 21) - uint64_t{1}); - const auto w0 = LoadInt(in + 0 * 8); - const auto w1 = LoadInt(in + 1 * 8); - const auto w2 = LoadInt(in + 2 * 8); - const auto w3 = LoadInt(in + 3 * 8); - const auto w4 = LoadInt(in + 4 * 8); - const auto w5 = LoadInt(in + 5 * 8); - const auto w6 = LoadInt(in + 6 * 8); - const auto w7 = LoadInt(in + 7 * 8); - const auto w8 = LoadInt(in + 8 * 8); - const auto w9 = LoadInt(in + 9 * 8); - const auto w10 = static_cast(LoadInt(in + 10 * 8)); + const auto w0 = LoadInt(in + 0); + const auto w1 = LoadInt(in + 1); + const auto w2 = LoadInt(in + 2); + const auto w3 = LoadInt(in + 3); + const auto w4 = LoadInt(in + 4); + const auto w5 = LoadInt(in + 5); + const auto w6 = LoadInt(in + 6); + const auto w7 = LoadInt(in + 7); + const auto w8 = LoadInt(in + 8); + const auto w9 = LoadInt(in + 9); + const auto w10 = static_cast(LoadInt(reinterpret_cast(in + 10))); out[0] = (w0) & mask; out[1] = (w0 >> 21) & mask; out[2] = (w0 >> 42) & mask; @@ -2702,23 +2702,23 @@ inline const uint8_t* unpack21_64(const uint8_t* in, uint64_t* out){ out[30] = ((w9 >> 54) | (w10 << 10)) & mask; out[31] = (w10 >> 11) & mask; - return in + (10 * 8 + 4); + return in + (10); } -inline const uint8_t* unpack22_64(const uint8_t* in, uint64_t* out){ +inline const uint64_t* unpack22_64(const uint64_t* in, uint64_t* out){ constexpr uint64_t mask = ((uint64_t{1} << 22) - uint64_t{1}); - const auto w0 = LoadInt(in + 0 * 8); - const auto w1 = LoadInt(in + 1 * 8); - const auto w2 = LoadInt(in + 2 * 8); - const auto w3 = LoadInt(in + 3 * 8); - const auto w4 = LoadInt(in + 4 * 8); - const auto w5 = LoadInt(in + 5 * 8); - const auto w6 = LoadInt(in + 6 * 8); - const auto w7 = LoadInt(in + 7 * 8); - const auto w8 = LoadInt(in + 8 * 8); - const auto w9 = LoadInt(in + 9 * 8); - const auto w10 = LoadInt(in + 10 * 8); + const auto w0 = LoadInt(in + 0); + const auto w1 = LoadInt(in + 1); + const auto w2 = LoadInt(in + 2); + const auto w3 = LoadInt(in + 3); + const auto w4 = LoadInt(in + 4); + const auto w5 = LoadInt(in + 5); + const auto w6 = LoadInt(in + 6); + const auto w7 = LoadInt(in + 7); + const auto w8 = LoadInt(in + 8); + const auto w9 = LoadInt(in + 9); + const auto w10 = LoadInt(in + 10 ); out[0] = (w0) & mask; out[1] = (w0 >> 22) & mask; out[2] = ((w0 >> 44) | (w1 << 20)) & mask; @@ -2752,24 +2752,24 @@ inline const uint8_t* unpack22_64(const uint8_t* in, uint64_t* out){ out[30] = (w10 >> 20) & mask; out[31] = w10 >> 42; - return in + (11 * 8); + return in + (11 ); } -inline const uint8_t* unpack23_64(const uint8_t* in, uint64_t* out){ +inline const uint64_t* unpack23_64(const uint64_t* in, uint64_t* out){ constexpr uint64_t mask = ((uint64_t{1} << 23) - uint64_t{1}); - const auto w0 = LoadInt(in + 0 * 8); - const auto w1 = LoadInt(in + 1 * 8); - const auto w2 = LoadInt(in + 2 * 8); - const auto w3 = LoadInt(in + 3 * 8); - const auto w4 = LoadInt(in + 4 * 8); - const auto w5 = LoadInt(in + 5 * 8); - const auto w6 = LoadInt(in + 6 * 8); - const auto w7 = LoadInt(in + 7 * 8); - const auto w8 = LoadInt(in + 8 * 8); - const auto w9 = LoadInt(in + 9 * 8); - const auto w10 = LoadInt(in + 10 * 8); - const auto w11 = static_cast(LoadInt(in + 11 * 8)); + const auto w0 = LoadInt(in + 0); + const auto w1 = LoadInt(in + 1); + const auto w2 = LoadInt(in + 2); + const auto w3 = LoadInt(in + 3); + const auto w4 = LoadInt(in + 4); + const auto w5 = LoadInt(in + 5); + const auto w6 = LoadInt(in + 6); + const auto w7 = LoadInt(in + 7); + const auto w8 = LoadInt(in + 8); + const auto w9 = LoadInt(in + 9); + const auto w10 = LoadInt(in + 10); + const auto w11 = static_cast(LoadInt(reinterpret_cast(in + 11))); out[0] = (w0) & mask; out[1] = (w0 >> 23) & mask; out[2] = ((w0 >> 46) | (w1 << 18)) & mask; @@ -2803,24 +2803,24 @@ inline const uint8_t* unpack23_64(const uint8_t* in, uint64_t* out){ out[30] = ((w10 >> 50) | (w11 << 14)) & mask; out[31] = (w11 >> 9) & mask; - return in + (11 * 8 + 4); + return in + (11); } -inline const uint8_t* unpack24_64(const uint8_t* in, uint64_t* out){ +inline const uint64_t* unpack24_64(const uint64_t* in, uint64_t* out){ constexpr uint64_t mask = ((uint64_t{1} << 24) - uint64_t{1}); - const auto w0 = LoadInt(in + 0 * 8); - const auto w1 = LoadInt(in + 1 * 8); - const auto w2 = LoadInt(in + 2 * 8); - const auto w3 = LoadInt(in + 3 * 8); - const auto w4 = LoadInt(in + 4 * 8); - const auto w5 = LoadInt(in + 5 * 8); - const auto w6 = LoadInt(in + 6 * 8); - const auto w7 = LoadInt(in + 7 * 8); - const auto w8 = LoadInt(in + 8 * 8); - const auto w9 = LoadInt(in + 9 * 8); - const auto w10 = LoadInt(in + 10 * 8); - const auto w11 = LoadInt(in + 11 * 8); + const auto w0 = LoadInt(in + 0); + const auto w1 = LoadInt(in + 1); + const auto w2 = LoadInt(in + 2); + const auto w3 = LoadInt(in + 3); + const auto w4 = LoadInt(in + 4); + const auto w5 = LoadInt(in + 5); + const auto w6 = LoadInt(in + 6); + const auto w7 = LoadInt(in + 7); + const auto w8 = LoadInt(in + 8); + const auto w9 = LoadInt(in + 9); + const auto w10 = LoadInt(in + 10); + const auto w11 = LoadInt(in + 11 ); out[0] = (w0) & mask; out[1] = (w0 >> 24) & mask; out[2] = ((w0 >> 48) | (w1 << 16)) & mask; @@ -2854,25 +2854,25 @@ inline const uint8_t* unpack24_64(const uint8_t* in, uint64_t* out){ out[30] = (w11 >> 16) & mask; out[31] = w11 >> 40; - return in + (12 * 8); + return in + (12 ); } -inline const uint8_t* unpack25_64(const uint8_t* in, uint64_t* out){ +inline const uint64_t* unpack25_64(const uint64_t* in, uint64_t* out){ constexpr uint64_t mask = ((uint64_t{1} << 25) - uint64_t{1}); - const auto w0 = LoadInt(in + 0 * 8); - const auto w1 = LoadInt(in + 1 * 8); - const auto w2 = LoadInt(in + 2 * 8); - const auto w3 = LoadInt(in + 3 * 8); - const auto w4 = LoadInt(in + 4 * 8); - const auto w5 = LoadInt(in + 5 * 8); - const auto w6 = LoadInt(in + 6 * 8); - const auto w7 = LoadInt(in + 7 * 8); - const auto w8 = LoadInt(in + 8 * 8); - const auto w9 = LoadInt(in + 9 * 8); - const auto w10 = LoadInt(in + 10 * 8); - const auto w11 = LoadInt(in + 11 * 8); - const auto w12 = static_cast(LoadInt(in + 12 * 8)); + const auto w0 = LoadInt(in + 0); + const auto w1 = LoadInt(in + 1); + const auto w2 = LoadInt(in + 2); + const auto w3 = LoadInt(in + 3); + const auto w4 = LoadInt(in + 4); + const auto w5 = LoadInt(in + 5); + const auto w6 = LoadInt(in + 6); + const auto w7 = LoadInt(in + 7); + const auto w8 = LoadInt(in + 8); + const auto w9 = LoadInt(in + 9); + const auto w10 = LoadInt(in + 10); + const auto w11 = LoadInt(in + 11); + const auto w12 = static_cast(LoadInt(reinterpret_cast(in + 12))); out[0] = (w0) & mask; out[1] = (w0 >> 25) & mask; out[2] = ((w0 >> 50) | (w1 << 14)) & mask; @@ -2906,25 +2906,25 @@ inline const uint8_t* unpack25_64(const uint8_t* in, uint64_t* out){ out[30] = ((w11 >> 46) | (w12 << 18)) & mask; out[31] = (w12 >> 7) & mask; - return in + (12 * 8 + 4); + return in + (12); } -inline const uint8_t* unpack26_64(const uint8_t* in, uint64_t* out){ +inline const uint64_t* unpack26_64(const uint64_t* in, uint64_t* out){ constexpr uint64_t mask = ((uint64_t{1} << 26) - uint64_t{1}); - const auto w0 = LoadInt(in + 0 * 8); - const auto w1 = LoadInt(in + 1 * 8); - const auto w2 = LoadInt(in + 2 * 8); - const auto w3 = LoadInt(in + 3 * 8); - const auto w4 = LoadInt(in + 4 * 8); - const auto w5 = LoadInt(in + 5 * 8); - const auto w6 = LoadInt(in + 6 * 8); - const auto w7 = LoadInt(in + 7 * 8); - const auto w8 = LoadInt(in + 8 * 8); - const auto w9 = LoadInt(in + 9 * 8); - const auto w10 = LoadInt(in + 10 * 8); - const auto w11 = LoadInt(in + 11 * 8); - const auto w12 = LoadInt(in + 12 * 8); + const auto w0 = LoadInt(in + 0); + const auto w1 = LoadInt(in + 1); + const auto w2 = LoadInt(in + 2); + const auto w3 = LoadInt(in + 3); + const auto w4 = LoadInt(in + 4); + const auto w5 = LoadInt(in + 5); + const auto w6 = LoadInt(in + 6); + const auto w7 = LoadInt(in + 7); + const auto w8 = LoadInt(in + 8); + const auto w9 = LoadInt(in + 9); + const auto w10 = LoadInt(in + 10); + const auto w11 = LoadInt(in + 11); + const auto w12 = LoadInt(in + 12 ); out[0] = (w0) & mask; out[1] = (w0 >> 26) & mask; out[2] = ((w0 >> 52) | (w1 << 12)) & mask; @@ -2958,26 +2958,26 @@ inline const uint8_t* unpack26_64(const uint8_t* in, uint64_t* out){ out[30] = (w12 >> 12) & mask; out[31] = w12 >> 38; - return in + (13 * 8); + return in + (13 ); } -inline const uint8_t* unpack27_64(const uint8_t* in, uint64_t* out){ +inline const uint64_t* unpack27_64(const uint64_t* in, uint64_t* out){ constexpr uint64_t mask = ((uint64_t{1} << 27) - uint64_t{1}); - const auto w0 = LoadInt(in + 0 * 8); - const auto w1 = LoadInt(in + 1 * 8); - const auto w2 = LoadInt(in + 2 * 8); - const auto w3 = LoadInt(in + 3 * 8); - const auto w4 = LoadInt(in + 4 * 8); - const auto w5 = LoadInt(in + 5 * 8); - const auto w6 = LoadInt(in + 6 * 8); - const auto w7 = LoadInt(in + 7 * 8); - const auto w8 = LoadInt(in + 8 * 8); - const auto w9 = LoadInt(in + 9 * 8); - const auto w10 = LoadInt(in + 10 * 8); - const auto w11 = LoadInt(in + 11 * 8); - const auto w12 = LoadInt(in + 12 * 8); - const auto w13 = static_cast(LoadInt(in + 13 * 8)); + const auto w0 = LoadInt(in + 0); + const auto w1 = LoadInt(in + 1); + const auto w2 = LoadInt(in + 2); + const auto w3 = LoadInt(in + 3); + const auto w4 = LoadInt(in + 4); + const auto w5 = LoadInt(in + 5); + const auto w6 = LoadInt(in + 6); + const auto w7 = LoadInt(in + 7); + const auto w8 = LoadInt(in + 8); + const auto w9 = LoadInt(in + 9); + const auto w10 = LoadInt(in + 10); + const auto w11 = LoadInt(in + 11); + const auto w12 = LoadInt(in + 12); + const auto w13 = static_cast(LoadInt(reinterpret_cast(in + 13))); out[0] = (w0) & mask; out[1] = (w0 >> 27) & mask; out[2] = ((w0 >> 54) | (w1 << 10)) & mask; @@ -3011,26 +3011,26 @@ inline const uint8_t* unpack27_64(const uint8_t* in, uint64_t* out){ out[30] = ((w12 >> 42) | (w13 << 22)) & mask; out[31] = (w13 >> 5) & mask; - return in + (13 * 8 + 4); + return in + (13); } -inline const uint8_t* unpack28_64(const uint8_t* in, uint64_t* out){ +inline const uint64_t* unpack28_64(const uint64_t* in, uint64_t* out){ constexpr uint64_t mask = ((uint64_t{1} << 28) - uint64_t{1}); - const auto w0 = LoadInt(in + 0 * 8); - const auto w1 = LoadInt(in + 1 * 8); - const auto w2 = LoadInt(in + 2 * 8); - const auto w3 = LoadInt(in + 3 * 8); - const auto w4 = LoadInt(in + 4 * 8); - const auto w5 = LoadInt(in + 5 * 8); - const auto w6 = LoadInt(in + 6 * 8); - const auto w7 = LoadInt(in + 7 * 8); - const auto w8 = LoadInt(in + 8 * 8); - const auto w9 = LoadInt(in + 9 * 8); - const auto w10 = LoadInt(in + 10 * 8); - const auto w11 = LoadInt(in + 11 * 8); - const auto w12 = LoadInt(in + 12 * 8); - const auto w13 = LoadInt(in + 13 * 8); + const auto w0 = LoadInt(in + 0); + const auto w1 = LoadInt(in + 1); + const auto w2 = LoadInt(in + 2); + const auto w3 = LoadInt(in + 3); + const auto w4 = LoadInt(in + 4); + const auto w5 = LoadInt(in + 5); + const auto w6 = LoadInt(in + 6); + const auto w7 = LoadInt(in + 7); + const auto w8 = LoadInt(in + 8); + const auto w9 = LoadInt(in + 9); + const auto w10 = LoadInt(in + 10); + const auto w11 = LoadInt(in + 11); + const auto w12 = LoadInt(in + 12); + const auto w13 = LoadInt(in + 13 ); out[0] = (w0) & mask; out[1] = (w0 >> 28) & mask; out[2] = ((w0 >> 56) | (w1 << 8)) & mask; @@ -3064,27 +3064,27 @@ inline const uint8_t* unpack28_64(const uint8_t* in, uint64_t* out){ out[30] = (w13 >> 8) & mask; out[31] = w13 >> 36; - return in + (14 * 8); + return in + (14 ); } -inline const uint8_t* unpack29_64(const uint8_t* in, uint64_t* out){ +inline const uint64_t* unpack29_64(const uint64_t* in, uint64_t* out){ constexpr uint64_t mask = ((uint64_t{1} << 29) - uint64_t{1}); - const auto w0 = LoadInt(in + 0 * 8); - const auto w1 = LoadInt(in + 1 * 8); - const auto w2 = LoadInt(in + 2 * 8); - const auto w3 = LoadInt(in + 3 * 8); - const auto w4 = LoadInt(in + 4 * 8); - const auto w5 = LoadInt(in + 5 * 8); - const auto w6 = LoadInt(in + 6 * 8); - const auto w7 = LoadInt(in + 7 * 8); - const auto w8 = LoadInt(in + 8 * 8); - const auto w9 = LoadInt(in + 9 * 8); - const auto w10 = LoadInt(in + 10 * 8); - const auto w11 = LoadInt(in + 11 * 8); - const auto w12 = LoadInt(in + 12 * 8); - const auto w13 = LoadInt(in + 13 * 8); - const auto w14 = static_cast(LoadInt(in + 14 * 8)); + const auto w0 = LoadInt(in + 0); + const auto w1 = LoadInt(in + 1); + const auto w2 = LoadInt(in + 2); + const auto w3 = LoadInt(in + 3); + const auto w4 = LoadInt(in + 4); + const auto w5 = LoadInt(in + 5); + const auto w6 = LoadInt(in + 6); + const auto w7 = LoadInt(in + 7); + const auto w8 = LoadInt(in + 8); + const auto w9 = LoadInt(in + 9); + const auto w10 = LoadInt(in + 10); + const auto w11 = LoadInt(in + 11); + const auto w12 = LoadInt(in + 12); + const auto w13 = LoadInt(in + 13); + const auto w14 = static_cast(LoadInt(reinterpret_cast(in + 14))); out[0] = (w0) & mask; out[1] = (w0 >> 29) & mask; out[2] = ((w0 >> 58) | (w1 << 6)) & mask; @@ -3118,27 +3118,27 @@ inline const uint8_t* unpack29_64(const uint8_t* in, uint64_t* out){ out[30] = ((w13 >> 38) | (w14 << 26)) & mask; out[31] = (w14 >> 3) & mask; - return in + (14 * 8 + 4); + return in + (14); } -inline const uint8_t* unpack30_64(const uint8_t* in, uint64_t* out){ +inline const uint64_t* unpack30_64(const uint64_t* in, uint64_t* out){ constexpr uint64_t mask = ((uint64_t{1} << 30) - uint64_t{1}); - const auto w0 = LoadInt(in + 0 * 8); - const auto w1 = LoadInt(in + 1 * 8); - const auto w2 = LoadInt(in + 2 * 8); - const auto w3 = LoadInt(in + 3 * 8); - const auto w4 = LoadInt(in + 4 * 8); - const auto w5 = LoadInt(in + 5 * 8); - const auto w6 = LoadInt(in + 6 * 8); - const auto w7 = LoadInt(in + 7 * 8); - const auto w8 = LoadInt(in + 8 * 8); - const auto w9 = LoadInt(in + 9 * 8); - const auto w10 = LoadInt(in + 10 * 8); - const auto w11 = LoadInt(in + 11 * 8); - const auto w12 = LoadInt(in + 12 * 8); - const auto w13 = LoadInt(in + 13 * 8); - const auto w14 = LoadInt(in + 14 * 8); + const auto w0 = LoadInt(in + 0); + const auto w1 = LoadInt(in + 1); + const auto w2 = LoadInt(in + 2); + const auto w3 = LoadInt(in + 3); + const auto w4 = LoadInt(in + 4); + const auto w5 = LoadInt(in + 5); + const auto w6 = LoadInt(in + 6); + const auto w7 = LoadInt(in + 7); + const auto w8 = LoadInt(in + 8); + const auto w9 = LoadInt(in + 9); + const auto w10 = LoadInt(in + 10); + const auto w11 = LoadInt(in + 11); + const auto w12 = LoadInt(in + 12); + const auto w13 = LoadInt(in + 13); + const auto w14 = LoadInt(in + 14 ); out[0] = (w0) & mask; out[1] = (w0 >> 30) & mask; out[2] = ((w0 >> 60) | (w1 << 4)) & mask; @@ -3172,28 +3172,28 @@ inline const uint8_t* unpack30_64(const uint8_t* in, uint64_t* out){ out[30] = (w14 >> 4) & mask; out[31] = w14 >> 34; - return in + (15 * 8); + return in + (15 ); } -inline const uint8_t* unpack31_64(const uint8_t* in, uint64_t* out){ +inline const uint64_t* unpack31_64(const uint64_t* in, uint64_t* out){ constexpr uint64_t mask = ((uint64_t{1} << 31) - uint64_t{1}); - const auto w0 = LoadInt(in + 0 * 8); - const auto w1 = LoadInt(in + 1 * 8); - const auto w2 = LoadInt(in + 2 * 8); - const auto w3 = LoadInt(in + 3 * 8); - const auto w4 = LoadInt(in + 4 * 8); - const auto w5 = LoadInt(in + 5 * 8); - const auto w6 = LoadInt(in + 6 * 8); - const auto w7 = LoadInt(in + 7 * 8); - const auto w8 = LoadInt(in + 8 * 8); - const auto w9 = LoadInt(in + 9 * 8); - const auto w10 = LoadInt(in + 10 * 8); - const auto w11 = LoadInt(in + 11 * 8); - const auto w12 = LoadInt(in + 12 * 8); - const auto w13 = LoadInt(in + 13 * 8); - const auto w14 = LoadInt(in + 14 * 8); - const auto w15 = static_cast(LoadInt(in + 15 * 8)); + const auto w0 = LoadInt(in + 0); + const auto w1 = LoadInt(in + 1); + const auto w2 = LoadInt(in + 2); + const auto w3 = LoadInt(in + 3); + const auto w4 = LoadInt(in + 4); + const auto w5 = LoadInt(in + 5); + const auto w6 = LoadInt(in + 6); + const auto w7 = LoadInt(in + 7); + const auto w8 = LoadInt(in + 8); + const auto w9 = LoadInt(in + 9); + const auto w10 = LoadInt(in + 10); + const auto w11 = LoadInt(in + 11); + const auto w12 = LoadInt(in + 12); + const auto w13 = LoadInt(in + 13); + const auto w14 = LoadInt(in + 14); + const auto w15 = static_cast(LoadInt(reinterpret_cast(in + 15))); out[0] = (w0) & mask; out[1] = (w0 >> 31) & mask; out[2] = ((w0 >> 62) | (w1 << 2)) & mask; @@ -3227,28 +3227,28 @@ inline const uint8_t* unpack31_64(const uint8_t* in, uint64_t* out){ out[30] = ((w14 >> 34) | (w15 << 30)) & mask; out[31] = (w15 >> 1) & mask; - return in + (15 * 8 + 4); + return in + (15); } -inline const uint8_t* unpack32_64(const uint8_t* in, uint64_t* out){ +inline const uint64_t* unpack32_64(const uint64_t* in, uint64_t* out){ constexpr uint64_t mask = ((uint64_t{1} << 32) - uint64_t{1}); - const auto w0 = LoadInt(in + 0 * 8); - const auto w1 = LoadInt(in + 1 * 8); - const auto w2 = LoadInt(in + 2 * 8); - const auto w3 = LoadInt(in + 3 * 8); - const auto w4 = LoadInt(in + 4 * 8); - const auto w5 = LoadInt(in + 5 * 8); - const auto w6 = LoadInt(in + 6 * 8); - const auto w7 = LoadInt(in + 7 * 8); - const auto w8 = LoadInt(in + 8 * 8); - const auto w9 = LoadInt(in + 9 * 8); - const auto w10 = LoadInt(in + 10 * 8); - const auto w11 = LoadInt(in + 11 * 8); - const auto w12 = LoadInt(in + 12 * 8); - const auto w13 = LoadInt(in + 13 * 8); - const auto w14 = LoadInt(in + 14 * 8); - const auto w15 = LoadInt(in + 15 * 8); + const auto w0 = LoadInt(in + 0); + const auto w1 = LoadInt(in + 1); + const auto w2 = LoadInt(in + 2); + const auto w3 = LoadInt(in + 3); + const auto w4 = LoadInt(in + 4); + const auto w5 = LoadInt(in + 5); + const auto w6 = LoadInt(in + 6); + const auto w7 = LoadInt(in + 7); + const auto w8 = LoadInt(in + 8); + const auto w9 = LoadInt(in + 9); + const auto w10 = LoadInt(in + 10); + const auto w11 = LoadInt(in + 11); + const auto w12 = LoadInt(in + 12); + const auto w13 = LoadInt(in + 13); + const auto w14 = LoadInt(in + 14); + const auto w15 = LoadInt(in + 15 ); out[0] = (w0) & mask; out[1] = w0 >> 32; out[2] = (w1) & mask; @@ -3282,29 +3282,29 @@ inline const uint8_t* unpack32_64(const uint8_t* in, uint64_t* out){ out[30] = (w15) & mask; out[31] = w15 >> 32; - return in + (16 * 8); + return in + (16 ); } -inline const uint8_t* unpack33_64(const uint8_t* in, uint64_t* out){ +inline const uint64_t* unpack33_64(const uint64_t* in, uint64_t* out){ constexpr uint64_t mask = ((uint64_t{1} << 33) - uint64_t{1}); - const auto w0 = LoadInt(in + 0 * 8); - const auto w1 = LoadInt(in + 1 * 8); - const auto w2 = LoadInt(in + 2 * 8); - const auto w3 = LoadInt(in + 3 * 8); - const auto w4 = LoadInt(in + 4 * 8); - const auto w5 = LoadInt(in + 5 * 8); - const auto w6 = LoadInt(in + 6 * 8); - const auto w7 = LoadInt(in + 7 * 8); - const auto w8 = LoadInt(in + 8 * 8); - const auto w9 = LoadInt(in + 9 * 8); - const auto w10 = LoadInt(in + 10 * 8); - const auto w11 = LoadInt(in + 11 * 8); - const auto w12 = LoadInt(in + 12 * 8); - const auto w13 = LoadInt(in + 13 * 8); - const auto w14 = LoadInt(in + 14 * 8); - const auto w15 = LoadInt(in + 15 * 8); - const auto w16 = static_cast(LoadInt(in + 16 * 8)); + const auto w0 = LoadInt(in + 0); + const auto w1 = LoadInt(in + 1); + const auto w2 = LoadInt(in + 2); + const auto w3 = LoadInt(in + 3); + const auto w4 = LoadInt(in + 4); + const auto w5 = LoadInt(in + 5); + const auto w6 = LoadInt(in + 6); + const auto w7 = LoadInt(in + 7); + const auto w8 = LoadInt(in + 8); + const auto w9 = LoadInt(in + 9); + const auto w10 = LoadInt(in + 10); + const auto w11 = LoadInt(in + 11); + const auto w12 = LoadInt(in + 12); + const auto w13 = LoadInt(in + 13); + const auto w14 = LoadInt(in + 14); + const auto w15 = LoadInt(in + 15); + const auto w16 = static_cast(LoadInt(reinterpret_cast(in + 16))); out[0] = (w0) & mask; out[1] = ((w0 >> 33) | (w1 << 31)) & mask; out[2] = (w1 >> 2) & mask; @@ -3338,29 +3338,29 @@ inline const uint8_t* unpack33_64(const uint8_t* in, uint64_t* out){ out[30] = (w15 >> 30) & mask; out[31] = ((w15 >> 63) | (w16 << 1)) & mask; - return in + (16 * 8 + 4); + return in + (16); } -inline const uint8_t* unpack34_64(const uint8_t* in, uint64_t* out){ +inline const uint64_t* unpack34_64(const uint64_t* in, uint64_t* out){ constexpr uint64_t mask = ((uint64_t{1} << 34) - uint64_t{1}); - const auto w0 = LoadInt(in + 0 * 8); - const auto w1 = LoadInt(in + 1 * 8); - const auto w2 = LoadInt(in + 2 * 8); - const auto w3 = LoadInt(in + 3 * 8); - const auto w4 = LoadInt(in + 4 * 8); - const auto w5 = LoadInt(in + 5 * 8); - const auto w6 = LoadInt(in + 6 * 8); - const auto w7 = LoadInt(in + 7 * 8); - const auto w8 = LoadInt(in + 8 * 8); - const auto w9 = LoadInt(in + 9 * 8); - const auto w10 = LoadInt(in + 10 * 8); - const auto w11 = LoadInt(in + 11 * 8); - const auto w12 = LoadInt(in + 12 * 8); - const auto w13 = LoadInt(in + 13 * 8); - const auto w14 = LoadInt(in + 14 * 8); - const auto w15 = LoadInt(in + 15 * 8); - const auto w16 = LoadInt(in + 16 * 8); + const auto w0 = LoadInt(in + 0); + const auto w1 = LoadInt(in + 1); + const auto w2 = LoadInt(in + 2); + const auto w3 = LoadInt(in + 3); + const auto w4 = LoadInt(in + 4); + const auto w5 = LoadInt(in + 5); + const auto w6 = LoadInt(in + 6); + const auto w7 = LoadInt(in + 7); + const auto w8 = LoadInt(in + 8); + const auto w9 = LoadInt(in + 9); + const auto w10 = LoadInt(in + 10); + const auto w11 = LoadInt(in + 11); + const auto w12 = LoadInt(in + 12); + const auto w13 = LoadInt(in + 13); + const auto w14 = LoadInt(in + 14); + const auto w15 = LoadInt(in + 15); + const auto w16 = LoadInt(in + 16 ); out[0] = (w0) & mask; out[1] = ((w0 >> 34) | (w1 << 30)) & mask; out[2] = (w1 >> 4) & mask; @@ -3394,30 +3394,30 @@ inline const uint8_t* unpack34_64(const uint8_t* in, uint64_t* out){ out[30] = ((w15 >> 60) | (w16 << 4)) & mask; out[31] = w16 >> 30; - return in + (17 * 8); + return in + (17 ); } -inline const uint8_t* unpack35_64(const uint8_t* in, uint64_t* out){ +inline const uint64_t* unpack35_64(const uint64_t* in, uint64_t* out){ constexpr uint64_t mask = ((uint64_t{1} << 35) - uint64_t{1}); - const auto w0 = LoadInt(in + 0 * 8); - const auto w1 = LoadInt(in + 1 * 8); - const auto w2 = LoadInt(in + 2 * 8); - const auto w3 = LoadInt(in + 3 * 8); - const auto w4 = LoadInt(in + 4 * 8); - const auto w5 = LoadInt(in + 5 * 8); - const auto w6 = LoadInt(in + 6 * 8); - const auto w7 = LoadInt(in + 7 * 8); - const auto w8 = LoadInt(in + 8 * 8); - const auto w9 = LoadInt(in + 9 * 8); - const auto w10 = LoadInt(in + 10 * 8); - const auto w11 = LoadInt(in + 11 * 8); - const auto w12 = LoadInt(in + 12 * 8); - const auto w13 = LoadInt(in + 13 * 8); - const auto w14 = LoadInt(in + 14 * 8); - const auto w15 = LoadInt(in + 15 * 8); - const auto w16 = LoadInt(in + 16 * 8); - const auto w17 = static_cast(LoadInt(in + 17 * 8)); + const auto w0 = LoadInt(in + 0); + const auto w1 = LoadInt(in + 1); + const auto w2 = LoadInt(in + 2); + const auto w3 = LoadInt(in + 3); + const auto w4 = LoadInt(in + 4); + const auto w5 = LoadInt(in + 5); + const auto w6 = LoadInt(in + 6); + const auto w7 = LoadInt(in + 7); + const auto w8 = LoadInt(in + 8); + const auto w9 = LoadInt(in + 9); + const auto w10 = LoadInt(in + 10); + const auto w11 = LoadInt(in + 11); + const auto w12 = LoadInt(in + 12); + const auto w13 = LoadInt(in + 13); + const auto w14 = LoadInt(in + 14); + const auto w15 = LoadInt(in + 15); + const auto w16 = LoadInt(in + 16); + const auto w17 = static_cast(LoadInt(reinterpret_cast(in + 17))); out[0] = (w0) & mask; out[1] = ((w0 >> 35) | (w1 << 29)) & mask; out[2] = (w1 >> 6) & mask; @@ -3451,30 +3451,30 @@ inline const uint8_t* unpack35_64(const uint8_t* in, uint64_t* out){ out[30] = (w16 >> 26) & mask; out[31] = ((w16 >> 61) | (w17 << 3)) & mask; - return in + (17 * 8 + 4); + return in + (17); } -inline const uint8_t* unpack36_64(const uint8_t* in, uint64_t* out){ +inline const uint64_t* unpack36_64(const uint64_t* in, uint64_t* out){ constexpr uint64_t mask = ((uint64_t{1} << 36) - uint64_t{1}); - const auto w0 = LoadInt(in + 0 * 8); - const auto w1 = LoadInt(in + 1 * 8); - const auto w2 = LoadInt(in + 2 * 8); - const auto w3 = LoadInt(in + 3 * 8); - const auto w4 = LoadInt(in + 4 * 8); - const auto w5 = LoadInt(in + 5 * 8); - const auto w6 = LoadInt(in + 6 * 8); - const auto w7 = LoadInt(in + 7 * 8); - const auto w8 = LoadInt(in + 8 * 8); - const auto w9 = LoadInt(in + 9 * 8); - const auto w10 = LoadInt(in + 10 * 8); - const auto w11 = LoadInt(in + 11 * 8); - const auto w12 = LoadInt(in + 12 * 8); - const auto w13 = LoadInt(in + 13 * 8); - const auto w14 = LoadInt(in + 14 * 8); - const auto w15 = LoadInt(in + 15 * 8); - const auto w16 = LoadInt(in + 16 * 8); - const auto w17 = LoadInt(in + 17 * 8); + const auto w0 = LoadInt(in + 0); + const auto w1 = LoadInt(in + 1); + const auto w2 = LoadInt(in + 2); + const auto w3 = LoadInt(in + 3); + const auto w4 = LoadInt(in + 4); + const auto w5 = LoadInt(in + 5); + const auto w6 = LoadInt(in + 6); + const auto w7 = LoadInt(in + 7); + const auto w8 = LoadInt(in + 8); + const auto w9 = LoadInt(in + 9); + const auto w10 = LoadInt(in + 10); + const auto w11 = LoadInt(in + 11); + const auto w12 = LoadInt(in + 12); + const auto w13 = LoadInt(in + 13); + const auto w14 = LoadInt(in + 14); + const auto w15 = LoadInt(in + 15); + const auto w16 = LoadInt(in + 16); + const auto w17 = LoadInt(in + 17 ); out[0] = (w0) & mask; out[1] = ((w0 >> 36) | (w1 << 28)) & mask; out[2] = (w1 >> 8) & mask; @@ -3508,31 +3508,31 @@ inline const uint8_t* unpack36_64(const uint8_t* in, uint64_t* out){ out[30] = ((w16 >> 56) | (w17 << 8)) & mask; out[31] = w17 >> 28; - return in + (18 * 8); + return in + (18 ); } -inline const uint8_t* unpack37_64(const uint8_t* in, uint64_t* out){ +inline const uint64_t* unpack37_64(const uint64_t* in, uint64_t* out){ constexpr uint64_t mask = ((uint64_t{1} << 37) - uint64_t{1}); - const auto w0 = LoadInt(in + 0 * 8); - const auto w1 = LoadInt(in + 1 * 8); - const auto w2 = LoadInt(in + 2 * 8); - const auto w3 = LoadInt(in + 3 * 8); - const auto w4 = LoadInt(in + 4 * 8); - const auto w5 = LoadInt(in + 5 * 8); - const auto w6 = LoadInt(in + 6 * 8); - const auto w7 = LoadInt(in + 7 * 8); - const auto w8 = LoadInt(in + 8 * 8); - const auto w9 = LoadInt(in + 9 * 8); - const auto w10 = LoadInt(in + 10 * 8); - const auto w11 = LoadInt(in + 11 * 8); - const auto w12 = LoadInt(in + 12 * 8); - const auto w13 = LoadInt(in + 13 * 8); - const auto w14 = LoadInt(in + 14 * 8); - const auto w15 = LoadInt(in + 15 * 8); - const auto w16 = LoadInt(in + 16 * 8); - const auto w17 = LoadInt(in + 17 * 8); - const auto w18 = static_cast(LoadInt(in + 18 * 8)); + const auto w0 = LoadInt(in + 0); + const auto w1 = LoadInt(in + 1); + const auto w2 = LoadInt(in + 2); + const auto w3 = LoadInt(in + 3); + const auto w4 = LoadInt(in + 4); + const auto w5 = LoadInt(in + 5); + const auto w6 = LoadInt(in + 6); + const auto w7 = LoadInt(in + 7); + const auto w8 = LoadInt(in + 8); + const auto w9 = LoadInt(in + 9); + const auto w10 = LoadInt(in + 10); + const auto w11 = LoadInt(in + 11); + const auto w12 = LoadInt(in + 12); + const auto w13 = LoadInt(in + 13); + const auto w14 = LoadInt(in + 14); + const auto w15 = LoadInt(in + 15); + const auto w16 = LoadInt(in + 16); + const auto w17 = LoadInt(in + 17); + const auto w18 = static_cast(LoadInt(reinterpret_cast(in + 18))); out[0] = (w0) & mask; out[1] = ((w0 >> 37) | (w1 << 27)) & mask; out[2] = (w1 >> 10) & mask; @@ -3566,31 +3566,31 @@ inline const uint8_t* unpack37_64(const uint8_t* in, uint64_t* out){ out[30] = (w17 >> 22) & mask; out[31] = ((w17 >> 59) | (w18 << 5)) & mask; - return in + (18 * 8 + 4); + return in + (18); } -inline const uint8_t* unpack38_64(const uint8_t* in, uint64_t* out){ +inline const uint64_t* unpack38_64(const uint64_t* in, uint64_t* out){ constexpr uint64_t mask = ((uint64_t{1} << 38) - uint64_t{1}); - const auto w0 = LoadInt(in + 0 * 8); - const auto w1 = LoadInt(in + 1 * 8); - const auto w2 = LoadInt(in + 2 * 8); - const auto w3 = LoadInt(in + 3 * 8); - const auto w4 = LoadInt(in + 4 * 8); - const auto w5 = LoadInt(in + 5 * 8); - const auto w6 = LoadInt(in + 6 * 8); - const auto w7 = LoadInt(in + 7 * 8); - const auto w8 = LoadInt(in + 8 * 8); - const auto w9 = LoadInt(in + 9 * 8); - const auto w10 = LoadInt(in + 10 * 8); - const auto w11 = LoadInt(in + 11 * 8); - const auto w12 = LoadInt(in + 12 * 8); - const auto w13 = LoadInt(in + 13 * 8); - const auto w14 = LoadInt(in + 14 * 8); - const auto w15 = LoadInt(in + 15 * 8); - const auto w16 = LoadInt(in + 16 * 8); - const auto w17 = LoadInt(in + 17 * 8); - const auto w18 = LoadInt(in + 18 * 8); + const auto w0 = LoadInt(in + 0); + const auto w1 = LoadInt(in + 1); + const auto w2 = LoadInt(in + 2); + const auto w3 = LoadInt(in + 3); + const auto w4 = LoadInt(in + 4); + const auto w5 = LoadInt(in + 5); + const auto w6 = LoadInt(in + 6); + const auto w7 = LoadInt(in + 7); + const auto w8 = LoadInt(in + 8); + const auto w9 = LoadInt(in + 9); + const auto w10 = LoadInt(in + 10); + const auto w11 = LoadInt(in + 11); + const auto w12 = LoadInt(in + 12); + const auto w13 = LoadInt(in + 13); + const auto w14 = LoadInt(in + 14); + const auto w15 = LoadInt(in + 15); + const auto w16 = LoadInt(in + 16); + const auto w17 = LoadInt(in + 17); + const auto w18 = LoadInt(in + 18 ); out[0] = (w0) & mask; out[1] = ((w0 >> 38) | (w1 << 26)) & mask; out[2] = (w1 >> 12) & mask; @@ -3624,32 +3624,32 @@ inline const uint8_t* unpack38_64(const uint8_t* in, uint64_t* out){ out[30] = ((w17 >> 52) | (w18 << 12)) & mask; out[31] = w18 >> 26; - return in + (19 * 8); + return in + (19 ); } -inline const uint8_t* unpack39_64(const uint8_t* in, uint64_t* out){ +inline const uint64_t* unpack39_64(const uint64_t* in, uint64_t* out){ constexpr uint64_t mask = ((uint64_t{1} << 39) - uint64_t{1}); - const auto w0 = LoadInt(in + 0 * 8); - const auto w1 = LoadInt(in + 1 * 8); - const auto w2 = LoadInt(in + 2 * 8); - const auto w3 = LoadInt(in + 3 * 8); - const auto w4 = LoadInt(in + 4 * 8); - const auto w5 = LoadInt(in + 5 * 8); - const auto w6 = LoadInt(in + 6 * 8); - const auto w7 = LoadInt(in + 7 * 8); - const auto w8 = LoadInt(in + 8 * 8); - const auto w9 = LoadInt(in + 9 * 8); - const auto w10 = LoadInt(in + 10 * 8); - const auto w11 = LoadInt(in + 11 * 8); - const auto w12 = LoadInt(in + 12 * 8); - const auto w13 = LoadInt(in + 13 * 8); - const auto w14 = LoadInt(in + 14 * 8); - const auto w15 = LoadInt(in + 15 * 8); - const auto w16 = LoadInt(in + 16 * 8); - const auto w17 = LoadInt(in + 17 * 8); - const auto w18 = LoadInt(in + 18 * 8); - const auto w19 = static_cast(LoadInt(in + 19 * 8)); + const auto w0 = LoadInt(in + 0); + const auto w1 = LoadInt(in + 1); + const auto w2 = LoadInt(in + 2); + const auto w3 = LoadInt(in + 3); + const auto w4 = LoadInt(in + 4); + const auto w5 = LoadInt(in + 5); + const auto w6 = LoadInt(in + 6); + const auto w7 = LoadInt(in + 7); + const auto w8 = LoadInt(in + 8); + const auto w9 = LoadInt(in + 9); + const auto w10 = LoadInt(in + 10); + const auto w11 = LoadInt(in + 11); + const auto w12 = LoadInt(in + 12); + const auto w13 = LoadInt(in + 13); + const auto w14 = LoadInt(in + 14); + const auto w15 = LoadInt(in + 15); + const auto w16 = LoadInt(in + 16); + const auto w17 = LoadInt(in + 17); + const auto w18 = LoadInt(in + 18); + const auto w19 = static_cast(LoadInt(reinterpret_cast(in + 19))); out[0] = (w0) & mask; out[1] = ((w0 >> 39) | (w1 << 25)) & mask; out[2] = (w1 >> 14) & mask; @@ -3683,32 +3683,32 @@ inline const uint8_t* unpack39_64(const uint8_t* in, uint64_t* out){ out[30] = (w18 >> 18) & mask; out[31] = ((w18 >> 57) | (w19 << 7)) & mask; - return in + (19 * 8 + 4); + return in + (19); } -inline const uint8_t* unpack40_64(const uint8_t* in, uint64_t* out){ +inline const uint64_t* unpack40_64(const uint64_t* in, uint64_t* out){ constexpr uint64_t mask = ((uint64_t{1} << 40) - uint64_t{1}); - const auto w0 = LoadInt(in + 0 * 8); - const auto w1 = LoadInt(in + 1 * 8); - const auto w2 = LoadInt(in + 2 * 8); - const auto w3 = LoadInt(in + 3 * 8); - const auto w4 = LoadInt(in + 4 * 8); - const auto w5 = LoadInt(in + 5 * 8); - const auto w6 = LoadInt(in + 6 * 8); - const auto w7 = LoadInt(in + 7 * 8); - const auto w8 = LoadInt(in + 8 * 8); - const auto w9 = LoadInt(in + 9 * 8); - const auto w10 = LoadInt(in + 10 * 8); - const auto w11 = LoadInt(in + 11 * 8); - const auto w12 = LoadInt(in + 12 * 8); - const auto w13 = LoadInt(in + 13 * 8); - const auto w14 = LoadInt(in + 14 * 8); - const auto w15 = LoadInt(in + 15 * 8); - const auto w16 = LoadInt(in + 16 * 8); - const auto w17 = LoadInt(in + 17 * 8); - const auto w18 = LoadInt(in + 18 * 8); - const auto w19 = LoadInt(in + 19 * 8); + const auto w0 = LoadInt(in + 0); + const auto w1 = LoadInt(in + 1); + const auto w2 = LoadInt(in + 2); + const auto w3 = LoadInt(in + 3); + const auto w4 = LoadInt(in + 4); + const auto w5 = LoadInt(in + 5); + const auto w6 = LoadInt(in + 6); + const auto w7 = LoadInt(in + 7); + const auto w8 = LoadInt(in + 8); + const auto w9 = LoadInt(in + 9); + const auto w10 = LoadInt(in + 10); + const auto w11 = LoadInt(in + 11); + const auto w12 = LoadInt(in + 12); + const auto w13 = LoadInt(in + 13); + const auto w14 = LoadInt(in + 14); + const auto w15 = LoadInt(in + 15); + const auto w16 = LoadInt(in + 16); + const auto w17 = LoadInt(in + 17); + const auto w18 = LoadInt(in + 18); + const auto w19 = LoadInt(in + 19 ); out[0] = (w0) & mask; out[1] = ((w0 >> 40) | (w1 << 24)) & mask; out[2] = (w1 >> 16) & mask; @@ -3742,33 +3742,33 @@ inline const uint8_t* unpack40_64(const uint8_t* in, uint64_t* out){ out[30] = ((w18 >> 48) | (w19 << 16)) & mask; out[31] = w19 >> 24; - return in + (20 * 8); + return in + (20 ); } -inline const uint8_t* unpack41_64(const uint8_t* in, uint64_t* out){ +inline const uint64_t* unpack41_64(const uint64_t* in, uint64_t* out){ constexpr uint64_t mask = ((uint64_t{1} << 41) - uint64_t{1}); - const auto w0 = LoadInt(in + 0 * 8); - const auto w1 = LoadInt(in + 1 * 8); - const auto w2 = LoadInt(in + 2 * 8); - const auto w3 = LoadInt(in + 3 * 8); - const auto w4 = LoadInt(in + 4 * 8); - const auto w5 = LoadInt(in + 5 * 8); - const auto w6 = LoadInt(in + 6 * 8); - const auto w7 = LoadInt(in + 7 * 8); - const auto w8 = LoadInt(in + 8 * 8); - const auto w9 = LoadInt(in + 9 * 8); - const auto w10 = LoadInt(in + 10 * 8); - const auto w11 = LoadInt(in + 11 * 8); - const auto w12 = LoadInt(in + 12 * 8); - const auto w13 = LoadInt(in + 13 * 8); - const auto w14 = LoadInt(in + 14 * 8); - const auto w15 = LoadInt(in + 15 * 8); - const auto w16 = LoadInt(in + 16 * 8); - const auto w17 = LoadInt(in + 17 * 8); - const auto w18 = LoadInt(in + 18 * 8); - const auto w19 = LoadInt(in + 19 * 8); - const auto w20 = static_cast(LoadInt(in + 20 * 8)); + const auto w0 = LoadInt(in + 0); + const auto w1 = LoadInt(in + 1); + const auto w2 = LoadInt(in + 2); + const auto w3 = LoadInt(in + 3); + const auto w4 = LoadInt(in + 4); + const auto w5 = LoadInt(in + 5); + const auto w6 = LoadInt(in + 6); + const auto w7 = LoadInt(in + 7); + const auto w8 = LoadInt(in + 8); + const auto w9 = LoadInt(in + 9); + const auto w10 = LoadInt(in + 10); + const auto w11 = LoadInt(in + 11); + const auto w12 = LoadInt(in + 12); + const auto w13 = LoadInt(in + 13); + const auto w14 = LoadInt(in + 14); + const auto w15 = LoadInt(in + 15); + const auto w16 = LoadInt(in + 16); + const auto w17 = LoadInt(in + 17); + const auto w18 = LoadInt(in + 18); + const auto w19 = LoadInt(in + 19); + const auto w20 = static_cast(LoadInt(reinterpret_cast(in + 20))); out[0] = (w0) & mask; out[1] = ((w0 >> 41) | (w1 << 23)) & mask; out[2] = (w1 >> 18) & mask; @@ -3802,33 +3802,33 @@ inline const uint8_t* unpack41_64(const uint8_t* in, uint64_t* out){ out[30] = (w19 >> 14) & mask; out[31] = ((w19 >> 55) | (w20 << 9)) & mask; - return in + (20 * 8 + 4); + return in + (20); } -inline const uint8_t* unpack42_64(const uint8_t* in, uint64_t* out){ +inline const uint64_t* unpack42_64(const uint64_t* in, uint64_t* out){ constexpr uint64_t mask = ((uint64_t{1} << 42) - uint64_t{1}); - const auto w0 = LoadInt(in + 0 * 8); - const auto w1 = LoadInt(in + 1 * 8); - const auto w2 = LoadInt(in + 2 * 8); - const auto w3 = LoadInt(in + 3 * 8); - const auto w4 = LoadInt(in + 4 * 8); - const auto w5 = LoadInt(in + 5 * 8); - const auto w6 = LoadInt(in + 6 * 8); - const auto w7 = LoadInt(in + 7 * 8); - const auto w8 = LoadInt(in + 8 * 8); - const auto w9 = LoadInt(in + 9 * 8); - const auto w10 = LoadInt(in + 10 * 8); - const auto w11 = LoadInt(in + 11 * 8); - const auto w12 = LoadInt(in + 12 * 8); - const auto w13 = LoadInt(in + 13 * 8); - const auto w14 = LoadInt(in + 14 * 8); - const auto w15 = LoadInt(in + 15 * 8); - const auto w16 = LoadInt(in + 16 * 8); - const auto w17 = LoadInt(in + 17 * 8); - const auto w18 = LoadInt(in + 18 * 8); - const auto w19 = LoadInt(in + 19 * 8); - const auto w20 = LoadInt(in + 20 * 8); + const auto w0 = LoadInt(in + 0); + const auto w1 = LoadInt(in + 1); + const auto w2 = LoadInt(in + 2); + const auto w3 = LoadInt(in + 3); + const auto w4 = LoadInt(in + 4); + const auto w5 = LoadInt(in + 5); + const auto w6 = LoadInt(in + 6); + const auto w7 = LoadInt(in + 7); + const auto w8 = LoadInt(in + 8); + const auto w9 = LoadInt(in + 9); + const auto w10 = LoadInt(in + 10); + const auto w11 = LoadInt(in + 11); + const auto w12 = LoadInt(in + 12); + const auto w13 = LoadInt(in + 13); + const auto w14 = LoadInt(in + 14); + const auto w15 = LoadInt(in + 15); + const auto w16 = LoadInt(in + 16); + const auto w17 = LoadInt(in + 17); + const auto w18 = LoadInt(in + 18); + const auto w19 = LoadInt(in + 19); + const auto w20 = LoadInt(in + 20 ); out[0] = (w0) & mask; out[1] = ((w0 >> 42) | (w1 << 22)) & mask; out[2] = (w1 >> 20) & mask; @@ -3862,34 +3862,34 @@ inline const uint8_t* unpack42_64(const uint8_t* in, uint64_t* out){ out[30] = ((w19 >> 44) | (w20 << 20)) & mask; out[31] = w20 >> 22; - return in + (21 * 8); + return in + (21 ); } -inline const uint8_t* unpack43_64(const uint8_t* in, uint64_t* out){ +inline const uint64_t* unpack43_64(const uint64_t* in, uint64_t* out){ constexpr uint64_t mask = ((uint64_t{1} << 43) - uint64_t{1}); - const auto w0 = LoadInt(in + 0 * 8); - const auto w1 = LoadInt(in + 1 * 8); - const auto w2 = LoadInt(in + 2 * 8); - const auto w3 = LoadInt(in + 3 * 8); - const auto w4 = LoadInt(in + 4 * 8); - const auto w5 = LoadInt(in + 5 * 8); - const auto w6 = LoadInt(in + 6 * 8); - const auto w7 = LoadInt(in + 7 * 8); - const auto w8 = LoadInt(in + 8 * 8); - const auto w9 = LoadInt(in + 9 * 8); - const auto w10 = LoadInt(in + 10 * 8); - const auto w11 = LoadInt(in + 11 * 8); - const auto w12 = LoadInt(in + 12 * 8); - const auto w13 = LoadInt(in + 13 * 8); - const auto w14 = LoadInt(in + 14 * 8); - const auto w15 = LoadInt(in + 15 * 8); - const auto w16 = LoadInt(in + 16 * 8); - const auto w17 = LoadInt(in + 17 * 8); - const auto w18 = LoadInt(in + 18 * 8); - const auto w19 = LoadInt(in + 19 * 8); - const auto w20 = LoadInt(in + 20 * 8); - const auto w21 = static_cast(LoadInt(in + 21 * 8)); + const auto w0 = LoadInt(in + 0); + const auto w1 = LoadInt(in + 1); + const auto w2 = LoadInt(in + 2); + const auto w3 = LoadInt(in + 3); + const auto w4 = LoadInt(in + 4); + const auto w5 = LoadInt(in + 5); + const auto w6 = LoadInt(in + 6); + const auto w7 = LoadInt(in + 7); + const auto w8 = LoadInt(in + 8); + const auto w9 = LoadInt(in + 9); + const auto w10 = LoadInt(in + 10); + const auto w11 = LoadInt(in + 11); + const auto w12 = LoadInt(in + 12); + const auto w13 = LoadInt(in + 13); + const auto w14 = LoadInt(in + 14); + const auto w15 = LoadInt(in + 15); + const auto w16 = LoadInt(in + 16); + const auto w17 = LoadInt(in + 17); + const auto w18 = LoadInt(in + 18); + const auto w19 = LoadInt(in + 19); + const auto w20 = LoadInt(in + 20); + const auto w21 = static_cast(LoadInt(reinterpret_cast(in + 21))); out[0] = (w0) & mask; out[1] = ((w0 >> 43) | (w1 << 21)) & mask; out[2] = ((w1 >> 22) | (w2 << 42)) & mask; @@ -3923,34 +3923,34 @@ inline const uint8_t* unpack43_64(const uint8_t* in, uint64_t* out){ out[30] = (w20 >> 10) & mask; out[31] = ((w20 >> 53) | (w21 << 11)) & mask; - return in + (21 * 8 + 4); + return in + (21); } -inline const uint8_t* unpack44_64(const uint8_t* in, uint64_t* out){ +inline const uint64_t* unpack44_64(const uint64_t* in, uint64_t* out){ constexpr uint64_t mask = ((uint64_t{1} << 44) - uint64_t{1}); - const auto w0 = LoadInt(in + 0 * 8); - const auto w1 = LoadInt(in + 1 * 8); - const auto w2 = LoadInt(in + 2 * 8); - const auto w3 = LoadInt(in + 3 * 8); - const auto w4 = LoadInt(in + 4 * 8); - const auto w5 = LoadInt(in + 5 * 8); - const auto w6 = LoadInt(in + 6 * 8); - const auto w7 = LoadInt(in + 7 * 8); - const auto w8 = LoadInt(in + 8 * 8); - const auto w9 = LoadInt(in + 9 * 8); - const auto w10 = LoadInt(in + 10 * 8); - const auto w11 = LoadInt(in + 11 * 8); - const auto w12 = LoadInt(in + 12 * 8); - const auto w13 = LoadInt(in + 13 * 8); - const auto w14 = LoadInt(in + 14 * 8); - const auto w15 = LoadInt(in + 15 * 8); - const auto w16 = LoadInt(in + 16 * 8); - const auto w17 = LoadInt(in + 17 * 8); - const auto w18 = LoadInt(in + 18 * 8); - const auto w19 = LoadInt(in + 19 * 8); - const auto w20 = LoadInt(in + 20 * 8); - const auto w21 = LoadInt(in + 21 * 8); + const auto w0 = LoadInt(in + 0); + const auto w1 = LoadInt(in + 1); + const auto w2 = LoadInt(in + 2); + const auto w3 = LoadInt(in + 3); + const auto w4 = LoadInt(in + 4); + const auto w5 = LoadInt(in + 5); + const auto w6 = LoadInt(in + 6); + const auto w7 = LoadInt(in + 7); + const auto w8 = LoadInt(in + 8); + const auto w9 = LoadInt(in + 9); + const auto w10 = LoadInt(in + 10); + const auto w11 = LoadInt(in + 11); + const auto w12 = LoadInt(in + 12); + const auto w13 = LoadInt(in + 13); + const auto w14 = LoadInt(in + 14); + const auto w15 = LoadInt(in + 15); + const auto w16 = LoadInt(in + 16); + const auto w17 = LoadInt(in + 17); + const auto w18 = LoadInt(in + 18); + const auto w19 = LoadInt(in + 19); + const auto w20 = LoadInt(in + 20); + const auto w21 = LoadInt(in + 21 ); out[0] = (w0) & mask; out[1] = ((w0 >> 44) | (w1 << 20)) & mask; out[2] = ((w1 >> 24) | (w2 << 40)) & mask; @@ -3984,35 +3984,35 @@ inline const uint8_t* unpack44_64(const uint8_t* in, uint64_t* out){ out[30] = ((w20 >> 40) | (w21 << 24)) & mask; out[31] = w21 >> 20; - return in + (22 * 8); + return in + (22 ); } -inline const uint8_t* unpack45_64(const uint8_t* in, uint64_t* out){ +inline const uint64_t* unpack45_64(const uint64_t* in, uint64_t* out){ constexpr uint64_t mask = ((uint64_t{1} << 45) - uint64_t{1}); - const auto w0 = LoadInt(in + 0 * 8); - const auto w1 = LoadInt(in + 1 * 8); - const auto w2 = LoadInt(in + 2 * 8); - const auto w3 = LoadInt(in + 3 * 8); - const auto w4 = LoadInt(in + 4 * 8); - const auto w5 = LoadInt(in + 5 * 8); - const auto w6 = LoadInt(in + 6 * 8); - const auto w7 = LoadInt(in + 7 * 8); - const auto w8 = LoadInt(in + 8 * 8); - const auto w9 = LoadInt(in + 9 * 8); - const auto w10 = LoadInt(in + 10 * 8); - const auto w11 = LoadInt(in + 11 * 8); - const auto w12 = LoadInt(in + 12 * 8); - const auto w13 = LoadInt(in + 13 * 8); - const auto w14 = LoadInt(in + 14 * 8); - const auto w15 = LoadInt(in + 15 * 8); - const auto w16 = LoadInt(in + 16 * 8); - const auto w17 = LoadInt(in + 17 * 8); - const auto w18 = LoadInt(in + 18 * 8); - const auto w19 = LoadInt(in + 19 * 8); - const auto w20 = LoadInt(in + 20 * 8); - const auto w21 = LoadInt(in + 21 * 8); - const auto w22 = static_cast(LoadInt(in + 22 * 8)); + const auto w0 = LoadInt(in + 0); + const auto w1 = LoadInt(in + 1); + const auto w2 = LoadInt(in + 2); + const auto w3 = LoadInt(in + 3); + const auto w4 = LoadInt(in + 4); + const auto w5 = LoadInt(in + 5); + const auto w6 = LoadInt(in + 6); + const auto w7 = LoadInt(in + 7); + const auto w8 = LoadInt(in + 8); + const auto w9 = LoadInt(in + 9); + const auto w10 = LoadInt(in + 10); + const auto w11 = LoadInt(in + 11); + const auto w12 = LoadInt(in + 12); + const auto w13 = LoadInt(in + 13); + const auto w14 = LoadInt(in + 14); + const auto w15 = LoadInt(in + 15); + const auto w16 = LoadInt(in + 16); + const auto w17 = LoadInt(in + 17); + const auto w18 = LoadInt(in + 18); + const auto w19 = LoadInt(in + 19); + const auto w20 = LoadInt(in + 20); + const auto w21 = LoadInt(in + 21); + const auto w22 = static_cast(LoadInt(reinterpret_cast(in + 22))); out[0] = (w0) & mask; out[1] = ((w0 >> 45) | (w1 << 19)) & mask; out[2] = ((w1 >> 26) | (w2 << 38)) & mask; @@ -4046,35 +4046,35 @@ inline const uint8_t* unpack45_64(const uint8_t* in, uint64_t* out){ out[30] = (w21 >> 6) & mask; out[31] = ((w21 >> 51) | (w22 << 13)) & mask; - return in + (22 * 8 + 4); + return in + (22); } -inline const uint8_t* unpack46_64(const uint8_t* in, uint64_t* out){ +inline const uint64_t* unpack46_64(const uint64_t* in, uint64_t* out){ constexpr uint64_t mask = ((uint64_t{1} << 46) - uint64_t{1}); - const auto w0 = LoadInt(in + 0 * 8); - const auto w1 = LoadInt(in + 1 * 8); - const auto w2 = LoadInt(in + 2 * 8); - const auto w3 = LoadInt(in + 3 * 8); - const auto w4 = LoadInt(in + 4 * 8); - const auto w5 = LoadInt(in + 5 * 8); - const auto w6 = LoadInt(in + 6 * 8); - const auto w7 = LoadInt(in + 7 * 8); - const auto w8 = LoadInt(in + 8 * 8); - const auto w9 = LoadInt(in + 9 * 8); - const auto w10 = LoadInt(in + 10 * 8); - const auto w11 = LoadInt(in + 11 * 8); - const auto w12 = LoadInt(in + 12 * 8); - const auto w13 = LoadInt(in + 13 * 8); - const auto w14 = LoadInt(in + 14 * 8); - const auto w15 = LoadInt(in + 15 * 8); - const auto w16 = LoadInt(in + 16 * 8); - const auto w17 = LoadInt(in + 17 * 8); - const auto w18 = LoadInt(in + 18 * 8); - const auto w19 = LoadInt(in + 19 * 8); - const auto w20 = LoadInt(in + 20 * 8); - const auto w21 = LoadInt(in + 21 * 8); - const auto w22 = LoadInt(in + 22 * 8); + const auto w0 = LoadInt(in + 0); + const auto w1 = LoadInt(in + 1); + const auto w2 = LoadInt(in + 2); + const auto w3 = LoadInt(in + 3); + const auto w4 = LoadInt(in + 4); + const auto w5 = LoadInt(in + 5); + const auto w6 = LoadInt(in + 6); + const auto w7 = LoadInt(in + 7); + const auto w8 = LoadInt(in + 8); + const auto w9 = LoadInt(in + 9); + const auto w10 = LoadInt(in + 10); + const auto w11 = LoadInt(in + 11); + const auto w12 = LoadInt(in + 12); + const auto w13 = LoadInt(in + 13); + const auto w14 = LoadInt(in + 14); + const auto w15 = LoadInt(in + 15); + const auto w16 = LoadInt(in + 16); + const auto w17 = LoadInt(in + 17); + const auto w18 = LoadInt(in + 18); + const auto w19 = LoadInt(in + 19); + const auto w20 = LoadInt(in + 20); + const auto w21 = LoadInt(in + 21); + const auto w22 = LoadInt(in + 22 ); out[0] = (w0) & mask; out[1] = ((w0 >> 46) | (w1 << 18)) & mask; out[2] = ((w1 >> 28) | (w2 << 36)) & mask; @@ -4108,36 +4108,36 @@ inline const uint8_t* unpack46_64(const uint8_t* in, uint64_t* out){ out[30] = ((w21 >> 36) | (w22 << 28)) & mask; out[31] = w22 >> 18; - return in + (23 * 8); + return in + (23 ); } -inline const uint8_t* unpack47_64(const uint8_t* in, uint64_t* out){ +inline const uint64_t* unpack47_64(const uint64_t* in, uint64_t* out){ constexpr uint64_t mask = ((uint64_t{1} << 47) - uint64_t{1}); - const auto w0 = LoadInt(in + 0 * 8); - const auto w1 = LoadInt(in + 1 * 8); - const auto w2 = LoadInt(in + 2 * 8); - const auto w3 = LoadInt(in + 3 * 8); - const auto w4 = LoadInt(in + 4 * 8); - const auto w5 = LoadInt(in + 5 * 8); - const auto w6 = LoadInt(in + 6 * 8); - const auto w7 = LoadInt(in + 7 * 8); - const auto w8 = LoadInt(in + 8 * 8); - const auto w9 = LoadInt(in + 9 * 8); - const auto w10 = LoadInt(in + 10 * 8); - const auto w11 = LoadInt(in + 11 * 8); - const auto w12 = LoadInt(in + 12 * 8); - const auto w13 = LoadInt(in + 13 * 8); - const auto w14 = LoadInt(in + 14 * 8); - const auto w15 = LoadInt(in + 15 * 8); - const auto w16 = LoadInt(in + 16 * 8); - const auto w17 = LoadInt(in + 17 * 8); - const auto w18 = LoadInt(in + 18 * 8); - const auto w19 = LoadInt(in + 19 * 8); - const auto w20 = LoadInt(in + 20 * 8); - const auto w21 = LoadInt(in + 21 * 8); - const auto w22 = LoadInt(in + 22 * 8); - const auto w23 = static_cast(LoadInt(in + 23 * 8)); + const auto w0 = LoadInt(in + 0); + const auto w1 = LoadInt(in + 1); + const auto w2 = LoadInt(in + 2); + const auto w3 = LoadInt(in + 3); + const auto w4 = LoadInt(in + 4); + const auto w5 = LoadInt(in + 5); + const auto w6 = LoadInt(in + 6); + const auto w7 = LoadInt(in + 7); + const auto w8 = LoadInt(in + 8); + const auto w9 = LoadInt(in + 9); + const auto w10 = LoadInt(in + 10); + const auto w11 = LoadInt(in + 11); + const auto w12 = LoadInt(in + 12); + const auto w13 = LoadInt(in + 13); + const auto w14 = LoadInt(in + 14); + const auto w15 = LoadInt(in + 15); + const auto w16 = LoadInt(in + 16); + const auto w17 = LoadInt(in + 17); + const auto w18 = LoadInt(in + 18); + const auto w19 = LoadInt(in + 19); + const auto w20 = LoadInt(in + 20); + const auto w21 = LoadInt(in + 21); + const auto w22 = LoadInt(in + 22); + const auto w23 = static_cast(LoadInt(reinterpret_cast(in + 23))); out[0] = (w0) & mask; out[1] = ((w0 >> 47) | (w1 << 17)) & mask; out[2] = ((w1 >> 30) | (w2 << 34)) & mask; @@ -4171,36 +4171,36 @@ inline const uint8_t* unpack47_64(const uint8_t* in, uint64_t* out){ out[30] = (w22 >> 2) & mask; out[31] = ((w22 >> 49) | (w23 << 15)) & mask; - return in + (23 * 8 + 4); + return in + (23); } -inline const uint8_t* unpack48_64(const uint8_t* in, uint64_t* out){ +inline const uint64_t* unpack48_64(const uint64_t* in, uint64_t* out){ constexpr uint64_t mask = ((uint64_t{1} << 48) - uint64_t{1}); - const auto w0 = LoadInt(in + 0 * 8); - const auto w1 = LoadInt(in + 1 * 8); - const auto w2 = LoadInt(in + 2 * 8); - const auto w3 = LoadInt(in + 3 * 8); - const auto w4 = LoadInt(in + 4 * 8); - const auto w5 = LoadInt(in + 5 * 8); - const auto w6 = LoadInt(in + 6 * 8); - const auto w7 = LoadInt(in + 7 * 8); - const auto w8 = LoadInt(in + 8 * 8); - const auto w9 = LoadInt(in + 9 * 8); - const auto w10 = LoadInt(in + 10 * 8); - const auto w11 = LoadInt(in + 11 * 8); - const auto w12 = LoadInt(in + 12 * 8); - const auto w13 = LoadInt(in + 13 * 8); - const auto w14 = LoadInt(in + 14 * 8); - const auto w15 = LoadInt(in + 15 * 8); - const auto w16 = LoadInt(in + 16 * 8); - const auto w17 = LoadInt(in + 17 * 8); - const auto w18 = LoadInt(in + 18 * 8); - const auto w19 = LoadInt(in + 19 * 8); - const auto w20 = LoadInt(in + 20 * 8); - const auto w21 = LoadInt(in + 21 * 8); - const auto w22 = LoadInt(in + 22 * 8); - const auto w23 = LoadInt(in + 23 * 8); + const auto w0 = LoadInt(in + 0); + const auto w1 = LoadInt(in + 1); + const auto w2 = LoadInt(in + 2); + const auto w3 = LoadInt(in + 3); + const auto w4 = LoadInt(in + 4); + const auto w5 = LoadInt(in + 5); + const auto w6 = LoadInt(in + 6); + const auto w7 = LoadInt(in + 7); + const auto w8 = LoadInt(in + 8); + const auto w9 = LoadInt(in + 9); + const auto w10 = LoadInt(in + 10); + const auto w11 = LoadInt(in + 11); + const auto w12 = LoadInt(in + 12); + const auto w13 = LoadInt(in + 13); + const auto w14 = LoadInt(in + 14); + const auto w15 = LoadInt(in + 15); + const auto w16 = LoadInt(in + 16); + const auto w17 = LoadInt(in + 17); + const auto w18 = LoadInt(in + 18); + const auto w19 = LoadInt(in + 19); + const auto w20 = LoadInt(in + 20); + const auto w21 = LoadInt(in + 21); + const auto w22 = LoadInt(in + 22); + const auto w23 = LoadInt(in + 23 ); out[0] = (w0) & mask; out[1] = ((w0 >> 48) | (w1 << 16)) & mask; out[2] = ((w1 >> 32) | (w2 << 32)) & mask; @@ -4234,37 +4234,37 @@ inline const uint8_t* unpack48_64(const uint8_t* in, uint64_t* out){ out[30] = ((w22 >> 32) | (w23 << 32)) & mask; out[31] = w23 >> 16; - return in + (24 * 8); + return in + (24 ); } -inline const uint8_t* unpack49_64(const uint8_t* in, uint64_t* out){ +inline const uint64_t* unpack49_64(const uint64_t* in, uint64_t* out){ constexpr uint64_t mask = ((uint64_t{1} << 49) - uint64_t{1}); - const auto w0 = LoadInt(in + 0 * 8); - const auto w1 = LoadInt(in + 1 * 8); - const auto w2 = LoadInt(in + 2 * 8); - const auto w3 = LoadInt(in + 3 * 8); - const auto w4 = LoadInt(in + 4 * 8); - const auto w5 = LoadInt(in + 5 * 8); - const auto w6 = LoadInt(in + 6 * 8); - const auto w7 = LoadInt(in + 7 * 8); - const auto w8 = LoadInt(in + 8 * 8); - const auto w9 = LoadInt(in + 9 * 8); - const auto w10 = LoadInt(in + 10 * 8); - const auto w11 = LoadInt(in + 11 * 8); - const auto w12 = LoadInt(in + 12 * 8); - const auto w13 = LoadInt(in + 13 * 8); - const auto w14 = LoadInt(in + 14 * 8); - const auto w15 = LoadInt(in + 15 * 8); - const auto w16 = LoadInt(in + 16 * 8); - const auto w17 = LoadInt(in + 17 * 8); - const auto w18 = LoadInt(in + 18 * 8); - const auto w19 = LoadInt(in + 19 * 8); - const auto w20 = LoadInt(in + 20 * 8); - const auto w21 = LoadInt(in + 21 * 8); - const auto w22 = LoadInt(in + 22 * 8); - const auto w23 = LoadInt(in + 23 * 8); - const auto w24 = static_cast(LoadInt(in + 24 * 8)); + const auto w0 = LoadInt(in + 0); + const auto w1 = LoadInt(in + 1); + const auto w2 = LoadInt(in + 2); + const auto w3 = LoadInt(in + 3); + const auto w4 = LoadInt(in + 4); + const auto w5 = LoadInt(in + 5); + const auto w6 = LoadInt(in + 6); + const auto w7 = LoadInt(in + 7); + const auto w8 = LoadInt(in + 8); + const auto w9 = LoadInt(in + 9); + const auto w10 = LoadInt(in + 10); + const auto w11 = LoadInt(in + 11); + const auto w12 = LoadInt(in + 12); + const auto w13 = LoadInt(in + 13); + const auto w14 = LoadInt(in + 14); + const auto w15 = LoadInt(in + 15); + const auto w16 = LoadInt(in + 16); + const auto w17 = LoadInt(in + 17); + const auto w18 = LoadInt(in + 18); + const auto w19 = LoadInt(in + 19); + const auto w20 = LoadInt(in + 20); + const auto w21 = LoadInt(in + 21); + const auto w22 = LoadInt(in + 22); + const auto w23 = LoadInt(in + 23); + const auto w24 = static_cast(LoadInt(reinterpret_cast(in + 24))); out[0] = (w0) & mask; out[1] = ((w0 >> 49) | (w1 << 15)) & mask; out[2] = ((w1 >> 34) | (w2 << 30)) & mask; @@ -4298,37 +4298,37 @@ inline const uint8_t* unpack49_64(const uint8_t* in, uint64_t* out){ out[30] = ((w22 >> 62) | (w23 << 2)) & mask; out[31] = ((w23 >> 47) | (w24 << 17)) & mask; - return in + (24 * 8 + 4); + return in + (24); } -inline const uint8_t* unpack50_64(const uint8_t* in, uint64_t* out){ +inline const uint64_t* unpack50_64(const uint64_t* in, uint64_t* out){ constexpr uint64_t mask = ((uint64_t{1} << 50) - uint64_t{1}); - const auto w0 = LoadInt(in + 0 * 8); - const auto w1 = LoadInt(in + 1 * 8); - const auto w2 = LoadInt(in + 2 * 8); - const auto w3 = LoadInt(in + 3 * 8); - const auto w4 = LoadInt(in + 4 * 8); - const auto w5 = LoadInt(in + 5 * 8); - const auto w6 = LoadInt(in + 6 * 8); - const auto w7 = LoadInt(in + 7 * 8); - const auto w8 = LoadInt(in + 8 * 8); - const auto w9 = LoadInt(in + 9 * 8); - const auto w10 = LoadInt(in + 10 * 8); - const auto w11 = LoadInt(in + 11 * 8); - const auto w12 = LoadInt(in + 12 * 8); - const auto w13 = LoadInt(in + 13 * 8); - const auto w14 = LoadInt(in + 14 * 8); - const auto w15 = LoadInt(in + 15 * 8); - const auto w16 = LoadInt(in + 16 * 8); - const auto w17 = LoadInt(in + 17 * 8); - const auto w18 = LoadInt(in + 18 * 8); - const auto w19 = LoadInt(in + 19 * 8); - const auto w20 = LoadInt(in + 20 * 8); - const auto w21 = LoadInt(in + 21 * 8); - const auto w22 = LoadInt(in + 22 * 8); - const auto w23 = LoadInt(in + 23 * 8); - const auto w24 = LoadInt(in + 24 * 8); + const auto w0 = LoadInt(in + 0); + const auto w1 = LoadInt(in + 1); + const auto w2 = LoadInt(in + 2); + const auto w3 = LoadInt(in + 3); + const auto w4 = LoadInt(in + 4); + const auto w5 = LoadInt(in + 5); + const auto w6 = LoadInt(in + 6); + const auto w7 = LoadInt(in + 7); + const auto w8 = LoadInt(in + 8); + const auto w9 = LoadInt(in + 9); + const auto w10 = LoadInt(in + 10); + const auto w11 = LoadInt(in + 11); + const auto w12 = LoadInt(in + 12); + const auto w13 = LoadInt(in + 13); + const auto w14 = LoadInt(in + 14); + const auto w15 = LoadInt(in + 15); + const auto w16 = LoadInt(in + 16); + const auto w17 = LoadInt(in + 17); + const auto w18 = LoadInt(in + 18); + const auto w19 = LoadInt(in + 19); + const auto w20 = LoadInt(in + 20); + const auto w21 = LoadInt(in + 21); + const auto w22 = LoadInt(in + 22); + const auto w23 = LoadInt(in + 23); + const auto w24 = LoadInt(in + 24 ); out[0] = (w0) & mask; out[1] = ((w0 >> 50) | (w1 << 14)) & mask; out[2] = ((w1 >> 36) | (w2 << 28)) & mask; @@ -4362,38 +4362,38 @@ inline const uint8_t* unpack50_64(const uint8_t* in, uint64_t* out){ out[30] = ((w23 >> 28) | (w24 << 36)) & mask; out[31] = w24 >> 14; - return in + (25 * 8); + return in + (25 ); } -inline const uint8_t* unpack51_64(const uint8_t* in, uint64_t* out){ +inline const uint64_t* unpack51_64(const uint64_t* in, uint64_t* out){ constexpr uint64_t mask = ((uint64_t{1} << 51) - uint64_t{1}); - const auto w0 = LoadInt(in + 0 * 8); - const auto w1 = LoadInt(in + 1 * 8); - const auto w2 = LoadInt(in + 2 * 8); - const auto w3 = LoadInt(in + 3 * 8); - const auto w4 = LoadInt(in + 4 * 8); - const auto w5 = LoadInt(in + 5 * 8); - const auto w6 = LoadInt(in + 6 * 8); - const auto w7 = LoadInt(in + 7 * 8); - const auto w8 = LoadInt(in + 8 * 8); - const auto w9 = LoadInt(in + 9 * 8); - const auto w10 = LoadInt(in + 10 * 8); - const auto w11 = LoadInt(in + 11 * 8); - const auto w12 = LoadInt(in + 12 * 8); - const auto w13 = LoadInt(in + 13 * 8); - const auto w14 = LoadInt(in + 14 * 8); - const auto w15 = LoadInt(in + 15 * 8); - const auto w16 = LoadInt(in + 16 * 8); - const auto w17 = LoadInt(in + 17 * 8); - const auto w18 = LoadInt(in + 18 * 8); - const auto w19 = LoadInt(in + 19 * 8); - const auto w20 = LoadInt(in + 20 * 8); - const auto w21 = LoadInt(in + 21 * 8); - const auto w22 = LoadInt(in + 22 * 8); - const auto w23 = LoadInt(in + 23 * 8); - const auto w24 = LoadInt(in + 24 * 8); - const auto w25 = static_cast(LoadInt(in + 25 * 8)); + const auto w0 = LoadInt(in + 0); + const auto w1 = LoadInt(in + 1); + const auto w2 = LoadInt(in + 2); + const auto w3 = LoadInt(in + 3); + const auto w4 = LoadInt(in + 4); + const auto w5 = LoadInt(in + 5); + const auto w6 = LoadInt(in + 6); + const auto w7 = LoadInt(in + 7); + const auto w8 = LoadInt(in + 8); + const auto w9 = LoadInt(in + 9); + const auto w10 = LoadInt(in + 10); + const auto w11 = LoadInt(in + 11); + const auto w12 = LoadInt(in + 12); + const auto w13 = LoadInt(in + 13); + const auto w14 = LoadInt(in + 14); + const auto w15 = LoadInt(in + 15); + const auto w16 = LoadInt(in + 16); + const auto w17 = LoadInt(in + 17); + const auto w18 = LoadInt(in + 18); + const auto w19 = LoadInt(in + 19); + const auto w20 = LoadInt(in + 20); + const auto w21 = LoadInt(in + 21); + const auto w22 = LoadInt(in + 22); + const auto w23 = LoadInt(in + 23); + const auto w24 = LoadInt(in + 24); + const auto w25 = static_cast(LoadInt(reinterpret_cast(in + 25))); out[0] = (w0) & mask; out[1] = ((w0 >> 51) | (w1 << 13)) & mask; out[2] = ((w1 >> 38) | (w2 << 26)) & mask; @@ -4427,38 +4427,38 @@ inline const uint8_t* unpack51_64(const uint8_t* in, uint64_t* out){ out[30] = ((w23 >> 58) | (w24 << 6)) & mask; out[31] = ((w24 >> 45) | (w25 << 19)) & mask; - return in + (25 * 8 + 4); + return in + (25); } -inline const uint8_t* unpack52_64(const uint8_t* in, uint64_t* out){ +inline const uint64_t* unpack52_64(const uint64_t* in, uint64_t* out){ constexpr uint64_t mask = ((uint64_t{1} << 52) - uint64_t{1}); - const auto w0 = LoadInt(in + 0 * 8); - const auto w1 = LoadInt(in + 1 * 8); - const auto w2 = LoadInt(in + 2 * 8); - const auto w3 = LoadInt(in + 3 * 8); - const auto w4 = LoadInt(in + 4 * 8); - const auto w5 = LoadInt(in + 5 * 8); - const auto w6 = LoadInt(in + 6 * 8); - const auto w7 = LoadInt(in + 7 * 8); - const auto w8 = LoadInt(in + 8 * 8); - const auto w9 = LoadInt(in + 9 * 8); - const auto w10 = LoadInt(in + 10 * 8); - const auto w11 = LoadInt(in + 11 * 8); - const auto w12 = LoadInt(in + 12 * 8); - const auto w13 = LoadInt(in + 13 * 8); - const auto w14 = LoadInt(in + 14 * 8); - const auto w15 = LoadInt(in + 15 * 8); - const auto w16 = LoadInt(in + 16 * 8); - const auto w17 = LoadInt(in + 17 * 8); - const auto w18 = LoadInt(in + 18 * 8); - const auto w19 = LoadInt(in + 19 * 8); - const auto w20 = LoadInt(in + 20 * 8); - const auto w21 = LoadInt(in + 21 * 8); - const auto w22 = LoadInt(in + 22 * 8); - const auto w23 = LoadInt(in + 23 * 8); - const auto w24 = LoadInt(in + 24 * 8); - const auto w25 = LoadInt(in + 25 * 8); + const auto w0 = LoadInt(in + 0); + const auto w1 = LoadInt(in + 1); + const auto w2 = LoadInt(in + 2); + const auto w3 = LoadInt(in + 3); + const auto w4 = LoadInt(in + 4); + const auto w5 = LoadInt(in + 5); + const auto w6 = LoadInt(in + 6); + const auto w7 = LoadInt(in + 7); + const auto w8 = LoadInt(in + 8); + const auto w9 = LoadInt(in + 9); + const auto w10 = LoadInt(in + 10); + const auto w11 = LoadInt(in + 11); + const auto w12 = LoadInt(in + 12); + const auto w13 = LoadInt(in + 13); + const auto w14 = LoadInt(in + 14); + const auto w15 = LoadInt(in + 15); + const auto w16 = LoadInt(in + 16); + const auto w17 = LoadInt(in + 17); + const auto w18 = LoadInt(in + 18); + const auto w19 = LoadInt(in + 19); + const auto w20 = LoadInt(in + 20); + const auto w21 = LoadInt(in + 21); + const auto w22 = LoadInt(in + 22); + const auto w23 = LoadInt(in + 23); + const auto w24 = LoadInt(in + 24); + const auto w25 = LoadInt(in + 25 ); out[0] = (w0) & mask; out[1] = ((w0 >> 52) | (w1 << 12)) & mask; out[2] = ((w1 >> 40) | (w2 << 24)) & mask; @@ -4492,39 +4492,39 @@ inline const uint8_t* unpack52_64(const uint8_t* in, uint64_t* out){ out[30] = ((w24 >> 24) | (w25 << 40)) & mask; out[31] = w25 >> 12; - return in + (26 * 8); + return in + (26 ); } -inline const uint8_t* unpack53_64(const uint8_t* in, uint64_t* out){ +inline const uint64_t* unpack53_64(const uint64_t* in, uint64_t* out){ constexpr uint64_t mask = ((uint64_t{1} << 53) - uint64_t{1}); - const auto w0 = LoadInt(in + 0 * 8); - const auto w1 = LoadInt(in + 1 * 8); - const auto w2 = LoadInt(in + 2 * 8); - const auto w3 = LoadInt(in + 3 * 8); - const auto w4 = LoadInt(in + 4 * 8); - const auto w5 = LoadInt(in + 5 * 8); - const auto w6 = LoadInt(in + 6 * 8); - const auto w7 = LoadInt(in + 7 * 8); - const auto w8 = LoadInt(in + 8 * 8); - const auto w9 = LoadInt(in + 9 * 8); - const auto w10 = LoadInt(in + 10 * 8); - const auto w11 = LoadInt(in + 11 * 8); - const auto w12 = LoadInt(in + 12 * 8); - const auto w13 = LoadInt(in + 13 * 8); - const auto w14 = LoadInt(in + 14 * 8); - const auto w15 = LoadInt(in + 15 * 8); - const auto w16 = LoadInt(in + 16 * 8); - const auto w17 = LoadInt(in + 17 * 8); - const auto w18 = LoadInt(in + 18 * 8); - const auto w19 = LoadInt(in + 19 * 8); - const auto w20 = LoadInt(in + 20 * 8); - const auto w21 = LoadInt(in + 21 * 8); - const auto w22 = LoadInt(in + 22 * 8); - const auto w23 = LoadInt(in + 23 * 8); - const auto w24 = LoadInt(in + 24 * 8); - const auto w25 = LoadInt(in + 25 * 8); - const auto w26 = static_cast(LoadInt(in + 26 * 8)); + const auto w0 = LoadInt(in + 0); + const auto w1 = LoadInt(in + 1); + const auto w2 = LoadInt(in + 2); + const auto w3 = LoadInt(in + 3); + const auto w4 = LoadInt(in + 4); + const auto w5 = LoadInt(in + 5); + const auto w6 = LoadInt(in + 6); + const auto w7 = LoadInt(in + 7); + const auto w8 = LoadInt(in + 8); + const auto w9 = LoadInt(in + 9); + const auto w10 = LoadInt(in + 10); + const auto w11 = LoadInt(in + 11); + const auto w12 = LoadInt(in + 12); + const auto w13 = LoadInt(in + 13); + const auto w14 = LoadInt(in + 14); + const auto w15 = LoadInt(in + 15); + const auto w16 = LoadInt(in + 16); + const auto w17 = LoadInt(in + 17); + const auto w18 = LoadInt(in + 18); + const auto w19 = LoadInt(in + 19); + const auto w20 = LoadInt(in + 20); + const auto w21 = LoadInt(in + 21); + const auto w22 = LoadInt(in + 22); + const auto w23 = LoadInt(in + 23); + const auto w24 = LoadInt(in + 24); + const auto w25 = LoadInt(in + 25); + const auto w26 = static_cast(LoadInt(reinterpret_cast(in + 26))); out[0] = (w0) & mask; out[1] = ((w0 >> 53) | (w1 << 11)) & mask; out[2] = ((w1 >> 42) | (w2 << 22)) & mask; @@ -4558,39 +4558,39 @@ inline const uint8_t* unpack53_64(const uint8_t* in, uint64_t* out){ out[30] = ((w24 >> 54) | (w25 << 10)) & mask; out[31] = ((w25 >> 43) | (w26 << 21)) & mask; - return in + (26 * 8 + 4); + return in + (26); } -inline const uint8_t* unpack54_64(const uint8_t* in, uint64_t* out){ +inline const uint64_t* unpack54_64(const uint64_t* in, uint64_t* out){ constexpr uint64_t mask = ((uint64_t{1} << 54) - uint64_t{1}); - const auto w0 = LoadInt(in + 0 * 8); - const auto w1 = LoadInt(in + 1 * 8); - const auto w2 = LoadInt(in + 2 * 8); - const auto w3 = LoadInt(in + 3 * 8); - const auto w4 = LoadInt(in + 4 * 8); - const auto w5 = LoadInt(in + 5 * 8); - const auto w6 = LoadInt(in + 6 * 8); - const auto w7 = LoadInt(in + 7 * 8); - const auto w8 = LoadInt(in + 8 * 8); - const auto w9 = LoadInt(in + 9 * 8); - const auto w10 = LoadInt(in + 10 * 8); - const auto w11 = LoadInt(in + 11 * 8); - const auto w12 = LoadInt(in + 12 * 8); - const auto w13 = LoadInt(in + 13 * 8); - const auto w14 = LoadInt(in + 14 * 8); - const auto w15 = LoadInt(in + 15 * 8); - const auto w16 = LoadInt(in + 16 * 8); - const auto w17 = LoadInt(in + 17 * 8); - const auto w18 = LoadInt(in + 18 * 8); - const auto w19 = LoadInt(in + 19 * 8); - const auto w20 = LoadInt(in + 20 * 8); - const auto w21 = LoadInt(in + 21 * 8); - const auto w22 = LoadInt(in + 22 * 8); - const auto w23 = LoadInt(in + 23 * 8); - const auto w24 = LoadInt(in + 24 * 8); - const auto w25 = LoadInt(in + 25 * 8); - const auto w26 = LoadInt(in + 26 * 8); + const auto w0 = LoadInt(in + 0); + const auto w1 = LoadInt(in + 1); + const auto w2 = LoadInt(in + 2); + const auto w3 = LoadInt(in + 3); + const auto w4 = LoadInt(in + 4); + const auto w5 = LoadInt(in + 5); + const auto w6 = LoadInt(in + 6); + const auto w7 = LoadInt(in + 7); + const auto w8 = LoadInt(in + 8); + const auto w9 = LoadInt(in + 9); + const auto w10 = LoadInt(in + 10); + const auto w11 = LoadInt(in + 11); + const auto w12 = LoadInt(in + 12); + const auto w13 = LoadInt(in + 13); + const auto w14 = LoadInt(in + 14); + const auto w15 = LoadInt(in + 15); + const auto w16 = LoadInt(in + 16); + const auto w17 = LoadInt(in + 17); + const auto w18 = LoadInt(in + 18); + const auto w19 = LoadInt(in + 19); + const auto w20 = LoadInt(in + 20); + const auto w21 = LoadInt(in + 21); + const auto w22 = LoadInt(in + 22); + const auto w23 = LoadInt(in + 23); + const auto w24 = LoadInt(in + 24); + const auto w25 = LoadInt(in + 25); + const auto w26 = LoadInt(in + 26 ); out[0] = (w0) & mask; out[1] = ((w0 >> 54) | (w1 << 10)) & mask; out[2] = ((w1 >> 44) | (w2 << 20)) & mask; @@ -4624,40 +4624,40 @@ inline const uint8_t* unpack54_64(const uint8_t* in, uint64_t* out){ out[30] = ((w25 >> 20) | (w26 << 44)) & mask; out[31] = w26 >> 10; - return in + (27 * 8); + return in + (27 ); } -inline const uint8_t* unpack55_64(const uint8_t* in, uint64_t* out){ +inline const uint64_t* unpack55_64(const uint64_t* in, uint64_t* out){ constexpr uint64_t mask = ((uint64_t{1} << 55) - uint64_t{1}); - const auto w0 = LoadInt(in + 0 * 8); - const auto w1 = LoadInt(in + 1 * 8); - const auto w2 = LoadInt(in + 2 * 8); - const auto w3 = LoadInt(in + 3 * 8); - const auto w4 = LoadInt(in + 4 * 8); - const auto w5 = LoadInt(in + 5 * 8); - const auto w6 = LoadInt(in + 6 * 8); - const auto w7 = LoadInt(in + 7 * 8); - const auto w8 = LoadInt(in + 8 * 8); - const auto w9 = LoadInt(in + 9 * 8); - const auto w10 = LoadInt(in + 10 * 8); - const auto w11 = LoadInt(in + 11 * 8); - const auto w12 = LoadInt(in + 12 * 8); - const auto w13 = LoadInt(in + 13 * 8); - const auto w14 = LoadInt(in + 14 * 8); - const auto w15 = LoadInt(in + 15 * 8); - const auto w16 = LoadInt(in + 16 * 8); - const auto w17 = LoadInt(in + 17 * 8); - const auto w18 = LoadInt(in + 18 * 8); - const auto w19 = LoadInt(in + 19 * 8); - const auto w20 = LoadInt(in + 20 * 8); - const auto w21 = LoadInt(in + 21 * 8); - const auto w22 = LoadInt(in + 22 * 8); - const auto w23 = LoadInt(in + 23 * 8); - const auto w24 = LoadInt(in + 24 * 8); - const auto w25 = LoadInt(in + 25 * 8); - const auto w26 = LoadInt(in + 26 * 8); - const auto w27 = static_cast(LoadInt(in + 27 * 8)); + const auto w0 = LoadInt(in + 0); + const auto w1 = LoadInt(in + 1); + const auto w2 = LoadInt(in + 2); + const auto w3 = LoadInt(in + 3); + const auto w4 = LoadInt(in + 4); + const auto w5 = LoadInt(in + 5); + const auto w6 = LoadInt(in + 6); + const auto w7 = LoadInt(in + 7); + const auto w8 = LoadInt(in + 8); + const auto w9 = LoadInt(in + 9); + const auto w10 = LoadInt(in + 10); + const auto w11 = LoadInt(in + 11); + const auto w12 = LoadInt(in + 12); + const auto w13 = LoadInt(in + 13); + const auto w14 = LoadInt(in + 14); + const auto w15 = LoadInt(in + 15); + const auto w16 = LoadInt(in + 16); + const auto w17 = LoadInt(in + 17); + const auto w18 = LoadInt(in + 18); + const auto w19 = LoadInt(in + 19); + const auto w20 = LoadInt(in + 20); + const auto w21 = LoadInt(in + 21); + const auto w22 = LoadInt(in + 22); + const auto w23 = LoadInt(in + 23); + const auto w24 = LoadInt(in + 24); + const auto w25 = LoadInt(in + 25); + const auto w26 = LoadInt(in + 26); + const auto w27 = static_cast(LoadInt(reinterpret_cast(in + 27))); out[0] = (w0) & mask; out[1] = ((w0 >> 55) | (w1 << 9)) & mask; out[2] = ((w1 >> 46) | (w2 << 18)) & mask; @@ -4691,40 +4691,40 @@ inline const uint8_t* unpack55_64(const uint8_t* in, uint64_t* out){ out[30] = ((w25 >> 50) | (w26 << 14)) & mask; out[31] = ((w26 >> 41) | (w27 << 23)) & mask; - return in + (27 * 8 + 4); + return in + (27); } -inline const uint8_t* unpack56_64(const uint8_t* in, uint64_t* out){ +inline const uint64_t* unpack56_64(const uint64_t* in, uint64_t* out){ constexpr uint64_t mask = ((uint64_t{1} << 56) - uint64_t{1}); - const auto w0 = LoadInt(in + 0 * 8); - const auto w1 = LoadInt(in + 1 * 8); - const auto w2 = LoadInt(in + 2 * 8); - const auto w3 = LoadInt(in + 3 * 8); - const auto w4 = LoadInt(in + 4 * 8); - const auto w5 = LoadInt(in + 5 * 8); - const auto w6 = LoadInt(in + 6 * 8); - const auto w7 = LoadInt(in + 7 * 8); - const auto w8 = LoadInt(in + 8 * 8); - const auto w9 = LoadInt(in + 9 * 8); - const auto w10 = LoadInt(in + 10 * 8); - const auto w11 = LoadInt(in + 11 * 8); - const auto w12 = LoadInt(in + 12 * 8); - const auto w13 = LoadInt(in + 13 * 8); - const auto w14 = LoadInt(in + 14 * 8); - const auto w15 = LoadInt(in + 15 * 8); - const auto w16 = LoadInt(in + 16 * 8); - const auto w17 = LoadInt(in + 17 * 8); - const auto w18 = LoadInt(in + 18 * 8); - const auto w19 = LoadInt(in + 19 * 8); - const auto w20 = LoadInt(in + 20 * 8); - const auto w21 = LoadInt(in + 21 * 8); - const auto w22 = LoadInt(in + 22 * 8); - const auto w23 = LoadInt(in + 23 * 8); - const auto w24 = LoadInt(in + 24 * 8); - const auto w25 = LoadInt(in + 25 * 8); - const auto w26 = LoadInt(in + 26 * 8); - const auto w27 = LoadInt(in + 27 * 8); + const auto w0 = LoadInt(in + 0); + const auto w1 = LoadInt(in + 1); + const auto w2 = LoadInt(in + 2); + const auto w3 = LoadInt(in + 3); + const auto w4 = LoadInt(in + 4); + const auto w5 = LoadInt(in + 5); + const auto w6 = LoadInt(in + 6); + const auto w7 = LoadInt(in + 7); + const auto w8 = LoadInt(in + 8); + const auto w9 = LoadInt(in + 9); + const auto w10 = LoadInt(in + 10); + const auto w11 = LoadInt(in + 11); + const auto w12 = LoadInt(in + 12); + const auto w13 = LoadInt(in + 13); + const auto w14 = LoadInt(in + 14); + const auto w15 = LoadInt(in + 15); + const auto w16 = LoadInt(in + 16); + const auto w17 = LoadInt(in + 17); + const auto w18 = LoadInt(in + 18); + const auto w19 = LoadInt(in + 19); + const auto w20 = LoadInt(in + 20); + const auto w21 = LoadInt(in + 21); + const auto w22 = LoadInt(in + 22); + const auto w23 = LoadInt(in + 23); + const auto w24 = LoadInt(in + 24); + const auto w25 = LoadInt(in + 25); + const auto w26 = LoadInt(in + 26); + const auto w27 = LoadInt(in + 27 ); out[0] = (w0) & mask; out[1] = ((w0 >> 56) | (w1 << 8)) & mask; out[2] = ((w1 >> 48) | (w2 << 16)) & mask; @@ -4758,41 +4758,41 @@ inline const uint8_t* unpack56_64(const uint8_t* in, uint64_t* out){ out[30] = ((w26 >> 16) | (w27 << 48)) & mask; out[31] = w27 >> 8; - return in + (28 * 8); + return in + (28 ); } -inline const uint8_t* unpack57_64(const uint8_t* in, uint64_t* out){ +inline const uint64_t* unpack57_64(const uint64_t* in, uint64_t* out){ constexpr uint64_t mask = ((uint64_t{1} << 57) - uint64_t{1}); - const auto w0 = LoadInt(in + 0 * 8); - const auto w1 = LoadInt(in + 1 * 8); - const auto w2 = LoadInt(in + 2 * 8); - const auto w3 = LoadInt(in + 3 * 8); - const auto w4 = LoadInt(in + 4 * 8); - const auto w5 = LoadInt(in + 5 * 8); - const auto w6 = LoadInt(in + 6 * 8); - const auto w7 = LoadInt(in + 7 * 8); - const auto w8 = LoadInt(in + 8 * 8); - const auto w9 = LoadInt(in + 9 * 8); - const auto w10 = LoadInt(in + 10 * 8); - const auto w11 = LoadInt(in + 11 * 8); - const auto w12 = LoadInt(in + 12 * 8); - const auto w13 = LoadInt(in + 13 * 8); - const auto w14 = LoadInt(in + 14 * 8); - const auto w15 = LoadInt(in + 15 * 8); - const auto w16 = LoadInt(in + 16 * 8); - const auto w17 = LoadInt(in + 17 * 8); - const auto w18 = LoadInt(in + 18 * 8); - const auto w19 = LoadInt(in + 19 * 8); - const auto w20 = LoadInt(in + 20 * 8); - const auto w21 = LoadInt(in + 21 * 8); - const auto w22 = LoadInt(in + 22 * 8); - const auto w23 = LoadInt(in + 23 * 8); - const auto w24 = LoadInt(in + 24 * 8); - const auto w25 = LoadInt(in + 25 * 8); - const auto w26 = LoadInt(in + 26 * 8); - const auto w27 = LoadInt(in + 27 * 8); - const auto w28 = static_cast(LoadInt(in + 28 * 8)); + const auto w0 = LoadInt(in + 0); + const auto w1 = LoadInt(in + 1); + const auto w2 = LoadInt(in + 2); + const auto w3 = LoadInt(in + 3); + const auto w4 = LoadInt(in + 4); + const auto w5 = LoadInt(in + 5); + const auto w6 = LoadInt(in + 6); + const auto w7 = LoadInt(in + 7); + const auto w8 = LoadInt(in + 8); + const auto w9 = LoadInt(in + 9); + const auto w10 = LoadInt(in + 10); + const auto w11 = LoadInt(in + 11); + const auto w12 = LoadInt(in + 12); + const auto w13 = LoadInt(in + 13); + const auto w14 = LoadInt(in + 14); + const auto w15 = LoadInt(in + 15); + const auto w16 = LoadInt(in + 16); + const auto w17 = LoadInt(in + 17); + const auto w18 = LoadInt(in + 18); + const auto w19 = LoadInt(in + 19); + const auto w20 = LoadInt(in + 20); + const auto w21 = LoadInt(in + 21); + const auto w22 = LoadInt(in + 22); + const auto w23 = LoadInt(in + 23); + const auto w24 = LoadInt(in + 24); + const auto w25 = LoadInt(in + 25); + const auto w26 = LoadInt(in + 26); + const auto w27 = LoadInt(in + 27); + const auto w28 = static_cast(LoadInt(reinterpret_cast(in + 28))); out[0] = (w0) & mask; out[1] = ((w0 >> 57) | (w1 << 7)) & mask; out[2] = ((w1 >> 50) | (w2 << 14)) & mask; @@ -4826,41 +4826,41 @@ inline const uint8_t* unpack57_64(const uint8_t* in, uint64_t* out){ out[30] = ((w26 >> 46) | (w27 << 18)) & mask; out[31] = ((w27 >> 39) | (w28 << 25)) & mask; - return in + (28 * 8 + 4); + return in + (28); } -inline const uint8_t* unpack58_64(const uint8_t* in, uint64_t* out){ +inline const uint64_t* unpack58_64(const uint64_t* in, uint64_t* out){ constexpr uint64_t mask = ((uint64_t{1} << 58) - uint64_t{1}); - const auto w0 = LoadInt(in + 0 * 8); - const auto w1 = LoadInt(in + 1 * 8); - const auto w2 = LoadInt(in + 2 * 8); - const auto w3 = LoadInt(in + 3 * 8); - const auto w4 = LoadInt(in + 4 * 8); - const auto w5 = LoadInt(in + 5 * 8); - const auto w6 = LoadInt(in + 6 * 8); - const auto w7 = LoadInt(in + 7 * 8); - const auto w8 = LoadInt(in + 8 * 8); - const auto w9 = LoadInt(in + 9 * 8); - const auto w10 = LoadInt(in + 10 * 8); - const auto w11 = LoadInt(in + 11 * 8); - const auto w12 = LoadInt(in + 12 * 8); - const auto w13 = LoadInt(in + 13 * 8); - const auto w14 = LoadInt(in + 14 * 8); - const auto w15 = LoadInt(in + 15 * 8); - const auto w16 = LoadInt(in + 16 * 8); - const auto w17 = LoadInt(in + 17 * 8); - const auto w18 = LoadInt(in + 18 * 8); - const auto w19 = LoadInt(in + 19 * 8); - const auto w20 = LoadInt(in + 20 * 8); - const auto w21 = LoadInt(in + 21 * 8); - const auto w22 = LoadInt(in + 22 * 8); - const auto w23 = LoadInt(in + 23 * 8); - const auto w24 = LoadInt(in + 24 * 8); - const auto w25 = LoadInt(in + 25 * 8); - const auto w26 = LoadInt(in + 26 * 8); - const auto w27 = LoadInt(in + 27 * 8); - const auto w28 = LoadInt(in + 28 * 8); + const auto w0 = LoadInt(in + 0); + const auto w1 = LoadInt(in + 1); + const auto w2 = LoadInt(in + 2); + const auto w3 = LoadInt(in + 3); + const auto w4 = LoadInt(in + 4); + const auto w5 = LoadInt(in + 5); + const auto w6 = LoadInt(in + 6); + const auto w7 = LoadInt(in + 7); + const auto w8 = LoadInt(in + 8); + const auto w9 = LoadInt(in + 9); + const auto w10 = LoadInt(in + 10); + const auto w11 = LoadInt(in + 11); + const auto w12 = LoadInt(in + 12); + const auto w13 = LoadInt(in + 13); + const auto w14 = LoadInt(in + 14); + const auto w15 = LoadInt(in + 15); + const auto w16 = LoadInt(in + 16); + const auto w17 = LoadInt(in + 17); + const auto w18 = LoadInt(in + 18); + const auto w19 = LoadInt(in + 19); + const auto w20 = LoadInt(in + 20); + const auto w21 = LoadInt(in + 21); + const auto w22 = LoadInt(in + 22); + const auto w23 = LoadInt(in + 23); + const auto w24 = LoadInt(in + 24); + const auto w25 = LoadInt(in + 25); + const auto w26 = LoadInt(in + 26); + const auto w27 = LoadInt(in + 27); + const auto w28 = LoadInt(in + 28 ); out[0] = (w0) & mask; out[1] = ((w0 >> 58) | (w1 << 6)) & mask; out[2] = ((w1 >> 52) | (w2 << 12)) & mask; @@ -4894,42 +4894,42 @@ inline const uint8_t* unpack58_64(const uint8_t* in, uint64_t* out){ out[30] = ((w27 >> 12) | (w28 << 52)) & mask; out[31] = w28 >> 6; - return in + (29 * 8); + return in + (29 ); } -inline const uint8_t* unpack59_64(const uint8_t* in, uint64_t* out){ +inline const uint64_t* unpack59_64(const uint64_t* in, uint64_t* out){ constexpr uint64_t mask = ((uint64_t{1} << 59) - uint64_t{1}); - const auto w0 = LoadInt(in + 0 * 8); - const auto w1 = LoadInt(in + 1 * 8); - const auto w2 = LoadInt(in + 2 * 8); - const auto w3 = LoadInt(in + 3 * 8); - const auto w4 = LoadInt(in + 4 * 8); - const auto w5 = LoadInt(in + 5 * 8); - const auto w6 = LoadInt(in + 6 * 8); - const auto w7 = LoadInt(in + 7 * 8); - const auto w8 = LoadInt(in + 8 * 8); - const auto w9 = LoadInt(in + 9 * 8); - const auto w10 = LoadInt(in + 10 * 8); - const auto w11 = LoadInt(in + 11 * 8); - const auto w12 = LoadInt(in + 12 * 8); - const auto w13 = LoadInt(in + 13 * 8); - const auto w14 = LoadInt(in + 14 * 8); - const auto w15 = LoadInt(in + 15 * 8); - const auto w16 = LoadInt(in + 16 * 8); - const auto w17 = LoadInt(in + 17 * 8); - const auto w18 = LoadInt(in + 18 * 8); - const auto w19 = LoadInt(in + 19 * 8); - const auto w20 = LoadInt(in + 20 * 8); - const auto w21 = LoadInt(in + 21 * 8); - const auto w22 = LoadInt(in + 22 * 8); - const auto w23 = LoadInt(in + 23 * 8); - const auto w24 = LoadInt(in + 24 * 8); - const auto w25 = LoadInt(in + 25 * 8); - const auto w26 = LoadInt(in + 26 * 8); - const auto w27 = LoadInt(in + 27 * 8); - const auto w28 = LoadInt(in + 28 * 8); - const auto w29 = static_cast(LoadInt(in + 29 * 8)); + const auto w0 = LoadInt(in + 0); + const auto w1 = LoadInt(in + 1); + const auto w2 = LoadInt(in + 2); + const auto w3 = LoadInt(in + 3); + const auto w4 = LoadInt(in + 4); + const auto w5 = LoadInt(in + 5); + const auto w6 = LoadInt(in + 6); + const auto w7 = LoadInt(in + 7); + const auto w8 = LoadInt(in + 8); + const auto w9 = LoadInt(in + 9); + const auto w10 = LoadInt(in + 10); + const auto w11 = LoadInt(in + 11); + const auto w12 = LoadInt(in + 12); + const auto w13 = LoadInt(in + 13); + const auto w14 = LoadInt(in + 14); + const auto w15 = LoadInt(in + 15); + const auto w16 = LoadInt(in + 16); + const auto w17 = LoadInt(in + 17); + const auto w18 = LoadInt(in + 18); + const auto w19 = LoadInt(in + 19); + const auto w20 = LoadInt(in + 20); + const auto w21 = LoadInt(in + 21); + const auto w22 = LoadInt(in + 22); + const auto w23 = LoadInt(in + 23); + const auto w24 = LoadInt(in + 24); + const auto w25 = LoadInt(in + 25); + const auto w26 = LoadInt(in + 26); + const auto w27 = LoadInt(in + 27); + const auto w28 = LoadInt(in + 28); + const auto w29 = static_cast(LoadInt(reinterpret_cast(in + 29))); out[0] = (w0) & mask; out[1] = ((w0 >> 59) | (w1 << 5)) & mask; out[2] = ((w1 >> 54) | (w2 << 10)) & mask; @@ -4963,42 +4963,42 @@ inline const uint8_t* unpack59_64(const uint8_t* in, uint64_t* out){ out[30] = ((w27 >> 42) | (w28 << 22)) & mask; out[31] = ((w28 >> 37) | (w29 << 27)) & mask; - return in + (29 * 8 + 4); + return in + (29); } -inline const uint8_t* unpack60_64(const uint8_t* in, uint64_t* out){ +inline const uint64_t* unpack60_64(const uint64_t* in, uint64_t* out){ constexpr uint64_t mask = ((uint64_t{1} << 60) - uint64_t{1}); - const auto w0 = LoadInt(in + 0 * 8); - const auto w1 = LoadInt(in + 1 * 8); - const auto w2 = LoadInt(in + 2 * 8); - const auto w3 = LoadInt(in + 3 * 8); - const auto w4 = LoadInt(in + 4 * 8); - const auto w5 = LoadInt(in + 5 * 8); - const auto w6 = LoadInt(in + 6 * 8); - const auto w7 = LoadInt(in + 7 * 8); - const auto w8 = LoadInt(in + 8 * 8); - const auto w9 = LoadInt(in + 9 * 8); - const auto w10 = LoadInt(in + 10 * 8); - const auto w11 = LoadInt(in + 11 * 8); - const auto w12 = LoadInt(in + 12 * 8); - const auto w13 = LoadInt(in + 13 * 8); - const auto w14 = LoadInt(in + 14 * 8); - const auto w15 = LoadInt(in + 15 * 8); - const auto w16 = LoadInt(in + 16 * 8); - const auto w17 = LoadInt(in + 17 * 8); - const auto w18 = LoadInt(in + 18 * 8); - const auto w19 = LoadInt(in + 19 * 8); - const auto w20 = LoadInt(in + 20 * 8); - const auto w21 = LoadInt(in + 21 * 8); - const auto w22 = LoadInt(in + 22 * 8); - const auto w23 = LoadInt(in + 23 * 8); - const auto w24 = LoadInt(in + 24 * 8); - const auto w25 = LoadInt(in + 25 * 8); - const auto w26 = LoadInt(in + 26 * 8); - const auto w27 = LoadInt(in + 27 * 8); - const auto w28 = LoadInt(in + 28 * 8); - const auto w29 = LoadInt(in + 29 * 8); + const auto w0 = LoadInt(in + 0); + const auto w1 = LoadInt(in + 1); + const auto w2 = LoadInt(in + 2); + const auto w3 = LoadInt(in + 3); + const auto w4 = LoadInt(in + 4); + const auto w5 = LoadInt(in + 5); + const auto w6 = LoadInt(in + 6); + const auto w7 = LoadInt(in + 7); + const auto w8 = LoadInt(in + 8); + const auto w9 = LoadInt(in + 9); + const auto w10 = LoadInt(in + 10); + const auto w11 = LoadInt(in + 11); + const auto w12 = LoadInt(in + 12); + const auto w13 = LoadInt(in + 13); + const auto w14 = LoadInt(in + 14); + const auto w15 = LoadInt(in + 15); + const auto w16 = LoadInt(in + 16); + const auto w17 = LoadInt(in + 17); + const auto w18 = LoadInt(in + 18); + const auto w19 = LoadInt(in + 19); + const auto w20 = LoadInt(in + 20); + const auto w21 = LoadInt(in + 21); + const auto w22 = LoadInt(in + 22); + const auto w23 = LoadInt(in + 23); + const auto w24 = LoadInt(in + 24); + const auto w25 = LoadInt(in + 25); + const auto w26 = LoadInt(in + 26); + const auto w27 = LoadInt(in + 27); + const auto w28 = LoadInt(in + 28); + const auto w29 = LoadInt(in + 29 ); out[0] = (w0) & mask; out[1] = ((w0 >> 60) | (w1 << 4)) & mask; out[2] = ((w1 >> 56) | (w2 << 8)) & mask; @@ -5032,43 +5032,43 @@ inline const uint8_t* unpack60_64(const uint8_t* in, uint64_t* out){ out[30] = ((w28 >> 8) | (w29 << 56)) & mask; out[31] = w29 >> 4; - return in + (30 * 8); + return in + (30 ); } -inline const uint8_t* unpack61_64(const uint8_t* in, uint64_t* out){ +inline const uint64_t* unpack61_64(const uint64_t* in, uint64_t* out){ constexpr uint64_t mask = ((uint64_t{1} << 61) - uint64_t{1}); - const auto w0 = LoadInt(in + 0 * 8); - const auto w1 = LoadInt(in + 1 * 8); - const auto w2 = LoadInt(in + 2 * 8); - const auto w3 = LoadInt(in + 3 * 8); - const auto w4 = LoadInt(in + 4 * 8); - const auto w5 = LoadInt(in + 5 * 8); - const auto w6 = LoadInt(in + 6 * 8); - const auto w7 = LoadInt(in + 7 * 8); - const auto w8 = LoadInt(in + 8 * 8); - const auto w9 = LoadInt(in + 9 * 8); - const auto w10 = LoadInt(in + 10 * 8); - const auto w11 = LoadInt(in + 11 * 8); - const auto w12 = LoadInt(in + 12 * 8); - const auto w13 = LoadInt(in + 13 * 8); - const auto w14 = LoadInt(in + 14 * 8); - const auto w15 = LoadInt(in + 15 * 8); - const auto w16 = LoadInt(in + 16 * 8); - const auto w17 = LoadInt(in + 17 * 8); - const auto w18 = LoadInt(in + 18 * 8); - const auto w19 = LoadInt(in + 19 * 8); - const auto w20 = LoadInt(in + 20 * 8); - const auto w21 = LoadInt(in + 21 * 8); - const auto w22 = LoadInt(in + 22 * 8); - const auto w23 = LoadInt(in + 23 * 8); - const auto w24 = LoadInt(in + 24 * 8); - const auto w25 = LoadInt(in + 25 * 8); - const auto w26 = LoadInt(in + 26 * 8); - const auto w27 = LoadInt(in + 27 * 8); - const auto w28 = LoadInt(in + 28 * 8); - const auto w29 = LoadInt(in + 29 * 8); - const auto w30 = static_cast(LoadInt(in + 30 * 8)); + const auto w0 = LoadInt(in + 0); + const auto w1 = LoadInt(in + 1); + const auto w2 = LoadInt(in + 2); + const auto w3 = LoadInt(in + 3); + const auto w4 = LoadInt(in + 4); + const auto w5 = LoadInt(in + 5); + const auto w6 = LoadInt(in + 6); + const auto w7 = LoadInt(in + 7); + const auto w8 = LoadInt(in + 8); + const auto w9 = LoadInt(in + 9); + const auto w10 = LoadInt(in + 10); + const auto w11 = LoadInt(in + 11); + const auto w12 = LoadInt(in + 12); + const auto w13 = LoadInt(in + 13); + const auto w14 = LoadInt(in + 14); + const auto w15 = LoadInt(in + 15); + const auto w16 = LoadInt(in + 16); + const auto w17 = LoadInt(in + 17); + const auto w18 = LoadInt(in + 18); + const auto w19 = LoadInt(in + 19); + const auto w20 = LoadInt(in + 20); + const auto w21 = LoadInt(in + 21); + const auto w22 = LoadInt(in + 22); + const auto w23 = LoadInt(in + 23); + const auto w24 = LoadInt(in + 24); + const auto w25 = LoadInt(in + 25); + const auto w26 = LoadInt(in + 26); + const auto w27 = LoadInt(in + 27); + const auto w28 = LoadInt(in + 28); + const auto w29 = LoadInt(in + 29); + const auto w30 = static_cast(LoadInt(reinterpret_cast(in + 30))); out[0] = (w0) & mask; out[1] = ((w0 >> 61) | (w1 << 3)) & mask; out[2] = ((w1 >> 58) | (w2 << 6)) & mask; @@ -5102,43 +5102,43 @@ inline const uint8_t* unpack61_64(const uint8_t* in, uint64_t* out){ out[30] = ((w28 >> 38) | (w29 << 26)) & mask; out[31] = ((w29 >> 35) | (w30 << 29)) & mask; - return in + (30 * 8 + 4); + return in + (30); } -inline const uint8_t* unpack62_64(const uint8_t* in, uint64_t* out){ +inline const uint64_t* unpack62_64(const uint64_t* in, uint64_t* out){ constexpr uint64_t mask = ((uint64_t{1} << 62) - uint64_t{1}); - const auto w0 = LoadInt(in + 0 * 8); - const auto w1 = LoadInt(in + 1 * 8); - const auto w2 = LoadInt(in + 2 * 8); - const auto w3 = LoadInt(in + 3 * 8); - const auto w4 = LoadInt(in + 4 * 8); - const auto w5 = LoadInt(in + 5 * 8); - const auto w6 = LoadInt(in + 6 * 8); - const auto w7 = LoadInt(in + 7 * 8); - const auto w8 = LoadInt(in + 8 * 8); - const auto w9 = LoadInt(in + 9 * 8); - const auto w10 = LoadInt(in + 10 * 8); - const auto w11 = LoadInt(in + 11 * 8); - const auto w12 = LoadInt(in + 12 * 8); - const auto w13 = LoadInt(in + 13 * 8); - const auto w14 = LoadInt(in + 14 * 8); - const auto w15 = LoadInt(in + 15 * 8); - const auto w16 = LoadInt(in + 16 * 8); - const auto w17 = LoadInt(in + 17 * 8); - const auto w18 = LoadInt(in + 18 * 8); - const auto w19 = LoadInt(in + 19 * 8); - const auto w20 = LoadInt(in + 20 * 8); - const auto w21 = LoadInt(in + 21 * 8); - const auto w22 = LoadInt(in + 22 * 8); - const auto w23 = LoadInt(in + 23 * 8); - const auto w24 = LoadInt(in + 24 * 8); - const auto w25 = LoadInt(in + 25 * 8); - const auto w26 = LoadInt(in + 26 * 8); - const auto w27 = LoadInt(in + 27 * 8); - const auto w28 = LoadInt(in + 28 * 8); - const auto w29 = LoadInt(in + 29 * 8); - const auto w30 = LoadInt(in + 30 * 8); + const auto w0 = LoadInt(in + 0); + const auto w1 = LoadInt(in + 1); + const auto w2 = LoadInt(in + 2); + const auto w3 = LoadInt(in + 3); + const auto w4 = LoadInt(in + 4); + const auto w5 = LoadInt(in + 5); + const auto w6 = LoadInt(in + 6); + const auto w7 = LoadInt(in + 7); + const auto w8 = LoadInt(in + 8); + const auto w9 = LoadInt(in + 9); + const auto w10 = LoadInt(in + 10); + const auto w11 = LoadInt(in + 11); + const auto w12 = LoadInt(in + 12); + const auto w13 = LoadInt(in + 13); + const auto w14 = LoadInt(in + 14); + const auto w15 = LoadInt(in + 15); + const auto w16 = LoadInt(in + 16); + const auto w17 = LoadInt(in + 17); + const auto w18 = LoadInt(in + 18); + const auto w19 = LoadInt(in + 19); + const auto w20 = LoadInt(in + 20); + const auto w21 = LoadInt(in + 21); + const auto w22 = LoadInt(in + 22); + const auto w23 = LoadInt(in + 23); + const auto w24 = LoadInt(in + 24); + const auto w25 = LoadInt(in + 25); + const auto w26 = LoadInt(in + 26); + const auto w27 = LoadInt(in + 27); + const auto w28 = LoadInt(in + 28); + const auto w29 = LoadInt(in + 29); + const auto w30 = LoadInt(in + 30 ); out[0] = (w0) & mask; out[1] = ((w0 >> 62) | (w1 << 2)) & mask; out[2] = ((w1 >> 60) | (w2 << 4)) & mask; @@ -5172,44 +5172,44 @@ inline const uint8_t* unpack62_64(const uint8_t* in, uint64_t* out){ out[30] = ((w29 >> 4) | (w30 << 60)) & mask; out[31] = w30 >> 2; - return in + (31 * 8); + return in + (31 ); } -inline const uint8_t* unpack63_64(const uint8_t* in, uint64_t* out){ +inline const uint64_t* unpack63_64(const uint64_t* in, uint64_t* out){ constexpr uint64_t mask = ((uint64_t{1} << 63) - uint64_t{1}); - const auto w0 = LoadInt(in + 0 * 8); - const auto w1 = LoadInt(in + 1 * 8); - const auto w2 = LoadInt(in + 2 * 8); - const auto w3 = LoadInt(in + 3 * 8); - const auto w4 = LoadInt(in + 4 * 8); - const auto w5 = LoadInt(in + 5 * 8); - const auto w6 = LoadInt(in + 6 * 8); - const auto w7 = LoadInt(in + 7 * 8); - const auto w8 = LoadInt(in + 8 * 8); - const auto w9 = LoadInt(in + 9 * 8); - const auto w10 = LoadInt(in + 10 * 8); - const auto w11 = LoadInt(in + 11 * 8); - const auto w12 = LoadInt(in + 12 * 8); - const auto w13 = LoadInt(in + 13 * 8); - const auto w14 = LoadInt(in + 14 * 8); - const auto w15 = LoadInt(in + 15 * 8); - const auto w16 = LoadInt(in + 16 * 8); - const auto w17 = LoadInt(in + 17 * 8); - const auto w18 = LoadInt(in + 18 * 8); - const auto w19 = LoadInt(in + 19 * 8); - const auto w20 = LoadInt(in + 20 * 8); - const auto w21 = LoadInt(in + 21 * 8); - const auto w22 = LoadInt(in + 22 * 8); - const auto w23 = LoadInt(in + 23 * 8); - const auto w24 = LoadInt(in + 24 * 8); - const auto w25 = LoadInt(in + 25 * 8); - const auto w26 = LoadInt(in + 26 * 8); - const auto w27 = LoadInt(in + 27 * 8); - const auto w28 = LoadInt(in + 28 * 8); - const auto w29 = LoadInt(in + 29 * 8); - const auto w30 = LoadInt(in + 30 * 8); - const auto w31 = static_cast(LoadInt(in + 31 * 8)); + const auto w0 = LoadInt(in + 0); + const auto w1 = LoadInt(in + 1); + const auto w2 = LoadInt(in + 2); + const auto w3 = LoadInt(in + 3); + const auto w4 = LoadInt(in + 4); + const auto w5 = LoadInt(in + 5); + const auto w6 = LoadInt(in + 6); + const auto w7 = LoadInt(in + 7); + const auto w8 = LoadInt(in + 8); + const auto w9 = LoadInt(in + 9); + const auto w10 = LoadInt(in + 10); + const auto w11 = LoadInt(in + 11); + const auto w12 = LoadInt(in + 12); + const auto w13 = LoadInt(in + 13); + const auto w14 = LoadInt(in + 14); + const auto w15 = LoadInt(in + 15); + const auto w16 = LoadInt(in + 16); + const auto w17 = LoadInt(in + 17); + const auto w18 = LoadInt(in + 18); + const auto w19 = LoadInt(in + 19); + const auto w20 = LoadInt(in + 20); + const auto w21 = LoadInt(in + 21); + const auto w22 = LoadInt(in + 22); + const auto w23 = LoadInt(in + 23); + const auto w24 = LoadInt(in + 24); + const auto w25 = LoadInt(in + 25); + const auto w26 = LoadInt(in + 26); + const auto w27 = LoadInt(in + 27); + const auto w28 = LoadInt(in + 28); + const auto w29 = LoadInt(in + 29); + const auto w30 = LoadInt(in + 30); + const auto w31 = static_cast(LoadInt(reinterpret_cast(in + 31))); out[0] = (w0) & mask; out[1] = ((w0 >> 63) | (w1 << 1)) & mask; out[2] = ((w1 >> 62) | (w2 << 2)) & mask; @@ -5243,14 +5243,14 @@ inline const uint8_t* unpack63_64(const uint8_t* in, uint64_t* out){ out[30] = ((w29 >> 34) | (w30 << 30)) & mask; out[31] = ((w30 >> 33) | (w31 << 31)) & mask; - return in + (31 * 8 + 4); + return in + (31); } -inline const uint8_t* unpack64_64(const uint8_t* in, uint64_t* out){ +inline const uint64_t* unpack64_64(const uint64_t* in, uint64_t* out){ for(int k = 0; k < 32; k += 1) { - out[k] = LoadInt(in + (k * 8)); + out[k] = LoadInt(in + (k )); } - return in + (8 * 32); + return in + ( 32); } } // namespace arrow::internal From b9afb2845c60663f05a4de08019acf128c70c57c Mon Sep 17 00:00:00 2001 From: AntoinePrv Date: Thu, 18 Sep 2025 11:31:58 +0200 Subject: [PATCH 11/76] Revert: reinterpret cast This reverts commit 2aeb76ea7dc3620dc678b55e3f60f5842ba6ce91. --- cpp/src/arrow/util/bpacking.cc | 6 +- cpp/src/arrow/util/bpacking_scalar_codegen.py | 25 +- .../util/bpacking_scalar_generated_internal.h | 3436 ++++++++--------- 3 files changed, 1733 insertions(+), 1734 deletions(-) diff --git a/cpp/src/arrow/util/bpacking.cc b/cpp/src/arrow/util/bpacking.cc index 73aa7939b20..fefca194518 100644 --- a/cpp/src/arrow/util/bpacking.cc +++ b/cpp/src/arrow/util/bpacking.cc @@ -35,8 +35,7 @@ namespace arrow { namespace internal { -int unpack32_scalar(const uint8_t* in_, uint32_t* out, int batch_size, int num_bits) { - auto in = reinterpret_cast(in_); +int unpack32_scalar(const uint8_t* in, uint32_t* out, int batch_size, int num_bits) { batch_size = batch_size / 32 * 32; int num_loops = batch_size / 32; @@ -177,8 +176,7 @@ int unpack32(const uint8_t* in, uint32_t* out, int batch_size, int num_bits) { #endif } -int unpack64_scalar(const uint8_t* in_, uint64_t* out, int batch_size, int num_bits) { - auto in = reinterpret_cast(in_); +int unpack64_scalar(const uint8_t* in, uint64_t* out, int batch_size, int num_bits) { batch_size = batch_size / 32 * 32; int num_loops = batch_size / 32; diff --git a/cpp/src/arrow/util/bpacking_scalar_codegen.py b/cpp/src/arrow/util/bpacking_scalar_codegen.py index 7cec6b9ac42..8882e9a7c83 100644 --- a/cpp/src/arrow/util/bpacking_scalar_codegen.py +++ b/cpp/src/arrow/util/bpacking_scalar_codegen.py @@ -67,8 +67,8 @@ namespace arrow::internal { template -Int LoadInt(const Int* in) { - return bit_util::FromLittleEndian(util::SafeLoad(in)); +Int LoadInt(const uint8_t* in) { + return bit_util::FromLittleEndian(util::SafeLoadAs(in)); } """ @@ -109,8 +109,8 @@ def howmanybytes(self, bit: int) -> int: def unpack_signature(self, bit: int) -> str: return ( - f"inline const {self.unsigned_type}* unpack{bit}_{self.out_bit_width}" - f"(const {self.unsigned_type}* in, {self.unsigned_type}* out)" + f"inline const uint8_t* unpack{bit}_{self.out_bit_width}" + f"(const uint8_t* in, {self.unsigned_type}* out)" "{" ) @@ -125,10 +125,10 @@ def print_unpack_last(self) -> None: print(f" for(int k = 0; k < {self.howmany}; k += 1) {{") print( f" out[k] = LoadInt<{self.unsigned_type}>(" - f"in + (k ));" + f"in + (k * {self.out_byte_width}));" ) print(" }") - print(f" return in + ( {self.howmany});") + print(f" return in + ({self.out_byte_width} * {self.howmany});") print("}") def print_unpack_k(self, bit: int) -> None: @@ -143,20 +143,20 @@ def print_unpack_k(self, bit: int) -> None: for k in range(self.howmanywords(bit) - 1): print( f" const auto w{k} = LoadInt<{self.unsigned_type}>(" - f"in + {k});" + f"in + {k} * {self.out_byte_width});" ) k = self.howmanywords(bit) - 1 use_smart_halving = self.smart_halve and bit % 2 == 1 if use_smart_halving: print( - f" const auto w{k} = static_cast<{self.unsigned_type}>(LoadInt(" - f"reinterpret_cast(in + {k})));" + f" const auto w{k} = static_cast<{self.unsigned_type}>(LoadInt<{self.unsigned_type_half}>(" + f"in + {k} * {self.out_byte_width}));" ) else: print( f" const auto w{k} = LoadInt<{self.unsigned_type}>(" - f"in + {k} );" + f"in + {k} * {self.out_byte_width});" ) for j in range(self.howmany): @@ -181,10 +181,11 @@ def print_unpack_k(self, bit: int) -> None: if use_smart_halving: print( - f" return in + ({self.howmanywords(bit) - 1});" + f" return in + ({self.howmanywords(bit) - 1} * {self.out_byte_width}" + f" + {self.out_byte_width // 2});" ) else: - print(f" return in + ({self.howmanywords(bit)} );") + print(f" return in + ({self.howmanywords(bit)} * {self.out_byte_width});") print("}") def print_all(self) -> None: diff --git a/cpp/src/arrow/util/bpacking_scalar_generated_internal.h b/cpp/src/arrow/util/bpacking_scalar_generated_internal.h index c1787fc53b7..d215fcfbc46 100644 --- a/cpp/src/arrow/util/bpacking_scalar_generated_internal.h +++ b/cpp/src/arrow/util/bpacking_scalar_generated_internal.h @@ -39,19 +39,19 @@ namespace arrow::internal { template -Int LoadInt(const Int* in) { - return bit_util::FromLittleEndian(util::SafeLoad(in)); +Int LoadInt(const uint8_t* in) { + return bit_util::FromLittleEndian(util::SafeLoadAs(in)); } -inline const uint32_t* unpack0_32(const uint32_t* in, uint32_t* out){ +inline const uint8_t* unpack0_32(const uint8_t* in, uint32_t* out){ std::memset(out, 0, 32 * 4); return in; } -inline const uint32_t* unpack1_32(const uint32_t* in, uint32_t* out){ +inline const uint8_t* unpack1_32(const uint8_t* in, uint32_t* out){ constexpr uint32_t mask = ((uint32_t{1} << 1) - uint32_t{1}); - const auto w0 = LoadInt(in + 0 ); + const auto w0 = LoadInt(in + 0 * 4); out[0] = (w0) & mask; out[1] = (w0 >> 1) & mask; out[2] = (w0 >> 2) & mask; @@ -85,14 +85,14 @@ inline const uint32_t* unpack1_32(const uint32_t* in, uint32_t* out){ out[30] = (w0 >> 30) & mask; out[31] = w0 >> 31; - return in + (1 ); + return in + (1 * 4); } -inline const uint32_t* unpack2_32(const uint32_t* in, uint32_t* out){ +inline const uint8_t* unpack2_32(const uint8_t* in, uint32_t* out){ constexpr uint32_t mask = ((uint32_t{1} << 2) - uint32_t{1}); - const auto w0 = LoadInt(in + 0); - const auto w1 = LoadInt(in + 1 ); + const auto w0 = LoadInt(in + 0 * 4); + const auto w1 = LoadInt(in + 1 * 4); out[0] = (w0) & mask; out[1] = (w0 >> 2) & mask; out[2] = (w0 >> 4) & mask; @@ -126,15 +126,15 @@ inline const uint32_t* unpack2_32(const uint32_t* in, uint32_t* out){ out[30] = (w1 >> 28) & mask; out[31] = w1 >> 30; - return in + (2 ); + return in + (2 * 4); } -inline const uint32_t* unpack3_32(const uint32_t* in, uint32_t* out){ +inline const uint8_t* unpack3_32(const uint8_t* in, uint32_t* out){ constexpr uint32_t mask = ((uint32_t{1} << 3) - uint32_t{1}); - const auto w0 = LoadInt(in + 0); - const auto w1 = LoadInt(in + 1); - const auto w2 = LoadInt(in + 2 ); + const auto w0 = LoadInt(in + 0 * 4); + const auto w1 = LoadInt(in + 1 * 4); + const auto w2 = LoadInt(in + 2 * 4); out[0] = (w0) & mask; out[1] = (w0 >> 3) & mask; out[2] = (w0 >> 6) & mask; @@ -168,16 +168,16 @@ inline const uint32_t* unpack3_32(const uint32_t* in, uint32_t* out){ out[30] = (w2 >> 26) & mask; out[31] = w2 >> 29; - return in + (3 ); + return in + (3 * 4); } -inline const uint32_t* unpack4_32(const uint32_t* in, uint32_t* out){ +inline const uint8_t* unpack4_32(const uint8_t* in, uint32_t* out){ constexpr uint32_t mask = ((uint32_t{1} << 4) - uint32_t{1}); - const auto w0 = LoadInt(in + 0); - const auto w1 = LoadInt(in + 1); - const auto w2 = LoadInt(in + 2); - const auto w3 = LoadInt(in + 3 ); + const auto w0 = LoadInt(in + 0 * 4); + const auto w1 = LoadInt(in + 1 * 4); + const auto w2 = LoadInt(in + 2 * 4); + const auto w3 = LoadInt(in + 3 * 4); out[0] = (w0) & mask; out[1] = (w0 >> 4) & mask; out[2] = (w0 >> 8) & mask; @@ -211,17 +211,17 @@ inline const uint32_t* unpack4_32(const uint32_t* in, uint32_t* out){ out[30] = (w3 >> 24) & mask; out[31] = w3 >> 28; - return in + (4 ); + return in + (4 * 4); } -inline const uint32_t* unpack5_32(const uint32_t* in, uint32_t* out){ +inline const uint8_t* unpack5_32(const uint8_t* in, uint32_t* out){ constexpr uint32_t mask = ((uint32_t{1} << 5) - uint32_t{1}); - const auto w0 = LoadInt(in + 0); - const auto w1 = LoadInt(in + 1); - const auto w2 = LoadInt(in + 2); - const auto w3 = LoadInt(in + 3); - const auto w4 = LoadInt(in + 4 ); + const auto w0 = LoadInt(in + 0 * 4); + const auto w1 = LoadInt(in + 1 * 4); + const auto w2 = LoadInt(in + 2 * 4); + const auto w3 = LoadInt(in + 3 * 4); + const auto w4 = LoadInt(in + 4 * 4); out[0] = (w0) & mask; out[1] = (w0 >> 5) & mask; out[2] = (w0 >> 10) & mask; @@ -255,18 +255,18 @@ inline const uint32_t* unpack5_32(const uint32_t* in, uint32_t* out){ out[30] = (w4 >> 22) & mask; out[31] = w4 >> 27; - return in + (5 ); + return in + (5 * 4); } -inline const uint32_t* unpack6_32(const uint32_t* in, uint32_t* out){ +inline const uint8_t* unpack6_32(const uint8_t* in, uint32_t* out){ constexpr uint32_t mask = ((uint32_t{1} << 6) - uint32_t{1}); - const auto w0 = LoadInt(in + 0); - const auto w1 = LoadInt(in + 1); - const auto w2 = LoadInt(in + 2); - const auto w3 = LoadInt(in + 3); - const auto w4 = LoadInt(in + 4); - const auto w5 = LoadInt(in + 5 ); + const auto w0 = LoadInt(in + 0 * 4); + const auto w1 = LoadInt(in + 1 * 4); + const auto w2 = LoadInt(in + 2 * 4); + const auto w3 = LoadInt(in + 3 * 4); + const auto w4 = LoadInt(in + 4 * 4); + const auto w5 = LoadInt(in + 5 * 4); out[0] = (w0) & mask; out[1] = (w0 >> 6) & mask; out[2] = (w0 >> 12) & mask; @@ -300,19 +300,19 @@ inline const uint32_t* unpack6_32(const uint32_t* in, uint32_t* out){ out[30] = (w5 >> 20) & mask; out[31] = w5 >> 26; - return in + (6 ); + return in + (6 * 4); } -inline const uint32_t* unpack7_32(const uint32_t* in, uint32_t* out){ +inline const uint8_t* unpack7_32(const uint8_t* in, uint32_t* out){ constexpr uint32_t mask = ((uint32_t{1} << 7) - uint32_t{1}); - const auto w0 = LoadInt(in + 0); - const auto w1 = LoadInt(in + 1); - const auto w2 = LoadInt(in + 2); - const auto w3 = LoadInt(in + 3); - const auto w4 = LoadInt(in + 4); - const auto w5 = LoadInt(in + 5); - const auto w6 = LoadInt(in + 6 ); + const auto w0 = LoadInt(in + 0 * 4); + const auto w1 = LoadInt(in + 1 * 4); + const auto w2 = LoadInt(in + 2 * 4); + const auto w3 = LoadInt(in + 3 * 4); + const auto w4 = LoadInt(in + 4 * 4); + const auto w5 = LoadInt(in + 5 * 4); + const auto w6 = LoadInt(in + 6 * 4); out[0] = (w0) & mask; out[1] = (w0 >> 7) & mask; out[2] = (w0 >> 14) & mask; @@ -346,20 +346,20 @@ inline const uint32_t* unpack7_32(const uint32_t* in, uint32_t* out){ out[30] = (w6 >> 18) & mask; out[31] = w6 >> 25; - return in + (7 ); + return in + (7 * 4); } -inline const uint32_t* unpack8_32(const uint32_t* in, uint32_t* out){ +inline const uint8_t* unpack8_32(const uint8_t* in, uint32_t* out){ constexpr uint32_t mask = ((uint32_t{1} << 8) - uint32_t{1}); - const auto w0 = LoadInt(in + 0); - const auto w1 = LoadInt(in + 1); - const auto w2 = LoadInt(in + 2); - const auto w3 = LoadInt(in + 3); - const auto w4 = LoadInt(in + 4); - const auto w5 = LoadInt(in + 5); - const auto w6 = LoadInt(in + 6); - const auto w7 = LoadInt(in + 7 ); + const auto w0 = LoadInt(in + 0 * 4); + const auto w1 = LoadInt(in + 1 * 4); + const auto w2 = LoadInt(in + 2 * 4); + const auto w3 = LoadInt(in + 3 * 4); + const auto w4 = LoadInt(in + 4 * 4); + const auto w5 = LoadInt(in + 5 * 4); + const auto w6 = LoadInt(in + 6 * 4); + const auto w7 = LoadInt(in + 7 * 4); out[0] = (w0) & mask; out[1] = (w0 >> 8) & mask; out[2] = (w0 >> 16) & mask; @@ -393,21 +393,21 @@ inline const uint32_t* unpack8_32(const uint32_t* in, uint32_t* out){ out[30] = (w7 >> 16) & mask; out[31] = w7 >> 24; - return in + (8 ); + return in + (8 * 4); } -inline const uint32_t* unpack9_32(const uint32_t* in, uint32_t* out){ +inline const uint8_t* unpack9_32(const uint8_t* in, uint32_t* out){ constexpr uint32_t mask = ((uint32_t{1} << 9) - uint32_t{1}); - const auto w0 = LoadInt(in + 0); - const auto w1 = LoadInt(in + 1); - const auto w2 = LoadInt(in + 2); - const auto w3 = LoadInt(in + 3); - const auto w4 = LoadInt(in + 4); - const auto w5 = LoadInt(in + 5); - const auto w6 = LoadInt(in + 6); - const auto w7 = LoadInt(in + 7); - const auto w8 = LoadInt(in + 8 ); + const auto w0 = LoadInt(in + 0 * 4); + const auto w1 = LoadInt(in + 1 * 4); + const auto w2 = LoadInt(in + 2 * 4); + const auto w3 = LoadInt(in + 3 * 4); + const auto w4 = LoadInt(in + 4 * 4); + const auto w5 = LoadInt(in + 5 * 4); + const auto w6 = LoadInt(in + 6 * 4); + const auto w7 = LoadInt(in + 7 * 4); + const auto w8 = LoadInt(in + 8 * 4); out[0] = (w0) & mask; out[1] = (w0 >> 9) & mask; out[2] = (w0 >> 18) & mask; @@ -441,22 +441,22 @@ inline const uint32_t* unpack9_32(const uint32_t* in, uint32_t* out){ out[30] = (w8 >> 14) & mask; out[31] = w8 >> 23; - return in + (9 ); + return in + (9 * 4); } -inline const uint32_t* unpack10_32(const uint32_t* in, uint32_t* out){ +inline const uint8_t* unpack10_32(const uint8_t* in, uint32_t* out){ constexpr uint32_t mask = ((uint32_t{1} << 10) - uint32_t{1}); - const auto w0 = LoadInt(in + 0); - const auto w1 = LoadInt(in + 1); - const auto w2 = LoadInt(in + 2); - const auto w3 = LoadInt(in + 3); - const auto w4 = LoadInt(in + 4); - const auto w5 = LoadInt(in + 5); - const auto w6 = LoadInt(in + 6); - const auto w7 = LoadInt(in + 7); - const auto w8 = LoadInt(in + 8); - const auto w9 = LoadInt(in + 9 ); + const auto w0 = LoadInt(in + 0 * 4); + const auto w1 = LoadInt(in + 1 * 4); + const auto w2 = LoadInt(in + 2 * 4); + const auto w3 = LoadInt(in + 3 * 4); + const auto w4 = LoadInt(in + 4 * 4); + const auto w5 = LoadInt(in + 5 * 4); + const auto w6 = LoadInt(in + 6 * 4); + const auto w7 = LoadInt(in + 7 * 4); + const auto w8 = LoadInt(in + 8 * 4); + const auto w9 = LoadInt(in + 9 * 4); out[0] = (w0) & mask; out[1] = (w0 >> 10) & mask; out[2] = (w0 >> 20) & mask; @@ -490,23 +490,23 @@ inline const uint32_t* unpack10_32(const uint32_t* in, uint32_t* out){ out[30] = (w9 >> 12) & mask; out[31] = w9 >> 22; - return in + (10 ); + return in + (10 * 4); } -inline const uint32_t* unpack11_32(const uint32_t* in, uint32_t* out){ +inline const uint8_t* unpack11_32(const uint8_t* in, uint32_t* out){ constexpr uint32_t mask = ((uint32_t{1} << 11) - uint32_t{1}); - const auto w0 = LoadInt(in + 0); - const auto w1 = LoadInt(in + 1); - const auto w2 = LoadInt(in + 2); - const auto w3 = LoadInt(in + 3); - const auto w4 = LoadInt(in + 4); - const auto w5 = LoadInt(in + 5); - const auto w6 = LoadInt(in + 6); - const auto w7 = LoadInt(in + 7); - const auto w8 = LoadInt(in + 8); - const auto w9 = LoadInt(in + 9); - const auto w10 = LoadInt(in + 10 ); + const auto w0 = LoadInt(in + 0 * 4); + const auto w1 = LoadInt(in + 1 * 4); + const auto w2 = LoadInt(in + 2 * 4); + const auto w3 = LoadInt(in + 3 * 4); + const auto w4 = LoadInt(in + 4 * 4); + const auto w5 = LoadInt(in + 5 * 4); + const auto w6 = LoadInt(in + 6 * 4); + const auto w7 = LoadInt(in + 7 * 4); + const auto w8 = LoadInt(in + 8 * 4); + const auto w9 = LoadInt(in + 9 * 4); + const auto w10 = LoadInt(in + 10 * 4); out[0] = (w0) & mask; out[1] = (w0 >> 11) & mask; out[2] = ((w0 >> 22) | (w1 << 10)) & mask; @@ -540,24 +540,24 @@ inline const uint32_t* unpack11_32(const uint32_t* in, uint32_t* out){ out[30] = (w10 >> 10) & mask; out[31] = w10 >> 21; - return in + (11 ); + return in + (11 * 4); } -inline const uint32_t* unpack12_32(const uint32_t* in, uint32_t* out){ +inline const uint8_t* unpack12_32(const uint8_t* in, uint32_t* out){ constexpr uint32_t mask = ((uint32_t{1} << 12) - uint32_t{1}); - const auto w0 = LoadInt(in + 0); - const auto w1 = LoadInt(in + 1); - const auto w2 = LoadInt(in + 2); - const auto w3 = LoadInt(in + 3); - const auto w4 = LoadInt(in + 4); - const auto w5 = LoadInt(in + 5); - const auto w6 = LoadInt(in + 6); - const auto w7 = LoadInt(in + 7); - const auto w8 = LoadInt(in + 8); - const auto w9 = LoadInt(in + 9); - const auto w10 = LoadInt(in + 10); - const auto w11 = LoadInt(in + 11 ); + const auto w0 = LoadInt(in + 0 * 4); + const auto w1 = LoadInt(in + 1 * 4); + const auto w2 = LoadInt(in + 2 * 4); + const auto w3 = LoadInt(in + 3 * 4); + const auto w4 = LoadInt(in + 4 * 4); + const auto w5 = LoadInt(in + 5 * 4); + const auto w6 = LoadInt(in + 6 * 4); + const auto w7 = LoadInt(in + 7 * 4); + const auto w8 = LoadInt(in + 8 * 4); + const auto w9 = LoadInt(in + 9 * 4); + const auto w10 = LoadInt(in + 10 * 4); + const auto w11 = LoadInt(in + 11 * 4); out[0] = (w0) & mask; out[1] = (w0 >> 12) & mask; out[2] = ((w0 >> 24) | (w1 << 8)) & mask; @@ -591,25 +591,25 @@ inline const uint32_t* unpack12_32(const uint32_t* in, uint32_t* out){ out[30] = (w11 >> 8) & mask; out[31] = w11 >> 20; - return in + (12 ); + return in + (12 * 4); } -inline const uint32_t* unpack13_32(const uint32_t* in, uint32_t* out){ +inline const uint8_t* unpack13_32(const uint8_t* in, uint32_t* out){ constexpr uint32_t mask = ((uint32_t{1} << 13) - uint32_t{1}); - const auto w0 = LoadInt(in + 0); - const auto w1 = LoadInt(in + 1); - const auto w2 = LoadInt(in + 2); - const auto w3 = LoadInt(in + 3); - const auto w4 = LoadInt(in + 4); - const auto w5 = LoadInt(in + 5); - const auto w6 = LoadInt(in + 6); - const auto w7 = LoadInt(in + 7); - const auto w8 = LoadInt(in + 8); - const auto w9 = LoadInt(in + 9); - const auto w10 = LoadInt(in + 10); - const auto w11 = LoadInt(in + 11); - const auto w12 = LoadInt(in + 12 ); + const auto w0 = LoadInt(in + 0 * 4); + const auto w1 = LoadInt(in + 1 * 4); + const auto w2 = LoadInt(in + 2 * 4); + const auto w3 = LoadInt(in + 3 * 4); + const auto w4 = LoadInt(in + 4 * 4); + const auto w5 = LoadInt(in + 5 * 4); + const auto w6 = LoadInt(in + 6 * 4); + const auto w7 = LoadInt(in + 7 * 4); + const auto w8 = LoadInt(in + 8 * 4); + const auto w9 = LoadInt(in + 9 * 4); + const auto w10 = LoadInt(in + 10 * 4); + const auto w11 = LoadInt(in + 11 * 4); + const auto w12 = LoadInt(in + 12 * 4); out[0] = (w0) & mask; out[1] = (w0 >> 13) & mask; out[2] = ((w0 >> 26) | (w1 << 6)) & mask; @@ -643,26 +643,26 @@ inline const uint32_t* unpack13_32(const uint32_t* in, uint32_t* out){ out[30] = (w12 >> 6) & mask; out[31] = w12 >> 19; - return in + (13 ); + return in + (13 * 4); } -inline const uint32_t* unpack14_32(const uint32_t* in, uint32_t* out){ +inline const uint8_t* unpack14_32(const uint8_t* in, uint32_t* out){ constexpr uint32_t mask = ((uint32_t{1} << 14) - uint32_t{1}); - const auto w0 = LoadInt(in + 0); - const auto w1 = LoadInt(in + 1); - const auto w2 = LoadInt(in + 2); - const auto w3 = LoadInt(in + 3); - const auto w4 = LoadInt(in + 4); - const auto w5 = LoadInt(in + 5); - const auto w6 = LoadInt(in + 6); - const auto w7 = LoadInt(in + 7); - const auto w8 = LoadInt(in + 8); - const auto w9 = LoadInt(in + 9); - const auto w10 = LoadInt(in + 10); - const auto w11 = LoadInt(in + 11); - const auto w12 = LoadInt(in + 12); - const auto w13 = LoadInt(in + 13 ); + const auto w0 = LoadInt(in + 0 * 4); + const auto w1 = LoadInt(in + 1 * 4); + const auto w2 = LoadInt(in + 2 * 4); + const auto w3 = LoadInt(in + 3 * 4); + const auto w4 = LoadInt(in + 4 * 4); + const auto w5 = LoadInt(in + 5 * 4); + const auto w6 = LoadInt(in + 6 * 4); + const auto w7 = LoadInt(in + 7 * 4); + const auto w8 = LoadInt(in + 8 * 4); + const auto w9 = LoadInt(in + 9 * 4); + const auto w10 = LoadInt(in + 10 * 4); + const auto w11 = LoadInt(in + 11 * 4); + const auto w12 = LoadInt(in + 12 * 4); + const auto w13 = LoadInt(in + 13 * 4); out[0] = (w0) & mask; out[1] = (w0 >> 14) & mask; out[2] = ((w0 >> 28) | (w1 << 4)) & mask; @@ -696,27 +696,27 @@ inline const uint32_t* unpack14_32(const uint32_t* in, uint32_t* out){ out[30] = (w13 >> 4) & mask; out[31] = w13 >> 18; - return in + (14 ); + return in + (14 * 4); } -inline const uint32_t* unpack15_32(const uint32_t* in, uint32_t* out){ +inline const uint8_t* unpack15_32(const uint8_t* in, uint32_t* out){ constexpr uint32_t mask = ((uint32_t{1} << 15) - uint32_t{1}); - const auto w0 = LoadInt(in + 0); - const auto w1 = LoadInt(in + 1); - const auto w2 = LoadInt(in + 2); - const auto w3 = LoadInt(in + 3); - const auto w4 = LoadInt(in + 4); - const auto w5 = LoadInt(in + 5); - const auto w6 = LoadInt(in + 6); - const auto w7 = LoadInt(in + 7); - const auto w8 = LoadInt(in + 8); - const auto w9 = LoadInt(in + 9); - const auto w10 = LoadInt(in + 10); - const auto w11 = LoadInt(in + 11); - const auto w12 = LoadInt(in + 12); - const auto w13 = LoadInt(in + 13); - const auto w14 = LoadInt(in + 14 ); + const auto w0 = LoadInt(in + 0 * 4); + const auto w1 = LoadInt(in + 1 * 4); + const auto w2 = LoadInt(in + 2 * 4); + const auto w3 = LoadInt(in + 3 * 4); + const auto w4 = LoadInt(in + 4 * 4); + const auto w5 = LoadInt(in + 5 * 4); + const auto w6 = LoadInt(in + 6 * 4); + const auto w7 = LoadInt(in + 7 * 4); + const auto w8 = LoadInt(in + 8 * 4); + const auto w9 = LoadInt(in + 9 * 4); + const auto w10 = LoadInt(in + 10 * 4); + const auto w11 = LoadInt(in + 11 * 4); + const auto w12 = LoadInt(in + 12 * 4); + const auto w13 = LoadInt(in + 13 * 4); + const auto w14 = LoadInt(in + 14 * 4); out[0] = (w0) & mask; out[1] = (w0 >> 15) & mask; out[2] = ((w0 >> 30) | (w1 << 2)) & mask; @@ -750,28 +750,28 @@ inline const uint32_t* unpack15_32(const uint32_t* in, uint32_t* out){ out[30] = (w14 >> 2) & mask; out[31] = w14 >> 17; - return in + (15 ); + return in + (15 * 4); } -inline const uint32_t* unpack16_32(const uint32_t* in, uint32_t* out){ +inline const uint8_t* unpack16_32(const uint8_t* in, uint32_t* out){ constexpr uint32_t mask = ((uint32_t{1} << 16) - uint32_t{1}); - const auto w0 = LoadInt(in + 0); - const auto w1 = LoadInt(in + 1); - const auto w2 = LoadInt(in + 2); - const auto w3 = LoadInt(in + 3); - const auto w4 = LoadInt(in + 4); - const auto w5 = LoadInt(in + 5); - const auto w6 = LoadInt(in + 6); - const auto w7 = LoadInt(in + 7); - const auto w8 = LoadInt(in + 8); - const auto w9 = LoadInt(in + 9); - const auto w10 = LoadInt(in + 10); - const auto w11 = LoadInt(in + 11); - const auto w12 = LoadInt(in + 12); - const auto w13 = LoadInt(in + 13); - const auto w14 = LoadInt(in + 14); - const auto w15 = LoadInt(in + 15 ); + const auto w0 = LoadInt(in + 0 * 4); + const auto w1 = LoadInt(in + 1 * 4); + const auto w2 = LoadInt(in + 2 * 4); + const auto w3 = LoadInt(in + 3 * 4); + const auto w4 = LoadInt(in + 4 * 4); + const auto w5 = LoadInt(in + 5 * 4); + const auto w6 = LoadInt(in + 6 * 4); + const auto w7 = LoadInt(in + 7 * 4); + const auto w8 = LoadInt(in + 8 * 4); + const auto w9 = LoadInt(in + 9 * 4); + const auto w10 = LoadInt(in + 10 * 4); + const auto w11 = LoadInt(in + 11 * 4); + const auto w12 = LoadInt(in + 12 * 4); + const auto w13 = LoadInt(in + 13 * 4); + const auto w14 = LoadInt(in + 14 * 4); + const auto w15 = LoadInt(in + 15 * 4); out[0] = (w0) & mask; out[1] = w0 >> 16; out[2] = (w1) & mask; @@ -805,29 +805,29 @@ inline const uint32_t* unpack16_32(const uint32_t* in, uint32_t* out){ out[30] = (w15) & mask; out[31] = w15 >> 16; - return in + (16 ); + return in + (16 * 4); } -inline const uint32_t* unpack17_32(const uint32_t* in, uint32_t* out){ +inline const uint8_t* unpack17_32(const uint8_t* in, uint32_t* out){ constexpr uint32_t mask = ((uint32_t{1} << 17) - uint32_t{1}); - const auto w0 = LoadInt(in + 0); - const auto w1 = LoadInt(in + 1); - const auto w2 = LoadInt(in + 2); - const auto w3 = LoadInt(in + 3); - const auto w4 = LoadInt(in + 4); - const auto w5 = LoadInt(in + 5); - const auto w6 = LoadInt(in + 6); - const auto w7 = LoadInt(in + 7); - const auto w8 = LoadInt(in + 8); - const auto w9 = LoadInt(in + 9); - const auto w10 = LoadInt(in + 10); - const auto w11 = LoadInt(in + 11); - const auto w12 = LoadInt(in + 12); - const auto w13 = LoadInt(in + 13); - const auto w14 = LoadInt(in + 14); - const auto w15 = LoadInt(in + 15); - const auto w16 = LoadInt(in + 16 ); + const auto w0 = LoadInt(in + 0 * 4); + const auto w1 = LoadInt(in + 1 * 4); + const auto w2 = LoadInt(in + 2 * 4); + const auto w3 = LoadInt(in + 3 * 4); + const auto w4 = LoadInt(in + 4 * 4); + const auto w5 = LoadInt(in + 5 * 4); + const auto w6 = LoadInt(in + 6 * 4); + const auto w7 = LoadInt(in + 7 * 4); + const auto w8 = LoadInt(in + 8 * 4); + const auto w9 = LoadInt(in + 9 * 4); + const auto w10 = LoadInt(in + 10 * 4); + const auto w11 = LoadInt(in + 11 * 4); + const auto w12 = LoadInt(in + 12 * 4); + const auto w13 = LoadInt(in + 13 * 4); + const auto w14 = LoadInt(in + 14 * 4); + const auto w15 = LoadInt(in + 15 * 4); + const auto w16 = LoadInt(in + 16 * 4); out[0] = (w0) & mask; out[1] = ((w0 >> 17) | (w1 << 15)) & mask; out[2] = (w1 >> 2) & mask; @@ -861,30 +861,30 @@ inline const uint32_t* unpack17_32(const uint32_t* in, uint32_t* out){ out[30] = ((w15 >> 30) | (w16 << 2)) & mask; out[31] = w16 >> 15; - return in + (17 ); + return in + (17 * 4); } -inline const uint32_t* unpack18_32(const uint32_t* in, uint32_t* out){ +inline const uint8_t* unpack18_32(const uint8_t* in, uint32_t* out){ constexpr uint32_t mask = ((uint32_t{1} << 18) - uint32_t{1}); - const auto w0 = LoadInt(in + 0); - const auto w1 = LoadInt(in + 1); - const auto w2 = LoadInt(in + 2); - const auto w3 = LoadInt(in + 3); - const auto w4 = LoadInt(in + 4); - const auto w5 = LoadInt(in + 5); - const auto w6 = LoadInt(in + 6); - const auto w7 = LoadInt(in + 7); - const auto w8 = LoadInt(in + 8); - const auto w9 = LoadInt(in + 9); - const auto w10 = LoadInt(in + 10); - const auto w11 = LoadInt(in + 11); - const auto w12 = LoadInt(in + 12); - const auto w13 = LoadInt(in + 13); - const auto w14 = LoadInt(in + 14); - const auto w15 = LoadInt(in + 15); - const auto w16 = LoadInt(in + 16); - const auto w17 = LoadInt(in + 17 ); + const auto w0 = LoadInt(in + 0 * 4); + const auto w1 = LoadInt(in + 1 * 4); + const auto w2 = LoadInt(in + 2 * 4); + const auto w3 = LoadInt(in + 3 * 4); + const auto w4 = LoadInt(in + 4 * 4); + const auto w5 = LoadInt(in + 5 * 4); + const auto w6 = LoadInt(in + 6 * 4); + const auto w7 = LoadInt(in + 7 * 4); + const auto w8 = LoadInt(in + 8 * 4); + const auto w9 = LoadInt(in + 9 * 4); + const auto w10 = LoadInt(in + 10 * 4); + const auto w11 = LoadInt(in + 11 * 4); + const auto w12 = LoadInt(in + 12 * 4); + const auto w13 = LoadInt(in + 13 * 4); + const auto w14 = LoadInt(in + 14 * 4); + const auto w15 = LoadInt(in + 15 * 4); + const auto w16 = LoadInt(in + 16 * 4); + const auto w17 = LoadInt(in + 17 * 4); out[0] = (w0) & mask; out[1] = ((w0 >> 18) | (w1 << 14)) & mask; out[2] = (w1 >> 4) & mask; @@ -918,31 +918,31 @@ inline const uint32_t* unpack18_32(const uint32_t* in, uint32_t* out){ out[30] = ((w16 >> 28) | (w17 << 4)) & mask; out[31] = w17 >> 14; - return in + (18 ); + return in + (18 * 4); } -inline const uint32_t* unpack19_32(const uint32_t* in, uint32_t* out){ +inline const uint8_t* unpack19_32(const uint8_t* in, uint32_t* out){ constexpr uint32_t mask = ((uint32_t{1} << 19) - uint32_t{1}); - const auto w0 = LoadInt(in + 0); - const auto w1 = LoadInt(in + 1); - const auto w2 = LoadInt(in + 2); - const auto w3 = LoadInt(in + 3); - const auto w4 = LoadInt(in + 4); - const auto w5 = LoadInt(in + 5); - const auto w6 = LoadInt(in + 6); - const auto w7 = LoadInt(in + 7); - const auto w8 = LoadInt(in + 8); - const auto w9 = LoadInt(in + 9); - const auto w10 = LoadInt(in + 10); - const auto w11 = LoadInt(in + 11); - const auto w12 = LoadInt(in + 12); - const auto w13 = LoadInt(in + 13); - const auto w14 = LoadInt(in + 14); - const auto w15 = LoadInt(in + 15); - const auto w16 = LoadInt(in + 16); - const auto w17 = LoadInt(in + 17); - const auto w18 = LoadInt(in + 18 ); + const auto w0 = LoadInt(in + 0 * 4); + const auto w1 = LoadInt(in + 1 * 4); + const auto w2 = LoadInt(in + 2 * 4); + const auto w3 = LoadInt(in + 3 * 4); + const auto w4 = LoadInt(in + 4 * 4); + const auto w5 = LoadInt(in + 5 * 4); + const auto w6 = LoadInt(in + 6 * 4); + const auto w7 = LoadInt(in + 7 * 4); + const auto w8 = LoadInt(in + 8 * 4); + const auto w9 = LoadInt(in + 9 * 4); + const auto w10 = LoadInt(in + 10 * 4); + const auto w11 = LoadInt(in + 11 * 4); + const auto w12 = LoadInt(in + 12 * 4); + const auto w13 = LoadInt(in + 13 * 4); + const auto w14 = LoadInt(in + 14 * 4); + const auto w15 = LoadInt(in + 15 * 4); + const auto w16 = LoadInt(in + 16 * 4); + const auto w17 = LoadInt(in + 17 * 4); + const auto w18 = LoadInt(in + 18 * 4); out[0] = (w0) & mask; out[1] = ((w0 >> 19) | (w1 << 13)) & mask; out[2] = (w1 >> 6) & mask; @@ -976,32 +976,32 @@ inline const uint32_t* unpack19_32(const uint32_t* in, uint32_t* out){ out[30] = ((w17 >> 26) | (w18 << 6)) & mask; out[31] = w18 >> 13; - return in + (19 ); + return in + (19 * 4); } -inline const uint32_t* unpack20_32(const uint32_t* in, uint32_t* out){ +inline const uint8_t* unpack20_32(const uint8_t* in, uint32_t* out){ constexpr uint32_t mask = ((uint32_t{1} << 20) - uint32_t{1}); - const auto w0 = LoadInt(in + 0); - const auto w1 = LoadInt(in + 1); - const auto w2 = LoadInt(in + 2); - const auto w3 = LoadInt(in + 3); - const auto w4 = LoadInt(in + 4); - const auto w5 = LoadInt(in + 5); - const auto w6 = LoadInt(in + 6); - const auto w7 = LoadInt(in + 7); - const auto w8 = LoadInt(in + 8); - const auto w9 = LoadInt(in + 9); - const auto w10 = LoadInt(in + 10); - const auto w11 = LoadInt(in + 11); - const auto w12 = LoadInt(in + 12); - const auto w13 = LoadInt(in + 13); - const auto w14 = LoadInt(in + 14); - const auto w15 = LoadInt(in + 15); - const auto w16 = LoadInt(in + 16); - const auto w17 = LoadInt(in + 17); - const auto w18 = LoadInt(in + 18); - const auto w19 = LoadInt(in + 19 ); + const auto w0 = LoadInt(in + 0 * 4); + const auto w1 = LoadInt(in + 1 * 4); + const auto w2 = LoadInt(in + 2 * 4); + const auto w3 = LoadInt(in + 3 * 4); + const auto w4 = LoadInt(in + 4 * 4); + const auto w5 = LoadInt(in + 5 * 4); + const auto w6 = LoadInt(in + 6 * 4); + const auto w7 = LoadInt(in + 7 * 4); + const auto w8 = LoadInt(in + 8 * 4); + const auto w9 = LoadInt(in + 9 * 4); + const auto w10 = LoadInt(in + 10 * 4); + const auto w11 = LoadInt(in + 11 * 4); + const auto w12 = LoadInt(in + 12 * 4); + const auto w13 = LoadInt(in + 13 * 4); + const auto w14 = LoadInt(in + 14 * 4); + const auto w15 = LoadInt(in + 15 * 4); + const auto w16 = LoadInt(in + 16 * 4); + const auto w17 = LoadInt(in + 17 * 4); + const auto w18 = LoadInt(in + 18 * 4); + const auto w19 = LoadInt(in + 19 * 4); out[0] = (w0) & mask; out[1] = ((w0 >> 20) | (w1 << 12)) & mask; out[2] = (w1 >> 8) & mask; @@ -1035,33 +1035,33 @@ inline const uint32_t* unpack20_32(const uint32_t* in, uint32_t* out){ out[30] = ((w18 >> 24) | (w19 << 8)) & mask; out[31] = w19 >> 12; - return in + (20 ); + return in + (20 * 4); } -inline const uint32_t* unpack21_32(const uint32_t* in, uint32_t* out){ +inline const uint8_t* unpack21_32(const uint8_t* in, uint32_t* out){ constexpr uint32_t mask = ((uint32_t{1} << 21) - uint32_t{1}); - const auto w0 = LoadInt(in + 0); - const auto w1 = LoadInt(in + 1); - const auto w2 = LoadInt(in + 2); - const auto w3 = LoadInt(in + 3); - const auto w4 = LoadInt(in + 4); - const auto w5 = LoadInt(in + 5); - const auto w6 = LoadInt(in + 6); - const auto w7 = LoadInt(in + 7); - const auto w8 = LoadInt(in + 8); - const auto w9 = LoadInt(in + 9); - const auto w10 = LoadInt(in + 10); - const auto w11 = LoadInt(in + 11); - const auto w12 = LoadInt(in + 12); - const auto w13 = LoadInt(in + 13); - const auto w14 = LoadInt(in + 14); - const auto w15 = LoadInt(in + 15); - const auto w16 = LoadInt(in + 16); - const auto w17 = LoadInt(in + 17); - const auto w18 = LoadInt(in + 18); - const auto w19 = LoadInt(in + 19); - const auto w20 = LoadInt(in + 20 ); + const auto w0 = LoadInt(in + 0 * 4); + const auto w1 = LoadInt(in + 1 * 4); + const auto w2 = LoadInt(in + 2 * 4); + const auto w3 = LoadInt(in + 3 * 4); + const auto w4 = LoadInt(in + 4 * 4); + const auto w5 = LoadInt(in + 5 * 4); + const auto w6 = LoadInt(in + 6 * 4); + const auto w7 = LoadInt(in + 7 * 4); + const auto w8 = LoadInt(in + 8 * 4); + const auto w9 = LoadInt(in + 9 * 4); + const auto w10 = LoadInt(in + 10 * 4); + const auto w11 = LoadInt(in + 11 * 4); + const auto w12 = LoadInt(in + 12 * 4); + const auto w13 = LoadInt(in + 13 * 4); + const auto w14 = LoadInt(in + 14 * 4); + const auto w15 = LoadInt(in + 15 * 4); + const auto w16 = LoadInt(in + 16 * 4); + const auto w17 = LoadInt(in + 17 * 4); + const auto w18 = LoadInt(in + 18 * 4); + const auto w19 = LoadInt(in + 19 * 4); + const auto w20 = LoadInt(in + 20 * 4); out[0] = (w0) & mask; out[1] = ((w0 >> 21) | (w1 << 11)) & mask; out[2] = (w1 >> 10) & mask; @@ -1095,34 +1095,34 @@ inline const uint32_t* unpack21_32(const uint32_t* in, uint32_t* out){ out[30] = ((w19 >> 22) | (w20 << 10)) & mask; out[31] = w20 >> 11; - return in + (21 ); + return in + (21 * 4); } -inline const uint32_t* unpack22_32(const uint32_t* in, uint32_t* out){ +inline const uint8_t* unpack22_32(const uint8_t* in, uint32_t* out){ constexpr uint32_t mask = ((uint32_t{1} << 22) - uint32_t{1}); - const auto w0 = LoadInt(in + 0); - const auto w1 = LoadInt(in + 1); - const auto w2 = LoadInt(in + 2); - const auto w3 = LoadInt(in + 3); - const auto w4 = LoadInt(in + 4); - const auto w5 = LoadInt(in + 5); - const auto w6 = LoadInt(in + 6); - const auto w7 = LoadInt(in + 7); - const auto w8 = LoadInt(in + 8); - const auto w9 = LoadInt(in + 9); - const auto w10 = LoadInt(in + 10); - const auto w11 = LoadInt(in + 11); - const auto w12 = LoadInt(in + 12); - const auto w13 = LoadInt(in + 13); - const auto w14 = LoadInt(in + 14); - const auto w15 = LoadInt(in + 15); - const auto w16 = LoadInt(in + 16); - const auto w17 = LoadInt(in + 17); - const auto w18 = LoadInt(in + 18); - const auto w19 = LoadInt(in + 19); - const auto w20 = LoadInt(in + 20); - const auto w21 = LoadInt(in + 21 ); + const auto w0 = LoadInt(in + 0 * 4); + const auto w1 = LoadInt(in + 1 * 4); + const auto w2 = LoadInt(in + 2 * 4); + const auto w3 = LoadInt(in + 3 * 4); + const auto w4 = LoadInt(in + 4 * 4); + const auto w5 = LoadInt(in + 5 * 4); + const auto w6 = LoadInt(in + 6 * 4); + const auto w7 = LoadInt(in + 7 * 4); + const auto w8 = LoadInt(in + 8 * 4); + const auto w9 = LoadInt(in + 9 * 4); + const auto w10 = LoadInt(in + 10 * 4); + const auto w11 = LoadInt(in + 11 * 4); + const auto w12 = LoadInt(in + 12 * 4); + const auto w13 = LoadInt(in + 13 * 4); + const auto w14 = LoadInt(in + 14 * 4); + const auto w15 = LoadInt(in + 15 * 4); + const auto w16 = LoadInt(in + 16 * 4); + const auto w17 = LoadInt(in + 17 * 4); + const auto w18 = LoadInt(in + 18 * 4); + const auto w19 = LoadInt(in + 19 * 4); + const auto w20 = LoadInt(in + 20 * 4); + const auto w21 = LoadInt(in + 21 * 4); out[0] = (w0) & mask; out[1] = ((w0 >> 22) | (w1 << 10)) & mask; out[2] = ((w1 >> 12) | (w2 << 20)) & mask; @@ -1156,35 +1156,35 @@ inline const uint32_t* unpack22_32(const uint32_t* in, uint32_t* out){ out[30] = ((w20 >> 20) | (w21 << 12)) & mask; out[31] = w21 >> 10; - return in + (22 ); + return in + (22 * 4); } -inline const uint32_t* unpack23_32(const uint32_t* in, uint32_t* out){ +inline const uint8_t* unpack23_32(const uint8_t* in, uint32_t* out){ constexpr uint32_t mask = ((uint32_t{1} << 23) - uint32_t{1}); - const auto w0 = LoadInt(in + 0); - const auto w1 = LoadInt(in + 1); - const auto w2 = LoadInt(in + 2); - const auto w3 = LoadInt(in + 3); - const auto w4 = LoadInt(in + 4); - const auto w5 = LoadInt(in + 5); - const auto w6 = LoadInt(in + 6); - const auto w7 = LoadInt(in + 7); - const auto w8 = LoadInt(in + 8); - const auto w9 = LoadInt(in + 9); - const auto w10 = LoadInt(in + 10); - const auto w11 = LoadInt(in + 11); - const auto w12 = LoadInt(in + 12); - const auto w13 = LoadInt(in + 13); - const auto w14 = LoadInt(in + 14); - const auto w15 = LoadInt(in + 15); - const auto w16 = LoadInt(in + 16); - const auto w17 = LoadInt(in + 17); - const auto w18 = LoadInt(in + 18); - const auto w19 = LoadInt(in + 19); - const auto w20 = LoadInt(in + 20); - const auto w21 = LoadInt(in + 21); - const auto w22 = LoadInt(in + 22 ); + const auto w0 = LoadInt(in + 0 * 4); + const auto w1 = LoadInt(in + 1 * 4); + const auto w2 = LoadInt(in + 2 * 4); + const auto w3 = LoadInt(in + 3 * 4); + const auto w4 = LoadInt(in + 4 * 4); + const auto w5 = LoadInt(in + 5 * 4); + const auto w6 = LoadInt(in + 6 * 4); + const auto w7 = LoadInt(in + 7 * 4); + const auto w8 = LoadInt(in + 8 * 4); + const auto w9 = LoadInt(in + 9 * 4); + const auto w10 = LoadInt(in + 10 * 4); + const auto w11 = LoadInt(in + 11 * 4); + const auto w12 = LoadInt(in + 12 * 4); + const auto w13 = LoadInt(in + 13 * 4); + const auto w14 = LoadInt(in + 14 * 4); + const auto w15 = LoadInt(in + 15 * 4); + const auto w16 = LoadInt(in + 16 * 4); + const auto w17 = LoadInt(in + 17 * 4); + const auto w18 = LoadInt(in + 18 * 4); + const auto w19 = LoadInt(in + 19 * 4); + const auto w20 = LoadInt(in + 20 * 4); + const auto w21 = LoadInt(in + 21 * 4); + const auto w22 = LoadInt(in + 22 * 4); out[0] = (w0) & mask; out[1] = ((w0 >> 23) | (w1 << 9)) & mask; out[2] = ((w1 >> 14) | (w2 << 18)) & mask; @@ -1218,36 +1218,36 @@ inline const uint32_t* unpack23_32(const uint32_t* in, uint32_t* out){ out[30] = ((w21 >> 18) | (w22 << 14)) & mask; out[31] = w22 >> 9; - return in + (23 ); + return in + (23 * 4); } -inline const uint32_t* unpack24_32(const uint32_t* in, uint32_t* out){ +inline const uint8_t* unpack24_32(const uint8_t* in, uint32_t* out){ constexpr uint32_t mask = ((uint32_t{1} << 24) - uint32_t{1}); - const auto w0 = LoadInt(in + 0); - const auto w1 = LoadInt(in + 1); - const auto w2 = LoadInt(in + 2); - const auto w3 = LoadInt(in + 3); - const auto w4 = LoadInt(in + 4); - const auto w5 = LoadInt(in + 5); - const auto w6 = LoadInt(in + 6); - const auto w7 = LoadInt(in + 7); - const auto w8 = LoadInt(in + 8); - const auto w9 = LoadInt(in + 9); - const auto w10 = LoadInt(in + 10); - const auto w11 = LoadInt(in + 11); - const auto w12 = LoadInt(in + 12); - const auto w13 = LoadInt(in + 13); - const auto w14 = LoadInt(in + 14); - const auto w15 = LoadInt(in + 15); - const auto w16 = LoadInt(in + 16); - const auto w17 = LoadInt(in + 17); - const auto w18 = LoadInt(in + 18); - const auto w19 = LoadInt(in + 19); - const auto w20 = LoadInt(in + 20); - const auto w21 = LoadInt(in + 21); - const auto w22 = LoadInt(in + 22); - const auto w23 = LoadInt(in + 23 ); + const auto w0 = LoadInt(in + 0 * 4); + const auto w1 = LoadInt(in + 1 * 4); + const auto w2 = LoadInt(in + 2 * 4); + const auto w3 = LoadInt(in + 3 * 4); + const auto w4 = LoadInt(in + 4 * 4); + const auto w5 = LoadInt(in + 5 * 4); + const auto w6 = LoadInt(in + 6 * 4); + const auto w7 = LoadInt(in + 7 * 4); + const auto w8 = LoadInt(in + 8 * 4); + const auto w9 = LoadInt(in + 9 * 4); + const auto w10 = LoadInt(in + 10 * 4); + const auto w11 = LoadInt(in + 11 * 4); + const auto w12 = LoadInt(in + 12 * 4); + const auto w13 = LoadInt(in + 13 * 4); + const auto w14 = LoadInt(in + 14 * 4); + const auto w15 = LoadInt(in + 15 * 4); + const auto w16 = LoadInt(in + 16 * 4); + const auto w17 = LoadInt(in + 17 * 4); + const auto w18 = LoadInt(in + 18 * 4); + const auto w19 = LoadInt(in + 19 * 4); + const auto w20 = LoadInt(in + 20 * 4); + const auto w21 = LoadInt(in + 21 * 4); + const auto w22 = LoadInt(in + 22 * 4); + const auto w23 = LoadInt(in + 23 * 4); out[0] = (w0) & mask; out[1] = ((w0 >> 24) | (w1 << 8)) & mask; out[2] = ((w1 >> 16) | (w2 << 16)) & mask; @@ -1281,37 +1281,37 @@ inline const uint32_t* unpack24_32(const uint32_t* in, uint32_t* out){ out[30] = ((w22 >> 16) | (w23 << 16)) & mask; out[31] = w23 >> 8; - return in + (24 ); + return in + (24 * 4); } -inline const uint32_t* unpack25_32(const uint32_t* in, uint32_t* out){ +inline const uint8_t* unpack25_32(const uint8_t* in, uint32_t* out){ constexpr uint32_t mask = ((uint32_t{1} << 25) - uint32_t{1}); - const auto w0 = LoadInt(in + 0); - const auto w1 = LoadInt(in + 1); - const auto w2 = LoadInt(in + 2); - const auto w3 = LoadInt(in + 3); - const auto w4 = LoadInt(in + 4); - const auto w5 = LoadInt(in + 5); - const auto w6 = LoadInt(in + 6); - const auto w7 = LoadInt(in + 7); - const auto w8 = LoadInt(in + 8); - const auto w9 = LoadInt(in + 9); - const auto w10 = LoadInt(in + 10); - const auto w11 = LoadInt(in + 11); - const auto w12 = LoadInt(in + 12); - const auto w13 = LoadInt(in + 13); - const auto w14 = LoadInt(in + 14); - const auto w15 = LoadInt(in + 15); - const auto w16 = LoadInt(in + 16); - const auto w17 = LoadInt(in + 17); - const auto w18 = LoadInt(in + 18); - const auto w19 = LoadInt(in + 19); - const auto w20 = LoadInt(in + 20); - const auto w21 = LoadInt(in + 21); - const auto w22 = LoadInt(in + 22); - const auto w23 = LoadInt(in + 23); - const auto w24 = LoadInt(in + 24 ); + const auto w0 = LoadInt(in + 0 * 4); + const auto w1 = LoadInt(in + 1 * 4); + const auto w2 = LoadInt(in + 2 * 4); + const auto w3 = LoadInt(in + 3 * 4); + const auto w4 = LoadInt(in + 4 * 4); + const auto w5 = LoadInt(in + 5 * 4); + const auto w6 = LoadInt(in + 6 * 4); + const auto w7 = LoadInt(in + 7 * 4); + const auto w8 = LoadInt(in + 8 * 4); + const auto w9 = LoadInt(in + 9 * 4); + const auto w10 = LoadInt(in + 10 * 4); + const auto w11 = LoadInt(in + 11 * 4); + const auto w12 = LoadInt(in + 12 * 4); + const auto w13 = LoadInt(in + 13 * 4); + const auto w14 = LoadInt(in + 14 * 4); + const auto w15 = LoadInt(in + 15 * 4); + const auto w16 = LoadInt(in + 16 * 4); + const auto w17 = LoadInt(in + 17 * 4); + const auto w18 = LoadInt(in + 18 * 4); + const auto w19 = LoadInt(in + 19 * 4); + const auto w20 = LoadInt(in + 20 * 4); + const auto w21 = LoadInt(in + 21 * 4); + const auto w22 = LoadInt(in + 22 * 4); + const auto w23 = LoadInt(in + 23 * 4); + const auto w24 = LoadInt(in + 24 * 4); out[0] = (w0) & mask; out[1] = ((w0 >> 25) | (w1 << 7)) & mask; out[2] = ((w1 >> 18) | (w2 << 14)) & mask; @@ -1345,38 +1345,38 @@ inline const uint32_t* unpack25_32(const uint32_t* in, uint32_t* out){ out[30] = ((w23 >> 14) | (w24 << 18)) & mask; out[31] = w24 >> 7; - return in + (25 ); + return in + (25 * 4); } -inline const uint32_t* unpack26_32(const uint32_t* in, uint32_t* out){ +inline const uint8_t* unpack26_32(const uint8_t* in, uint32_t* out){ constexpr uint32_t mask = ((uint32_t{1} << 26) - uint32_t{1}); - const auto w0 = LoadInt(in + 0); - const auto w1 = LoadInt(in + 1); - const auto w2 = LoadInt(in + 2); - const auto w3 = LoadInt(in + 3); - const auto w4 = LoadInt(in + 4); - const auto w5 = LoadInt(in + 5); - const auto w6 = LoadInt(in + 6); - const auto w7 = LoadInt(in + 7); - const auto w8 = LoadInt(in + 8); - const auto w9 = LoadInt(in + 9); - const auto w10 = LoadInt(in + 10); - const auto w11 = LoadInt(in + 11); - const auto w12 = LoadInt(in + 12); - const auto w13 = LoadInt(in + 13); - const auto w14 = LoadInt(in + 14); - const auto w15 = LoadInt(in + 15); - const auto w16 = LoadInt(in + 16); - const auto w17 = LoadInt(in + 17); - const auto w18 = LoadInt(in + 18); - const auto w19 = LoadInt(in + 19); - const auto w20 = LoadInt(in + 20); - const auto w21 = LoadInt(in + 21); - const auto w22 = LoadInt(in + 22); - const auto w23 = LoadInt(in + 23); - const auto w24 = LoadInt(in + 24); - const auto w25 = LoadInt(in + 25 ); + const auto w0 = LoadInt(in + 0 * 4); + const auto w1 = LoadInt(in + 1 * 4); + const auto w2 = LoadInt(in + 2 * 4); + const auto w3 = LoadInt(in + 3 * 4); + const auto w4 = LoadInt(in + 4 * 4); + const auto w5 = LoadInt(in + 5 * 4); + const auto w6 = LoadInt(in + 6 * 4); + const auto w7 = LoadInt(in + 7 * 4); + const auto w8 = LoadInt(in + 8 * 4); + const auto w9 = LoadInt(in + 9 * 4); + const auto w10 = LoadInt(in + 10 * 4); + const auto w11 = LoadInt(in + 11 * 4); + const auto w12 = LoadInt(in + 12 * 4); + const auto w13 = LoadInt(in + 13 * 4); + const auto w14 = LoadInt(in + 14 * 4); + const auto w15 = LoadInt(in + 15 * 4); + const auto w16 = LoadInt(in + 16 * 4); + const auto w17 = LoadInt(in + 17 * 4); + const auto w18 = LoadInt(in + 18 * 4); + const auto w19 = LoadInt(in + 19 * 4); + const auto w20 = LoadInt(in + 20 * 4); + const auto w21 = LoadInt(in + 21 * 4); + const auto w22 = LoadInt(in + 22 * 4); + const auto w23 = LoadInt(in + 23 * 4); + const auto w24 = LoadInt(in + 24 * 4); + const auto w25 = LoadInt(in + 25 * 4); out[0] = (w0) & mask; out[1] = ((w0 >> 26) | (w1 << 6)) & mask; out[2] = ((w1 >> 20) | (w2 << 12)) & mask; @@ -1410,39 +1410,39 @@ inline const uint32_t* unpack26_32(const uint32_t* in, uint32_t* out){ out[30] = ((w24 >> 12) | (w25 << 20)) & mask; out[31] = w25 >> 6; - return in + (26 ); + return in + (26 * 4); } -inline const uint32_t* unpack27_32(const uint32_t* in, uint32_t* out){ +inline const uint8_t* unpack27_32(const uint8_t* in, uint32_t* out){ constexpr uint32_t mask = ((uint32_t{1} << 27) - uint32_t{1}); - const auto w0 = LoadInt(in + 0); - const auto w1 = LoadInt(in + 1); - const auto w2 = LoadInt(in + 2); - const auto w3 = LoadInt(in + 3); - const auto w4 = LoadInt(in + 4); - const auto w5 = LoadInt(in + 5); - const auto w6 = LoadInt(in + 6); - const auto w7 = LoadInt(in + 7); - const auto w8 = LoadInt(in + 8); - const auto w9 = LoadInt(in + 9); - const auto w10 = LoadInt(in + 10); - const auto w11 = LoadInt(in + 11); - const auto w12 = LoadInt(in + 12); - const auto w13 = LoadInt(in + 13); - const auto w14 = LoadInt(in + 14); - const auto w15 = LoadInt(in + 15); - const auto w16 = LoadInt(in + 16); - const auto w17 = LoadInt(in + 17); - const auto w18 = LoadInt(in + 18); - const auto w19 = LoadInt(in + 19); - const auto w20 = LoadInt(in + 20); - const auto w21 = LoadInt(in + 21); - const auto w22 = LoadInt(in + 22); - const auto w23 = LoadInt(in + 23); - const auto w24 = LoadInt(in + 24); - const auto w25 = LoadInt(in + 25); - const auto w26 = LoadInt(in + 26 ); + const auto w0 = LoadInt(in + 0 * 4); + const auto w1 = LoadInt(in + 1 * 4); + const auto w2 = LoadInt(in + 2 * 4); + const auto w3 = LoadInt(in + 3 * 4); + const auto w4 = LoadInt(in + 4 * 4); + const auto w5 = LoadInt(in + 5 * 4); + const auto w6 = LoadInt(in + 6 * 4); + const auto w7 = LoadInt(in + 7 * 4); + const auto w8 = LoadInt(in + 8 * 4); + const auto w9 = LoadInt(in + 9 * 4); + const auto w10 = LoadInt(in + 10 * 4); + const auto w11 = LoadInt(in + 11 * 4); + const auto w12 = LoadInt(in + 12 * 4); + const auto w13 = LoadInt(in + 13 * 4); + const auto w14 = LoadInt(in + 14 * 4); + const auto w15 = LoadInt(in + 15 * 4); + const auto w16 = LoadInt(in + 16 * 4); + const auto w17 = LoadInt(in + 17 * 4); + const auto w18 = LoadInt(in + 18 * 4); + const auto w19 = LoadInt(in + 19 * 4); + const auto w20 = LoadInt(in + 20 * 4); + const auto w21 = LoadInt(in + 21 * 4); + const auto w22 = LoadInt(in + 22 * 4); + const auto w23 = LoadInt(in + 23 * 4); + const auto w24 = LoadInt(in + 24 * 4); + const auto w25 = LoadInt(in + 25 * 4); + const auto w26 = LoadInt(in + 26 * 4); out[0] = (w0) & mask; out[1] = ((w0 >> 27) | (w1 << 5)) & mask; out[2] = ((w1 >> 22) | (w2 << 10)) & mask; @@ -1476,40 +1476,40 @@ inline const uint32_t* unpack27_32(const uint32_t* in, uint32_t* out){ out[30] = ((w25 >> 10) | (w26 << 22)) & mask; out[31] = w26 >> 5; - return in + (27 ); + return in + (27 * 4); } -inline const uint32_t* unpack28_32(const uint32_t* in, uint32_t* out){ +inline const uint8_t* unpack28_32(const uint8_t* in, uint32_t* out){ constexpr uint32_t mask = ((uint32_t{1} << 28) - uint32_t{1}); - const auto w0 = LoadInt(in + 0); - const auto w1 = LoadInt(in + 1); - const auto w2 = LoadInt(in + 2); - const auto w3 = LoadInt(in + 3); - const auto w4 = LoadInt(in + 4); - const auto w5 = LoadInt(in + 5); - const auto w6 = LoadInt(in + 6); - const auto w7 = LoadInt(in + 7); - const auto w8 = LoadInt(in + 8); - const auto w9 = LoadInt(in + 9); - const auto w10 = LoadInt(in + 10); - const auto w11 = LoadInt(in + 11); - const auto w12 = LoadInt(in + 12); - const auto w13 = LoadInt(in + 13); - const auto w14 = LoadInt(in + 14); - const auto w15 = LoadInt(in + 15); - const auto w16 = LoadInt(in + 16); - const auto w17 = LoadInt(in + 17); - const auto w18 = LoadInt(in + 18); - const auto w19 = LoadInt(in + 19); - const auto w20 = LoadInt(in + 20); - const auto w21 = LoadInt(in + 21); - const auto w22 = LoadInt(in + 22); - const auto w23 = LoadInt(in + 23); - const auto w24 = LoadInt(in + 24); - const auto w25 = LoadInt(in + 25); - const auto w26 = LoadInt(in + 26); - const auto w27 = LoadInt(in + 27 ); + const auto w0 = LoadInt(in + 0 * 4); + const auto w1 = LoadInt(in + 1 * 4); + const auto w2 = LoadInt(in + 2 * 4); + const auto w3 = LoadInt(in + 3 * 4); + const auto w4 = LoadInt(in + 4 * 4); + const auto w5 = LoadInt(in + 5 * 4); + const auto w6 = LoadInt(in + 6 * 4); + const auto w7 = LoadInt(in + 7 * 4); + const auto w8 = LoadInt(in + 8 * 4); + const auto w9 = LoadInt(in + 9 * 4); + const auto w10 = LoadInt(in + 10 * 4); + const auto w11 = LoadInt(in + 11 * 4); + const auto w12 = LoadInt(in + 12 * 4); + const auto w13 = LoadInt(in + 13 * 4); + const auto w14 = LoadInt(in + 14 * 4); + const auto w15 = LoadInt(in + 15 * 4); + const auto w16 = LoadInt(in + 16 * 4); + const auto w17 = LoadInt(in + 17 * 4); + const auto w18 = LoadInt(in + 18 * 4); + const auto w19 = LoadInt(in + 19 * 4); + const auto w20 = LoadInt(in + 20 * 4); + const auto w21 = LoadInt(in + 21 * 4); + const auto w22 = LoadInt(in + 22 * 4); + const auto w23 = LoadInt(in + 23 * 4); + const auto w24 = LoadInt(in + 24 * 4); + const auto w25 = LoadInt(in + 25 * 4); + const auto w26 = LoadInt(in + 26 * 4); + const auto w27 = LoadInt(in + 27 * 4); out[0] = (w0) & mask; out[1] = ((w0 >> 28) | (w1 << 4)) & mask; out[2] = ((w1 >> 24) | (w2 << 8)) & mask; @@ -1543,41 +1543,41 @@ inline const uint32_t* unpack28_32(const uint32_t* in, uint32_t* out){ out[30] = ((w26 >> 8) | (w27 << 24)) & mask; out[31] = w27 >> 4; - return in + (28 ); + return in + (28 * 4); } -inline const uint32_t* unpack29_32(const uint32_t* in, uint32_t* out){ +inline const uint8_t* unpack29_32(const uint8_t* in, uint32_t* out){ constexpr uint32_t mask = ((uint32_t{1} << 29) - uint32_t{1}); - const auto w0 = LoadInt(in + 0); - const auto w1 = LoadInt(in + 1); - const auto w2 = LoadInt(in + 2); - const auto w3 = LoadInt(in + 3); - const auto w4 = LoadInt(in + 4); - const auto w5 = LoadInt(in + 5); - const auto w6 = LoadInt(in + 6); - const auto w7 = LoadInt(in + 7); - const auto w8 = LoadInt(in + 8); - const auto w9 = LoadInt(in + 9); - const auto w10 = LoadInt(in + 10); - const auto w11 = LoadInt(in + 11); - const auto w12 = LoadInt(in + 12); - const auto w13 = LoadInt(in + 13); - const auto w14 = LoadInt(in + 14); - const auto w15 = LoadInt(in + 15); - const auto w16 = LoadInt(in + 16); - const auto w17 = LoadInt(in + 17); - const auto w18 = LoadInt(in + 18); - const auto w19 = LoadInt(in + 19); - const auto w20 = LoadInt(in + 20); - const auto w21 = LoadInt(in + 21); - const auto w22 = LoadInt(in + 22); - const auto w23 = LoadInt(in + 23); - const auto w24 = LoadInt(in + 24); - const auto w25 = LoadInt(in + 25); - const auto w26 = LoadInt(in + 26); - const auto w27 = LoadInt(in + 27); - const auto w28 = LoadInt(in + 28 ); + const auto w0 = LoadInt(in + 0 * 4); + const auto w1 = LoadInt(in + 1 * 4); + const auto w2 = LoadInt(in + 2 * 4); + const auto w3 = LoadInt(in + 3 * 4); + const auto w4 = LoadInt(in + 4 * 4); + const auto w5 = LoadInt(in + 5 * 4); + const auto w6 = LoadInt(in + 6 * 4); + const auto w7 = LoadInt(in + 7 * 4); + const auto w8 = LoadInt(in + 8 * 4); + const auto w9 = LoadInt(in + 9 * 4); + const auto w10 = LoadInt(in + 10 * 4); + const auto w11 = LoadInt(in + 11 * 4); + const auto w12 = LoadInt(in + 12 * 4); + const auto w13 = LoadInt(in + 13 * 4); + const auto w14 = LoadInt(in + 14 * 4); + const auto w15 = LoadInt(in + 15 * 4); + const auto w16 = LoadInt(in + 16 * 4); + const auto w17 = LoadInt(in + 17 * 4); + const auto w18 = LoadInt(in + 18 * 4); + const auto w19 = LoadInt(in + 19 * 4); + const auto w20 = LoadInt(in + 20 * 4); + const auto w21 = LoadInt(in + 21 * 4); + const auto w22 = LoadInt(in + 22 * 4); + const auto w23 = LoadInt(in + 23 * 4); + const auto w24 = LoadInt(in + 24 * 4); + const auto w25 = LoadInt(in + 25 * 4); + const auto w26 = LoadInt(in + 26 * 4); + const auto w27 = LoadInt(in + 27 * 4); + const auto w28 = LoadInt(in + 28 * 4); out[0] = (w0) & mask; out[1] = ((w0 >> 29) | (w1 << 3)) & mask; out[2] = ((w1 >> 26) | (w2 << 6)) & mask; @@ -1611,42 +1611,42 @@ inline const uint32_t* unpack29_32(const uint32_t* in, uint32_t* out){ out[30] = ((w27 >> 6) | (w28 << 26)) & mask; out[31] = w28 >> 3; - return in + (29 ); + return in + (29 * 4); } -inline const uint32_t* unpack30_32(const uint32_t* in, uint32_t* out){ +inline const uint8_t* unpack30_32(const uint8_t* in, uint32_t* out){ constexpr uint32_t mask = ((uint32_t{1} << 30) - uint32_t{1}); - const auto w0 = LoadInt(in + 0); - const auto w1 = LoadInt(in + 1); - const auto w2 = LoadInt(in + 2); - const auto w3 = LoadInt(in + 3); - const auto w4 = LoadInt(in + 4); - const auto w5 = LoadInt(in + 5); - const auto w6 = LoadInt(in + 6); - const auto w7 = LoadInt(in + 7); - const auto w8 = LoadInt(in + 8); - const auto w9 = LoadInt(in + 9); - const auto w10 = LoadInt(in + 10); - const auto w11 = LoadInt(in + 11); - const auto w12 = LoadInt(in + 12); - const auto w13 = LoadInt(in + 13); - const auto w14 = LoadInt(in + 14); - const auto w15 = LoadInt(in + 15); - const auto w16 = LoadInt(in + 16); - const auto w17 = LoadInt(in + 17); - const auto w18 = LoadInt(in + 18); - const auto w19 = LoadInt(in + 19); - const auto w20 = LoadInt(in + 20); - const auto w21 = LoadInt(in + 21); - const auto w22 = LoadInt(in + 22); - const auto w23 = LoadInt(in + 23); - const auto w24 = LoadInt(in + 24); - const auto w25 = LoadInt(in + 25); - const auto w26 = LoadInt(in + 26); - const auto w27 = LoadInt(in + 27); - const auto w28 = LoadInt(in + 28); - const auto w29 = LoadInt(in + 29 ); + const auto w0 = LoadInt(in + 0 * 4); + const auto w1 = LoadInt(in + 1 * 4); + const auto w2 = LoadInt(in + 2 * 4); + const auto w3 = LoadInt(in + 3 * 4); + const auto w4 = LoadInt(in + 4 * 4); + const auto w5 = LoadInt(in + 5 * 4); + const auto w6 = LoadInt(in + 6 * 4); + const auto w7 = LoadInt(in + 7 * 4); + const auto w8 = LoadInt(in + 8 * 4); + const auto w9 = LoadInt(in + 9 * 4); + const auto w10 = LoadInt(in + 10 * 4); + const auto w11 = LoadInt(in + 11 * 4); + const auto w12 = LoadInt(in + 12 * 4); + const auto w13 = LoadInt(in + 13 * 4); + const auto w14 = LoadInt(in + 14 * 4); + const auto w15 = LoadInt(in + 15 * 4); + const auto w16 = LoadInt(in + 16 * 4); + const auto w17 = LoadInt(in + 17 * 4); + const auto w18 = LoadInt(in + 18 * 4); + const auto w19 = LoadInt(in + 19 * 4); + const auto w20 = LoadInt(in + 20 * 4); + const auto w21 = LoadInt(in + 21 * 4); + const auto w22 = LoadInt(in + 22 * 4); + const auto w23 = LoadInt(in + 23 * 4); + const auto w24 = LoadInt(in + 24 * 4); + const auto w25 = LoadInt(in + 25 * 4); + const auto w26 = LoadInt(in + 26 * 4); + const auto w27 = LoadInt(in + 27 * 4); + const auto w28 = LoadInt(in + 28 * 4); + const auto w29 = LoadInt(in + 29 * 4); out[0] = (w0) & mask; out[1] = ((w0 >> 30) | (w1 << 2)) & mask; out[2] = ((w1 >> 28) | (w2 << 4)) & mask; @@ -1680,43 +1680,43 @@ inline const uint32_t* unpack30_32(const uint32_t* in, uint32_t* out){ out[30] = ((w28 >> 4) | (w29 << 28)) & mask; out[31] = w29 >> 2; - return in + (30 ); + return in + (30 * 4); } -inline const uint32_t* unpack31_32(const uint32_t* in, uint32_t* out){ +inline const uint8_t* unpack31_32(const uint8_t* in, uint32_t* out){ constexpr uint32_t mask = ((uint32_t{1} << 31) - uint32_t{1}); - const auto w0 = LoadInt(in + 0); - const auto w1 = LoadInt(in + 1); - const auto w2 = LoadInt(in + 2); - const auto w3 = LoadInt(in + 3); - const auto w4 = LoadInt(in + 4); - const auto w5 = LoadInt(in + 5); - const auto w6 = LoadInt(in + 6); - const auto w7 = LoadInt(in + 7); - const auto w8 = LoadInt(in + 8); - const auto w9 = LoadInt(in + 9); - const auto w10 = LoadInt(in + 10); - const auto w11 = LoadInt(in + 11); - const auto w12 = LoadInt(in + 12); - const auto w13 = LoadInt(in + 13); - const auto w14 = LoadInt(in + 14); - const auto w15 = LoadInt(in + 15); - const auto w16 = LoadInt(in + 16); - const auto w17 = LoadInt(in + 17); - const auto w18 = LoadInt(in + 18); - const auto w19 = LoadInt(in + 19); - const auto w20 = LoadInt(in + 20); - const auto w21 = LoadInt(in + 21); - const auto w22 = LoadInt(in + 22); - const auto w23 = LoadInt(in + 23); - const auto w24 = LoadInt(in + 24); - const auto w25 = LoadInt(in + 25); - const auto w26 = LoadInt(in + 26); - const auto w27 = LoadInt(in + 27); - const auto w28 = LoadInt(in + 28); - const auto w29 = LoadInt(in + 29); - const auto w30 = LoadInt(in + 30 ); + const auto w0 = LoadInt(in + 0 * 4); + const auto w1 = LoadInt(in + 1 * 4); + const auto w2 = LoadInt(in + 2 * 4); + const auto w3 = LoadInt(in + 3 * 4); + const auto w4 = LoadInt(in + 4 * 4); + const auto w5 = LoadInt(in + 5 * 4); + const auto w6 = LoadInt(in + 6 * 4); + const auto w7 = LoadInt(in + 7 * 4); + const auto w8 = LoadInt(in + 8 * 4); + const auto w9 = LoadInt(in + 9 * 4); + const auto w10 = LoadInt(in + 10 * 4); + const auto w11 = LoadInt(in + 11 * 4); + const auto w12 = LoadInt(in + 12 * 4); + const auto w13 = LoadInt(in + 13 * 4); + const auto w14 = LoadInt(in + 14 * 4); + const auto w15 = LoadInt(in + 15 * 4); + const auto w16 = LoadInt(in + 16 * 4); + const auto w17 = LoadInt(in + 17 * 4); + const auto w18 = LoadInt(in + 18 * 4); + const auto w19 = LoadInt(in + 19 * 4); + const auto w20 = LoadInt(in + 20 * 4); + const auto w21 = LoadInt(in + 21 * 4); + const auto w22 = LoadInt(in + 22 * 4); + const auto w23 = LoadInt(in + 23 * 4); + const auto w24 = LoadInt(in + 24 * 4); + const auto w25 = LoadInt(in + 25 * 4); + const auto w26 = LoadInt(in + 26 * 4); + const auto w27 = LoadInt(in + 27 * 4); + const auto w28 = LoadInt(in + 28 * 4); + const auto w29 = LoadInt(in + 29 * 4); + const auto w30 = LoadInt(in + 30 * 4); out[0] = (w0) & mask; out[1] = ((w0 >> 31) | (w1 << 1)) & mask; out[2] = ((w1 >> 30) | (w2 << 2)) & mask; @@ -1750,25 +1750,25 @@ inline const uint32_t* unpack31_32(const uint32_t* in, uint32_t* out){ out[30] = ((w29 >> 2) | (w30 << 30)) & mask; out[31] = w30 >> 1; - return in + (31 ); + return in + (31 * 4); } -inline const uint32_t* unpack32_32(const uint32_t* in, uint32_t* out){ +inline const uint8_t* unpack32_32(const uint8_t* in, uint32_t* out){ for(int k = 0; k < 32; k += 1) { - out[k] = LoadInt(in + (k )); + out[k] = LoadInt(in + (k * 4)); } - return in + ( 32); + return in + (4 * 32); } -inline const uint64_t* unpack0_64(const uint64_t* in, uint64_t* out){ +inline const uint8_t* unpack0_64(const uint8_t* in, uint64_t* out){ std::memset(out, 0, 32 * 8); return in; } -inline const uint64_t* unpack1_64(const uint64_t* in, uint64_t* out){ +inline const uint8_t* unpack1_64(const uint8_t* in, uint64_t* out){ constexpr uint64_t mask = ((uint64_t{1} << 1) - uint64_t{1}); - const auto w0 = static_cast(LoadInt(reinterpret_cast(in + 0))); + const auto w0 = static_cast(LoadInt(in + 0 * 8)); out[0] = (w0) & mask; out[1] = (w0 >> 1) & mask; out[2] = (w0 >> 2) & mask; @@ -1802,13 +1802,13 @@ inline const uint64_t* unpack1_64(const uint64_t* in, uint64_t* out){ out[30] = (w0 >> 30) & mask; out[31] = (w0 >> 31) & mask; - return in + (0); + return in + (0 * 8 + 4); } -inline const uint64_t* unpack2_64(const uint64_t* in, uint64_t* out){ +inline const uint8_t* unpack2_64(const uint8_t* in, uint64_t* out){ constexpr uint64_t mask = ((uint64_t{1} << 2) - uint64_t{1}); - const auto w0 = LoadInt(in + 0 ); + const auto w0 = LoadInt(in + 0 * 8); out[0] = (w0) & mask; out[1] = (w0 >> 2) & mask; out[2] = (w0 >> 4) & mask; @@ -1842,14 +1842,14 @@ inline const uint64_t* unpack2_64(const uint64_t* in, uint64_t* out){ out[30] = (w0 >> 60) & mask; out[31] = w0 >> 62; - return in + (1 ); + return in + (1 * 8); } -inline const uint64_t* unpack3_64(const uint64_t* in, uint64_t* out){ +inline const uint8_t* unpack3_64(const uint8_t* in, uint64_t* out){ constexpr uint64_t mask = ((uint64_t{1} << 3) - uint64_t{1}); - const auto w0 = LoadInt(in + 0); - const auto w1 = static_cast(LoadInt(reinterpret_cast(in + 1))); + const auto w0 = LoadInt(in + 0 * 8); + const auto w1 = static_cast(LoadInt(in + 1 * 8)); out[0] = (w0) & mask; out[1] = (w0 >> 3) & mask; out[2] = (w0 >> 6) & mask; @@ -1883,14 +1883,14 @@ inline const uint64_t* unpack3_64(const uint64_t* in, uint64_t* out){ out[30] = (w1 >> 26) & mask; out[31] = (w1 >> 29) & mask; - return in + (1); + return in + (1 * 8 + 4); } -inline const uint64_t* unpack4_64(const uint64_t* in, uint64_t* out){ +inline const uint8_t* unpack4_64(const uint8_t* in, uint64_t* out){ constexpr uint64_t mask = ((uint64_t{1} << 4) - uint64_t{1}); - const auto w0 = LoadInt(in + 0); - const auto w1 = LoadInt(in + 1 ); + const auto w0 = LoadInt(in + 0 * 8); + const auto w1 = LoadInt(in + 1 * 8); out[0] = (w0) & mask; out[1] = (w0 >> 4) & mask; out[2] = (w0 >> 8) & mask; @@ -1924,15 +1924,15 @@ inline const uint64_t* unpack4_64(const uint64_t* in, uint64_t* out){ out[30] = (w1 >> 56) & mask; out[31] = w1 >> 60; - return in + (2 ); + return in + (2 * 8); } -inline const uint64_t* unpack5_64(const uint64_t* in, uint64_t* out){ +inline const uint8_t* unpack5_64(const uint8_t* in, uint64_t* out){ constexpr uint64_t mask = ((uint64_t{1} << 5) - uint64_t{1}); - const auto w0 = LoadInt(in + 0); - const auto w1 = LoadInt(in + 1); - const auto w2 = static_cast(LoadInt(reinterpret_cast(in + 2))); + const auto w0 = LoadInt(in + 0 * 8); + const auto w1 = LoadInt(in + 1 * 8); + const auto w2 = static_cast(LoadInt(in + 2 * 8)); out[0] = (w0) & mask; out[1] = (w0 >> 5) & mask; out[2] = (w0 >> 10) & mask; @@ -1966,15 +1966,15 @@ inline const uint64_t* unpack5_64(const uint64_t* in, uint64_t* out){ out[30] = (w2 >> 22) & mask; out[31] = (w2 >> 27) & mask; - return in + (2); + return in + (2 * 8 + 4); } -inline const uint64_t* unpack6_64(const uint64_t* in, uint64_t* out){ +inline const uint8_t* unpack6_64(const uint8_t* in, uint64_t* out){ constexpr uint64_t mask = ((uint64_t{1} << 6) - uint64_t{1}); - const auto w0 = LoadInt(in + 0); - const auto w1 = LoadInt(in + 1); - const auto w2 = LoadInt(in + 2 ); + const auto w0 = LoadInt(in + 0 * 8); + const auto w1 = LoadInt(in + 1 * 8); + const auto w2 = LoadInt(in + 2 * 8); out[0] = (w0) & mask; out[1] = (w0 >> 6) & mask; out[2] = (w0 >> 12) & mask; @@ -2008,16 +2008,16 @@ inline const uint64_t* unpack6_64(const uint64_t* in, uint64_t* out){ out[30] = (w2 >> 52) & mask; out[31] = w2 >> 58; - return in + (3 ); + return in + (3 * 8); } -inline const uint64_t* unpack7_64(const uint64_t* in, uint64_t* out){ +inline const uint8_t* unpack7_64(const uint8_t* in, uint64_t* out){ constexpr uint64_t mask = ((uint64_t{1} << 7) - uint64_t{1}); - const auto w0 = LoadInt(in + 0); - const auto w1 = LoadInt(in + 1); - const auto w2 = LoadInt(in + 2); - const auto w3 = static_cast(LoadInt(reinterpret_cast(in + 3))); + const auto w0 = LoadInt(in + 0 * 8); + const auto w1 = LoadInt(in + 1 * 8); + const auto w2 = LoadInt(in + 2 * 8); + const auto w3 = static_cast(LoadInt(in + 3 * 8)); out[0] = (w0) & mask; out[1] = (w0 >> 7) & mask; out[2] = (w0 >> 14) & mask; @@ -2051,16 +2051,16 @@ inline const uint64_t* unpack7_64(const uint64_t* in, uint64_t* out){ out[30] = (w3 >> 18) & mask; out[31] = (w3 >> 25) & mask; - return in + (3); + return in + (3 * 8 + 4); } -inline const uint64_t* unpack8_64(const uint64_t* in, uint64_t* out){ +inline const uint8_t* unpack8_64(const uint8_t* in, uint64_t* out){ constexpr uint64_t mask = ((uint64_t{1} << 8) - uint64_t{1}); - const auto w0 = LoadInt(in + 0); - const auto w1 = LoadInt(in + 1); - const auto w2 = LoadInt(in + 2); - const auto w3 = LoadInt(in + 3 ); + const auto w0 = LoadInt(in + 0 * 8); + const auto w1 = LoadInt(in + 1 * 8); + const auto w2 = LoadInt(in + 2 * 8); + const auto w3 = LoadInt(in + 3 * 8); out[0] = (w0) & mask; out[1] = (w0 >> 8) & mask; out[2] = (w0 >> 16) & mask; @@ -2094,17 +2094,17 @@ inline const uint64_t* unpack8_64(const uint64_t* in, uint64_t* out){ out[30] = (w3 >> 48) & mask; out[31] = w3 >> 56; - return in + (4 ); + return in + (4 * 8); } -inline const uint64_t* unpack9_64(const uint64_t* in, uint64_t* out){ +inline const uint8_t* unpack9_64(const uint8_t* in, uint64_t* out){ constexpr uint64_t mask = ((uint64_t{1} << 9) - uint64_t{1}); - const auto w0 = LoadInt(in + 0); - const auto w1 = LoadInt(in + 1); - const auto w2 = LoadInt(in + 2); - const auto w3 = LoadInt(in + 3); - const auto w4 = static_cast(LoadInt(reinterpret_cast(in + 4))); + const auto w0 = LoadInt(in + 0 * 8); + const auto w1 = LoadInt(in + 1 * 8); + const auto w2 = LoadInt(in + 2 * 8); + const auto w3 = LoadInt(in + 3 * 8); + const auto w4 = static_cast(LoadInt(in + 4 * 8)); out[0] = (w0) & mask; out[1] = (w0 >> 9) & mask; out[2] = (w0 >> 18) & mask; @@ -2138,17 +2138,17 @@ inline const uint64_t* unpack9_64(const uint64_t* in, uint64_t* out){ out[30] = (w4 >> 14) & mask; out[31] = (w4 >> 23) & mask; - return in + (4); + return in + (4 * 8 + 4); } -inline const uint64_t* unpack10_64(const uint64_t* in, uint64_t* out){ +inline const uint8_t* unpack10_64(const uint8_t* in, uint64_t* out){ constexpr uint64_t mask = ((uint64_t{1} << 10) - uint64_t{1}); - const auto w0 = LoadInt(in + 0); - const auto w1 = LoadInt(in + 1); - const auto w2 = LoadInt(in + 2); - const auto w3 = LoadInt(in + 3); - const auto w4 = LoadInt(in + 4 ); + const auto w0 = LoadInt(in + 0 * 8); + const auto w1 = LoadInt(in + 1 * 8); + const auto w2 = LoadInt(in + 2 * 8); + const auto w3 = LoadInt(in + 3 * 8); + const auto w4 = LoadInt(in + 4 * 8); out[0] = (w0) & mask; out[1] = (w0 >> 10) & mask; out[2] = (w0 >> 20) & mask; @@ -2182,18 +2182,18 @@ inline const uint64_t* unpack10_64(const uint64_t* in, uint64_t* out){ out[30] = (w4 >> 44) & mask; out[31] = w4 >> 54; - return in + (5 ); + return in + (5 * 8); } -inline const uint64_t* unpack11_64(const uint64_t* in, uint64_t* out){ +inline const uint8_t* unpack11_64(const uint8_t* in, uint64_t* out){ constexpr uint64_t mask = ((uint64_t{1} << 11) - uint64_t{1}); - const auto w0 = LoadInt(in + 0); - const auto w1 = LoadInt(in + 1); - const auto w2 = LoadInt(in + 2); - const auto w3 = LoadInt(in + 3); - const auto w4 = LoadInt(in + 4); - const auto w5 = static_cast(LoadInt(reinterpret_cast(in + 5))); + const auto w0 = LoadInt(in + 0 * 8); + const auto w1 = LoadInt(in + 1 * 8); + const auto w2 = LoadInt(in + 2 * 8); + const auto w3 = LoadInt(in + 3 * 8); + const auto w4 = LoadInt(in + 4 * 8); + const auto w5 = static_cast(LoadInt(in + 5 * 8)); out[0] = (w0) & mask; out[1] = (w0 >> 11) & mask; out[2] = (w0 >> 22) & mask; @@ -2227,18 +2227,18 @@ inline const uint64_t* unpack11_64(const uint64_t* in, uint64_t* out){ out[30] = (w5 >> 10) & mask; out[31] = (w5 >> 21) & mask; - return in + (5); + return in + (5 * 8 + 4); } -inline const uint64_t* unpack12_64(const uint64_t* in, uint64_t* out){ +inline const uint8_t* unpack12_64(const uint8_t* in, uint64_t* out){ constexpr uint64_t mask = ((uint64_t{1} << 12) - uint64_t{1}); - const auto w0 = LoadInt(in + 0); - const auto w1 = LoadInt(in + 1); - const auto w2 = LoadInt(in + 2); - const auto w3 = LoadInt(in + 3); - const auto w4 = LoadInt(in + 4); - const auto w5 = LoadInt(in + 5 ); + const auto w0 = LoadInt(in + 0 * 8); + const auto w1 = LoadInt(in + 1 * 8); + const auto w2 = LoadInt(in + 2 * 8); + const auto w3 = LoadInt(in + 3 * 8); + const auto w4 = LoadInt(in + 4 * 8); + const auto w5 = LoadInt(in + 5 * 8); out[0] = (w0) & mask; out[1] = (w0 >> 12) & mask; out[2] = (w0 >> 24) & mask; @@ -2272,19 +2272,19 @@ inline const uint64_t* unpack12_64(const uint64_t* in, uint64_t* out){ out[30] = (w5 >> 40) & mask; out[31] = w5 >> 52; - return in + (6 ); + return in + (6 * 8); } -inline const uint64_t* unpack13_64(const uint64_t* in, uint64_t* out){ +inline const uint8_t* unpack13_64(const uint8_t* in, uint64_t* out){ constexpr uint64_t mask = ((uint64_t{1} << 13) - uint64_t{1}); - const auto w0 = LoadInt(in + 0); - const auto w1 = LoadInt(in + 1); - const auto w2 = LoadInt(in + 2); - const auto w3 = LoadInt(in + 3); - const auto w4 = LoadInt(in + 4); - const auto w5 = LoadInt(in + 5); - const auto w6 = static_cast(LoadInt(reinterpret_cast(in + 6))); + const auto w0 = LoadInt(in + 0 * 8); + const auto w1 = LoadInt(in + 1 * 8); + const auto w2 = LoadInt(in + 2 * 8); + const auto w3 = LoadInt(in + 3 * 8); + const auto w4 = LoadInt(in + 4 * 8); + const auto w5 = LoadInt(in + 5 * 8); + const auto w6 = static_cast(LoadInt(in + 6 * 8)); out[0] = (w0) & mask; out[1] = (w0 >> 13) & mask; out[2] = (w0 >> 26) & mask; @@ -2318,19 +2318,19 @@ inline const uint64_t* unpack13_64(const uint64_t* in, uint64_t* out){ out[30] = (w6 >> 6) & mask; out[31] = (w6 >> 19) & mask; - return in + (6); + return in + (6 * 8 + 4); } -inline const uint64_t* unpack14_64(const uint64_t* in, uint64_t* out){ +inline const uint8_t* unpack14_64(const uint8_t* in, uint64_t* out){ constexpr uint64_t mask = ((uint64_t{1} << 14) - uint64_t{1}); - const auto w0 = LoadInt(in + 0); - const auto w1 = LoadInt(in + 1); - const auto w2 = LoadInt(in + 2); - const auto w3 = LoadInt(in + 3); - const auto w4 = LoadInt(in + 4); - const auto w5 = LoadInt(in + 5); - const auto w6 = LoadInt(in + 6 ); + const auto w0 = LoadInt(in + 0 * 8); + const auto w1 = LoadInt(in + 1 * 8); + const auto w2 = LoadInt(in + 2 * 8); + const auto w3 = LoadInt(in + 3 * 8); + const auto w4 = LoadInt(in + 4 * 8); + const auto w5 = LoadInt(in + 5 * 8); + const auto w6 = LoadInt(in + 6 * 8); out[0] = (w0) & mask; out[1] = (w0 >> 14) & mask; out[2] = (w0 >> 28) & mask; @@ -2364,20 +2364,20 @@ inline const uint64_t* unpack14_64(const uint64_t* in, uint64_t* out){ out[30] = (w6 >> 36) & mask; out[31] = w6 >> 50; - return in + (7 ); + return in + (7 * 8); } -inline const uint64_t* unpack15_64(const uint64_t* in, uint64_t* out){ +inline const uint8_t* unpack15_64(const uint8_t* in, uint64_t* out){ constexpr uint64_t mask = ((uint64_t{1} << 15) - uint64_t{1}); - const auto w0 = LoadInt(in + 0); - const auto w1 = LoadInt(in + 1); - const auto w2 = LoadInt(in + 2); - const auto w3 = LoadInt(in + 3); - const auto w4 = LoadInt(in + 4); - const auto w5 = LoadInt(in + 5); - const auto w6 = LoadInt(in + 6); - const auto w7 = static_cast(LoadInt(reinterpret_cast(in + 7))); + const auto w0 = LoadInt(in + 0 * 8); + const auto w1 = LoadInt(in + 1 * 8); + const auto w2 = LoadInt(in + 2 * 8); + const auto w3 = LoadInt(in + 3 * 8); + const auto w4 = LoadInt(in + 4 * 8); + const auto w5 = LoadInt(in + 5 * 8); + const auto w6 = LoadInt(in + 6 * 8); + const auto w7 = static_cast(LoadInt(in + 7 * 8)); out[0] = (w0) & mask; out[1] = (w0 >> 15) & mask; out[2] = (w0 >> 30) & mask; @@ -2411,20 +2411,20 @@ inline const uint64_t* unpack15_64(const uint64_t* in, uint64_t* out){ out[30] = (w7 >> 2) & mask; out[31] = (w7 >> 17) & mask; - return in + (7); + return in + (7 * 8 + 4); } -inline const uint64_t* unpack16_64(const uint64_t* in, uint64_t* out){ +inline const uint8_t* unpack16_64(const uint8_t* in, uint64_t* out){ constexpr uint64_t mask = ((uint64_t{1} << 16) - uint64_t{1}); - const auto w0 = LoadInt(in + 0); - const auto w1 = LoadInt(in + 1); - const auto w2 = LoadInt(in + 2); - const auto w3 = LoadInt(in + 3); - const auto w4 = LoadInt(in + 4); - const auto w5 = LoadInt(in + 5); - const auto w6 = LoadInt(in + 6); - const auto w7 = LoadInt(in + 7 ); + const auto w0 = LoadInt(in + 0 * 8); + const auto w1 = LoadInt(in + 1 * 8); + const auto w2 = LoadInt(in + 2 * 8); + const auto w3 = LoadInt(in + 3 * 8); + const auto w4 = LoadInt(in + 4 * 8); + const auto w5 = LoadInt(in + 5 * 8); + const auto w6 = LoadInt(in + 6 * 8); + const auto w7 = LoadInt(in + 7 * 8); out[0] = (w0) & mask; out[1] = (w0 >> 16) & mask; out[2] = (w0 >> 32) & mask; @@ -2458,21 +2458,21 @@ inline const uint64_t* unpack16_64(const uint64_t* in, uint64_t* out){ out[30] = (w7 >> 32) & mask; out[31] = w7 >> 48; - return in + (8 ); + return in + (8 * 8); } -inline const uint64_t* unpack17_64(const uint64_t* in, uint64_t* out){ +inline const uint8_t* unpack17_64(const uint8_t* in, uint64_t* out){ constexpr uint64_t mask = ((uint64_t{1} << 17) - uint64_t{1}); - const auto w0 = LoadInt(in + 0); - const auto w1 = LoadInt(in + 1); - const auto w2 = LoadInt(in + 2); - const auto w3 = LoadInt(in + 3); - const auto w4 = LoadInt(in + 4); - const auto w5 = LoadInt(in + 5); - const auto w6 = LoadInt(in + 6); - const auto w7 = LoadInt(in + 7); - const auto w8 = static_cast(LoadInt(reinterpret_cast(in + 8))); + const auto w0 = LoadInt(in + 0 * 8); + const auto w1 = LoadInt(in + 1 * 8); + const auto w2 = LoadInt(in + 2 * 8); + const auto w3 = LoadInt(in + 3 * 8); + const auto w4 = LoadInt(in + 4 * 8); + const auto w5 = LoadInt(in + 5 * 8); + const auto w6 = LoadInt(in + 6 * 8); + const auto w7 = LoadInt(in + 7 * 8); + const auto w8 = static_cast(LoadInt(in + 8 * 8)); out[0] = (w0) & mask; out[1] = (w0 >> 17) & mask; out[2] = (w0 >> 34) & mask; @@ -2506,21 +2506,21 @@ inline const uint64_t* unpack17_64(const uint64_t* in, uint64_t* out){ out[30] = ((w7 >> 62) | (w8 << 2)) & mask; out[31] = (w8 >> 15) & mask; - return in + (8); + return in + (8 * 8 + 4); } -inline const uint64_t* unpack18_64(const uint64_t* in, uint64_t* out){ +inline const uint8_t* unpack18_64(const uint8_t* in, uint64_t* out){ constexpr uint64_t mask = ((uint64_t{1} << 18) - uint64_t{1}); - const auto w0 = LoadInt(in + 0); - const auto w1 = LoadInt(in + 1); - const auto w2 = LoadInt(in + 2); - const auto w3 = LoadInt(in + 3); - const auto w4 = LoadInt(in + 4); - const auto w5 = LoadInt(in + 5); - const auto w6 = LoadInt(in + 6); - const auto w7 = LoadInt(in + 7); - const auto w8 = LoadInt(in + 8 ); + const auto w0 = LoadInt(in + 0 * 8); + const auto w1 = LoadInt(in + 1 * 8); + const auto w2 = LoadInt(in + 2 * 8); + const auto w3 = LoadInt(in + 3 * 8); + const auto w4 = LoadInt(in + 4 * 8); + const auto w5 = LoadInt(in + 5 * 8); + const auto w6 = LoadInt(in + 6 * 8); + const auto w7 = LoadInt(in + 7 * 8); + const auto w8 = LoadInt(in + 8 * 8); out[0] = (w0) & mask; out[1] = (w0 >> 18) & mask; out[2] = (w0 >> 36) & mask; @@ -2554,22 +2554,22 @@ inline const uint64_t* unpack18_64(const uint64_t* in, uint64_t* out){ out[30] = (w8 >> 28) & mask; out[31] = w8 >> 46; - return in + (9 ); + return in + (9 * 8); } -inline const uint64_t* unpack19_64(const uint64_t* in, uint64_t* out){ +inline const uint8_t* unpack19_64(const uint8_t* in, uint64_t* out){ constexpr uint64_t mask = ((uint64_t{1} << 19) - uint64_t{1}); - const auto w0 = LoadInt(in + 0); - const auto w1 = LoadInt(in + 1); - const auto w2 = LoadInt(in + 2); - const auto w3 = LoadInt(in + 3); - const auto w4 = LoadInt(in + 4); - const auto w5 = LoadInt(in + 5); - const auto w6 = LoadInt(in + 6); - const auto w7 = LoadInt(in + 7); - const auto w8 = LoadInt(in + 8); - const auto w9 = static_cast(LoadInt(reinterpret_cast(in + 9))); + const auto w0 = LoadInt(in + 0 * 8); + const auto w1 = LoadInt(in + 1 * 8); + const auto w2 = LoadInt(in + 2 * 8); + const auto w3 = LoadInt(in + 3 * 8); + const auto w4 = LoadInt(in + 4 * 8); + const auto w5 = LoadInt(in + 5 * 8); + const auto w6 = LoadInt(in + 6 * 8); + const auto w7 = LoadInt(in + 7 * 8); + const auto w8 = LoadInt(in + 8 * 8); + const auto w9 = static_cast(LoadInt(in + 9 * 8)); out[0] = (w0) & mask; out[1] = (w0 >> 19) & mask; out[2] = (w0 >> 38) & mask; @@ -2603,22 +2603,22 @@ inline const uint64_t* unpack19_64(const uint64_t* in, uint64_t* out){ out[30] = ((w8 >> 58) | (w9 << 6)) & mask; out[31] = (w9 >> 13) & mask; - return in + (9); + return in + (9 * 8 + 4); } -inline const uint64_t* unpack20_64(const uint64_t* in, uint64_t* out){ +inline const uint8_t* unpack20_64(const uint8_t* in, uint64_t* out){ constexpr uint64_t mask = ((uint64_t{1} << 20) - uint64_t{1}); - const auto w0 = LoadInt(in + 0); - const auto w1 = LoadInt(in + 1); - const auto w2 = LoadInt(in + 2); - const auto w3 = LoadInt(in + 3); - const auto w4 = LoadInt(in + 4); - const auto w5 = LoadInt(in + 5); - const auto w6 = LoadInt(in + 6); - const auto w7 = LoadInt(in + 7); - const auto w8 = LoadInt(in + 8); - const auto w9 = LoadInt(in + 9 ); + const auto w0 = LoadInt(in + 0 * 8); + const auto w1 = LoadInt(in + 1 * 8); + const auto w2 = LoadInt(in + 2 * 8); + const auto w3 = LoadInt(in + 3 * 8); + const auto w4 = LoadInt(in + 4 * 8); + const auto w5 = LoadInt(in + 5 * 8); + const auto w6 = LoadInt(in + 6 * 8); + const auto w7 = LoadInt(in + 7 * 8); + const auto w8 = LoadInt(in + 8 * 8); + const auto w9 = LoadInt(in + 9 * 8); out[0] = (w0) & mask; out[1] = (w0 >> 20) & mask; out[2] = (w0 >> 40) & mask; @@ -2652,23 +2652,23 @@ inline const uint64_t* unpack20_64(const uint64_t* in, uint64_t* out){ out[30] = (w9 >> 24) & mask; out[31] = w9 >> 44; - return in + (10 ); + return in + (10 * 8); } -inline const uint64_t* unpack21_64(const uint64_t* in, uint64_t* out){ +inline const uint8_t* unpack21_64(const uint8_t* in, uint64_t* out){ constexpr uint64_t mask = ((uint64_t{1} << 21) - uint64_t{1}); - const auto w0 = LoadInt(in + 0); - const auto w1 = LoadInt(in + 1); - const auto w2 = LoadInt(in + 2); - const auto w3 = LoadInt(in + 3); - const auto w4 = LoadInt(in + 4); - const auto w5 = LoadInt(in + 5); - const auto w6 = LoadInt(in + 6); - const auto w7 = LoadInt(in + 7); - const auto w8 = LoadInt(in + 8); - const auto w9 = LoadInt(in + 9); - const auto w10 = static_cast(LoadInt(reinterpret_cast(in + 10))); + const auto w0 = LoadInt(in + 0 * 8); + const auto w1 = LoadInt(in + 1 * 8); + const auto w2 = LoadInt(in + 2 * 8); + const auto w3 = LoadInt(in + 3 * 8); + const auto w4 = LoadInt(in + 4 * 8); + const auto w5 = LoadInt(in + 5 * 8); + const auto w6 = LoadInt(in + 6 * 8); + const auto w7 = LoadInt(in + 7 * 8); + const auto w8 = LoadInt(in + 8 * 8); + const auto w9 = LoadInt(in + 9 * 8); + const auto w10 = static_cast(LoadInt(in + 10 * 8)); out[0] = (w0) & mask; out[1] = (w0 >> 21) & mask; out[2] = (w0 >> 42) & mask; @@ -2702,23 +2702,23 @@ inline const uint64_t* unpack21_64(const uint64_t* in, uint64_t* out){ out[30] = ((w9 >> 54) | (w10 << 10)) & mask; out[31] = (w10 >> 11) & mask; - return in + (10); + return in + (10 * 8 + 4); } -inline const uint64_t* unpack22_64(const uint64_t* in, uint64_t* out){ +inline const uint8_t* unpack22_64(const uint8_t* in, uint64_t* out){ constexpr uint64_t mask = ((uint64_t{1} << 22) - uint64_t{1}); - const auto w0 = LoadInt(in + 0); - const auto w1 = LoadInt(in + 1); - const auto w2 = LoadInt(in + 2); - const auto w3 = LoadInt(in + 3); - const auto w4 = LoadInt(in + 4); - const auto w5 = LoadInt(in + 5); - const auto w6 = LoadInt(in + 6); - const auto w7 = LoadInt(in + 7); - const auto w8 = LoadInt(in + 8); - const auto w9 = LoadInt(in + 9); - const auto w10 = LoadInt(in + 10 ); + const auto w0 = LoadInt(in + 0 * 8); + const auto w1 = LoadInt(in + 1 * 8); + const auto w2 = LoadInt(in + 2 * 8); + const auto w3 = LoadInt(in + 3 * 8); + const auto w4 = LoadInt(in + 4 * 8); + const auto w5 = LoadInt(in + 5 * 8); + const auto w6 = LoadInt(in + 6 * 8); + const auto w7 = LoadInt(in + 7 * 8); + const auto w8 = LoadInt(in + 8 * 8); + const auto w9 = LoadInt(in + 9 * 8); + const auto w10 = LoadInt(in + 10 * 8); out[0] = (w0) & mask; out[1] = (w0 >> 22) & mask; out[2] = ((w0 >> 44) | (w1 << 20)) & mask; @@ -2752,24 +2752,24 @@ inline const uint64_t* unpack22_64(const uint64_t* in, uint64_t* out){ out[30] = (w10 >> 20) & mask; out[31] = w10 >> 42; - return in + (11 ); + return in + (11 * 8); } -inline const uint64_t* unpack23_64(const uint64_t* in, uint64_t* out){ +inline const uint8_t* unpack23_64(const uint8_t* in, uint64_t* out){ constexpr uint64_t mask = ((uint64_t{1} << 23) - uint64_t{1}); - const auto w0 = LoadInt(in + 0); - const auto w1 = LoadInt(in + 1); - const auto w2 = LoadInt(in + 2); - const auto w3 = LoadInt(in + 3); - const auto w4 = LoadInt(in + 4); - const auto w5 = LoadInt(in + 5); - const auto w6 = LoadInt(in + 6); - const auto w7 = LoadInt(in + 7); - const auto w8 = LoadInt(in + 8); - const auto w9 = LoadInt(in + 9); - const auto w10 = LoadInt(in + 10); - const auto w11 = static_cast(LoadInt(reinterpret_cast(in + 11))); + const auto w0 = LoadInt(in + 0 * 8); + const auto w1 = LoadInt(in + 1 * 8); + const auto w2 = LoadInt(in + 2 * 8); + const auto w3 = LoadInt(in + 3 * 8); + const auto w4 = LoadInt(in + 4 * 8); + const auto w5 = LoadInt(in + 5 * 8); + const auto w6 = LoadInt(in + 6 * 8); + const auto w7 = LoadInt(in + 7 * 8); + const auto w8 = LoadInt(in + 8 * 8); + const auto w9 = LoadInt(in + 9 * 8); + const auto w10 = LoadInt(in + 10 * 8); + const auto w11 = static_cast(LoadInt(in + 11 * 8)); out[0] = (w0) & mask; out[1] = (w0 >> 23) & mask; out[2] = ((w0 >> 46) | (w1 << 18)) & mask; @@ -2803,24 +2803,24 @@ inline const uint64_t* unpack23_64(const uint64_t* in, uint64_t* out){ out[30] = ((w10 >> 50) | (w11 << 14)) & mask; out[31] = (w11 >> 9) & mask; - return in + (11); + return in + (11 * 8 + 4); } -inline const uint64_t* unpack24_64(const uint64_t* in, uint64_t* out){ +inline const uint8_t* unpack24_64(const uint8_t* in, uint64_t* out){ constexpr uint64_t mask = ((uint64_t{1} << 24) - uint64_t{1}); - const auto w0 = LoadInt(in + 0); - const auto w1 = LoadInt(in + 1); - const auto w2 = LoadInt(in + 2); - const auto w3 = LoadInt(in + 3); - const auto w4 = LoadInt(in + 4); - const auto w5 = LoadInt(in + 5); - const auto w6 = LoadInt(in + 6); - const auto w7 = LoadInt(in + 7); - const auto w8 = LoadInt(in + 8); - const auto w9 = LoadInt(in + 9); - const auto w10 = LoadInt(in + 10); - const auto w11 = LoadInt(in + 11 ); + const auto w0 = LoadInt(in + 0 * 8); + const auto w1 = LoadInt(in + 1 * 8); + const auto w2 = LoadInt(in + 2 * 8); + const auto w3 = LoadInt(in + 3 * 8); + const auto w4 = LoadInt(in + 4 * 8); + const auto w5 = LoadInt(in + 5 * 8); + const auto w6 = LoadInt(in + 6 * 8); + const auto w7 = LoadInt(in + 7 * 8); + const auto w8 = LoadInt(in + 8 * 8); + const auto w9 = LoadInt(in + 9 * 8); + const auto w10 = LoadInt(in + 10 * 8); + const auto w11 = LoadInt(in + 11 * 8); out[0] = (w0) & mask; out[1] = (w0 >> 24) & mask; out[2] = ((w0 >> 48) | (w1 << 16)) & mask; @@ -2854,25 +2854,25 @@ inline const uint64_t* unpack24_64(const uint64_t* in, uint64_t* out){ out[30] = (w11 >> 16) & mask; out[31] = w11 >> 40; - return in + (12 ); + return in + (12 * 8); } -inline const uint64_t* unpack25_64(const uint64_t* in, uint64_t* out){ +inline const uint8_t* unpack25_64(const uint8_t* in, uint64_t* out){ constexpr uint64_t mask = ((uint64_t{1} << 25) - uint64_t{1}); - const auto w0 = LoadInt(in + 0); - const auto w1 = LoadInt(in + 1); - const auto w2 = LoadInt(in + 2); - const auto w3 = LoadInt(in + 3); - const auto w4 = LoadInt(in + 4); - const auto w5 = LoadInt(in + 5); - const auto w6 = LoadInt(in + 6); - const auto w7 = LoadInt(in + 7); - const auto w8 = LoadInt(in + 8); - const auto w9 = LoadInt(in + 9); - const auto w10 = LoadInt(in + 10); - const auto w11 = LoadInt(in + 11); - const auto w12 = static_cast(LoadInt(reinterpret_cast(in + 12))); + const auto w0 = LoadInt(in + 0 * 8); + const auto w1 = LoadInt(in + 1 * 8); + const auto w2 = LoadInt(in + 2 * 8); + const auto w3 = LoadInt(in + 3 * 8); + const auto w4 = LoadInt(in + 4 * 8); + const auto w5 = LoadInt(in + 5 * 8); + const auto w6 = LoadInt(in + 6 * 8); + const auto w7 = LoadInt(in + 7 * 8); + const auto w8 = LoadInt(in + 8 * 8); + const auto w9 = LoadInt(in + 9 * 8); + const auto w10 = LoadInt(in + 10 * 8); + const auto w11 = LoadInt(in + 11 * 8); + const auto w12 = static_cast(LoadInt(in + 12 * 8)); out[0] = (w0) & mask; out[1] = (w0 >> 25) & mask; out[2] = ((w0 >> 50) | (w1 << 14)) & mask; @@ -2906,25 +2906,25 @@ inline const uint64_t* unpack25_64(const uint64_t* in, uint64_t* out){ out[30] = ((w11 >> 46) | (w12 << 18)) & mask; out[31] = (w12 >> 7) & mask; - return in + (12); + return in + (12 * 8 + 4); } -inline const uint64_t* unpack26_64(const uint64_t* in, uint64_t* out){ +inline const uint8_t* unpack26_64(const uint8_t* in, uint64_t* out){ constexpr uint64_t mask = ((uint64_t{1} << 26) - uint64_t{1}); - const auto w0 = LoadInt(in + 0); - const auto w1 = LoadInt(in + 1); - const auto w2 = LoadInt(in + 2); - const auto w3 = LoadInt(in + 3); - const auto w4 = LoadInt(in + 4); - const auto w5 = LoadInt(in + 5); - const auto w6 = LoadInt(in + 6); - const auto w7 = LoadInt(in + 7); - const auto w8 = LoadInt(in + 8); - const auto w9 = LoadInt(in + 9); - const auto w10 = LoadInt(in + 10); - const auto w11 = LoadInt(in + 11); - const auto w12 = LoadInt(in + 12 ); + const auto w0 = LoadInt(in + 0 * 8); + const auto w1 = LoadInt(in + 1 * 8); + const auto w2 = LoadInt(in + 2 * 8); + const auto w3 = LoadInt(in + 3 * 8); + const auto w4 = LoadInt(in + 4 * 8); + const auto w5 = LoadInt(in + 5 * 8); + const auto w6 = LoadInt(in + 6 * 8); + const auto w7 = LoadInt(in + 7 * 8); + const auto w8 = LoadInt(in + 8 * 8); + const auto w9 = LoadInt(in + 9 * 8); + const auto w10 = LoadInt(in + 10 * 8); + const auto w11 = LoadInt(in + 11 * 8); + const auto w12 = LoadInt(in + 12 * 8); out[0] = (w0) & mask; out[1] = (w0 >> 26) & mask; out[2] = ((w0 >> 52) | (w1 << 12)) & mask; @@ -2958,26 +2958,26 @@ inline const uint64_t* unpack26_64(const uint64_t* in, uint64_t* out){ out[30] = (w12 >> 12) & mask; out[31] = w12 >> 38; - return in + (13 ); + return in + (13 * 8); } -inline const uint64_t* unpack27_64(const uint64_t* in, uint64_t* out){ +inline const uint8_t* unpack27_64(const uint8_t* in, uint64_t* out){ constexpr uint64_t mask = ((uint64_t{1} << 27) - uint64_t{1}); - const auto w0 = LoadInt(in + 0); - const auto w1 = LoadInt(in + 1); - const auto w2 = LoadInt(in + 2); - const auto w3 = LoadInt(in + 3); - const auto w4 = LoadInt(in + 4); - const auto w5 = LoadInt(in + 5); - const auto w6 = LoadInt(in + 6); - const auto w7 = LoadInt(in + 7); - const auto w8 = LoadInt(in + 8); - const auto w9 = LoadInt(in + 9); - const auto w10 = LoadInt(in + 10); - const auto w11 = LoadInt(in + 11); - const auto w12 = LoadInt(in + 12); - const auto w13 = static_cast(LoadInt(reinterpret_cast(in + 13))); + const auto w0 = LoadInt(in + 0 * 8); + const auto w1 = LoadInt(in + 1 * 8); + const auto w2 = LoadInt(in + 2 * 8); + const auto w3 = LoadInt(in + 3 * 8); + const auto w4 = LoadInt(in + 4 * 8); + const auto w5 = LoadInt(in + 5 * 8); + const auto w6 = LoadInt(in + 6 * 8); + const auto w7 = LoadInt(in + 7 * 8); + const auto w8 = LoadInt(in + 8 * 8); + const auto w9 = LoadInt(in + 9 * 8); + const auto w10 = LoadInt(in + 10 * 8); + const auto w11 = LoadInt(in + 11 * 8); + const auto w12 = LoadInt(in + 12 * 8); + const auto w13 = static_cast(LoadInt(in + 13 * 8)); out[0] = (w0) & mask; out[1] = (w0 >> 27) & mask; out[2] = ((w0 >> 54) | (w1 << 10)) & mask; @@ -3011,26 +3011,26 @@ inline const uint64_t* unpack27_64(const uint64_t* in, uint64_t* out){ out[30] = ((w12 >> 42) | (w13 << 22)) & mask; out[31] = (w13 >> 5) & mask; - return in + (13); + return in + (13 * 8 + 4); } -inline const uint64_t* unpack28_64(const uint64_t* in, uint64_t* out){ +inline const uint8_t* unpack28_64(const uint8_t* in, uint64_t* out){ constexpr uint64_t mask = ((uint64_t{1} << 28) - uint64_t{1}); - const auto w0 = LoadInt(in + 0); - const auto w1 = LoadInt(in + 1); - const auto w2 = LoadInt(in + 2); - const auto w3 = LoadInt(in + 3); - const auto w4 = LoadInt(in + 4); - const auto w5 = LoadInt(in + 5); - const auto w6 = LoadInt(in + 6); - const auto w7 = LoadInt(in + 7); - const auto w8 = LoadInt(in + 8); - const auto w9 = LoadInt(in + 9); - const auto w10 = LoadInt(in + 10); - const auto w11 = LoadInt(in + 11); - const auto w12 = LoadInt(in + 12); - const auto w13 = LoadInt(in + 13 ); + const auto w0 = LoadInt(in + 0 * 8); + const auto w1 = LoadInt(in + 1 * 8); + const auto w2 = LoadInt(in + 2 * 8); + const auto w3 = LoadInt(in + 3 * 8); + const auto w4 = LoadInt(in + 4 * 8); + const auto w5 = LoadInt(in + 5 * 8); + const auto w6 = LoadInt(in + 6 * 8); + const auto w7 = LoadInt(in + 7 * 8); + const auto w8 = LoadInt(in + 8 * 8); + const auto w9 = LoadInt(in + 9 * 8); + const auto w10 = LoadInt(in + 10 * 8); + const auto w11 = LoadInt(in + 11 * 8); + const auto w12 = LoadInt(in + 12 * 8); + const auto w13 = LoadInt(in + 13 * 8); out[0] = (w0) & mask; out[1] = (w0 >> 28) & mask; out[2] = ((w0 >> 56) | (w1 << 8)) & mask; @@ -3064,27 +3064,27 @@ inline const uint64_t* unpack28_64(const uint64_t* in, uint64_t* out){ out[30] = (w13 >> 8) & mask; out[31] = w13 >> 36; - return in + (14 ); + return in + (14 * 8); } -inline const uint64_t* unpack29_64(const uint64_t* in, uint64_t* out){ +inline const uint8_t* unpack29_64(const uint8_t* in, uint64_t* out){ constexpr uint64_t mask = ((uint64_t{1} << 29) - uint64_t{1}); - const auto w0 = LoadInt(in + 0); - const auto w1 = LoadInt(in + 1); - const auto w2 = LoadInt(in + 2); - const auto w3 = LoadInt(in + 3); - const auto w4 = LoadInt(in + 4); - const auto w5 = LoadInt(in + 5); - const auto w6 = LoadInt(in + 6); - const auto w7 = LoadInt(in + 7); - const auto w8 = LoadInt(in + 8); - const auto w9 = LoadInt(in + 9); - const auto w10 = LoadInt(in + 10); - const auto w11 = LoadInt(in + 11); - const auto w12 = LoadInt(in + 12); - const auto w13 = LoadInt(in + 13); - const auto w14 = static_cast(LoadInt(reinterpret_cast(in + 14))); + const auto w0 = LoadInt(in + 0 * 8); + const auto w1 = LoadInt(in + 1 * 8); + const auto w2 = LoadInt(in + 2 * 8); + const auto w3 = LoadInt(in + 3 * 8); + const auto w4 = LoadInt(in + 4 * 8); + const auto w5 = LoadInt(in + 5 * 8); + const auto w6 = LoadInt(in + 6 * 8); + const auto w7 = LoadInt(in + 7 * 8); + const auto w8 = LoadInt(in + 8 * 8); + const auto w9 = LoadInt(in + 9 * 8); + const auto w10 = LoadInt(in + 10 * 8); + const auto w11 = LoadInt(in + 11 * 8); + const auto w12 = LoadInt(in + 12 * 8); + const auto w13 = LoadInt(in + 13 * 8); + const auto w14 = static_cast(LoadInt(in + 14 * 8)); out[0] = (w0) & mask; out[1] = (w0 >> 29) & mask; out[2] = ((w0 >> 58) | (w1 << 6)) & mask; @@ -3118,27 +3118,27 @@ inline const uint64_t* unpack29_64(const uint64_t* in, uint64_t* out){ out[30] = ((w13 >> 38) | (w14 << 26)) & mask; out[31] = (w14 >> 3) & mask; - return in + (14); + return in + (14 * 8 + 4); } -inline const uint64_t* unpack30_64(const uint64_t* in, uint64_t* out){ +inline const uint8_t* unpack30_64(const uint8_t* in, uint64_t* out){ constexpr uint64_t mask = ((uint64_t{1} << 30) - uint64_t{1}); - const auto w0 = LoadInt(in + 0); - const auto w1 = LoadInt(in + 1); - const auto w2 = LoadInt(in + 2); - const auto w3 = LoadInt(in + 3); - const auto w4 = LoadInt(in + 4); - const auto w5 = LoadInt(in + 5); - const auto w6 = LoadInt(in + 6); - const auto w7 = LoadInt(in + 7); - const auto w8 = LoadInt(in + 8); - const auto w9 = LoadInt(in + 9); - const auto w10 = LoadInt(in + 10); - const auto w11 = LoadInt(in + 11); - const auto w12 = LoadInt(in + 12); - const auto w13 = LoadInt(in + 13); - const auto w14 = LoadInt(in + 14 ); + const auto w0 = LoadInt(in + 0 * 8); + const auto w1 = LoadInt(in + 1 * 8); + const auto w2 = LoadInt(in + 2 * 8); + const auto w3 = LoadInt(in + 3 * 8); + const auto w4 = LoadInt(in + 4 * 8); + const auto w5 = LoadInt(in + 5 * 8); + const auto w6 = LoadInt(in + 6 * 8); + const auto w7 = LoadInt(in + 7 * 8); + const auto w8 = LoadInt(in + 8 * 8); + const auto w9 = LoadInt(in + 9 * 8); + const auto w10 = LoadInt(in + 10 * 8); + const auto w11 = LoadInt(in + 11 * 8); + const auto w12 = LoadInt(in + 12 * 8); + const auto w13 = LoadInt(in + 13 * 8); + const auto w14 = LoadInt(in + 14 * 8); out[0] = (w0) & mask; out[1] = (w0 >> 30) & mask; out[2] = ((w0 >> 60) | (w1 << 4)) & mask; @@ -3172,28 +3172,28 @@ inline const uint64_t* unpack30_64(const uint64_t* in, uint64_t* out){ out[30] = (w14 >> 4) & mask; out[31] = w14 >> 34; - return in + (15 ); + return in + (15 * 8); } -inline const uint64_t* unpack31_64(const uint64_t* in, uint64_t* out){ +inline const uint8_t* unpack31_64(const uint8_t* in, uint64_t* out){ constexpr uint64_t mask = ((uint64_t{1} << 31) - uint64_t{1}); - const auto w0 = LoadInt(in + 0); - const auto w1 = LoadInt(in + 1); - const auto w2 = LoadInt(in + 2); - const auto w3 = LoadInt(in + 3); - const auto w4 = LoadInt(in + 4); - const auto w5 = LoadInt(in + 5); - const auto w6 = LoadInt(in + 6); - const auto w7 = LoadInt(in + 7); - const auto w8 = LoadInt(in + 8); - const auto w9 = LoadInt(in + 9); - const auto w10 = LoadInt(in + 10); - const auto w11 = LoadInt(in + 11); - const auto w12 = LoadInt(in + 12); - const auto w13 = LoadInt(in + 13); - const auto w14 = LoadInt(in + 14); - const auto w15 = static_cast(LoadInt(reinterpret_cast(in + 15))); + const auto w0 = LoadInt(in + 0 * 8); + const auto w1 = LoadInt(in + 1 * 8); + const auto w2 = LoadInt(in + 2 * 8); + const auto w3 = LoadInt(in + 3 * 8); + const auto w4 = LoadInt(in + 4 * 8); + const auto w5 = LoadInt(in + 5 * 8); + const auto w6 = LoadInt(in + 6 * 8); + const auto w7 = LoadInt(in + 7 * 8); + const auto w8 = LoadInt(in + 8 * 8); + const auto w9 = LoadInt(in + 9 * 8); + const auto w10 = LoadInt(in + 10 * 8); + const auto w11 = LoadInt(in + 11 * 8); + const auto w12 = LoadInt(in + 12 * 8); + const auto w13 = LoadInt(in + 13 * 8); + const auto w14 = LoadInt(in + 14 * 8); + const auto w15 = static_cast(LoadInt(in + 15 * 8)); out[0] = (w0) & mask; out[1] = (w0 >> 31) & mask; out[2] = ((w0 >> 62) | (w1 << 2)) & mask; @@ -3227,28 +3227,28 @@ inline const uint64_t* unpack31_64(const uint64_t* in, uint64_t* out){ out[30] = ((w14 >> 34) | (w15 << 30)) & mask; out[31] = (w15 >> 1) & mask; - return in + (15); + return in + (15 * 8 + 4); } -inline const uint64_t* unpack32_64(const uint64_t* in, uint64_t* out){ +inline const uint8_t* unpack32_64(const uint8_t* in, uint64_t* out){ constexpr uint64_t mask = ((uint64_t{1} << 32) - uint64_t{1}); - const auto w0 = LoadInt(in + 0); - const auto w1 = LoadInt(in + 1); - const auto w2 = LoadInt(in + 2); - const auto w3 = LoadInt(in + 3); - const auto w4 = LoadInt(in + 4); - const auto w5 = LoadInt(in + 5); - const auto w6 = LoadInt(in + 6); - const auto w7 = LoadInt(in + 7); - const auto w8 = LoadInt(in + 8); - const auto w9 = LoadInt(in + 9); - const auto w10 = LoadInt(in + 10); - const auto w11 = LoadInt(in + 11); - const auto w12 = LoadInt(in + 12); - const auto w13 = LoadInt(in + 13); - const auto w14 = LoadInt(in + 14); - const auto w15 = LoadInt(in + 15 ); + const auto w0 = LoadInt(in + 0 * 8); + const auto w1 = LoadInt(in + 1 * 8); + const auto w2 = LoadInt(in + 2 * 8); + const auto w3 = LoadInt(in + 3 * 8); + const auto w4 = LoadInt(in + 4 * 8); + const auto w5 = LoadInt(in + 5 * 8); + const auto w6 = LoadInt(in + 6 * 8); + const auto w7 = LoadInt(in + 7 * 8); + const auto w8 = LoadInt(in + 8 * 8); + const auto w9 = LoadInt(in + 9 * 8); + const auto w10 = LoadInt(in + 10 * 8); + const auto w11 = LoadInt(in + 11 * 8); + const auto w12 = LoadInt(in + 12 * 8); + const auto w13 = LoadInt(in + 13 * 8); + const auto w14 = LoadInt(in + 14 * 8); + const auto w15 = LoadInt(in + 15 * 8); out[0] = (w0) & mask; out[1] = w0 >> 32; out[2] = (w1) & mask; @@ -3282,29 +3282,29 @@ inline const uint64_t* unpack32_64(const uint64_t* in, uint64_t* out){ out[30] = (w15) & mask; out[31] = w15 >> 32; - return in + (16 ); + return in + (16 * 8); } -inline const uint64_t* unpack33_64(const uint64_t* in, uint64_t* out){ +inline const uint8_t* unpack33_64(const uint8_t* in, uint64_t* out){ constexpr uint64_t mask = ((uint64_t{1} << 33) - uint64_t{1}); - const auto w0 = LoadInt(in + 0); - const auto w1 = LoadInt(in + 1); - const auto w2 = LoadInt(in + 2); - const auto w3 = LoadInt(in + 3); - const auto w4 = LoadInt(in + 4); - const auto w5 = LoadInt(in + 5); - const auto w6 = LoadInt(in + 6); - const auto w7 = LoadInt(in + 7); - const auto w8 = LoadInt(in + 8); - const auto w9 = LoadInt(in + 9); - const auto w10 = LoadInt(in + 10); - const auto w11 = LoadInt(in + 11); - const auto w12 = LoadInt(in + 12); - const auto w13 = LoadInt(in + 13); - const auto w14 = LoadInt(in + 14); - const auto w15 = LoadInt(in + 15); - const auto w16 = static_cast(LoadInt(reinterpret_cast(in + 16))); + const auto w0 = LoadInt(in + 0 * 8); + const auto w1 = LoadInt(in + 1 * 8); + const auto w2 = LoadInt(in + 2 * 8); + const auto w3 = LoadInt(in + 3 * 8); + const auto w4 = LoadInt(in + 4 * 8); + const auto w5 = LoadInt(in + 5 * 8); + const auto w6 = LoadInt(in + 6 * 8); + const auto w7 = LoadInt(in + 7 * 8); + const auto w8 = LoadInt(in + 8 * 8); + const auto w9 = LoadInt(in + 9 * 8); + const auto w10 = LoadInt(in + 10 * 8); + const auto w11 = LoadInt(in + 11 * 8); + const auto w12 = LoadInt(in + 12 * 8); + const auto w13 = LoadInt(in + 13 * 8); + const auto w14 = LoadInt(in + 14 * 8); + const auto w15 = LoadInt(in + 15 * 8); + const auto w16 = static_cast(LoadInt(in + 16 * 8)); out[0] = (w0) & mask; out[1] = ((w0 >> 33) | (w1 << 31)) & mask; out[2] = (w1 >> 2) & mask; @@ -3338,29 +3338,29 @@ inline const uint64_t* unpack33_64(const uint64_t* in, uint64_t* out){ out[30] = (w15 >> 30) & mask; out[31] = ((w15 >> 63) | (w16 << 1)) & mask; - return in + (16); + return in + (16 * 8 + 4); } -inline const uint64_t* unpack34_64(const uint64_t* in, uint64_t* out){ +inline const uint8_t* unpack34_64(const uint8_t* in, uint64_t* out){ constexpr uint64_t mask = ((uint64_t{1} << 34) - uint64_t{1}); - const auto w0 = LoadInt(in + 0); - const auto w1 = LoadInt(in + 1); - const auto w2 = LoadInt(in + 2); - const auto w3 = LoadInt(in + 3); - const auto w4 = LoadInt(in + 4); - const auto w5 = LoadInt(in + 5); - const auto w6 = LoadInt(in + 6); - const auto w7 = LoadInt(in + 7); - const auto w8 = LoadInt(in + 8); - const auto w9 = LoadInt(in + 9); - const auto w10 = LoadInt(in + 10); - const auto w11 = LoadInt(in + 11); - const auto w12 = LoadInt(in + 12); - const auto w13 = LoadInt(in + 13); - const auto w14 = LoadInt(in + 14); - const auto w15 = LoadInt(in + 15); - const auto w16 = LoadInt(in + 16 ); + const auto w0 = LoadInt(in + 0 * 8); + const auto w1 = LoadInt(in + 1 * 8); + const auto w2 = LoadInt(in + 2 * 8); + const auto w3 = LoadInt(in + 3 * 8); + const auto w4 = LoadInt(in + 4 * 8); + const auto w5 = LoadInt(in + 5 * 8); + const auto w6 = LoadInt(in + 6 * 8); + const auto w7 = LoadInt(in + 7 * 8); + const auto w8 = LoadInt(in + 8 * 8); + const auto w9 = LoadInt(in + 9 * 8); + const auto w10 = LoadInt(in + 10 * 8); + const auto w11 = LoadInt(in + 11 * 8); + const auto w12 = LoadInt(in + 12 * 8); + const auto w13 = LoadInt(in + 13 * 8); + const auto w14 = LoadInt(in + 14 * 8); + const auto w15 = LoadInt(in + 15 * 8); + const auto w16 = LoadInt(in + 16 * 8); out[0] = (w0) & mask; out[1] = ((w0 >> 34) | (w1 << 30)) & mask; out[2] = (w1 >> 4) & mask; @@ -3394,30 +3394,30 @@ inline const uint64_t* unpack34_64(const uint64_t* in, uint64_t* out){ out[30] = ((w15 >> 60) | (w16 << 4)) & mask; out[31] = w16 >> 30; - return in + (17 ); + return in + (17 * 8); } -inline const uint64_t* unpack35_64(const uint64_t* in, uint64_t* out){ +inline const uint8_t* unpack35_64(const uint8_t* in, uint64_t* out){ constexpr uint64_t mask = ((uint64_t{1} << 35) - uint64_t{1}); - const auto w0 = LoadInt(in + 0); - const auto w1 = LoadInt(in + 1); - const auto w2 = LoadInt(in + 2); - const auto w3 = LoadInt(in + 3); - const auto w4 = LoadInt(in + 4); - const auto w5 = LoadInt(in + 5); - const auto w6 = LoadInt(in + 6); - const auto w7 = LoadInt(in + 7); - const auto w8 = LoadInt(in + 8); - const auto w9 = LoadInt(in + 9); - const auto w10 = LoadInt(in + 10); - const auto w11 = LoadInt(in + 11); - const auto w12 = LoadInt(in + 12); - const auto w13 = LoadInt(in + 13); - const auto w14 = LoadInt(in + 14); - const auto w15 = LoadInt(in + 15); - const auto w16 = LoadInt(in + 16); - const auto w17 = static_cast(LoadInt(reinterpret_cast(in + 17))); + const auto w0 = LoadInt(in + 0 * 8); + const auto w1 = LoadInt(in + 1 * 8); + const auto w2 = LoadInt(in + 2 * 8); + const auto w3 = LoadInt(in + 3 * 8); + const auto w4 = LoadInt(in + 4 * 8); + const auto w5 = LoadInt(in + 5 * 8); + const auto w6 = LoadInt(in + 6 * 8); + const auto w7 = LoadInt(in + 7 * 8); + const auto w8 = LoadInt(in + 8 * 8); + const auto w9 = LoadInt(in + 9 * 8); + const auto w10 = LoadInt(in + 10 * 8); + const auto w11 = LoadInt(in + 11 * 8); + const auto w12 = LoadInt(in + 12 * 8); + const auto w13 = LoadInt(in + 13 * 8); + const auto w14 = LoadInt(in + 14 * 8); + const auto w15 = LoadInt(in + 15 * 8); + const auto w16 = LoadInt(in + 16 * 8); + const auto w17 = static_cast(LoadInt(in + 17 * 8)); out[0] = (w0) & mask; out[1] = ((w0 >> 35) | (w1 << 29)) & mask; out[2] = (w1 >> 6) & mask; @@ -3451,30 +3451,30 @@ inline const uint64_t* unpack35_64(const uint64_t* in, uint64_t* out){ out[30] = (w16 >> 26) & mask; out[31] = ((w16 >> 61) | (w17 << 3)) & mask; - return in + (17); + return in + (17 * 8 + 4); } -inline const uint64_t* unpack36_64(const uint64_t* in, uint64_t* out){ +inline const uint8_t* unpack36_64(const uint8_t* in, uint64_t* out){ constexpr uint64_t mask = ((uint64_t{1} << 36) - uint64_t{1}); - const auto w0 = LoadInt(in + 0); - const auto w1 = LoadInt(in + 1); - const auto w2 = LoadInt(in + 2); - const auto w3 = LoadInt(in + 3); - const auto w4 = LoadInt(in + 4); - const auto w5 = LoadInt(in + 5); - const auto w6 = LoadInt(in + 6); - const auto w7 = LoadInt(in + 7); - const auto w8 = LoadInt(in + 8); - const auto w9 = LoadInt(in + 9); - const auto w10 = LoadInt(in + 10); - const auto w11 = LoadInt(in + 11); - const auto w12 = LoadInt(in + 12); - const auto w13 = LoadInt(in + 13); - const auto w14 = LoadInt(in + 14); - const auto w15 = LoadInt(in + 15); - const auto w16 = LoadInt(in + 16); - const auto w17 = LoadInt(in + 17 ); + const auto w0 = LoadInt(in + 0 * 8); + const auto w1 = LoadInt(in + 1 * 8); + const auto w2 = LoadInt(in + 2 * 8); + const auto w3 = LoadInt(in + 3 * 8); + const auto w4 = LoadInt(in + 4 * 8); + const auto w5 = LoadInt(in + 5 * 8); + const auto w6 = LoadInt(in + 6 * 8); + const auto w7 = LoadInt(in + 7 * 8); + const auto w8 = LoadInt(in + 8 * 8); + const auto w9 = LoadInt(in + 9 * 8); + const auto w10 = LoadInt(in + 10 * 8); + const auto w11 = LoadInt(in + 11 * 8); + const auto w12 = LoadInt(in + 12 * 8); + const auto w13 = LoadInt(in + 13 * 8); + const auto w14 = LoadInt(in + 14 * 8); + const auto w15 = LoadInt(in + 15 * 8); + const auto w16 = LoadInt(in + 16 * 8); + const auto w17 = LoadInt(in + 17 * 8); out[0] = (w0) & mask; out[1] = ((w0 >> 36) | (w1 << 28)) & mask; out[2] = (w1 >> 8) & mask; @@ -3508,31 +3508,31 @@ inline const uint64_t* unpack36_64(const uint64_t* in, uint64_t* out){ out[30] = ((w16 >> 56) | (w17 << 8)) & mask; out[31] = w17 >> 28; - return in + (18 ); + return in + (18 * 8); } -inline const uint64_t* unpack37_64(const uint64_t* in, uint64_t* out){ +inline const uint8_t* unpack37_64(const uint8_t* in, uint64_t* out){ constexpr uint64_t mask = ((uint64_t{1} << 37) - uint64_t{1}); - const auto w0 = LoadInt(in + 0); - const auto w1 = LoadInt(in + 1); - const auto w2 = LoadInt(in + 2); - const auto w3 = LoadInt(in + 3); - const auto w4 = LoadInt(in + 4); - const auto w5 = LoadInt(in + 5); - const auto w6 = LoadInt(in + 6); - const auto w7 = LoadInt(in + 7); - const auto w8 = LoadInt(in + 8); - const auto w9 = LoadInt(in + 9); - const auto w10 = LoadInt(in + 10); - const auto w11 = LoadInt(in + 11); - const auto w12 = LoadInt(in + 12); - const auto w13 = LoadInt(in + 13); - const auto w14 = LoadInt(in + 14); - const auto w15 = LoadInt(in + 15); - const auto w16 = LoadInt(in + 16); - const auto w17 = LoadInt(in + 17); - const auto w18 = static_cast(LoadInt(reinterpret_cast(in + 18))); + const auto w0 = LoadInt(in + 0 * 8); + const auto w1 = LoadInt(in + 1 * 8); + const auto w2 = LoadInt(in + 2 * 8); + const auto w3 = LoadInt(in + 3 * 8); + const auto w4 = LoadInt(in + 4 * 8); + const auto w5 = LoadInt(in + 5 * 8); + const auto w6 = LoadInt(in + 6 * 8); + const auto w7 = LoadInt(in + 7 * 8); + const auto w8 = LoadInt(in + 8 * 8); + const auto w9 = LoadInt(in + 9 * 8); + const auto w10 = LoadInt(in + 10 * 8); + const auto w11 = LoadInt(in + 11 * 8); + const auto w12 = LoadInt(in + 12 * 8); + const auto w13 = LoadInt(in + 13 * 8); + const auto w14 = LoadInt(in + 14 * 8); + const auto w15 = LoadInt(in + 15 * 8); + const auto w16 = LoadInt(in + 16 * 8); + const auto w17 = LoadInt(in + 17 * 8); + const auto w18 = static_cast(LoadInt(in + 18 * 8)); out[0] = (w0) & mask; out[1] = ((w0 >> 37) | (w1 << 27)) & mask; out[2] = (w1 >> 10) & mask; @@ -3566,31 +3566,31 @@ inline const uint64_t* unpack37_64(const uint64_t* in, uint64_t* out){ out[30] = (w17 >> 22) & mask; out[31] = ((w17 >> 59) | (w18 << 5)) & mask; - return in + (18); + return in + (18 * 8 + 4); } -inline const uint64_t* unpack38_64(const uint64_t* in, uint64_t* out){ +inline const uint8_t* unpack38_64(const uint8_t* in, uint64_t* out){ constexpr uint64_t mask = ((uint64_t{1} << 38) - uint64_t{1}); - const auto w0 = LoadInt(in + 0); - const auto w1 = LoadInt(in + 1); - const auto w2 = LoadInt(in + 2); - const auto w3 = LoadInt(in + 3); - const auto w4 = LoadInt(in + 4); - const auto w5 = LoadInt(in + 5); - const auto w6 = LoadInt(in + 6); - const auto w7 = LoadInt(in + 7); - const auto w8 = LoadInt(in + 8); - const auto w9 = LoadInt(in + 9); - const auto w10 = LoadInt(in + 10); - const auto w11 = LoadInt(in + 11); - const auto w12 = LoadInt(in + 12); - const auto w13 = LoadInt(in + 13); - const auto w14 = LoadInt(in + 14); - const auto w15 = LoadInt(in + 15); - const auto w16 = LoadInt(in + 16); - const auto w17 = LoadInt(in + 17); - const auto w18 = LoadInt(in + 18 ); + const auto w0 = LoadInt(in + 0 * 8); + const auto w1 = LoadInt(in + 1 * 8); + const auto w2 = LoadInt(in + 2 * 8); + const auto w3 = LoadInt(in + 3 * 8); + const auto w4 = LoadInt(in + 4 * 8); + const auto w5 = LoadInt(in + 5 * 8); + const auto w6 = LoadInt(in + 6 * 8); + const auto w7 = LoadInt(in + 7 * 8); + const auto w8 = LoadInt(in + 8 * 8); + const auto w9 = LoadInt(in + 9 * 8); + const auto w10 = LoadInt(in + 10 * 8); + const auto w11 = LoadInt(in + 11 * 8); + const auto w12 = LoadInt(in + 12 * 8); + const auto w13 = LoadInt(in + 13 * 8); + const auto w14 = LoadInt(in + 14 * 8); + const auto w15 = LoadInt(in + 15 * 8); + const auto w16 = LoadInt(in + 16 * 8); + const auto w17 = LoadInt(in + 17 * 8); + const auto w18 = LoadInt(in + 18 * 8); out[0] = (w0) & mask; out[1] = ((w0 >> 38) | (w1 << 26)) & mask; out[2] = (w1 >> 12) & mask; @@ -3624,32 +3624,32 @@ inline const uint64_t* unpack38_64(const uint64_t* in, uint64_t* out){ out[30] = ((w17 >> 52) | (w18 << 12)) & mask; out[31] = w18 >> 26; - return in + (19 ); + return in + (19 * 8); } -inline const uint64_t* unpack39_64(const uint64_t* in, uint64_t* out){ +inline const uint8_t* unpack39_64(const uint8_t* in, uint64_t* out){ constexpr uint64_t mask = ((uint64_t{1} << 39) - uint64_t{1}); - const auto w0 = LoadInt(in + 0); - const auto w1 = LoadInt(in + 1); - const auto w2 = LoadInt(in + 2); - const auto w3 = LoadInt(in + 3); - const auto w4 = LoadInt(in + 4); - const auto w5 = LoadInt(in + 5); - const auto w6 = LoadInt(in + 6); - const auto w7 = LoadInt(in + 7); - const auto w8 = LoadInt(in + 8); - const auto w9 = LoadInt(in + 9); - const auto w10 = LoadInt(in + 10); - const auto w11 = LoadInt(in + 11); - const auto w12 = LoadInt(in + 12); - const auto w13 = LoadInt(in + 13); - const auto w14 = LoadInt(in + 14); - const auto w15 = LoadInt(in + 15); - const auto w16 = LoadInt(in + 16); - const auto w17 = LoadInt(in + 17); - const auto w18 = LoadInt(in + 18); - const auto w19 = static_cast(LoadInt(reinterpret_cast(in + 19))); + const auto w0 = LoadInt(in + 0 * 8); + const auto w1 = LoadInt(in + 1 * 8); + const auto w2 = LoadInt(in + 2 * 8); + const auto w3 = LoadInt(in + 3 * 8); + const auto w4 = LoadInt(in + 4 * 8); + const auto w5 = LoadInt(in + 5 * 8); + const auto w6 = LoadInt(in + 6 * 8); + const auto w7 = LoadInt(in + 7 * 8); + const auto w8 = LoadInt(in + 8 * 8); + const auto w9 = LoadInt(in + 9 * 8); + const auto w10 = LoadInt(in + 10 * 8); + const auto w11 = LoadInt(in + 11 * 8); + const auto w12 = LoadInt(in + 12 * 8); + const auto w13 = LoadInt(in + 13 * 8); + const auto w14 = LoadInt(in + 14 * 8); + const auto w15 = LoadInt(in + 15 * 8); + const auto w16 = LoadInt(in + 16 * 8); + const auto w17 = LoadInt(in + 17 * 8); + const auto w18 = LoadInt(in + 18 * 8); + const auto w19 = static_cast(LoadInt(in + 19 * 8)); out[0] = (w0) & mask; out[1] = ((w0 >> 39) | (w1 << 25)) & mask; out[2] = (w1 >> 14) & mask; @@ -3683,32 +3683,32 @@ inline const uint64_t* unpack39_64(const uint64_t* in, uint64_t* out){ out[30] = (w18 >> 18) & mask; out[31] = ((w18 >> 57) | (w19 << 7)) & mask; - return in + (19); + return in + (19 * 8 + 4); } -inline const uint64_t* unpack40_64(const uint64_t* in, uint64_t* out){ +inline const uint8_t* unpack40_64(const uint8_t* in, uint64_t* out){ constexpr uint64_t mask = ((uint64_t{1} << 40) - uint64_t{1}); - const auto w0 = LoadInt(in + 0); - const auto w1 = LoadInt(in + 1); - const auto w2 = LoadInt(in + 2); - const auto w3 = LoadInt(in + 3); - const auto w4 = LoadInt(in + 4); - const auto w5 = LoadInt(in + 5); - const auto w6 = LoadInt(in + 6); - const auto w7 = LoadInt(in + 7); - const auto w8 = LoadInt(in + 8); - const auto w9 = LoadInt(in + 9); - const auto w10 = LoadInt(in + 10); - const auto w11 = LoadInt(in + 11); - const auto w12 = LoadInt(in + 12); - const auto w13 = LoadInt(in + 13); - const auto w14 = LoadInt(in + 14); - const auto w15 = LoadInt(in + 15); - const auto w16 = LoadInt(in + 16); - const auto w17 = LoadInt(in + 17); - const auto w18 = LoadInt(in + 18); - const auto w19 = LoadInt(in + 19 ); + const auto w0 = LoadInt(in + 0 * 8); + const auto w1 = LoadInt(in + 1 * 8); + const auto w2 = LoadInt(in + 2 * 8); + const auto w3 = LoadInt(in + 3 * 8); + const auto w4 = LoadInt(in + 4 * 8); + const auto w5 = LoadInt(in + 5 * 8); + const auto w6 = LoadInt(in + 6 * 8); + const auto w7 = LoadInt(in + 7 * 8); + const auto w8 = LoadInt(in + 8 * 8); + const auto w9 = LoadInt(in + 9 * 8); + const auto w10 = LoadInt(in + 10 * 8); + const auto w11 = LoadInt(in + 11 * 8); + const auto w12 = LoadInt(in + 12 * 8); + const auto w13 = LoadInt(in + 13 * 8); + const auto w14 = LoadInt(in + 14 * 8); + const auto w15 = LoadInt(in + 15 * 8); + const auto w16 = LoadInt(in + 16 * 8); + const auto w17 = LoadInt(in + 17 * 8); + const auto w18 = LoadInt(in + 18 * 8); + const auto w19 = LoadInt(in + 19 * 8); out[0] = (w0) & mask; out[1] = ((w0 >> 40) | (w1 << 24)) & mask; out[2] = (w1 >> 16) & mask; @@ -3742,33 +3742,33 @@ inline const uint64_t* unpack40_64(const uint64_t* in, uint64_t* out){ out[30] = ((w18 >> 48) | (w19 << 16)) & mask; out[31] = w19 >> 24; - return in + (20 ); + return in + (20 * 8); } -inline const uint64_t* unpack41_64(const uint64_t* in, uint64_t* out){ +inline const uint8_t* unpack41_64(const uint8_t* in, uint64_t* out){ constexpr uint64_t mask = ((uint64_t{1} << 41) - uint64_t{1}); - const auto w0 = LoadInt(in + 0); - const auto w1 = LoadInt(in + 1); - const auto w2 = LoadInt(in + 2); - const auto w3 = LoadInt(in + 3); - const auto w4 = LoadInt(in + 4); - const auto w5 = LoadInt(in + 5); - const auto w6 = LoadInt(in + 6); - const auto w7 = LoadInt(in + 7); - const auto w8 = LoadInt(in + 8); - const auto w9 = LoadInt(in + 9); - const auto w10 = LoadInt(in + 10); - const auto w11 = LoadInt(in + 11); - const auto w12 = LoadInt(in + 12); - const auto w13 = LoadInt(in + 13); - const auto w14 = LoadInt(in + 14); - const auto w15 = LoadInt(in + 15); - const auto w16 = LoadInt(in + 16); - const auto w17 = LoadInt(in + 17); - const auto w18 = LoadInt(in + 18); - const auto w19 = LoadInt(in + 19); - const auto w20 = static_cast(LoadInt(reinterpret_cast(in + 20))); + const auto w0 = LoadInt(in + 0 * 8); + const auto w1 = LoadInt(in + 1 * 8); + const auto w2 = LoadInt(in + 2 * 8); + const auto w3 = LoadInt(in + 3 * 8); + const auto w4 = LoadInt(in + 4 * 8); + const auto w5 = LoadInt(in + 5 * 8); + const auto w6 = LoadInt(in + 6 * 8); + const auto w7 = LoadInt(in + 7 * 8); + const auto w8 = LoadInt(in + 8 * 8); + const auto w9 = LoadInt(in + 9 * 8); + const auto w10 = LoadInt(in + 10 * 8); + const auto w11 = LoadInt(in + 11 * 8); + const auto w12 = LoadInt(in + 12 * 8); + const auto w13 = LoadInt(in + 13 * 8); + const auto w14 = LoadInt(in + 14 * 8); + const auto w15 = LoadInt(in + 15 * 8); + const auto w16 = LoadInt(in + 16 * 8); + const auto w17 = LoadInt(in + 17 * 8); + const auto w18 = LoadInt(in + 18 * 8); + const auto w19 = LoadInt(in + 19 * 8); + const auto w20 = static_cast(LoadInt(in + 20 * 8)); out[0] = (w0) & mask; out[1] = ((w0 >> 41) | (w1 << 23)) & mask; out[2] = (w1 >> 18) & mask; @@ -3802,33 +3802,33 @@ inline const uint64_t* unpack41_64(const uint64_t* in, uint64_t* out){ out[30] = (w19 >> 14) & mask; out[31] = ((w19 >> 55) | (w20 << 9)) & mask; - return in + (20); + return in + (20 * 8 + 4); } -inline const uint64_t* unpack42_64(const uint64_t* in, uint64_t* out){ +inline const uint8_t* unpack42_64(const uint8_t* in, uint64_t* out){ constexpr uint64_t mask = ((uint64_t{1} << 42) - uint64_t{1}); - const auto w0 = LoadInt(in + 0); - const auto w1 = LoadInt(in + 1); - const auto w2 = LoadInt(in + 2); - const auto w3 = LoadInt(in + 3); - const auto w4 = LoadInt(in + 4); - const auto w5 = LoadInt(in + 5); - const auto w6 = LoadInt(in + 6); - const auto w7 = LoadInt(in + 7); - const auto w8 = LoadInt(in + 8); - const auto w9 = LoadInt(in + 9); - const auto w10 = LoadInt(in + 10); - const auto w11 = LoadInt(in + 11); - const auto w12 = LoadInt(in + 12); - const auto w13 = LoadInt(in + 13); - const auto w14 = LoadInt(in + 14); - const auto w15 = LoadInt(in + 15); - const auto w16 = LoadInt(in + 16); - const auto w17 = LoadInt(in + 17); - const auto w18 = LoadInt(in + 18); - const auto w19 = LoadInt(in + 19); - const auto w20 = LoadInt(in + 20 ); + const auto w0 = LoadInt(in + 0 * 8); + const auto w1 = LoadInt(in + 1 * 8); + const auto w2 = LoadInt(in + 2 * 8); + const auto w3 = LoadInt(in + 3 * 8); + const auto w4 = LoadInt(in + 4 * 8); + const auto w5 = LoadInt(in + 5 * 8); + const auto w6 = LoadInt(in + 6 * 8); + const auto w7 = LoadInt(in + 7 * 8); + const auto w8 = LoadInt(in + 8 * 8); + const auto w9 = LoadInt(in + 9 * 8); + const auto w10 = LoadInt(in + 10 * 8); + const auto w11 = LoadInt(in + 11 * 8); + const auto w12 = LoadInt(in + 12 * 8); + const auto w13 = LoadInt(in + 13 * 8); + const auto w14 = LoadInt(in + 14 * 8); + const auto w15 = LoadInt(in + 15 * 8); + const auto w16 = LoadInt(in + 16 * 8); + const auto w17 = LoadInt(in + 17 * 8); + const auto w18 = LoadInt(in + 18 * 8); + const auto w19 = LoadInt(in + 19 * 8); + const auto w20 = LoadInt(in + 20 * 8); out[0] = (w0) & mask; out[1] = ((w0 >> 42) | (w1 << 22)) & mask; out[2] = (w1 >> 20) & mask; @@ -3862,34 +3862,34 @@ inline const uint64_t* unpack42_64(const uint64_t* in, uint64_t* out){ out[30] = ((w19 >> 44) | (w20 << 20)) & mask; out[31] = w20 >> 22; - return in + (21 ); + return in + (21 * 8); } -inline const uint64_t* unpack43_64(const uint64_t* in, uint64_t* out){ +inline const uint8_t* unpack43_64(const uint8_t* in, uint64_t* out){ constexpr uint64_t mask = ((uint64_t{1} << 43) - uint64_t{1}); - const auto w0 = LoadInt(in + 0); - const auto w1 = LoadInt(in + 1); - const auto w2 = LoadInt(in + 2); - const auto w3 = LoadInt(in + 3); - const auto w4 = LoadInt(in + 4); - const auto w5 = LoadInt(in + 5); - const auto w6 = LoadInt(in + 6); - const auto w7 = LoadInt(in + 7); - const auto w8 = LoadInt(in + 8); - const auto w9 = LoadInt(in + 9); - const auto w10 = LoadInt(in + 10); - const auto w11 = LoadInt(in + 11); - const auto w12 = LoadInt(in + 12); - const auto w13 = LoadInt(in + 13); - const auto w14 = LoadInt(in + 14); - const auto w15 = LoadInt(in + 15); - const auto w16 = LoadInt(in + 16); - const auto w17 = LoadInt(in + 17); - const auto w18 = LoadInt(in + 18); - const auto w19 = LoadInt(in + 19); - const auto w20 = LoadInt(in + 20); - const auto w21 = static_cast(LoadInt(reinterpret_cast(in + 21))); + const auto w0 = LoadInt(in + 0 * 8); + const auto w1 = LoadInt(in + 1 * 8); + const auto w2 = LoadInt(in + 2 * 8); + const auto w3 = LoadInt(in + 3 * 8); + const auto w4 = LoadInt(in + 4 * 8); + const auto w5 = LoadInt(in + 5 * 8); + const auto w6 = LoadInt(in + 6 * 8); + const auto w7 = LoadInt(in + 7 * 8); + const auto w8 = LoadInt(in + 8 * 8); + const auto w9 = LoadInt(in + 9 * 8); + const auto w10 = LoadInt(in + 10 * 8); + const auto w11 = LoadInt(in + 11 * 8); + const auto w12 = LoadInt(in + 12 * 8); + const auto w13 = LoadInt(in + 13 * 8); + const auto w14 = LoadInt(in + 14 * 8); + const auto w15 = LoadInt(in + 15 * 8); + const auto w16 = LoadInt(in + 16 * 8); + const auto w17 = LoadInt(in + 17 * 8); + const auto w18 = LoadInt(in + 18 * 8); + const auto w19 = LoadInt(in + 19 * 8); + const auto w20 = LoadInt(in + 20 * 8); + const auto w21 = static_cast(LoadInt(in + 21 * 8)); out[0] = (w0) & mask; out[1] = ((w0 >> 43) | (w1 << 21)) & mask; out[2] = ((w1 >> 22) | (w2 << 42)) & mask; @@ -3923,34 +3923,34 @@ inline const uint64_t* unpack43_64(const uint64_t* in, uint64_t* out){ out[30] = (w20 >> 10) & mask; out[31] = ((w20 >> 53) | (w21 << 11)) & mask; - return in + (21); + return in + (21 * 8 + 4); } -inline const uint64_t* unpack44_64(const uint64_t* in, uint64_t* out){ +inline const uint8_t* unpack44_64(const uint8_t* in, uint64_t* out){ constexpr uint64_t mask = ((uint64_t{1} << 44) - uint64_t{1}); - const auto w0 = LoadInt(in + 0); - const auto w1 = LoadInt(in + 1); - const auto w2 = LoadInt(in + 2); - const auto w3 = LoadInt(in + 3); - const auto w4 = LoadInt(in + 4); - const auto w5 = LoadInt(in + 5); - const auto w6 = LoadInt(in + 6); - const auto w7 = LoadInt(in + 7); - const auto w8 = LoadInt(in + 8); - const auto w9 = LoadInt(in + 9); - const auto w10 = LoadInt(in + 10); - const auto w11 = LoadInt(in + 11); - const auto w12 = LoadInt(in + 12); - const auto w13 = LoadInt(in + 13); - const auto w14 = LoadInt(in + 14); - const auto w15 = LoadInt(in + 15); - const auto w16 = LoadInt(in + 16); - const auto w17 = LoadInt(in + 17); - const auto w18 = LoadInt(in + 18); - const auto w19 = LoadInt(in + 19); - const auto w20 = LoadInt(in + 20); - const auto w21 = LoadInt(in + 21 ); + const auto w0 = LoadInt(in + 0 * 8); + const auto w1 = LoadInt(in + 1 * 8); + const auto w2 = LoadInt(in + 2 * 8); + const auto w3 = LoadInt(in + 3 * 8); + const auto w4 = LoadInt(in + 4 * 8); + const auto w5 = LoadInt(in + 5 * 8); + const auto w6 = LoadInt(in + 6 * 8); + const auto w7 = LoadInt(in + 7 * 8); + const auto w8 = LoadInt(in + 8 * 8); + const auto w9 = LoadInt(in + 9 * 8); + const auto w10 = LoadInt(in + 10 * 8); + const auto w11 = LoadInt(in + 11 * 8); + const auto w12 = LoadInt(in + 12 * 8); + const auto w13 = LoadInt(in + 13 * 8); + const auto w14 = LoadInt(in + 14 * 8); + const auto w15 = LoadInt(in + 15 * 8); + const auto w16 = LoadInt(in + 16 * 8); + const auto w17 = LoadInt(in + 17 * 8); + const auto w18 = LoadInt(in + 18 * 8); + const auto w19 = LoadInt(in + 19 * 8); + const auto w20 = LoadInt(in + 20 * 8); + const auto w21 = LoadInt(in + 21 * 8); out[0] = (w0) & mask; out[1] = ((w0 >> 44) | (w1 << 20)) & mask; out[2] = ((w1 >> 24) | (w2 << 40)) & mask; @@ -3984,35 +3984,35 @@ inline const uint64_t* unpack44_64(const uint64_t* in, uint64_t* out){ out[30] = ((w20 >> 40) | (w21 << 24)) & mask; out[31] = w21 >> 20; - return in + (22 ); + return in + (22 * 8); } -inline const uint64_t* unpack45_64(const uint64_t* in, uint64_t* out){ +inline const uint8_t* unpack45_64(const uint8_t* in, uint64_t* out){ constexpr uint64_t mask = ((uint64_t{1} << 45) - uint64_t{1}); - const auto w0 = LoadInt(in + 0); - const auto w1 = LoadInt(in + 1); - const auto w2 = LoadInt(in + 2); - const auto w3 = LoadInt(in + 3); - const auto w4 = LoadInt(in + 4); - const auto w5 = LoadInt(in + 5); - const auto w6 = LoadInt(in + 6); - const auto w7 = LoadInt(in + 7); - const auto w8 = LoadInt(in + 8); - const auto w9 = LoadInt(in + 9); - const auto w10 = LoadInt(in + 10); - const auto w11 = LoadInt(in + 11); - const auto w12 = LoadInt(in + 12); - const auto w13 = LoadInt(in + 13); - const auto w14 = LoadInt(in + 14); - const auto w15 = LoadInt(in + 15); - const auto w16 = LoadInt(in + 16); - const auto w17 = LoadInt(in + 17); - const auto w18 = LoadInt(in + 18); - const auto w19 = LoadInt(in + 19); - const auto w20 = LoadInt(in + 20); - const auto w21 = LoadInt(in + 21); - const auto w22 = static_cast(LoadInt(reinterpret_cast(in + 22))); + const auto w0 = LoadInt(in + 0 * 8); + const auto w1 = LoadInt(in + 1 * 8); + const auto w2 = LoadInt(in + 2 * 8); + const auto w3 = LoadInt(in + 3 * 8); + const auto w4 = LoadInt(in + 4 * 8); + const auto w5 = LoadInt(in + 5 * 8); + const auto w6 = LoadInt(in + 6 * 8); + const auto w7 = LoadInt(in + 7 * 8); + const auto w8 = LoadInt(in + 8 * 8); + const auto w9 = LoadInt(in + 9 * 8); + const auto w10 = LoadInt(in + 10 * 8); + const auto w11 = LoadInt(in + 11 * 8); + const auto w12 = LoadInt(in + 12 * 8); + const auto w13 = LoadInt(in + 13 * 8); + const auto w14 = LoadInt(in + 14 * 8); + const auto w15 = LoadInt(in + 15 * 8); + const auto w16 = LoadInt(in + 16 * 8); + const auto w17 = LoadInt(in + 17 * 8); + const auto w18 = LoadInt(in + 18 * 8); + const auto w19 = LoadInt(in + 19 * 8); + const auto w20 = LoadInt(in + 20 * 8); + const auto w21 = LoadInt(in + 21 * 8); + const auto w22 = static_cast(LoadInt(in + 22 * 8)); out[0] = (w0) & mask; out[1] = ((w0 >> 45) | (w1 << 19)) & mask; out[2] = ((w1 >> 26) | (w2 << 38)) & mask; @@ -4046,35 +4046,35 @@ inline const uint64_t* unpack45_64(const uint64_t* in, uint64_t* out){ out[30] = (w21 >> 6) & mask; out[31] = ((w21 >> 51) | (w22 << 13)) & mask; - return in + (22); + return in + (22 * 8 + 4); } -inline const uint64_t* unpack46_64(const uint64_t* in, uint64_t* out){ +inline const uint8_t* unpack46_64(const uint8_t* in, uint64_t* out){ constexpr uint64_t mask = ((uint64_t{1} << 46) - uint64_t{1}); - const auto w0 = LoadInt(in + 0); - const auto w1 = LoadInt(in + 1); - const auto w2 = LoadInt(in + 2); - const auto w3 = LoadInt(in + 3); - const auto w4 = LoadInt(in + 4); - const auto w5 = LoadInt(in + 5); - const auto w6 = LoadInt(in + 6); - const auto w7 = LoadInt(in + 7); - const auto w8 = LoadInt(in + 8); - const auto w9 = LoadInt(in + 9); - const auto w10 = LoadInt(in + 10); - const auto w11 = LoadInt(in + 11); - const auto w12 = LoadInt(in + 12); - const auto w13 = LoadInt(in + 13); - const auto w14 = LoadInt(in + 14); - const auto w15 = LoadInt(in + 15); - const auto w16 = LoadInt(in + 16); - const auto w17 = LoadInt(in + 17); - const auto w18 = LoadInt(in + 18); - const auto w19 = LoadInt(in + 19); - const auto w20 = LoadInt(in + 20); - const auto w21 = LoadInt(in + 21); - const auto w22 = LoadInt(in + 22 ); + const auto w0 = LoadInt(in + 0 * 8); + const auto w1 = LoadInt(in + 1 * 8); + const auto w2 = LoadInt(in + 2 * 8); + const auto w3 = LoadInt(in + 3 * 8); + const auto w4 = LoadInt(in + 4 * 8); + const auto w5 = LoadInt(in + 5 * 8); + const auto w6 = LoadInt(in + 6 * 8); + const auto w7 = LoadInt(in + 7 * 8); + const auto w8 = LoadInt(in + 8 * 8); + const auto w9 = LoadInt(in + 9 * 8); + const auto w10 = LoadInt(in + 10 * 8); + const auto w11 = LoadInt(in + 11 * 8); + const auto w12 = LoadInt(in + 12 * 8); + const auto w13 = LoadInt(in + 13 * 8); + const auto w14 = LoadInt(in + 14 * 8); + const auto w15 = LoadInt(in + 15 * 8); + const auto w16 = LoadInt(in + 16 * 8); + const auto w17 = LoadInt(in + 17 * 8); + const auto w18 = LoadInt(in + 18 * 8); + const auto w19 = LoadInt(in + 19 * 8); + const auto w20 = LoadInt(in + 20 * 8); + const auto w21 = LoadInt(in + 21 * 8); + const auto w22 = LoadInt(in + 22 * 8); out[0] = (w0) & mask; out[1] = ((w0 >> 46) | (w1 << 18)) & mask; out[2] = ((w1 >> 28) | (w2 << 36)) & mask; @@ -4108,36 +4108,36 @@ inline const uint64_t* unpack46_64(const uint64_t* in, uint64_t* out){ out[30] = ((w21 >> 36) | (w22 << 28)) & mask; out[31] = w22 >> 18; - return in + (23 ); + return in + (23 * 8); } -inline const uint64_t* unpack47_64(const uint64_t* in, uint64_t* out){ +inline const uint8_t* unpack47_64(const uint8_t* in, uint64_t* out){ constexpr uint64_t mask = ((uint64_t{1} << 47) - uint64_t{1}); - const auto w0 = LoadInt(in + 0); - const auto w1 = LoadInt(in + 1); - const auto w2 = LoadInt(in + 2); - const auto w3 = LoadInt(in + 3); - const auto w4 = LoadInt(in + 4); - const auto w5 = LoadInt(in + 5); - const auto w6 = LoadInt(in + 6); - const auto w7 = LoadInt(in + 7); - const auto w8 = LoadInt(in + 8); - const auto w9 = LoadInt(in + 9); - const auto w10 = LoadInt(in + 10); - const auto w11 = LoadInt(in + 11); - const auto w12 = LoadInt(in + 12); - const auto w13 = LoadInt(in + 13); - const auto w14 = LoadInt(in + 14); - const auto w15 = LoadInt(in + 15); - const auto w16 = LoadInt(in + 16); - const auto w17 = LoadInt(in + 17); - const auto w18 = LoadInt(in + 18); - const auto w19 = LoadInt(in + 19); - const auto w20 = LoadInt(in + 20); - const auto w21 = LoadInt(in + 21); - const auto w22 = LoadInt(in + 22); - const auto w23 = static_cast(LoadInt(reinterpret_cast(in + 23))); + const auto w0 = LoadInt(in + 0 * 8); + const auto w1 = LoadInt(in + 1 * 8); + const auto w2 = LoadInt(in + 2 * 8); + const auto w3 = LoadInt(in + 3 * 8); + const auto w4 = LoadInt(in + 4 * 8); + const auto w5 = LoadInt(in + 5 * 8); + const auto w6 = LoadInt(in + 6 * 8); + const auto w7 = LoadInt(in + 7 * 8); + const auto w8 = LoadInt(in + 8 * 8); + const auto w9 = LoadInt(in + 9 * 8); + const auto w10 = LoadInt(in + 10 * 8); + const auto w11 = LoadInt(in + 11 * 8); + const auto w12 = LoadInt(in + 12 * 8); + const auto w13 = LoadInt(in + 13 * 8); + const auto w14 = LoadInt(in + 14 * 8); + const auto w15 = LoadInt(in + 15 * 8); + const auto w16 = LoadInt(in + 16 * 8); + const auto w17 = LoadInt(in + 17 * 8); + const auto w18 = LoadInt(in + 18 * 8); + const auto w19 = LoadInt(in + 19 * 8); + const auto w20 = LoadInt(in + 20 * 8); + const auto w21 = LoadInt(in + 21 * 8); + const auto w22 = LoadInt(in + 22 * 8); + const auto w23 = static_cast(LoadInt(in + 23 * 8)); out[0] = (w0) & mask; out[1] = ((w0 >> 47) | (w1 << 17)) & mask; out[2] = ((w1 >> 30) | (w2 << 34)) & mask; @@ -4171,36 +4171,36 @@ inline const uint64_t* unpack47_64(const uint64_t* in, uint64_t* out){ out[30] = (w22 >> 2) & mask; out[31] = ((w22 >> 49) | (w23 << 15)) & mask; - return in + (23); + return in + (23 * 8 + 4); } -inline const uint64_t* unpack48_64(const uint64_t* in, uint64_t* out){ +inline const uint8_t* unpack48_64(const uint8_t* in, uint64_t* out){ constexpr uint64_t mask = ((uint64_t{1} << 48) - uint64_t{1}); - const auto w0 = LoadInt(in + 0); - const auto w1 = LoadInt(in + 1); - const auto w2 = LoadInt(in + 2); - const auto w3 = LoadInt(in + 3); - const auto w4 = LoadInt(in + 4); - const auto w5 = LoadInt(in + 5); - const auto w6 = LoadInt(in + 6); - const auto w7 = LoadInt(in + 7); - const auto w8 = LoadInt(in + 8); - const auto w9 = LoadInt(in + 9); - const auto w10 = LoadInt(in + 10); - const auto w11 = LoadInt(in + 11); - const auto w12 = LoadInt(in + 12); - const auto w13 = LoadInt(in + 13); - const auto w14 = LoadInt(in + 14); - const auto w15 = LoadInt(in + 15); - const auto w16 = LoadInt(in + 16); - const auto w17 = LoadInt(in + 17); - const auto w18 = LoadInt(in + 18); - const auto w19 = LoadInt(in + 19); - const auto w20 = LoadInt(in + 20); - const auto w21 = LoadInt(in + 21); - const auto w22 = LoadInt(in + 22); - const auto w23 = LoadInt(in + 23 ); + const auto w0 = LoadInt(in + 0 * 8); + const auto w1 = LoadInt(in + 1 * 8); + const auto w2 = LoadInt(in + 2 * 8); + const auto w3 = LoadInt(in + 3 * 8); + const auto w4 = LoadInt(in + 4 * 8); + const auto w5 = LoadInt(in + 5 * 8); + const auto w6 = LoadInt(in + 6 * 8); + const auto w7 = LoadInt(in + 7 * 8); + const auto w8 = LoadInt(in + 8 * 8); + const auto w9 = LoadInt(in + 9 * 8); + const auto w10 = LoadInt(in + 10 * 8); + const auto w11 = LoadInt(in + 11 * 8); + const auto w12 = LoadInt(in + 12 * 8); + const auto w13 = LoadInt(in + 13 * 8); + const auto w14 = LoadInt(in + 14 * 8); + const auto w15 = LoadInt(in + 15 * 8); + const auto w16 = LoadInt(in + 16 * 8); + const auto w17 = LoadInt(in + 17 * 8); + const auto w18 = LoadInt(in + 18 * 8); + const auto w19 = LoadInt(in + 19 * 8); + const auto w20 = LoadInt(in + 20 * 8); + const auto w21 = LoadInt(in + 21 * 8); + const auto w22 = LoadInt(in + 22 * 8); + const auto w23 = LoadInt(in + 23 * 8); out[0] = (w0) & mask; out[1] = ((w0 >> 48) | (w1 << 16)) & mask; out[2] = ((w1 >> 32) | (w2 << 32)) & mask; @@ -4234,37 +4234,37 @@ inline const uint64_t* unpack48_64(const uint64_t* in, uint64_t* out){ out[30] = ((w22 >> 32) | (w23 << 32)) & mask; out[31] = w23 >> 16; - return in + (24 ); + return in + (24 * 8); } -inline const uint64_t* unpack49_64(const uint64_t* in, uint64_t* out){ +inline const uint8_t* unpack49_64(const uint8_t* in, uint64_t* out){ constexpr uint64_t mask = ((uint64_t{1} << 49) - uint64_t{1}); - const auto w0 = LoadInt(in + 0); - const auto w1 = LoadInt(in + 1); - const auto w2 = LoadInt(in + 2); - const auto w3 = LoadInt(in + 3); - const auto w4 = LoadInt(in + 4); - const auto w5 = LoadInt(in + 5); - const auto w6 = LoadInt(in + 6); - const auto w7 = LoadInt(in + 7); - const auto w8 = LoadInt(in + 8); - const auto w9 = LoadInt(in + 9); - const auto w10 = LoadInt(in + 10); - const auto w11 = LoadInt(in + 11); - const auto w12 = LoadInt(in + 12); - const auto w13 = LoadInt(in + 13); - const auto w14 = LoadInt(in + 14); - const auto w15 = LoadInt(in + 15); - const auto w16 = LoadInt(in + 16); - const auto w17 = LoadInt(in + 17); - const auto w18 = LoadInt(in + 18); - const auto w19 = LoadInt(in + 19); - const auto w20 = LoadInt(in + 20); - const auto w21 = LoadInt(in + 21); - const auto w22 = LoadInt(in + 22); - const auto w23 = LoadInt(in + 23); - const auto w24 = static_cast(LoadInt(reinterpret_cast(in + 24))); + const auto w0 = LoadInt(in + 0 * 8); + const auto w1 = LoadInt(in + 1 * 8); + const auto w2 = LoadInt(in + 2 * 8); + const auto w3 = LoadInt(in + 3 * 8); + const auto w4 = LoadInt(in + 4 * 8); + const auto w5 = LoadInt(in + 5 * 8); + const auto w6 = LoadInt(in + 6 * 8); + const auto w7 = LoadInt(in + 7 * 8); + const auto w8 = LoadInt(in + 8 * 8); + const auto w9 = LoadInt(in + 9 * 8); + const auto w10 = LoadInt(in + 10 * 8); + const auto w11 = LoadInt(in + 11 * 8); + const auto w12 = LoadInt(in + 12 * 8); + const auto w13 = LoadInt(in + 13 * 8); + const auto w14 = LoadInt(in + 14 * 8); + const auto w15 = LoadInt(in + 15 * 8); + const auto w16 = LoadInt(in + 16 * 8); + const auto w17 = LoadInt(in + 17 * 8); + const auto w18 = LoadInt(in + 18 * 8); + const auto w19 = LoadInt(in + 19 * 8); + const auto w20 = LoadInt(in + 20 * 8); + const auto w21 = LoadInt(in + 21 * 8); + const auto w22 = LoadInt(in + 22 * 8); + const auto w23 = LoadInt(in + 23 * 8); + const auto w24 = static_cast(LoadInt(in + 24 * 8)); out[0] = (w0) & mask; out[1] = ((w0 >> 49) | (w1 << 15)) & mask; out[2] = ((w1 >> 34) | (w2 << 30)) & mask; @@ -4298,37 +4298,37 @@ inline const uint64_t* unpack49_64(const uint64_t* in, uint64_t* out){ out[30] = ((w22 >> 62) | (w23 << 2)) & mask; out[31] = ((w23 >> 47) | (w24 << 17)) & mask; - return in + (24); + return in + (24 * 8 + 4); } -inline const uint64_t* unpack50_64(const uint64_t* in, uint64_t* out){ +inline const uint8_t* unpack50_64(const uint8_t* in, uint64_t* out){ constexpr uint64_t mask = ((uint64_t{1} << 50) - uint64_t{1}); - const auto w0 = LoadInt(in + 0); - const auto w1 = LoadInt(in + 1); - const auto w2 = LoadInt(in + 2); - const auto w3 = LoadInt(in + 3); - const auto w4 = LoadInt(in + 4); - const auto w5 = LoadInt(in + 5); - const auto w6 = LoadInt(in + 6); - const auto w7 = LoadInt(in + 7); - const auto w8 = LoadInt(in + 8); - const auto w9 = LoadInt(in + 9); - const auto w10 = LoadInt(in + 10); - const auto w11 = LoadInt(in + 11); - const auto w12 = LoadInt(in + 12); - const auto w13 = LoadInt(in + 13); - const auto w14 = LoadInt(in + 14); - const auto w15 = LoadInt(in + 15); - const auto w16 = LoadInt(in + 16); - const auto w17 = LoadInt(in + 17); - const auto w18 = LoadInt(in + 18); - const auto w19 = LoadInt(in + 19); - const auto w20 = LoadInt(in + 20); - const auto w21 = LoadInt(in + 21); - const auto w22 = LoadInt(in + 22); - const auto w23 = LoadInt(in + 23); - const auto w24 = LoadInt(in + 24 ); + const auto w0 = LoadInt(in + 0 * 8); + const auto w1 = LoadInt(in + 1 * 8); + const auto w2 = LoadInt(in + 2 * 8); + const auto w3 = LoadInt(in + 3 * 8); + const auto w4 = LoadInt(in + 4 * 8); + const auto w5 = LoadInt(in + 5 * 8); + const auto w6 = LoadInt(in + 6 * 8); + const auto w7 = LoadInt(in + 7 * 8); + const auto w8 = LoadInt(in + 8 * 8); + const auto w9 = LoadInt(in + 9 * 8); + const auto w10 = LoadInt(in + 10 * 8); + const auto w11 = LoadInt(in + 11 * 8); + const auto w12 = LoadInt(in + 12 * 8); + const auto w13 = LoadInt(in + 13 * 8); + const auto w14 = LoadInt(in + 14 * 8); + const auto w15 = LoadInt(in + 15 * 8); + const auto w16 = LoadInt(in + 16 * 8); + const auto w17 = LoadInt(in + 17 * 8); + const auto w18 = LoadInt(in + 18 * 8); + const auto w19 = LoadInt(in + 19 * 8); + const auto w20 = LoadInt(in + 20 * 8); + const auto w21 = LoadInt(in + 21 * 8); + const auto w22 = LoadInt(in + 22 * 8); + const auto w23 = LoadInt(in + 23 * 8); + const auto w24 = LoadInt(in + 24 * 8); out[0] = (w0) & mask; out[1] = ((w0 >> 50) | (w1 << 14)) & mask; out[2] = ((w1 >> 36) | (w2 << 28)) & mask; @@ -4362,38 +4362,38 @@ inline const uint64_t* unpack50_64(const uint64_t* in, uint64_t* out){ out[30] = ((w23 >> 28) | (w24 << 36)) & mask; out[31] = w24 >> 14; - return in + (25 ); + return in + (25 * 8); } -inline const uint64_t* unpack51_64(const uint64_t* in, uint64_t* out){ +inline const uint8_t* unpack51_64(const uint8_t* in, uint64_t* out){ constexpr uint64_t mask = ((uint64_t{1} << 51) - uint64_t{1}); - const auto w0 = LoadInt(in + 0); - const auto w1 = LoadInt(in + 1); - const auto w2 = LoadInt(in + 2); - const auto w3 = LoadInt(in + 3); - const auto w4 = LoadInt(in + 4); - const auto w5 = LoadInt(in + 5); - const auto w6 = LoadInt(in + 6); - const auto w7 = LoadInt(in + 7); - const auto w8 = LoadInt(in + 8); - const auto w9 = LoadInt(in + 9); - const auto w10 = LoadInt(in + 10); - const auto w11 = LoadInt(in + 11); - const auto w12 = LoadInt(in + 12); - const auto w13 = LoadInt(in + 13); - const auto w14 = LoadInt(in + 14); - const auto w15 = LoadInt(in + 15); - const auto w16 = LoadInt(in + 16); - const auto w17 = LoadInt(in + 17); - const auto w18 = LoadInt(in + 18); - const auto w19 = LoadInt(in + 19); - const auto w20 = LoadInt(in + 20); - const auto w21 = LoadInt(in + 21); - const auto w22 = LoadInt(in + 22); - const auto w23 = LoadInt(in + 23); - const auto w24 = LoadInt(in + 24); - const auto w25 = static_cast(LoadInt(reinterpret_cast(in + 25))); + const auto w0 = LoadInt(in + 0 * 8); + const auto w1 = LoadInt(in + 1 * 8); + const auto w2 = LoadInt(in + 2 * 8); + const auto w3 = LoadInt(in + 3 * 8); + const auto w4 = LoadInt(in + 4 * 8); + const auto w5 = LoadInt(in + 5 * 8); + const auto w6 = LoadInt(in + 6 * 8); + const auto w7 = LoadInt(in + 7 * 8); + const auto w8 = LoadInt(in + 8 * 8); + const auto w9 = LoadInt(in + 9 * 8); + const auto w10 = LoadInt(in + 10 * 8); + const auto w11 = LoadInt(in + 11 * 8); + const auto w12 = LoadInt(in + 12 * 8); + const auto w13 = LoadInt(in + 13 * 8); + const auto w14 = LoadInt(in + 14 * 8); + const auto w15 = LoadInt(in + 15 * 8); + const auto w16 = LoadInt(in + 16 * 8); + const auto w17 = LoadInt(in + 17 * 8); + const auto w18 = LoadInt(in + 18 * 8); + const auto w19 = LoadInt(in + 19 * 8); + const auto w20 = LoadInt(in + 20 * 8); + const auto w21 = LoadInt(in + 21 * 8); + const auto w22 = LoadInt(in + 22 * 8); + const auto w23 = LoadInt(in + 23 * 8); + const auto w24 = LoadInt(in + 24 * 8); + const auto w25 = static_cast(LoadInt(in + 25 * 8)); out[0] = (w0) & mask; out[1] = ((w0 >> 51) | (w1 << 13)) & mask; out[2] = ((w1 >> 38) | (w2 << 26)) & mask; @@ -4427,38 +4427,38 @@ inline const uint64_t* unpack51_64(const uint64_t* in, uint64_t* out){ out[30] = ((w23 >> 58) | (w24 << 6)) & mask; out[31] = ((w24 >> 45) | (w25 << 19)) & mask; - return in + (25); + return in + (25 * 8 + 4); } -inline const uint64_t* unpack52_64(const uint64_t* in, uint64_t* out){ +inline const uint8_t* unpack52_64(const uint8_t* in, uint64_t* out){ constexpr uint64_t mask = ((uint64_t{1} << 52) - uint64_t{1}); - const auto w0 = LoadInt(in + 0); - const auto w1 = LoadInt(in + 1); - const auto w2 = LoadInt(in + 2); - const auto w3 = LoadInt(in + 3); - const auto w4 = LoadInt(in + 4); - const auto w5 = LoadInt(in + 5); - const auto w6 = LoadInt(in + 6); - const auto w7 = LoadInt(in + 7); - const auto w8 = LoadInt(in + 8); - const auto w9 = LoadInt(in + 9); - const auto w10 = LoadInt(in + 10); - const auto w11 = LoadInt(in + 11); - const auto w12 = LoadInt(in + 12); - const auto w13 = LoadInt(in + 13); - const auto w14 = LoadInt(in + 14); - const auto w15 = LoadInt(in + 15); - const auto w16 = LoadInt(in + 16); - const auto w17 = LoadInt(in + 17); - const auto w18 = LoadInt(in + 18); - const auto w19 = LoadInt(in + 19); - const auto w20 = LoadInt(in + 20); - const auto w21 = LoadInt(in + 21); - const auto w22 = LoadInt(in + 22); - const auto w23 = LoadInt(in + 23); - const auto w24 = LoadInt(in + 24); - const auto w25 = LoadInt(in + 25 ); + const auto w0 = LoadInt(in + 0 * 8); + const auto w1 = LoadInt(in + 1 * 8); + const auto w2 = LoadInt(in + 2 * 8); + const auto w3 = LoadInt(in + 3 * 8); + const auto w4 = LoadInt(in + 4 * 8); + const auto w5 = LoadInt(in + 5 * 8); + const auto w6 = LoadInt(in + 6 * 8); + const auto w7 = LoadInt(in + 7 * 8); + const auto w8 = LoadInt(in + 8 * 8); + const auto w9 = LoadInt(in + 9 * 8); + const auto w10 = LoadInt(in + 10 * 8); + const auto w11 = LoadInt(in + 11 * 8); + const auto w12 = LoadInt(in + 12 * 8); + const auto w13 = LoadInt(in + 13 * 8); + const auto w14 = LoadInt(in + 14 * 8); + const auto w15 = LoadInt(in + 15 * 8); + const auto w16 = LoadInt(in + 16 * 8); + const auto w17 = LoadInt(in + 17 * 8); + const auto w18 = LoadInt(in + 18 * 8); + const auto w19 = LoadInt(in + 19 * 8); + const auto w20 = LoadInt(in + 20 * 8); + const auto w21 = LoadInt(in + 21 * 8); + const auto w22 = LoadInt(in + 22 * 8); + const auto w23 = LoadInt(in + 23 * 8); + const auto w24 = LoadInt(in + 24 * 8); + const auto w25 = LoadInt(in + 25 * 8); out[0] = (w0) & mask; out[1] = ((w0 >> 52) | (w1 << 12)) & mask; out[2] = ((w1 >> 40) | (w2 << 24)) & mask; @@ -4492,39 +4492,39 @@ inline const uint64_t* unpack52_64(const uint64_t* in, uint64_t* out){ out[30] = ((w24 >> 24) | (w25 << 40)) & mask; out[31] = w25 >> 12; - return in + (26 ); + return in + (26 * 8); } -inline const uint64_t* unpack53_64(const uint64_t* in, uint64_t* out){ +inline const uint8_t* unpack53_64(const uint8_t* in, uint64_t* out){ constexpr uint64_t mask = ((uint64_t{1} << 53) - uint64_t{1}); - const auto w0 = LoadInt(in + 0); - const auto w1 = LoadInt(in + 1); - const auto w2 = LoadInt(in + 2); - const auto w3 = LoadInt(in + 3); - const auto w4 = LoadInt(in + 4); - const auto w5 = LoadInt(in + 5); - const auto w6 = LoadInt(in + 6); - const auto w7 = LoadInt(in + 7); - const auto w8 = LoadInt(in + 8); - const auto w9 = LoadInt(in + 9); - const auto w10 = LoadInt(in + 10); - const auto w11 = LoadInt(in + 11); - const auto w12 = LoadInt(in + 12); - const auto w13 = LoadInt(in + 13); - const auto w14 = LoadInt(in + 14); - const auto w15 = LoadInt(in + 15); - const auto w16 = LoadInt(in + 16); - const auto w17 = LoadInt(in + 17); - const auto w18 = LoadInt(in + 18); - const auto w19 = LoadInt(in + 19); - const auto w20 = LoadInt(in + 20); - const auto w21 = LoadInt(in + 21); - const auto w22 = LoadInt(in + 22); - const auto w23 = LoadInt(in + 23); - const auto w24 = LoadInt(in + 24); - const auto w25 = LoadInt(in + 25); - const auto w26 = static_cast(LoadInt(reinterpret_cast(in + 26))); + const auto w0 = LoadInt(in + 0 * 8); + const auto w1 = LoadInt(in + 1 * 8); + const auto w2 = LoadInt(in + 2 * 8); + const auto w3 = LoadInt(in + 3 * 8); + const auto w4 = LoadInt(in + 4 * 8); + const auto w5 = LoadInt(in + 5 * 8); + const auto w6 = LoadInt(in + 6 * 8); + const auto w7 = LoadInt(in + 7 * 8); + const auto w8 = LoadInt(in + 8 * 8); + const auto w9 = LoadInt(in + 9 * 8); + const auto w10 = LoadInt(in + 10 * 8); + const auto w11 = LoadInt(in + 11 * 8); + const auto w12 = LoadInt(in + 12 * 8); + const auto w13 = LoadInt(in + 13 * 8); + const auto w14 = LoadInt(in + 14 * 8); + const auto w15 = LoadInt(in + 15 * 8); + const auto w16 = LoadInt(in + 16 * 8); + const auto w17 = LoadInt(in + 17 * 8); + const auto w18 = LoadInt(in + 18 * 8); + const auto w19 = LoadInt(in + 19 * 8); + const auto w20 = LoadInt(in + 20 * 8); + const auto w21 = LoadInt(in + 21 * 8); + const auto w22 = LoadInt(in + 22 * 8); + const auto w23 = LoadInt(in + 23 * 8); + const auto w24 = LoadInt(in + 24 * 8); + const auto w25 = LoadInt(in + 25 * 8); + const auto w26 = static_cast(LoadInt(in + 26 * 8)); out[0] = (w0) & mask; out[1] = ((w0 >> 53) | (w1 << 11)) & mask; out[2] = ((w1 >> 42) | (w2 << 22)) & mask; @@ -4558,39 +4558,39 @@ inline const uint64_t* unpack53_64(const uint64_t* in, uint64_t* out){ out[30] = ((w24 >> 54) | (w25 << 10)) & mask; out[31] = ((w25 >> 43) | (w26 << 21)) & mask; - return in + (26); + return in + (26 * 8 + 4); } -inline const uint64_t* unpack54_64(const uint64_t* in, uint64_t* out){ +inline const uint8_t* unpack54_64(const uint8_t* in, uint64_t* out){ constexpr uint64_t mask = ((uint64_t{1} << 54) - uint64_t{1}); - const auto w0 = LoadInt(in + 0); - const auto w1 = LoadInt(in + 1); - const auto w2 = LoadInt(in + 2); - const auto w3 = LoadInt(in + 3); - const auto w4 = LoadInt(in + 4); - const auto w5 = LoadInt(in + 5); - const auto w6 = LoadInt(in + 6); - const auto w7 = LoadInt(in + 7); - const auto w8 = LoadInt(in + 8); - const auto w9 = LoadInt(in + 9); - const auto w10 = LoadInt(in + 10); - const auto w11 = LoadInt(in + 11); - const auto w12 = LoadInt(in + 12); - const auto w13 = LoadInt(in + 13); - const auto w14 = LoadInt(in + 14); - const auto w15 = LoadInt(in + 15); - const auto w16 = LoadInt(in + 16); - const auto w17 = LoadInt(in + 17); - const auto w18 = LoadInt(in + 18); - const auto w19 = LoadInt(in + 19); - const auto w20 = LoadInt(in + 20); - const auto w21 = LoadInt(in + 21); - const auto w22 = LoadInt(in + 22); - const auto w23 = LoadInt(in + 23); - const auto w24 = LoadInt(in + 24); - const auto w25 = LoadInt(in + 25); - const auto w26 = LoadInt(in + 26 ); + const auto w0 = LoadInt(in + 0 * 8); + const auto w1 = LoadInt(in + 1 * 8); + const auto w2 = LoadInt(in + 2 * 8); + const auto w3 = LoadInt(in + 3 * 8); + const auto w4 = LoadInt(in + 4 * 8); + const auto w5 = LoadInt(in + 5 * 8); + const auto w6 = LoadInt(in + 6 * 8); + const auto w7 = LoadInt(in + 7 * 8); + const auto w8 = LoadInt(in + 8 * 8); + const auto w9 = LoadInt(in + 9 * 8); + const auto w10 = LoadInt(in + 10 * 8); + const auto w11 = LoadInt(in + 11 * 8); + const auto w12 = LoadInt(in + 12 * 8); + const auto w13 = LoadInt(in + 13 * 8); + const auto w14 = LoadInt(in + 14 * 8); + const auto w15 = LoadInt(in + 15 * 8); + const auto w16 = LoadInt(in + 16 * 8); + const auto w17 = LoadInt(in + 17 * 8); + const auto w18 = LoadInt(in + 18 * 8); + const auto w19 = LoadInt(in + 19 * 8); + const auto w20 = LoadInt(in + 20 * 8); + const auto w21 = LoadInt(in + 21 * 8); + const auto w22 = LoadInt(in + 22 * 8); + const auto w23 = LoadInt(in + 23 * 8); + const auto w24 = LoadInt(in + 24 * 8); + const auto w25 = LoadInt(in + 25 * 8); + const auto w26 = LoadInt(in + 26 * 8); out[0] = (w0) & mask; out[1] = ((w0 >> 54) | (w1 << 10)) & mask; out[2] = ((w1 >> 44) | (w2 << 20)) & mask; @@ -4624,40 +4624,40 @@ inline const uint64_t* unpack54_64(const uint64_t* in, uint64_t* out){ out[30] = ((w25 >> 20) | (w26 << 44)) & mask; out[31] = w26 >> 10; - return in + (27 ); + return in + (27 * 8); } -inline const uint64_t* unpack55_64(const uint64_t* in, uint64_t* out){ +inline const uint8_t* unpack55_64(const uint8_t* in, uint64_t* out){ constexpr uint64_t mask = ((uint64_t{1} << 55) - uint64_t{1}); - const auto w0 = LoadInt(in + 0); - const auto w1 = LoadInt(in + 1); - const auto w2 = LoadInt(in + 2); - const auto w3 = LoadInt(in + 3); - const auto w4 = LoadInt(in + 4); - const auto w5 = LoadInt(in + 5); - const auto w6 = LoadInt(in + 6); - const auto w7 = LoadInt(in + 7); - const auto w8 = LoadInt(in + 8); - const auto w9 = LoadInt(in + 9); - const auto w10 = LoadInt(in + 10); - const auto w11 = LoadInt(in + 11); - const auto w12 = LoadInt(in + 12); - const auto w13 = LoadInt(in + 13); - const auto w14 = LoadInt(in + 14); - const auto w15 = LoadInt(in + 15); - const auto w16 = LoadInt(in + 16); - const auto w17 = LoadInt(in + 17); - const auto w18 = LoadInt(in + 18); - const auto w19 = LoadInt(in + 19); - const auto w20 = LoadInt(in + 20); - const auto w21 = LoadInt(in + 21); - const auto w22 = LoadInt(in + 22); - const auto w23 = LoadInt(in + 23); - const auto w24 = LoadInt(in + 24); - const auto w25 = LoadInt(in + 25); - const auto w26 = LoadInt(in + 26); - const auto w27 = static_cast(LoadInt(reinterpret_cast(in + 27))); + const auto w0 = LoadInt(in + 0 * 8); + const auto w1 = LoadInt(in + 1 * 8); + const auto w2 = LoadInt(in + 2 * 8); + const auto w3 = LoadInt(in + 3 * 8); + const auto w4 = LoadInt(in + 4 * 8); + const auto w5 = LoadInt(in + 5 * 8); + const auto w6 = LoadInt(in + 6 * 8); + const auto w7 = LoadInt(in + 7 * 8); + const auto w8 = LoadInt(in + 8 * 8); + const auto w9 = LoadInt(in + 9 * 8); + const auto w10 = LoadInt(in + 10 * 8); + const auto w11 = LoadInt(in + 11 * 8); + const auto w12 = LoadInt(in + 12 * 8); + const auto w13 = LoadInt(in + 13 * 8); + const auto w14 = LoadInt(in + 14 * 8); + const auto w15 = LoadInt(in + 15 * 8); + const auto w16 = LoadInt(in + 16 * 8); + const auto w17 = LoadInt(in + 17 * 8); + const auto w18 = LoadInt(in + 18 * 8); + const auto w19 = LoadInt(in + 19 * 8); + const auto w20 = LoadInt(in + 20 * 8); + const auto w21 = LoadInt(in + 21 * 8); + const auto w22 = LoadInt(in + 22 * 8); + const auto w23 = LoadInt(in + 23 * 8); + const auto w24 = LoadInt(in + 24 * 8); + const auto w25 = LoadInt(in + 25 * 8); + const auto w26 = LoadInt(in + 26 * 8); + const auto w27 = static_cast(LoadInt(in + 27 * 8)); out[0] = (w0) & mask; out[1] = ((w0 >> 55) | (w1 << 9)) & mask; out[2] = ((w1 >> 46) | (w2 << 18)) & mask; @@ -4691,40 +4691,40 @@ inline const uint64_t* unpack55_64(const uint64_t* in, uint64_t* out){ out[30] = ((w25 >> 50) | (w26 << 14)) & mask; out[31] = ((w26 >> 41) | (w27 << 23)) & mask; - return in + (27); + return in + (27 * 8 + 4); } -inline const uint64_t* unpack56_64(const uint64_t* in, uint64_t* out){ +inline const uint8_t* unpack56_64(const uint8_t* in, uint64_t* out){ constexpr uint64_t mask = ((uint64_t{1} << 56) - uint64_t{1}); - const auto w0 = LoadInt(in + 0); - const auto w1 = LoadInt(in + 1); - const auto w2 = LoadInt(in + 2); - const auto w3 = LoadInt(in + 3); - const auto w4 = LoadInt(in + 4); - const auto w5 = LoadInt(in + 5); - const auto w6 = LoadInt(in + 6); - const auto w7 = LoadInt(in + 7); - const auto w8 = LoadInt(in + 8); - const auto w9 = LoadInt(in + 9); - const auto w10 = LoadInt(in + 10); - const auto w11 = LoadInt(in + 11); - const auto w12 = LoadInt(in + 12); - const auto w13 = LoadInt(in + 13); - const auto w14 = LoadInt(in + 14); - const auto w15 = LoadInt(in + 15); - const auto w16 = LoadInt(in + 16); - const auto w17 = LoadInt(in + 17); - const auto w18 = LoadInt(in + 18); - const auto w19 = LoadInt(in + 19); - const auto w20 = LoadInt(in + 20); - const auto w21 = LoadInt(in + 21); - const auto w22 = LoadInt(in + 22); - const auto w23 = LoadInt(in + 23); - const auto w24 = LoadInt(in + 24); - const auto w25 = LoadInt(in + 25); - const auto w26 = LoadInt(in + 26); - const auto w27 = LoadInt(in + 27 ); + const auto w0 = LoadInt(in + 0 * 8); + const auto w1 = LoadInt(in + 1 * 8); + const auto w2 = LoadInt(in + 2 * 8); + const auto w3 = LoadInt(in + 3 * 8); + const auto w4 = LoadInt(in + 4 * 8); + const auto w5 = LoadInt(in + 5 * 8); + const auto w6 = LoadInt(in + 6 * 8); + const auto w7 = LoadInt(in + 7 * 8); + const auto w8 = LoadInt(in + 8 * 8); + const auto w9 = LoadInt(in + 9 * 8); + const auto w10 = LoadInt(in + 10 * 8); + const auto w11 = LoadInt(in + 11 * 8); + const auto w12 = LoadInt(in + 12 * 8); + const auto w13 = LoadInt(in + 13 * 8); + const auto w14 = LoadInt(in + 14 * 8); + const auto w15 = LoadInt(in + 15 * 8); + const auto w16 = LoadInt(in + 16 * 8); + const auto w17 = LoadInt(in + 17 * 8); + const auto w18 = LoadInt(in + 18 * 8); + const auto w19 = LoadInt(in + 19 * 8); + const auto w20 = LoadInt(in + 20 * 8); + const auto w21 = LoadInt(in + 21 * 8); + const auto w22 = LoadInt(in + 22 * 8); + const auto w23 = LoadInt(in + 23 * 8); + const auto w24 = LoadInt(in + 24 * 8); + const auto w25 = LoadInt(in + 25 * 8); + const auto w26 = LoadInt(in + 26 * 8); + const auto w27 = LoadInt(in + 27 * 8); out[0] = (w0) & mask; out[1] = ((w0 >> 56) | (w1 << 8)) & mask; out[2] = ((w1 >> 48) | (w2 << 16)) & mask; @@ -4758,41 +4758,41 @@ inline const uint64_t* unpack56_64(const uint64_t* in, uint64_t* out){ out[30] = ((w26 >> 16) | (w27 << 48)) & mask; out[31] = w27 >> 8; - return in + (28 ); + return in + (28 * 8); } -inline const uint64_t* unpack57_64(const uint64_t* in, uint64_t* out){ +inline const uint8_t* unpack57_64(const uint8_t* in, uint64_t* out){ constexpr uint64_t mask = ((uint64_t{1} << 57) - uint64_t{1}); - const auto w0 = LoadInt(in + 0); - const auto w1 = LoadInt(in + 1); - const auto w2 = LoadInt(in + 2); - const auto w3 = LoadInt(in + 3); - const auto w4 = LoadInt(in + 4); - const auto w5 = LoadInt(in + 5); - const auto w6 = LoadInt(in + 6); - const auto w7 = LoadInt(in + 7); - const auto w8 = LoadInt(in + 8); - const auto w9 = LoadInt(in + 9); - const auto w10 = LoadInt(in + 10); - const auto w11 = LoadInt(in + 11); - const auto w12 = LoadInt(in + 12); - const auto w13 = LoadInt(in + 13); - const auto w14 = LoadInt(in + 14); - const auto w15 = LoadInt(in + 15); - const auto w16 = LoadInt(in + 16); - const auto w17 = LoadInt(in + 17); - const auto w18 = LoadInt(in + 18); - const auto w19 = LoadInt(in + 19); - const auto w20 = LoadInt(in + 20); - const auto w21 = LoadInt(in + 21); - const auto w22 = LoadInt(in + 22); - const auto w23 = LoadInt(in + 23); - const auto w24 = LoadInt(in + 24); - const auto w25 = LoadInt(in + 25); - const auto w26 = LoadInt(in + 26); - const auto w27 = LoadInt(in + 27); - const auto w28 = static_cast(LoadInt(reinterpret_cast(in + 28))); + const auto w0 = LoadInt(in + 0 * 8); + const auto w1 = LoadInt(in + 1 * 8); + const auto w2 = LoadInt(in + 2 * 8); + const auto w3 = LoadInt(in + 3 * 8); + const auto w4 = LoadInt(in + 4 * 8); + const auto w5 = LoadInt(in + 5 * 8); + const auto w6 = LoadInt(in + 6 * 8); + const auto w7 = LoadInt(in + 7 * 8); + const auto w8 = LoadInt(in + 8 * 8); + const auto w9 = LoadInt(in + 9 * 8); + const auto w10 = LoadInt(in + 10 * 8); + const auto w11 = LoadInt(in + 11 * 8); + const auto w12 = LoadInt(in + 12 * 8); + const auto w13 = LoadInt(in + 13 * 8); + const auto w14 = LoadInt(in + 14 * 8); + const auto w15 = LoadInt(in + 15 * 8); + const auto w16 = LoadInt(in + 16 * 8); + const auto w17 = LoadInt(in + 17 * 8); + const auto w18 = LoadInt(in + 18 * 8); + const auto w19 = LoadInt(in + 19 * 8); + const auto w20 = LoadInt(in + 20 * 8); + const auto w21 = LoadInt(in + 21 * 8); + const auto w22 = LoadInt(in + 22 * 8); + const auto w23 = LoadInt(in + 23 * 8); + const auto w24 = LoadInt(in + 24 * 8); + const auto w25 = LoadInt(in + 25 * 8); + const auto w26 = LoadInt(in + 26 * 8); + const auto w27 = LoadInt(in + 27 * 8); + const auto w28 = static_cast(LoadInt(in + 28 * 8)); out[0] = (w0) & mask; out[1] = ((w0 >> 57) | (w1 << 7)) & mask; out[2] = ((w1 >> 50) | (w2 << 14)) & mask; @@ -4826,41 +4826,41 @@ inline const uint64_t* unpack57_64(const uint64_t* in, uint64_t* out){ out[30] = ((w26 >> 46) | (w27 << 18)) & mask; out[31] = ((w27 >> 39) | (w28 << 25)) & mask; - return in + (28); + return in + (28 * 8 + 4); } -inline const uint64_t* unpack58_64(const uint64_t* in, uint64_t* out){ +inline const uint8_t* unpack58_64(const uint8_t* in, uint64_t* out){ constexpr uint64_t mask = ((uint64_t{1} << 58) - uint64_t{1}); - const auto w0 = LoadInt(in + 0); - const auto w1 = LoadInt(in + 1); - const auto w2 = LoadInt(in + 2); - const auto w3 = LoadInt(in + 3); - const auto w4 = LoadInt(in + 4); - const auto w5 = LoadInt(in + 5); - const auto w6 = LoadInt(in + 6); - const auto w7 = LoadInt(in + 7); - const auto w8 = LoadInt(in + 8); - const auto w9 = LoadInt(in + 9); - const auto w10 = LoadInt(in + 10); - const auto w11 = LoadInt(in + 11); - const auto w12 = LoadInt(in + 12); - const auto w13 = LoadInt(in + 13); - const auto w14 = LoadInt(in + 14); - const auto w15 = LoadInt(in + 15); - const auto w16 = LoadInt(in + 16); - const auto w17 = LoadInt(in + 17); - const auto w18 = LoadInt(in + 18); - const auto w19 = LoadInt(in + 19); - const auto w20 = LoadInt(in + 20); - const auto w21 = LoadInt(in + 21); - const auto w22 = LoadInt(in + 22); - const auto w23 = LoadInt(in + 23); - const auto w24 = LoadInt(in + 24); - const auto w25 = LoadInt(in + 25); - const auto w26 = LoadInt(in + 26); - const auto w27 = LoadInt(in + 27); - const auto w28 = LoadInt(in + 28 ); + const auto w0 = LoadInt(in + 0 * 8); + const auto w1 = LoadInt(in + 1 * 8); + const auto w2 = LoadInt(in + 2 * 8); + const auto w3 = LoadInt(in + 3 * 8); + const auto w4 = LoadInt(in + 4 * 8); + const auto w5 = LoadInt(in + 5 * 8); + const auto w6 = LoadInt(in + 6 * 8); + const auto w7 = LoadInt(in + 7 * 8); + const auto w8 = LoadInt(in + 8 * 8); + const auto w9 = LoadInt(in + 9 * 8); + const auto w10 = LoadInt(in + 10 * 8); + const auto w11 = LoadInt(in + 11 * 8); + const auto w12 = LoadInt(in + 12 * 8); + const auto w13 = LoadInt(in + 13 * 8); + const auto w14 = LoadInt(in + 14 * 8); + const auto w15 = LoadInt(in + 15 * 8); + const auto w16 = LoadInt(in + 16 * 8); + const auto w17 = LoadInt(in + 17 * 8); + const auto w18 = LoadInt(in + 18 * 8); + const auto w19 = LoadInt(in + 19 * 8); + const auto w20 = LoadInt(in + 20 * 8); + const auto w21 = LoadInt(in + 21 * 8); + const auto w22 = LoadInt(in + 22 * 8); + const auto w23 = LoadInt(in + 23 * 8); + const auto w24 = LoadInt(in + 24 * 8); + const auto w25 = LoadInt(in + 25 * 8); + const auto w26 = LoadInt(in + 26 * 8); + const auto w27 = LoadInt(in + 27 * 8); + const auto w28 = LoadInt(in + 28 * 8); out[0] = (w0) & mask; out[1] = ((w0 >> 58) | (w1 << 6)) & mask; out[2] = ((w1 >> 52) | (w2 << 12)) & mask; @@ -4894,42 +4894,42 @@ inline const uint64_t* unpack58_64(const uint64_t* in, uint64_t* out){ out[30] = ((w27 >> 12) | (w28 << 52)) & mask; out[31] = w28 >> 6; - return in + (29 ); + return in + (29 * 8); } -inline const uint64_t* unpack59_64(const uint64_t* in, uint64_t* out){ +inline const uint8_t* unpack59_64(const uint8_t* in, uint64_t* out){ constexpr uint64_t mask = ((uint64_t{1} << 59) - uint64_t{1}); - const auto w0 = LoadInt(in + 0); - const auto w1 = LoadInt(in + 1); - const auto w2 = LoadInt(in + 2); - const auto w3 = LoadInt(in + 3); - const auto w4 = LoadInt(in + 4); - const auto w5 = LoadInt(in + 5); - const auto w6 = LoadInt(in + 6); - const auto w7 = LoadInt(in + 7); - const auto w8 = LoadInt(in + 8); - const auto w9 = LoadInt(in + 9); - const auto w10 = LoadInt(in + 10); - const auto w11 = LoadInt(in + 11); - const auto w12 = LoadInt(in + 12); - const auto w13 = LoadInt(in + 13); - const auto w14 = LoadInt(in + 14); - const auto w15 = LoadInt(in + 15); - const auto w16 = LoadInt(in + 16); - const auto w17 = LoadInt(in + 17); - const auto w18 = LoadInt(in + 18); - const auto w19 = LoadInt(in + 19); - const auto w20 = LoadInt(in + 20); - const auto w21 = LoadInt(in + 21); - const auto w22 = LoadInt(in + 22); - const auto w23 = LoadInt(in + 23); - const auto w24 = LoadInt(in + 24); - const auto w25 = LoadInt(in + 25); - const auto w26 = LoadInt(in + 26); - const auto w27 = LoadInt(in + 27); - const auto w28 = LoadInt(in + 28); - const auto w29 = static_cast(LoadInt(reinterpret_cast(in + 29))); + const auto w0 = LoadInt(in + 0 * 8); + const auto w1 = LoadInt(in + 1 * 8); + const auto w2 = LoadInt(in + 2 * 8); + const auto w3 = LoadInt(in + 3 * 8); + const auto w4 = LoadInt(in + 4 * 8); + const auto w5 = LoadInt(in + 5 * 8); + const auto w6 = LoadInt(in + 6 * 8); + const auto w7 = LoadInt(in + 7 * 8); + const auto w8 = LoadInt(in + 8 * 8); + const auto w9 = LoadInt(in + 9 * 8); + const auto w10 = LoadInt(in + 10 * 8); + const auto w11 = LoadInt(in + 11 * 8); + const auto w12 = LoadInt(in + 12 * 8); + const auto w13 = LoadInt(in + 13 * 8); + const auto w14 = LoadInt(in + 14 * 8); + const auto w15 = LoadInt(in + 15 * 8); + const auto w16 = LoadInt(in + 16 * 8); + const auto w17 = LoadInt(in + 17 * 8); + const auto w18 = LoadInt(in + 18 * 8); + const auto w19 = LoadInt(in + 19 * 8); + const auto w20 = LoadInt(in + 20 * 8); + const auto w21 = LoadInt(in + 21 * 8); + const auto w22 = LoadInt(in + 22 * 8); + const auto w23 = LoadInt(in + 23 * 8); + const auto w24 = LoadInt(in + 24 * 8); + const auto w25 = LoadInt(in + 25 * 8); + const auto w26 = LoadInt(in + 26 * 8); + const auto w27 = LoadInt(in + 27 * 8); + const auto w28 = LoadInt(in + 28 * 8); + const auto w29 = static_cast(LoadInt(in + 29 * 8)); out[0] = (w0) & mask; out[1] = ((w0 >> 59) | (w1 << 5)) & mask; out[2] = ((w1 >> 54) | (w2 << 10)) & mask; @@ -4963,42 +4963,42 @@ inline const uint64_t* unpack59_64(const uint64_t* in, uint64_t* out){ out[30] = ((w27 >> 42) | (w28 << 22)) & mask; out[31] = ((w28 >> 37) | (w29 << 27)) & mask; - return in + (29); + return in + (29 * 8 + 4); } -inline const uint64_t* unpack60_64(const uint64_t* in, uint64_t* out){ +inline const uint8_t* unpack60_64(const uint8_t* in, uint64_t* out){ constexpr uint64_t mask = ((uint64_t{1} << 60) - uint64_t{1}); - const auto w0 = LoadInt(in + 0); - const auto w1 = LoadInt(in + 1); - const auto w2 = LoadInt(in + 2); - const auto w3 = LoadInt(in + 3); - const auto w4 = LoadInt(in + 4); - const auto w5 = LoadInt(in + 5); - const auto w6 = LoadInt(in + 6); - const auto w7 = LoadInt(in + 7); - const auto w8 = LoadInt(in + 8); - const auto w9 = LoadInt(in + 9); - const auto w10 = LoadInt(in + 10); - const auto w11 = LoadInt(in + 11); - const auto w12 = LoadInt(in + 12); - const auto w13 = LoadInt(in + 13); - const auto w14 = LoadInt(in + 14); - const auto w15 = LoadInt(in + 15); - const auto w16 = LoadInt(in + 16); - const auto w17 = LoadInt(in + 17); - const auto w18 = LoadInt(in + 18); - const auto w19 = LoadInt(in + 19); - const auto w20 = LoadInt(in + 20); - const auto w21 = LoadInt(in + 21); - const auto w22 = LoadInt(in + 22); - const auto w23 = LoadInt(in + 23); - const auto w24 = LoadInt(in + 24); - const auto w25 = LoadInt(in + 25); - const auto w26 = LoadInt(in + 26); - const auto w27 = LoadInt(in + 27); - const auto w28 = LoadInt(in + 28); - const auto w29 = LoadInt(in + 29 ); + const auto w0 = LoadInt(in + 0 * 8); + const auto w1 = LoadInt(in + 1 * 8); + const auto w2 = LoadInt(in + 2 * 8); + const auto w3 = LoadInt(in + 3 * 8); + const auto w4 = LoadInt(in + 4 * 8); + const auto w5 = LoadInt(in + 5 * 8); + const auto w6 = LoadInt(in + 6 * 8); + const auto w7 = LoadInt(in + 7 * 8); + const auto w8 = LoadInt(in + 8 * 8); + const auto w9 = LoadInt(in + 9 * 8); + const auto w10 = LoadInt(in + 10 * 8); + const auto w11 = LoadInt(in + 11 * 8); + const auto w12 = LoadInt(in + 12 * 8); + const auto w13 = LoadInt(in + 13 * 8); + const auto w14 = LoadInt(in + 14 * 8); + const auto w15 = LoadInt(in + 15 * 8); + const auto w16 = LoadInt(in + 16 * 8); + const auto w17 = LoadInt(in + 17 * 8); + const auto w18 = LoadInt(in + 18 * 8); + const auto w19 = LoadInt(in + 19 * 8); + const auto w20 = LoadInt(in + 20 * 8); + const auto w21 = LoadInt(in + 21 * 8); + const auto w22 = LoadInt(in + 22 * 8); + const auto w23 = LoadInt(in + 23 * 8); + const auto w24 = LoadInt(in + 24 * 8); + const auto w25 = LoadInt(in + 25 * 8); + const auto w26 = LoadInt(in + 26 * 8); + const auto w27 = LoadInt(in + 27 * 8); + const auto w28 = LoadInt(in + 28 * 8); + const auto w29 = LoadInt(in + 29 * 8); out[0] = (w0) & mask; out[1] = ((w0 >> 60) | (w1 << 4)) & mask; out[2] = ((w1 >> 56) | (w2 << 8)) & mask; @@ -5032,43 +5032,43 @@ inline const uint64_t* unpack60_64(const uint64_t* in, uint64_t* out){ out[30] = ((w28 >> 8) | (w29 << 56)) & mask; out[31] = w29 >> 4; - return in + (30 ); + return in + (30 * 8); } -inline const uint64_t* unpack61_64(const uint64_t* in, uint64_t* out){ +inline const uint8_t* unpack61_64(const uint8_t* in, uint64_t* out){ constexpr uint64_t mask = ((uint64_t{1} << 61) - uint64_t{1}); - const auto w0 = LoadInt(in + 0); - const auto w1 = LoadInt(in + 1); - const auto w2 = LoadInt(in + 2); - const auto w3 = LoadInt(in + 3); - const auto w4 = LoadInt(in + 4); - const auto w5 = LoadInt(in + 5); - const auto w6 = LoadInt(in + 6); - const auto w7 = LoadInt(in + 7); - const auto w8 = LoadInt(in + 8); - const auto w9 = LoadInt(in + 9); - const auto w10 = LoadInt(in + 10); - const auto w11 = LoadInt(in + 11); - const auto w12 = LoadInt(in + 12); - const auto w13 = LoadInt(in + 13); - const auto w14 = LoadInt(in + 14); - const auto w15 = LoadInt(in + 15); - const auto w16 = LoadInt(in + 16); - const auto w17 = LoadInt(in + 17); - const auto w18 = LoadInt(in + 18); - const auto w19 = LoadInt(in + 19); - const auto w20 = LoadInt(in + 20); - const auto w21 = LoadInt(in + 21); - const auto w22 = LoadInt(in + 22); - const auto w23 = LoadInt(in + 23); - const auto w24 = LoadInt(in + 24); - const auto w25 = LoadInt(in + 25); - const auto w26 = LoadInt(in + 26); - const auto w27 = LoadInt(in + 27); - const auto w28 = LoadInt(in + 28); - const auto w29 = LoadInt(in + 29); - const auto w30 = static_cast(LoadInt(reinterpret_cast(in + 30))); + const auto w0 = LoadInt(in + 0 * 8); + const auto w1 = LoadInt(in + 1 * 8); + const auto w2 = LoadInt(in + 2 * 8); + const auto w3 = LoadInt(in + 3 * 8); + const auto w4 = LoadInt(in + 4 * 8); + const auto w5 = LoadInt(in + 5 * 8); + const auto w6 = LoadInt(in + 6 * 8); + const auto w7 = LoadInt(in + 7 * 8); + const auto w8 = LoadInt(in + 8 * 8); + const auto w9 = LoadInt(in + 9 * 8); + const auto w10 = LoadInt(in + 10 * 8); + const auto w11 = LoadInt(in + 11 * 8); + const auto w12 = LoadInt(in + 12 * 8); + const auto w13 = LoadInt(in + 13 * 8); + const auto w14 = LoadInt(in + 14 * 8); + const auto w15 = LoadInt(in + 15 * 8); + const auto w16 = LoadInt(in + 16 * 8); + const auto w17 = LoadInt(in + 17 * 8); + const auto w18 = LoadInt(in + 18 * 8); + const auto w19 = LoadInt(in + 19 * 8); + const auto w20 = LoadInt(in + 20 * 8); + const auto w21 = LoadInt(in + 21 * 8); + const auto w22 = LoadInt(in + 22 * 8); + const auto w23 = LoadInt(in + 23 * 8); + const auto w24 = LoadInt(in + 24 * 8); + const auto w25 = LoadInt(in + 25 * 8); + const auto w26 = LoadInt(in + 26 * 8); + const auto w27 = LoadInt(in + 27 * 8); + const auto w28 = LoadInt(in + 28 * 8); + const auto w29 = LoadInt(in + 29 * 8); + const auto w30 = static_cast(LoadInt(in + 30 * 8)); out[0] = (w0) & mask; out[1] = ((w0 >> 61) | (w1 << 3)) & mask; out[2] = ((w1 >> 58) | (w2 << 6)) & mask; @@ -5102,43 +5102,43 @@ inline const uint64_t* unpack61_64(const uint64_t* in, uint64_t* out){ out[30] = ((w28 >> 38) | (w29 << 26)) & mask; out[31] = ((w29 >> 35) | (w30 << 29)) & mask; - return in + (30); + return in + (30 * 8 + 4); } -inline const uint64_t* unpack62_64(const uint64_t* in, uint64_t* out){ +inline const uint8_t* unpack62_64(const uint8_t* in, uint64_t* out){ constexpr uint64_t mask = ((uint64_t{1} << 62) - uint64_t{1}); - const auto w0 = LoadInt(in + 0); - const auto w1 = LoadInt(in + 1); - const auto w2 = LoadInt(in + 2); - const auto w3 = LoadInt(in + 3); - const auto w4 = LoadInt(in + 4); - const auto w5 = LoadInt(in + 5); - const auto w6 = LoadInt(in + 6); - const auto w7 = LoadInt(in + 7); - const auto w8 = LoadInt(in + 8); - const auto w9 = LoadInt(in + 9); - const auto w10 = LoadInt(in + 10); - const auto w11 = LoadInt(in + 11); - const auto w12 = LoadInt(in + 12); - const auto w13 = LoadInt(in + 13); - const auto w14 = LoadInt(in + 14); - const auto w15 = LoadInt(in + 15); - const auto w16 = LoadInt(in + 16); - const auto w17 = LoadInt(in + 17); - const auto w18 = LoadInt(in + 18); - const auto w19 = LoadInt(in + 19); - const auto w20 = LoadInt(in + 20); - const auto w21 = LoadInt(in + 21); - const auto w22 = LoadInt(in + 22); - const auto w23 = LoadInt(in + 23); - const auto w24 = LoadInt(in + 24); - const auto w25 = LoadInt(in + 25); - const auto w26 = LoadInt(in + 26); - const auto w27 = LoadInt(in + 27); - const auto w28 = LoadInt(in + 28); - const auto w29 = LoadInt(in + 29); - const auto w30 = LoadInt(in + 30 ); + const auto w0 = LoadInt(in + 0 * 8); + const auto w1 = LoadInt(in + 1 * 8); + const auto w2 = LoadInt(in + 2 * 8); + const auto w3 = LoadInt(in + 3 * 8); + const auto w4 = LoadInt(in + 4 * 8); + const auto w5 = LoadInt(in + 5 * 8); + const auto w6 = LoadInt(in + 6 * 8); + const auto w7 = LoadInt(in + 7 * 8); + const auto w8 = LoadInt(in + 8 * 8); + const auto w9 = LoadInt(in + 9 * 8); + const auto w10 = LoadInt(in + 10 * 8); + const auto w11 = LoadInt(in + 11 * 8); + const auto w12 = LoadInt(in + 12 * 8); + const auto w13 = LoadInt(in + 13 * 8); + const auto w14 = LoadInt(in + 14 * 8); + const auto w15 = LoadInt(in + 15 * 8); + const auto w16 = LoadInt(in + 16 * 8); + const auto w17 = LoadInt(in + 17 * 8); + const auto w18 = LoadInt(in + 18 * 8); + const auto w19 = LoadInt(in + 19 * 8); + const auto w20 = LoadInt(in + 20 * 8); + const auto w21 = LoadInt(in + 21 * 8); + const auto w22 = LoadInt(in + 22 * 8); + const auto w23 = LoadInt(in + 23 * 8); + const auto w24 = LoadInt(in + 24 * 8); + const auto w25 = LoadInt(in + 25 * 8); + const auto w26 = LoadInt(in + 26 * 8); + const auto w27 = LoadInt(in + 27 * 8); + const auto w28 = LoadInt(in + 28 * 8); + const auto w29 = LoadInt(in + 29 * 8); + const auto w30 = LoadInt(in + 30 * 8); out[0] = (w0) & mask; out[1] = ((w0 >> 62) | (w1 << 2)) & mask; out[2] = ((w1 >> 60) | (w2 << 4)) & mask; @@ -5172,44 +5172,44 @@ inline const uint64_t* unpack62_64(const uint64_t* in, uint64_t* out){ out[30] = ((w29 >> 4) | (w30 << 60)) & mask; out[31] = w30 >> 2; - return in + (31 ); + return in + (31 * 8); } -inline const uint64_t* unpack63_64(const uint64_t* in, uint64_t* out){ +inline const uint8_t* unpack63_64(const uint8_t* in, uint64_t* out){ constexpr uint64_t mask = ((uint64_t{1} << 63) - uint64_t{1}); - const auto w0 = LoadInt(in + 0); - const auto w1 = LoadInt(in + 1); - const auto w2 = LoadInt(in + 2); - const auto w3 = LoadInt(in + 3); - const auto w4 = LoadInt(in + 4); - const auto w5 = LoadInt(in + 5); - const auto w6 = LoadInt(in + 6); - const auto w7 = LoadInt(in + 7); - const auto w8 = LoadInt(in + 8); - const auto w9 = LoadInt(in + 9); - const auto w10 = LoadInt(in + 10); - const auto w11 = LoadInt(in + 11); - const auto w12 = LoadInt(in + 12); - const auto w13 = LoadInt(in + 13); - const auto w14 = LoadInt(in + 14); - const auto w15 = LoadInt(in + 15); - const auto w16 = LoadInt(in + 16); - const auto w17 = LoadInt(in + 17); - const auto w18 = LoadInt(in + 18); - const auto w19 = LoadInt(in + 19); - const auto w20 = LoadInt(in + 20); - const auto w21 = LoadInt(in + 21); - const auto w22 = LoadInt(in + 22); - const auto w23 = LoadInt(in + 23); - const auto w24 = LoadInt(in + 24); - const auto w25 = LoadInt(in + 25); - const auto w26 = LoadInt(in + 26); - const auto w27 = LoadInt(in + 27); - const auto w28 = LoadInt(in + 28); - const auto w29 = LoadInt(in + 29); - const auto w30 = LoadInt(in + 30); - const auto w31 = static_cast(LoadInt(reinterpret_cast(in + 31))); + const auto w0 = LoadInt(in + 0 * 8); + const auto w1 = LoadInt(in + 1 * 8); + const auto w2 = LoadInt(in + 2 * 8); + const auto w3 = LoadInt(in + 3 * 8); + const auto w4 = LoadInt(in + 4 * 8); + const auto w5 = LoadInt(in + 5 * 8); + const auto w6 = LoadInt(in + 6 * 8); + const auto w7 = LoadInt(in + 7 * 8); + const auto w8 = LoadInt(in + 8 * 8); + const auto w9 = LoadInt(in + 9 * 8); + const auto w10 = LoadInt(in + 10 * 8); + const auto w11 = LoadInt(in + 11 * 8); + const auto w12 = LoadInt(in + 12 * 8); + const auto w13 = LoadInt(in + 13 * 8); + const auto w14 = LoadInt(in + 14 * 8); + const auto w15 = LoadInt(in + 15 * 8); + const auto w16 = LoadInt(in + 16 * 8); + const auto w17 = LoadInt(in + 17 * 8); + const auto w18 = LoadInt(in + 18 * 8); + const auto w19 = LoadInt(in + 19 * 8); + const auto w20 = LoadInt(in + 20 * 8); + const auto w21 = LoadInt(in + 21 * 8); + const auto w22 = LoadInt(in + 22 * 8); + const auto w23 = LoadInt(in + 23 * 8); + const auto w24 = LoadInt(in + 24 * 8); + const auto w25 = LoadInt(in + 25 * 8); + const auto w26 = LoadInt(in + 26 * 8); + const auto w27 = LoadInt(in + 27 * 8); + const auto w28 = LoadInt(in + 28 * 8); + const auto w29 = LoadInt(in + 29 * 8); + const auto w30 = LoadInt(in + 30 * 8); + const auto w31 = static_cast(LoadInt(in + 31 * 8)); out[0] = (w0) & mask; out[1] = ((w0 >> 63) | (w1 << 1)) & mask; out[2] = ((w1 >> 62) | (w2 << 2)) & mask; @@ -5243,14 +5243,14 @@ inline const uint64_t* unpack63_64(const uint64_t* in, uint64_t* out){ out[30] = ((w29 >> 34) | (w30 << 30)) & mask; out[31] = ((w30 >> 33) | (w31 << 31)) & mask; - return in + (31); + return in + (31 * 8 + 4); } -inline const uint64_t* unpack64_64(const uint64_t* in, uint64_t* out){ +inline const uint8_t* unpack64_64(const uint8_t* in, uint64_t* out){ for(int k = 0; k < 32; k += 1) { - out[k] = LoadInt(in + (k )); + out[k] = LoadInt(in + (k * 8)); } - return in + ( 32); + return in + (8 * 32); } } // namespace arrow::internal From 7f96639f884836f374f44a04631220dc1cee9c62 Mon Sep 17 00:00:00 2001 From: AntoinePrv Date: Thu, 18 Sep 2025 15:07:46 +0200 Subject: [PATCH 12/76] Simplify simd generator code --- cpp/src/arrow/util/bpacking_avx2.cc | 3 +- cpp/src/arrow/util/bpacking_avx512.cc | 3 +- cpp/src/arrow/util/bpacking_neon.cc | 3 +- cpp/src/arrow/util/bpacking_scalar_codegen.py | 8 +- cpp/src/arrow/util/bpacking_simd_codegen.py | 227 +++++++++--------- 5 files changed, 124 insertions(+), 120 deletions(-) diff --git a/cpp/src/arrow/util/bpacking_avx2.cc b/cpp/src/arrow/util/bpacking_avx2.cc index c6d1b4546ce..88cfe5c7d08 100644 --- a/cpp/src/arrow/util/bpacking_avx2.cc +++ b/cpp/src/arrow/util/bpacking_avx2.cc @@ -22,8 +22,7 @@ namespace arrow::internal { int unpack32_avx2(const uint8_t* in, uint32_t* out, int batch_size, int num_bits) { - return unpack32_specialized>(in, out, batch_size, - num_bits); + return unpack32_specialized>(in, out, batch_size, num_bits); } } // namespace arrow::internal diff --git a/cpp/src/arrow/util/bpacking_avx512.cc b/cpp/src/arrow/util/bpacking_avx512.cc index 29a7c133f30..8332c6156f0 100644 --- a/cpp/src/arrow/util/bpacking_avx512.cc +++ b/cpp/src/arrow/util/bpacking_avx512.cc @@ -22,8 +22,7 @@ namespace arrow::internal { int unpack32_avx512(const uint8_t* in, uint32_t* out, int batch_size, int num_bits) { - return unpack32_specialized>(in, out, batch_size, - num_bits); + return unpack32_specialized>(in, out, batch_size, num_bits); } } // namespace arrow::internal diff --git a/cpp/src/arrow/util/bpacking_neon.cc b/cpp/src/arrow/util/bpacking_neon.cc index 517e2d95a70..ec783d8b741 100644 --- a/cpp/src/arrow/util/bpacking_neon.cc +++ b/cpp/src/arrow/util/bpacking_neon.cc @@ -22,8 +22,7 @@ namespace arrow::internal { int unpack32_neon(const uint8_t* in, uint32_t* out, int batch_size, int num_bits) { - return unpack32_specialized>(in, out, batch_size, - num_bits); + return unpack32_specialized>(in, out, batch_size, num_bits); } } // namespace arrow::internal diff --git a/cpp/src/arrow/util/bpacking_scalar_codegen.py b/cpp/src/arrow/util/bpacking_scalar_codegen.py index 8882e9a7c83..fe29f74e08d 100644 --- a/cpp/src/arrow/util/bpacking_scalar_codegen.py +++ b/cpp/src/arrow/util/bpacking_scalar_codegen.py @@ -199,11 +199,15 @@ def print_all(self) -> None: self.print_unpack_last() -if __name__ == "__main__": - print(LICENSE) +def print_note(): print("// WARNING: this file is generated, DO NOT EDIT.") print("// Usage:") print(f"// python {' '.join(sys.orig_argv[1:])}") + + +if __name__ == "__main__": + print(LICENSE) + print_note() print(HEADER) ScalarUnpackGenerator(32, smart_halve=False).print_all() diff --git a/cpp/src/arrow/util/bpacking_simd_codegen.py b/cpp/src/arrow/util/bpacking_simd_codegen.py index b0c81c1a272..d81be5187bb 100755 --- a/cpp/src/arrow/util/bpacking_simd_codegen.py +++ b/cpp/src/arrow/util/bpacking_simd_codegen.py @@ -22,73 +22,121 @@ # python bpacking_simd_codegen.py 256 > bpacking_simd256_generated_internal.h # python bpacking_simd_codegen.py 512 > bpacking_simd512_generated_internal.h +import dataclasses import sys from textwrap import dedent, indent +LICENSE = """// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +""" + +HEADER = """ +#pragma once + +#include +#include + +#include + +#include "arrow/util/ubsan.h" + +namespace arrow::internal { + +using ::arrow::util::SafeLoadAs; +""" + +FOOTER = """ +} // namespace arrow::internal +""" + + +@dataclasses.dataclass class UnpackGenerator: - def __init__(self, simd_width, out_width, out_type): - self.simd_width = simd_width - self.out_width = out_width - if simd_width % out_width != 0: + out_bit_width: int + simd_bit_width: int + + @property + def simd_byte_width(self) -> int: + return self.simd_bit_width // 8 + + @property + def simd_value_count(self) -> int: + return self.simd_bit_width // self.out_bit_width + + @property + def out_byte_width(self) -> int: + return self.out_bit_width // 8 + + @property + def out_type(self) -> str: + return f"uint{self.out_bit_width}_t" + + def __post_init__(self): + if self.simd_bit_width % self.out_bit_width != 0: raise ("SIMD bit width should be a multiple of output width") - self.simd_byte_width = simd_width // 8 - self.out_byte_width = out_width // 8 - self.out_type = out_type - def print_unpack_bit0_func(self): - ty = self.out_type - print( - f"inline static const uint8_t* unpack0_{self.out_width}(const uint8_t* in, {ty}* out) {{" + def unpack_signature(self, bit: int) -> str: + return ( + f"inline static const uint8_t* unpack{bit}_{self.out_bit_width}" + f"(const uint8_t* in, {self.out_type}* out) {{" ) - print(f" std::memset(out, 0x0, {self.out_width} * sizeof(*out));") - print(f" out += {self.out_width};") - print("") + + def print_unpack_bit0_func(self): + print(self.unpack_signature(0)) + print(f" std::memset(out, 0x0, {self.out_bit_width} * sizeof(*out));") + print(f" out += {self.out_bit_width};") print(" return in;") print("}") def print_unpack_bitmax_func(self): - ty = self.out_type - print( - f"inline static const uint8_t* unpack{self.out_width}_{self.out_width}(const uint8_t* in, {ty}* out) {{" - ) - print(f" std::memcpy(out, in, {self.out_width} * sizeof(*out));") - print(f" in += {self.out_byte_width} * {self.out_width};") - print(f" out += {self.out_width};") - print("") + print(self.unpack_signature(self.out_bit_width)) + print(f" std::memcpy(out, in, {self.out_bit_width} * sizeof(*out));") + print(f" in += {self.out_byte_width} * {self.out_bit_width};") + print(f" out += {self.out_bit_width};") print(" return in;") print("}") - def print_unpack_bit_func(self, bit): + def print_unpack_bit_func(self, bit: int): + print(self.unpack_signature(bit)) + def p(code, level=1): print(indent(code, prefix=" " * level)) mask = (1 << bit) - 1 - ty = self.out_type - bytes_per_batch = self.simd_byte_width - words_per_batch = bytes_per_batch // self.out_byte_width - print( - f"inline static const uint8_t* unpack{bit}_{self.out_width}(const uint8_t* in, {ty}* out) {{" - ) p( dedent(f"""\ - using simd_batch = xsimd::make_sized_batch_t<{ty}, {self.simd_width // self.out_width}>; + using simd_batch = xsimd::make_sized_batch_t<{self.out_type}, {self.simd_value_count}>; - {ty} mask = 0x{mask:0x}; + constexpr {self.out_type} kMask = 0x{mask:0x}; - simd_batch masks(mask); + simd_batch masks(kMask); simd_batch words, shifts; simd_batch results; """) ) def safe_load(index): - return f"SafeLoadAs<{ty}>(in + {self.out_byte_width} * {index})" + return f"SafeLoadAs<{self.out_type}>(in + {self.out_byte_width} * {index})" def static_cast_as_needed(str): - if self.out_width < 32: - return f"static_cast<{ty}>({str})" + if self.out_bit_width < 32: + return f"static_cast<{self.out_type}>({str})" return str shift = 0 @@ -96,21 +144,21 @@ def static_cast_as_needed(str): in_index = 0 inls = [] - for i in range(self.out_width): - if shift + bit == self.out_width: + for i in range(self.out_bit_width): + if shift + bit == self.out_bit_width: shifts.append(shift) inls.append(safe_load(in_index)) in_index += 1 shift = 0 - elif shift + bit > self.out_width: # cross the boundary + elif shift + bit > self.out_bit_width: # cross the boundary inls.append( static_cast_as_needed( f"{safe_load(in_index)} >> {shift} " - f"| {safe_load(in_index + 1)} << {self.out_width - shift}" + f"| {safe_load(in_index + 1)} << {self.out_bit_width - shift}" ) ) in_index += 1 - shift = bit - (self.out_width - shift) + shift = bit - (self.out_bit_width - shift) shifts.append(0) # zero shift else: shifts.append(shift) @@ -124,8 +172,8 @@ def static_cast_as_needed(str): out += {words_per_batch}; """) - for start in range(0, self.out_width, words_per_batch): - stop = start + words_per_batch + for start in range(0, self.out_bit_width, self.simd_value_count): + stop = start + self.simd_value_count p(f"""// extract {bit}-bit bundles {start} to {stop - 1}""") p("words = simd_batch{") for word_part in inls[start:stop]: @@ -134,7 +182,7 @@ def static_cast_as_needed(str): p( one_word_template.format( shifts=", ".join(map(str, shifts[start:stop])), - words_per_batch=words_per_batch, + words_per_batch=self.simd_value_count, ) ) @@ -145,85 +193,41 @@ def static_cast_as_needed(str): ) print("}") + def print_all(self): + print("template<>") + print(f"struct Simd{self.simd_bit_width}Unpacker<{self.out_type}> {{") + + self.print_unpack_bit0_func() + print() + for i in range(1, self.out_bit_width): + self.print_unpack_bit_func(i) + print() + self.print_unpack_bitmax_func() + + print("}; // struct Unpacker") -def print_copyright(): - print( - dedent("""\ - // Licensed to the Apache Software Foundation (ASF) under one - // or more contributor license agreements. See the NOTICE file - // distributed with this work for additional information - // regarding copyright ownership. The ASF licenses this file - // to you under the Apache License, Version 2.0 (the - // "License"); you may not use this file except in compliance - // with the License. You may obtain a copy of the License at - // - // http://www.apache.org/licenses/LICENSE-2.0 - // - // Unless required by applicable law or agreed to in writing, - // software distributed under the License is distributed on an - // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - // KIND, either express or implied. See the License for the - // specific language governing permissions and limitations - // under the License. - """) - ) def print_note(): - print("// Automatically generated file; DO NOT EDIT.") - print() + print("// WARNING: this file is generated, DO NOT EDIT.") + print("// Usage:") + print(f"// python {' '.join(sys.orig_argv[1:])}") def main(simd_width, outputs): - print_copyright() + print(LICENSE) print_note() + print(HEADER) - struct_name = f"UnpackBits{simd_width}" - - # NOTE: templating the UnpackBits struct on the dispatch level avoids - # potential name collisions if there are several UnpackBits generations - # with the same SIMD width on a given architecture. - - print( - dedent(f"""\ - #pragma once - - #include - #include + print("template") + print(f"struct Simd{simd_width}Unpacker;") - #include - - #include "arrow/util/dispatch_internal.h" - #include "arrow/util/ubsan.h" - - namespace arrow::internal {{ - namespace {{ - - using ::arrow::util::SafeLoadAs; - - template - struct {struct_name} {{ - """) - ) - - for out_width, out_type in outputs: - gen = UnpackGenerator(simd_width, out_width, out_type) - gen.print_unpack_bit0_func() - print() - for i in range(1, out_width): - gen.print_unpack_bit_func(i) - print() - gen.print_unpack_bitmax_func() + for out_width in outputs: + gen = UnpackGenerator(out_width, simd_width) + gen.print_all() print() - print( - dedent(f"""\ - }}; // struct {struct_name} - - }} // namespace - }} // namespace arrow::internal - """) - ) + print(FOOTER) if __name__ == "__main__": @@ -235,5 +239,4 @@ def main(simd_width, outputs): except ValueError: raise ValueError(usage) - outputs = [(16, "uint16_t"), (32, "uint32_t")] - main(simd_width, outputs) + main(simd_width, [16, 32]) From ded98a4ef023c124a1f1ff6ec8141c5faedcfad6 Mon Sep 17 00:00:00 2001 From: AntoinePrv Date: Thu, 18 Sep 2025 15:10:25 +0200 Subject: [PATCH 13/76] Gen: regenerate simd files --- .../bpacking_simd128_generated_internal.h | 207 +++++++++--------- .../bpacking_simd256_generated_internal.h | 207 +++++++++--------- .../bpacking_simd512_generated_internal.h | 207 +++++++++--------- 3 files changed, 309 insertions(+), 312 deletions(-) diff --git a/cpp/src/arrow/util/bpacking_simd128_generated_internal.h b/cpp/src/arrow/util/bpacking_simd128_generated_internal.h index b47e2d3a627..d2a012fc521 100644 --- a/cpp/src/arrow/util/bpacking_simd128_generated_internal.h +++ b/cpp/src/arrow/util/bpacking_simd128_generated_internal.h @@ -15,7 +15,9 @@ // specific language governing permissions and limitations // under the License. -// Automatically generated file; DO NOT EDIT. +// WARNING: this file is generated, DO NOT EDIT. +// Usage: +// python cpp/src/arrow/util/bpacking_simd_codegen.py 128 #pragma once @@ -24,30 +26,28 @@ #include -#include "arrow/util/dispatch_internal.h" #include "arrow/util/ubsan.h" namespace arrow::internal { -namespace { using ::arrow::util::SafeLoadAs; -template -struct UnpackBits128 { - +template +struct Simd128Unpacker; +template<> +struct Simd128Unpacker { inline static const uint8_t* unpack0_16(const uint8_t* in, uint16_t* out) { std::memset(out, 0x0, 16 * sizeof(*out)); out += 16; - return in; } inline static const uint8_t* unpack1_16(const uint8_t* in, uint16_t* out) { using simd_batch = xsimd::make_sized_batch_t; - uint16_t mask = 0x1; + constexpr uint16_t kMask = 0x1; - simd_batch masks(mask); + simd_batch masks(kMask); simd_batch words, shifts; simd_batch results; @@ -90,9 +90,9 @@ inline static const uint8_t* unpack1_16(const uint8_t* in, uint16_t* out) { inline static const uint8_t* unpack2_16(const uint8_t* in, uint16_t* out) { using simd_batch = xsimd::make_sized_batch_t; - uint16_t mask = 0x3; + constexpr uint16_t kMask = 0x3; - simd_batch masks(mask); + simd_batch masks(kMask); simd_batch words, shifts; simd_batch results; @@ -135,9 +135,9 @@ inline static const uint8_t* unpack2_16(const uint8_t* in, uint16_t* out) { inline static const uint8_t* unpack3_16(const uint8_t* in, uint16_t* out) { using simd_batch = xsimd::make_sized_batch_t; - uint16_t mask = 0x7; + constexpr uint16_t kMask = 0x7; - simd_batch masks(mask); + simd_batch masks(kMask); simd_batch words, shifts; simd_batch results; @@ -180,9 +180,9 @@ inline static const uint8_t* unpack3_16(const uint8_t* in, uint16_t* out) { inline static const uint8_t* unpack4_16(const uint8_t* in, uint16_t* out) { using simd_batch = xsimd::make_sized_batch_t; - uint16_t mask = 0xf; + constexpr uint16_t kMask = 0xf; - simd_batch masks(mask); + simd_batch masks(kMask); simd_batch words, shifts; simd_batch results; @@ -225,9 +225,9 @@ inline static const uint8_t* unpack4_16(const uint8_t* in, uint16_t* out) { inline static const uint8_t* unpack5_16(const uint8_t* in, uint16_t* out) { using simd_batch = xsimd::make_sized_batch_t; - uint16_t mask = 0x1f; + constexpr uint16_t kMask = 0x1f; - simd_batch masks(mask); + simd_batch masks(kMask); simd_batch words, shifts; simd_batch results; @@ -270,9 +270,9 @@ inline static const uint8_t* unpack5_16(const uint8_t* in, uint16_t* out) { inline static const uint8_t* unpack6_16(const uint8_t* in, uint16_t* out) { using simd_batch = xsimd::make_sized_batch_t; - uint16_t mask = 0x3f; + constexpr uint16_t kMask = 0x3f; - simd_batch masks(mask); + simd_batch masks(kMask); simd_batch words, shifts; simd_batch results; @@ -315,9 +315,9 @@ inline static const uint8_t* unpack6_16(const uint8_t* in, uint16_t* out) { inline static const uint8_t* unpack7_16(const uint8_t* in, uint16_t* out) { using simd_batch = xsimd::make_sized_batch_t; - uint16_t mask = 0x7f; + constexpr uint16_t kMask = 0x7f; - simd_batch masks(mask); + simd_batch masks(kMask); simd_batch words, shifts; simd_batch results; @@ -360,9 +360,9 @@ inline static const uint8_t* unpack7_16(const uint8_t* in, uint16_t* out) { inline static const uint8_t* unpack8_16(const uint8_t* in, uint16_t* out) { using simd_batch = xsimd::make_sized_batch_t; - uint16_t mask = 0xff; + constexpr uint16_t kMask = 0xff; - simd_batch masks(mask); + simd_batch masks(kMask); simd_batch words, shifts; simd_batch results; @@ -405,9 +405,9 @@ inline static const uint8_t* unpack8_16(const uint8_t* in, uint16_t* out) { inline static const uint8_t* unpack9_16(const uint8_t* in, uint16_t* out) { using simd_batch = xsimd::make_sized_batch_t; - uint16_t mask = 0x1ff; + constexpr uint16_t kMask = 0x1ff; - simd_batch masks(mask); + simd_batch masks(kMask); simd_batch words, shifts; simd_batch results; @@ -450,9 +450,9 @@ inline static const uint8_t* unpack9_16(const uint8_t* in, uint16_t* out) { inline static const uint8_t* unpack10_16(const uint8_t* in, uint16_t* out) { using simd_batch = xsimd::make_sized_batch_t; - uint16_t mask = 0x3ff; + constexpr uint16_t kMask = 0x3ff; - simd_batch masks(mask); + simd_batch masks(kMask); simd_batch words, shifts; simd_batch results; @@ -495,9 +495,9 @@ inline static const uint8_t* unpack10_16(const uint8_t* in, uint16_t* out) { inline static const uint8_t* unpack11_16(const uint8_t* in, uint16_t* out) { using simd_batch = xsimd::make_sized_batch_t; - uint16_t mask = 0x7ff; + constexpr uint16_t kMask = 0x7ff; - simd_batch masks(mask); + simd_batch masks(kMask); simd_batch words, shifts; simd_batch results; @@ -540,9 +540,9 @@ inline static const uint8_t* unpack11_16(const uint8_t* in, uint16_t* out) { inline static const uint8_t* unpack12_16(const uint8_t* in, uint16_t* out) { using simd_batch = xsimd::make_sized_batch_t; - uint16_t mask = 0xfff; + constexpr uint16_t kMask = 0xfff; - simd_batch masks(mask); + simd_batch masks(kMask); simd_batch words, shifts; simd_batch results; @@ -585,9 +585,9 @@ inline static const uint8_t* unpack12_16(const uint8_t* in, uint16_t* out) { inline static const uint8_t* unpack13_16(const uint8_t* in, uint16_t* out) { using simd_batch = xsimd::make_sized_batch_t; - uint16_t mask = 0x1fff; + constexpr uint16_t kMask = 0x1fff; - simd_batch masks(mask); + simd_batch masks(kMask); simd_batch words, shifts; simd_batch results; @@ -630,9 +630,9 @@ inline static const uint8_t* unpack13_16(const uint8_t* in, uint16_t* out) { inline static const uint8_t* unpack14_16(const uint8_t* in, uint16_t* out) { using simd_batch = xsimd::make_sized_batch_t; - uint16_t mask = 0x3fff; + constexpr uint16_t kMask = 0x3fff; - simd_batch masks(mask); + simd_batch masks(kMask); simd_batch words, shifts; simd_batch results; @@ -675,9 +675,9 @@ inline static const uint8_t* unpack14_16(const uint8_t* in, uint16_t* out) { inline static const uint8_t* unpack15_16(const uint8_t* in, uint16_t* out) { using simd_batch = xsimd::make_sized_batch_t; - uint16_t mask = 0x7fff; + constexpr uint16_t kMask = 0x7fff; - simd_batch masks(mask); + simd_batch masks(kMask); simd_batch words, shifts; simd_batch results; @@ -721,23 +721,24 @@ inline static const uint8_t* unpack16_16(const uint8_t* in, uint16_t* out) { std::memcpy(out, in, 16 * sizeof(*out)); in += 2 * 16; out += 16; - return in; } +}; // struct Unpacker +template<> +struct Simd128Unpacker { inline static const uint8_t* unpack0_32(const uint8_t* in, uint32_t* out) { std::memset(out, 0x0, 32 * sizeof(*out)); out += 32; - return in; } inline static const uint8_t* unpack1_32(const uint8_t* in, uint32_t* out) { using simd_batch = xsimd::make_sized_batch_t; - uint32_t mask = 0x1; + constexpr uint32_t kMask = 0x1; - simd_batch masks(mask); + simd_batch masks(kMask); simd_batch words, shifts; simd_batch results; @@ -844,9 +845,9 @@ inline static const uint8_t* unpack1_32(const uint8_t* in, uint32_t* out) { inline static const uint8_t* unpack2_32(const uint8_t* in, uint32_t* out) { using simd_batch = xsimd::make_sized_batch_t; - uint32_t mask = 0x3; + constexpr uint32_t kMask = 0x3; - simd_batch masks(mask); + simd_batch masks(kMask); simd_batch words, shifts; simd_batch results; @@ -953,9 +954,9 @@ inline static const uint8_t* unpack2_32(const uint8_t* in, uint32_t* out) { inline static const uint8_t* unpack3_32(const uint8_t* in, uint32_t* out) { using simd_batch = xsimd::make_sized_batch_t; - uint32_t mask = 0x7; + constexpr uint32_t kMask = 0x7; - simd_batch masks(mask); + simd_batch masks(kMask); simd_batch words, shifts; simd_batch results; @@ -1062,9 +1063,9 @@ inline static const uint8_t* unpack3_32(const uint8_t* in, uint32_t* out) { inline static const uint8_t* unpack4_32(const uint8_t* in, uint32_t* out) { using simd_batch = xsimd::make_sized_batch_t; - uint32_t mask = 0xf; + constexpr uint32_t kMask = 0xf; - simd_batch masks(mask); + simd_batch masks(kMask); simd_batch words, shifts; simd_batch results; @@ -1171,9 +1172,9 @@ inline static const uint8_t* unpack4_32(const uint8_t* in, uint32_t* out) { inline static const uint8_t* unpack5_32(const uint8_t* in, uint32_t* out) { using simd_batch = xsimd::make_sized_batch_t; - uint32_t mask = 0x1f; + constexpr uint32_t kMask = 0x1f; - simd_batch masks(mask); + simd_batch masks(kMask); simd_batch words, shifts; simd_batch results; @@ -1280,9 +1281,9 @@ inline static const uint8_t* unpack5_32(const uint8_t* in, uint32_t* out) { inline static const uint8_t* unpack6_32(const uint8_t* in, uint32_t* out) { using simd_batch = xsimd::make_sized_batch_t; - uint32_t mask = 0x3f; + constexpr uint32_t kMask = 0x3f; - simd_batch masks(mask); + simd_batch masks(kMask); simd_batch words, shifts; simd_batch results; @@ -1389,9 +1390,9 @@ inline static const uint8_t* unpack6_32(const uint8_t* in, uint32_t* out) { inline static const uint8_t* unpack7_32(const uint8_t* in, uint32_t* out) { using simd_batch = xsimd::make_sized_batch_t; - uint32_t mask = 0x7f; + constexpr uint32_t kMask = 0x7f; - simd_batch masks(mask); + simd_batch masks(kMask); simd_batch words, shifts; simd_batch results; @@ -1498,9 +1499,9 @@ inline static const uint8_t* unpack7_32(const uint8_t* in, uint32_t* out) { inline static const uint8_t* unpack8_32(const uint8_t* in, uint32_t* out) { using simd_batch = xsimd::make_sized_batch_t; - uint32_t mask = 0xff; + constexpr uint32_t kMask = 0xff; - simd_batch masks(mask); + simd_batch masks(kMask); simd_batch words, shifts; simd_batch results; @@ -1607,9 +1608,9 @@ inline static const uint8_t* unpack8_32(const uint8_t* in, uint32_t* out) { inline static const uint8_t* unpack9_32(const uint8_t* in, uint32_t* out) { using simd_batch = xsimd::make_sized_batch_t; - uint32_t mask = 0x1ff; + constexpr uint32_t kMask = 0x1ff; - simd_batch masks(mask); + simd_batch masks(kMask); simd_batch words, shifts; simd_batch results; @@ -1716,9 +1717,9 @@ inline static const uint8_t* unpack9_32(const uint8_t* in, uint32_t* out) { inline static const uint8_t* unpack10_32(const uint8_t* in, uint32_t* out) { using simd_batch = xsimd::make_sized_batch_t; - uint32_t mask = 0x3ff; + constexpr uint32_t kMask = 0x3ff; - simd_batch masks(mask); + simd_batch masks(kMask); simd_batch words, shifts; simd_batch results; @@ -1825,9 +1826,9 @@ inline static const uint8_t* unpack10_32(const uint8_t* in, uint32_t* out) { inline static const uint8_t* unpack11_32(const uint8_t* in, uint32_t* out) { using simd_batch = xsimd::make_sized_batch_t; - uint32_t mask = 0x7ff; + constexpr uint32_t kMask = 0x7ff; - simd_batch masks(mask); + simd_batch masks(kMask); simd_batch words, shifts; simd_batch results; @@ -1934,9 +1935,9 @@ inline static const uint8_t* unpack11_32(const uint8_t* in, uint32_t* out) { inline static const uint8_t* unpack12_32(const uint8_t* in, uint32_t* out) { using simd_batch = xsimd::make_sized_batch_t; - uint32_t mask = 0xfff; + constexpr uint32_t kMask = 0xfff; - simd_batch masks(mask); + simd_batch masks(kMask); simd_batch words, shifts; simd_batch results; @@ -2043,9 +2044,9 @@ inline static const uint8_t* unpack12_32(const uint8_t* in, uint32_t* out) { inline static const uint8_t* unpack13_32(const uint8_t* in, uint32_t* out) { using simd_batch = xsimd::make_sized_batch_t; - uint32_t mask = 0x1fff; + constexpr uint32_t kMask = 0x1fff; - simd_batch masks(mask); + simd_batch masks(kMask); simd_batch words, shifts; simd_batch results; @@ -2152,9 +2153,9 @@ inline static const uint8_t* unpack13_32(const uint8_t* in, uint32_t* out) { inline static const uint8_t* unpack14_32(const uint8_t* in, uint32_t* out) { using simd_batch = xsimd::make_sized_batch_t; - uint32_t mask = 0x3fff; + constexpr uint32_t kMask = 0x3fff; - simd_batch masks(mask); + simd_batch masks(kMask); simd_batch words, shifts; simd_batch results; @@ -2261,9 +2262,9 @@ inline static const uint8_t* unpack14_32(const uint8_t* in, uint32_t* out) { inline static const uint8_t* unpack15_32(const uint8_t* in, uint32_t* out) { using simd_batch = xsimd::make_sized_batch_t; - uint32_t mask = 0x7fff; + constexpr uint32_t kMask = 0x7fff; - simd_batch masks(mask); + simd_batch masks(kMask); simd_batch words, shifts; simd_batch results; @@ -2370,9 +2371,9 @@ inline static const uint8_t* unpack15_32(const uint8_t* in, uint32_t* out) { inline static const uint8_t* unpack16_32(const uint8_t* in, uint32_t* out) { using simd_batch = xsimd::make_sized_batch_t; - uint32_t mask = 0xffff; + constexpr uint32_t kMask = 0xffff; - simd_batch masks(mask); + simd_batch masks(kMask); simd_batch words, shifts; simd_batch results; @@ -2479,9 +2480,9 @@ inline static const uint8_t* unpack16_32(const uint8_t* in, uint32_t* out) { inline static const uint8_t* unpack17_32(const uint8_t* in, uint32_t* out) { using simd_batch = xsimd::make_sized_batch_t; - uint32_t mask = 0x1ffff; + constexpr uint32_t kMask = 0x1ffff; - simd_batch masks(mask); + simd_batch masks(kMask); simd_batch words, shifts; simd_batch results; @@ -2588,9 +2589,9 @@ inline static const uint8_t* unpack17_32(const uint8_t* in, uint32_t* out) { inline static const uint8_t* unpack18_32(const uint8_t* in, uint32_t* out) { using simd_batch = xsimd::make_sized_batch_t; - uint32_t mask = 0x3ffff; + constexpr uint32_t kMask = 0x3ffff; - simd_batch masks(mask); + simd_batch masks(kMask); simd_batch words, shifts; simd_batch results; @@ -2697,9 +2698,9 @@ inline static const uint8_t* unpack18_32(const uint8_t* in, uint32_t* out) { inline static const uint8_t* unpack19_32(const uint8_t* in, uint32_t* out) { using simd_batch = xsimd::make_sized_batch_t; - uint32_t mask = 0x7ffff; + constexpr uint32_t kMask = 0x7ffff; - simd_batch masks(mask); + simd_batch masks(kMask); simd_batch words, shifts; simd_batch results; @@ -2806,9 +2807,9 @@ inline static const uint8_t* unpack19_32(const uint8_t* in, uint32_t* out) { inline static const uint8_t* unpack20_32(const uint8_t* in, uint32_t* out) { using simd_batch = xsimd::make_sized_batch_t; - uint32_t mask = 0xfffff; + constexpr uint32_t kMask = 0xfffff; - simd_batch masks(mask); + simd_batch masks(kMask); simd_batch words, shifts; simd_batch results; @@ -2915,9 +2916,9 @@ inline static const uint8_t* unpack20_32(const uint8_t* in, uint32_t* out) { inline static const uint8_t* unpack21_32(const uint8_t* in, uint32_t* out) { using simd_batch = xsimd::make_sized_batch_t; - uint32_t mask = 0x1fffff; + constexpr uint32_t kMask = 0x1fffff; - simd_batch masks(mask); + simd_batch masks(kMask); simd_batch words, shifts; simd_batch results; @@ -3024,9 +3025,9 @@ inline static const uint8_t* unpack21_32(const uint8_t* in, uint32_t* out) { inline static const uint8_t* unpack22_32(const uint8_t* in, uint32_t* out) { using simd_batch = xsimd::make_sized_batch_t; - uint32_t mask = 0x3fffff; + constexpr uint32_t kMask = 0x3fffff; - simd_batch masks(mask); + simd_batch masks(kMask); simd_batch words, shifts; simd_batch results; @@ -3133,9 +3134,9 @@ inline static const uint8_t* unpack22_32(const uint8_t* in, uint32_t* out) { inline static const uint8_t* unpack23_32(const uint8_t* in, uint32_t* out) { using simd_batch = xsimd::make_sized_batch_t; - uint32_t mask = 0x7fffff; + constexpr uint32_t kMask = 0x7fffff; - simd_batch masks(mask); + simd_batch masks(kMask); simd_batch words, shifts; simd_batch results; @@ -3242,9 +3243,9 @@ inline static const uint8_t* unpack23_32(const uint8_t* in, uint32_t* out) { inline static const uint8_t* unpack24_32(const uint8_t* in, uint32_t* out) { using simd_batch = xsimd::make_sized_batch_t; - uint32_t mask = 0xffffff; + constexpr uint32_t kMask = 0xffffff; - simd_batch masks(mask); + simd_batch masks(kMask); simd_batch words, shifts; simd_batch results; @@ -3351,9 +3352,9 @@ inline static const uint8_t* unpack24_32(const uint8_t* in, uint32_t* out) { inline static const uint8_t* unpack25_32(const uint8_t* in, uint32_t* out) { using simd_batch = xsimd::make_sized_batch_t; - uint32_t mask = 0x1ffffff; + constexpr uint32_t kMask = 0x1ffffff; - simd_batch masks(mask); + simd_batch masks(kMask); simd_batch words, shifts; simd_batch results; @@ -3460,9 +3461,9 @@ inline static const uint8_t* unpack25_32(const uint8_t* in, uint32_t* out) { inline static const uint8_t* unpack26_32(const uint8_t* in, uint32_t* out) { using simd_batch = xsimd::make_sized_batch_t; - uint32_t mask = 0x3ffffff; + constexpr uint32_t kMask = 0x3ffffff; - simd_batch masks(mask); + simd_batch masks(kMask); simd_batch words, shifts; simd_batch results; @@ -3569,9 +3570,9 @@ inline static const uint8_t* unpack26_32(const uint8_t* in, uint32_t* out) { inline static const uint8_t* unpack27_32(const uint8_t* in, uint32_t* out) { using simd_batch = xsimd::make_sized_batch_t; - uint32_t mask = 0x7ffffff; + constexpr uint32_t kMask = 0x7ffffff; - simd_batch masks(mask); + simd_batch masks(kMask); simd_batch words, shifts; simd_batch results; @@ -3678,9 +3679,9 @@ inline static const uint8_t* unpack27_32(const uint8_t* in, uint32_t* out) { inline static const uint8_t* unpack28_32(const uint8_t* in, uint32_t* out) { using simd_batch = xsimd::make_sized_batch_t; - uint32_t mask = 0xfffffff; + constexpr uint32_t kMask = 0xfffffff; - simd_batch masks(mask); + simd_batch masks(kMask); simd_batch words, shifts; simd_batch results; @@ -3787,9 +3788,9 @@ inline static const uint8_t* unpack28_32(const uint8_t* in, uint32_t* out) { inline static const uint8_t* unpack29_32(const uint8_t* in, uint32_t* out) { using simd_batch = xsimd::make_sized_batch_t; - uint32_t mask = 0x1fffffff; + constexpr uint32_t kMask = 0x1fffffff; - simd_batch masks(mask); + simd_batch masks(kMask); simd_batch words, shifts; simd_batch results; @@ -3896,9 +3897,9 @@ inline static const uint8_t* unpack29_32(const uint8_t* in, uint32_t* out) { inline static const uint8_t* unpack30_32(const uint8_t* in, uint32_t* out) { using simd_batch = xsimd::make_sized_batch_t; - uint32_t mask = 0x3fffffff; + constexpr uint32_t kMask = 0x3fffffff; - simd_batch masks(mask); + simd_batch masks(kMask); simd_batch words, shifts; simd_batch results; @@ -4005,9 +4006,9 @@ inline static const uint8_t* unpack30_32(const uint8_t* in, uint32_t* out) { inline static const uint8_t* unpack31_32(const uint8_t* in, uint32_t* out) { using simd_batch = xsimd::make_sized_batch_t; - uint32_t mask = 0x7fffffff; + constexpr uint32_t kMask = 0x7fffffff; - simd_batch masks(mask); + simd_batch masks(kMask); simd_batch words, shifts; simd_batch results; @@ -4115,12 +4116,10 @@ inline static const uint8_t* unpack32_32(const uint8_t* in, uint32_t* out) { std::memcpy(out, in, 32 * sizeof(*out)); in += 4 * 32; out += 32; - return in; } +}; // struct Unpacker -}; // struct UnpackBits128 -} // namespace } // namespace arrow::internal diff --git a/cpp/src/arrow/util/bpacking_simd256_generated_internal.h b/cpp/src/arrow/util/bpacking_simd256_generated_internal.h index 6e187831f1e..8b791f08757 100644 --- a/cpp/src/arrow/util/bpacking_simd256_generated_internal.h +++ b/cpp/src/arrow/util/bpacking_simd256_generated_internal.h @@ -15,7 +15,9 @@ // specific language governing permissions and limitations // under the License. -// Automatically generated file; DO NOT EDIT. +// WARNING: this file is generated, DO NOT EDIT. +// Usage: +// python cpp/src/arrow/util/bpacking_simd_codegen.py 256 #pragma once @@ -24,30 +26,28 @@ #include -#include "arrow/util/dispatch_internal.h" #include "arrow/util/ubsan.h" namespace arrow::internal { -namespace { using ::arrow::util::SafeLoadAs; -template -struct UnpackBits256 { - +template +struct Simd256Unpacker; +template<> +struct Simd256Unpacker { inline static const uint8_t* unpack0_16(const uint8_t* in, uint16_t* out) { std::memset(out, 0x0, 16 * sizeof(*out)); out += 16; - return in; } inline static const uint8_t* unpack1_16(const uint8_t* in, uint16_t* out) { using simd_batch = xsimd::make_sized_batch_t; - uint16_t mask = 0x1; + constexpr uint16_t kMask = 0x1; - simd_batch masks(mask); + simd_batch masks(kMask); simd_batch words, shifts; simd_batch results; @@ -82,9 +82,9 @@ inline static const uint8_t* unpack1_16(const uint8_t* in, uint16_t* out) { inline static const uint8_t* unpack2_16(const uint8_t* in, uint16_t* out) { using simd_batch = xsimd::make_sized_batch_t; - uint16_t mask = 0x3; + constexpr uint16_t kMask = 0x3; - simd_batch masks(mask); + simd_batch masks(kMask); simd_batch words, shifts; simd_batch results; @@ -119,9 +119,9 @@ inline static const uint8_t* unpack2_16(const uint8_t* in, uint16_t* out) { inline static const uint8_t* unpack3_16(const uint8_t* in, uint16_t* out) { using simd_batch = xsimd::make_sized_batch_t; - uint16_t mask = 0x7; + constexpr uint16_t kMask = 0x7; - simd_batch masks(mask); + simd_batch masks(kMask); simd_batch words, shifts; simd_batch results; @@ -156,9 +156,9 @@ inline static const uint8_t* unpack3_16(const uint8_t* in, uint16_t* out) { inline static const uint8_t* unpack4_16(const uint8_t* in, uint16_t* out) { using simd_batch = xsimd::make_sized_batch_t; - uint16_t mask = 0xf; + constexpr uint16_t kMask = 0xf; - simd_batch masks(mask); + simd_batch masks(kMask); simd_batch words, shifts; simd_batch results; @@ -193,9 +193,9 @@ inline static const uint8_t* unpack4_16(const uint8_t* in, uint16_t* out) { inline static const uint8_t* unpack5_16(const uint8_t* in, uint16_t* out) { using simd_batch = xsimd::make_sized_batch_t; - uint16_t mask = 0x1f; + constexpr uint16_t kMask = 0x1f; - simd_batch masks(mask); + simd_batch masks(kMask); simd_batch words, shifts; simd_batch results; @@ -230,9 +230,9 @@ inline static const uint8_t* unpack5_16(const uint8_t* in, uint16_t* out) { inline static const uint8_t* unpack6_16(const uint8_t* in, uint16_t* out) { using simd_batch = xsimd::make_sized_batch_t; - uint16_t mask = 0x3f; + constexpr uint16_t kMask = 0x3f; - simd_batch masks(mask); + simd_batch masks(kMask); simd_batch words, shifts; simd_batch results; @@ -267,9 +267,9 @@ inline static const uint8_t* unpack6_16(const uint8_t* in, uint16_t* out) { inline static const uint8_t* unpack7_16(const uint8_t* in, uint16_t* out) { using simd_batch = xsimd::make_sized_batch_t; - uint16_t mask = 0x7f; + constexpr uint16_t kMask = 0x7f; - simd_batch masks(mask); + simd_batch masks(kMask); simd_batch words, shifts; simd_batch results; @@ -304,9 +304,9 @@ inline static const uint8_t* unpack7_16(const uint8_t* in, uint16_t* out) { inline static const uint8_t* unpack8_16(const uint8_t* in, uint16_t* out) { using simd_batch = xsimd::make_sized_batch_t; - uint16_t mask = 0xff; + constexpr uint16_t kMask = 0xff; - simd_batch masks(mask); + simd_batch masks(kMask); simd_batch words, shifts; simd_batch results; @@ -341,9 +341,9 @@ inline static const uint8_t* unpack8_16(const uint8_t* in, uint16_t* out) { inline static const uint8_t* unpack9_16(const uint8_t* in, uint16_t* out) { using simd_batch = xsimd::make_sized_batch_t; - uint16_t mask = 0x1ff; + constexpr uint16_t kMask = 0x1ff; - simd_batch masks(mask); + simd_batch masks(kMask); simd_batch words, shifts; simd_batch results; @@ -378,9 +378,9 @@ inline static const uint8_t* unpack9_16(const uint8_t* in, uint16_t* out) { inline static const uint8_t* unpack10_16(const uint8_t* in, uint16_t* out) { using simd_batch = xsimd::make_sized_batch_t; - uint16_t mask = 0x3ff; + constexpr uint16_t kMask = 0x3ff; - simd_batch masks(mask); + simd_batch masks(kMask); simd_batch words, shifts; simd_batch results; @@ -415,9 +415,9 @@ inline static const uint8_t* unpack10_16(const uint8_t* in, uint16_t* out) { inline static const uint8_t* unpack11_16(const uint8_t* in, uint16_t* out) { using simd_batch = xsimd::make_sized_batch_t; - uint16_t mask = 0x7ff; + constexpr uint16_t kMask = 0x7ff; - simd_batch masks(mask); + simd_batch masks(kMask); simd_batch words, shifts; simd_batch results; @@ -452,9 +452,9 @@ inline static const uint8_t* unpack11_16(const uint8_t* in, uint16_t* out) { inline static const uint8_t* unpack12_16(const uint8_t* in, uint16_t* out) { using simd_batch = xsimd::make_sized_batch_t; - uint16_t mask = 0xfff; + constexpr uint16_t kMask = 0xfff; - simd_batch masks(mask); + simd_batch masks(kMask); simd_batch words, shifts; simd_batch results; @@ -489,9 +489,9 @@ inline static const uint8_t* unpack12_16(const uint8_t* in, uint16_t* out) { inline static const uint8_t* unpack13_16(const uint8_t* in, uint16_t* out) { using simd_batch = xsimd::make_sized_batch_t; - uint16_t mask = 0x1fff; + constexpr uint16_t kMask = 0x1fff; - simd_batch masks(mask); + simd_batch masks(kMask); simd_batch words, shifts; simd_batch results; @@ -526,9 +526,9 @@ inline static const uint8_t* unpack13_16(const uint8_t* in, uint16_t* out) { inline static const uint8_t* unpack14_16(const uint8_t* in, uint16_t* out) { using simd_batch = xsimd::make_sized_batch_t; - uint16_t mask = 0x3fff; + constexpr uint16_t kMask = 0x3fff; - simd_batch masks(mask); + simd_batch masks(kMask); simd_batch words, shifts; simd_batch results; @@ -563,9 +563,9 @@ inline static const uint8_t* unpack14_16(const uint8_t* in, uint16_t* out) { inline static const uint8_t* unpack15_16(const uint8_t* in, uint16_t* out) { using simd_batch = xsimd::make_sized_batch_t; - uint16_t mask = 0x7fff; + constexpr uint16_t kMask = 0x7fff; - simd_batch masks(mask); + simd_batch masks(kMask); simd_batch words, shifts; simd_batch results; @@ -601,23 +601,24 @@ inline static const uint8_t* unpack16_16(const uint8_t* in, uint16_t* out) { std::memcpy(out, in, 16 * sizeof(*out)); in += 2 * 16; out += 16; - return in; } +}; // struct Unpacker +template<> +struct Simd256Unpacker { inline static const uint8_t* unpack0_32(const uint8_t* in, uint32_t* out) { std::memset(out, 0x0, 32 * sizeof(*out)); out += 32; - return in; } inline static const uint8_t* unpack1_32(const uint8_t* in, uint32_t* out) { using simd_batch = xsimd::make_sized_batch_t; - uint32_t mask = 0x1; + constexpr uint32_t kMask = 0x1; - simd_batch masks(mask); + simd_batch masks(kMask); simd_batch words, shifts; simd_batch results; @@ -692,9 +693,9 @@ inline static const uint8_t* unpack1_32(const uint8_t* in, uint32_t* out) { inline static const uint8_t* unpack2_32(const uint8_t* in, uint32_t* out) { using simd_batch = xsimd::make_sized_batch_t; - uint32_t mask = 0x3; + constexpr uint32_t kMask = 0x3; - simd_batch masks(mask); + simd_batch masks(kMask); simd_batch words, shifts; simd_batch results; @@ -769,9 +770,9 @@ inline static const uint8_t* unpack2_32(const uint8_t* in, uint32_t* out) { inline static const uint8_t* unpack3_32(const uint8_t* in, uint32_t* out) { using simd_batch = xsimd::make_sized_batch_t; - uint32_t mask = 0x7; + constexpr uint32_t kMask = 0x7; - simd_batch masks(mask); + simd_batch masks(kMask); simd_batch words, shifts; simd_batch results; @@ -846,9 +847,9 @@ inline static const uint8_t* unpack3_32(const uint8_t* in, uint32_t* out) { inline static const uint8_t* unpack4_32(const uint8_t* in, uint32_t* out) { using simd_batch = xsimd::make_sized_batch_t; - uint32_t mask = 0xf; + constexpr uint32_t kMask = 0xf; - simd_batch masks(mask); + simd_batch masks(kMask); simd_batch words, shifts; simd_batch results; @@ -923,9 +924,9 @@ inline static const uint8_t* unpack4_32(const uint8_t* in, uint32_t* out) { inline static const uint8_t* unpack5_32(const uint8_t* in, uint32_t* out) { using simd_batch = xsimd::make_sized_batch_t; - uint32_t mask = 0x1f; + constexpr uint32_t kMask = 0x1f; - simd_batch masks(mask); + simd_batch masks(kMask); simd_batch words, shifts; simd_batch results; @@ -1000,9 +1001,9 @@ inline static const uint8_t* unpack5_32(const uint8_t* in, uint32_t* out) { inline static const uint8_t* unpack6_32(const uint8_t* in, uint32_t* out) { using simd_batch = xsimd::make_sized_batch_t; - uint32_t mask = 0x3f; + constexpr uint32_t kMask = 0x3f; - simd_batch masks(mask); + simd_batch masks(kMask); simd_batch words, shifts; simd_batch results; @@ -1077,9 +1078,9 @@ inline static const uint8_t* unpack6_32(const uint8_t* in, uint32_t* out) { inline static const uint8_t* unpack7_32(const uint8_t* in, uint32_t* out) { using simd_batch = xsimd::make_sized_batch_t; - uint32_t mask = 0x7f; + constexpr uint32_t kMask = 0x7f; - simd_batch masks(mask); + simd_batch masks(kMask); simd_batch words, shifts; simd_batch results; @@ -1154,9 +1155,9 @@ inline static const uint8_t* unpack7_32(const uint8_t* in, uint32_t* out) { inline static const uint8_t* unpack8_32(const uint8_t* in, uint32_t* out) { using simd_batch = xsimd::make_sized_batch_t; - uint32_t mask = 0xff; + constexpr uint32_t kMask = 0xff; - simd_batch masks(mask); + simd_batch masks(kMask); simd_batch words, shifts; simd_batch results; @@ -1231,9 +1232,9 @@ inline static const uint8_t* unpack8_32(const uint8_t* in, uint32_t* out) { inline static const uint8_t* unpack9_32(const uint8_t* in, uint32_t* out) { using simd_batch = xsimd::make_sized_batch_t; - uint32_t mask = 0x1ff; + constexpr uint32_t kMask = 0x1ff; - simd_batch masks(mask); + simd_batch masks(kMask); simd_batch words, shifts; simd_batch results; @@ -1308,9 +1309,9 @@ inline static const uint8_t* unpack9_32(const uint8_t* in, uint32_t* out) { inline static const uint8_t* unpack10_32(const uint8_t* in, uint32_t* out) { using simd_batch = xsimd::make_sized_batch_t; - uint32_t mask = 0x3ff; + constexpr uint32_t kMask = 0x3ff; - simd_batch masks(mask); + simd_batch masks(kMask); simd_batch words, shifts; simd_batch results; @@ -1385,9 +1386,9 @@ inline static const uint8_t* unpack10_32(const uint8_t* in, uint32_t* out) { inline static const uint8_t* unpack11_32(const uint8_t* in, uint32_t* out) { using simd_batch = xsimd::make_sized_batch_t; - uint32_t mask = 0x7ff; + constexpr uint32_t kMask = 0x7ff; - simd_batch masks(mask); + simd_batch masks(kMask); simd_batch words, shifts; simd_batch results; @@ -1462,9 +1463,9 @@ inline static const uint8_t* unpack11_32(const uint8_t* in, uint32_t* out) { inline static const uint8_t* unpack12_32(const uint8_t* in, uint32_t* out) { using simd_batch = xsimd::make_sized_batch_t; - uint32_t mask = 0xfff; + constexpr uint32_t kMask = 0xfff; - simd_batch masks(mask); + simd_batch masks(kMask); simd_batch words, shifts; simd_batch results; @@ -1539,9 +1540,9 @@ inline static const uint8_t* unpack12_32(const uint8_t* in, uint32_t* out) { inline static const uint8_t* unpack13_32(const uint8_t* in, uint32_t* out) { using simd_batch = xsimd::make_sized_batch_t; - uint32_t mask = 0x1fff; + constexpr uint32_t kMask = 0x1fff; - simd_batch masks(mask); + simd_batch masks(kMask); simd_batch words, shifts; simd_batch results; @@ -1616,9 +1617,9 @@ inline static const uint8_t* unpack13_32(const uint8_t* in, uint32_t* out) { inline static const uint8_t* unpack14_32(const uint8_t* in, uint32_t* out) { using simd_batch = xsimd::make_sized_batch_t; - uint32_t mask = 0x3fff; + constexpr uint32_t kMask = 0x3fff; - simd_batch masks(mask); + simd_batch masks(kMask); simd_batch words, shifts; simd_batch results; @@ -1693,9 +1694,9 @@ inline static const uint8_t* unpack14_32(const uint8_t* in, uint32_t* out) { inline static const uint8_t* unpack15_32(const uint8_t* in, uint32_t* out) { using simd_batch = xsimd::make_sized_batch_t; - uint32_t mask = 0x7fff; + constexpr uint32_t kMask = 0x7fff; - simd_batch masks(mask); + simd_batch masks(kMask); simd_batch words, shifts; simd_batch results; @@ -1770,9 +1771,9 @@ inline static const uint8_t* unpack15_32(const uint8_t* in, uint32_t* out) { inline static const uint8_t* unpack16_32(const uint8_t* in, uint32_t* out) { using simd_batch = xsimd::make_sized_batch_t; - uint32_t mask = 0xffff; + constexpr uint32_t kMask = 0xffff; - simd_batch masks(mask); + simd_batch masks(kMask); simd_batch words, shifts; simd_batch results; @@ -1847,9 +1848,9 @@ inline static const uint8_t* unpack16_32(const uint8_t* in, uint32_t* out) { inline static const uint8_t* unpack17_32(const uint8_t* in, uint32_t* out) { using simd_batch = xsimd::make_sized_batch_t; - uint32_t mask = 0x1ffff; + constexpr uint32_t kMask = 0x1ffff; - simd_batch masks(mask); + simd_batch masks(kMask); simd_batch words, shifts; simd_batch results; @@ -1924,9 +1925,9 @@ inline static const uint8_t* unpack17_32(const uint8_t* in, uint32_t* out) { inline static const uint8_t* unpack18_32(const uint8_t* in, uint32_t* out) { using simd_batch = xsimd::make_sized_batch_t; - uint32_t mask = 0x3ffff; + constexpr uint32_t kMask = 0x3ffff; - simd_batch masks(mask); + simd_batch masks(kMask); simd_batch words, shifts; simd_batch results; @@ -2001,9 +2002,9 @@ inline static const uint8_t* unpack18_32(const uint8_t* in, uint32_t* out) { inline static const uint8_t* unpack19_32(const uint8_t* in, uint32_t* out) { using simd_batch = xsimd::make_sized_batch_t; - uint32_t mask = 0x7ffff; + constexpr uint32_t kMask = 0x7ffff; - simd_batch masks(mask); + simd_batch masks(kMask); simd_batch words, shifts; simd_batch results; @@ -2078,9 +2079,9 @@ inline static const uint8_t* unpack19_32(const uint8_t* in, uint32_t* out) { inline static const uint8_t* unpack20_32(const uint8_t* in, uint32_t* out) { using simd_batch = xsimd::make_sized_batch_t; - uint32_t mask = 0xfffff; + constexpr uint32_t kMask = 0xfffff; - simd_batch masks(mask); + simd_batch masks(kMask); simd_batch words, shifts; simd_batch results; @@ -2155,9 +2156,9 @@ inline static const uint8_t* unpack20_32(const uint8_t* in, uint32_t* out) { inline static const uint8_t* unpack21_32(const uint8_t* in, uint32_t* out) { using simd_batch = xsimd::make_sized_batch_t; - uint32_t mask = 0x1fffff; + constexpr uint32_t kMask = 0x1fffff; - simd_batch masks(mask); + simd_batch masks(kMask); simd_batch words, shifts; simd_batch results; @@ -2232,9 +2233,9 @@ inline static const uint8_t* unpack21_32(const uint8_t* in, uint32_t* out) { inline static const uint8_t* unpack22_32(const uint8_t* in, uint32_t* out) { using simd_batch = xsimd::make_sized_batch_t; - uint32_t mask = 0x3fffff; + constexpr uint32_t kMask = 0x3fffff; - simd_batch masks(mask); + simd_batch masks(kMask); simd_batch words, shifts; simd_batch results; @@ -2309,9 +2310,9 @@ inline static const uint8_t* unpack22_32(const uint8_t* in, uint32_t* out) { inline static const uint8_t* unpack23_32(const uint8_t* in, uint32_t* out) { using simd_batch = xsimd::make_sized_batch_t; - uint32_t mask = 0x7fffff; + constexpr uint32_t kMask = 0x7fffff; - simd_batch masks(mask); + simd_batch masks(kMask); simd_batch words, shifts; simd_batch results; @@ -2386,9 +2387,9 @@ inline static const uint8_t* unpack23_32(const uint8_t* in, uint32_t* out) { inline static const uint8_t* unpack24_32(const uint8_t* in, uint32_t* out) { using simd_batch = xsimd::make_sized_batch_t; - uint32_t mask = 0xffffff; + constexpr uint32_t kMask = 0xffffff; - simd_batch masks(mask); + simd_batch masks(kMask); simd_batch words, shifts; simd_batch results; @@ -2463,9 +2464,9 @@ inline static const uint8_t* unpack24_32(const uint8_t* in, uint32_t* out) { inline static const uint8_t* unpack25_32(const uint8_t* in, uint32_t* out) { using simd_batch = xsimd::make_sized_batch_t; - uint32_t mask = 0x1ffffff; + constexpr uint32_t kMask = 0x1ffffff; - simd_batch masks(mask); + simd_batch masks(kMask); simd_batch words, shifts; simd_batch results; @@ -2540,9 +2541,9 @@ inline static const uint8_t* unpack25_32(const uint8_t* in, uint32_t* out) { inline static const uint8_t* unpack26_32(const uint8_t* in, uint32_t* out) { using simd_batch = xsimd::make_sized_batch_t; - uint32_t mask = 0x3ffffff; + constexpr uint32_t kMask = 0x3ffffff; - simd_batch masks(mask); + simd_batch masks(kMask); simd_batch words, shifts; simd_batch results; @@ -2617,9 +2618,9 @@ inline static const uint8_t* unpack26_32(const uint8_t* in, uint32_t* out) { inline static const uint8_t* unpack27_32(const uint8_t* in, uint32_t* out) { using simd_batch = xsimd::make_sized_batch_t; - uint32_t mask = 0x7ffffff; + constexpr uint32_t kMask = 0x7ffffff; - simd_batch masks(mask); + simd_batch masks(kMask); simd_batch words, shifts; simd_batch results; @@ -2694,9 +2695,9 @@ inline static const uint8_t* unpack27_32(const uint8_t* in, uint32_t* out) { inline static const uint8_t* unpack28_32(const uint8_t* in, uint32_t* out) { using simd_batch = xsimd::make_sized_batch_t; - uint32_t mask = 0xfffffff; + constexpr uint32_t kMask = 0xfffffff; - simd_batch masks(mask); + simd_batch masks(kMask); simd_batch words, shifts; simd_batch results; @@ -2771,9 +2772,9 @@ inline static const uint8_t* unpack28_32(const uint8_t* in, uint32_t* out) { inline static const uint8_t* unpack29_32(const uint8_t* in, uint32_t* out) { using simd_batch = xsimd::make_sized_batch_t; - uint32_t mask = 0x1fffffff; + constexpr uint32_t kMask = 0x1fffffff; - simd_batch masks(mask); + simd_batch masks(kMask); simd_batch words, shifts; simd_batch results; @@ -2848,9 +2849,9 @@ inline static const uint8_t* unpack29_32(const uint8_t* in, uint32_t* out) { inline static const uint8_t* unpack30_32(const uint8_t* in, uint32_t* out) { using simd_batch = xsimd::make_sized_batch_t; - uint32_t mask = 0x3fffffff; + constexpr uint32_t kMask = 0x3fffffff; - simd_batch masks(mask); + simd_batch masks(kMask); simd_batch words, shifts; simd_batch results; @@ -2925,9 +2926,9 @@ inline static const uint8_t* unpack30_32(const uint8_t* in, uint32_t* out) { inline static const uint8_t* unpack31_32(const uint8_t* in, uint32_t* out) { using simd_batch = xsimd::make_sized_batch_t; - uint32_t mask = 0x7fffffff; + constexpr uint32_t kMask = 0x7fffffff; - simd_batch masks(mask); + simd_batch masks(kMask); simd_batch words, shifts; simd_batch results; @@ -3003,12 +3004,10 @@ inline static const uint8_t* unpack32_32(const uint8_t* in, uint32_t* out) { std::memcpy(out, in, 32 * sizeof(*out)); in += 4 * 32; out += 32; - return in; } +}; // struct Unpacker -}; // struct UnpackBits256 -} // namespace } // namespace arrow::internal diff --git a/cpp/src/arrow/util/bpacking_simd512_generated_internal.h b/cpp/src/arrow/util/bpacking_simd512_generated_internal.h index da53ddfd381..9848c55957e 100644 --- a/cpp/src/arrow/util/bpacking_simd512_generated_internal.h +++ b/cpp/src/arrow/util/bpacking_simd512_generated_internal.h @@ -15,7 +15,9 @@ // specific language governing permissions and limitations // under the License. -// Automatically generated file; DO NOT EDIT. +// WARNING: this file is generated, DO NOT EDIT. +// Usage: +// python cpp/src/arrow/util/bpacking_simd_codegen.py 512 #pragma once @@ -24,30 +26,28 @@ #include -#include "arrow/util/dispatch_internal.h" #include "arrow/util/ubsan.h" namespace arrow::internal { -namespace { using ::arrow::util::SafeLoadAs; -template -struct UnpackBits512 { - +template +struct Simd512Unpacker; +template<> +struct Simd512Unpacker { inline static const uint8_t* unpack0_16(const uint8_t* in, uint16_t* out) { std::memset(out, 0x0, 16 * sizeof(*out)); out += 16; - return in; } inline static const uint8_t* unpack1_16(const uint8_t* in, uint16_t* out) { using simd_batch = xsimd::make_sized_batch_t; - uint16_t mask = 0x1; + constexpr uint16_t kMask = 0x1; - simd_batch masks(mask); + simd_batch masks(kMask); simd_batch words, shifts; simd_batch results; @@ -82,9 +82,9 @@ inline static const uint8_t* unpack1_16(const uint8_t* in, uint16_t* out) { inline static const uint8_t* unpack2_16(const uint8_t* in, uint16_t* out) { using simd_batch = xsimd::make_sized_batch_t; - uint16_t mask = 0x3; + constexpr uint16_t kMask = 0x3; - simd_batch masks(mask); + simd_batch masks(kMask); simd_batch words, shifts; simd_batch results; @@ -119,9 +119,9 @@ inline static const uint8_t* unpack2_16(const uint8_t* in, uint16_t* out) { inline static const uint8_t* unpack3_16(const uint8_t* in, uint16_t* out) { using simd_batch = xsimd::make_sized_batch_t; - uint16_t mask = 0x7; + constexpr uint16_t kMask = 0x7; - simd_batch masks(mask); + simd_batch masks(kMask); simd_batch words, shifts; simd_batch results; @@ -156,9 +156,9 @@ inline static const uint8_t* unpack3_16(const uint8_t* in, uint16_t* out) { inline static const uint8_t* unpack4_16(const uint8_t* in, uint16_t* out) { using simd_batch = xsimd::make_sized_batch_t; - uint16_t mask = 0xf; + constexpr uint16_t kMask = 0xf; - simd_batch masks(mask); + simd_batch masks(kMask); simd_batch words, shifts; simd_batch results; @@ -193,9 +193,9 @@ inline static const uint8_t* unpack4_16(const uint8_t* in, uint16_t* out) { inline static const uint8_t* unpack5_16(const uint8_t* in, uint16_t* out) { using simd_batch = xsimd::make_sized_batch_t; - uint16_t mask = 0x1f; + constexpr uint16_t kMask = 0x1f; - simd_batch masks(mask); + simd_batch masks(kMask); simd_batch words, shifts; simd_batch results; @@ -230,9 +230,9 @@ inline static const uint8_t* unpack5_16(const uint8_t* in, uint16_t* out) { inline static const uint8_t* unpack6_16(const uint8_t* in, uint16_t* out) { using simd_batch = xsimd::make_sized_batch_t; - uint16_t mask = 0x3f; + constexpr uint16_t kMask = 0x3f; - simd_batch masks(mask); + simd_batch masks(kMask); simd_batch words, shifts; simd_batch results; @@ -267,9 +267,9 @@ inline static const uint8_t* unpack6_16(const uint8_t* in, uint16_t* out) { inline static const uint8_t* unpack7_16(const uint8_t* in, uint16_t* out) { using simd_batch = xsimd::make_sized_batch_t; - uint16_t mask = 0x7f; + constexpr uint16_t kMask = 0x7f; - simd_batch masks(mask); + simd_batch masks(kMask); simd_batch words, shifts; simd_batch results; @@ -304,9 +304,9 @@ inline static const uint8_t* unpack7_16(const uint8_t* in, uint16_t* out) { inline static const uint8_t* unpack8_16(const uint8_t* in, uint16_t* out) { using simd_batch = xsimd::make_sized_batch_t; - uint16_t mask = 0xff; + constexpr uint16_t kMask = 0xff; - simd_batch masks(mask); + simd_batch masks(kMask); simd_batch words, shifts; simd_batch results; @@ -341,9 +341,9 @@ inline static const uint8_t* unpack8_16(const uint8_t* in, uint16_t* out) { inline static const uint8_t* unpack9_16(const uint8_t* in, uint16_t* out) { using simd_batch = xsimd::make_sized_batch_t; - uint16_t mask = 0x1ff; + constexpr uint16_t kMask = 0x1ff; - simd_batch masks(mask); + simd_batch masks(kMask); simd_batch words, shifts; simd_batch results; @@ -378,9 +378,9 @@ inline static const uint8_t* unpack9_16(const uint8_t* in, uint16_t* out) { inline static const uint8_t* unpack10_16(const uint8_t* in, uint16_t* out) { using simd_batch = xsimd::make_sized_batch_t; - uint16_t mask = 0x3ff; + constexpr uint16_t kMask = 0x3ff; - simd_batch masks(mask); + simd_batch masks(kMask); simd_batch words, shifts; simd_batch results; @@ -415,9 +415,9 @@ inline static const uint8_t* unpack10_16(const uint8_t* in, uint16_t* out) { inline static const uint8_t* unpack11_16(const uint8_t* in, uint16_t* out) { using simd_batch = xsimd::make_sized_batch_t; - uint16_t mask = 0x7ff; + constexpr uint16_t kMask = 0x7ff; - simd_batch masks(mask); + simd_batch masks(kMask); simd_batch words, shifts; simd_batch results; @@ -452,9 +452,9 @@ inline static const uint8_t* unpack11_16(const uint8_t* in, uint16_t* out) { inline static const uint8_t* unpack12_16(const uint8_t* in, uint16_t* out) { using simd_batch = xsimd::make_sized_batch_t; - uint16_t mask = 0xfff; + constexpr uint16_t kMask = 0xfff; - simd_batch masks(mask); + simd_batch masks(kMask); simd_batch words, shifts; simd_batch results; @@ -489,9 +489,9 @@ inline static const uint8_t* unpack12_16(const uint8_t* in, uint16_t* out) { inline static const uint8_t* unpack13_16(const uint8_t* in, uint16_t* out) { using simd_batch = xsimd::make_sized_batch_t; - uint16_t mask = 0x1fff; + constexpr uint16_t kMask = 0x1fff; - simd_batch masks(mask); + simd_batch masks(kMask); simd_batch words, shifts; simd_batch results; @@ -526,9 +526,9 @@ inline static const uint8_t* unpack13_16(const uint8_t* in, uint16_t* out) { inline static const uint8_t* unpack14_16(const uint8_t* in, uint16_t* out) { using simd_batch = xsimd::make_sized_batch_t; - uint16_t mask = 0x3fff; + constexpr uint16_t kMask = 0x3fff; - simd_batch masks(mask); + simd_batch masks(kMask); simd_batch words, shifts; simd_batch results; @@ -563,9 +563,9 @@ inline static const uint8_t* unpack14_16(const uint8_t* in, uint16_t* out) { inline static const uint8_t* unpack15_16(const uint8_t* in, uint16_t* out) { using simd_batch = xsimd::make_sized_batch_t; - uint16_t mask = 0x7fff; + constexpr uint16_t kMask = 0x7fff; - simd_batch masks(mask); + simd_batch masks(kMask); simd_batch words, shifts; simd_batch results; @@ -601,23 +601,24 @@ inline static const uint8_t* unpack16_16(const uint8_t* in, uint16_t* out) { std::memcpy(out, in, 16 * sizeof(*out)); in += 2 * 16; out += 16; - return in; } +}; // struct Unpacker +template<> +struct Simd512Unpacker { inline static const uint8_t* unpack0_32(const uint8_t* in, uint32_t* out) { std::memset(out, 0x0, 32 * sizeof(*out)); out += 32; - return in; } inline static const uint8_t* unpack1_32(const uint8_t* in, uint32_t* out) { using simd_batch = xsimd::make_sized_batch_t; - uint32_t mask = 0x1; + constexpr uint32_t kMask = 0x1; - simd_batch masks(mask); + simd_batch masks(kMask); simd_batch words, shifts; simd_batch results; @@ -676,9 +677,9 @@ inline static const uint8_t* unpack1_32(const uint8_t* in, uint32_t* out) { inline static const uint8_t* unpack2_32(const uint8_t* in, uint32_t* out) { using simd_batch = xsimd::make_sized_batch_t; - uint32_t mask = 0x3; + constexpr uint32_t kMask = 0x3; - simd_batch masks(mask); + simd_batch masks(kMask); simd_batch words, shifts; simd_batch results; @@ -737,9 +738,9 @@ inline static const uint8_t* unpack2_32(const uint8_t* in, uint32_t* out) { inline static const uint8_t* unpack3_32(const uint8_t* in, uint32_t* out) { using simd_batch = xsimd::make_sized_batch_t; - uint32_t mask = 0x7; + constexpr uint32_t kMask = 0x7; - simd_batch masks(mask); + simd_batch masks(kMask); simd_batch words, shifts; simd_batch results; @@ -798,9 +799,9 @@ inline static const uint8_t* unpack3_32(const uint8_t* in, uint32_t* out) { inline static const uint8_t* unpack4_32(const uint8_t* in, uint32_t* out) { using simd_batch = xsimd::make_sized_batch_t; - uint32_t mask = 0xf; + constexpr uint32_t kMask = 0xf; - simd_batch masks(mask); + simd_batch masks(kMask); simd_batch words, shifts; simd_batch results; @@ -859,9 +860,9 @@ inline static const uint8_t* unpack4_32(const uint8_t* in, uint32_t* out) { inline static const uint8_t* unpack5_32(const uint8_t* in, uint32_t* out) { using simd_batch = xsimd::make_sized_batch_t; - uint32_t mask = 0x1f; + constexpr uint32_t kMask = 0x1f; - simd_batch masks(mask); + simd_batch masks(kMask); simd_batch words, shifts; simd_batch results; @@ -920,9 +921,9 @@ inline static const uint8_t* unpack5_32(const uint8_t* in, uint32_t* out) { inline static const uint8_t* unpack6_32(const uint8_t* in, uint32_t* out) { using simd_batch = xsimd::make_sized_batch_t; - uint32_t mask = 0x3f; + constexpr uint32_t kMask = 0x3f; - simd_batch masks(mask); + simd_batch masks(kMask); simd_batch words, shifts; simd_batch results; @@ -981,9 +982,9 @@ inline static const uint8_t* unpack6_32(const uint8_t* in, uint32_t* out) { inline static const uint8_t* unpack7_32(const uint8_t* in, uint32_t* out) { using simd_batch = xsimd::make_sized_batch_t; - uint32_t mask = 0x7f; + constexpr uint32_t kMask = 0x7f; - simd_batch masks(mask); + simd_batch masks(kMask); simd_batch words, shifts; simd_batch results; @@ -1042,9 +1043,9 @@ inline static const uint8_t* unpack7_32(const uint8_t* in, uint32_t* out) { inline static const uint8_t* unpack8_32(const uint8_t* in, uint32_t* out) { using simd_batch = xsimd::make_sized_batch_t; - uint32_t mask = 0xff; + constexpr uint32_t kMask = 0xff; - simd_batch masks(mask); + simd_batch masks(kMask); simd_batch words, shifts; simd_batch results; @@ -1103,9 +1104,9 @@ inline static const uint8_t* unpack8_32(const uint8_t* in, uint32_t* out) { inline static const uint8_t* unpack9_32(const uint8_t* in, uint32_t* out) { using simd_batch = xsimd::make_sized_batch_t; - uint32_t mask = 0x1ff; + constexpr uint32_t kMask = 0x1ff; - simd_batch masks(mask); + simd_batch masks(kMask); simd_batch words, shifts; simd_batch results; @@ -1164,9 +1165,9 @@ inline static const uint8_t* unpack9_32(const uint8_t* in, uint32_t* out) { inline static const uint8_t* unpack10_32(const uint8_t* in, uint32_t* out) { using simd_batch = xsimd::make_sized_batch_t; - uint32_t mask = 0x3ff; + constexpr uint32_t kMask = 0x3ff; - simd_batch masks(mask); + simd_batch masks(kMask); simd_batch words, shifts; simd_batch results; @@ -1225,9 +1226,9 @@ inline static const uint8_t* unpack10_32(const uint8_t* in, uint32_t* out) { inline static const uint8_t* unpack11_32(const uint8_t* in, uint32_t* out) { using simd_batch = xsimd::make_sized_batch_t; - uint32_t mask = 0x7ff; + constexpr uint32_t kMask = 0x7ff; - simd_batch masks(mask); + simd_batch masks(kMask); simd_batch words, shifts; simd_batch results; @@ -1286,9 +1287,9 @@ inline static const uint8_t* unpack11_32(const uint8_t* in, uint32_t* out) { inline static const uint8_t* unpack12_32(const uint8_t* in, uint32_t* out) { using simd_batch = xsimd::make_sized_batch_t; - uint32_t mask = 0xfff; + constexpr uint32_t kMask = 0xfff; - simd_batch masks(mask); + simd_batch masks(kMask); simd_batch words, shifts; simd_batch results; @@ -1347,9 +1348,9 @@ inline static const uint8_t* unpack12_32(const uint8_t* in, uint32_t* out) { inline static const uint8_t* unpack13_32(const uint8_t* in, uint32_t* out) { using simd_batch = xsimd::make_sized_batch_t; - uint32_t mask = 0x1fff; + constexpr uint32_t kMask = 0x1fff; - simd_batch masks(mask); + simd_batch masks(kMask); simd_batch words, shifts; simd_batch results; @@ -1408,9 +1409,9 @@ inline static const uint8_t* unpack13_32(const uint8_t* in, uint32_t* out) { inline static const uint8_t* unpack14_32(const uint8_t* in, uint32_t* out) { using simd_batch = xsimd::make_sized_batch_t; - uint32_t mask = 0x3fff; + constexpr uint32_t kMask = 0x3fff; - simd_batch masks(mask); + simd_batch masks(kMask); simd_batch words, shifts; simd_batch results; @@ -1469,9 +1470,9 @@ inline static const uint8_t* unpack14_32(const uint8_t* in, uint32_t* out) { inline static const uint8_t* unpack15_32(const uint8_t* in, uint32_t* out) { using simd_batch = xsimd::make_sized_batch_t; - uint32_t mask = 0x7fff; + constexpr uint32_t kMask = 0x7fff; - simd_batch masks(mask); + simd_batch masks(kMask); simd_batch words, shifts; simd_batch results; @@ -1530,9 +1531,9 @@ inline static const uint8_t* unpack15_32(const uint8_t* in, uint32_t* out) { inline static const uint8_t* unpack16_32(const uint8_t* in, uint32_t* out) { using simd_batch = xsimd::make_sized_batch_t; - uint32_t mask = 0xffff; + constexpr uint32_t kMask = 0xffff; - simd_batch masks(mask); + simd_batch masks(kMask); simd_batch words, shifts; simd_batch results; @@ -1591,9 +1592,9 @@ inline static const uint8_t* unpack16_32(const uint8_t* in, uint32_t* out) { inline static const uint8_t* unpack17_32(const uint8_t* in, uint32_t* out) { using simd_batch = xsimd::make_sized_batch_t; - uint32_t mask = 0x1ffff; + constexpr uint32_t kMask = 0x1ffff; - simd_batch masks(mask); + simd_batch masks(kMask); simd_batch words, shifts; simd_batch results; @@ -1652,9 +1653,9 @@ inline static const uint8_t* unpack17_32(const uint8_t* in, uint32_t* out) { inline static const uint8_t* unpack18_32(const uint8_t* in, uint32_t* out) { using simd_batch = xsimd::make_sized_batch_t; - uint32_t mask = 0x3ffff; + constexpr uint32_t kMask = 0x3ffff; - simd_batch masks(mask); + simd_batch masks(kMask); simd_batch words, shifts; simd_batch results; @@ -1713,9 +1714,9 @@ inline static const uint8_t* unpack18_32(const uint8_t* in, uint32_t* out) { inline static const uint8_t* unpack19_32(const uint8_t* in, uint32_t* out) { using simd_batch = xsimd::make_sized_batch_t; - uint32_t mask = 0x7ffff; + constexpr uint32_t kMask = 0x7ffff; - simd_batch masks(mask); + simd_batch masks(kMask); simd_batch words, shifts; simd_batch results; @@ -1774,9 +1775,9 @@ inline static const uint8_t* unpack19_32(const uint8_t* in, uint32_t* out) { inline static const uint8_t* unpack20_32(const uint8_t* in, uint32_t* out) { using simd_batch = xsimd::make_sized_batch_t; - uint32_t mask = 0xfffff; + constexpr uint32_t kMask = 0xfffff; - simd_batch masks(mask); + simd_batch masks(kMask); simd_batch words, shifts; simd_batch results; @@ -1835,9 +1836,9 @@ inline static const uint8_t* unpack20_32(const uint8_t* in, uint32_t* out) { inline static const uint8_t* unpack21_32(const uint8_t* in, uint32_t* out) { using simd_batch = xsimd::make_sized_batch_t; - uint32_t mask = 0x1fffff; + constexpr uint32_t kMask = 0x1fffff; - simd_batch masks(mask); + simd_batch masks(kMask); simd_batch words, shifts; simd_batch results; @@ -1896,9 +1897,9 @@ inline static const uint8_t* unpack21_32(const uint8_t* in, uint32_t* out) { inline static const uint8_t* unpack22_32(const uint8_t* in, uint32_t* out) { using simd_batch = xsimd::make_sized_batch_t; - uint32_t mask = 0x3fffff; + constexpr uint32_t kMask = 0x3fffff; - simd_batch masks(mask); + simd_batch masks(kMask); simd_batch words, shifts; simd_batch results; @@ -1957,9 +1958,9 @@ inline static const uint8_t* unpack22_32(const uint8_t* in, uint32_t* out) { inline static const uint8_t* unpack23_32(const uint8_t* in, uint32_t* out) { using simd_batch = xsimd::make_sized_batch_t; - uint32_t mask = 0x7fffff; + constexpr uint32_t kMask = 0x7fffff; - simd_batch masks(mask); + simd_batch masks(kMask); simd_batch words, shifts; simd_batch results; @@ -2018,9 +2019,9 @@ inline static const uint8_t* unpack23_32(const uint8_t* in, uint32_t* out) { inline static const uint8_t* unpack24_32(const uint8_t* in, uint32_t* out) { using simd_batch = xsimd::make_sized_batch_t; - uint32_t mask = 0xffffff; + constexpr uint32_t kMask = 0xffffff; - simd_batch masks(mask); + simd_batch masks(kMask); simd_batch words, shifts; simd_batch results; @@ -2079,9 +2080,9 @@ inline static const uint8_t* unpack24_32(const uint8_t* in, uint32_t* out) { inline static const uint8_t* unpack25_32(const uint8_t* in, uint32_t* out) { using simd_batch = xsimd::make_sized_batch_t; - uint32_t mask = 0x1ffffff; + constexpr uint32_t kMask = 0x1ffffff; - simd_batch masks(mask); + simd_batch masks(kMask); simd_batch words, shifts; simd_batch results; @@ -2140,9 +2141,9 @@ inline static const uint8_t* unpack25_32(const uint8_t* in, uint32_t* out) { inline static const uint8_t* unpack26_32(const uint8_t* in, uint32_t* out) { using simd_batch = xsimd::make_sized_batch_t; - uint32_t mask = 0x3ffffff; + constexpr uint32_t kMask = 0x3ffffff; - simd_batch masks(mask); + simd_batch masks(kMask); simd_batch words, shifts; simd_batch results; @@ -2201,9 +2202,9 @@ inline static const uint8_t* unpack26_32(const uint8_t* in, uint32_t* out) { inline static const uint8_t* unpack27_32(const uint8_t* in, uint32_t* out) { using simd_batch = xsimd::make_sized_batch_t; - uint32_t mask = 0x7ffffff; + constexpr uint32_t kMask = 0x7ffffff; - simd_batch masks(mask); + simd_batch masks(kMask); simd_batch words, shifts; simd_batch results; @@ -2262,9 +2263,9 @@ inline static const uint8_t* unpack27_32(const uint8_t* in, uint32_t* out) { inline static const uint8_t* unpack28_32(const uint8_t* in, uint32_t* out) { using simd_batch = xsimd::make_sized_batch_t; - uint32_t mask = 0xfffffff; + constexpr uint32_t kMask = 0xfffffff; - simd_batch masks(mask); + simd_batch masks(kMask); simd_batch words, shifts; simd_batch results; @@ -2323,9 +2324,9 @@ inline static const uint8_t* unpack28_32(const uint8_t* in, uint32_t* out) { inline static const uint8_t* unpack29_32(const uint8_t* in, uint32_t* out) { using simd_batch = xsimd::make_sized_batch_t; - uint32_t mask = 0x1fffffff; + constexpr uint32_t kMask = 0x1fffffff; - simd_batch masks(mask); + simd_batch masks(kMask); simd_batch words, shifts; simd_batch results; @@ -2384,9 +2385,9 @@ inline static const uint8_t* unpack29_32(const uint8_t* in, uint32_t* out) { inline static const uint8_t* unpack30_32(const uint8_t* in, uint32_t* out) { using simd_batch = xsimd::make_sized_batch_t; - uint32_t mask = 0x3fffffff; + constexpr uint32_t kMask = 0x3fffffff; - simd_batch masks(mask); + simd_batch masks(kMask); simd_batch words, shifts; simd_batch results; @@ -2445,9 +2446,9 @@ inline static const uint8_t* unpack30_32(const uint8_t* in, uint32_t* out) { inline static const uint8_t* unpack31_32(const uint8_t* in, uint32_t* out) { using simd_batch = xsimd::make_sized_batch_t; - uint32_t mask = 0x7fffffff; + constexpr uint32_t kMask = 0x7fffffff; - simd_batch masks(mask); + simd_batch masks(kMask); simd_batch words, shifts; simd_batch results; @@ -2507,12 +2508,10 @@ inline static const uint8_t* unpack32_32(const uint8_t* in, uint32_t* out) { std::memcpy(out, in, 32 * sizeof(*out)); in += 4 * 32; out += 32; - return in; } +}; // struct Unpacker -}; // struct UnpackBits512 -} // namespace } // namespace arrow::internal From cbb57ad5ce962c3d432c8511badefbc7e188eb0c Mon Sep 17 00:00:00 2001 From: AntoinePrv Date: Thu, 18 Sep 2025 16:21:55 +0200 Subject: [PATCH 14/76] Use templated method in SimdUnpacker --- cpp/src/arrow/util/bpacking_simd_codegen.py | 37 ++++-- cpp/src/arrow/util/bpacking_simd_internal.h | 120 ++++++++------------ 2 files changed, 74 insertions(+), 83 deletions(-) diff --git a/cpp/src/arrow/util/bpacking_simd_codegen.py b/cpp/src/arrow/util/bpacking_simd_codegen.py index d81be5187bb..48c9fe8d22d 100755 --- a/cpp/src/arrow/util/bpacking_simd_codegen.py +++ b/cpp/src/arrow/util/bpacking_simd_codegen.py @@ -90,21 +90,38 @@ def __post_init__(self): if self.simd_bit_width % self.out_bit_width != 0: raise ("SIMD bit width should be a multiple of output width") - def unpack_signature(self, bit: int) -> str: - return ( - f"inline static const uint8_t* unpack{bit}_{self.out_bit_width}" - f"(const uint8_t* in, {self.out_type}* out) {{" + def print_unpack_signature(self, bit: int | None) -> str: + if bit is None: + print("template") + static = "static " + specialized = "" + end = ";" + else: + print("template<>") + static = "" + specialized = f"<{bit}>" + end = " {" + + print( + f"{static}const uint8_t* unpack{specialized}" + f"(const uint8_t* in, {self.out_type}* out){end}" ) + def print_struct_header(self): + print("template<>") + print(f"struct Simd{self.simd_bit_width}Unpacker<{self.out_type}> {{") + print() + self.print_unpack_signature(None) + def print_unpack_bit0_func(self): - print(self.unpack_signature(0)) + self.print_unpack_signature(0) print(f" std::memset(out, 0x0, {self.out_bit_width} * sizeof(*out));") print(f" out += {self.out_bit_width};") print(" return in;") print("}") def print_unpack_bitmax_func(self): - print(self.unpack_signature(self.out_bit_width)) + self.print_unpack_signature(self.out_bit_width) print(f" std::memcpy(out, in, {self.out_bit_width} * sizeof(*out));") print(f" in += {self.out_byte_width} * {self.out_bit_width};") print(f" out += {self.out_bit_width};") @@ -112,7 +129,7 @@ def print_unpack_bitmax_func(self): print("}") def print_unpack_bit_func(self, bit: int): - print(self.unpack_signature(bit)) + self.print_unpack_signature(bit) def p(code, level=1): print(indent(code, prefix=" " * level)) @@ -194,8 +211,8 @@ def static_cast_as_needed(str): print("}") def print_all(self): - print("template<>") - print(f"struct Simd{self.simd_bit_width}Unpacker<{self.out_type}> {{") + self.print_struct_header() + print() self.print_unpack_bit0_func() print() @@ -207,7 +224,6 @@ def print_all(self): print("}; // struct Unpacker") - def print_note(): print("// WARNING: this file is generated, DO NOT EDIT.") print("// Usage:") @@ -221,6 +237,7 @@ def main(simd_width, outputs): print("template") print(f"struct Simd{simd_width}Unpacker;") + print() for out_width in outputs: gen = UnpackGenerator(out_width, simd_width) diff --git a/cpp/src/arrow/util/bpacking_simd_internal.h b/cpp/src/arrow/util/bpacking_simd_internal.h index 8a3dc0d8af7..2613140aa6e 100644 --- a/cpp/src/arrow/util/bpacking_simd_internal.h +++ b/cpp/src/arrow/util/bpacking_simd_internal.h @@ -18,115 +18,90 @@ #include "arrow/util/dispatch_internal.h" #include "arrow/util/logging.h" -namespace arrow { -namespace internal { +namespace arrow::internal { -template -static int unpack32_specialized(const uint8_t* in, uint32_t* out, int batch_size, - int num_bits) { +template +int unpack(const uint8_t* in, uint32_t* out, int batch_size, int num_bits) { batch_size = batch_size / 32 * 32; int num_loops = batch_size / 32; + for (int i = 0; i < num_loops; ++i) { + in = Unpacker::template unpack(in, out + i * 32); + } + + return batch_size; +} + +template +static int unpack32_specialized(const uint8_t* in, uint32_t* out, int batch_size, + int num_bits) { switch (num_bits) { case 0: - for (int i = 0; i < num_loops; ++i) in = UnpackBits::unpack0_32(in, out + i * 32); - break; + return unpack<0, Unpacker>(in, out, batch_size, num_bits); case 1: - for (int i = 0; i < num_loops; ++i) in = UnpackBits::unpack1_32(in, out + i * 32); - break; + return unpack<1, Unpacker>(in, out, batch_size, num_bits); case 2: - for (int i = 0; i < num_loops; ++i) in = UnpackBits::unpack2_32(in, out + i * 32); - break; + return unpack<2, Unpacker>(in, out, batch_size, num_bits); case 3: - for (int i = 0; i < num_loops; ++i) in = UnpackBits::unpack3_32(in, out + i * 32); - break; + return unpack<3, Unpacker>(in, out, batch_size, num_bits); case 4: - for (int i = 0; i < num_loops; ++i) in = UnpackBits::unpack4_32(in, out + i * 32); - break; + return unpack<4, Unpacker>(in, out, batch_size, num_bits); case 5: - for (int i = 0; i < num_loops; ++i) in = UnpackBits::unpack5_32(in, out + i * 32); - break; + return unpack<5, Unpacker>(in, out, batch_size, num_bits); case 6: - for (int i = 0; i < num_loops; ++i) in = UnpackBits::unpack6_32(in, out + i * 32); - break; + return unpack<6, Unpacker>(in, out, batch_size, num_bits); case 7: - for (int i = 0; i < num_loops; ++i) in = UnpackBits::unpack7_32(in, out + i * 32); - break; + return unpack<7, Unpacker>(in, out, batch_size, num_bits); case 8: - for (int i = 0; i < num_loops; ++i) in = UnpackBits::unpack8_32(in, out + i * 32); - break; + return unpack<8, Unpacker>(in, out, batch_size, num_bits); case 9: - for (int i = 0; i < num_loops; ++i) in = UnpackBits::unpack9_32(in, out + i * 32); - break; + return unpack<9, Unpacker>(in, out, batch_size, num_bits); case 10: - for (int i = 0; i < num_loops; ++i) in = UnpackBits::unpack10_32(in, out + i * 32); - break; + return unpack<10, Unpacker>(in, out, batch_size, num_bits); case 11: - for (int i = 0; i < num_loops; ++i) in = UnpackBits::unpack11_32(in, out + i * 32); - break; + return unpack<11, Unpacker>(in, out, batch_size, num_bits); case 12: - for (int i = 0; i < num_loops; ++i) in = UnpackBits::unpack12_32(in, out + i * 32); - break; + return unpack<12, Unpacker>(in, out, batch_size, num_bits); case 13: - for (int i = 0; i < num_loops; ++i) in = UnpackBits::unpack13_32(in, out + i * 32); - break; + return unpack<13, Unpacker>(in, out, batch_size, num_bits); case 14: - for (int i = 0; i < num_loops; ++i) in = UnpackBits::unpack14_32(in, out + i * 32); - break; + return unpack<14, Unpacker>(in, out, batch_size, num_bits); case 15: - for (int i = 0; i < num_loops; ++i) in = UnpackBits::unpack15_32(in, out + i * 32); - break; + return unpack<15, Unpacker>(in, out, batch_size, num_bits); case 16: - for (int i = 0; i < num_loops; ++i) in = UnpackBits::unpack16_32(in, out + i * 32); - break; + return unpack<16, Unpacker>(in, out, batch_size, num_bits); case 17: - for (int i = 0; i < num_loops; ++i) in = UnpackBits::unpack17_32(in, out + i * 32); - break; + return unpack<17, Unpacker>(in, out, batch_size, num_bits); case 18: - for (int i = 0; i < num_loops; ++i) in = UnpackBits::unpack18_32(in, out + i * 32); - break; + return unpack<18, Unpacker>(in, out, batch_size, num_bits); case 19: - for (int i = 0; i < num_loops; ++i) in = UnpackBits::unpack19_32(in, out + i * 32); - break; + return unpack<19, Unpacker>(in, out, batch_size, num_bits); case 20: - for (int i = 0; i < num_loops; ++i) in = UnpackBits::unpack20_32(in, out + i * 32); - break; + return unpack<20, Unpacker>(in, out, batch_size, num_bits); case 21: - for (int i = 0; i < num_loops; ++i) in = UnpackBits::unpack21_32(in, out + i * 32); - break; + return unpack<21, Unpacker>(in, out, batch_size, num_bits); case 22: - for (int i = 0; i < num_loops; ++i) in = UnpackBits::unpack22_32(in, out + i * 32); - break; + return unpack<22, Unpacker>(in, out, batch_size, num_bits); case 23: - for (int i = 0; i < num_loops; ++i) in = UnpackBits::unpack23_32(in, out + i * 32); - break; + return unpack<23, Unpacker>(in, out, batch_size, num_bits); case 24: - for (int i = 0; i < num_loops; ++i) in = UnpackBits::unpack24_32(in, out + i * 32); - break; + return unpack<24, Unpacker>(in, out, batch_size, num_bits); case 25: - for (int i = 0; i < num_loops; ++i) in = UnpackBits::unpack25_32(in, out + i * 32); - break; + return unpack<25, Unpacker>(in, out, batch_size, num_bits); case 26: - for (int i = 0; i < num_loops; ++i) in = UnpackBits::unpack26_32(in, out + i * 32); - break; + return unpack<26, Unpacker>(in, out, batch_size, num_bits); case 27: - for (int i = 0; i < num_loops; ++i) in = UnpackBits::unpack27_32(in, out + i * 32); - break; + return unpack<27, Unpacker>(in, out, batch_size, num_bits); case 28: - for (int i = 0; i < num_loops; ++i) in = UnpackBits::unpack28_32(in, out + i * 32); - break; + return unpack<28, Unpacker>(in, out, batch_size, num_bits); case 29: - for (int i = 0; i < num_loops; ++i) in = UnpackBits::unpack29_32(in, out + i * 32); - break; + return unpack<29, Unpacker>(in, out, batch_size, num_bits); case 30: - for (int i = 0; i < num_loops; ++i) in = UnpackBits::unpack30_32(in, out + i * 32); - break; + return unpack<30, Unpacker>(in, out, batch_size, num_bits); case 31: - for (int i = 0; i < num_loops; ++i) in = UnpackBits::unpack31_32(in, out + i * 32); - break; + return unpack<31, Unpacker>(in, out, batch_size, num_bits); case 32: - for (int i = 0; i < num_loops; ++i) in = UnpackBits::unpack32_32(in, out + i * 32); - break; + return unpack<32, Unpacker>(in, out, batch_size, num_bits); default: ARROW_DCHECK(false) << "Unsupported num_bits"; } @@ -134,5 +109,4 @@ static int unpack32_specialized(const uint8_t* in, uint32_t* out, int batch_size return batch_size; } -} // namespace internal -} // namespace arrow +} // namespace arrow::internal From 23ed46ffc6ebd5b9002ea8c403686aadbbe406da Mon Sep 17 00:00:00 2001 From: AntoinePrv Date: Thu, 18 Sep 2025 16:22:21 +0200 Subject: [PATCH 15/76] Gen: regenerate simd files --- .../bpacking_simd128_generated_internal.h | 159 ++++++++++++------ .../bpacking_simd256_generated_internal.h | 159 ++++++++++++------ .../bpacking_simd512_generated_internal.h | 159 ++++++++++++------ 3 files changed, 327 insertions(+), 150 deletions(-) diff --git a/cpp/src/arrow/util/bpacking_simd128_generated_internal.h b/cpp/src/arrow/util/bpacking_simd128_generated_internal.h index d2a012fc521..fc5e151eb94 100644 --- a/cpp/src/arrow/util/bpacking_simd128_generated_internal.h +++ b/cpp/src/arrow/util/bpacking_simd128_generated_internal.h @@ -34,15 +34,22 @@ using ::arrow::util::SafeLoadAs; template struct Simd128Unpacker; + template<> struct Simd128Unpacker { -inline static const uint8_t* unpack0_16(const uint8_t* in, uint16_t* out) { + +template +static const uint8_t* unpack(const uint8_t* in, uint16_t* out); + +template<> +const uint8_t* unpack<0>(const uint8_t* in, uint16_t* out) { std::memset(out, 0x0, 16 * sizeof(*out)); out += 16; return in; } -inline static const uint8_t* unpack1_16(const uint8_t* in, uint16_t* out) { +template<> +const uint8_t* unpack<1>(const uint8_t* in, uint16_t* out) { using simd_batch = xsimd::make_sized_batch_t; constexpr uint16_t kMask = 0x1; @@ -87,7 +94,8 @@ inline static const uint8_t* unpack1_16(const uint8_t* in, uint16_t* out) { return in; } -inline static const uint8_t* unpack2_16(const uint8_t* in, uint16_t* out) { +template<> +const uint8_t* unpack<2>(const uint8_t* in, uint16_t* out) { using simd_batch = xsimd::make_sized_batch_t; constexpr uint16_t kMask = 0x3; @@ -132,7 +140,8 @@ inline static const uint8_t* unpack2_16(const uint8_t* in, uint16_t* out) { return in; } -inline static const uint8_t* unpack3_16(const uint8_t* in, uint16_t* out) { +template<> +const uint8_t* unpack<3>(const uint8_t* in, uint16_t* out) { using simd_batch = xsimd::make_sized_batch_t; constexpr uint16_t kMask = 0x7; @@ -177,7 +186,8 @@ inline static const uint8_t* unpack3_16(const uint8_t* in, uint16_t* out) { return in; } -inline static const uint8_t* unpack4_16(const uint8_t* in, uint16_t* out) { +template<> +const uint8_t* unpack<4>(const uint8_t* in, uint16_t* out) { using simd_batch = xsimd::make_sized_batch_t; constexpr uint16_t kMask = 0xf; @@ -222,7 +232,8 @@ inline static const uint8_t* unpack4_16(const uint8_t* in, uint16_t* out) { return in; } -inline static const uint8_t* unpack5_16(const uint8_t* in, uint16_t* out) { +template<> +const uint8_t* unpack<5>(const uint8_t* in, uint16_t* out) { using simd_batch = xsimd::make_sized_batch_t; constexpr uint16_t kMask = 0x1f; @@ -267,7 +278,8 @@ inline static const uint8_t* unpack5_16(const uint8_t* in, uint16_t* out) { return in; } -inline static const uint8_t* unpack6_16(const uint8_t* in, uint16_t* out) { +template<> +const uint8_t* unpack<6>(const uint8_t* in, uint16_t* out) { using simd_batch = xsimd::make_sized_batch_t; constexpr uint16_t kMask = 0x3f; @@ -312,7 +324,8 @@ inline static const uint8_t* unpack6_16(const uint8_t* in, uint16_t* out) { return in; } -inline static const uint8_t* unpack7_16(const uint8_t* in, uint16_t* out) { +template<> +const uint8_t* unpack<7>(const uint8_t* in, uint16_t* out) { using simd_batch = xsimd::make_sized_batch_t; constexpr uint16_t kMask = 0x7f; @@ -357,7 +370,8 @@ inline static const uint8_t* unpack7_16(const uint8_t* in, uint16_t* out) { return in; } -inline static const uint8_t* unpack8_16(const uint8_t* in, uint16_t* out) { +template<> +const uint8_t* unpack<8>(const uint8_t* in, uint16_t* out) { using simd_batch = xsimd::make_sized_batch_t; constexpr uint16_t kMask = 0xff; @@ -402,7 +416,8 @@ inline static const uint8_t* unpack8_16(const uint8_t* in, uint16_t* out) { return in; } -inline static const uint8_t* unpack9_16(const uint8_t* in, uint16_t* out) { +template<> +const uint8_t* unpack<9>(const uint8_t* in, uint16_t* out) { using simd_batch = xsimd::make_sized_batch_t; constexpr uint16_t kMask = 0x1ff; @@ -447,7 +462,8 @@ inline static const uint8_t* unpack9_16(const uint8_t* in, uint16_t* out) { return in; } -inline static const uint8_t* unpack10_16(const uint8_t* in, uint16_t* out) { +template<> +const uint8_t* unpack<10>(const uint8_t* in, uint16_t* out) { using simd_batch = xsimd::make_sized_batch_t; constexpr uint16_t kMask = 0x3ff; @@ -492,7 +508,8 @@ inline static const uint8_t* unpack10_16(const uint8_t* in, uint16_t* out) { return in; } -inline static const uint8_t* unpack11_16(const uint8_t* in, uint16_t* out) { +template<> +const uint8_t* unpack<11>(const uint8_t* in, uint16_t* out) { using simd_batch = xsimd::make_sized_batch_t; constexpr uint16_t kMask = 0x7ff; @@ -537,7 +554,8 @@ inline static const uint8_t* unpack11_16(const uint8_t* in, uint16_t* out) { return in; } -inline static const uint8_t* unpack12_16(const uint8_t* in, uint16_t* out) { +template<> +const uint8_t* unpack<12>(const uint8_t* in, uint16_t* out) { using simd_batch = xsimd::make_sized_batch_t; constexpr uint16_t kMask = 0xfff; @@ -582,7 +600,8 @@ inline static const uint8_t* unpack12_16(const uint8_t* in, uint16_t* out) { return in; } -inline static const uint8_t* unpack13_16(const uint8_t* in, uint16_t* out) { +template<> +const uint8_t* unpack<13>(const uint8_t* in, uint16_t* out) { using simd_batch = xsimd::make_sized_batch_t; constexpr uint16_t kMask = 0x1fff; @@ -627,7 +646,8 @@ inline static const uint8_t* unpack13_16(const uint8_t* in, uint16_t* out) { return in; } -inline static const uint8_t* unpack14_16(const uint8_t* in, uint16_t* out) { +template<> +const uint8_t* unpack<14>(const uint8_t* in, uint16_t* out) { using simd_batch = xsimd::make_sized_batch_t; constexpr uint16_t kMask = 0x3fff; @@ -672,7 +692,8 @@ inline static const uint8_t* unpack14_16(const uint8_t* in, uint16_t* out) { return in; } -inline static const uint8_t* unpack15_16(const uint8_t* in, uint16_t* out) { +template<> +const uint8_t* unpack<15>(const uint8_t* in, uint16_t* out) { using simd_batch = xsimd::make_sized_batch_t; constexpr uint16_t kMask = 0x7fff; @@ -717,7 +738,8 @@ inline static const uint8_t* unpack15_16(const uint8_t* in, uint16_t* out) { return in; } -inline static const uint8_t* unpack16_16(const uint8_t* in, uint16_t* out) { +template<> +const uint8_t* unpack<16>(const uint8_t* in, uint16_t* out) { std::memcpy(out, in, 16 * sizeof(*out)); in += 2 * 16; out += 16; @@ -727,13 +749,19 @@ inline static const uint8_t* unpack16_16(const uint8_t* in, uint16_t* out) { template<> struct Simd128Unpacker { -inline static const uint8_t* unpack0_32(const uint8_t* in, uint32_t* out) { + +template +static const uint8_t* unpack(const uint8_t* in, uint32_t* out); + +template<> +const uint8_t* unpack<0>(const uint8_t* in, uint32_t* out) { std::memset(out, 0x0, 32 * sizeof(*out)); out += 32; return in; } -inline static const uint8_t* unpack1_32(const uint8_t* in, uint32_t* out) { +template<> +const uint8_t* unpack<1>(const uint8_t* in, uint32_t* out) { using simd_batch = xsimd::make_sized_batch_t; constexpr uint32_t kMask = 0x1; @@ -842,7 +870,8 @@ inline static const uint8_t* unpack1_32(const uint8_t* in, uint32_t* out) { return in; } -inline static const uint8_t* unpack2_32(const uint8_t* in, uint32_t* out) { +template<> +const uint8_t* unpack<2>(const uint8_t* in, uint32_t* out) { using simd_batch = xsimd::make_sized_batch_t; constexpr uint32_t kMask = 0x3; @@ -951,7 +980,8 @@ inline static const uint8_t* unpack2_32(const uint8_t* in, uint32_t* out) { return in; } -inline static const uint8_t* unpack3_32(const uint8_t* in, uint32_t* out) { +template<> +const uint8_t* unpack<3>(const uint8_t* in, uint32_t* out) { using simd_batch = xsimd::make_sized_batch_t; constexpr uint32_t kMask = 0x7; @@ -1060,7 +1090,8 @@ inline static const uint8_t* unpack3_32(const uint8_t* in, uint32_t* out) { return in; } -inline static const uint8_t* unpack4_32(const uint8_t* in, uint32_t* out) { +template<> +const uint8_t* unpack<4>(const uint8_t* in, uint32_t* out) { using simd_batch = xsimd::make_sized_batch_t; constexpr uint32_t kMask = 0xf; @@ -1169,7 +1200,8 @@ inline static const uint8_t* unpack4_32(const uint8_t* in, uint32_t* out) { return in; } -inline static const uint8_t* unpack5_32(const uint8_t* in, uint32_t* out) { +template<> +const uint8_t* unpack<5>(const uint8_t* in, uint32_t* out) { using simd_batch = xsimd::make_sized_batch_t; constexpr uint32_t kMask = 0x1f; @@ -1278,7 +1310,8 @@ inline static const uint8_t* unpack5_32(const uint8_t* in, uint32_t* out) { return in; } -inline static const uint8_t* unpack6_32(const uint8_t* in, uint32_t* out) { +template<> +const uint8_t* unpack<6>(const uint8_t* in, uint32_t* out) { using simd_batch = xsimd::make_sized_batch_t; constexpr uint32_t kMask = 0x3f; @@ -1387,7 +1420,8 @@ inline static const uint8_t* unpack6_32(const uint8_t* in, uint32_t* out) { return in; } -inline static const uint8_t* unpack7_32(const uint8_t* in, uint32_t* out) { +template<> +const uint8_t* unpack<7>(const uint8_t* in, uint32_t* out) { using simd_batch = xsimd::make_sized_batch_t; constexpr uint32_t kMask = 0x7f; @@ -1496,7 +1530,8 @@ inline static const uint8_t* unpack7_32(const uint8_t* in, uint32_t* out) { return in; } -inline static const uint8_t* unpack8_32(const uint8_t* in, uint32_t* out) { +template<> +const uint8_t* unpack<8>(const uint8_t* in, uint32_t* out) { using simd_batch = xsimd::make_sized_batch_t; constexpr uint32_t kMask = 0xff; @@ -1605,7 +1640,8 @@ inline static const uint8_t* unpack8_32(const uint8_t* in, uint32_t* out) { return in; } -inline static const uint8_t* unpack9_32(const uint8_t* in, uint32_t* out) { +template<> +const uint8_t* unpack<9>(const uint8_t* in, uint32_t* out) { using simd_batch = xsimd::make_sized_batch_t; constexpr uint32_t kMask = 0x1ff; @@ -1714,7 +1750,8 @@ inline static const uint8_t* unpack9_32(const uint8_t* in, uint32_t* out) { return in; } -inline static const uint8_t* unpack10_32(const uint8_t* in, uint32_t* out) { +template<> +const uint8_t* unpack<10>(const uint8_t* in, uint32_t* out) { using simd_batch = xsimd::make_sized_batch_t; constexpr uint32_t kMask = 0x3ff; @@ -1823,7 +1860,8 @@ inline static const uint8_t* unpack10_32(const uint8_t* in, uint32_t* out) { return in; } -inline static const uint8_t* unpack11_32(const uint8_t* in, uint32_t* out) { +template<> +const uint8_t* unpack<11>(const uint8_t* in, uint32_t* out) { using simd_batch = xsimd::make_sized_batch_t; constexpr uint32_t kMask = 0x7ff; @@ -1932,7 +1970,8 @@ inline static const uint8_t* unpack11_32(const uint8_t* in, uint32_t* out) { return in; } -inline static const uint8_t* unpack12_32(const uint8_t* in, uint32_t* out) { +template<> +const uint8_t* unpack<12>(const uint8_t* in, uint32_t* out) { using simd_batch = xsimd::make_sized_batch_t; constexpr uint32_t kMask = 0xfff; @@ -2041,7 +2080,8 @@ inline static const uint8_t* unpack12_32(const uint8_t* in, uint32_t* out) { return in; } -inline static const uint8_t* unpack13_32(const uint8_t* in, uint32_t* out) { +template<> +const uint8_t* unpack<13>(const uint8_t* in, uint32_t* out) { using simd_batch = xsimd::make_sized_batch_t; constexpr uint32_t kMask = 0x1fff; @@ -2150,7 +2190,8 @@ inline static const uint8_t* unpack13_32(const uint8_t* in, uint32_t* out) { return in; } -inline static const uint8_t* unpack14_32(const uint8_t* in, uint32_t* out) { +template<> +const uint8_t* unpack<14>(const uint8_t* in, uint32_t* out) { using simd_batch = xsimd::make_sized_batch_t; constexpr uint32_t kMask = 0x3fff; @@ -2259,7 +2300,8 @@ inline static const uint8_t* unpack14_32(const uint8_t* in, uint32_t* out) { return in; } -inline static const uint8_t* unpack15_32(const uint8_t* in, uint32_t* out) { +template<> +const uint8_t* unpack<15>(const uint8_t* in, uint32_t* out) { using simd_batch = xsimd::make_sized_batch_t; constexpr uint32_t kMask = 0x7fff; @@ -2368,7 +2410,8 @@ inline static const uint8_t* unpack15_32(const uint8_t* in, uint32_t* out) { return in; } -inline static const uint8_t* unpack16_32(const uint8_t* in, uint32_t* out) { +template<> +const uint8_t* unpack<16>(const uint8_t* in, uint32_t* out) { using simd_batch = xsimd::make_sized_batch_t; constexpr uint32_t kMask = 0xffff; @@ -2477,7 +2520,8 @@ inline static const uint8_t* unpack16_32(const uint8_t* in, uint32_t* out) { return in; } -inline static const uint8_t* unpack17_32(const uint8_t* in, uint32_t* out) { +template<> +const uint8_t* unpack<17>(const uint8_t* in, uint32_t* out) { using simd_batch = xsimd::make_sized_batch_t; constexpr uint32_t kMask = 0x1ffff; @@ -2586,7 +2630,8 @@ inline static const uint8_t* unpack17_32(const uint8_t* in, uint32_t* out) { return in; } -inline static const uint8_t* unpack18_32(const uint8_t* in, uint32_t* out) { +template<> +const uint8_t* unpack<18>(const uint8_t* in, uint32_t* out) { using simd_batch = xsimd::make_sized_batch_t; constexpr uint32_t kMask = 0x3ffff; @@ -2695,7 +2740,8 @@ inline static const uint8_t* unpack18_32(const uint8_t* in, uint32_t* out) { return in; } -inline static const uint8_t* unpack19_32(const uint8_t* in, uint32_t* out) { +template<> +const uint8_t* unpack<19>(const uint8_t* in, uint32_t* out) { using simd_batch = xsimd::make_sized_batch_t; constexpr uint32_t kMask = 0x7ffff; @@ -2804,7 +2850,8 @@ inline static const uint8_t* unpack19_32(const uint8_t* in, uint32_t* out) { return in; } -inline static const uint8_t* unpack20_32(const uint8_t* in, uint32_t* out) { +template<> +const uint8_t* unpack<20>(const uint8_t* in, uint32_t* out) { using simd_batch = xsimd::make_sized_batch_t; constexpr uint32_t kMask = 0xfffff; @@ -2913,7 +2960,8 @@ inline static const uint8_t* unpack20_32(const uint8_t* in, uint32_t* out) { return in; } -inline static const uint8_t* unpack21_32(const uint8_t* in, uint32_t* out) { +template<> +const uint8_t* unpack<21>(const uint8_t* in, uint32_t* out) { using simd_batch = xsimd::make_sized_batch_t; constexpr uint32_t kMask = 0x1fffff; @@ -3022,7 +3070,8 @@ inline static const uint8_t* unpack21_32(const uint8_t* in, uint32_t* out) { return in; } -inline static const uint8_t* unpack22_32(const uint8_t* in, uint32_t* out) { +template<> +const uint8_t* unpack<22>(const uint8_t* in, uint32_t* out) { using simd_batch = xsimd::make_sized_batch_t; constexpr uint32_t kMask = 0x3fffff; @@ -3131,7 +3180,8 @@ inline static const uint8_t* unpack22_32(const uint8_t* in, uint32_t* out) { return in; } -inline static const uint8_t* unpack23_32(const uint8_t* in, uint32_t* out) { +template<> +const uint8_t* unpack<23>(const uint8_t* in, uint32_t* out) { using simd_batch = xsimd::make_sized_batch_t; constexpr uint32_t kMask = 0x7fffff; @@ -3240,7 +3290,8 @@ inline static const uint8_t* unpack23_32(const uint8_t* in, uint32_t* out) { return in; } -inline static const uint8_t* unpack24_32(const uint8_t* in, uint32_t* out) { +template<> +const uint8_t* unpack<24>(const uint8_t* in, uint32_t* out) { using simd_batch = xsimd::make_sized_batch_t; constexpr uint32_t kMask = 0xffffff; @@ -3349,7 +3400,8 @@ inline static const uint8_t* unpack24_32(const uint8_t* in, uint32_t* out) { return in; } -inline static const uint8_t* unpack25_32(const uint8_t* in, uint32_t* out) { +template<> +const uint8_t* unpack<25>(const uint8_t* in, uint32_t* out) { using simd_batch = xsimd::make_sized_batch_t; constexpr uint32_t kMask = 0x1ffffff; @@ -3458,7 +3510,8 @@ inline static const uint8_t* unpack25_32(const uint8_t* in, uint32_t* out) { return in; } -inline static const uint8_t* unpack26_32(const uint8_t* in, uint32_t* out) { +template<> +const uint8_t* unpack<26>(const uint8_t* in, uint32_t* out) { using simd_batch = xsimd::make_sized_batch_t; constexpr uint32_t kMask = 0x3ffffff; @@ -3567,7 +3620,8 @@ inline static const uint8_t* unpack26_32(const uint8_t* in, uint32_t* out) { return in; } -inline static const uint8_t* unpack27_32(const uint8_t* in, uint32_t* out) { +template<> +const uint8_t* unpack<27>(const uint8_t* in, uint32_t* out) { using simd_batch = xsimd::make_sized_batch_t; constexpr uint32_t kMask = 0x7ffffff; @@ -3676,7 +3730,8 @@ inline static const uint8_t* unpack27_32(const uint8_t* in, uint32_t* out) { return in; } -inline static const uint8_t* unpack28_32(const uint8_t* in, uint32_t* out) { +template<> +const uint8_t* unpack<28>(const uint8_t* in, uint32_t* out) { using simd_batch = xsimd::make_sized_batch_t; constexpr uint32_t kMask = 0xfffffff; @@ -3785,7 +3840,8 @@ inline static const uint8_t* unpack28_32(const uint8_t* in, uint32_t* out) { return in; } -inline static const uint8_t* unpack29_32(const uint8_t* in, uint32_t* out) { +template<> +const uint8_t* unpack<29>(const uint8_t* in, uint32_t* out) { using simd_batch = xsimd::make_sized_batch_t; constexpr uint32_t kMask = 0x1fffffff; @@ -3894,7 +3950,8 @@ inline static const uint8_t* unpack29_32(const uint8_t* in, uint32_t* out) { return in; } -inline static const uint8_t* unpack30_32(const uint8_t* in, uint32_t* out) { +template<> +const uint8_t* unpack<30>(const uint8_t* in, uint32_t* out) { using simd_batch = xsimd::make_sized_batch_t; constexpr uint32_t kMask = 0x3fffffff; @@ -4003,7 +4060,8 @@ inline static const uint8_t* unpack30_32(const uint8_t* in, uint32_t* out) { return in; } -inline static const uint8_t* unpack31_32(const uint8_t* in, uint32_t* out) { +template<> +const uint8_t* unpack<31>(const uint8_t* in, uint32_t* out) { using simd_batch = xsimd::make_sized_batch_t; constexpr uint32_t kMask = 0x7fffffff; @@ -4112,7 +4170,8 @@ inline static const uint8_t* unpack31_32(const uint8_t* in, uint32_t* out) { return in; } -inline static const uint8_t* unpack32_32(const uint8_t* in, uint32_t* out) { +template<> +const uint8_t* unpack<32>(const uint8_t* in, uint32_t* out) { std::memcpy(out, in, 32 * sizeof(*out)); in += 4 * 32; out += 32; diff --git a/cpp/src/arrow/util/bpacking_simd256_generated_internal.h b/cpp/src/arrow/util/bpacking_simd256_generated_internal.h index 8b791f08757..069c44548b4 100644 --- a/cpp/src/arrow/util/bpacking_simd256_generated_internal.h +++ b/cpp/src/arrow/util/bpacking_simd256_generated_internal.h @@ -34,15 +34,22 @@ using ::arrow::util::SafeLoadAs; template struct Simd256Unpacker; + template<> struct Simd256Unpacker { -inline static const uint8_t* unpack0_16(const uint8_t* in, uint16_t* out) { + +template +static const uint8_t* unpack(const uint8_t* in, uint16_t* out); + +template<> +const uint8_t* unpack<0>(const uint8_t* in, uint16_t* out) { std::memset(out, 0x0, 16 * sizeof(*out)); out += 16; return in; } -inline static const uint8_t* unpack1_16(const uint8_t* in, uint16_t* out) { +template<> +const uint8_t* unpack<1>(const uint8_t* in, uint16_t* out) { using simd_batch = xsimd::make_sized_batch_t; constexpr uint16_t kMask = 0x1; @@ -79,7 +86,8 @@ inline static const uint8_t* unpack1_16(const uint8_t* in, uint16_t* out) { return in; } -inline static const uint8_t* unpack2_16(const uint8_t* in, uint16_t* out) { +template<> +const uint8_t* unpack<2>(const uint8_t* in, uint16_t* out) { using simd_batch = xsimd::make_sized_batch_t; constexpr uint16_t kMask = 0x3; @@ -116,7 +124,8 @@ inline static const uint8_t* unpack2_16(const uint8_t* in, uint16_t* out) { return in; } -inline static const uint8_t* unpack3_16(const uint8_t* in, uint16_t* out) { +template<> +const uint8_t* unpack<3>(const uint8_t* in, uint16_t* out) { using simd_batch = xsimd::make_sized_batch_t; constexpr uint16_t kMask = 0x7; @@ -153,7 +162,8 @@ inline static const uint8_t* unpack3_16(const uint8_t* in, uint16_t* out) { return in; } -inline static const uint8_t* unpack4_16(const uint8_t* in, uint16_t* out) { +template<> +const uint8_t* unpack<4>(const uint8_t* in, uint16_t* out) { using simd_batch = xsimd::make_sized_batch_t; constexpr uint16_t kMask = 0xf; @@ -190,7 +200,8 @@ inline static const uint8_t* unpack4_16(const uint8_t* in, uint16_t* out) { return in; } -inline static const uint8_t* unpack5_16(const uint8_t* in, uint16_t* out) { +template<> +const uint8_t* unpack<5>(const uint8_t* in, uint16_t* out) { using simd_batch = xsimd::make_sized_batch_t; constexpr uint16_t kMask = 0x1f; @@ -227,7 +238,8 @@ inline static const uint8_t* unpack5_16(const uint8_t* in, uint16_t* out) { return in; } -inline static const uint8_t* unpack6_16(const uint8_t* in, uint16_t* out) { +template<> +const uint8_t* unpack<6>(const uint8_t* in, uint16_t* out) { using simd_batch = xsimd::make_sized_batch_t; constexpr uint16_t kMask = 0x3f; @@ -264,7 +276,8 @@ inline static const uint8_t* unpack6_16(const uint8_t* in, uint16_t* out) { return in; } -inline static const uint8_t* unpack7_16(const uint8_t* in, uint16_t* out) { +template<> +const uint8_t* unpack<7>(const uint8_t* in, uint16_t* out) { using simd_batch = xsimd::make_sized_batch_t; constexpr uint16_t kMask = 0x7f; @@ -301,7 +314,8 @@ inline static const uint8_t* unpack7_16(const uint8_t* in, uint16_t* out) { return in; } -inline static const uint8_t* unpack8_16(const uint8_t* in, uint16_t* out) { +template<> +const uint8_t* unpack<8>(const uint8_t* in, uint16_t* out) { using simd_batch = xsimd::make_sized_batch_t; constexpr uint16_t kMask = 0xff; @@ -338,7 +352,8 @@ inline static const uint8_t* unpack8_16(const uint8_t* in, uint16_t* out) { return in; } -inline static const uint8_t* unpack9_16(const uint8_t* in, uint16_t* out) { +template<> +const uint8_t* unpack<9>(const uint8_t* in, uint16_t* out) { using simd_batch = xsimd::make_sized_batch_t; constexpr uint16_t kMask = 0x1ff; @@ -375,7 +390,8 @@ inline static const uint8_t* unpack9_16(const uint8_t* in, uint16_t* out) { return in; } -inline static const uint8_t* unpack10_16(const uint8_t* in, uint16_t* out) { +template<> +const uint8_t* unpack<10>(const uint8_t* in, uint16_t* out) { using simd_batch = xsimd::make_sized_batch_t; constexpr uint16_t kMask = 0x3ff; @@ -412,7 +428,8 @@ inline static const uint8_t* unpack10_16(const uint8_t* in, uint16_t* out) { return in; } -inline static const uint8_t* unpack11_16(const uint8_t* in, uint16_t* out) { +template<> +const uint8_t* unpack<11>(const uint8_t* in, uint16_t* out) { using simd_batch = xsimd::make_sized_batch_t; constexpr uint16_t kMask = 0x7ff; @@ -449,7 +466,8 @@ inline static const uint8_t* unpack11_16(const uint8_t* in, uint16_t* out) { return in; } -inline static const uint8_t* unpack12_16(const uint8_t* in, uint16_t* out) { +template<> +const uint8_t* unpack<12>(const uint8_t* in, uint16_t* out) { using simd_batch = xsimd::make_sized_batch_t; constexpr uint16_t kMask = 0xfff; @@ -486,7 +504,8 @@ inline static const uint8_t* unpack12_16(const uint8_t* in, uint16_t* out) { return in; } -inline static const uint8_t* unpack13_16(const uint8_t* in, uint16_t* out) { +template<> +const uint8_t* unpack<13>(const uint8_t* in, uint16_t* out) { using simd_batch = xsimd::make_sized_batch_t; constexpr uint16_t kMask = 0x1fff; @@ -523,7 +542,8 @@ inline static const uint8_t* unpack13_16(const uint8_t* in, uint16_t* out) { return in; } -inline static const uint8_t* unpack14_16(const uint8_t* in, uint16_t* out) { +template<> +const uint8_t* unpack<14>(const uint8_t* in, uint16_t* out) { using simd_batch = xsimd::make_sized_batch_t; constexpr uint16_t kMask = 0x3fff; @@ -560,7 +580,8 @@ inline static const uint8_t* unpack14_16(const uint8_t* in, uint16_t* out) { return in; } -inline static const uint8_t* unpack15_16(const uint8_t* in, uint16_t* out) { +template<> +const uint8_t* unpack<15>(const uint8_t* in, uint16_t* out) { using simd_batch = xsimd::make_sized_batch_t; constexpr uint16_t kMask = 0x7fff; @@ -597,7 +618,8 @@ inline static const uint8_t* unpack15_16(const uint8_t* in, uint16_t* out) { return in; } -inline static const uint8_t* unpack16_16(const uint8_t* in, uint16_t* out) { +template<> +const uint8_t* unpack<16>(const uint8_t* in, uint16_t* out) { std::memcpy(out, in, 16 * sizeof(*out)); in += 2 * 16; out += 16; @@ -607,13 +629,19 @@ inline static const uint8_t* unpack16_16(const uint8_t* in, uint16_t* out) { template<> struct Simd256Unpacker { -inline static const uint8_t* unpack0_32(const uint8_t* in, uint32_t* out) { + +template +static const uint8_t* unpack(const uint8_t* in, uint32_t* out); + +template<> +const uint8_t* unpack<0>(const uint8_t* in, uint32_t* out) { std::memset(out, 0x0, 32 * sizeof(*out)); out += 32; return in; } -inline static const uint8_t* unpack1_32(const uint8_t* in, uint32_t* out) { +template<> +const uint8_t* unpack<1>(const uint8_t* in, uint32_t* out) { using simd_batch = xsimd::make_sized_batch_t; constexpr uint32_t kMask = 0x1; @@ -690,7 +718,8 @@ inline static const uint8_t* unpack1_32(const uint8_t* in, uint32_t* out) { return in; } -inline static const uint8_t* unpack2_32(const uint8_t* in, uint32_t* out) { +template<> +const uint8_t* unpack<2>(const uint8_t* in, uint32_t* out) { using simd_batch = xsimd::make_sized_batch_t; constexpr uint32_t kMask = 0x3; @@ -767,7 +796,8 @@ inline static const uint8_t* unpack2_32(const uint8_t* in, uint32_t* out) { return in; } -inline static const uint8_t* unpack3_32(const uint8_t* in, uint32_t* out) { +template<> +const uint8_t* unpack<3>(const uint8_t* in, uint32_t* out) { using simd_batch = xsimd::make_sized_batch_t; constexpr uint32_t kMask = 0x7; @@ -844,7 +874,8 @@ inline static const uint8_t* unpack3_32(const uint8_t* in, uint32_t* out) { return in; } -inline static const uint8_t* unpack4_32(const uint8_t* in, uint32_t* out) { +template<> +const uint8_t* unpack<4>(const uint8_t* in, uint32_t* out) { using simd_batch = xsimd::make_sized_batch_t; constexpr uint32_t kMask = 0xf; @@ -921,7 +952,8 @@ inline static const uint8_t* unpack4_32(const uint8_t* in, uint32_t* out) { return in; } -inline static const uint8_t* unpack5_32(const uint8_t* in, uint32_t* out) { +template<> +const uint8_t* unpack<5>(const uint8_t* in, uint32_t* out) { using simd_batch = xsimd::make_sized_batch_t; constexpr uint32_t kMask = 0x1f; @@ -998,7 +1030,8 @@ inline static const uint8_t* unpack5_32(const uint8_t* in, uint32_t* out) { return in; } -inline static const uint8_t* unpack6_32(const uint8_t* in, uint32_t* out) { +template<> +const uint8_t* unpack<6>(const uint8_t* in, uint32_t* out) { using simd_batch = xsimd::make_sized_batch_t; constexpr uint32_t kMask = 0x3f; @@ -1075,7 +1108,8 @@ inline static const uint8_t* unpack6_32(const uint8_t* in, uint32_t* out) { return in; } -inline static const uint8_t* unpack7_32(const uint8_t* in, uint32_t* out) { +template<> +const uint8_t* unpack<7>(const uint8_t* in, uint32_t* out) { using simd_batch = xsimd::make_sized_batch_t; constexpr uint32_t kMask = 0x7f; @@ -1152,7 +1186,8 @@ inline static const uint8_t* unpack7_32(const uint8_t* in, uint32_t* out) { return in; } -inline static const uint8_t* unpack8_32(const uint8_t* in, uint32_t* out) { +template<> +const uint8_t* unpack<8>(const uint8_t* in, uint32_t* out) { using simd_batch = xsimd::make_sized_batch_t; constexpr uint32_t kMask = 0xff; @@ -1229,7 +1264,8 @@ inline static const uint8_t* unpack8_32(const uint8_t* in, uint32_t* out) { return in; } -inline static const uint8_t* unpack9_32(const uint8_t* in, uint32_t* out) { +template<> +const uint8_t* unpack<9>(const uint8_t* in, uint32_t* out) { using simd_batch = xsimd::make_sized_batch_t; constexpr uint32_t kMask = 0x1ff; @@ -1306,7 +1342,8 @@ inline static const uint8_t* unpack9_32(const uint8_t* in, uint32_t* out) { return in; } -inline static const uint8_t* unpack10_32(const uint8_t* in, uint32_t* out) { +template<> +const uint8_t* unpack<10>(const uint8_t* in, uint32_t* out) { using simd_batch = xsimd::make_sized_batch_t; constexpr uint32_t kMask = 0x3ff; @@ -1383,7 +1420,8 @@ inline static const uint8_t* unpack10_32(const uint8_t* in, uint32_t* out) { return in; } -inline static const uint8_t* unpack11_32(const uint8_t* in, uint32_t* out) { +template<> +const uint8_t* unpack<11>(const uint8_t* in, uint32_t* out) { using simd_batch = xsimd::make_sized_batch_t; constexpr uint32_t kMask = 0x7ff; @@ -1460,7 +1498,8 @@ inline static const uint8_t* unpack11_32(const uint8_t* in, uint32_t* out) { return in; } -inline static const uint8_t* unpack12_32(const uint8_t* in, uint32_t* out) { +template<> +const uint8_t* unpack<12>(const uint8_t* in, uint32_t* out) { using simd_batch = xsimd::make_sized_batch_t; constexpr uint32_t kMask = 0xfff; @@ -1537,7 +1576,8 @@ inline static const uint8_t* unpack12_32(const uint8_t* in, uint32_t* out) { return in; } -inline static const uint8_t* unpack13_32(const uint8_t* in, uint32_t* out) { +template<> +const uint8_t* unpack<13>(const uint8_t* in, uint32_t* out) { using simd_batch = xsimd::make_sized_batch_t; constexpr uint32_t kMask = 0x1fff; @@ -1614,7 +1654,8 @@ inline static const uint8_t* unpack13_32(const uint8_t* in, uint32_t* out) { return in; } -inline static const uint8_t* unpack14_32(const uint8_t* in, uint32_t* out) { +template<> +const uint8_t* unpack<14>(const uint8_t* in, uint32_t* out) { using simd_batch = xsimd::make_sized_batch_t; constexpr uint32_t kMask = 0x3fff; @@ -1691,7 +1732,8 @@ inline static const uint8_t* unpack14_32(const uint8_t* in, uint32_t* out) { return in; } -inline static const uint8_t* unpack15_32(const uint8_t* in, uint32_t* out) { +template<> +const uint8_t* unpack<15>(const uint8_t* in, uint32_t* out) { using simd_batch = xsimd::make_sized_batch_t; constexpr uint32_t kMask = 0x7fff; @@ -1768,7 +1810,8 @@ inline static const uint8_t* unpack15_32(const uint8_t* in, uint32_t* out) { return in; } -inline static const uint8_t* unpack16_32(const uint8_t* in, uint32_t* out) { +template<> +const uint8_t* unpack<16>(const uint8_t* in, uint32_t* out) { using simd_batch = xsimd::make_sized_batch_t; constexpr uint32_t kMask = 0xffff; @@ -1845,7 +1888,8 @@ inline static const uint8_t* unpack16_32(const uint8_t* in, uint32_t* out) { return in; } -inline static const uint8_t* unpack17_32(const uint8_t* in, uint32_t* out) { +template<> +const uint8_t* unpack<17>(const uint8_t* in, uint32_t* out) { using simd_batch = xsimd::make_sized_batch_t; constexpr uint32_t kMask = 0x1ffff; @@ -1922,7 +1966,8 @@ inline static const uint8_t* unpack17_32(const uint8_t* in, uint32_t* out) { return in; } -inline static const uint8_t* unpack18_32(const uint8_t* in, uint32_t* out) { +template<> +const uint8_t* unpack<18>(const uint8_t* in, uint32_t* out) { using simd_batch = xsimd::make_sized_batch_t; constexpr uint32_t kMask = 0x3ffff; @@ -1999,7 +2044,8 @@ inline static const uint8_t* unpack18_32(const uint8_t* in, uint32_t* out) { return in; } -inline static const uint8_t* unpack19_32(const uint8_t* in, uint32_t* out) { +template<> +const uint8_t* unpack<19>(const uint8_t* in, uint32_t* out) { using simd_batch = xsimd::make_sized_batch_t; constexpr uint32_t kMask = 0x7ffff; @@ -2076,7 +2122,8 @@ inline static const uint8_t* unpack19_32(const uint8_t* in, uint32_t* out) { return in; } -inline static const uint8_t* unpack20_32(const uint8_t* in, uint32_t* out) { +template<> +const uint8_t* unpack<20>(const uint8_t* in, uint32_t* out) { using simd_batch = xsimd::make_sized_batch_t; constexpr uint32_t kMask = 0xfffff; @@ -2153,7 +2200,8 @@ inline static const uint8_t* unpack20_32(const uint8_t* in, uint32_t* out) { return in; } -inline static const uint8_t* unpack21_32(const uint8_t* in, uint32_t* out) { +template<> +const uint8_t* unpack<21>(const uint8_t* in, uint32_t* out) { using simd_batch = xsimd::make_sized_batch_t; constexpr uint32_t kMask = 0x1fffff; @@ -2230,7 +2278,8 @@ inline static const uint8_t* unpack21_32(const uint8_t* in, uint32_t* out) { return in; } -inline static const uint8_t* unpack22_32(const uint8_t* in, uint32_t* out) { +template<> +const uint8_t* unpack<22>(const uint8_t* in, uint32_t* out) { using simd_batch = xsimd::make_sized_batch_t; constexpr uint32_t kMask = 0x3fffff; @@ -2307,7 +2356,8 @@ inline static const uint8_t* unpack22_32(const uint8_t* in, uint32_t* out) { return in; } -inline static const uint8_t* unpack23_32(const uint8_t* in, uint32_t* out) { +template<> +const uint8_t* unpack<23>(const uint8_t* in, uint32_t* out) { using simd_batch = xsimd::make_sized_batch_t; constexpr uint32_t kMask = 0x7fffff; @@ -2384,7 +2434,8 @@ inline static const uint8_t* unpack23_32(const uint8_t* in, uint32_t* out) { return in; } -inline static const uint8_t* unpack24_32(const uint8_t* in, uint32_t* out) { +template<> +const uint8_t* unpack<24>(const uint8_t* in, uint32_t* out) { using simd_batch = xsimd::make_sized_batch_t; constexpr uint32_t kMask = 0xffffff; @@ -2461,7 +2512,8 @@ inline static const uint8_t* unpack24_32(const uint8_t* in, uint32_t* out) { return in; } -inline static const uint8_t* unpack25_32(const uint8_t* in, uint32_t* out) { +template<> +const uint8_t* unpack<25>(const uint8_t* in, uint32_t* out) { using simd_batch = xsimd::make_sized_batch_t; constexpr uint32_t kMask = 0x1ffffff; @@ -2538,7 +2590,8 @@ inline static const uint8_t* unpack25_32(const uint8_t* in, uint32_t* out) { return in; } -inline static const uint8_t* unpack26_32(const uint8_t* in, uint32_t* out) { +template<> +const uint8_t* unpack<26>(const uint8_t* in, uint32_t* out) { using simd_batch = xsimd::make_sized_batch_t; constexpr uint32_t kMask = 0x3ffffff; @@ -2615,7 +2668,8 @@ inline static const uint8_t* unpack26_32(const uint8_t* in, uint32_t* out) { return in; } -inline static const uint8_t* unpack27_32(const uint8_t* in, uint32_t* out) { +template<> +const uint8_t* unpack<27>(const uint8_t* in, uint32_t* out) { using simd_batch = xsimd::make_sized_batch_t; constexpr uint32_t kMask = 0x7ffffff; @@ -2692,7 +2746,8 @@ inline static const uint8_t* unpack27_32(const uint8_t* in, uint32_t* out) { return in; } -inline static const uint8_t* unpack28_32(const uint8_t* in, uint32_t* out) { +template<> +const uint8_t* unpack<28>(const uint8_t* in, uint32_t* out) { using simd_batch = xsimd::make_sized_batch_t; constexpr uint32_t kMask = 0xfffffff; @@ -2769,7 +2824,8 @@ inline static const uint8_t* unpack28_32(const uint8_t* in, uint32_t* out) { return in; } -inline static const uint8_t* unpack29_32(const uint8_t* in, uint32_t* out) { +template<> +const uint8_t* unpack<29>(const uint8_t* in, uint32_t* out) { using simd_batch = xsimd::make_sized_batch_t; constexpr uint32_t kMask = 0x1fffffff; @@ -2846,7 +2902,8 @@ inline static const uint8_t* unpack29_32(const uint8_t* in, uint32_t* out) { return in; } -inline static const uint8_t* unpack30_32(const uint8_t* in, uint32_t* out) { +template<> +const uint8_t* unpack<30>(const uint8_t* in, uint32_t* out) { using simd_batch = xsimd::make_sized_batch_t; constexpr uint32_t kMask = 0x3fffffff; @@ -2923,7 +2980,8 @@ inline static const uint8_t* unpack30_32(const uint8_t* in, uint32_t* out) { return in; } -inline static const uint8_t* unpack31_32(const uint8_t* in, uint32_t* out) { +template<> +const uint8_t* unpack<31>(const uint8_t* in, uint32_t* out) { using simd_batch = xsimd::make_sized_batch_t; constexpr uint32_t kMask = 0x7fffffff; @@ -3000,7 +3058,8 @@ inline static const uint8_t* unpack31_32(const uint8_t* in, uint32_t* out) { return in; } -inline static const uint8_t* unpack32_32(const uint8_t* in, uint32_t* out) { +template<> +const uint8_t* unpack<32>(const uint8_t* in, uint32_t* out) { std::memcpy(out, in, 32 * sizeof(*out)); in += 4 * 32; out += 32; diff --git a/cpp/src/arrow/util/bpacking_simd512_generated_internal.h b/cpp/src/arrow/util/bpacking_simd512_generated_internal.h index 9848c55957e..66564225dc5 100644 --- a/cpp/src/arrow/util/bpacking_simd512_generated_internal.h +++ b/cpp/src/arrow/util/bpacking_simd512_generated_internal.h @@ -34,15 +34,22 @@ using ::arrow::util::SafeLoadAs; template struct Simd512Unpacker; + template<> struct Simd512Unpacker { -inline static const uint8_t* unpack0_16(const uint8_t* in, uint16_t* out) { + +template +static const uint8_t* unpack(const uint8_t* in, uint16_t* out); + +template<> +const uint8_t* unpack<0>(const uint8_t* in, uint16_t* out) { std::memset(out, 0x0, 16 * sizeof(*out)); out += 16; return in; } -inline static const uint8_t* unpack1_16(const uint8_t* in, uint16_t* out) { +template<> +const uint8_t* unpack<1>(const uint8_t* in, uint16_t* out) { using simd_batch = xsimd::make_sized_batch_t; constexpr uint16_t kMask = 0x1; @@ -79,7 +86,8 @@ inline static const uint8_t* unpack1_16(const uint8_t* in, uint16_t* out) { return in; } -inline static const uint8_t* unpack2_16(const uint8_t* in, uint16_t* out) { +template<> +const uint8_t* unpack<2>(const uint8_t* in, uint16_t* out) { using simd_batch = xsimd::make_sized_batch_t; constexpr uint16_t kMask = 0x3; @@ -116,7 +124,8 @@ inline static const uint8_t* unpack2_16(const uint8_t* in, uint16_t* out) { return in; } -inline static const uint8_t* unpack3_16(const uint8_t* in, uint16_t* out) { +template<> +const uint8_t* unpack<3>(const uint8_t* in, uint16_t* out) { using simd_batch = xsimd::make_sized_batch_t; constexpr uint16_t kMask = 0x7; @@ -153,7 +162,8 @@ inline static const uint8_t* unpack3_16(const uint8_t* in, uint16_t* out) { return in; } -inline static const uint8_t* unpack4_16(const uint8_t* in, uint16_t* out) { +template<> +const uint8_t* unpack<4>(const uint8_t* in, uint16_t* out) { using simd_batch = xsimd::make_sized_batch_t; constexpr uint16_t kMask = 0xf; @@ -190,7 +200,8 @@ inline static const uint8_t* unpack4_16(const uint8_t* in, uint16_t* out) { return in; } -inline static const uint8_t* unpack5_16(const uint8_t* in, uint16_t* out) { +template<> +const uint8_t* unpack<5>(const uint8_t* in, uint16_t* out) { using simd_batch = xsimd::make_sized_batch_t; constexpr uint16_t kMask = 0x1f; @@ -227,7 +238,8 @@ inline static const uint8_t* unpack5_16(const uint8_t* in, uint16_t* out) { return in; } -inline static const uint8_t* unpack6_16(const uint8_t* in, uint16_t* out) { +template<> +const uint8_t* unpack<6>(const uint8_t* in, uint16_t* out) { using simd_batch = xsimd::make_sized_batch_t; constexpr uint16_t kMask = 0x3f; @@ -264,7 +276,8 @@ inline static const uint8_t* unpack6_16(const uint8_t* in, uint16_t* out) { return in; } -inline static const uint8_t* unpack7_16(const uint8_t* in, uint16_t* out) { +template<> +const uint8_t* unpack<7>(const uint8_t* in, uint16_t* out) { using simd_batch = xsimd::make_sized_batch_t; constexpr uint16_t kMask = 0x7f; @@ -301,7 +314,8 @@ inline static const uint8_t* unpack7_16(const uint8_t* in, uint16_t* out) { return in; } -inline static const uint8_t* unpack8_16(const uint8_t* in, uint16_t* out) { +template<> +const uint8_t* unpack<8>(const uint8_t* in, uint16_t* out) { using simd_batch = xsimd::make_sized_batch_t; constexpr uint16_t kMask = 0xff; @@ -338,7 +352,8 @@ inline static const uint8_t* unpack8_16(const uint8_t* in, uint16_t* out) { return in; } -inline static const uint8_t* unpack9_16(const uint8_t* in, uint16_t* out) { +template<> +const uint8_t* unpack<9>(const uint8_t* in, uint16_t* out) { using simd_batch = xsimd::make_sized_batch_t; constexpr uint16_t kMask = 0x1ff; @@ -375,7 +390,8 @@ inline static const uint8_t* unpack9_16(const uint8_t* in, uint16_t* out) { return in; } -inline static const uint8_t* unpack10_16(const uint8_t* in, uint16_t* out) { +template<> +const uint8_t* unpack<10>(const uint8_t* in, uint16_t* out) { using simd_batch = xsimd::make_sized_batch_t; constexpr uint16_t kMask = 0x3ff; @@ -412,7 +428,8 @@ inline static const uint8_t* unpack10_16(const uint8_t* in, uint16_t* out) { return in; } -inline static const uint8_t* unpack11_16(const uint8_t* in, uint16_t* out) { +template<> +const uint8_t* unpack<11>(const uint8_t* in, uint16_t* out) { using simd_batch = xsimd::make_sized_batch_t; constexpr uint16_t kMask = 0x7ff; @@ -449,7 +466,8 @@ inline static const uint8_t* unpack11_16(const uint8_t* in, uint16_t* out) { return in; } -inline static const uint8_t* unpack12_16(const uint8_t* in, uint16_t* out) { +template<> +const uint8_t* unpack<12>(const uint8_t* in, uint16_t* out) { using simd_batch = xsimd::make_sized_batch_t; constexpr uint16_t kMask = 0xfff; @@ -486,7 +504,8 @@ inline static const uint8_t* unpack12_16(const uint8_t* in, uint16_t* out) { return in; } -inline static const uint8_t* unpack13_16(const uint8_t* in, uint16_t* out) { +template<> +const uint8_t* unpack<13>(const uint8_t* in, uint16_t* out) { using simd_batch = xsimd::make_sized_batch_t; constexpr uint16_t kMask = 0x1fff; @@ -523,7 +542,8 @@ inline static const uint8_t* unpack13_16(const uint8_t* in, uint16_t* out) { return in; } -inline static const uint8_t* unpack14_16(const uint8_t* in, uint16_t* out) { +template<> +const uint8_t* unpack<14>(const uint8_t* in, uint16_t* out) { using simd_batch = xsimd::make_sized_batch_t; constexpr uint16_t kMask = 0x3fff; @@ -560,7 +580,8 @@ inline static const uint8_t* unpack14_16(const uint8_t* in, uint16_t* out) { return in; } -inline static const uint8_t* unpack15_16(const uint8_t* in, uint16_t* out) { +template<> +const uint8_t* unpack<15>(const uint8_t* in, uint16_t* out) { using simd_batch = xsimd::make_sized_batch_t; constexpr uint16_t kMask = 0x7fff; @@ -597,7 +618,8 @@ inline static const uint8_t* unpack15_16(const uint8_t* in, uint16_t* out) { return in; } -inline static const uint8_t* unpack16_16(const uint8_t* in, uint16_t* out) { +template<> +const uint8_t* unpack<16>(const uint8_t* in, uint16_t* out) { std::memcpy(out, in, 16 * sizeof(*out)); in += 2 * 16; out += 16; @@ -607,13 +629,19 @@ inline static const uint8_t* unpack16_16(const uint8_t* in, uint16_t* out) { template<> struct Simd512Unpacker { -inline static const uint8_t* unpack0_32(const uint8_t* in, uint32_t* out) { + +template +static const uint8_t* unpack(const uint8_t* in, uint32_t* out); + +template<> +const uint8_t* unpack<0>(const uint8_t* in, uint32_t* out) { std::memset(out, 0x0, 32 * sizeof(*out)); out += 32; return in; } -inline static const uint8_t* unpack1_32(const uint8_t* in, uint32_t* out) { +template<> +const uint8_t* unpack<1>(const uint8_t* in, uint32_t* out) { using simd_batch = xsimd::make_sized_batch_t; constexpr uint32_t kMask = 0x1; @@ -674,7 +702,8 @@ inline static const uint8_t* unpack1_32(const uint8_t* in, uint32_t* out) { return in; } -inline static const uint8_t* unpack2_32(const uint8_t* in, uint32_t* out) { +template<> +const uint8_t* unpack<2>(const uint8_t* in, uint32_t* out) { using simd_batch = xsimd::make_sized_batch_t; constexpr uint32_t kMask = 0x3; @@ -735,7 +764,8 @@ inline static const uint8_t* unpack2_32(const uint8_t* in, uint32_t* out) { return in; } -inline static const uint8_t* unpack3_32(const uint8_t* in, uint32_t* out) { +template<> +const uint8_t* unpack<3>(const uint8_t* in, uint32_t* out) { using simd_batch = xsimd::make_sized_batch_t; constexpr uint32_t kMask = 0x7; @@ -796,7 +826,8 @@ inline static const uint8_t* unpack3_32(const uint8_t* in, uint32_t* out) { return in; } -inline static const uint8_t* unpack4_32(const uint8_t* in, uint32_t* out) { +template<> +const uint8_t* unpack<4>(const uint8_t* in, uint32_t* out) { using simd_batch = xsimd::make_sized_batch_t; constexpr uint32_t kMask = 0xf; @@ -857,7 +888,8 @@ inline static const uint8_t* unpack4_32(const uint8_t* in, uint32_t* out) { return in; } -inline static const uint8_t* unpack5_32(const uint8_t* in, uint32_t* out) { +template<> +const uint8_t* unpack<5>(const uint8_t* in, uint32_t* out) { using simd_batch = xsimd::make_sized_batch_t; constexpr uint32_t kMask = 0x1f; @@ -918,7 +950,8 @@ inline static const uint8_t* unpack5_32(const uint8_t* in, uint32_t* out) { return in; } -inline static const uint8_t* unpack6_32(const uint8_t* in, uint32_t* out) { +template<> +const uint8_t* unpack<6>(const uint8_t* in, uint32_t* out) { using simd_batch = xsimd::make_sized_batch_t; constexpr uint32_t kMask = 0x3f; @@ -979,7 +1012,8 @@ inline static const uint8_t* unpack6_32(const uint8_t* in, uint32_t* out) { return in; } -inline static const uint8_t* unpack7_32(const uint8_t* in, uint32_t* out) { +template<> +const uint8_t* unpack<7>(const uint8_t* in, uint32_t* out) { using simd_batch = xsimd::make_sized_batch_t; constexpr uint32_t kMask = 0x7f; @@ -1040,7 +1074,8 @@ inline static const uint8_t* unpack7_32(const uint8_t* in, uint32_t* out) { return in; } -inline static const uint8_t* unpack8_32(const uint8_t* in, uint32_t* out) { +template<> +const uint8_t* unpack<8>(const uint8_t* in, uint32_t* out) { using simd_batch = xsimd::make_sized_batch_t; constexpr uint32_t kMask = 0xff; @@ -1101,7 +1136,8 @@ inline static const uint8_t* unpack8_32(const uint8_t* in, uint32_t* out) { return in; } -inline static const uint8_t* unpack9_32(const uint8_t* in, uint32_t* out) { +template<> +const uint8_t* unpack<9>(const uint8_t* in, uint32_t* out) { using simd_batch = xsimd::make_sized_batch_t; constexpr uint32_t kMask = 0x1ff; @@ -1162,7 +1198,8 @@ inline static const uint8_t* unpack9_32(const uint8_t* in, uint32_t* out) { return in; } -inline static const uint8_t* unpack10_32(const uint8_t* in, uint32_t* out) { +template<> +const uint8_t* unpack<10>(const uint8_t* in, uint32_t* out) { using simd_batch = xsimd::make_sized_batch_t; constexpr uint32_t kMask = 0x3ff; @@ -1223,7 +1260,8 @@ inline static const uint8_t* unpack10_32(const uint8_t* in, uint32_t* out) { return in; } -inline static const uint8_t* unpack11_32(const uint8_t* in, uint32_t* out) { +template<> +const uint8_t* unpack<11>(const uint8_t* in, uint32_t* out) { using simd_batch = xsimd::make_sized_batch_t; constexpr uint32_t kMask = 0x7ff; @@ -1284,7 +1322,8 @@ inline static const uint8_t* unpack11_32(const uint8_t* in, uint32_t* out) { return in; } -inline static const uint8_t* unpack12_32(const uint8_t* in, uint32_t* out) { +template<> +const uint8_t* unpack<12>(const uint8_t* in, uint32_t* out) { using simd_batch = xsimd::make_sized_batch_t; constexpr uint32_t kMask = 0xfff; @@ -1345,7 +1384,8 @@ inline static const uint8_t* unpack12_32(const uint8_t* in, uint32_t* out) { return in; } -inline static const uint8_t* unpack13_32(const uint8_t* in, uint32_t* out) { +template<> +const uint8_t* unpack<13>(const uint8_t* in, uint32_t* out) { using simd_batch = xsimd::make_sized_batch_t; constexpr uint32_t kMask = 0x1fff; @@ -1406,7 +1446,8 @@ inline static const uint8_t* unpack13_32(const uint8_t* in, uint32_t* out) { return in; } -inline static const uint8_t* unpack14_32(const uint8_t* in, uint32_t* out) { +template<> +const uint8_t* unpack<14>(const uint8_t* in, uint32_t* out) { using simd_batch = xsimd::make_sized_batch_t; constexpr uint32_t kMask = 0x3fff; @@ -1467,7 +1508,8 @@ inline static const uint8_t* unpack14_32(const uint8_t* in, uint32_t* out) { return in; } -inline static const uint8_t* unpack15_32(const uint8_t* in, uint32_t* out) { +template<> +const uint8_t* unpack<15>(const uint8_t* in, uint32_t* out) { using simd_batch = xsimd::make_sized_batch_t; constexpr uint32_t kMask = 0x7fff; @@ -1528,7 +1570,8 @@ inline static const uint8_t* unpack15_32(const uint8_t* in, uint32_t* out) { return in; } -inline static const uint8_t* unpack16_32(const uint8_t* in, uint32_t* out) { +template<> +const uint8_t* unpack<16>(const uint8_t* in, uint32_t* out) { using simd_batch = xsimd::make_sized_batch_t; constexpr uint32_t kMask = 0xffff; @@ -1589,7 +1632,8 @@ inline static const uint8_t* unpack16_32(const uint8_t* in, uint32_t* out) { return in; } -inline static const uint8_t* unpack17_32(const uint8_t* in, uint32_t* out) { +template<> +const uint8_t* unpack<17>(const uint8_t* in, uint32_t* out) { using simd_batch = xsimd::make_sized_batch_t; constexpr uint32_t kMask = 0x1ffff; @@ -1650,7 +1694,8 @@ inline static const uint8_t* unpack17_32(const uint8_t* in, uint32_t* out) { return in; } -inline static const uint8_t* unpack18_32(const uint8_t* in, uint32_t* out) { +template<> +const uint8_t* unpack<18>(const uint8_t* in, uint32_t* out) { using simd_batch = xsimd::make_sized_batch_t; constexpr uint32_t kMask = 0x3ffff; @@ -1711,7 +1756,8 @@ inline static const uint8_t* unpack18_32(const uint8_t* in, uint32_t* out) { return in; } -inline static const uint8_t* unpack19_32(const uint8_t* in, uint32_t* out) { +template<> +const uint8_t* unpack<19>(const uint8_t* in, uint32_t* out) { using simd_batch = xsimd::make_sized_batch_t; constexpr uint32_t kMask = 0x7ffff; @@ -1772,7 +1818,8 @@ inline static const uint8_t* unpack19_32(const uint8_t* in, uint32_t* out) { return in; } -inline static const uint8_t* unpack20_32(const uint8_t* in, uint32_t* out) { +template<> +const uint8_t* unpack<20>(const uint8_t* in, uint32_t* out) { using simd_batch = xsimd::make_sized_batch_t; constexpr uint32_t kMask = 0xfffff; @@ -1833,7 +1880,8 @@ inline static const uint8_t* unpack20_32(const uint8_t* in, uint32_t* out) { return in; } -inline static const uint8_t* unpack21_32(const uint8_t* in, uint32_t* out) { +template<> +const uint8_t* unpack<21>(const uint8_t* in, uint32_t* out) { using simd_batch = xsimd::make_sized_batch_t; constexpr uint32_t kMask = 0x1fffff; @@ -1894,7 +1942,8 @@ inline static const uint8_t* unpack21_32(const uint8_t* in, uint32_t* out) { return in; } -inline static const uint8_t* unpack22_32(const uint8_t* in, uint32_t* out) { +template<> +const uint8_t* unpack<22>(const uint8_t* in, uint32_t* out) { using simd_batch = xsimd::make_sized_batch_t; constexpr uint32_t kMask = 0x3fffff; @@ -1955,7 +2004,8 @@ inline static const uint8_t* unpack22_32(const uint8_t* in, uint32_t* out) { return in; } -inline static const uint8_t* unpack23_32(const uint8_t* in, uint32_t* out) { +template<> +const uint8_t* unpack<23>(const uint8_t* in, uint32_t* out) { using simd_batch = xsimd::make_sized_batch_t; constexpr uint32_t kMask = 0x7fffff; @@ -2016,7 +2066,8 @@ inline static const uint8_t* unpack23_32(const uint8_t* in, uint32_t* out) { return in; } -inline static const uint8_t* unpack24_32(const uint8_t* in, uint32_t* out) { +template<> +const uint8_t* unpack<24>(const uint8_t* in, uint32_t* out) { using simd_batch = xsimd::make_sized_batch_t; constexpr uint32_t kMask = 0xffffff; @@ -2077,7 +2128,8 @@ inline static const uint8_t* unpack24_32(const uint8_t* in, uint32_t* out) { return in; } -inline static const uint8_t* unpack25_32(const uint8_t* in, uint32_t* out) { +template<> +const uint8_t* unpack<25>(const uint8_t* in, uint32_t* out) { using simd_batch = xsimd::make_sized_batch_t; constexpr uint32_t kMask = 0x1ffffff; @@ -2138,7 +2190,8 @@ inline static const uint8_t* unpack25_32(const uint8_t* in, uint32_t* out) { return in; } -inline static const uint8_t* unpack26_32(const uint8_t* in, uint32_t* out) { +template<> +const uint8_t* unpack<26>(const uint8_t* in, uint32_t* out) { using simd_batch = xsimd::make_sized_batch_t; constexpr uint32_t kMask = 0x3ffffff; @@ -2199,7 +2252,8 @@ inline static const uint8_t* unpack26_32(const uint8_t* in, uint32_t* out) { return in; } -inline static const uint8_t* unpack27_32(const uint8_t* in, uint32_t* out) { +template<> +const uint8_t* unpack<27>(const uint8_t* in, uint32_t* out) { using simd_batch = xsimd::make_sized_batch_t; constexpr uint32_t kMask = 0x7ffffff; @@ -2260,7 +2314,8 @@ inline static const uint8_t* unpack27_32(const uint8_t* in, uint32_t* out) { return in; } -inline static const uint8_t* unpack28_32(const uint8_t* in, uint32_t* out) { +template<> +const uint8_t* unpack<28>(const uint8_t* in, uint32_t* out) { using simd_batch = xsimd::make_sized_batch_t; constexpr uint32_t kMask = 0xfffffff; @@ -2321,7 +2376,8 @@ inline static const uint8_t* unpack28_32(const uint8_t* in, uint32_t* out) { return in; } -inline static const uint8_t* unpack29_32(const uint8_t* in, uint32_t* out) { +template<> +const uint8_t* unpack<29>(const uint8_t* in, uint32_t* out) { using simd_batch = xsimd::make_sized_batch_t; constexpr uint32_t kMask = 0x1fffffff; @@ -2382,7 +2438,8 @@ inline static const uint8_t* unpack29_32(const uint8_t* in, uint32_t* out) { return in; } -inline static const uint8_t* unpack30_32(const uint8_t* in, uint32_t* out) { +template<> +const uint8_t* unpack<30>(const uint8_t* in, uint32_t* out) { using simd_batch = xsimd::make_sized_batch_t; constexpr uint32_t kMask = 0x3fffffff; @@ -2443,7 +2500,8 @@ inline static const uint8_t* unpack30_32(const uint8_t* in, uint32_t* out) { return in; } -inline static const uint8_t* unpack31_32(const uint8_t* in, uint32_t* out) { +template<> +const uint8_t* unpack<31>(const uint8_t* in, uint32_t* out) { using simd_batch = xsimd::make_sized_batch_t; constexpr uint32_t kMask = 0x7fffffff; @@ -2504,7 +2562,8 @@ inline static const uint8_t* unpack31_32(const uint8_t* in, uint32_t* out) { return in; } -inline static const uint8_t* unpack32_32(const uint8_t* in, uint32_t* out) { +template<> +const uint8_t* unpack<32>(const uint8_t* in, uint32_t* out) { std::memcpy(out, in, 32 * sizeof(*out)); in += 4 * 32; out += 32; From 325cd9fd0e9a772363325dc886b89b4b4c2950a9 Mon Sep 17 00:00:00 2001 From: AntoinePrv Date: Thu, 18 Sep 2025 17:19:05 +0200 Subject: [PATCH 16/76] Slight improvement to SIMD codegen --- cpp/src/arrow/util/bpacking_simd_codegen.py | 124 ++++++++++++-------- 1 file changed, 78 insertions(+), 46 deletions(-) diff --git a/cpp/src/arrow/util/bpacking_simd_codegen.py b/cpp/src/arrow/util/bpacking_simd_codegen.py index 48c9fe8d22d..400c5bc4876 100755 --- a/cpp/src/arrow/util/bpacking_simd_codegen.py +++ b/cpp/src/arrow/util/bpacking_simd_codegen.py @@ -45,28 +45,9 @@ // under the License. """ -HEADER = """ -#pragma once - -#include -#include - -#include - -#include "arrow/util/ubsan.h" - -namespace arrow::internal { - -using ::arrow::util::SafeLoadAs; -""" - -FOOTER = """ -} // namespace arrow::internal -""" - @dataclasses.dataclass -class UnpackGenerator: +class UnpackStructGenerator: out_bit_width: int simd_bit_width: int @@ -107,12 +88,6 @@ def print_unpack_signature(self, bit: int | None) -> str: f"(const uint8_t* in, {self.out_type}* out){end}" ) - def print_struct_header(self): - print("template<>") - print(f"struct Simd{self.simd_bit_width}Unpacker<{self.out_type}> {{") - print() - self.print_unpack_signature(None) - def print_unpack_bit0_func(self): self.print_unpack_signature(0) print(f" std::memset(out, 0x0, {self.out_bit_width} * sizeof(*out));") @@ -138,8 +113,6 @@ def p(code, level=1): p( dedent(f"""\ - using simd_batch = xsimd::make_sized_batch_t<{self.out_type}, {self.simd_value_count}>; - constexpr {self.out_type} kMask = 0x{mask:0x}; simd_batch masks(kMask); @@ -210,8 +183,28 @@ def static_cast_as_needed(str): ) print("}") - def print_all(self): - self.print_struct_header() + def print_struct_declaration(self): + print("template") + print(f"struct Simd{self.simd_bit_width}Unpacker;") + + def print_struct_top(self): + print("template<>") + print(f"struct Simd{self.simd_bit_width}Unpacker<{self.out_type}> {{") + print() + print(f"using out_type = {self.out_type};") + print() + print( + "using simd_batch = xsimd::make_sized_batch_t<" + f"{self.out_type}, {self.simd_value_count}>;" + ) + print() + self.print_unpack_signature(None) + + def print_struct_bottom(self): + print("}; // struct Unpacker") + + def print_struct(self): + self.print_struct_top() print() self.print_unpack_bit0_func() @@ -221,30 +214,69 @@ def print_all(self): print() self.print_unpack_bitmax_func() - print("}; // struct Unpacker") + self.print_struct_bottom() -def print_note(): - print("// WARNING: this file is generated, DO NOT EDIT.") - print("// Usage:") - print(f"// python {' '.join(sys.orig_argv[1:])}") +@dataclasses.dataclass +class UnpackFileGenerator: + generators: list[UnpackStructGenerator] + def print_license(self): + print(LICENSE) -def main(simd_width, outputs): - print(LICENSE) - print_note() - print(HEADER) + def print_note(self): + print("// WARNING: this file is generated, DO NOT EDIT.") + print("// Usage:") + print(f"// python {' '.join(sys.orig_argv[1:])}") + + def print_headers(self): + print("#include ") + print("#include ") + print() + print("#include ") + print() + print('#include "arrow/util/ubsan.h"') + + def print_file_top(self): + print("#pragma once") + print() + self.print_headers() + print() + print("namespace arrow::internal {") + print() + print("using ::arrow::util::SafeLoadAs;") - print("template") - print(f"struct Simd{simd_width}Unpacker;") - print() + def print_file_bottom(self): + print("} // namespace arrow::internal") - for out_width in outputs: - gen = UnpackGenerator(out_width, simd_width) - gen.print_all() + def print_structs(self): + delclared = set() + + for gen in self.generators: + if gen.simd_bit_width not in delclared: + gen.print_struct_declaration() + print() + delclared.add(gen.simd_bit_width) + + gen.print_struct() + print() + + def print_file(self): + self.print_license() + self.print_note() + print() + self.print_file_top() print() + self.print_structs() + self.print_file_bottom() + + +def main(simd_width, outputs): + gen = UnpackFileGenerator( + [UnpackStructGenerator(out_width, simd_width) for out_width in outputs] + ) - print(FOOTER) + gen.print_file() if __name__ == "__main__": From 619d2a0457e18ca820ed517a226a2ce2c7b64fa9 Mon Sep 17 00:00:00 2001 From: AntoinePrv Date: Thu, 18 Sep 2025 18:20:50 +0200 Subject: [PATCH 17/76] Use template functions in scalar codegen and factor dispatch --- cpp/src/arrow/util/bpacking.cc | 317 +----------------- cpp/src/arrow/util/bpacking_avx2.cc | 4 +- cpp/src/arrow/util/bpacking_avx512.cc | 4 +- .../arrow/util/bpacking_dispatch_internal.h | 251 ++++++++++++++ cpp/src/arrow/util/bpacking_neon.cc | 4 +- cpp/src/arrow/util/bpacking_scalar_codegen.py | 77 +++-- cpp/src/arrow/util/bpacking_simd_codegen.py | 3 +- cpp/src/arrow/util/bpacking_simd_internal.h | 112 ------- 8 files changed, 319 insertions(+), 453 deletions(-) create mode 100644 cpp/src/arrow/util/bpacking_dispatch_internal.h delete mode 100644 cpp/src/arrow/util/bpacking_simd_internal.h diff --git a/cpp/src/arrow/util/bpacking.cc b/cpp/src/arrow/util/bpacking.cc index fefca194518..903f5e0c144 100644 --- a/cpp/src/arrow/util/bpacking.cc +++ b/cpp/src/arrow/util/bpacking.cc @@ -15,12 +15,11 @@ // specific language governing permissions and limitations // under the License. +#include "arrow/util/bpacking_dispatch_internal.h" #include "arrow/util/bpacking_internal.h" - #include "arrow/util/bpacking_scalar_generated_internal.h" #include "arrow/util/cpu_info.h" #include "arrow/util/dispatch_internal.h" -#include "arrow/util/logging_internal.h" #if defined(ARROW_HAVE_RUNTIME_AVX2) # include "arrow/util/bpacking_avx2_internal.h" @@ -36,114 +35,7 @@ namespace arrow { namespace internal { int unpack32_scalar(const uint8_t* in, uint32_t* out, int batch_size, int num_bits) { - batch_size = batch_size / 32 * 32; - int num_loops = batch_size / 32; - - switch (num_bits) { - case 0: - for (int i = 0; i < num_loops; ++i) in = unpack0_32(in, out + i * 32); - break; - case 1: - for (int i = 0; i < num_loops; ++i) in = unpack1_32(in, out + i * 32); - break; - case 2: - for (int i = 0; i < num_loops; ++i) in = unpack2_32(in, out + i * 32); - break; - case 3: - for (int i = 0; i < num_loops; ++i) in = unpack3_32(in, out + i * 32); - break; - case 4: - for (int i = 0; i < num_loops; ++i) in = unpack4_32(in, out + i * 32); - break; - case 5: - for (int i = 0; i < num_loops; ++i) in = unpack5_32(in, out + i * 32); - break; - case 6: - for (int i = 0; i < num_loops; ++i) in = unpack6_32(in, out + i * 32); - break; - case 7: - for (int i = 0; i < num_loops; ++i) in = unpack7_32(in, out + i * 32); - break; - case 8: - for (int i = 0; i < num_loops; ++i) in = unpack8_32(in, out + i * 32); - break; - case 9: - for (int i = 0; i < num_loops; ++i) in = unpack9_32(in, out + i * 32); - break; - case 10: - for (int i = 0; i < num_loops; ++i) in = unpack10_32(in, out + i * 32); - break; - case 11: - for (int i = 0; i < num_loops; ++i) in = unpack11_32(in, out + i * 32); - break; - case 12: - for (int i = 0; i < num_loops; ++i) in = unpack12_32(in, out + i * 32); - break; - case 13: - for (int i = 0; i < num_loops; ++i) in = unpack13_32(in, out + i * 32); - break; - case 14: - for (int i = 0; i < num_loops; ++i) in = unpack14_32(in, out + i * 32); - break; - case 15: - for (int i = 0; i < num_loops; ++i) in = unpack15_32(in, out + i * 32); - break; - case 16: - for (int i = 0; i < num_loops; ++i) in = unpack16_32(in, out + i * 32); - break; - case 17: - for (int i = 0; i < num_loops; ++i) in = unpack17_32(in, out + i * 32); - break; - case 18: - for (int i = 0; i < num_loops; ++i) in = unpack18_32(in, out + i * 32); - break; - case 19: - for (int i = 0; i < num_loops; ++i) in = unpack19_32(in, out + i * 32); - break; - case 20: - for (int i = 0; i < num_loops; ++i) in = unpack20_32(in, out + i * 32); - break; - case 21: - for (int i = 0; i < num_loops; ++i) in = unpack21_32(in, out + i * 32); - break; - case 22: - for (int i = 0; i < num_loops; ++i) in = unpack22_32(in, out + i * 32); - break; - case 23: - for (int i = 0; i < num_loops; ++i) in = unpack23_32(in, out + i * 32); - break; - case 24: - for (int i = 0; i < num_loops; ++i) in = unpack24_32(in, out + i * 32); - break; - case 25: - for (int i = 0; i < num_loops; ++i) in = unpack25_32(in, out + i * 32); - break; - case 26: - for (int i = 0; i < num_loops; ++i) in = unpack26_32(in, out + i * 32); - break; - case 27: - for (int i = 0; i < num_loops; ++i) in = unpack27_32(in, out + i * 32); - break; - case 28: - for (int i = 0; i < num_loops; ++i) in = unpack28_32(in, out + i * 32); - break; - case 29: - for (int i = 0; i < num_loops; ++i) in = unpack29_32(in, out + i * 32); - break; - case 30: - for (int i = 0; i < num_loops; ++i) in = unpack30_32(in, out + i * 32); - break; - case 31: - for (int i = 0; i < num_loops; ++i) in = unpack31_32(in, out + i * 32); - break; - case 32: - for (int i = 0; i < num_loops; ++i) in = unpack32_32(in, out + i * 32); - break; - default: - DCHECK(false) << "Unsupported num_bits"; - } - - return batch_size; + return unpack_jump32>(in, out, batch_size, num_bits); } namespace { @@ -177,210 +69,7 @@ int unpack32(const uint8_t* in, uint32_t* out, int batch_size, int num_bits) { } int unpack64_scalar(const uint8_t* in, uint64_t* out, int batch_size, int num_bits) { - batch_size = batch_size / 32 * 32; - int num_loops = batch_size / 32; - - switch (num_bits) { - case 0: - for (int i = 0; i < num_loops; ++i) in = unpack0_64(in, out + i * 32); - break; - case 1: - for (int i = 0; i < num_loops; ++i) in = unpack1_64(in, out + i * 32); - break; - case 2: - for (int i = 0; i < num_loops; ++i) in = unpack2_64(in, out + i * 32); - break; - case 3: - for (int i = 0; i < num_loops; ++i) in = unpack3_64(in, out + i * 32); - break; - case 4: - for (int i = 0; i < num_loops; ++i) in = unpack4_64(in, out + i * 32); - break; - case 5: - for (int i = 0; i < num_loops; ++i) in = unpack5_64(in, out + i * 32); - break; - case 6: - for (int i = 0; i < num_loops; ++i) in = unpack6_64(in, out + i * 32); - break; - case 7: - for (int i = 0; i < num_loops; ++i) in = unpack7_64(in, out + i * 32); - break; - case 8: - for (int i = 0; i < num_loops; ++i) in = unpack8_64(in, out + i * 32); - break; - case 9: - for (int i = 0; i < num_loops; ++i) in = unpack9_64(in, out + i * 32); - break; - case 10: - for (int i = 0; i < num_loops; ++i) in = unpack10_64(in, out + i * 32); - break; - case 11: - for (int i = 0; i < num_loops; ++i) in = unpack11_64(in, out + i * 32); - break; - case 12: - for (int i = 0; i < num_loops; ++i) in = unpack12_64(in, out + i * 32); - break; - case 13: - for (int i = 0; i < num_loops; ++i) in = unpack13_64(in, out + i * 32); - break; - case 14: - for (int i = 0; i < num_loops; ++i) in = unpack14_64(in, out + i * 32); - break; - case 15: - for (int i = 0; i < num_loops; ++i) in = unpack15_64(in, out + i * 32); - break; - case 16: - for (int i = 0; i < num_loops; ++i) in = unpack16_64(in, out + i * 32); - break; - case 17: - for (int i = 0; i < num_loops; ++i) in = unpack17_64(in, out + i * 32); - break; - case 18: - for (int i = 0; i < num_loops; ++i) in = unpack18_64(in, out + i * 32); - break; - case 19: - for (int i = 0; i < num_loops; ++i) in = unpack19_64(in, out + i * 32); - break; - case 20: - for (int i = 0; i < num_loops; ++i) in = unpack20_64(in, out + i * 32); - break; - case 21: - for (int i = 0; i < num_loops; ++i) in = unpack21_64(in, out + i * 32); - break; - case 22: - for (int i = 0; i < num_loops; ++i) in = unpack22_64(in, out + i * 32); - break; - case 23: - for (int i = 0; i < num_loops; ++i) in = unpack23_64(in, out + i * 32); - break; - case 24: - for (int i = 0; i < num_loops; ++i) in = unpack24_64(in, out + i * 32); - break; - case 25: - for (int i = 0; i < num_loops; ++i) in = unpack25_64(in, out + i * 32); - break; - case 26: - for (int i = 0; i < num_loops; ++i) in = unpack26_64(in, out + i * 32); - break; - case 27: - for (int i = 0; i < num_loops; ++i) in = unpack27_64(in, out + i * 32); - break; - case 28: - for (int i = 0; i < num_loops; ++i) in = unpack28_64(in, out + i * 32); - break; - case 29: - for (int i = 0; i < num_loops; ++i) in = unpack29_64(in, out + i * 32); - break; - case 30: - for (int i = 0; i < num_loops; ++i) in = unpack30_64(in, out + i * 32); - break; - case 31: - for (int i = 0; i < num_loops; ++i) in = unpack31_64(in, out + i * 32); - break; - case 32: - for (int i = 0; i < num_loops; ++i) in = unpack32_64(in, out + i * 32); - break; - case 33: - for (int i = 0; i < num_loops; ++i) in = unpack33_64(in, out + i * 32); - break; - case 34: - for (int i = 0; i < num_loops; ++i) in = unpack34_64(in, out + i * 32); - break; - case 35: - for (int i = 0; i < num_loops; ++i) in = unpack35_64(in, out + i * 32); - break; - case 36: - for (int i = 0; i < num_loops; ++i) in = unpack36_64(in, out + i * 32); - break; - case 37: - for (int i = 0; i < num_loops; ++i) in = unpack37_64(in, out + i * 32); - break; - case 38: - for (int i = 0; i < num_loops; ++i) in = unpack38_64(in, out + i * 32); - break; - case 39: - for (int i = 0; i < num_loops; ++i) in = unpack39_64(in, out + i * 32); - break; - case 40: - for (int i = 0; i < num_loops; ++i) in = unpack40_64(in, out + i * 32); - break; - case 41: - for (int i = 0; i < num_loops; ++i) in = unpack41_64(in, out + i * 32); - break; - case 42: - for (int i = 0; i < num_loops; ++i) in = unpack42_64(in, out + i * 32); - break; - case 43: - for (int i = 0; i < num_loops; ++i) in = unpack43_64(in, out + i * 32); - break; - case 44: - for (int i = 0; i < num_loops; ++i) in = unpack44_64(in, out + i * 32); - break; - case 45: - for (int i = 0; i < num_loops; ++i) in = unpack45_64(in, out + i * 32); - break; - case 46: - for (int i = 0; i < num_loops; ++i) in = unpack46_64(in, out + i * 32); - break; - case 47: - for (int i = 0; i < num_loops; ++i) in = unpack47_64(in, out + i * 32); - break; - case 48: - for (int i = 0; i < num_loops; ++i) in = unpack48_64(in, out + i * 32); - break; - case 49: - for (int i = 0; i < num_loops; ++i) in = unpack49_64(in, out + i * 32); - break; - case 50: - for (int i = 0; i < num_loops; ++i) in = unpack50_64(in, out + i * 32); - break; - case 51: - for (int i = 0; i < num_loops; ++i) in = unpack51_64(in, out + i * 32); - break; - case 52: - for (int i = 0; i < num_loops; ++i) in = unpack52_64(in, out + i * 32); - break; - case 53: - for (int i = 0; i < num_loops; ++i) in = unpack53_64(in, out + i * 32); - break; - case 54: - for (int i = 0; i < num_loops; ++i) in = unpack54_64(in, out + i * 32); - break; - case 55: - for (int i = 0; i < num_loops; ++i) in = unpack55_64(in, out + i * 32); - break; - case 56: - for (int i = 0; i < num_loops; ++i) in = unpack56_64(in, out + i * 32); - break; - case 57: - for (int i = 0; i < num_loops; ++i) in = unpack57_64(in, out + i * 32); - break; - case 58: - for (int i = 0; i < num_loops; ++i) in = unpack58_64(in, out + i * 32); - break; - case 59: - for (int i = 0; i < num_loops; ++i) in = unpack59_64(in, out + i * 32); - break; - case 60: - for (int i = 0; i < num_loops; ++i) in = unpack60_64(in, out + i * 32); - break; - case 61: - for (int i = 0; i < num_loops; ++i) in = unpack61_64(in, out + i * 32); - break; - case 62: - for (int i = 0; i < num_loops; ++i) in = unpack62_64(in, out + i * 32); - break; - case 63: - for (int i = 0; i < num_loops; ++i) in = unpack63_64(in, out + i * 32); - break; - case 64: - for (int i = 0; i < num_loops; ++i) in = unpack64_64(in, out + i * 32); - break; - default: - DCHECK(false) << "Unsupported num_bits"; - } - - return batch_size; + return unpack_jump64>(in, out, batch_size, num_bits); } int unpack64(const uint8_t* in, uint64_t* out, int batch_size, int num_bits) { diff --git a/cpp/src/arrow/util/bpacking_avx2.cc b/cpp/src/arrow/util/bpacking_avx2.cc index 88cfe5c7d08..b31cdcfab72 100644 --- a/cpp/src/arrow/util/bpacking_avx2.cc +++ b/cpp/src/arrow/util/bpacking_avx2.cc @@ -16,13 +16,13 @@ // under the License. #include "arrow/util/bpacking_avx2_internal.h" +#include "arrow/util/bpacking_dispatch_internal.h" #include "arrow/util/bpacking_simd256_generated_internal.h" -#include "arrow/util/bpacking_simd_internal.h" namespace arrow::internal { int unpack32_avx2(const uint8_t* in, uint32_t* out, int batch_size, int num_bits) { - return unpack32_specialized>(in, out, batch_size, num_bits); + return unpack_jump32>(in, out, batch_size, num_bits); } } // namespace arrow::internal diff --git a/cpp/src/arrow/util/bpacking_avx512.cc b/cpp/src/arrow/util/bpacking_avx512.cc index 8332c6156f0..b3c5b5f75a7 100644 --- a/cpp/src/arrow/util/bpacking_avx512.cc +++ b/cpp/src/arrow/util/bpacking_avx512.cc @@ -16,13 +16,13 @@ // under the License. #include "arrow/util/bpacking_avx512_internal.h" +#include "arrow/util/bpacking_dispatch_internal.h" #include "arrow/util/bpacking_simd512_generated_internal.h" -#include "arrow/util/bpacking_simd_internal.h" namespace arrow::internal { int unpack32_avx512(const uint8_t* in, uint32_t* out, int batch_size, int num_bits) { - return unpack32_specialized>(in, out, batch_size, num_bits); + return unpack_jump32>(in, out, batch_size, num_bits); } } // namespace arrow::internal diff --git a/cpp/src/arrow/util/bpacking_dispatch_internal.h b/cpp/src/arrow/util/bpacking_dispatch_internal.h new file mode 100644 index 00000000000..92b8bc867d7 --- /dev/null +++ b/cpp/src/arrow/util/bpacking_dispatch_internal.h @@ -0,0 +1,251 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "arrow/util/logging.h" + +namespace arrow::internal { + +template +int unpack(const uint8_t* in, typename Unpacker::out_type* out, int batch_size, + int num_bits) { + constexpr auto kValuesUnpacked = Unpacker::kValuesUnpacked; + batch_size = batch_size / kValuesUnpacked * kValuesUnpacked; + int num_loops = batch_size / kValuesUnpacked; + + for (int i = 0; i < num_loops; ++i) { + in = Unpacker::template unpack(in, out + i * kValuesUnpacked); + } + + return batch_size; +} + +template +static int unpack_jump32(const uint8_t* in, uint32_t* out, int batch_size, int num_bits) { + switch (num_bits) { + case 0: + return unpack<0, Unpacker>(in, out, batch_size, num_bits); + case 1: + return unpack<1, Unpacker>(in, out, batch_size, num_bits); + case 2: + return unpack<2, Unpacker>(in, out, batch_size, num_bits); + case 3: + return unpack<3, Unpacker>(in, out, batch_size, num_bits); + case 4: + return unpack<4, Unpacker>(in, out, batch_size, num_bits); + case 5: + return unpack<5, Unpacker>(in, out, batch_size, num_bits); + case 6: + return unpack<6, Unpacker>(in, out, batch_size, num_bits); + case 7: + return unpack<7, Unpacker>(in, out, batch_size, num_bits); + case 8: + return unpack<8, Unpacker>(in, out, batch_size, num_bits); + case 9: + return unpack<9, Unpacker>(in, out, batch_size, num_bits); + case 10: + return unpack<10, Unpacker>(in, out, batch_size, num_bits); + case 11: + return unpack<11, Unpacker>(in, out, batch_size, num_bits); + case 12: + return unpack<12, Unpacker>(in, out, batch_size, num_bits); + case 13: + return unpack<13, Unpacker>(in, out, batch_size, num_bits); + case 14: + return unpack<14, Unpacker>(in, out, batch_size, num_bits); + case 15: + return unpack<15, Unpacker>(in, out, batch_size, num_bits); + case 16: + return unpack<16, Unpacker>(in, out, batch_size, num_bits); + case 17: + return unpack<17, Unpacker>(in, out, batch_size, num_bits); + case 18: + return unpack<18, Unpacker>(in, out, batch_size, num_bits); + case 19: + return unpack<19, Unpacker>(in, out, batch_size, num_bits); + case 20: + return unpack<20, Unpacker>(in, out, batch_size, num_bits); + case 21: + return unpack<21, Unpacker>(in, out, batch_size, num_bits); + case 22: + return unpack<22, Unpacker>(in, out, batch_size, num_bits); + case 23: + return unpack<23, Unpacker>(in, out, batch_size, num_bits); + case 24: + return unpack<24, Unpacker>(in, out, batch_size, num_bits); + case 25: + return unpack<25, Unpacker>(in, out, batch_size, num_bits); + case 26: + return unpack<26, Unpacker>(in, out, batch_size, num_bits); + case 27: + return unpack<27, Unpacker>(in, out, batch_size, num_bits); + case 28: + return unpack<28, Unpacker>(in, out, batch_size, num_bits); + case 29: + return unpack<29, Unpacker>(in, out, batch_size, num_bits); + case 30: + return unpack<30, Unpacker>(in, out, batch_size, num_bits); + case 31: + return unpack<31, Unpacker>(in, out, batch_size, num_bits); + case 32: + return unpack<32, Unpacker>(in, out, batch_size, num_bits); + default: + ARROW_DCHECK(false) << "Unsupported num_bits"; + } + + return 0; +} + +template +static int unpack_jump64(const uint8_t* in, uint64_t* out, int batch_size, int num_bits) { + switch (num_bits) { + case 0: + return unpack<0, Unpacker>(in, out, batch_size, num_bits); + case 1: + return unpack<1, Unpacker>(in, out, batch_size, num_bits); + case 2: + return unpack<2, Unpacker>(in, out, batch_size, num_bits); + case 3: + return unpack<3, Unpacker>(in, out, batch_size, num_bits); + case 4: + return unpack<4, Unpacker>(in, out, batch_size, num_bits); + case 5: + return unpack<5, Unpacker>(in, out, batch_size, num_bits); + case 6: + return unpack<6, Unpacker>(in, out, batch_size, num_bits); + case 7: + return unpack<7, Unpacker>(in, out, batch_size, num_bits); + case 8: + return unpack<8, Unpacker>(in, out, batch_size, num_bits); + case 9: + return unpack<9, Unpacker>(in, out, batch_size, num_bits); + case 10: + return unpack<10, Unpacker>(in, out, batch_size, num_bits); + case 11: + return unpack<11, Unpacker>(in, out, batch_size, num_bits); + case 12: + return unpack<12, Unpacker>(in, out, batch_size, num_bits); + case 13: + return unpack<13, Unpacker>(in, out, batch_size, num_bits); + case 14: + return unpack<14, Unpacker>(in, out, batch_size, num_bits); + case 15: + return unpack<15, Unpacker>(in, out, batch_size, num_bits); + case 16: + return unpack<16, Unpacker>(in, out, batch_size, num_bits); + case 17: + return unpack<17, Unpacker>(in, out, batch_size, num_bits); + case 18: + return unpack<18, Unpacker>(in, out, batch_size, num_bits); + case 19: + return unpack<19, Unpacker>(in, out, batch_size, num_bits); + case 20: + return unpack<20, Unpacker>(in, out, batch_size, num_bits); + case 21: + return unpack<21, Unpacker>(in, out, batch_size, num_bits); + case 22: + return unpack<22, Unpacker>(in, out, batch_size, num_bits); + case 23: + return unpack<23, Unpacker>(in, out, batch_size, num_bits); + case 24: + return unpack<24, Unpacker>(in, out, batch_size, num_bits); + case 25: + return unpack<25, Unpacker>(in, out, batch_size, num_bits); + case 26: + return unpack<26, Unpacker>(in, out, batch_size, num_bits); + case 27: + return unpack<27, Unpacker>(in, out, batch_size, num_bits); + case 28: + return unpack<28, Unpacker>(in, out, batch_size, num_bits); + case 29: + return unpack<29, Unpacker>(in, out, batch_size, num_bits); + case 30: + return unpack<30, Unpacker>(in, out, batch_size, num_bits); + case 31: + return unpack<31, Unpacker>(in, out, batch_size, num_bits); + case 32: + return unpack<32, Unpacker>(in, out, batch_size, num_bits); + case 33: + return unpack<33, Unpacker>(in, out, batch_size, num_bits); + case 34: + return unpack<34, Unpacker>(in, out, batch_size, num_bits); + case 35: + return unpack<35, Unpacker>(in, out, batch_size, num_bits); + case 36: + return unpack<36, Unpacker>(in, out, batch_size, num_bits); + case 37: + return unpack<37, Unpacker>(in, out, batch_size, num_bits); + case 38: + return unpack<38, Unpacker>(in, out, batch_size, num_bits); + case 39: + return unpack<39, Unpacker>(in, out, batch_size, num_bits); + case 40: + return unpack<40, Unpacker>(in, out, batch_size, num_bits); + case 41: + return unpack<41, Unpacker>(in, out, batch_size, num_bits); + case 42: + return unpack<42, Unpacker>(in, out, batch_size, num_bits); + case 43: + return unpack<43, Unpacker>(in, out, batch_size, num_bits); + case 44: + return unpack<44, Unpacker>(in, out, batch_size, num_bits); + case 45: + return unpack<45, Unpacker>(in, out, batch_size, num_bits); + case 46: + return unpack<46, Unpacker>(in, out, batch_size, num_bits); + case 47: + return unpack<47, Unpacker>(in, out, batch_size, num_bits); + case 48: + return unpack<48, Unpacker>(in, out, batch_size, num_bits); + case 49: + return unpack<49, Unpacker>(in, out, batch_size, num_bits); + case 50: + return unpack<50, Unpacker>(in, out, batch_size, num_bits); + case 51: + return unpack<51, Unpacker>(in, out, batch_size, num_bits); + case 52: + return unpack<52, Unpacker>(in, out, batch_size, num_bits); + case 53: + return unpack<53, Unpacker>(in, out, batch_size, num_bits); + case 54: + return unpack<54, Unpacker>(in, out, batch_size, num_bits); + case 55: + return unpack<55, Unpacker>(in, out, batch_size, num_bits); + case 56: + return unpack<56, Unpacker>(in, out, batch_size, num_bits); + case 57: + return unpack<57, Unpacker>(in, out, batch_size, num_bits); + case 58: + return unpack<58, Unpacker>(in, out, batch_size, num_bits); + case 59: + return unpack<59, Unpacker>(in, out, batch_size, num_bits); + case 60: + return unpack<60, Unpacker>(in, out, batch_size, num_bits); + case 61: + return unpack<61, Unpacker>(in, out, batch_size, num_bits); + case 62: + return unpack<62, Unpacker>(in, out, batch_size, num_bits); + case 63: + return unpack<63, Unpacker>(in, out, batch_size, num_bits); + case 64: + return unpack<64, Unpacker>(in, out, batch_size, num_bits); + default: + ARROW_DCHECK(false) << "Unsupported num_bits"; + } + return 0; +} + +} // namespace arrow::internal diff --git a/cpp/src/arrow/util/bpacking_neon.cc b/cpp/src/arrow/util/bpacking_neon.cc index ec783d8b741..087d9717787 100644 --- a/cpp/src/arrow/util/bpacking_neon.cc +++ b/cpp/src/arrow/util/bpacking_neon.cc @@ -15,14 +15,14 @@ // specific language governing permissions and limitations // under the License. +#include "arrow/util/bpacking_dispatch_internal.h" #include "arrow/util/bpacking_neon_internal.h" #include "arrow/util/bpacking_simd128_generated_internal.h" -#include "arrow/util/bpacking_simd_internal.h" namespace arrow::internal { int unpack32_neon(const uint8_t* in, uint32_t* out, int batch_size, int num_bits) { - return unpack32_specialized>(in, out, batch_size, num_bits); + return unpack_jump32>(in, out, batch_size, num_bits); } } // namespace arrow::internal diff --git a/cpp/src/arrow/util/bpacking_scalar_codegen.py b/cpp/src/arrow/util/bpacking_scalar_codegen.py index fe29f74e08d..bdf63d88eb2 100644 --- a/cpp/src/arrow/util/bpacking_scalar_codegen.py +++ b/cpp/src/arrow/util/bpacking_scalar_codegen.py @@ -87,11 +87,11 @@ def out_byte_width(self) -> int: return self.out_bit_width // 8 @property - def unsigned_type(self) -> str: + def out_type(self) -> str: return f"uint{self.out_bit_width}_t" @property - def unsigned_type_half(self) -> str: + def out_type_half(self) -> str: return f"uint{self.out_bit_width // 2}_t" @property @@ -107,24 +107,34 @@ def howmanywords(self, bit: int) -> int: def howmanybytes(self, bit: int) -> int: return (self.howmany * bit + self.out_byte_width - 1) // self.out_byte_width - def unpack_signature(self, bit: int) -> str: - return ( - f"inline const uint8_t* unpack{bit}_{self.out_bit_width}" - f"(const uint8_t* in, {self.unsigned_type}* out)" - "{" + def print_unpack_signature(self, bit: int | None) -> str: + if bit is None: + print("template") + static = "static " + specialized = "" + end = ";" + else: + print("template<>") + static = "" + specialized = f"<{bit}>" + end = " {" + + print( + f"{static}const uint8_t* unpack{specialized}" + f"(const uint8_t* in, {self.out_type}* out){end}" ) def print_unpack_0(self) -> None: - print(self.unpack_signature(0)) + self.print_unpack_signature(0) print(f" std::memset(out, 0, {self.howmany} * {self.out_byte_width});") print(" return in;") print("}") def print_unpack_last(self) -> None: - print(self.unpack_signature(self.out_bit_width)) + self.print_unpack_signature(self.out_bit_width) print(f" for(int k = 0; k < {self.howmany}; k += 1) {{") print( - f" out[k] = LoadInt<{self.unsigned_type}>(" + f" out[k] = LoadInt<{self.out_type}>(" f"in + (k * {self.out_byte_width}));" ) print(" }") @@ -132,17 +142,17 @@ def print_unpack_last(self) -> None: print("}") def print_unpack_k(self, bit: int) -> None: - print(self.unpack_signature(bit)) + self.print_unpack_signature(bit) print( - f" constexpr {self.unsigned_type} mask = " - f"(({self.unsigned_type}{{1}} << {bit}) - {self.unsigned_type}{{1}});" + f" constexpr {self.out_type} mask = " + f"(({self.out_type}{{1}} << {bit}) - {self.out_type}{{1}});" ) print("") maskstr = " & mask" for k in range(self.howmanywords(bit) - 1): print( - f" const auto w{k} = LoadInt<{self.unsigned_type}>(" + f" const auto w{k} = LoadInt<{self.out_type}>(" f"in + {k} * {self.out_byte_width});" ) @@ -150,12 +160,12 @@ def print_unpack_k(self, bit: int) -> None: use_smart_halving = self.smart_halve and bit % 2 == 1 if use_smart_halving: print( - f" const auto w{k} = static_cast<{self.unsigned_type}>(LoadInt<{self.unsigned_type_half}>(" + f" const auto w{k} = static_cast<{self.out_type}>(LoadInt<{self.out_type_half}>(" f"in + {k} * {self.out_byte_width}));" ) else: print( - f" const auto w{k} = LoadInt<{self.unsigned_type}>(" + f" const auto w{k} = LoadInt<{self.out_type}>(" f"in + {k} * {self.out_byte_width});" ) @@ -188,7 +198,27 @@ def print_unpack_k(self, bit: int) -> None: print(f" return in + ({self.howmanywords(bit)} * {self.out_byte_width});") print("}") - def print_all(self) -> None: + def print_struct_declaration(self): + print("template") + print("struct ScalarUnpacker;") + + def print_struct_top(self): + print("template<>") + print(f"struct ScalarUnpacker<{self.out_type}> {{") + print() + print(f"using out_type = {self.out_type};") + print() + print(f"static constexpr int kValuesUnpacked = {self.howmany};") + print() + self.print_unpack_signature(None) + + def print_struct_bottom(self): + print("}; // struct") + + def print_struct(self): + self.print_struct_top() + print() + self.print_unpack_0() print("") @@ -198,6 +228,8 @@ def print_all(self) -> None: self.print_unpack_last() + self.print_struct_bottom() + def print_note(): print("// WARNING: this file is generated, DO NOT EDIT.") @@ -210,9 +242,14 @@ def print_note(): print_note() print(HEADER) - ScalarUnpackGenerator(32, smart_halve=False).print_all() - print("") + gen = ScalarUnpackGenerator(32, smart_halve=False) + gen.print_struct_declaration() + print() + + gen.print_struct() + print() - ScalarUnpackGenerator(64, smart_halve=True).print_all() + gen = ScalarUnpackGenerator(64, smart_halve=True) + gen.print_struct() print(FOOTER) diff --git a/cpp/src/arrow/util/bpacking_simd_codegen.py b/cpp/src/arrow/util/bpacking_simd_codegen.py index 400c5bc4876..5c627b59dcd 100755 --- a/cpp/src/arrow/util/bpacking_simd_codegen.py +++ b/cpp/src/arrow/util/bpacking_simd_codegen.py @@ -192,12 +192,13 @@ def print_struct_top(self): print(f"struct Simd{self.simd_bit_width}Unpacker<{self.out_type}> {{") print() print(f"using out_type = {self.out_type};") - print() print( "using simd_batch = xsimd::make_sized_batch_t<" f"{self.out_type}, {self.simd_value_count}>;" ) print() + print(f"static constexpr int kValuesUnpacked = {self.out_bit_width};") + print() self.print_unpack_signature(None) def print_struct_bottom(self): diff --git a/cpp/src/arrow/util/bpacking_simd_internal.h b/cpp/src/arrow/util/bpacking_simd_internal.h deleted file mode 100644 index 2613140aa6e..00000000000 --- a/cpp/src/arrow/util/bpacking_simd_internal.h +++ /dev/null @@ -1,112 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#include "arrow/util/dispatch_internal.h" -#include "arrow/util/logging.h" - -namespace arrow::internal { - -template -int unpack(const uint8_t* in, uint32_t* out, int batch_size, int num_bits) { - batch_size = batch_size / 32 * 32; - int num_loops = batch_size / 32; - - for (int i = 0; i < num_loops; ++i) { - in = Unpacker::template unpack(in, out + i * 32); - } - - return batch_size; -} - -template -static int unpack32_specialized(const uint8_t* in, uint32_t* out, int batch_size, - int num_bits) { - switch (num_bits) { - case 0: - return unpack<0, Unpacker>(in, out, batch_size, num_bits); - case 1: - return unpack<1, Unpacker>(in, out, batch_size, num_bits); - case 2: - return unpack<2, Unpacker>(in, out, batch_size, num_bits); - case 3: - return unpack<3, Unpacker>(in, out, batch_size, num_bits); - case 4: - return unpack<4, Unpacker>(in, out, batch_size, num_bits); - case 5: - return unpack<5, Unpacker>(in, out, batch_size, num_bits); - case 6: - return unpack<6, Unpacker>(in, out, batch_size, num_bits); - case 7: - return unpack<7, Unpacker>(in, out, batch_size, num_bits); - case 8: - return unpack<8, Unpacker>(in, out, batch_size, num_bits); - case 9: - return unpack<9, Unpacker>(in, out, batch_size, num_bits); - case 10: - return unpack<10, Unpacker>(in, out, batch_size, num_bits); - case 11: - return unpack<11, Unpacker>(in, out, batch_size, num_bits); - case 12: - return unpack<12, Unpacker>(in, out, batch_size, num_bits); - case 13: - return unpack<13, Unpacker>(in, out, batch_size, num_bits); - case 14: - return unpack<14, Unpacker>(in, out, batch_size, num_bits); - case 15: - return unpack<15, Unpacker>(in, out, batch_size, num_bits); - case 16: - return unpack<16, Unpacker>(in, out, batch_size, num_bits); - case 17: - return unpack<17, Unpacker>(in, out, batch_size, num_bits); - case 18: - return unpack<18, Unpacker>(in, out, batch_size, num_bits); - case 19: - return unpack<19, Unpacker>(in, out, batch_size, num_bits); - case 20: - return unpack<20, Unpacker>(in, out, batch_size, num_bits); - case 21: - return unpack<21, Unpacker>(in, out, batch_size, num_bits); - case 22: - return unpack<22, Unpacker>(in, out, batch_size, num_bits); - case 23: - return unpack<23, Unpacker>(in, out, batch_size, num_bits); - case 24: - return unpack<24, Unpacker>(in, out, batch_size, num_bits); - case 25: - return unpack<25, Unpacker>(in, out, batch_size, num_bits); - case 26: - return unpack<26, Unpacker>(in, out, batch_size, num_bits); - case 27: - return unpack<27, Unpacker>(in, out, batch_size, num_bits); - case 28: - return unpack<28, Unpacker>(in, out, batch_size, num_bits); - case 29: - return unpack<29, Unpacker>(in, out, batch_size, num_bits); - case 30: - return unpack<30, Unpacker>(in, out, batch_size, num_bits); - case 31: - return unpack<31, Unpacker>(in, out, batch_size, num_bits); - case 32: - return unpack<32, Unpacker>(in, out, batch_size, num_bits); - default: - ARROW_DCHECK(false) << "Unsupported num_bits"; - } - - return batch_size; -} - -} // namespace arrow::internal From 1b4839238ec059f211c779c3ef003cd13c9f40e0 Mon Sep 17 00:00:00 2001 From: AntoinePrv Date: Thu, 18 Sep 2025 18:25:34 +0200 Subject: [PATCH 18/76] Gen: regenerate unpack files --- .../util/bpacking_scalar_generated_internal.h | 319 ++++++++++++------ .../bpacking_simd128_generated_internal.h | 104 +----- .../bpacking_simd256_generated_internal.h | 104 +----- .../bpacking_simd512_generated_internal.h | 104 +----- 4 files changed, 251 insertions(+), 380 deletions(-) diff --git a/cpp/src/arrow/util/bpacking_scalar_generated_internal.h b/cpp/src/arrow/util/bpacking_scalar_generated_internal.h index d215fcfbc46..824cf9306e0 100644 --- a/cpp/src/arrow/util/bpacking_scalar_generated_internal.h +++ b/cpp/src/arrow/util/bpacking_scalar_generated_internal.h @@ -43,12 +43,27 @@ Int LoadInt(const uint8_t* in) { return bit_util::FromLittleEndian(util::SafeLoadAs(in)); } -inline const uint8_t* unpack0_32(const uint8_t* in, uint32_t* out){ +template +struct ScalarUnpacker; + +template<> +struct ScalarUnpacker { + +using out_type = uint32_t; + +static constexpr int kValuesUnpacked = 32; + +template +static const uint8_t* unpack(const uint8_t* in, uint32_t* out); + +template<> +const uint8_t* unpack<0>(const uint8_t* in, uint32_t* out) { std::memset(out, 0, 32 * 4); return in; } -inline const uint8_t* unpack1_32(const uint8_t* in, uint32_t* out){ +template<> +const uint8_t* unpack<1>(const uint8_t* in, uint32_t* out) { constexpr uint32_t mask = ((uint32_t{1} << 1) - uint32_t{1}); const auto w0 = LoadInt(in + 0 * 4); @@ -88,7 +103,8 @@ inline const uint8_t* unpack1_32(const uint8_t* in, uint32_t* out){ return in + (1 * 4); } -inline const uint8_t* unpack2_32(const uint8_t* in, uint32_t* out){ +template<> +const uint8_t* unpack<2>(const uint8_t* in, uint32_t* out) { constexpr uint32_t mask = ((uint32_t{1} << 2) - uint32_t{1}); const auto w0 = LoadInt(in + 0 * 4); @@ -129,7 +145,8 @@ inline const uint8_t* unpack2_32(const uint8_t* in, uint32_t* out){ return in + (2 * 4); } -inline const uint8_t* unpack3_32(const uint8_t* in, uint32_t* out){ +template<> +const uint8_t* unpack<3>(const uint8_t* in, uint32_t* out) { constexpr uint32_t mask = ((uint32_t{1} << 3) - uint32_t{1}); const auto w0 = LoadInt(in + 0 * 4); @@ -171,7 +188,8 @@ inline const uint8_t* unpack3_32(const uint8_t* in, uint32_t* out){ return in + (3 * 4); } -inline const uint8_t* unpack4_32(const uint8_t* in, uint32_t* out){ +template<> +const uint8_t* unpack<4>(const uint8_t* in, uint32_t* out) { constexpr uint32_t mask = ((uint32_t{1} << 4) - uint32_t{1}); const auto w0 = LoadInt(in + 0 * 4); @@ -214,7 +232,8 @@ inline const uint8_t* unpack4_32(const uint8_t* in, uint32_t* out){ return in + (4 * 4); } -inline const uint8_t* unpack5_32(const uint8_t* in, uint32_t* out){ +template<> +const uint8_t* unpack<5>(const uint8_t* in, uint32_t* out) { constexpr uint32_t mask = ((uint32_t{1} << 5) - uint32_t{1}); const auto w0 = LoadInt(in + 0 * 4); @@ -258,7 +277,8 @@ inline const uint8_t* unpack5_32(const uint8_t* in, uint32_t* out){ return in + (5 * 4); } -inline const uint8_t* unpack6_32(const uint8_t* in, uint32_t* out){ +template<> +const uint8_t* unpack<6>(const uint8_t* in, uint32_t* out) { constexpr uint32_t mask = ((uint32_t{1} << 6) - uint32_t{1}); const auto w0 = LoadInt(in + 0 * 4); @@ -303,7 +323,8 @@ inline const uint8_t* unpack6_32(const uint8_t* in, uint32_t* out){ return in + (6 * 4); } -inline const uint8_t* unpack7_32(const uint8_t* in, uint32_t* out){ +template<> +const uint8_t* unpack<7>(const uint8_t* in, uint32_t* out) { constexpr uint32_t mask = ((uint32_t{1} << 7) - uint32_t{1}); const auto w0 = LoadInt(in + 0 * 4); @@ -349,7 +370,8 @@ inline const uint8_t* unpack7_32(const uint8_t* in, uint32_t* out){ return in + (7 * 4); } -inline const uint8_t* unpack8_32(const uint8_t* in, uint32_t* out){ +template<> +const uint8_t* unpack<8>(const uint8_t* in, uint32_t* out) { constexpr uint32_t mask = ((uint32_t{1} << 8) - uint32_t{1}); const auto w0 = LoadInt(in + 0 * 4); @@ -396,7 +418,8 @@ inline const uint8_t* unpack8_32(const uint8_t* in, uint32_t* out){ return in + (8 * 4); } -inline const uint8_t* unpack9_32(const uint8_t* in, uint32_t* out){ +template<> +const uint8_t* unpack<9>(const uint8_t* in, uint32_t* out) { constexpr uint32_t mask = ((uint32_t{1} << 9) - uint32_t{1}); const auto w0 = LoadInt(in + 0 * 4); @@ -444,7 +467,8 @@ inline const uint8_t* unpack9_32(const uint8_t* in, uint32_t* out){ return in + (9 * 4); } -inline const uint8_t* unpack10_32(const uint8_t* in, uint32_t* out){ +template<> +const uint8_t* unpack<10>(const uint8_t* in, uint32_t* out) { constexpr uint32_t mask = ((uint32_t{1} << 10) - uint32_t{1}); const auto w0 = LoadInt(in + 0 * 4); @@ -493,7 +517,8 @@ inline const uint8_t* unpack10_32(const uint8_t* in, uint32_t* out){ return in + (10 * 4); } -inline const uint8_t* unpack11_32(const uint8_t* in, uint32_t* out){ +template<> +const uint8_t* unpack<11>(const uint8_t* in, uint32_t* out) { constexpr uint32_t mask = ((uint32_t{1} << 11) - uint32_t{1}); const auto w0 = LoadInt(in + 0 * 4); @@ -543,7 +568,8 @@ inline const uint8_t* unpack11_32(const uint8_t* in, uint32_t* out){ return in + (11 * 4); } -inline const uint8_t* unpack12_32(const uint8_t* in, uint32_t* out){ +template<> +const uint8_t* unpack<12>(const uint8_t* in, uint32_t* out) { constexpr uint32_t mask = ((uint32_t{1} << 12) - uint32_t{1}); const auto w0 = LoadInt(in + 0 * 4); @@ -594,7 +620,8 @@ inline const uint8_t* unpack12_32(const uint8_t* in, uint32_t* out){ return in + (12 * 4); } -inline const uint8_t* unpack13_32(const uint8_t* in, uint32_t* out){ +template<> +const uint8_t* unpack<13>(const uint8_t* in, uint32_t* out) { constexpr uint32_t mask = ((uint32_t{1} << 13) - uint32_t{1}); const auto w0 = LoadInt(in + 0 * 4); @@ -646,7 +673,8 @@ inline const uint8_t* unpack13_32(const uint8_t* in, uint32_t* out){ return in + (13 * 4); } -inline const uint8_t* unpack14_32(const uint8_t* in, uint32_t* out){ +template<> +const uint8_t* unpack<14>(const uint8_t* in, uint32_t* out) { constexpr uint32_t mask = ((uint32_t{1} << 14) - uint32_t{1}); const auto w0 = LoadInt(in + 0 * 4); @@ -699,7 +727,8 @@ inline const uint8_t* unpack14_32(const uint8_t* in, uint32_t* out){ return in + (14 * 4); } -inline const uint8_t* unpack15_32(const uint8_t* in, uint32_t* out){ +template<> +const uint8_t* unpack<15>(const uint8_t* in, uint32_t* out) { constexpr uint32_t mask = ((uint32_t{1} << 15) - uint32_t{1}); const auto w0 = LoadInt(in + 0 * 4); @@ -753,7 +782,8 @@ inline const uint8_t* unpack15_32(const uint8_t* in, uint32_t* out){ return in + (15 * 4); } -inline const uint8_t* unpack16_32(const uint8_t* in, uint32_t* out){ +template<> +const uint8_t* unpack<16>(const uint8_t* in, uint32_t* out) { constexpr uint32_t mask = ((uint32_t{1} << 16) - uint32_t{1}); const auto w0 = LoadInt(in + 0 * 4); @@ -808,7 +838,8 @@ inline const uint8_t* unpack16_32(const uint8_t* in, uint32_t* out){ return in + (16 * 4); } -inline const uint8_t* unpack17_32(const uint8_t* in, uint32_t* out){ +template<> +const uint8_t* unpack<17>(const uint8_t* in, uint32_t* out) { constexpr uint32_t mask = ((uint32_t{1} << 17) - uint32_t{1}); const auto w0 = LoadInt(in + 0 * 4); @@ -864,7 +895,8 @@ inline const uint8_t* unpack17_32(const uint8_t* in, uint32_t* out){ return in + (17 * 4); } -inline const uint8_t* unpack18_32(const uint8_t* in, uint32_t* out){ +template<> +const uint8_t* unpack<18>(const uint8_t* in, uint32_t* out) { constexpr uint32_t mask = ((uint32_t{1} << 18) - uint32_t{1}); const auto w0 = LoadInt(in + 0 * 4); @@ -921,7 +953,8 @@ inline const uint8_t* unpack18_32(const uint8_t* in, uint32_t* out){ return in + (18 * 4); } -inline const uint8_t* unpack19_32(const uint8_t* in, uint32_t* out){ +template<> +const uint8_t* unpack<19>(const uint8_t* in, uint32_t* out) { constexpr uint32_t mask = ((uint32_t{1} << 19) - uint32_t{1}); const auto w0 = LoadInt(in + 0 * 4); @@ -979,7 +1012,8 @@ inline const uint8_t* unpack19_32(const uint8_t* in, uint32_t* out){ return in + (19 * 4); } -inline const uint8_t* unpack20_32(const uint8_t* in, uint32_t* out){ +template<> +const uint8_t* unpack<20>(const uint8_t* in, uint32_t* out) { constexpr uint32_t mask = ((uint32_t{1} << 20) - uint32_t{1}); const auto w0 = LoadInt(in + 0 * 4); @@ -1038,7 +1072,8 @@ inline const uint8_t* unpack20_32(const uint8_t* in, uint32_t* out){ return in + (20 * 4); } -inline const uint8_t* unpack21_32(const uint8_t* in, uint32_t* out){ +template<> +const uint8_t* unpack<21>(const uint8_t* in, uint32_t* out) { constexpr uint32_t mask = ((uint32_t{1} << 21) - uint32_t{1}); const auto w0 = LoadInt(in + 0 * 4); @@ -1098,7 +1133,8 @@ inline const uint8_t* unpack21_32(const uint8_t* in, uint32_t* out){ return in + (21 * 4); } -inline const uint8_t* unpack22_32(const uint8_t* in, uint32_t* out){ +template<> +const uint8_t* unpack<22>(const uint8_t* in, uint32_t* out) { constexpr uint32_t mask = ((uint32_t{1} << 22) - uint32_t{1}); const auto w0 = LoadInt(in + 0 * 4); @@ -1159,7 +1195,8 @@ inline const uint8_t* unpack22_32(const uint8_t* in, uint32_t* out){ return in + (22 * 4); } -inline const uint8_t* unpack23_32(const uint8_t* in, uint32_t* out){ +template<> +const uint8_t* unpack<23>(const uint8_t* in, uint32_t* out) { constexpr uint32_t mask = ((uint32_t{1} << 23) - uint32_t{1}); const auto w0 = LoadInt(in + 0 * 4); @@ -1221,7 +1258,8 @@ inline const uint8_t* unpack23_32(const uint8_t* in, uint32_t* out){ return in + (23 * 4); } -inline const uint8_t* unpack24_32(const uint8_t* in, uint32_t* out){ +template<> +const uint8_t* unpack<24>(const uint8_t* in, uint32_t* out) { constexpr uint32_t mask = ((uint32_t{1} << 24) - uint32_t{1}); const auto w0 = LoadInt(in + 0 * 4); @@ -1284,7 +1322,8 @@ inline const uint8_t* unpack24_32(const uint8_t* in, uint32_t* out){ return in + (24 * 4); } -inline const uint8_t* unpack25_32(const uint8_t* in, uint32_t* out){ +template<> +const uint8_t* unpack<25>(const uint8_t* in, uint32_t* out) { constexpr uint32_t mask = ((uint32_t{1} << 25) - uint32_t{1}); const auto w0 = LoadInt(in + 0 * 4); @@ -1348,7 +1387,8 @@ inline const uint8_t* unpack25_32(const uint8_t* in, uint32_t* out){ return in + (25 * 4); } -inline const uint8_t* unpack26_32(const uint8_t* in, uint32_t* out){ +template<> +const uint8_t* unpack<26>(const uint8_t* in, uint32_t* out) { constexpr uint32_t mask = ((uint32_t{1} << 26) - uint32_t{1}); const auto w0 = LoadInt(in + 0 * 4); @@ -1413,7 +1453,8 @@ inline const uint8_t* unpack26_32(const uint8_t* in, uint32_t* out){ return in + (26 * 4); } -inline const uint8_t* unpack27_32(const uint8_t* in, uint32_t* out){ +template<> +const uint8_t* unpack<27>(const uint8_t* in, uint32_t* out) { constexpr uint32_t mask = ((uint32_t{1} << 27) - uint32_t{1}); const auto w0 = LoadInt(in + 0 * 4); @@ -1479,7 +1520,8 @@ inline const uint8_t* unpack27_32(const uint8_t* in, uint32_t* out){ return in + (27 * 4); } -inline const uint8_t* unpack28_32(const uint8_t* in, uint32_t* out){ +template<> +const uint8_t* unpack<28>(const uint8_t* in, uint32_t* out) { constexpr uint32_t mask = ((uint32_t{1} << 28) - uint32_t{1}); const auto w0 = LoadInt(in + 0 * 4); @@ -1546,7 +1588,8 @@ inline const uint8_t* unpack28_32(const uint8_t* in, uint32_t* out){ return in + (28 * 4); } -inline const uint8_t* unpack29_32(const uint8_t* in, uint32_t* out){ +template<> +const uint8_t* unpack<29>(const uint8_t* in, uint32_t* out) { constexpr uint32_t mask = ((uint32_t{1} << 29) - uint32_t{1}); const auto w0 = LoadInt(in + 0 * 4); @@ -1614,7 +1657,8 @@ inline const uint8_t* unpack29_32(const uint8_t* in, uint32_t* out){ return in + (29 * 4); } -inline const uint8_t* unpack30_32(const uint8_t* in, uint32_t* out){ +template<> +const uint8_t* unpack<30>(const uint8_t* in, uint32_t* out) { constexpr uint32_t mask = ((uint32_t{1} << 30) - uint32_t{1}); const auto w0 = LoadInt(in + 0 * 4); @@ -1683,7 +1727,8 @@ inline const uint8_t* unpack30_32(const uint8_t* in, uint32_t* out){ return in + (30 * 4); } -inline const uint8_t* unpack31_32(const uint8_t* in, uint32_t* out){ +template<> +const uint8_t* unpack<31>(const uint8_t* in, uint32_t* out) { constexpr uint32_t mask = ((uint32_t{1} << 31) - uint32_t{1}); const auto w0 = LoadInt(in + 0 * 4); @@ -1753,19 +1798,33 @@ inline const uint8_t* unpack31_32(const uint8_t* in, uint32_t* out){ return in + (31 * 4); } -inline const uint8_t* unpack32_32(const uint8_t* in, uint32_t* out){ +template<> +const uint8_t* unpack<32>(const uint8_t* in, uint32_t* out) { for(int k = 0; k < 32; k += 1) { out[k] = LoadInt(in + (k * 4)); } return in + (4 * 32); } +}; // struct + +template<> +struct ScalarUnpacker { + +using out_type = uint64_t; + +static constexpr int kValuesUnpacked = 32; + +template +static const uint8_t* unpack(const uint8_t* in, uint64_t* out); -inline const uint8_t* unpack0_64(const uint8_t* in, uint64_t* out){ +template<> +const uint8_t* unpack<0>(const uint8_t* in, uint64_t* out) { std::memset(out, 0, 32 * 8); return in; } -inline const uint8_t* unpack1_64(const uint8_t* in, uint64_t* out){ +template<> +const uint8_t* unpack<1>(const uint8_t* in, uint64_t* out) { constexpr uint64_t mask = ((uint64_t{1} << 1) - uint64_t{1}); const auto w0 = static_cast(LoadInt(in + 0 * 8)); @@ -1805,7 +1864,8 @@ inline const uint8_t* unpack1_64(const uint8_t* in, uint64_t* out){ return in + (0 * 8 + 4); } -inline const uint8_t* unpack2_64(const uint8_t* in, uint64_t* out){ +template<> +const uint8_t* unpack<2>(const uint8_t* in, uint64_t* out) { constexpr uint64_t mask = ((uint64_t{1} << 2) - uint64_t{1}); const auto w0 = LoadInt(in + 0 * 8); @@ -1845,7 +1905,8 @@ inline const uint8_t* unpack2_64(const uint8_t* in, uint64_t* out){ return in + (1 * 8); } -inline const uint8_t* unpack3_64(const uint8_t* in, uint64_t* out){ +template<> +const uint8_t* unpack<3>(const uint8_t* in, uint64_t* out) { constexpr uint64_t mask = ((uint64_t{1} << 3) - uint64_t{1}); const auto w0 = LoadInt(in + 0 * 8); @@ -1886,7 +1947,8 @@ inline const uint8_t* unpack3_64(const uint8_t* in, uint64_t* out){ return in + (1 * 8 + 4); } -inline const uint8_t* unpack4_64(const uint8_t* in, uint64_t* out){ +template<> +const uint8_t* unpack<4>(const uint8_t* in, uint64_t* out) { constexpr uint64_t mask = ((uint64_t{1} << 4) - uint64_t{1}); const auto w0 = LoadInt(in + 0 * 8); @@ -1927,7 +1989,8 @@ inline const uint8_t* unpack4_64(const uint8_t* in, uint64_t* out){ return in + (2 * 8); } -inline const uint8_t* unpack5_64(const uint8_t* in, uint64_t* out){ +template<> +const uint8_t* unpack<5>(const uint8_t* in, uint64_t* out) { constexpr uint64_t mask = ((uint64_t{1} << 5) - uint64_t{1}); const auto w0 = LoadInt(in + 0 * 8); @@ -1969,7 +2032,8 @@ inline const uint8_t* unpack5_64(const uint8_t* in, uint64_t* out){ return in + (2 * 8 + 4); } -inline const uint8_t* unpack6_64(const uint8_t* in, uint64_t* out){ +template<> +const uint8_t* unpack<6>(const uint8_t* in, uint64_t* out) { constexpr uint64_t mask = ((uint64_t{1} << 6) - uint64_t{1}); const auto w0 = LoadInt(in + 0 * 8); @@ -2011,7 +2075,8 @@ inline const uint8_t* unpack6_64(const uint8_t* in, uint64_t* out){ return in + (3 * 8); } -inline const uint8_t* unpack7_64(const uint8_t* in, uint64_t* out){ +template<> +const uint8_t* unpack<7>(const uint8_t* in, uint64_t* out) { constexpr uint64_t mask = ((uint64_t{1} << 7) - uint64_t{1}); const auto w0 = LoadInt(in + 0 * 8); @@ -2054,7 +2119,8 @@ inline const uint8_t* unpack7_64(const uint8_t* in, uint64_t* out){ return in + (3 * 8 + 4); } -inline const uint8_t* unpack8_64(const uint8_t* in, uint64_t* out){ +template<> +const uint8_t* unpack<8>(const uint8_t* in, uint64_t* out) { constexpr uint64_t mask = ((uint64_t{1} << 8) - uint64_t{1}); const auto w0 = LoadInt(in + 0 * 8); @@ -2097,7 +2163,8 @@ inline const uint8_t* unpack8_64(const uint8_t* in, uint64_t* out){ return in + (4 * 8); } -inline const uint8_t* unpack9_64(const uint8_t* in, uint64_t* out){ +template<> +const uint8_t* unpack<9>(const uint8_t* in, uint64_t* out) { constexpr uint64_t mask = ((uint64_t{1} << 9) - uint64_t{1}); const auto w0 = LoadInt(in + 0 * 8); @@ -2141,7 +2208,8 @@ inline const uint8_t* unpack9_64(const uint8_t* in, uint64_t* out){ return in + (4 * 8 + 4); } -inline const uint8_t* unpack10_64(const uint8_t* in, uint64_t* out){ +template<> +const uint8_t* unpack<10>(const uint8_t* in, uint64_t* out) { constexpr uint64_t mask = ((uint64_t{1} << 10) - uint64_t{1}); const auto w0 = LoadInt(in + 0 * 8); @@ -2185,7 +2253,8 @@ inline const uint8_t* unpack10_64(const uint8_t* in, uint64_t* out){ return in + (5 * 8); } -inline const uint8_t* unpack11_64(const uint8_t* in, uint64_t* out){ +template<> +const uint8_t* unpack<11>(const uint8_t* in, uint64_t* out) { constexpr uint64_t mask = ((uint64_t{1} << 11) - uint64_t{1}); const auto w0 = LoadInt(in + 0 * 8); @@ -2230,7 +2299,8 @@ inline const uint8_t* unpack11_64(const uint8_t* in, uint64_t* out){ return in + (5 * 8 + 4); } -inline const uint8_t* unpack12_64(const uint8_t* in, uint64_t* out){ +template<> +const uint8_t* unpack<12>(const uint8_t* in, uint64_t* out) { constexpr uint64_t mask = ((uint64_t{1} << 12) - uint64_t{1}); const auto w0 = LoadInt(in + 0 * 8); @@ -2275,7 +2345,8 @@ inline const uint8_t* unpack12_64(const uint8_t* in, uint64_t* out){ return in + (6 * 8); } -inline const uint8_t* unpack13_64(const uint8_t* in, uint64_t* out){ +template<> +const uint8_t* unpack<13>(const uint8_t* in, uint64_t* out) { constexpr uint64_t mask = ((uint64_t{1} << 13) - uint64_t{1}); const auto w0 = LoadInt(in + 0 * 8); @@ -2321,7 +2392,8 @@ inline const uint8_t* unpack13_64(const uint8_t* in, uint64_t* out){ return in + (6 * 8 + 4); } -inline const uint8_t* unpack14_64(const uint8_t* in, uint64_t* out){ +template<> +const uint8_t* unpack<14>(const uint8_t* in, uint64_t* out) { constexpr uint64_t mask = ((uint64_t{1} << 14) - uint64_t{1}); const auto w0 = LoadInt(in + 0 * 8); @@ -2367,7 +2439,8 @@ inline const uint8_t* unpack14_64(const uint8_t* in, uint64_t* out){ return in + (7 * 8); } -inline const uint8_t* unpack15_64(const uint8_t* in, uint64_t* out){ +template<> +const uint8_t* unpack<15>(const uint8_t* in, uint64_t* out) { constexpr uint64_t mask = ((uint64_t{1} << 15) - uint64_t{1}); const auto w0 = LoadInt(in + 0 * 8); @@ -2414,7 +2487,8 @@ inline const uint8_t* unpack15_64(const uint8_t* in, uint64_t* out){ return in + (7 * 8 + 4); } -inline const uint8_t* unpack16_64(const uint8_t* in, uint64_t* out){ +template<> +const uint8_t* unpack<16>(const uint8_t* in, uint64_t* out) { constexpr uint64_t mask = ((uint64_t{1} << 16) - uint64_t{1}); const auto w0 = LoadInt(in + 0 * 8); @@ -2461,7 +2535,8 @@ inline const uint8_t* unpack16_64(const uint8_t* in, uint64_t* out){ return in + (8 * 8); } -inline const uint8_t* unpack17_64(const uint8_t* in, uint64_t* out){ +template<> +const uint8_t* unpack<17>(const uint8_t* in, uint64_t* out) { constexpr uint64_t mask = ((uint64_t{1} << 17) - uint64_t{1}); const auto w0 = LoadInt(in + 0 * 8); @@ -2509,7 +2584,8 @@ inline const uint8_t* unpack17_64(const uint8_t* in, uint64_t* out){ return in + (8 * 8 + 4); } -inline const uint8_t* unpack18_64(const uint8_t* in, uint64_t* out){ +template<> +const uint8_t* unpack<18>(const uint8_t* in, uint64_t* out) { constexpr uint64_t mask = ((uint64_t{1} << 18) - uint64_t{1}); const auto w0 = LoadInt(in + 0 * 8); @@ -2557,7 +2633,8 @@ inline const uint8_t* unpack18_64(const uint8_t* in, uint64_t* out){ return in + (9 * 8); } -inline const uint8_t* unpack19_64(const uint8_t* in, uint64_t* out){ +template<> +const uint8_t* unpack<19>(const uint8_t* in, uint64_t* out) { constexpr uint64_t mask = ((uint64_t{1} << 19) - uint64_t{1}); const auto w0 = LoadInt(in + 0 * 8); @@ -2606,7 +2683,8 @@ inline const uint8_t* unpack19_64(const uint8_t* in, uint64_t* out){ return in + (9 * 8 + 4); } -inline const uint8_t* unpack20_64(const uint8_t* in, uint64_t* out){ +template<> +const uint8_t* unpack<20>(const uint8_t* in, uint64_t* out) { constexpr uint64_t mask = ((uint64_t{1} << 20) - uint64_t{1}); const auto w0 = LoadInt(in + 0 * 8); @@ -2655,7 +2733,8 @@ inline const uint8_t* unpack20_64(const uint8_t* in, uint64_t* out){ return in + (10 * 8); } -inline const uint8_t* unpack21_64(const uint8_t* in, uint64_t* out){ +template<> +const uint8_t* unpack<21>(const uint8_t* in, uint64_t* out) { constexpr uint64_t mask = ((uint64_t{1} << 21) - uint64_t{1}); const auto w0 = LoadInt(in + 0 * 8); @@ -2705,7 +2784,8 @@ inline const uint8_t* unpack21_64(const uint8_t* in, uint64_t* out){ return in + (10 * 8 + 4); } -inline const uint8_t* unpack22_64(const uint8_t* in, uint64_t* out){ +template<> +const uint8_t* unpack<22>(const uint8_t* in, uint64_t* out) { constexpr uint64_t mask = ((uint64_t{1} << 22) - uint64_t{1}); const auto w0 = LoadInt(in + 0 * 8); @@ -2755,7 +2835,8 @@ inline const uint8_t* unpack22_64(const uint8_t* in, uint64_t* out){ return in + (11 * 8); } -inline const uint8_t* unpack23_64(const uint8_t* in, uint64_t* out){ +template<> +const uint8_t* unpack<23>(const uint8_t* in, uint64_t* out) { constexpr uint64_t mask = ((uint64_t{1} << 23) - uint64_t{1}); const auto w0 = LoadInt(in + 0 * 8); @@ -2806,7 +2887,8 @@ inline const uint8_t* unpack23_64(const uint8_t* in, uint64_t* out){ return in + (11 * 8 + 4); } -inline const uint8_t* unpack24_64(const uint8_t* in, uint64_t* out){ +template<> +const uint8_t* unpack<24>(const uint8_t* in, uint64_t* out) { constexpr uint64_t mask = ((uint64_t{1} << 24) - uint64_t{1}); const auto w0 = LoadInt(in + 0 * 8); @@ -2857,7 +2939,8 @@ inline const uint8_t* unpack24_64(const uint8_t* in, uint64_t* out){ return in + (12 * 8); } -inline const uint8_t* unpack25_64(const uint8_t* in, uint64_t* out){ +template<> +const uint8_t* unpack<25>(const uint8_t* in, uint64_t* out) { constexpr uint64_t mask = ((uint64_t{1} << 25) - uint64_t{1}); const auto w0 = LoadInt(in + 0 * 8); @@ -2909,7 +2992,8 @@ inline const uint8_t* unpack25_64(const uint8_t* in, uint64_t* out){ return in + (12 * 8 + 4); } -inline const uint8_t* unpack26_64(const uint8_t* in, uint64_t* out){ +template<> +const uint8_t* unpack<26>(const uint8_t* in, uint64_t* out) { constexpr uint64_t mask = ((uint64_t{1} << 26) - uint64_t{1}); const auto w0 = LoadInt(in + 0 * 8); @@ -2961,7 +3045,8 @@ inline const uint8_t* unpack26_64(const uint8_t* in, uint64_t* out){ return in + (13 * 8); } -inline const uint8_t* unpack27_64(const uint8_t* in, uint64_t* out){ +template<> +const uint8_t* unpack<27>(const uint8_t* in, uint64_t* out) { constexpr uint64_t mask = ((uint64_t{1} << 27) - uint64_t{1}); const auto w0 = LoadInt(in + 0 * 8); @@ -3014,7 +3099,8 @@ inline const uint8_t* unpack27_64(const uint8_t* in, uint64_t* out){ return in + (13 * 8 + 4); } -inline const uint8_t* unpack28_64(const uint8_t* in, uint64_t* out){ +template<> +const uint8_t* unpack<28>(const uint8_t* in, uint64_t* out) { constexpr uint64_t mask = ((uint64_t{1} << 28) - uint64_t{1}); const auto w0 = LoadInt(in + 0 * 8); @@ -3067,7 +3153,8 @@ inline const uint8_t* unpack28_64(const uint8_t* in, uint64_t* out){ return in + (14 * 8); } -inline const uint8_t* unpack29_64(const uint8_t* in, uint64_t* out){ +template<> +const uint8_t* unpack<29>(const uint8_t* in, uint64_t* out) { constexpr uint64_t mask = ((uint64_t{1} << 29) - uint64_t{1}); const auto w0 = LoadInt(in + 0 * 8); @@ -3121,7 +3208,8 @@ inline const uint8_t* unpack29_64(const uint8_t* in, uint64_t* out){ return in + (14 * 8 + 4); } -inline const uint8_t* unpack30_64(const uint8_t* in, uint64_t* out){ +template<> +const uint8_t* unpack<30>(const uint8_t* in, uint64_t* out) { constexpr uint64_t mask = ((uint64_t{1} << 30) - uint64_t{1}); const auto w0 = LoadInt(in + 0 * 8); @@ -3175,7 +3263,8 @@ inline const uint8_t* unpack30_64(const uint8_t* in, uint64_t* out){ return in + (15 * 8); } -inline const uint8_t* unpack31_64(const uint8_t* in, uint64_t* out){ +template<> +const uint8_t* unpack<31>(const uint8_t* in, uint64_t* out) { constexpr uint64_t mask = ((uint64_t{1} << 31) - uint64_t{1}); const auto w0 = LoadInt(in + 0 * 8); @@ -3230,7 +3319,8 @@ inline const uint8_t* unpack31_64(const uint8_t* in, uint64_t* out){ return in + (15 * 8 + 4); } -inline const uint8_t* unpack32_64(const uint8_t* in, uint64_t* out){ +template<> +const uint8_t* unpack<32>(const uint8_t* in, uint64_t* out) { constexpr uint64_t mask = ((uint64_t{1} << 32) - uint64_t{1}); const auto w0 = LoadInt(in + 0 * 8); @@ -3285,7 +3375,8 @@ inline const uint8_t* unpack32_64(const uint8_t* in, uint64_t* out){ return in + (16 * 8); } -inline const uint8_t* unpack33_64(const uint8_t* in, uint64_t* out){ +template<> +const uint8_t* unpack<33>(const uint8_t* in, uint64_t* out) { constexpr uint64_t mask = ((uint64_t{1} << 33) - uint64_t{1}); const auto w0 = LoadInt(in + 0 * 8); @@ -3341,7 +3432,8 @@ inline const uint8_t* unpack33_64(const uint8_t* in, uint64_t* out){ return in + (16 * 8 + 4); } -inline const uint8_t* unpack34_64(const uint8_t* in, uint64_t* out){ +template<> +const uint8_t* unpack<34>(const uint8_t* in, uint64_t* out) { constexpr uint64_t mask = ((uint64_t{1} << 34) - uint64_t{1}); const auto w0 = LoadInt(in + 0 * 8); @@ -3397,7 +3489,8 @@ inline const uint8_t* unpack34_64(const uint8_t* in, uint64_t* out){ return in + (17 * 8); } -inline const uint8_t* unpack35_64(const uint8_t* in, uint64_t* out){ +template<> +const uint8_t* unpack<35>(const uint8_t* in, uint64_t* out) { constexpr uint64_t mask = ((uint64_t{1} << 35) - uint64_t{1}); const auto w0 = LoadInt(in + 0 * 8); @@ -3454,7 +3547,8 @@ inline const uint8_t* unpack35_64(const uint8_t* in, uint64_t* out){ return in + (17 * 8 + 4); } -inline const uint8_t* unpack36_64(const uint8_t* in, uint64_t* out){ +template<> +const uint8_t* unpack<36>(const uint8_t* in, uint64_t* out) { constexpr uint64_t mask = ((uint64_t{1} << 36) - uint64_t{1}); const auto w0 = LoadInt(in + 0 * 8); @@ -3511,7 +3605,8 @@ inline const uint8_t* unpack36_64(const uint8_t* in, uint64_t* out){ return in + (18 * 8); } -inline const uint8_t* unpack37_64(const uint8_t* in, uint64_t* out){ +template<> +const uint8_t* unpack<37>(const uint8_t* in, uint64_t* out) { constexpr uint64_t mask = ((uint64_t{1} << 37) - uint64_t{1}); const auto w0 = LoadInt(in + 0 * 8); @@ -3569,7 +3664,8 @@ inline const uint8_t* unpack37_64(const uint8_t* in, uint64_t* out){ return in + (18 * 8 + 4); } -inline const uint8_t* unpack38_64(const uint8_t* in, uint64_t* out){ +template<> +const uint8_t* unpack<38>(const uint8_t* in, uint64_t* out) { constexpr uint64_t mask = ((uint64_t{1} << 38) - uint64_t{1}); const auto w0 = LoadInt(in + 0 * 8); @@ -3627,7 +3723,8 @@ inline const uint8_t* unpack38_64(const uint8_t* in, uint64_t* out){ return in + (19 * 8); } -inline const uint8_t* unpack39_64(const uint8_t* in, uint64_t* out){ +template<> +const uint8_t* unpack<39>(const uint8_t* in, uint64_t* out) { constexpr uint64_t mask = ((uint64_t{1} << 39) - uint64_t{1}); const auto w0 = LoadInt(in + 0 * 8); @@ -3686,7 +3783,8 @@ inline const uint8_t* unpack39_64(const uint8_t* in, uint64_t* out){ return in + (19 * 8 + 4); } -inline const uint8_t* unpack40_64(const uint8_t* in, uint64_t* out){ +template<> +const uint8_t* unpack<40>(const uint8_t* in, uint64_t* out) { constexpr uint64_t mask = ((uint64_t{1} << 40) - uint64_t{1}); const auto w0 = LoadInt(in + 0 * 8); @@ -3745,7 +3843,8 @@ inline const uint8_t* unpack40_64(const uint8_t* in, uint64_t* out){ return in + (20 * 8); } -inline const uint8_t* unpack41_64(const uint8_t* in, uint64_t* out){ +template<> +const uint8_t* unpack<41>(const uint8_t* in, uint64_t* out) { constexpr uint64_t mask = ((uint64_t{1} << 41) - uint64_t{1}); const auto w0 = LoadInt(in + 0 * 8); @@ -3805,7 +3904,8 @@ inline const uint8_t* unpack41_64(const uint8_t* in, uint64_t* out){ return in + (20 * 8 + 4); } -inline const uint8_t* unpack42_64(const uint8_t* in, uint64_t* out){ +template<> +const uint8_t* unpack<42>(const uint8_t* in, uint64_t* out) { constexpr uint64_t mask = ((uint64_t{1} << 42) - uint64_t{1}); const auto w0 = LoadInt(in + 0 * 8); @@ -3865,7 +3965,8 @@ inline const uint8_t* unpack42_64(const uint8_t* in, uint64_t* out){ return in + (21 * 8); } -inline const uint8_t* unpack43_64(const uint8_t* in, uint64_t* out){ +template<> +const uint8_t* unpack<43>(const uint8_t* in, uint64_t* out) { constexpr uint64_t mask = ((uint64_t{1} << 43) - uint64_t{1}); const auto w0 = LoadInt(in + 0 * 8); @@ -3926,7 +4027,8 @@ inline const uint8_t* unpack43_64(const uint8_t* in, uint64_t* out){ return in + (21 * 8 + 4); } -inline const uint8_t* unpack44_64(const uint8_t* in, uint64_t* out){ +template<> +const uint8_t* unpack<44>(const uint8_t* in, uint64_t* out) { constexpr uint64_t mask = ((uint64_t{1} << 44) - uint64_t{1}); const auto w0 = LoadInt(in + 0 * 8); @@ -3987,7 +4089,8 @@ inline const uint8_t* unpack44_64(const uint8_t* in, uint64_t* out){ return in + (22 * 8); } -inline const uint8_t* unpack45_64(const uint8_t* in, uint64_t* out){ +template<> +const uint8_t* unpack<45>(const uint8_t* in, uint64_t* out) { constexpr uint64_t mask = ((uint64_t{1} << 45) - uint64_t{1}); const auto w0 = LoadInt(in + 0 * 8); @@ -4049,7 +4152,8 @@ inline const uint8_t* unpack45_64(const uint8_t* in, uint64_t* out){ return in + (22 * 8 + 4); } -inline const uint8_t* unpack46_64(const uint8_t* in, uint64_t* out){ +template<> +const uint8_t* unpack<46>(const uint8_t* in, uint64_t* out) { constexpr uint64_t mask = ((uint64_t{1} << 46) - uint64_t{1}); const auto w0 = LoadInt(in + 0 * 8); @@ -4111,7 +4215,8 @@ inline const uint8_t* unpack46_64(const uint8_t* in, uint64_t* out){ return in + (23 * 8); } -inline const uint8_t* unpack47_64(const uint8_t* in, uint64_t* out){ +template<> +const uint8_t* unpack<47>(const uint8_t* in, uint64_t* out) { constexpr uint64_t mask = ((uint64_t{1} << 47) - uint64_t{1}); const auto w0 = LoadInt(in + 0 * 8); @@ -4174,7 +4279,8 @@ inline const uint8_t* unpack47_64(const uint8_t* in, uint64_t* out){ return in + (23 * 8 + 4); } -inline const uint8_t* unpack48_64(const uint8_t* in, uint64_t* out){ +template<> +const uint8_t* unpack<48>(const uint8_t* in, uint64_t* out) { constexpr uint64_t mask = ((uint64_t{1} << 48) - uint64_t{1}); const auto w0 = LoadInt(in + 0 * 8); @@ -4237,7 +4343,8 @@ inline const uint8_t* unpack48_64(const uint8_t* in, uint64_t* out){ return in + (24 * 8); } -inline const uint8_t* unpack49_64(const uint8_t* in, uint64_t* out){ +template<> +const uint8_t* unpack<49>(const uint8_t* in, uint64_t* out) { constexpr uint64_t mask = ((uint64_t{1} << 49) - uint64_t{1}); const auto w0 = LoadInt(in + 0 * 8); @@ -4301,7 +4408,8 @@ inline const uint8_t* unpack49_64(const uint8_t* in, uint64_t* out){ return in + (24 * 8 + 4); } -inline const uint8_t* unpack50_64(const uint8_t* in, uint64_t* out){ +template<> +const uint8_t* unpack<50>(const uint8_t* in, uint64_t* out) { constexpr uint64_t mask = ((uint64_t{1} << 50) - uint64_t{1}); const auto w0 = LoadInt(in + 0 * 8); @@ -4365,7 +4473,8 @@ inline const uint8_t* unpack50_64(const uint8_t* in, uint64_t* out){ return in + (25 * 8); } -inline const uint8_t* unpack51_64(const uint8_t* in, uint64_t* out){ +template<> +const uint8_t* unpack<51>(const uint8_t* in, uint64_t* out) { constexpr uint64_t mask = ((uint64_t{1} << 51) - uint64_t{1}); const auto w0 = LoadInt(in + 0 * 8); @@ -4430,7 +4539,8 @@ inline const uint8_t* unpack51_64(const uint8_t* in, uint64_t* out){ return in + (25 * 8 + 4); } -inline const uint8_t* unpack52_64(const uint8_t* in, uint64_t* out){ +template<> +const uint8_t* unpack<52>(const uint8_t* in, uint64_t* out) { constexpr uint64_t mask = ((uint64_t{1} << 52) - uint64_t{1}); const auto w0 = LoadInt(in + 0 * 8); @@ -4495,7 +4605,8 @@ inline const uint8_t* unpack52_64(const uint8_t* in, uint64_t* out){ return in + (26 * 8); } -inline const uint8_t* unpack53_64(const uint8_t* in, uint64_t* out){ +template<> +const uint8_t* unpack<53>(const uint8_t* in, uint64_t* out) { constexpr uint64_t mask = ((uint64_t{1} << 53) - uint64_t{1}); const auto w0 = LoadInt(in + 0 * 8); @@ -4561,7 +4672,8 @@ inline const uint8_t* unpack53_64(const uint8_t* in, uint64_t* out){ return in + (26 * 8 + 4); } -inline const uint8_t* unpack54_64(const uint8_t* in, uint64_t* out){ +template<> +const uint8_t* unpack<54>(const uint8_t* in, uint64_t* out) { constexpr uint64_t mask = ((uint64_t{1} << 54) - uint64_t{1}); const auto w0 = LoadInt(in + 0 * 8); @@ -4627,7 +4739,8 @@ inline const uint8_t* unpack54_64(const uint8_t* in, uint64_t* out){ return in + (27 * 8); } -inline const uint8_t* unpack55_64(const uint8_t* in, uint64_t* out){ +template<> +const uint8_t* unpack<55>(const uint8_t* in, uint64_t* out) { constexpr uint64_t mask = ((uint64_t{1} << 55) - uint64_t{1}); const auto w0 = LoadInt(in + 0 * 8); @@ -4694,7 +4807,8 @@ inline const uint8_t* unpack55_64(const uint8_t* in, uint64_t* out){ return in + (27 * 8 + 4); } -inline const uint8_t* unpack56_64(const uint8_t* in, uint64_t* out){ +template<> +const uint8_t* unpack<56>(const uint8_t* in, uint64_t* out) { constexpr uint64_t mask = ((uint64_t{1} << 56) - uint64_t{1}); const auto w0 = LoadInt(in + 0 * 8); @@ -4761,7 +4875,8 @@ inline const uint8_t* unpack56_64(const uint8_t* in, uint64_t* out){ return in + (28 * 8); } -inline const uint8_t* unpack57_64(const uint8_t* in, uint64_t* out){ +template<> +const uint8_t* unpack<57>(const uint8_t* in, uint64_t* out) { constexpr uint64_t mask = ((uint64_t{1} << 57) - uint64_t{1}); const auto w0 = LoadInt(in + 0 * 8); @@ -4829,7 +4944,8 @@ inline const uint8_t* unpack57_64(const uint8_t* in, uint64_t* out){ return in + (28 * 8 + 4); } -inline const uint8_t* unpack58_64(const uint8_t* in, uint64_t* out){ +template<> +const uint8_t* unpack<58>(const uint8_t* in, uint64_t* out) { constexpr uint64_t mask = ((uint64_t{1} << 58) - uint64_t{1}); const auto w0 = LoadInt(in + 0 * 8); @@ -4897,7 +5013,8 @@ inline const uint8_t* unpack58_64(const uint8_t* in, uint64_t* out){ return in + (29 * 8); } -inline const uint8_t* unpack59_64(const uint8_t* in, uint64_t* out){ +template<> +const uint8_t* unpack<59>(const uint8_t* in, uint64_t* out) { constexpr uint64_t mask = ((uint64_t{1} << 59) - uint64_t{1}); const auto w0 = LoadInt(in + 0 * 8); @@ -4966,7 +5083,8 @@ inline const uint8_t* unpack59_64(const uint8_t* in, uint64_t* out){ return in + (29 * 8 + 4); } -inline const uint8_t* unpack60_64(const uint8_t* in, uint64_t* out){ +template<> +const uint8_t* unpack<60>(const uint8_t* in, uint64_t* out) { constexpr uint64_t mask = ((uint64_t{1} << 60) - uint64_t{1}); const auto w0 = LoadInt(in + 0 * 8); @@ -5035,7 +5153,8 @@ inline const uint8_t* unpack60_64(const uint8_t* in, uint64_t* out){ return in + (30 * 8); } -inline const uint8_t* unpack61_64(const uint8_t* in, uint64_t* out){ +template<> +const uint8_t* unpack<61>(const uint8_t* in, uint64_t* out) { constexpr uint64_t mask = ((uint64_t{1} << 61) - uint64_t{1}); const auto w0 = LoadInt(in + 0 * 8); @@ -5105,7 +5224,8 @@ inline const uint8_t* unpack61_64(const uint8_t* in, uint64_t* out){ return in + (30 * 8 + 4); } -inline const uint8_t* unpack62_64(const uint8_t* in, uint64_t* out){ +template<> +const uint8_t* unpack<62>(const uint8_t* in, uint64_t* out) { constexpr uint64_t mask = ((uint64_t{1} << 62) - uint64_t{1}); const auto w0 = LoadInt(in + 0 * 8); @@ -5175,7 +5295,8 @@ inline const uint8_t* unpack62_64(const uint8_t* in, uint64_t* out){ return in + (31 * 8); } -inline const uint8_t* unpack63_64(const uint8_t* in, uint64_t* out){ +template<> +const uint8_t* unpack<63>(const uint8_t* in, uint64_t* out) { constexpr uint64_t mask = ((uint64_t{1} << 63) - uint64_t{1}); const auto w0 = LoadInt(in + 0 * 8); @@ -5246,12 +5367,14 @@ inline const uint8_t* unpack63_64(const uint8_t* in, uint64_t* out){ return in + (31 * 8 + 4); } -inline const uint8_t* unpack64_64(const uint8_t* in, uint64_t* out){ +template<> +const uint8_t* unpack<64>(const uint8_t* in, uint64_t* out) { for(int k = 0; k < 32; k += 1) { out[k] = LoadInt(in + (k * 8)); } return in + (8 * 32); } +}; // struct } // namespace arrow::internal diff --git a/cpp/src/arrow/util/bpacking_simd128_generated_internal.h b/cpp/src/arrow/util/bpacking_simd128_generated_internal.h index fc5e151eb94..933c10d9030 100644 --- a/cpp/src/arrow/util/bpacking_simd128_generated_internal.h +++ b/cpp/src/arrow/util/bpacking_simd128_generated_internal.h @@ -38,6 +38,11 @@ struct Simd128Unpacker; template<> struct Simd128Unpacker { +using out_type = uint16_t; +using simd_batch = xsimd::make_sized_batch_t; + +static constexpr int kValuesUnpacked = 16; + template static const uint8_t* unpack(const uint8_t* in, uint16_t* out); @@ -50,8 +55,6 @@ const uint8_t* unpack<0>(const uint8_t* in, uint16_t* out) { template<> const uint8_t* unpack<1>(const uint8_t* in, uint16_t* out) { - using simd_batch = xsimd::make_sized_batch_t; - constexpr uint16_t kMask = 0x1; simd_batch masks(kMask); @@ -96,8 +99,6 @@ const uint8_t* unpack<1>(const uint8_t* in, uint16_t* out) { template<> const uint8_t* unpack<2>(const uint8_t* in, uint16_t* out) { - using simd_batch = xsimd::make_sized_batch_t; - constexpr uint16_t kMask = 0x3; simd_batch masks(kMask); @@ -142,8 +143,6 @@ const uint8_t* unpack<2>(const uint8_t* in, uint16_t* out) { template<> const uint8_t* unpack<3>(const uint8_t* in, uint16_t* out) { - using simd_batch = xsimd::make_sized_batch_t; - constexpr uint16_t kMask = 0x7; simd_batch masks(kMask); @@ -188,8 +187,6 @@ const uint8_t* unpack<3>(const uint8_t* in, uint16_t* out) { template<> const uint8_t* unpack<4>(const uint8_t* in, uint16_t* out) { - using simd_batch = xsimd::make_sized_batch_t; - constexpr uint16_t kMask = 0xf; simd_batch masks(kMask); @@ -234,8 +231,6 @@ const uint8_t* unpack<4>(const uint8_t* in, uint16_t* out) { template<> const uint8_t* unpack<5>(const uint8_t* in, uint16_t* out) { - using simd_batch = xsimd::make_sized_batch_t; - constexpr uint16_t kMask = 0x1f; simd_batch masks(kMask); @@ -280,8 +275,6 @@ const uint8_t* unpack<5>(const uint8_t* in, uint16_t* out) { template<> const uint8_t* unpack<6>(const uint8_t* in, uint16_t* out) { - using simd_batch = xsimd::make_sized_batch_t; - constexpr uint16_t kMask = 0x3f; simd_batch masks(kMask); @@ -326,8 +319,6 @@ const uint8_t* unpack<6>(const uint8_t* in, uint16_t* out) { template<> const uint8_t* unpack<7>(const uint8_t* in, uint16_t* out) { - using simd_batch = xsimd::make_sized_batch_t; - constexpr uint16_t kMask = 0x7f; simd_batch masks(kMask); @@ -372,8 +363,6 @@ const uint8_t* unpack<7>(const uint8_t* in, uint16_t* out) { template<> const uint8_t* unpack<8>(const uint8_t* in, uint16_t* out) { - using simd_batch = xsimd::make_sized_batch_t; - constexpr uint16_t kMask = 0xff; simd_batch masks(kMask); @@ -418,8 +407,6 @@ const uint8_t* unpack<8>(const uint8_t* in, uint16_t* out) { template<> const uint8_t* unpack<9>(const uint8_t* in, uint16_t* out) { - using simd_batch = xsimd::make_sized_batch_t; - constexpr uint16_t kMask = 0x1ff; simd_batch masks(kMask); @@ -464,8 +451,6 @@ const uint8_t* unpack<9>(const uint8_t* in, uint16_t* out) { template<> const uint8_t* unpack<10>(const uint8_t* in, uint16_t* out) { - using simd_batch = xsimd::make_sized_batch_t; - constexpr uint16_t kMask = 0x3ff; simd_batch masks(kMask); @@ -510,8 +495,6 @@ const uint8_t* unpack<10>(const uint8_t* in, uint16_t* out) { template<> const uint8_t* unpack<11>(const uint8_t* in, uint16_t* out) { - using simd_batch = xsimd::make_sized_batch_t; - constexpr uint16_t kMask = 0x7ff; simd_batch masks(kMask); @@ -556,8 +539,6 @@ const uint8_t* unpack<11>(const uint8_t* in, uint16_t* out) { template<> const uint8_t* unpack<12>(const uint8_t* in, uint16_t* out) { - using simd_batch = xsimd::make_sized_batch_t; - constexpr uint16_t kMask = 0xfff; simd_batch masks(kMask); @@ -602,8 +583,6 @@ const uint8_t* unpack<12>(const uint8_t* in, uint16_t* out) { template<> const uint8_t* unpack<13>(const uint8_t* in, uint16_t* out) { - using simd_batch = xsimd::make_sized_batch_t; - constexpr uint16_t kMask = 0x1fff; simd_batch masks(kMask); @@ -648,8 +627,6 @@ const uint8_t* unpack<13>(const uint8_t* in, uint16_t* out) { template<> const uint8_t* unpack<14>(const uint8_t* in, uint16_t* out) { - using simd_batch = xsimd::make_sized_batch_t; - constexpr uint16_t kMask = 0x3fff; simd_batch masks(kMask); @@ -694,8 +671,6 @@ const uint8_t* unpack<14>(const uint8_t* in, uint16_t* out) { template<> const uint8_t* unpack<15>(const uint8_t* in, uint16_t* out) { - using simd_batch = xsimd::make_sized_batch_t; - constexpr uint16_t kMask = 0x7fff; simd_batch masks(kMask); @@ -750,6 +725,11 @@ const uint8_t* unpack<16>(const uint8_t* in, uint16_t* out) { template<> struct Simd128Unpacker { +using out_type = uint32_t; +using simd_batch = xsimd::make_sized_batch_t; + +static constexpr int kValuesUnpacked = 32; + template static const uint8_t* unpack(const uint8_t* in, uint32_t* out); @@ -762,8 +742,6 @@ const uint8_t* unpack<0>(const uint8_t* in, uint32_t* out) { template<> const uint8_t* unpack<1>(const uint8_t* in, uint32_t* out) { - using simd_batch = xsimd::make_sized_batch_t; - constexpr uint32_t kMask = 0x1; simd_batch masks(kMask); @@ -872,8 +850,6 @@ const uint8_t* unpack<1>(const uint8_t* in, uint32_t* out) { template<> const uint8_t* unpack<2>(const uint8_t* in, uint32_t* out) { - using simd_batch = xsimd::make_sized_batch_t; - constexpr uint32_t kMask = 0x3; simd_batch masks(kMask); @@ -982,8 +958,6 @@ const uint8_t* unpack<2>(const uint8_t* in, uint32_t* out) { template<> const uint8_t* unpack<3>(const uint8_t* in, uint32_t* out) { - using simd_batch = xsimd::make_sized_batch_t; - constexpr uint32_t kMask = 0x7; simd_batch masks(kMask); @@ -1092,8 +1066,6 @@ const uint8_t* unpack<3>(const uint8_t* in, uint32_t* out) { template<> const uint8_t* unpack<4>(const uint8_t* in, uint32_t* out) { - using simd_batch = xsimd::make_sized_batch_t; - constexpr uint32_t kMask = 0xf; simd_batch masks(kMask); @@ -1202,8 +1174,6 @@ const uint8_t* unpack<4>(const uint8_t* in, uint32_t* out) { template<> const uint8_t* unpack<5>(const uint8_t* in, uint32_t* out) { - using simd_batch = xsimd::make_sized_batch_t; - constexpr uint32_t kMask = 0x1f; simd_batch masks(kMask); @@ -1312,8 +1282,6 @@ const uint8_t* unpack<5>(const uint8_t* in, uint32_t* out) { template<> const uint8_t* unpack<6>(const uint8_t* in, uint32_t* out) { - using simd_batch = xsimd::make_sized_batch_t; - constexpr uint32_t kMask = 0x3f; simd_batch masks(kMask); @@ -1422,8 +1390,6 @@ const uint8_t* unpack<6>(const uint8_t* in, uint32_t* out) { template<> const uint8_t* unpack<7>(const uint8_t* in, uint32_t* out) { - using simd_batch = xsimd::make_sized_batch_t; - constexpr uint32_t kMask = 0x7f; simd_batch masks(kMask); @@ -1532,8 +1498,6 @@ const uint8_t* unpack<7>(const uint8_t* in, uint32_t* out) { template<> const uint8_t* unpack<8>(const uint8_t* in, uint32_t* out) { - using simd_batch = xsimd::make_sized_batch_t; - constexpr uint32_t kMask = 0xff; simd_batch masks(kMask); @@ -1642,8 +1606,6 @@ const uint8_t* unpack<8>(const uint8_t* in, uint32_t* out) { template<> const uint8_t* unpack<9>(const uint8_t* in, uint32_t* out) { - using simd_batch = xsimd::make_sized_batch_t; - constexpr uint32_t kMask = 0x1ff; simd_batch masks(kMask); @@ -1752,8 +1714,6 @@ const uint8_t* unpack<9>(const uint8_t* in, uint32_t* out) { template<> const uint8_t* unpack<10>(const uint8_t* in, uint32_t* out) { - using simd_batch = xsimd::make_sized_batch_t; - constexpr uint32_t kMask = 0x3ff; simd_batch masks(kMask); @@ -1862,8 +1822,6 @@ const uint8_t* unpack<10>(const uint8_t* in, uint32_t* out) { template<> const uint8_t* unpack<11>(const uint8_t* in, uint32_t* out) { - using simd_batch = xsimd::make_sized_batch_t; - constexpr uint32_t kMask = 0x7ff; simd_batch masks(kMask); @@ -1972,8 +1930,6 @@ const uint8_t* unpack<11>(const uint8_t* in, uint32_t* out) { template<> const uint8_t* unpack<12>(const uint8_t* in, uint32_t* out) { - using simd_batch = xsimd::make_sized_batch_t; - constexpr uint32_t kMask = 0xfff; simd_batch masks(kMask); @@ -2082,8 +2038,6 @@ const uint8_t* unpack<12>(const uint8_t* in, uint32_t* out) { template<> const uint8_t* unpack<13>(const uint8_t* in, uint32_t* out) { - using simd_batch = xsimd::make_sized_batch_t; - constexpr uint32_t kMask = 0x1fff; simd_batch masks(kMask); @@ -2192,8 +2146,6 @@ const uint8_t* unpack<13>(const uint8_t* in, uint32_t* out) { template<> const uint8_t* unpack<14>(const uint8_t* in, uint32_t* out) { - using simd_batch = xsimd::make_sized_batch_t; - constexpr uint32_t kMask = 0x3fff; simd_batch masks(kMask); @@ -2302,8 +2254,6 @@ const uint8_t* unpack<14>(const uint8_t* in, uint32_t* out) { template<> const uint8_t* unpack<15>(const uint8_t* in, uint32_t* out) { - using simd_batch = xsimd::make_sized_batch_t; - constexpr uint32_t kMask = 0x7fff; simd_batch masks(kMask); @@ -2412,8 +2362,6 @@ const uint8_t* unpack<15>(const uint8_t* in, uint32_t* out) { template<> const uint8_t* unpack<16>(const uint8_t* in, uint32_t* out) { - using simd_batch = xsimd::make_sized_batch_t; - constexpr uint32_t kMask = 0xffff; simd_batch masks(kMask); @@ -2522,8 +2470,6 @@ const uint8_t* unpack<16>(const uint8_t* in, uint32_t* out) { template<> const uint8_t* unpack<17>(const uint8_t* in, uint32_t* out) { - using simd_batch = xsimd::make_sized_batch_t; - constexpr uint32_t kMask = 0x1ffff; simd_batch masks(kMask); @@ -2632,8 +2578,6 @@ const uint8_t* unpack<17>(const uint8_t* in, uint32_t* out) { template<> const uint8_t* unpack<18>(const uint8_t* in, uint32_t* out) { - using simd_batch = xsimd::make_sized_batch_t; - constexpr uint32_t kMask = 0x3ffff; simd_batch masks(kMask); @@ -2742,8 +2686,6 @@ const uint8_t* unpack<18>(const uint8_t* in, uint32_t* out) { template<> const uint8_t* unpack<19>(const uint8_t* in, uint32_t* out) { - using simd_batch = xsimd::make_sized_batch_t; - constexpr uint32_t kMask = 0x7ffff; simd_batch masks(kMask); @@ -2852,8 +2794,6 @@ const uint8_t* unpack<19>(const uint8_t* in, uint32_t* out) { template<> const uint8_t* unpack<20>(const uint8_t* in, uint32_t* out) { - using simd_batch = xsimd::make_sized_batch_t; - constexpr uint32_t kMask = 0xfffff; simd_batch masks(kMask); @@ -2962,8 +2902,6 @@ const uint8_t* unpack<20>(const uint8_t* in, uint32_t* out) { template<> const uint8_t* unpack<21>(const uint8_t* in, uint32_t* out) { - using simd_batch = xsimd::make_sized_batch_t; - constexpr uint32_t kMask = 0x1fffff; simd_batch masks(kMask); @@ -3072,8 +3010,6 @@ const uint8_t* unpack<21>(const uint8_t* in, uint32_t* out) { template<> const uint8_t* unpack<22>(const uint8_t* in, uint32_t* out) { - using simd_batch = xsimd::make_sized_batch_t; - constexpr uint32_t kMask = 0x3fffff; simd_batch masks(kMask); @@ -3182,8 +3118,6 @@ const uint8_t* unpack<22>(const uint8_t* in, uint32_t* out) { template<> const uint8_t* unpack<23>(const uint8_t* in, uint32_t* out) { - using simd_batch = xsimd::make_sized_batch_t; - constexpr uint32_t kMask = 0x7fffff; simd_batch masks(kMask); @@ -3292,8 +3226,6 @@ const uint8_t* unpack<23>(const uint8_t* in, uint32_t* out) { template<> const uint8_t* unpack<24>(const uint8_t* in, uint32_t* out) { - using simd_batch = xsimd::make_sized_batch_t; - constexpr uint32_t kMask = 0xffffff; simd_batch masks(kMask); @@ -3402,8 +3334,6 @@ const uint8_t* unpack<24>(const uint8_t* in, uint32_t* out) { template<> const uint8_t* unpack<25>(const uint8_t* in, uint32_t* out) { - using simd_batch = xsimd::make_sized_batch_t; - constexpr uint32_t kMask = 0x1ffffff; simd_batch masks(kMask); @@ -3512,8 +3442,6 @@ const uint8_t* unpack<25>(const uint8_t* in, uint32_t* out) { template<> const uint8_t* unpack<26>(const uint8_t* in, uint32_t* out) { - using simd_batch = xsimd::make_sized_batch_t; - constexpr uint32_t kMask = 0x3ffffff; simd_batch masks(kMask); @@ -3622,8 +3550,6 @@ const uint8_t* unpack<26>(const uint8_t* in, uint32_t* out) { template<> const uint8_t* unpack<27>(const uint8_t* in, uint32_t* out) { - using simd_batch = xsimd::make_sized_batch_t; - constexpr uint32_t kMask = 0x7ffffff; simd_batch masks(kMask); @@ -3732,8 +3658,6 @@ const uint8_t* unpack<27>(const uint8_t* in, uint32_t* out) { template<> const uint8_t* unpack<28>(const uint8_t* in, uint32_t* out) { - using simd_batch = xsimd::make_sized_batch_t; - constexpr uint32_t kMask = 0xfffffff; simd_batch masks(kMask); @@ -3842,8 +3766,6 @@ const uint8_t* unpack<28>(const uint8_t* in, uint32_t* out) { template<> const uint8_t* unpack<29>(const uint8_t* in, uint32_t* out) { - using simd_batch = xsimd::make_sized_batch_t; - constexpr uint32_t kMask = 0x1fffffff; simd_batch masks(kMask); @@ -3952,8 +3874,6 @@ const uint8_t* unpack<29>(const uint8_t* in, uint32_t* out) { template<> const uint8_t* unpack<30>(const uint8_t* in, uint32_t* out) { - using simd_batch = xsimd::make_sized_batch_t; - constexpr uint32_t kMask = 0x3fffffff; simd_batch masks(kMask); @@ -4062,8 +3982,6 @@ const uint8_t* unpack<30>(const uint8_t* in, uint32_t* out) { template<> const uint8_t* unpack<31>(const uint8_t* in, uint32_t* out) { - using simd_batch = xsimd::make_sized_batch_t; - constexpr uint32_t kMask = 0x7fffffff; simd_batch masks(kMask); @@ -4179,6 +4097,4 @@ const uint8_t* unpack<32>(const uint8_t* in, uint32_t* out) { } }; // struct Unpacker - } // namespace arrow::internal - diff --git a/cpp/src/arrow/util/bpacking_simd256_generated_internal.h b/cpp/src/arrow/util/bpacking_simd256_generated_internal.h index 069c44548b4..be522a486b6 100644 --- a/cpp/src/arrow/util/bpacking_simd256_generated_internal.h +++ b/cpp/src/arrow/util/bpacking_simd256_generated_internal.h @@ -38,6 +38,11 @@ struct Simd256Unpacker; template<> struct Simd256Unpacker { +using out_type = uint16_t; +using simd_batch = xsimd::make_sized_batch_t; + +static constexpr int kValuesUnpacked = 16; + template static const uint8_t* unpack(const uint8_t* in, uint16_t* out); @@ -50,8 +55,6 @@ const uint8_t* unpack<0>(const uint8_t* in, uint16_t* out) { template<> const uint8_t* unpack<1>(const uint8_t* in, uint16_t* out) { - using simd_batch = xsimd::make_sized_batch_t; - constexpr uint16_t kMask = 0x1; simd_batch masks(kMask); @@ -88,8 +91,6 @@ const uint8_t* unpack<1>(const uint8_t* in, uint16_t* out) { template<> const uint8_t* unpack<2>(const uint8_t* in, uint16_t* out) { - using simd_batch = xsimd::make_sized_batch_t; - constexpr uint16_t kMask = 0x3; simd_batch masks(kMask); @@ -126,8 +127,6 @@ const uint8_t* unpack<2>(const uint8_t* in, uint16_t* out) { template<> const uint8_t* unpack<3>(const uint8_t* in, uint16_t* out) { - using simd_batch = xsimd::make_sized_batch_t; - constexpr uint16_t kMask = 0x7; simd_batch masks(kMask); @@ -164,8 +163,6 @@ const uint8_t* unpack<3>(const uint8_t* in, uint16_t* out) { template<> const uint8_t* unpack<4>(const uint8_t* in, uint16_t* out) { - using simd_batch = xsimd::make_sized_batch_t; - constexpr uint16_t kMask = 0xf; simd_batch masks(kMask); @@ -202,8 +199,6 @@ const uint8_t* unpack<4>(const uint8_t* in, uint16_t* out) { template<> const uint8_t* unpack<5>(const uint8_t* in, uint16_t* out) { - using simd_batch = xsimd::make_sized_batch_t; - constexpr uint16_t kMask = 0x1f; simd_batch masks(kMask); @@ -240,8 +235,6 @@ const uint8_t* unpack<5>(const uint8_t* in, uint16_t* out) { template<> const uint8_t* unpack<6>(const uint8_t* in, uint16_t* out) { - using simd_batch = xsimd::make_sized_batch_t; - constexpr uint16_t kMask = 0x3f; simd_batch masks(kMask); @@ -278,8 +271,6 @@ const uint8_t* unpack<6>(const uint8_t* in, uint16_t* out) { template<> const uint8_t* unpack<7>(const uint8_t* in, uint16_t* out) { - using simd_batch = xsimd::make_sized_batch_t; - constexpr uint16_t kMask = 0x7f; simd_batch masks(kMask); @@ -316,8 +307,6 @@ const uint8_t* unpack<7>(const uint8_t* in, uint16_t* out) { template<> const uint8_t* unpack<8>(const uint8_t* in, uint16_t* out) { - using simd_batch = xsimd::make_sized_batch_t; - constexpr uint16_t kMask = 0xff; simd_batch masks(kMask); @@ -354,8 +343,6 @@ const uint8_t* unpack<8>(const uint8_t* in, uint16_t* out) { template<> const uint8_t* unpack<9>(const uint8_t* in, uint16_t* out) { - using simd_batch = xsimd::make_sized_batch_t; - constexpr uint16_t kMask = 0x1ff; simd_batch masks(kMask); @@ -392,8 +379,6 @@ const uint8_t* unpack<9>(const uint8_t* in, uint16_t* out) { template<> const uint8_t* unpack<10>(const uint8_t* in, uint16_t* out) { - using simd_batch = xsimd::make_sized_batch_t; - constexpr uint16_t kMask = 0x3ff; simd_batch masks(kMask); @@ -430,8 +415,6 @@ const uint8_t* unpack<10>(const uint8_t* in, uint16_t* out) { template<> const uint8_t* unpack<11>(const uint8_t* in, uint16_t* out) { - using simd_batch = xsimd::make_sized_batch_t; - constexpr uint16_t kMask = 0x7ff; simd_batch masks(kMask); @@ -468,8 +451,6 @@ const uint8_t* unpack<11>(const uint8_t* in, uint16_t* out) { template<> const uint8_t* unpack<12>(const uint8_t* in, uint16_t* out) { - using simd_batch = xsimd::make_sized_batch_t; - constexpr uint16_t kMask = 0xfff; simd_batch masks(kMask); @@ -506,8 +487,6 @@ const uint8_t* unpack<12>(const uint8_t* in, uint16_t* out) { template<> const uint8_t* unpack<13>(const uint8_t* in, uint16_t* out) { - using simd_batch = xsimd::make_sized_batch_t; - constexpr uint16_t kMask = 0x1fff; simd_batch masks(kMask); @@ -544,8 +523,6 @@ const uint8_t* unpack<13>(const uint8_t* in, uint16_t* out) { template<> const uint8_t* unpack<14>(const uint8_t* in, uint16_t* out) { - using simd_batch = xsimd::make_sized_batch_t; - constexpr uint16_t kMask = 0x3fff; simd_batch masks(kMask); @@ -582,8 +559,6 @@ const uint8_t* unpack<14>(const uint8_t* in, uint16_t* out) { template<> const uint8_t* unpack<15>(const uint8_t* in, uint16_t* out) { - using simd_batch = xsimd::make_sized_batch_t; - constexpr uint16_t kMask = 0x7fff; simd_batch masks(kMask); @@ -630,6 +605,11 @@ const uint8_t* unpack<16>(const uint8_t* in, uint16_t* out) { template<> struct Simd256Unpacker { +using out_type = uint32_t; +using simd_batch = xsimd::make_sized_batch_t; + +static constexpr int kValuesUnpacked = 32; + template static const uint8_t* unpack(const uint8_t* in, uint32_t* out); @@ -642,8 +622,6 @@ const uint8_t* unpack<0>(const uint8_t* in, uint32_t* out) { template<> const uint8_t* unpack<1>(const uint8_t* in, uint32_t* out) { - using simd_batch = xsimd::make_sized_batch_t; - constexpr uint32_t kMask = 0x1; simd_batch masks(kMask); @@ -720,8 +698,6 @@ const uint8_t* unpack<1>(const uint8_t* in, uint32_t* out) { template<> const uint8_t* unpack<2>(const uint8_t* in, uint32_t* out) { - using simd_batch = xsimd::make_sized_batch_t; - constexpr uint32_t kMask = 0x3; simd_batch masks(kMask); @@ -798,8 +774,6 @@ const uint8_t* unpack<2>(const uint8_t* in, uint32_t* out) { template<> const uint8_t* unpack<3>(const uint8_t* in, uint32_t* out) { - using simd_batch = xsimd::make_sized_batch_t; - constexpr uint32_t kMask = 0x7; simd_batch masks(kMask); @@ -876,8 +850,6 @@ const uint8_t* unpack<3>(const uint8_t* in, uint32_t* out) { template<> const uint8_t* unpack<4>(const uint8_t* in, uint32_t* out) { - using simd_batch = xsimd::make_sized_batch_t; - constexpr uint32_t kMask = 0xf; simd_batch masks(kMask); @@ -954,8 +926,6 @@ const uint8_t* unpack<4>(const uint8_t* in, uint32_t* out) { template<> const uint8_t* unpack<5>(const uint8_t* in, uint32_t* out) { - using simd_batch = xsimd::make_sized_batch_t; - constexpr uint32_t kMask = 0x1f; simd_batch masks(kMask); @@ -1032,8 +1002,6 @@ const uint8_t* unpack<5>(const uint8_t* in, uint32_t* out) { template<> const uint8_t* unpack<6>(const uint8_t* in, uint32_t* out) { - using simd_batch = xsimd::make_sized_batch_t; - constexpr uint32_t kMask = 0x3f; simd_batch masks(kMask); @@ -1110,8 +1078,6 @@ const uint8_t* unpack<6>(const uint8_t* in, uint32_t* out) { template<> const uint8_t* unpack<7>(const uint8_t* in, uint32_t* out) { - using simd_batch = xsimd::make_sized_batch_t; - constexpr uint32_t kMask = 0x7f; simd_batch masks(kMask); @@ -1188,8 +1154,6 @@ const uint8_t* unpack<7>(const uint8_t* in, uint32_t* out) { template<> const uint8_t* unpack<8>(const uint8_t* in, uint32_t* out) { - using simd_batch = xsimd::make_sized_batch_t; - constexpr uint32_t kMask = 0xff; simd_batch masks(kMask); @@ -1266,8 +1230,6 @@ const uint8_t* unpack<8>(const uint8_t* in, uint32_t* out) { template<> const uint8_t* unpack<9>(const uint8_t* in, uint32_t* out) { - using simd_batch = xsimd::make_sized_batch_t; - constexpr uint32_t kMask = 0x1ff; simd_batch masks(kMask); @@ -1344,8 +1306,6 @@ const uint8_t* unpack<9>(const uint8_t* in, uint32_t* out) { template<> const uint8_t* unpack<10>(const uint8_t* in, uint32_t* out) { - using simd_batch = xsimd::make_sized_batch_t; - constexpr uint32_t kMask = 0x3ff; simd_batch masks(kMask); @@ -1422,8 +1382,6 @@ const uint8_t* unpack<10>(const uint8_t* in, uint32_t* out) { template<> const uint8_t* unpack<11>(const uint8_t* in, uint32_t* out) { - using simd_batch = xsimd::make_sized_batch_t; - constexpr uint32_t kMask = 0x7ff; simd_batch masks(kMask); @@ -1500,8 +1458,6 @@ const uint8_t* unpack<11>(const uint8_t* in, uint32_t* out) { template<> const uint8_t* unpack<12>(const uint8_t* in, uint32_t* out) { - using simd_batch = xsimd::make_sized_batch_t; - constexpr uint32_t kMask = 0xfff; simd_batch masks(kMask); @@ -1578,8 +1534,6 @@ const uint8_t* unpack<12>(const uint8_t* in, uint32_t* out) { template<> const uint8_t* unpack<13>(const uint8_t* in, uint32_t* out) { - using simd_batch = xsimd::make_sized_batch_t; - constexpr uint32_t kMask = 0x1fff; simd_batch masks(kMask); @@ -1656,8 +1610,6 @@ const uint8_t* unpack<13>(const uint8_t* in, uint32_t* out) { template<> const uint8_t* unpack<14>(const uint8_t* in, uint32_t* out) { - using simd_batch = xsimd::make_sized_batch_t; - constexpr uint32_t kMask = 0x3fff; simd_batch masks(kMask); @@ -1734,8 +1686,6 @@ const uint8_t* unpack<14>(const uint8_t* in, uint32_t* out) { template<> const uint8_t* unpack<15>(const uint8_t* in, uint32_t* out) { - using simd_batch = xsimd::make_sized_batch_t; - constexpr uint32_t kMask = 0x7fff; simd_batch masks(kMask); @@ -1812,8 +1762,6 @@ const uint8_t* unpack<15>(const uint8_t* in, uint32_t* out) { template<> const uint8_t* unpack<16>(const uint8_t* in, uint32_t* out) { - using simd_batch = xsimd::make_sized_batch_t; - constexpr uint32_t kMask = 0xffff; simd_batch masks(kMask); @@ -1890,8 +1838,6 @@ const uint8_t* unpack<16>(const uint8_t* in, uint32_t* out) { template<> const uint8_t* unpack<17>(const uint8_t* in, uint32_t* out) { - using simd_batch = xsimd::make_sized_batch_t; - constexpr uint32_t kMask = 0x1ffff; simd_batch masks(kMask); @@ -1968,8 +1914,6 @@ const uint8_t* unpack<17>(const uint8_t* in, uint32_t* out) { template<> const uint8_t* unpack<18>(const uint8_t* in, uint32_t* out) { - using simd_batch = xsimd::make_sized_batch_t; - constexpr uint32_t kMask = 0x3ffff; simd_batch masks(kMask); @@ -2046,8 +1990,6 @@ const uint8_t* unpack<18>(const uint8_t* in, uint32_t* out) { template<> const uint8_t* unpack<19>(const uint8_t* in, uint32_t* out) { - using simd_batch = xsimd::make_sized_batch_t; - constexpr uint32_t kMask = 0x7ffff; simd_batch masks(kMask); @@ -2124,8 +2066,6 @@ const uint8_t* unpack<19>(const uint8_t* in, uint32_t* out) { template<> const uint8_t* unpack<20>(const uint8_t* in, uint32_t* out) { - using simd_batch = xsimd::make_sized_batch_t; - constexpr uint32_t kMask = 0xfffff; simd_batch masks(kMask); @@ -2202,8 +2142,6 @@ const uint8_t* unpack<20>(const uint8_t* in, uint32_t* out) { template<> const uint8_t* unpack<21>(const uint8_t* in, uint32_t* out) { - using simd_batch = xsimd::make_sized_batch_t; - constexpr uint32_t kMask = 0x1fffff; simd_batch masks(kMask); @@ -2280,8 +2218,6 @@ const uint8_t* unpack<21>(const uint8_t* in, uint32_t* out) { template<> const uint8_t* unpack<22>(const uint8_t* in, uint32_t* out) { - using simd_batch = xsimd::make_sized_batch_t; - constexpr uint32_t kMask = 0x3fffff; simd_batch masks(kMask); @@ -2358,8 +2294,6 @@ const uint8_t* unpack<22>(const uint8_t* in, uint32_t* out) { template<> const uint8_t* unpack<23>(const uint8_t* in, uint32_t* out) { - using simd_batch = xsimd::make_sized_batch_t; - constexpr uint32_t kMask = 0x7fffff; simd_batch masks(kMask); @@ -2436,8 +2370,6 @@ const uint8_t* unpack<23>(const uint8_t* in, uint32_t* out) { template<> const uint8_t* unpack<24>(const uint8_t* in, uint32_t* out) { - using simd_batch = xsimd::make_sized_batch_t; - constexpr uint32_t kMask = 0xffffff; simd_batch masks(kMask); @@ -2514,8 +2446,6 @@ const uint8_t* unpack<24>(const uint8_t* in, uint32_t* out) { template<> const uint8_t* unpack<25>(const uint8_t* in, uint32_t* out) { - using simd_batch = xsimd::make_sized_batch_t; - constexpr uint32_t kMask = 0x1ffffff; simd_batch masks(kMask); @@ -2592,8 +2522,6 @@ const uint8_t* unpack<25>(const uint8_t* in, uint32_t* out) { template<> const uint8_t* unpack<26>(const uint8_t* in, uint32_t* out) { - using simd_batch = xsimd::make_sized_batch_t; - constexpr uint32_t kMask = 0x3ffffff; simd_batch masks(kMask); @@ -2670,8 +2598,6 @@ const uint8_t* unpack<26>(const uint8_t* in, uint32_t* out) { template<> const uint8_t* unpack<27>(const uint8_t* in, uint32_t* out) { - using simd_batch = xsimd::make_sized_batch_t; - constexpr uint32_t kMask = 0x7ffffff; simd_batch masks(kMask); @@ -2748,8 +2674,6 @@ const uint8_t* unpack<27>(const uint8_t* in, uint32_t* out) { template<> const uint8_t* unpack<28>(const uint8_t* in, uint32_t* out) { - using simd_batch = xsimd::make_sized_batch_t; - constexpr uint32_t kMask = 0xfffffff; simd_batch masks(kMask); @@ -2826,8 +2750,6 @@ const uint8_t* unpack<28>(const uint8_t* in, uint32_t* out) { template<> const uint8_t* unpack<29>(const uint8_t* in, uint32_t* out) { - using simd_batch = xsimd::make_sized_batch_t; - constexpr uint32_t kMask = 0x1fffffff; simd_batch masks(kMask); @@ -2904,8 +2826,6 @@ const uint8_t* unpack<29>(const uint8_t* in, uint32_t* out) { template<> const uint8_t* unpack<30>(const uint8_t* in, uint32_t* out) { - using simd_batch = xsimd::make_sized_batch_t; - constexpr uint32_t kMask = 0x3fffffff; simd_batch masks(kMask); @@ -2982,8 +2902,6 @@ const uint8_t* unpack<30>(const uint8_t* in, uint32_t* out) { template<> const uint8_t* unpack<31>(const uint8_t* in, uint32_t* out) { - using simd_batch = xsimd::make_sized_batch_t; - constexpr uint32_t kMask = 0x7fffffff; simd_batch masks(kMask); @@ -3067,6 +2985,4 @@ const uint8_t* unpack<32>(const uint8_t* in, uint32_t* out) { } }; // struct Unpacker - } // namespace arrow::internal - diff --git a/cpp/src/arrow/util/bpacking_simd512_generated_internal.h b/cpp/src/arrow/util/bpacking_simd512_generated_internal.h index 66564225dc5..3fb06b5709c 100644 --- a/cpp/src/arrow/util/bpacking_simd512_generated_internal.h +++ b/cpp/src/arrow/util/bpacking_simd512_generated_internal.h @@ -38,6 +38,11 @@ struct Simd512Unpacker; template<> struct Simd512Unpacker { +using out_type = uint16_t; +using simd_batch = xsimd::make_sized_batch_t; + +static constexpr int kValuesUnpacked = 16; + template static const uint8_t* unpack(const uint8_t* in, uint16_t* out); @@ -50,8 +55,6 @@ const uint8_t* unpack<0>(const uint8_t* in, uint16_t* out) { template<> const uint8_t* unpack<1>(const uint8_t* in, uint16_t* out) { - using simd_batch = xsimd::make_sized_batch_t; - constexpr uint16_t kMask = 0x1; simd_batch masks(kMask); @@ -88,8 +91,6 @@ const uint8_t* unpack<1>(const uint8_t* in, uint16_t* out) { template<> const uint8_t* unpack<2>(const uint8_t* in, uint16_t* out) { - using simd_batch = xsimd::make_sized_batch_t; - constexpr uint16_t kMask = 0x3; simd_batch masks(kMask); @@ -126,8 +127,6 @@ const uint8_t* unpack<2>(const uint8_t* in, uint16_t* out) { template<> const uint8_t* unpack<3>(const uint8_t* in, uint16_t* out) { - using simd_batch = xsimd::make_sized_batch_t; - constexpr uint16_t kMask = 0x7; simd_batch masks(kMask); @@ -164,8 +163,6 @@ const uint8_t* unpack<3>(const uint8_t* in, uint16_t* out) { template<> const uint8_t* unpack<4>(const uint8_t* in, uint16_t* out) { - using simd_batch = xsimd::make_sized_batch_t; - constexpr uint16_t kMask = 0xf; simd_batch masks(kMask); @@ -202,8 +199,6 @@ const uint8_t* unpack<4>(const uint8_t* in, uint16_t* out) { template<> const uint8_t* unpack<5>(const uint8_t* in, uint16_t* out) { - using simd_batch = xsimd::make_sized_batch_t; - constexpr uint16_t kMask = 0x1f; simd_batch masks(kMask); @@ -240,8 +235,6 @@ const uint8_t* unpack<5>(const uint8_t* in, uint16_t* out) { template<> const uint8_t* unpack<6>(const uint8_t* in, uint16_t* out) { - using simd_batch = xsimd::make_sized_batch_t; - constexpr uint16_t kMask = 0x3f; simd_batch masks(kMask); @@ -278,8 +271,6 @@ const uint8_t* unpack<6>(const uint8_t* in, uint16_t* out) { template<> const uint8_t* unpack<7>(const uint8_t* in, uint16_t* out) { - using simd_batch = xsimd::make_sized_batch_t; - constexpr uint16_t kMask = 0x7f; simd_batch masks(kMask); @@ -316,8 +307,6 @@ const uint8_t* unpack<7>(const uint8_t* in, uint16_t* out) { template<> const uint8_t* unpack<8>(const uint8_t* in, uint16_t* out) { - using simd_batch = xsimd::make_sized_batch_t; - constexpr uint16_t kMask = 0xff; simd_batch masks(kMask); @@ -354,8 +343,6 @@ const uint8_t* unpack<8>(const uint8_t* in, uint16_t* out) { template<> const uint8_t* unpack<9>(const uint8_t* in, uint16_t* out) { - using simd_batch = xsimd::make_sized_batch_t; - constexpr uint16_t kMask = 0x1ff; simd_batch masks(kMask); @@ -392,8 +379,6 @@ const uint8_t* unpack<9>(const uint8_t* in, uint16_t* out) { template<> const uint8_t* unpack<10>(const uint8_t* in, uint16_t* out) { - using simd_batch = xsimd::make_sized_batch_t; - constexpr uint16_t kMask = 0x3ff; simd_batch masks(kMask); @@ -430,8 +415,6 @@ const uint8_t* unpack<10>(const uint8_t* in, uint16_t* out) { template<> const uint8_t* unpack<11>(const uint8_t* in, uint16_t* out) { - using simd_batch = xsimd::make_sized_batch_t; - constexpr uint16_t kMask = 0x7ff; simd_batch masks(kMask); @@ -468,8 +451,6 @@ const uint8_t* unpack<11>(const uint8_t* in, uint16_t* out) { template<> const uint8_t* unpack<12>(const uint8_t* in, uint16_t* out) { - using simd_batch = xsimd::make_sized_batch_t; - constexpr uint16_t kMask = 0xfff; simd_batch masks(kMask); @@ -506,8 +487,6 @@ const uint8_t* unpack<12>(const uint8_t* in, uint16_t* out) { template<> const uint8_t* unpack<13>(const uint8_t* in, uint16_t* out) { - using simd_batch = xsimd::make_sized_batch_t; - constexpr uint16_t kMask = 0x1fff; simd_batch masks(kMask); @@ -544,8 +523,6 @@ const uint8_t* unpack<13>(const uint8_t* in, uint16_t* out) { template<> const uint8_t* unpack<14>(const uint8_t* in, uint16_t* out) { - using simd_batch = xsimd::make_sized_batch_t; - constexpr uint16_t kMask = 0x3fff; simd_batch masks(kMask); @@ -582,8 +559,6 @@ const uint8_t* unpack<14>(const uint8_t* in, uint16_t* out) { template<> const uint8_t* unpack<15>(const uint8_t* in, uint16_t* out) { - using simd_batch = xsimd::make_sized_batch_t; - constexpr uint16_t kMask = 0x7fff; simd_batch masks(kMask); @@ -630,6 +605,11 @@ const uint8_t* unpack<16>(const uint8_t* in, uint16_t* out) { template<> struct Simd512Unpacker { +using out_type = uint32_t; +using simd_batch = xsimd::make_sized_batch_t; + +static constexpr int kValuesUnpacked = 32; + template static const uint8_t* unpack(const uint8_t* in, uint32_t* out); @@ -642,8 +622,6 @@ const uint8_t* unpack<0>(const uint8_t* in, uint32_t* out) { template<> const uint8_t* unpack<1>(const uint8_t* in, uint32_t* out) { - using simd_batch = xsimd::make_sized_batch_t; - constexpr uint32_t kMask = 0x1; simd_batch masks(kMask); @@ -704,8 +682,6 @@ const uint8_t* unpack<1>(const uint8_t* in, uint32_t* out) { template<> const uint8_t* unpack<2>(const uint8_t* in, uint32_t* out) { - using simd_batch = xsimd::make_sized_batch_t; - constexpr uint32_t kMask = 0x3; simd_batch masks(kMask); @@ -766,8 +742,6 @@ const uint8_t* unpack<2>(const uint8_t* in, uint32_t* out) { template<> const uint8_t* unpack<3>(const uint8_t* in, uint32_t* out) { - using simd_batch = xsimd::make_sized_batch_t; - constexpr uint32_t kMask = 0x7; simd_batch masks(kMask); @@ -828,8 +802,6 @@ const uint8_t* unpack<3>(const uint8_t* in, uint32_t* out) { template<> const uint8_t* unpack<4>(const uint8_t* in, uint32_t* out) { - using simd_batch = xsimd::make_sized_batch_t; - constexpr uint32_t kMask = 0xf; simd_batch masks(kMask); @@ -890,8 +862,6 @@ const uint8_t* unpack<4>(const uint8_t* in, uint32_t* out) { template<> const uint8_t* unpack<5>(const uint8_t* in, uint32_t* out) { - using simd_batch = xsimd::make_sized_batch_t; - constexpr uint32_t kMask = 0x1f; simd_batch masks(kMask); @@ -952,8 +922,6 @@ const uint8_t* unpack<5>(const uint8_t* in, uint32_t* out) { template<> const uint8_t* unpack<6>(const uint8_t* in, uint32_t* out) { - using simd_batch = xsimd::make_sized_batch_t; - constexpr uint32_t kMask = 0x3f; simd_batch masks(kMask); @@ -1014,8 +982,6 @@ const uint8_t* unpack<6>(const uint8_t* in, uint32_t* out) { template<> const uint8_t* unpack<7>(const uint8_t* in, uint32_t* out) { - using simd_batch = xsimd::make_sized_batch_t; - constexpr uint32_t kMask = 0x7f; simd_batch masks(kMask); @@ -1076,8 +1042,6 @@ const uint8_t* unpack<7>(const uint8_t* in, uint32_t* out) { template<> const uint8_t* unpack<8>(const uint8_t* in, uint32_t* out) { - using simd_batch = xsimd::make_sized_batch_t; - constexpr uint32_t kMask = 0xff; simd_batch masks(kMask); @@ -1138,8 +1102,6 @@ const uint8_t* unpack<8>(const uint8_t* in, uint32_t* out) { template<> const uint8_t* unpack<9>(const uint8_t* in, uint32_t* out) { - using simd_batch = xsimd::make_sized_batch_t; - constexpr uint32_t kMask = 0x1ff; simd_batch masks(kMask); @@ -1200,8 +1162,6 @@ const uint8_t* unpack<9>(const uint8_t* in, uint32_t* out) { template<> const uint8_t* unpack<10>(const uint8_t* in, uint32_t* out) { - using simd_batch = xsimd::make_sized_batch_t; - constexpr uint32_t kMask = 0x3ff; simd_batch masks(kMask); @@ -1262,8 +1222,6 @@ const uint8_t* unpack<10>(const uint8_t* in, uint32_t* out) { template<> const uint8_t* unpack<11>(const uint8_t* in, uint32_t* out) { - using simd_batch = xsimd::make_sized_batch_t; - constexpr uint32_t kMask = 0x7ff; simd_batch masks(kMask); @@ -1324,8 +1282,6 @@ const uint8_t* unpack<11>(const uint8_t* in, uint32_t* out) { template<> const uint8_t* unpack<12>(const uint8_t* in, uint32_t* out) { - using simd_batch = xsimd::make_sized_batch_t; - constexpr uint32_t kMask = 0xfff; simd_batch masks(kMask); @@ -1386,8 +1342,6 @@ const uint8_t* unpack<12>(const uint8_t* in, uint32_t* out) { template<> const uint8_t* unpack<13>(const uint8_t* in, uint32_t* out) { - using simd_batch = xsimd::make_sized_batch_t; - constexpr uint32_t kMask = 0x1fff; simd_batch masks(kMask); @@ -1448,8 +1402,6 @@ const uint8_t* unpack<13>(const uint8_t* in, uint32_t* out) { template<> const uint8_t* unpack<14>(const uint8_t* in, uint32_t* out) { - using simd_batch = xsimd::make_sized_batch_t; - constexpr uint32_t kMask = 0x3fff; simd_batch masks(kMask); @@ -1510,8 +1462,6 @@ const uint8_t* unpack<14>(const uint8_t* in, uint32_t* out) { template<> const uint8_t* unpack<15>(const uint8_t* in, uint32_t* out) { - using simd_batch = xsimd::make_sized_batch_t; - constexpr uint32_t kMask = 0x7fff; simd_batch masks(kMask); @@ -1572,8 +1522,6 @@ const uint8_t* unpack<15>(const uint8_t* in, uint32_t* out) { template<> const uint8_t* unpack<16>(const uint8_t* in, uint32_t* out) { - using simd_batch = xsimd::make_sized_batch_t; - constexpr uint32_t kMask = 0xffff; simd_batch masks(kMask); @@ -1634,8 +1582,6 @@ const uint8_t* unpack<16>(const uint8_t* in, uint32_t* out) { template<> const uint8_t* unpack<17>(const uint8_t* in, uint32_t* out) { - using simd_batch = xsimd::make_sized_batch_t; - constexpr uint32_t kMask = 0x1ffff; simd_batch masks(kMask); @@ -1696,8 +1642,6 @@ const uint8_t* unpack<17>(const uint8_t* in, uint32_t* out) { template<> const uint8_t* unpack<18>(const uint8_t* in, uint32_t* out) { - using simd_batch = xsimd::make_sized_batch_t; - constexpr uint32_t kMask = 0x3ffff; simd_batch masks(kMask); @@ -1758,8 +1702,6 @@ const uint8_t* unpack<18>(const uint8_t* in, uint32_t* out) { template<> const uint8_t* unpack<19>(const uint8_t* in, uint32_t* out) { - using simd_batch = xsimd::make_sized_batch_t; - constexpr uint32_t kMask = 0x7ffff; simd_batch masks(kMask); @@ -1820,8 +1762,6 @@ const uint8_t* unpack<19>(const uint8_t* in, uint32_t* out) { template<> const uint8_t* unpack<20>(const uint8_t* in, uint32_t* out) { - using simd_batch = xsimd::make_sized_batch_t; - constexpr uint32_t kMask = 0xfffff; simd_batch masks(kMask); @@ -1882,8 +1822,6 @@ const uint8_t* unpack<20>(const uint8_t* in, uint32_t* out) { template<> const uint8_t* unpack<21>(const uint8_t* in, uint32_t* out) { - using simd_batch = xsimd::make_sized_batch_t; - constexpr uint32_t kMask = 0x1fffff; simd_batch masks(kMask); @@ -1944,8 +1882,6 @@ const uint8_t* unpack<21>(const uint8_t* in, uint32_t* out) { template<> const uint8_t* unpack<22>(const uint8_t* in, uint32_t* out) { - using simd_batch = xsimd::make_sized_batch_t; - constexpr uint32_t kMask = 0x3fffff; simd_batch masks(kMask); @@ -2006,8 +1942,6 @@ const uint8_t* unpack<22>(const uint8_t* in, uint32_t* out) { template<> const uint8_t* unpack<23>(const uint8_t* in, uint32_t* out) { - using simd_batch = xsimd::make_sized_batch_t; - constexpr uint32_t kMask = 0x7fffff; simd_batch masks(kMask); @@ -2068,8 +2002,6 @@ const uint8_t* unpack<23>(const uint8_t* in, uint32_t* out) { template<> const uint8_t* unpack<24>(const uint8_t* in, uint32_t* out) { - using simd_batch = xsimd::make_sized_batch_t; - constexpr uint32_t kMask = 0xffffff; simd_batch masks(kMask); @@ -2130,8 +2062,6 @@ const uint8_t* unpack<24>(const uint8_t* in, uint32_t* out) { template<> const uint8_t* unpack<25>(const uint8_t* in, uint32_t* out) { - using simd_batch = xsimd::make_sized_batch_t; - constexpr uint32_t kMask = 0x1ffffff; simd_batch masks(kMask); @@ -2192,8 +2122,6 @@ const uint8_t* unpack<25>(const uint8_t* in, uint32_t* out) { template<> const uint8_t* unpack<26>(const uint8_t* in, uint32_t* out) { - using simd_batch = xsimd::make_sized_batch_t; - constexpr uint32_t kMask = 0x3ffffff; simd_batch masks(kMask); @@ -2254,8 +2182,6 @@ const uint8_t* unpack<26>(const uint8_t* in, uint32_t* out) { template<> const uint8_t* unpack<27>(const uint8_t* in, uint32_t* out) { - using simd_batch = xsimd::make_sized_batch_t; - constexpr uint32_t kMask = 0x7ffffff; simd_batch masks(kMask); @@ -2316,8 +2242,6 @@ const uint8_t* unpack<27>(const uint8_t* in, uint32_t* out) { template<> const uint8_t* unpack<28>(const uint8_t* in, uint32_t* out) { - using simd_batch = xsimd::make_sized_batch_t; - constexpr uint32_t kMask = 0xfffffff; simd_batch masks(kMask); @@ -2378,8 +2302,6 @@ const uint8_t* unpack<28>(const uint8_t* in, uint32_t* out) { template<> const uint8_t* unpack<29>(const uint8_t* in, uint32_t* out) { - using simd_batch = xsimd::make_sized_batch_t; - constexpr uint32_t kMask = 0x1fffffff; simd_batch masks(kMask); @@ -2440,8 +2362,6 @@ const uint8_t* unpack<29>(const uint8_t* in, uint32_t* out) { template<> const uint8_t* unpack<30>(const uint8_t* in, uint32_t* out) { - using simd_batch = xsimd::make_sized_batch_t; - constexpr uint32_t kMask = 0x3fffffff; simd_batch masks(kMask); @@ -2502,8 +2422,6 @@ const uint8_t* unpack<30>(const uint8_t* in, uint32_t* out) { template<> const uint8_t* unpack<31>(const uint8_t* in, uint32_t* out) { - using simd_batch = xsimd::make_sized_batch_t; - constexpr uint32_t kMask = 0x7fffffff; simd_batch masks(kMask); @@ -2571,6 +2489,4 @@ const uint8_t* unpack<32>(const uint8_t* in, uint32_t* out) { } }; // struct Unpacker - } // namespace arrow::internal - From baf4a5a5b7b5fab2b0780e14f433a667e956f890 Mon Sep 17 00:00:00 2001 From: AntoinePrv Date: Fri, 19 Sep 2025 11:18:46 +0200 Subject: [PATCH 19/76] Try new simd scheme --- .../bpacking_simd128_generated_internal.h | 155 ++++++------------ .../bpacking_simd256_generated_internal.h | 116 ++++++------- 2 files changed, 98 insertions(+), 173 deletions(-) diff --git a/cpp/src/arrow/util/bpacking_simd128_generated_internal.h b/cpp/src/arrow/util/bpacking_simd128_generated_internal.h index 933c10d9030..49a8dd277e7 100644 --- a/cpp/src/arrow/util/bpacking_simd128_generated_internal.h +++ b/cpp/src/arrow/util/bpacking_simd128_generated_internal.h @@ -727,6 +727,14 @@ struct Simd128Unpacker { using out_type = uint32_t; using simd_batch = xsimd::make_sized_batch_t; +template +using simd_batch_constants = + xsimd::batch_constant; +using simd_bytes = + xsimd::make_sized_batch_t; +template +using simd_bytes_constants = + xsimd::batch_constant; static constexpr int kValuesUnpacked = 32; @@ -740,112 +748,53 @@ const uint8_t* unpack<0>(const uint8_t* in, uint32_t* out) { return in; } -template<> +template +struct Reorder { + static constexpr unsigned get(unsigned i, unsigned n) { + if (i % 4 == 0) { + return K; + } + return 128; + } +}; + +template <> const uint8_t* unpack<1>(const uint8_t* in, uint32_t* out) { constexpr uint32_t kMask = 0x1; - simd_batch masks(kMask); - simd_batch words, shifts; - simd_batch results; - - // extract 1-bit bundles 0 to 3 - words = simd_batch{ - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0), - }; - shifts = simd_batch{ 0, 1, 2, 3 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 1-bit bundles 4 to 7 - words = simd_batch{ - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0), - }; - shifts = simd_batch{ 4, 5, 6, 7 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 1-bit bundles 8 to 11 - words = simd_batch{ - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0), - }; - shifts = simd_batch{ 8, 9, 10, 11 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 1-bit bundles 12 to 15 - words = simd_batch{ - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0), - }; - shifts = simd_batch{ 12, 13, 14, 15 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 1-bit bundles 16 to 19 - words = simd_batch{ - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0), - }; - shifts = simd_batch{ 16, 17, 18, 19 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 1-bit bundles 20 to 23 - words = simd_batch{ - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0), - }; - shifts = simd_batch{ 20, 21, 22, 23 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 1-bit bundles 24 to 27 - words = simd_batch{ - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0), - }; - shifts = simd_batch{ 24, 25, 26, 27 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 1-bit bundles 28 to 31 - words = simd_batch{ - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0), - }; - shifts = simd_batch{ 28, 29, 30, 31 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - in += 1 * 4; - return in; + constexpr auto kShifts1 = simd_batch_constants<0, 1, 2, 3>{}; + constexpr auto kShifts2 = simd_batch_constants<4, 5, 6, 7>{}; + + { + auto bytes = simd_bytes::load_unaligned(in + 4 * 0); + // TODO var shifts no avail on SSE + { + constexpr auto kReorder = xsimd::make_batch_constant>(); + auto numbers = xsimd::bitwise_cast(xsimd::swizzle(bytes, kReorder)); + ((numbers >> kShifts1) & kMask).store_unaligned(out + 0 * 4); + ((numbers >> kShifts2) & kMask).store_unaligned(out + 1 * 4); + } + { + constexpr auto kReorder = xsimd::make_batch_constant>(); + auto numbers = xsimd::bitwise_cast(xsimd::swizzle(bytes, kReorder)); + ((numbers >> kShifts1) & kMask).store_unaligned(out + 2 * 4); + ((numbers >> kShifts2) & kMask).store_unaligned(out + 3 * 4); + } + { + constexpr auto kReorder = xsimd::make_batch_constant>(); + auto numbers = xsimd::bitwise_cast(xsimd::swizzle(bytes, kReorder)); + ((numbers >> kShifts1) & kMask).store_unaligned(out + 4 * 4); + ((numbers >> kShifts2) & kMask).store_unaligned(out + 5 * 4); + } + { + constexpr auto kReorder = xsimd::make_batch_constant>(); + auto numbers = xsimd::bitwise_cast(xsimd::swizzle(bytes, kReorder)); + ((numbers >> kShifts1) & kMask).store_unaligned(out + 6 * 4); + ((numbers >> kShifts2) & kMask).store_unaligned(out + 7 * 4); + } + } + + return in + 4 ; } template<> diff --git a/cpp/src/arrow/util/bpacking_simd256_generated_internal.h b/cpp/src/arrow/util/bpacking_simd256_generated_internal.h index be522a486b6..3d7daafe35c 100644 --- a/cpp/src/arrow/util/bpacking_simd256_generated_internal.h +++ b/cpp/src/arrow/util/bpacking_simd256_generated_internal.h @@ -607,6 +607,24 @@ struct Simd256Unpacker { using out_type = uint32_t; using simd_batch = xsimd::make_sized_batch_t; +template +using simd_batch_constants = + xsimd::batch_constant; +using simd_bytes = + xsimd::make_sized_batch_t; +template +using simd_bytes_constants = + xsimd::batch_constant; + +template +struct Reorder { + static constexpr unsigned get(unsigned i, unsigned n) { + if (i % 4 == 0) { + return K; + } + return 128; + } +}; static constexpr int kValuesUnpacked = 32; @@ -624,76 +642,34 @@ template<> const uint8_t* unpack<1>(const uint8_t* in, uint32_t* out) { constexpr uint32_t kMask = 0x1; - simd_batch masks(kMask); - simd_batch words, shifts; - simd_batch results; - - // extract 1-bit bundles 0 to 7 - words = simd_batch{ - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0), - }; - shifts = simd_batch{ 0, 1, 2, 3, 4, 5, 6, 7 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 8; - - // extract 1-bit bundles 8 to 15 - words = simd_batch{ - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0), - }; - shifts = simd_batch{ 8, 9, 10, 11, 12, 13, 14, 15 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 8; - - // extract 1-bit bundles 16 to 23 - words = simd_batch{ - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0), - }; - shifts = simd_batch{ 16, 17, 18, 19, 20, 21, 22, 23 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 8; - - // extract 1-bit bundles 24 to 31 - words = simd_batch{ - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0), - }; - shifts = simd_batch{ 24, 25, 26, 27, 28, 29, 30, 31 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 8; - - in += 1 * 4; - return in; + constexpr auto kShifts1 = simd_batch_constants<0, 1, 2, 3, 4, 5, 6, 7>{}; + + { + auto bytes = simd_bytes::load_unaligned(in + 4 * 0); + // TODO var shifts no avail on SSE + { + constexpr auto kReorder = xsimd::make_batch_constant>(); + auto numbers = xsimd::bitwise_cast(xsimd::swizzle(bytes, kReorder)); + ((numbers >> kShifts1) & kMask).store_unaligned(out + 0 * 8); + } + { + constexpr auto kReorder = xsimd::make_batch_constant>(); + auto numbers = xsimd::bitwise_cast(xsimd::swizzle(bytes, kReorder)); + ((numbers >> kShifts1) & kMask).store_unaligned(out + 1 * 8); + } + { + constexpr auto kReorder = xsimd::make_batch_constant>(); + auto numbers = xsimd::bitwise_cast(xsimd::swizzle(bytes, kReorder)); + ((numbers >> kShifts1) & kMask).store_unaligned(out + 2 * 8); + } + { + constexpr auto kReorder = xsimd::make_batch_constant>(); + auto numbers = xsimd::bitwise_cast(xsimd::swizzle(bytes, kReorder)); + ((numbers >> kShifts1) & kMask).store_unaligned(out + 3 * 8); + } + } + + return in + 4 ; } template<> From faf8fac9cc0ce6370cd2d73ed48f4467378d7ed4 Mon Sep 17 00:00:00 2001 From: AntoinePrv Date: Fri, 19 Sep 2025 12:27:58 +0200 Subject: [PATCH 20/76] Fix template specialization --- cpp/src/arrow/util/bpacking_scalar_codegen.py | 50 +++++++++---------- cpp/src/arrow/util/bpacking_simd_codegen.py | 44 ++++++++-------- 2 files changed, 47 insertions(+), 47 deletions(-) diff --git a/cpp/src/arrow/util/bpacking_scalar_codegen.py b/cpp/src/arrow/util/bpacking_scalar_codegen.py index bdf63d88eb2..236d8781ac2 100644 --- a/cpp/src/arrow/util/bpacking_scalar_codegen.py +++ b/cpp/src/arrow/util/bpacking_scalar_codegen.py @@ -94,6 +94,14 @@ def out_type(self) -> str: def out_type_half(self) -> str: return f"uint{self.out_bit_width // 2}_t" + @property + def struct_name(self) -> str: + return "ScalarUnpacker" + + @property + def struct_specialization(self) -> str: + return f"{self.struct_name}<{self.out_type}>" + @property def howmany(self) -> int: """How many values are we going to pack?""" @@ -110,19 +118,15 @@ def howmanybytes(self, bit: int) -> int: def print_unpack_signature(self, bit: int | None) -> str: if bit is None: print("template") - static = "static " - specialized = "" - end = ";" + print( + f"static const uint8_t* unpack(const uint8_t* in, {self.out_type}* out);" + ) else: print("template<>") - static = "" - specialized = f"<{bit}>" - end = " {" - - print( - f"{static}const uint8_t* unpack{specialized}" - f"(const uint8_t* in, {self.out_type}* out){end}" - ) + print( + f"const uint8_t* {self.struct_specialization}::unpack<{bit}>" + f"(const uint8_t* in, {self.out_type}* out) {{" + ) def print_unpack_0(self) -> None: self.print_unpack_signature(0) @@ -134,8 +138,7 @@ def print_unpack_last(self) -> None: self.print_unpack_signature(self.out_bit_width) print(f" for(int k = 0; k < {self.howmany}; k += 1) {{") print( - f" out[k] = LoadInt<{self.out_type}>(" - f"in + (k * {self.out_byte_width}));" + f" out[k] = LoadInt<{self.out_type}>(in + (k * {self.out_byte_width}));" ) print(" }") print(f" return in + ({self.out_byte_width} * {self.howmany});") @@ -200,27 +203,25 @@ def print_unpack_k(self, bit: int) -> None: def print_struct_declaration(self): print("template") - print("struct ScalarUnpacker;") + print(f"struct {self.struct_name};") - def print_struct_top(self): + def print_struct(self): print("template<>") - print(f"struct ScalarUnpacker<{self.out_type}> {{") + print(f"struct {self.struct_specialization} {{") print() print(f"using out_type = {self.out_type};") print() print(f"static constexpr int kValuesUnpacked = {self.howmany};") print() self.print_unpack_signature(None) + print("};") - def print_struct_bottom(self): - print("}; // struct") - - def print_struct(self): - self.print_struct_top() + def print_struct_and_def(self): + self.print_struct() print() self.print_unpack_0() - print("") + print() for bit in range(1, self.out_bit_width): self.print_unpack_k(bit) @@ -228,7 +229,6 @@ def print_struct(self): self.print_unpack_last() - self.print_struct_bottom() def print_note(): @@ -246,10 +246,10 @@ def print_note(): gen.print_struct_declaration() print() - gen.print_struct() + gen.print_struct_and_def() print() gen = ScalarUnpackGenerator(64, smart_halve=True) - gen.print_struct() + gen.print_struct_and_def() print(FOOTER) diff --git a/cpp/src/arrow/util/bpacking_simd_codegen.py b/cpp/src/arrow/util/bpacking_simd_codegen.py index 5c627b59dcd..84a741bb690 100755 --- a/cpp/src/arrow/util/bpacking_simd_codegen.py +++ b/cpp/src/arrow/util/bpacking_simd_codegen.py @@ -67,6 +67,14 @@ def out_byte_width(self) -> int: def out_type(self) -> str: return f"uint{self.out_bit_width}_t" + @property + def struct_name(self) -> str: + return f"Simd{self.simd_bit_width}Unpacker" + + @property + def struct_specialization(self) -> str: + return f"{self.struct_name}<{self.out_type}>" + def __post_init__(self): if self.simd_bit_width % self.out_bit_width != 0: raise ("SIMD bit width should be a multiple of output width") @@ -74,19 +82,15 @@ def __post_init__(self): def print_unpack_signature(self, bit: int | None) -> str: if bit is None: print("template") - static = "static " - specialized = "" - end = ";" + print( + f"static const uint8_t* unpack(const uint8_t* in, {self.out_type}* out);" + ) else: print("template<>") - static = "" - specialized = f"<{bit}>" - end = " {" - - print( - f"{static}const uint8_t* unpack{specialized}" - f"(const uint8_t* in, {self.out_type}* out){end}" - ) + print( + f"const uint8_t* {self.struct_specialization}::unpack<{bit}>" + f"(const uint8_t* in, {self.out_type}* out) {{" + ) def print_unpack_bit0_func(self): self.print_unpack_signature(0) @@ -185,11 +189,11 @@ def static_cast_as_needed(str): def print_struct_declaration(self): print("template") - print(f"struct Simd{self.simd_bit_width}Unpacker;") + print(f"struct {self.struct_name};") - def print_struct_top(self): + def print_struct(self): print("template<>") - print(f"struct Simd{self.simd_bit_width}Unpacker<{self.out_type}> {{") + print(f"struct {self.struct_specialization} {{") print() print(f"using out_type = {self.out_type};") print( @@ -200,12 +204,10 @@ def print_struct_top(self): print(f"static constexpr int kValuesUnpacked = {self.out_bit_width};") print() self.print_unpack_signature(None) + print("};") - def print_struct_bottom(self): - print("}; // struct Unpacker") - - def print_struct(self): - self.print_struct_top() + def print_struct_and_def(self): + self.print_struct() print() self.print_unpack_bit0_func() @@ -215,8 +217,6 @@ def print_struct(self): print() self.print_unpack_bitmax_func() - self.print_struct_bottom() - @dataclasses.dataclass class UnpackFileGenerator: @@ -259,7 +259,7 @@ def print_structs(self): print() delclared.add(gen.simd_bit_width) - gen.print_struct() + gen.print_struct_and_def() print() def print_file(self): From 1e4b851701a8d559893a67270d14ab44a3052604 Mon Sep 17 00:00:00 2001 From: AntoinePrv Date: Fri, 19 Sep 2025 12:28:44 +0200 Subject: [PATCH 21/76] Gen: regenerate unpack files --- .../util/bpacking_scalar_generated_internal.h | 200 +++++++------- .../bpacking_simd128_generated_internal.h | 259 +++++++++++------- .../bpacking_simd256_generated_internal.h | 220 ++++++++------- .../bpacking_simd512_generated_internal.h | 104 +++---- 4 files changed, 429 insertions(+), 354 deletions(-) diff --git a/cpp/src/arrow/util/bpacking_scalar_generated_internal.h b/cpp/src/arrow/util/bpacking_scalar_generated_internal.h index 824cf9306e0..cac8227d550 100644 --- a/cpp/src/arrow/util/bpacking_scalar_generated_internal.h +++ b/cpp/src/arrow/util/bpacking_scalar_generated_internal.h @@ -55,15 +55,16 @@ static constexpr int kValuesUnpacked = 32; template static const uint8_t* unpack(const uint8_t* in, uint32_t* out); +}; template<> -const uint8_t* unpack<0>(const uint8_t* in, uint32_t* out) { +const uint8_t* ScalarUnpacker::unpack<0>(const uint8_t* in, uint32_t* out) { std::memset(out, 0, 32 * 4); return in; } template<> -const uint8_t* unpack<1>(const uint8_t* in, uint32_t* out) { +const uint8_t* ScalarUnpacker::unpack<1>(const uint8_t* in, uint32_t* out) { constexpr uint32_t mask = ((uint32_t{1} << 1) - uint32_t{1}); const auto w0 = LoadInt(in + 0 * 4); @@ -104,7 +105,7 @@ const uint8_t* unpack<1>(const uint8_t* in, uint32_t* out) { } template<> -const uint8_t* unpack<2>(const uint8_t* in, uint32_t* out) { +const uint8_t* ScalarUnpacker::unpack<2>(const uint8_t* in, uint32_t* out) { constexpr uint32_t mask = ((uint32_t{1} << 2) - uint32_t{1}); const auto w0 = LoadInt(in + 0 * 4); @@ -146,7 +147,7 @@ const uint8_t* unpack<2>(const uint8_t* in, uint32_t* out) { } template<> -const uint8_t* unpack<3>(const uint8_t* in, uint32_t* out) { +const uint8_t* ScalarUnpacker::unpack<3>(const uint8_t* in, uint32_t* out) { constexpr uint32_t mask = ((uint32_t{1} << 3) - uint32_t{1}); const auto w0 = LoadInt(in + 0 * 4); @@ -189,7 +190,7 @@ const uint8_t* unpack<3>(const uint8_t* in, uint32_t* out) { } template<> -const uint8_t* unpack<4>(const uint8_t* in, uint32_t* out) { +const uint8_t* ScalarUnpacker::unpack<4>(const uint8_t* in, uint32_t* out) { constexpr uint32_t mask = ((uint32_t{1} << 4) - uint32_t{1}); const auto w0 = LoadInt(in + 0 * 4); @@ -233,7 +234,7 @@ const uint8_t* unpack<4>(const uint8_t* in, uint32_t* out) { } template<> -const uint8_t* unpack<5>(const uint8_t* in, uint32_t* out) { +const uint8_t* ScalarUnpacker::unpack<5>(const uint8_t* in, uint32_t* out) { constexpr uint32_t mask = ((uint32_t{1} << 5) - uint32_t{1}); const auto w0 = LoadInt(in + 0 * 4); @@ -278,7 +279,7 @@ const uint8_t* unpack<5>(const uint8_t* in, uint32_t* out) { } template<> -const uint8_t* unpack<6>(const uint8_t* in, uint32_t* out) { +const uint8_t* ScalarUnpacker::unpack<6>(const uint8_t* in, uint32_t* out) { constexpr uint32_t mask = ((uint32_t{1} << 6) - uint32_t{1}); const auto w0 = LoadInt(in + 0 * 4); @@ -324,7 +325,7 @@ const uint8_t* unpack<6>(const uint8_t* in, uint32_t* out) { } template<> -const uint8_t* unpack<7>(const uint8_t* in, uint32_t* out) { +const uint8_t* ScalarUnpacker::unpack<7>(const uint8_t* in, uint32_t* out) { constexpr uint32_t mask = ((uint32_t{1} << 7) - uint32_t{1}); const auto w0 = LoadInt(in + 0 * 4); @@ -371,7 +372,7 @@ const uint8_t* unpack<7>(const uint8_t* in, uint32_t* out) { } template<> -const uint8_t* unpack<8>(const uint8_t* in, uint32_t* out) { +const uint8_t* ScalarUnpacker::unpack<8>(const uint8_t* in, uint32_t* out) { constexpr uint32_t mask = ((uint32_t{1} << 8) - uint32_t{1}); const auto w0 = LoadInt(in + 0 * 4); @@ -419,7 +420,7 @@ const uint8_t* unpack<8>(const uint8_t* in, uint32_t* out) { } template<> -const uint8_t* unpack<9>(const uint8_t* in, uint32_t* out) { +const uint8_t* ScalarUnpacker::unpack<9>(const uint8_t* in, uint32_t* out) { constexpr uint32_t mask = ((uint32_t{1} << 9) - uint32_t{1}); const auto w0 = LoadInt(in + 0 * 4); @@ -468,7 +469,7 @@ const uint8_t* unpack<9>(const uint8_t* in, uint32_t* out) { } template<> -const uint8_t* unpack<10>(const uint8_t* in, uint32_t* out) { +const uint8_t* ScalarUnpacker::unpack<10>(const uint8_t* in, uint32_t* out) { constexpr uint32_t mask = ((uint32_t{1} << 10) - uint32_t{1}); const auto w0 = LoadInt(in + 0 * 4); @@ -518,7 +519,7 @@ const uint8_t* unpack<10>(const uint8_t* in, uint32_t* out) { } template<> -const uint8_t* unpack<11>(const uint8_t* in, uint32_t* out) { +const uint8_t* ScalarUnpacker::unpack<11>(const uint8_t* in, uint32_t* out) { constexpr uint32_t mask = ((uint32_t{1} << 11) - uint32_t{1}); const auto w0 = LoadInt(in + 0 * 4); @@ -569,7 +570,7 @@ const uint8_t* unpack<11>(const uint8_t* in, uint32_t* out) { } template<> -const uint8_t* unpack<12>(const uint8_t* in, uint32_t* out) { +const uint8_t* ScalarUnpacker::unpack<12>(const uint8_t* in, uint32_t* out) { constexpr uint32_t mask = ((uint32_t{1} << 12) - uint32_t{1}); const auto w0 = LoadInt(in + 0 * 4); @@ -621,7 +622,7 @@ const uint8_t* unpack<12>(const uint8_t* in, uint32_t* out) { } template<> -const uint8_t* unpack<13>(const uint8_t* in, uint32_t* out) { +const uint8_t* ScalarUnpacker::unpack<13>(const uint8_t* in, uint32_t* out) { constexpr uint32_t mask = ((uint32_t{1} << 13) - uint32_t{1}); const auto w0 = LoadInt(in + 0 * 4); @@ -674,7 +675,7 @@ const uint8_t* unpack<13>(const uint8_t* in, uint32_t* out) { } template<> -const uint8_t* unpack<14>(const uint8_t* in, uint32_t* out) { +const uint8_t* ScalarUnpacker::unpack<14>(const uint8_t* in, uint32_t* out) { constexpr uint32_t mask = ((uint32_t{1} << 14) - uint32_t{1}); const auto w0 = LoadInt(in + 0 * 4); @@ -728,7 +729,7 @@ const uint8_t* unpack<14>(const uint8_t* in, uint32_t* out) { } template<> -const uint8_t* unpack<15>(const uint8_t* in, uint32_t* out) { +const uint8_t* ScalarUnpacker::unpack<15>(const uint8_t* in, uint32_t* out) { constexpr uint32_t mask = ((uint32_t{1} << 15) - uint32_t{1}); const auto w0 = LoadInt(in + 0 * 4); @@ -783,7 +784,7 @@ const uint8_t* unpack<15>(const uint8_t* in, uint32_t* out) { } template<> -const uint8_t* unpack<16>(const uint8_t* in, uint32_t* out) { +const uint8_t* ScalarUnpacker::unpack<16>(const uint8_t* in, uint32_t* out) { constexpr uint32_t mask = ((uint32_t{1} << 16) - uint32_t{1}); const auto w0 = LoadInt(in + 0 * 4); @@ -839,7 +840,7 @@ const uint8_t* unpack<16>(const uint8_t* in, uint32_t* out) { } template<> -const uint8_t* unpack<17>(const uint8_t* in, uint32_t* out) { +const uint8_t* ScalarUnpacker::unpack<17>(const uint8_t* in, uint32_t* out) { constexpr uint32_t mask = ((uint32_t{1} << 17) - uint32_t{1}); const auto w0 = LoadInt(in + 0 * 4); @@ -896,7 +897,7 @@ const uint8_t* unpack<17>(const uint8_t* in, uint32_t* out) { } template<> -const uint8_t* unpack<18>(const uint8_t* in, uint32_t* out) { +const uint8_t* ScalarUnpacker::unpack<18>(const uint8_t* in, uint32_t* out) { constexpr uint32_t mask = ((uint32_t{1} << 18) - uint32_t{1}); const auto w0 = LoadInt(in + 0 * 4); @@ -954,7 +955,7 @@ const uint8_t* unpack<18>(const uint8_t* in, uint32_t* out) { } template<> -const uint8_t* unpack<19>(const uint8_t* in, uint32_t* out) { +const uint8_t* ScalarUnpacker::unpack<19>(const uint8_t* in, uint32_t* out) { constexpr uint32_t mask = ((uint32_t{1} << 19) - uint32_t{1}); const auto w0 = LoadInt(in + 0 * 4); @@ -1013,7 +1014,7 @@ const uint8_t* unpack<19>(const uint8_t* in, uint32_t* out) { } template<> -const uint8_t* unpack<20>(const uint8_t* in, uint32_t* out) { +const uint8_t* ScalarUnpacker::unpack<20>(const uint8_t* in, uint32_t* out) { constexpr uint32_t mask = ((uint32_t{1} << 20) - uint32_t{1}); const auto w0 = LoadInt(in + 0 * 4); @@ -1073,7 +1074,7 @@ const uint8_t* unpack<20>(const uint8_t* in, uint32_t* out) { } template<> -const uint8_t* unpack<21>(const uint8_t* in, uint32_t* out) { +const uint8_t* ScalarUnpacker::unpack<21>(const uint8_t* in, uint32_t* out) { constexpr uint32_t mask = ((uint32_t{1} << 21) - uint32_t{1}); const auto w0 = LoadInt(in + 0 * 4); @@ -1134,7 +1135,7 @@ const uint8_t* unpack<21>(const uint8_t* in, uint32_t* out) { } template<> -const uint8_t* unpack<22>(const uint8_t* in, uint32_t* out) { +const uint8_t* ScalarUnpacker::unpack<22>(const uint8_t* in, uint32_t* out) { constexpr uint32_t mask = ((uint32_t{1} << 22) - uint32_t{1}); const auto w0 = LoadInt(in + 0 * 4); @@ -1196,7 +1197,7 @@ const uint8_t* unpack<22>(const uint8_t* in, uint32_t* out) { } template<> -const uint8_t* unpack<23>(const uint8_t* in, uint32_t* out) { +const uint8_t* ScalarUnpacker::unpack<23>(const uint8_t* in, uint32_t* out) { constexpr uint32_t mask = ((uint32_t{1} << 23) - uint32_t{1}); const auto w0 = LoadInt(in + 0 * 4); @@ -1259,7 +1260,7 @@ const uint8_t* unpack<23>(const uint8_t* in, uint32_t* out) { } template<> -const uint8_t* unpack<24>(const uint8_t* in, uint32_t* out) { +const uint8_t* ScalarUnpacker::unpack<24>(const uint8_t* in, uint32_t* out) { constexpr uint32_t mask = ((uint32_t{1} << 24) - uint32_t{1}); const auto w0 = LoadInt(in + 0 * 4); @@ -1323,7 +1324,7 @@ const uint8_t* unpack<24>(const uint8_t* in, uint32_t* out) { } template<> -const uint8_t* unpack<25>(const uint8_t* in, uint32_t* out) { +const uint8_t* ScalarUnpacker::unpack<25>(const uint8_t* in, uint32_t* out) { constexpr uint32_t mask = ((uint32_t{1} << 25) - uint32_t{1}); const auto w0 = LoadInt(in + 0 * 4); @@ -1388,7 +1389,7 @@ const uint8_t* unpack<25>(const uint8_t* in, uint32_t* out) { } template<> -const uint8_t* unpack<26>(const uint8_t* in, uint32_t* out) { +const uint8_t* ScalarUnpacker::unpack<26>(const uint8_t* in, uint32_t* out) { constexpr uint32_t mask = ((uint32_t{1} << 26) - uint32_t{1}); const auto w0 = LoadInt(in + 0 * 4); @@ -1454,7 +1455,7 @@ const uint8_t* unpack<26>(const uint8_t* in, uint32_t* out) { } template<> -const uint8_t* unpack<27>(const uint8_t* in, uint32_t* out) { +const uint8_t* ScalarUnpacker::unpack<27>(const uint8_t* in, uint32_t* out) { constexpr uint32_t mask = ((uint32_t{1} << 27) - uint32_t{1}); const auto w0 = LoadInt(in + 0 * 4); @@ -1521,7 +1522,7 @@ const uint8_t* unpack<27>(const uint8_t* in, uint32_t* out) { } template<> -const uint8_t* unpack<28>(const uint8_t* in, uint32_t* out) { +const uint8_t* ScalarUnpacker::unpack<28>(const uint8_t* in, uint32_t* out) { constexpr uint32_t mask = ((uint32_t{1} << 28) - uint32_t{1}); const auto w0 = LoadInt(in + 0 * 4); @@ -1589,7 +1590,7 @@ const uint8_t* unpack<28>(const uint8_t* in, uint32_t* out) { } template<> -const uint8_t* unpack<29>(const uint8_t* in, uint32_t* out) { +const uint8_t* ScalarUnpacker::unpack<29>(const uint8_t* in, uint32_t* out) { constexpr uint32_t mask = ((uint32_t{1} << 29) - uint32_t{1}); const auto w0 = LoadInt(in + 0 * 4); @@ -1658,7 +1659,7 @@ const uint8_t* unpack<29>(const uint8_t* in, uint32_t* out) { } template<> -const uint8_t* unpack<30>(const uint8_t* in, uint32_t* out) { +const uint8_t* ScalarUnpacker::unpack<30>(const uint8_t* in, uint32_t* out) { constexpr uint32_t mask = ((uint32_t{1} << 30) - uint32_t{1}); const auto w0 = LoadInt(in + 0 * 4); @@ -1728,7 +1729,7 @@ const uint8_t* unpack<30>(const uint8_t* in, uint32_t* out) { } template<> -const uint8_t* unpack<31>(const uint8_t* in, uint32_t* out) { +const uint8_t* ScalarUnpacker::unpack<31>(const uint8_t* in, uint32_t* out) { constexpr uint32_t mask = ((uint32_t{1} << 31) - uint32_t{1}); const auto w0 = LoadInt(in + 0 * 4); @@ -1799,13 +1800,12 @@ const uint8_t* unpack<31>(const uint8_t* in, uint32_t* out) { } template<> -const uint8_t* unpack<32>(const uint8_t* in, uint32_t* out) { +const uint8_t* ScalarUnpacker::unpack<32>(const uint8_t* in, uint32_t* out) { for(int k = 0; k < 32; k += 1) { out[k] = LoadInt(in + (k * 4)); } return in + (4 * 32); } -}; // struct template<> struct ScalarUnpacker { @@ -1816,15 +1816,16 @@ static constexpr int kValuesUnpacked = 32; template static const uint8_t* unpack(const uint8_t* in, uint64_t* out); +}; template<> -const uint8_t* unpack<0>(const uint8_t* in, uint64_t* out) { +const uint8_t* ScalarUnpacker::unpack<0>(const uint8_t* in, uint64_t* out) { std::memset(out, 0, 32 * 8); return in; } template<> -const uint8_t* unpack<1>(const uint8_t* in, uint64_t* out) { +const uint8_t* ScalarUnpacker::unpack<1>(const uint8_t* in, uint64_t* out) { constexpr uint64_t mask = ((uint64_t{1} << 1) - uint64_t{1}); const auto w0 = static_cast(LoadInt(in + 0 * 8)); @@ -1865,7 +1866,7 @@ const uint8_t* unpack<1>(const uint8_t* in, uint64_t* out) { } template<> -const uint8_t* unpack<2>(const uint8_t* in, uint64_t* out) { +const uint8_t* ScalarUnpacker::unpack<2>(const uint8_t* in, uint64_t* out) { constexpr uint64_t mask = ((uint64_t{1} << 2) - uint64_t{1}); const auto w0 = LoadInt(in + 0 * 8); @@ -1906,7 +1907,7 @@ const uint8_t* unpack<2>(const uint8_t* in, uint64_t* out) { } template<> -const uint8_t* unpack<3>(const uint8_t* in, uint64_t* out) { +const uint8_t* ScalarUnpacker::unpack<3>(const uint8_t* in, uint64_t* out) { constexpr uint64_t mask = ((uint64_t{1} << 3) - uint64_t{1}); const auto w0 = LoadInt(in + 0 * 8); @@ -1948,7 +1949,7 @@ const uint8_t* unpack<3>(const uint8_t* in, uint64_t* out) { } template<> -const uint8_t* unpack<4>(const uint8_t* in, uint64_t* out) { +const uint8_t* ScalarUnpacker::unpack<4>(const uint8_t* in, uint64_t* out) { constexpr uint64_t mask = ((uint64_t{1} << 4) - uint64_t{1}); const auto w0 = LoadInt(in + 0 * 8); @@ -1990,7 +1991,7 @@ const uint8_t* unpack<4>(const uint8_t* in, uint64_t* out) { } template<> -const uint8_t* unpack<5>(const uint8_t* in, uint64_t* out) { +const uint8_t* ScalarUnpacker::unpack<5>(const uint8_t* in, uint64_t* out) { constexpr uint64_t mask = ((uint64_t{1} << 5) - uint64_t{1}); const auto w0 = LoadInt(in + 0 * 8); @@ -2033,7 +2034,7 @@ const uint8_t* unpack<5>(const uint8_t* in, uint64_t* out) { } template<> -const uint8_t* unpack<6>(const uint8_t* in, uint64_t* out) { +const uint8_t* ScalarUnpacker::unpack<6>(const uint8_t* in, uint64_t* out) { constexpr uint64_t mask = ((uint64_t{1} << 6) - uint64_t{1}); const auto w0 = LoadInt(in + 0 * 8); @@ -2076,7 +2077,7 @@ const uint8_t* unpack<6>(const uint8_t* in, uint64_t* out) { } template<> -const uint8_t* unpack<7>(const uint8_t* in, uint64_t* out) { +const uint8_t* ScalarUnpacker::unpack<7>(const uint8_t* in, uint64_t* out) { constexpr uint64_t mask = ((uint64_t{1} << 7) - uint64_t{1}); const auto w0 = LoadInt(in + 0 * 8); @@ -2120,7 +2121,7 @@ const uint8_t* unpack<7>(const uint8_t* in, uint64_t* out) { } template<> -const uint8_t* unpack<8>(const uint8_t* in, uint64_t* out) { +const uint8_t* ScalarUnpacker::unpack<8>(const uint8_t* in, uint64_t* out) { constexpr uint64_t mask = ((uint64_t{1} << 8) - uint64_t{1}); const auto w0 = LoadInt(in + 0 * 8); @@ -2164,7 +2165,7 @@ const uint8_t* unpack<8>(const uint8_t* in, uint64_t* out) { } template<> -const uint8_t* unpack<9>(const uint8_t* in, uint64_t* out) { +const uint8_t* ScalarUnpacker::unpack<9>(const uint8_t* in, uint64_t* out) { constexpr uint64_t mask = ((uint64_t{1} << 9) - uint64_t{1}); const auto w0 = LoadInt(in + 0 * 8); @@ -2209,7 +2210,7 @@ const uint8_t* unpack<9>(const uint8_t* in, uint64_t* out) { } template<> -const uint8_t* unpack<10>(const uint8_t* in, uint64_t* out) { +const uint8_t* ScalarUnpacker::unpack<10>(const uint8_t* in, uint64_t* out) { constexpr uint64_t mask = ((uint64_t{1} << 10) - uint64_t{1}); const auto w0 = LoadInt(in + 0 * 8); @@ -2254,7 +2255,7 @@ const uint8_t* unpack<10>(const uint8_t* in, uint64_t* out) { } template<> -const uint8_t* unpack<11>(const uint8_t* in, uint64_t* out) { +const uint8_t* ScalarUnpacker::unpack<11>(const uint8_t* in, uint64_t* out) { constexpr uint64_t mask = ((uint64_t{1} << 11) - uint64_t{1}); const auto w0 = LoadInt(in + 0 * 8); @@ -2300,7 +2301,7 @@ const uint8_t* unpack<11>(const uint8_t* in, uint64_t* out) { } template<> -const uint8_t* unpack<12>(const uint8_t* in, uint64_t* out) { +const uint8_t* ScalarUnpacker::unpack<12>(const uint8_t* in, uint64_t* out) { constexpr uint64_t mask = ((uint64_t{1} << 12) - uint64_t{1}); const auto w0 = LoadInt(in + 0 * 8); @@ -2346,7 +2347,7 @@ const uint8_t* unpack<12>(const uint8_t* in, uint64_t* out) { } template<> -const uint8_t* unpack<13>(const uint8_t* in, uint64_t* out) { +const uint8_t* ScalarUnpacker::unpack<13>(const uint8_t* in, uint64_t* out) { constexpr uint64_t mask = ((uint64_t{1} << 13) - uint64_t{1}); const auto w0 = LoadInt(in + 0 * 8); @@ -2393,7 +2394,7 @@ const uint8_t* unpack<13>(const uint8_t* in, uint64_t* out) { } template<> -const uint8_t* unpack<14>(const uint8_t* in, uint64_t* out) { +const uint8_t* ScalarUnpacker::unpack<14>(const uint8_t* in, uint64_t* out) { constexpr uint64_t mask = ((uint64_t{1} << 14) - uint64_t{1}); const auto w0 = LoadInt(in + 0 * 8); @@ -2440,7 +2441,7 @@ const uint8_t* unpack<14>(const uint8_t* in, uint64_t* out) { } template<> -const uint8_t* unpack<15>(const uint8_t* in, uint64_t* out) { +const uint8_t* ScalarUnpacker::unpack<15>(const uint8_t* in, uint64_t* out) { constexpr uint64_t mask = ((uint64_t{1} << 15) - uint64_t{1}); const auto w0 = LoadInt(in + 0 * 8); @@ -2488,7 +2489,7 @@ const uint8_t* unpack<15>(const uint8_t* in, uint64_t* out) { } template<> -const uint8_t* unpack<16>(const uint8_t* in, uint64_t* out) { +const uint8_t* ScalarUnpacker::unpack<16>(const uint8_t* in, uint64_t* out) { constexpr uint64_t mask = ((uint64_t{1} << 16) - uint64_t{1}); const auto w0 = LoadInt(in + 0 * 8); @@ -2536,7 +2537,7 @@ const uint8_t* unpack<16>(const uint8_t* in, uint64_t* out) { } template<> -const uint8_t* unpack<17>(const uint8_t* in, uint64_t* out) { +const uint8_t* ScalarUnpacker::unpack<17>(const uint8_t* in, uint64_t* out) { constexpr uint64_t mask = ((uint64_t{1} << 17) - uint64_t{1}); const auto w0 = LoadInt(in + 0 * 8); @@ -2585,7 +2586,7 @@ const uint8_t* unpack<17>(const uint8_t* in, uint64_t* out) { } template<> -const uint8_t* unpack<18>(const uint8_t* in, uint64_t* out) { +const uint8_t* ScalarUnpacker::unpack<18>(const uint8_t* in, uint64_t* out) { constexpr uint64_t mask = ((uint64_t{1} << 18) - uint64_t{1}); const auto w0 = LoadInt(in + 0 * 8); @@ -2634,7 +2635,7 @@ const uint8_t* unpack<18>(const uint8_t* in, uint64_t* out) { } template<> -const uint8_t* unpack<19>(const uint8_t* in, uint64_t* out) { +const uint8_t* ScalarUnpacker::unpack<19>(const uint8_t* in, uint64_t* out) { constexpr uint64_t mask = ((uint64_t{1} << 19) - uint64_t{1}); const auto w0 = LoadInt(in + 0 * 8); @@ -2684,7 +2685,7 @@ const uint8_t* unpack<19>(const uint8_t* in, uint64_t* out) { } template<> -const uint8_t* unpack<20>(const uint8_t* in, uint64_t* out) { +const uint8_t* ScalarUnpacker::unpack<20>(const uint8_t* in, uint64_t* out) { constexpr uint64_t mask = ((uint64_t{1} << 20) - uint64_t{1}); const auto w0 = LoadInt(in + 0 * 8); @@ -2734,7 +2735,7 @@ const uint8_t* unpack<20>(const uint8_t* in, uint64_t* out) { } template<> -const uint8_t* unpack<21>(const uint8_t* in, uint64_t* out) { +const uint8_t* ScalarUnpacker::unpack<21>(const uint8_t* in, uint64_t* out) { constexpr uint64_t mask = ((uint64_t{1} << 21) - uint64_t{1}); const auto w0 = LoadInt(in + 0 * 8); @@ -2785,7 +2786,7 @@ const uint8_t* unpack<21>(const uint8_t* in, uint64_t* out) { } template<> -const uint8_t* unpack<22>(const uint8_t* in, uint64_t* out) { +const uint8_t* ScalarUnpacker::unpack<22>(const uint8_t* in, uint64_t* out) { constexpr uint64_t mask = ((uint64_t{1} << 22) - uint64_t{1}); const auto w0 = LoadInt(in + 0 * 8); @@ -2836,7 +2837,7 @@ const uint8_t* unpack<22>(const uint8_t* in, uint64_t* out) { } template<> -const uint8_t* unpack<23>(const uint8_t* in, uint64_t* out) { +const uint8_t* ScalarUnpacker::unpack<23>(const uint8_t* in, uint64_t* out) { constexpr uint64_t mask = ((uint64_t{1} << 23) - uint64_t{1}); const auto w0 = LoadInt(in + 0 * 8); @@ -2888,7 +2889,7 @@ const uint8_t* unpack<23>(const uint8_t* in, uint64_t* out) { } template<> -const uint8_t* unpack<24>(const uint8_t* in, uint64_t* out) { +const uint8_t* ScalarUnpacker::unpack<24>(const uint8_t* in, uint64_t* out) { constexpr uint64_t mask = ((uint64_t{1} << 24) - uint64_t{1}); const auto w0 = LoadInt(in + 0 * 8); @@ -2940,7 +2941,7 @@ const uint8_t* unpack<24>(const uint8_t* in, uint64_t* out) { } template<> -const uint8_t* unpack<25>(const uint8_t* in, uint64_t* out) { +const uint8_t* ScalarUnpacker::unpack<25>(const uint8_t* in, uint64_t* out) { constexpr uint64_t mask = ((uint64_t{1} << 25) - uint64_t{1}); const auto w0 = LoadInt(in + 0 * 8); @@ -2993,7 +2994,7 @@ const uint8_t* unpack<25>(const uint8_t* in, uint64_t* out) { } template<> -const uint8_t* unpack<26>(const uint8_t* in, uint64_t* out) { +const uint8_t* ScalarUnpacker::unpack<26>(const uint8_t* in, uint64_t* out) { constexpr uint64_t mask = ((uint64_t{1} << 26) - uint64_t{1}); const auto w0 = LoadInt(in + 0 * 8); @@ -3046,7 +3047,7 @@ const uint8_t* unpack<26>(const uint8_t* in, uint64_t* out) { } template<> -const uint8_t* unpack<27>(const uint8_t* in, uint64_t* out) { +const uint8_t* ScalarUnpacker::unpack<27>(const uint8_t* in, uint64_t* out) { constexpr uint64_t mask = ((uint64_t{1} << 27) - uint64_t{1}); const auto w0 = LoadInt(in + 0 * 8); @@ -3100,7 +3101,7 @@ const uint8_t* unpack<27>(const uint8_t* in, uint64_t* out) { } template<> -const uint8_t* unpack<28>(const uint8_t* in, uint64_t* out) { +const uint8_t* ScalarUnpacker::unpack<28>(const uint8_t* in, uint64_t* out) { constexpr uint64_t mask = ((uint64_t{1} << 28) - uint64_t{1}); const auto w0 = LoadInt(in + 0 * 8); @@ -3154,7 +3155,7 @@ const uint8_t* unpack<28>(const uint8_t* in, uint64_t* out) { } template<> -const uint8_t* unpack<29>(const uint8_t* in, uint64_t* out) { +const uint8_t* ScalarUnpacker::unpack<29>(const uint8_t* in, uint64_t* out) { constexpr uint64_t mask = ((uint64_t{1} << 29) - uint64_t{1}); const auto w0 = LoadInt(in + 0 * 8); @@ -3209,7 +3210,7 @@ const uint8_t* unpack<29>(const uint8_t* in, uint64_t* out) { } template<> -const uint8_t* unpack<30>(const uint8_t* in, uint64_t* out) { +const uint8_t* ScalarUnpacker::unpack<30>(const uint8_t* in, uint64_t* out) { constexpr uint64_t mask = ((uint64_t{1} << 30) - uint64_t{1}); const auto w0 = LoadInt(in + 0 * 8); @@ -3264,7 +3265,7 @@ const uint8_t* unpack<30>(const uint8_t* in, uint64_t* out) { } template<> -const uint8_t* unpack<31>(const uint8_t* in, uint64_t* out) { +const uint8_t* ScalarUnpacker::unpack<31>(const uint8_t* in, uint64_t* out) { constexpr uint64_t mask = ((uint64_t{1} << 31) - uint64_t{1}); const auto w0 = LoadInt(in + 0 * 8); @@ -3320,7 +3321,7 @@ const uint8_t* unpack<31>(const uint8_t* in, uint64_t* out) { } template<> -const uint8_t* unpack<32>(const uint8_t* in, uint64_t* out) { +const uint8_t* ScalarUnpacker::unpack<32>(const uint8_t* in, uint64_t* out) { constexpr uint64_t mask = ((uint64_t{1} << 32) - uint64_t{1}); const auto w0 = LoadInt(in + 0 * 8); @@ -3376,7 +3377,7 @@ const uint8_t* unpack<32>(const uint8_t* in, uint64_t* out) { } template<> -const uint8_t* unpack<33>(const uint8_t* in, uint64_t* out) { +const uint8_t* ScalarUnpacker::unpack<33>(const uint8_t* in, uint64_t* out) { constexpr uint64_t mask = ((uint64_t{1} << 33) - uint64_t{1}); const auto w0 = LoadInt(in + 0 * 8); @@ -3433,7 +3434,7 @@ const uint8_t* unpack<33>(const uint8_t* in, uint64_t* out) { } template<> -const uint8_t* unpack<34>(const uint8_t* in, uint64_t* out) { +const uint8_t* ScalarUnpacker::unpack<34>(const uint8_t* in, uint64_t* out) { constexpr uint64_t mask = ((uint64_t{1} << 34) - uint64_t{1}); const auto w0 = LoadInt(in + 0 * 8); @@ -3490,7 +3491,7 @@ const uint8_t* unpack<34>(const uint8_t* in, uint64_t* out) { } template<> -const uint8_t* unpack<35>(const uint8_t* in, uint64_t* out) { +const uint8_t* ScalarUnpacker::unpack<35>(const uint8_t* in, uint64_t* out) { constexpr uint64_t mask = ((uint64_t{1} << 35) - uint64_t{1}); const auto w0 = LoadInt(in + 0 * 8); @@ -3548,7 +3549,7 @@ const uint8_t* unpack<35>(const uint8_t* in, uint64_t* out) { } template<> -const uint8_t* unpack<36>(const uint8_t* in, uint64_t* out) { +const uint8_t* ScalarUnpacker::unpack<36>(const uint8_t* in, uint64_t* out) { constexpr uint64_t mask = ((uint64_t{1} << 36) - uint64_t{1}); const auto w0 = LoadInt(in + 0 * 8); @@ -3606,7 +3607,7 @@ const uint8_t* unpack<36>(const uint8_t* in, uint64_t* out) { } template<> -const uint8_t* unpack<37>(const uint8_t* in, uint64_t* out) { +const uint8_t* ScalarUnpacker::unpack<37>(const uint8_t* in, uint64_t* out) { constexpr uint64_t mask = ((uint64_t{1} << 37) - uint64_t{1}); const auto w0 = LoadInt(in + 0 * 8); @@ -3665,7 +3666,7 @@ const uint8_t* unpack<37>(const uint8_t* in, uint64_t* out) { } template<> -const uint8_t* unpack<38>(const uint8_t* in, uint64_t* out) { +const uint8_t* ScalarUnpacker::unpack<38>(const uint8_t* in, uint64_t* out) { constexpr uint64_t mask = ((uint64_t{1} << 38) - uint64_t{1}); const auto w0 = LoadInt(in + 0 * 8); @@ -3724,7 +3725,7 @@ const uint8_t* unpack<38>(const uint8_t* in, uint64_t* out) { } template<> -const uint8_t* unpack<39>(const uint8_t* in, uint64_t* out) { +const uint8_t* ScalarUnpacker::unpack<39>(const uint8_t* in, uint64_t* out) { constexpr uint64_t mask = ((uint64_t{1} << 39) - uint64_t{1}); const auto w0 = LoadInt(in + 0 * 8); @@ -3784,7 +3785,7 @@ const uint8_t* unpack<39>(const uint8_t* in, uint64_t* out) { } template<> -const uint8_t* unpack<40>(const uint8_t* in, uint64_t* out) { +const uint8_t* ScalarUnpacker::unpack<40>(const uint8_t* in, uint64_t* out) { constexpr uint64_t mask = ((uint64_t{1} << 40) - uint64_t{1}); const auto w0 = LoadInt(in + 0 * 8); @@ -3844,7 +3845,7 @@ const uint8_t* unpack<40>(const uint8_t* in, uint64_t* out) { } template<> -const uint8_t* unpack<41>(const uint8_t* in, uint64_t* out) { +const uint8_t* ScalarUnpacker::unpack<41>(const uint8_t* in, uint64_t* out) { constexpr uint64_t mask = ((uint64_t{1} << 41) - uint64_t{1}); const auto w0 = LoadInt(in + 0 * 8); @@ -3905,7 +3906,7 @@ const uint8_t* unpack<41>(const uint8_t* in, uint64_t* out) { } template<> -const uint8_t* unpack<42>(const uint8_t* in, uint64_t* out) { +const uint8_t* ScalarUnpacker::unpack<42>(const uint8_t* in, uint64_t* out) { constexpr uint64_t mask = ((uint64_t{1} << 42) - uint64_t{1}); const auto w0 = LoadInt(in + 0 * 8); @@ -3966,7 +3967,7 @@ const uint8_t* unpack<42>(const uint8_t* in, uint64_t* out) { } template<> -const uint8_t* unpack<43>(const uint8_t* in, uint64_t* out) { +const uint8_t* ScalarUnpacker::unpack<43>(const uint8_t* in, uint64_t* out) { constexpr uint64_t mask = ((uint64_t{1} << 43) - uint64_t{1}); const auto w0 = LoadInt(in + 0 * 8); @@ -4028,7 +4029,7 @@ const uint8_t* unpack<43>(const uint8_t* in, uint64_t* out) { } template<> -const uint8_t* unpack<44>(const uint8_t* in, uint64_t* out) { +const uint8_t* ScalarUnpacker::unpack<44>(const uint8_t* in, uint64_t* out) { constexpr uint64_t mask = ((uint64_t{1} << 44) - uint64_t{1}); const auto w0 = LoadInt(in + 0 * 8); @@ -4090,7 +4091,7 @@ const uint8_t* unpack<44>(const uint8_t* in, uint64_t* out) { } template<> -const uint8_t* unpack<45>(const uint8_t* in, uint64_t* out) { +const uint8_t* ScalarUnpacker::unpack<45>(const uint8_t* in, uint64_t* out) { constexpr uint64_t mask = ((uint64_t{1} << 45) - uint64_t{1}); const auto w0 = LoadInt(in + 0 * 8); @@ -4153,7 +4154,7 @@ const uint8_t* unpack<45>(const uint8_t* in, uint64_t* out) { } template<> -const uint8_t* unpack<46>(const uint8_t* in, uint64_t* out) { +const uint8_t* ScalarUnpacker::unpack<46>(const uint8_t* in, uint64_t* out) { constexpr uint64_t mask = ((uint64_t{1} << 46) - uint64_t{1}); const auto w0 = LoadInt(in + 0 * 8); @@ -4216,7 +4217,7 @@ const uint8_t* unpack<46>(const uint8_t* in, uint64_t* out) { } template<> -const uint8_t* unpack<47>(const uint8_t* in, uint64_t* out) { +const uint8_t* ScalarUnpacker::unpack<47>(const uint8_t* in, uint64_t* out) { constexpr uint64_t mask = ((uint64_t{1} << 47) - uint64_t{1}); const auto w0 = LoadInt(in + 0 * 8); @@ -4280,7 +4281,7 @@ const uint8_t* unpack<47>(const uint8_t* in, uint64_t* out) { } template<> -const uint8_t* unpack<48>(const uint8_t* in, uint64_t* out) { +const uint8_t* ScalarUnpacker::unpack<48>(const uint8_t* in, uint64_t* out) { constexpr uint64_t mask = ((uint64_t{1} << 48) - uint64_t{1}); const auto w0 = LoadInt(in + 0 * 8); @@ -4344,7 +4345,7 @@ const uint8_t* unpack<48>(const uint8_t* in, uint64_t* out) { } template<> -const uint8_t* unpack<49>(const uint8_t* in, uint64_t* out) { +const uint8_t* ScalarUnpacker::unpack<49>(const uint8_t* in, uint64_t* out) { constexpr uint64_t mask = ((uint64_t{1} << 49) - uint64_t{1}); const auto w0 = LoadInt(in + 0 * 8); @@ -4409,7 +4410,7 @@ const uint8_t* unpack<49>(const uint8_t* in, uint64_t* out) { } template<> -const uint8_t* unpack<50>(const uint8_t* in, uint64_t* out) { +const uint8_t* ScalarUnpacker::unpack<50>(const uint8_t* in, uint64_t* out) { constexpr uint64_t mask = ((uint64_t{1} << 50) - uint64_t{1}); const auto w0 = LoadInt(in + 0 * 8); @@ -4474,7 +4475,7 @@ const uint8_t* unpack<50>(const uint8_t* in, uint64_t* out) { } template<> -const uint8_t* unpack<51>(const uint8_t* in, uint64_t* out) { +const uint8_t* ScalarUnpacker::unpack<51>(const uint8_t* in, uint64_t* out) { constexpr uint64_t mask = ((uint64_t{1} << 51) - uint64_t{1}); const auto w0 = LoadInt(in + 0 * 8); @@ -4540,7 +4541,7 @@ const uint8_t* unpack<51>(const uint8_t* in, uint64_t* out) { } template<> -const uint8_t* unpack<52>(const uint8_t* in, uint64_t* out) { +const uint8_t* ScalarUnpacker::unpack<52>(const uint8_t* in, uint64_t* out) { constexpr uint64_t mask = ((uint64_t{1} << 52) - uint64_t{1}); const auto w0 = LoadInt(in + 0 * 8); @@ -4606,7 +4607,7 @@ const uint8_t* unpack<52>(const uint8_t* in, uint64_t* out) { } template<> -const uint8_t* unpack<53>(const uint8_t* in, uint64_t* out) { +const uint8_t* ScalarUnpacker::unpack<53>(const uint8_t* in, uint64_t* out) { constexpr uint64_t mask = ((uint64_t{1} << 53) - uint64_t{1}); const auto w0 = LoadInt(in + 0 * 8); @@ -4673,7 +4674,7 @@ const uint8_t* unpack<53>(const uint8_t* in, uint64_t* out) { } template<> -const uint8_t* unpack<54>(const uint8_t* in, uint64_t* out) { +const uint8_t* ScalarUnpacker::unpack<54>(const uint8_t* in, uint64_t* out) { constexpr uint64_t mask = ((uint64_t{1} << 54) - uint64_t{1}); const auto w0 = LoadInt(in + 0 * 8); @@ -4740,7 +4741,7 @@ const uint8_t* unpack<54>(const uint8_t* in, uint64_t* out) { } template<> -const uint8_t* unpack<55>(const uint8_t* in, uint64_t* out) { +const uint8_t* ScalarUnpacker::unpack<55>(const uint8_t* in, uint64_t* out) { constexpr uint64_t mask = ((uint64_t{1} << 55) - uint64_t{1}); const auto w0 = LoadInt(in + 0 * 8); @@ -4808,7 +4809,7 @@ const uint8_t* unpack<55>(const uint8_t* in, uint64_t* out) { } template<> -const uint8_t* unpack<56>(const uint8_t* in, uint64_t* out) { +const uint8_t* ScalarUnpacker::unpack<56>(const uint8_t* in, uint64_t* out) { constexpr uint64_t mask = ((uint64_t{1} << 56) - uint64_t{1}); const auto w0 = LoadInt(in + 0 * 8); @@ -4876,7 +4877,7 @@ const uint8_t* unpack<56>(const uint8_t* in, uint64_t* out) { } template<> -const uint8_t* unpack<57>(const uint8_t* in, uint64_t* out) { +const uint8_t* ScalarUnpacker::unpack<57>(const uint8_t* in, uint64_t* out) { constexpr uint64_t mask = ((uint64_t{1} << 57) - uint64_t{1}); const auto w0 = LoadInt(in + 0 * 8); @@ -4945,7 +4946,7 @@ const uint8_t* unpack<57>(const uint8_t* in, uint64_t* out) { } template<> -const uint8_t* unpack<58>(const uint8_t* in, uint64_t* out) { +const uint8_t* ScalarUnpacker::unpack<58>(const uint8_t* in, uint64_t* out) { constexpr uint64_t mask = ((uint64_t{1} << 58) - uint64_t{1}); const auto w0 = LoadInt(in + 0 * 8); @@ -5014,7 +5015,7 @@ const uint8_t* unpack<58>(const uint8_t* in, uint64_t* out) { } template<> -const uint8_t* unpack<59>(const uint8_t* in, uint64_t* out) { +const uint8_t* ScalarUnpacker::unpack<59>(const uint8_t* in, uint64_t* out) { constexpr uint64_t mask = ((uint64_t{1} << 59) - uint64_t{1}); const auto w0 = LoadInt(in + 0 * 8); @@ -5084,7 +5085,7 @@ const uint8_t* unpack<59>(const uint8_t* in, uint64_t* out) { } template<> -const uint8_t* unpack<60>(const uint8_t* in, uint64_t* out) { +const uint8_t* ScalarUnpacker::unpack<60>(const uint8_t* in, uint64_t* out) { constexpr uint64_t mask = ((uint64_t{1} << 60) - uint64_t{1}); const auto w0 = LoadInt(in + 0 * 8); @@ -5154,7 +5155,7 @@ const uint8_t* unpack<60>(const uint8_t* in, uint64_t* out) { } template<> -const uint8_t* unpack<61>(const uint8_t* in, uint64_t* out) { +const uint8_t* ScalarUnpacker::unpack<61>(const uint8_t* in, uint64_t* out) { constexpr uint64_t mask = ((uint64_t{1} << 61) - uint64_t{1}); const auto w0 = LoadInt(in + 0 * 8); @@ -5225,7 +5226,7 @@ const uint8_t* unpack<61>(const uint8_t* in, uint64_t* out) { } template<> -const uint8_t* unpack<62>(const uint8_t* in, uint64_t* out) { +const uint8_t* ScalarUnpacker::unpack<62>(const uint8_t* in, uint64_t* out) { constexpr uint64_t mask = ((uint64_t{1} << 62) - uint64_t{1}); const auto w0 = LoadInt(in + 0 * 8); @@ -5296,7 +5297,7 @@ const uint8_t* unpack<62>(const uint8_t* in, uint64_t* out) { } template<> -const uint8_t* unpack<63>(const uint8_t* in, uint64_t* out) { +const uint8_t* ScalarUnpacker::unpack<63>(const uint8_t* in, uint64_t* out) { constexpr uint64_t mask = ((uint64_t{1} << 63) - uint64_t{1}); const auto w0 = LoadInt(in + 0 * 8); @@ -5368,13 +5369,12 @@ const uint8_t* unpack<63>(const uint8_t* in, uint64_t* out) { } template<> -const uint8_t* unpack<64>(const uint8_t* in, uint64_t* out) { +const uint8_t* ScalarUnpacker::unpack<64>(const uint8_t* in, uint64_t* out) { for(int k = 0; k < 32; k += 1) { out[k] = LoadInt(in + (k * 8)); } return in + (8 * 32); } -}; // struct } // namespace arrow::internal diff --git a/cpp/src/arrow/util/bpacking_simd128_generated_internal.h b/cpp/src/arrow/util/bpacking_simd128_generated_internal.h index 49a8dd277e7..b35d8f80923 100644 --- a/cpp/src/arrow/util/bpacking_simd128_generated_internal.h +++ b/cpp/src/arrow/util/bpacking_simd128_generated_internal.h @@ -45,16 +45,17 @@ static constexpr int kValuesUnpacked = 16; template static const uint8_t* unpack(const uint8_t* in, uint16_t* out); +}; template<> -const uint8_t* unpack<0>(const uint8_t* in, uint16_t* out) { +const uint8_t* Simd128Unpacker::unpack<0>(const uint8_t* in, uint16_t* out) { std::memset(out, 0x0, 16 * sizeof(*out)); out += 16; return in; } template<> -const uint8_t* unpack<1>(const uint8_t* in, uint16_t* out) { +const uint8_t* Simd128Unpacker::unpack<1>(const uint8_t* in, uint16_t* out) { constexpr uint16_t kMask = 0x1; simd_batch masks(kMask); @@ -98,7 +99,7 @@ const uint8_t* unpack<1>(const uint8_t* in, uint16_t* out) { } template<> -const uint8_t* unpack<2>(const uint8_t* in, uint16_t* out) { +const uint8_t* Simd128Unpacker::unpack<2>(const uint8_t* in, uint16_t* out) { constexpr uint16_t kMask = 0x3; simd_batch masks(kMask); @@ -142,7 +143,7 @@ const uint8_t* unpack<2>(const uint8_t* in, uint16_t* out) { } template<> -const uint8_t* unpack<3>(const uint8_t* in, uint16_t* out) { +const uint8_t* Simd128Unpacker::unpack<3>(const uint8_t* in, uint16_t* out) { constexpr uint16_t kMask = 0x7; simd_batch masks(kMask); @@ -186,7 +187,7 @@ const uint8_t* unpack<3>(const uint8_t* in, uint16_t* out) { } template<> -const uint8_t* unpack<4>(const uint8_t* in, uint16_t* out) { +const uint8_t* Simd128Unpacker::unpack<4>(const uint8_t* in, uint16_t* out) { constexpr uint16_t kMask = 0xf; simd_batch masks(kMask); @@ -230,7 +231,7 @@ const uint8_t* unpack<4>(const uint8_t* in, uint16_t* out) { } template<> -const uint8_t* unpack<5>(const uint8_t* in, uint16_t* out) { +const uint8_t* Simd128Unpacker::unpack<5>(const uint8_t* in, uint16_t* out) { constexpr uint16_t kMask = 0x1f; simd_batch masks(kMask); @@ -274,7 +275,7 @@ const uint8_t* unpack<5>(const uint8_t* in, uint16_t* out) { } template<> -const uint8_t* unpack<6>(const uint8_t* in, uint16_t* out) { +const uint8_t* Simd128Unpacker::unpack<6>(const uint8_t* in, uint16_t* out) { constexpr uint16_t kMask = 0x3f; simd_batch masks(kMask); @@ -318,7 +319,7 @@ const uint8_t* unpack<6>(const uint8_t* in, uint16_t* out) { } template<> -const uint8_t* unpack<7>(const uint8_t* in, uint16_t* out) { +const uint8_t* Simd128Unpacker::unpack<7>(const uint8_t* in, uint16_t* out) { constexpr uint16_t kMask = 0x7f; simd_batch masks(kMask); @@ -362,7 +363,7 @@ const uint8_t* unpack<7>(const uint8_t* in, uint16_t* out) { } template<> -const uint8_t* unpack<8>(const uint8_t* in, uint16_t* out) { +const uint8_t* Simd128Unpacker::unpack<8>(const uint8_t* in, uint16_t* out) { constexpr uint16_t kMask = 0xff; simd_batch masks(kMask); @@ -406,7 +407,7 @@ const uint8_t* unpack<8>(const uint8_t* in, uint16_t* out) { } template<> -const uint8_t* unpack<9>(const uint8_t* in, uint16_t* out) { +const uint8_t* Simd128Unpacker::unpack<9>(const uint8_t* in, uint16_t* out) { constexpr uint16_t kMask = 0x1ff; simd_batch masks(kMask); @@ -450,7 +451,7 @@ const uint8_t* unpack<9>(const uint8_t* in, uint16_t* out) { } template<> -const uint8_t* unpack<10>(const uint8_t* in, uint16_t* out) { +const uint8_t* Simd128Unpacker::unpack<10>(const uint8_t* in, uint16_t* out) { constexpr uint16_t kMask = 0x3ff; simd_batch masks(kMask); @@ -494,7 +495,7 @@ const uint8_t* unpack<10>(const uint8_t* in, uint16_t* out) { } template<> -const uint8_t* unpack<11>(const uint8_t* in, uint16_t* out) { +const uint8_t* Simd128Unpacker::unpack<11>(const uint8_t* in, uint16_t* out) { constexpr uint16_t kMask = 0x7ff; simd_batch masks(kMask); @@ -538,7 +539,7 @@ const uint8_t* unpack<11>(const uint8_t* in, uint16_t* out) { } template<> -const uint8_t* unpack<12>(const uint8_t* in, uint16_t* out) { +const uint8_t* Simd128Unpacker::unpack<12>(const uint8_t* in, uint16_t* out) { constexpr uint16_t kMask = 0xfff; simd_batch masks(kMask); @@ -582,7 +583,7 @@ const uint8_t* unpack<12>(const uint8_t* in, uint16_t* out) { } template<> -const uint8_t* unpack<13>(const uint8_t* in, uint16_t* out) { +const uint8_t* Simd128Unpacker::unpack<13>(const uint8_t* in, uint16_t* out) { constexpr uint16_t kMask = 0x1fff; simd_batch masks(kMask); @@ -626,7 +627,7 @@ const uint8_t* unpack<13>(const uint8_t* in, uint16_t* out) { } template<> -const uint8_t* unpack<14>(const uint8_t* in, uint16_t* out) { +const uint8_t* Simd128Unpacker::unpack<14>(const uint8_t* in, uint16_t* out) { constexpr uint16_t kMask = 0x3fff; simd_batch masks(kMask); @@ -670,7 +671,7 @@ const uint8_t* unpack<14>(const uint8_t* in, uint16_t* out) { } template<> -const uint8_t* unpack<15>(const uint8_t* in, uint16_t* out) { +const uint8_t* Simd128Unpacker::unpack<15>(const uint8_t* in, uint16_t* out) { constexpr uint16_t kMask = 0x7fff; simd_batch masks(kMask); @@ -714,91 +715,142 @@ const uint8_t* unpack<15>(const uint8_t* in, uint16_t* out) { } template<> -const uint8_t* unpack<16>(const uint8_t* in, uint16_t* out) { +const uint8_t* Simd128Unpacker::unpack<16>(const uint8_t* in, uint16_t* out) { std::memcpy(out, in, 16 * sizeof(*out)); in += 2 * 16; out += 16; return in; } -}; // struct Unpacker template<> struct Simd128Unpacker { using out_type = uint32_t; using simd_batch = xsimd::make_sized_batch_t; -template -using simd_batch_constants = - xsimd::batch_constant; -using simd_bytes = - xsimd::make_sized_batch_t; -template -using simd_bytes_constants = - xsimd::batch_constant; static constexpr int kValuesUnpacked = 32; template static const uint8_t* unpack(const uint8_t* in, uint32_t* out); +}; template<> -const uint8_t* unpack<0>(const uint8_t* in, uint32_t* out) { +const uint8_t* Simd128Unpacker::unpack<0>(const uint8_t* in, uint32_t* out) { std::memset(out, 0x0, 32 * sizeof(*out)); out += 32; return in; } -template -struct Reorder { - static constexpr unsigned get(unsigned i, unsigned n) { - if (i % 4 == 0) { - return K; - } - return 128; - } -}; - -template <> -const uint8_t* unpack<1>(const uint8_t* in, uint32_t* out) { +template<> +const uint8_t* Simd128Unpacker::unpack<1>(const uint8_t* in, uint32_t* out) { constexpr uint32_t kMask = 0x1; - constexpr auto kShifts1 = simd_batch_constants<0, 1, 2, 3>{}; - constexpr auto kShifts2 = simd_batch_constants<4, 5, 6, 7>{}; - - { - auto bytes = simd_bytes::load_unaligned(in + 4 * 0); - // TODO var shifts no avail on SSE - { - constexpr auto kReorder = xsimd::make_batch_constant>(); - auto numbers = xsimd::bitwise_cast(xsimd::swizzle(bytes, kReorder)); - ((numbers >> kShifts1) & kMask).store_unaligned(out + 0 * 4); - ((numbers >> kShifts2) & kMask).store_unaligned(out + 1 * 4); - } - { - constexpr auto kReorder = xsimd::make_batch_constant>(); - auto numbers = xsimd::bitwise_cast(xsimd::swizzle(bytes, kReorder)); - ((numbers >> kShifts1) & kMask).store_unaligned(out + 2 * 4); - ((numbers >> kShifts2) & kMask).store_unaligned(out + 3 * 4); - } - { - constexpr auto kReorder = xsimd::make_batch_constant>(); - auto numbers = xsimd::bitwise_cast(xsimd::swizzle(bytes, kReorder)); - ((numbers >> kShifts1) & kMask).store_unaligned(out + 4 * 4); - ((numbers >> kShifts2) & kMask).store_unaligned(out + 5 * 4); - } - { - constexpr auto kReorder = xsimd::make_batch_constant>(); - auto numbers = xsimd::bitwise_cast(xsimd::swizzle(bytes, kReorder)); - ((numbers >> kShifts1) & kMask).store_unaligned(out + 6 * 4); - ((numbers >> kShifts2) & kMask).store_unaligned(out + 7 * 4); - } - } - - return in + 4 ; + simd_batch masks(kMask); + simd_batch words, shifts; + simd_batch results; + + // extract 1-bit bundles 0 to 3 + words = simd_batch{ + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0), + }; + shifts = simd_batch{ 0, 1, 2, 3 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 4; + + // extract 1-bit bundles 4 to 7 + words = simd_batch{ + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0), + }; + shifts = simd_batch{ 4, 5, 6, 7 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 4; + + // extract 1-bit bundles 8 to 11 + words = simd_batch{ + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0), + }; + shifts = simd_batch{ 8, 9, 10, 11 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 4; + + // extract 1-bit bundles 12 to 15 + words = simd_batch{ + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0), + }; + shifts = simd_batch{ 12, 13, 14, 15 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 4; + + // extract 1-bit bundles 16 to 19 + words = simd_batch{ + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0), + }; + shifts = simd_batch{ 16, 17, 18, 19 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 4; + + // extract 1-bit bundles 20 to 23 + words = simd_batch{ + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0), + }; + shifts = simd_batch{ 20, 21, 22, 23 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 4; + + // extract 1-bit bundles 24 to 27 + words = simd_batch{ + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0), + }; + shifts = simd_batch{ 24, 25, 26, 27 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 4; + + // extract 1-bit bundles 28 to 31 + words = simd_batch{ + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0), + }; + shifts = simd_batch{ 28, 29, 30, 31 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 4; + + in += 1 * 4; + return in; } template<> -const uint8_t* unpack<2>(const uint8_t* in, uint32_t* out) { +const uint8_t* Simd128Unpacker::unpack<2>(const uint8_t* in, uint32_t* out) { constexpr uint32_t kMask = 0x3; simd_batch masks(kMask); @@ -906,7 +958,7 @@ const uint8_t* unpack<2>(const uint8_t* in, uint32_t* out) { } template<> -const uint8_t* unpack<3>(const uint8_t* in, uint32_t* out) { +const uint8_t* Simd128Unpacker::unpack<3>(const uint8_t* in, uint32_t* out) { constexpr uint32_t kMask = 0x7; simd_batch masks(kMask); @@ -1014,7 +1066,7 @@ const uint8_t* unpack<3>(const uint8_t* in, uint32_t* out) { } template<> -const uint8_t* unpack<4>(const uint8_t* in, uint32_t* out) { +const uint8_t* Simd128Unpacker::unpack<4>(const uint8_t* in, uint32_t* out) { constexpr uint32_t kMask = 0xf; simd_batch masks(kMask); @@ -1122,7 +1174,7 @@ const uint8_t* unpack<4>(const uint8_t* in, uint32_t* out) { } template<> -const uint8_t* unpack<5>(const uint8_t* in, uint32_t* out) { +const uint8_t* Simd128Unpacker::unpack<5>(const uint8_t* in, uint32_t* out) { constexpr uint32_t kMask = 0x1f; simd_batch masks(kMask); @@ -1230,7 +1282,7 @@ const uint8_t* unpack<5>(const uint8_t* in, uint32_t* out) { } template<> -const uint8_t* unpack<6>(const uint8_t* in, uint32_t* out) { +const uint8_t* Simd128Unpacker::unpack<6>(const uint8_t* in, uint32_t* out) { constexpr uint32_t kMask = 0x3f; simd_batch masks(kMask); @@ -1338,7 +1390,7 @@ const uint8_t* unpack<6>(const uint8_t* in, uint32_t* out) { } template<> -const uint8_t* unpack<7>(const uint8_t* in, uint32_t* out) { +const uint8_t* Simd128Unpacker::unpack<7>(const uint8_t* in, uint32_t* out) { constexpr uint32_t kMask = 0x7f; simd_batch masks(kMask); @@ -1446,7 +1498,7 @@ const uint8_t* unpack<7>(const uint8_t* in, uint32_t* out) { } template<> -const uint8_t* unpack<8>(const uint8_t* in, uint32_t* out) { +const uint8_t* Simd128Unpacker::unpack<8>(const uint8_t* in, uint32_t* out) { constexpr uint32_t kMask = 0xff; simd_batch masks(kMask); @@ -1554,7 +1606,7 @@ const uint8_t* unpack<8>(const uint8_t* in, uint32_t* out) { } template<> -const uint8_t* unpack<9>(const uint8_t* in, uint32_t* out) { +const uint8_t* Simd128Unpacker::unpack<9>(const uint8_t* in, uint32_t* out) { constexpr uint32_t kMask = 0x1ff; simd_batch masks(kMask); @@ -1662,7 +1714,7 @@ const uint8_t* unpack<9>(const uint8_t* in, uint32_t* out) { } template<> -const uint8_t* unpack<10>(const uint8_t* in, uint32_t* out) { +const uint8_t* Simd128Unpacker::unpack<10>(const uint8_t* in, uint32_t* out) { constexpr uint32_t kMask = 0x3ff; simd_batch masks(kMask); @@ -1770,7 +1822,7 @@ const uint8_t* unpack<10>(const uint8_t* in, uint32_t* out) { } template<> -const uint8_t* unpack<11>(const uint8_t* in, uint32_t* out) { +const uint8_t* Simd128Unpacker::unpack<11>(const uint8_t* in, uint32_t* out) { constexpr uint32_t kMask = 0x7ff; simd_batch masks(kMask); @@ -1878,7 +1930,7 @@ const uint8_t* unpack<11>(const uint8_t* in, uint32_t* out) { } template<> -const uint8_t* unpack<12>(const uint8_t* in, uint32_t* out) { +const uint8_t* Simd128Unpacker::unpack<12>(const uint8_t* in, uint32_t* out) { constexpr uint32_t kMask = 0xfff; simd_batch masks(kMask); @@ -1986,7 +2038,7 @@ const uint8_t* unpack<12>(const uint8_t* in, uint32_t* out) { } template<> -const uint8_t* unpack<13>(const uint8_t* in, uint32_t* out) { +const uint8_t* Simd128Unpacker::unpack<13>(const uint8_t* in, uint32_t* out) { constexpr uint32_t kMask = 0x1fff; simd_batch masks(kMask); @@ -2094,7 +2146,7 @@ const uint8_t* unpack<13>(const uint8_t* in, uint32_t* out) { } template<> -const uint8_t* unpack<14>(const uint8_t* in, uint32_t* out) { +const uint8_t* Simd128Unpacker::unpack<14>(const uint8_t* in, uint32_t* out) { constexpr uint32_t kMask = 0x3fff; simd_batch masks(kMask); @@ -2202,7 +2254,7 @@ const uint8_t* unpack<14>(const uint8_t* in, uint32_t* out) { } template<> -const uint8_t* unpack<15>(const uint8_t* in, uint32_t* out) { +const uint8_t* Simd128Unpacker::unpack<15>(const uint8_t* in, uint32_t* out) { constexpr uint32_t kMask = 0x7fff; simd_batch masks(kMask); @@ -2310,7 +2362,7 @@ const uint8_t* unpack<15>(const uint8_t* in, uint32_t* out) { } template<> -const uint8_t* unpack<16>(const uint8_t* in, uint32_t* out) { +const uint8_t* Simd128Unpacker::unpack<16>(const uint8_t* in, uint32_t* out) { constexpr uint32_t kMask = 0xffff; simd_batch masks(kMask); @@ -2418,7 +2470,7 @@ const uint8_t* unpack<16>(const uint8_t* in, uint32_t* out) { } template<> -const uint8_t* unpack<17>(const uint8_t* in, uint32_t* out) { +const uint8_t* Simd128Unpacker::unpack<17>(const uint8_t* in, uint32_t* out) { constexpr uint32_t kMask = 0x1ffff; simd_batch masks(kMask); @@ -2526,7 +2578,7 @@ const uint8_t* unpack<17>(const uint8_t* in, uint32_t* out) { } template<> -const uint8_t* unpack<18>(const uint8_t* in, uint32_t* out) { +const uint8_t* Simd128Unpacker::unpack<18>(const uint8_t* in, uint32_t* out) { constexpr uint32_t kMask = 0x3ffff; simd_batch masks(kMask); @@ -2634,7 +2686,7 @@ const uint8_t* unpack<18>(const uint8_t* in, uint32_t* out) { } template<> -const uint8_t* unpack<19>(const uint8_t* in, uint32_t* out) { +const uint8_t* Simd128Unpacker::unpack<19>(const uint8_t* in, uint32_t* out) { constexpr uint32_t kMask = 0x7ffff; simd_batch masks(kMask); @@ -2742,7 +2794,7 @@ const uint8_t* unpack<19>(const uint8_t* in, uint32_t* out) { } template<> -const uint8_t* unpack<20>(const uint8_t* in, uint32_t* out) { +const uint8_t* Simd128Unpacker::unpack<20>(const uint8_t* in, uint32_t* out) { constexpr uint32_t kMask = 0xfffff; simd_batch masks(kMask); @@ -2850,7 +2902,7 @@ const uint8_t* unpack<20>(const uint8_t* in, uint32_t* out) { } template<> -const uint8_t* unpack<21>(const uint8_t* in, uint32_t* out) { +const uint8_t* Simd128Unpacker::unpack<21>(const uint8_t* in, uint32_t* out) { constexpr uint32_t kMask = 0x1fffff; simd_batch masks(kMask); @@ -2958,7 +3010,7 @@ const uint8_t* unpack<21>(const uint8_t* in, uint32_t* out) { } template<> -const uint8_t* unpack<22>(const uint8_t* in, uint32_t* out) { +const uint8_t* Simd128Unpacker::unpack<22>(const uint8_t* in, uint32_t* out) { constexpr uint32_t kMask = 0x3fffff; simd_batch masks(kMask); @@ -3066,7 +3118,7 @@ const uint8_t* unpack<22>(const uint8_t* in, uint32_t* out) { } template<> -const uint8_t* unpack<23>(const uint8_t* in, uint32_t* out) { +const uint8_t* Simd128Unpacker::unpack<23>(const uint8_t* in, uint32_t* out) { constexpr uint32_t kMask = 0x7fffff; simd_batch masks(kMask); @@ -3174,7 +3226,7 @@ const uint8_t* unpack<23>(const uint8_t* in, uint32_t* out) { } template<> -const uint8_t* unpack<24>(const uint8_t* in, uint32_t* out) { +const uint8_t* Simd128Unpacker::unpack<24>(const uint8_t* in, uint32_t* out) { constexpr uint32_t kMask = 0xffffff; simd_batch masks(kMask); @@ -3282,7 +3334,7 @@ const uint8_t* unpack<24>(const uint8_t* in, uint32_t* out) { } template<> -const uint8_t* unpack<25>(const uint8_t* in, uint32_t* out) { +const uint8_t* Simd128Unpacker::unpack<25>(const uint8_t* in, uint32_t* out) { constexpr uint32_t kMask = 0x1ffffff; simd_batch masks(kMask); @@ -3390,7 +3442,7 @@ const uint8_t* unpack<25>(const uint8_t* in, uint32_t* out) { } template<> -const uint8_t* unpack<26>(const uint8_t* in, uint32_t* out) { +const uint8_t* Simd128Unpacker::unpack<26>(const uint8_t* in, uint32_t* out) { constexpr uint32_t kMask = 0x3ffffff; simd_batch masks(kMask); @@ -3498,7 +3550,7 @@ const uint8_t* unpack<26>(const uint8_t* in, uint32_t* out) { } template<> -const uint8_t* unpack<27>(const uint8_t* in, uint32_t* out) { +const uint8_t* Simd128Unpacker::unpack<27>(const uint8_t* in, uint32_t* out) { constexpr uint32_t kMask = 0x7ffffff; simd_batch masks(kMask); @@ -3606,7 +3658,7 @@ const uint8_t* unpack<27>(const uint8_t* in, uint32_t* out) { } template<> -const uint8_t* unpack<28>(const uint8_t* in, uint32_t* out) { +const uint8_t* Simd128Unpacker::unpack<28>(const uint8_t* in, uint32_t* out) { constexpr uint32_t kMask = 0xfffffff; simd_batch masks(kMask); @@ -3714,7 +3766,7 @@ const uint8_t* unpack<28>(const uint8_t* in, uint32_t* out) { } template<> -const uint8_t* unpack<29>(const uint8_t* in, uint32_t* out) { +const uint8_t* Simd128Unpacker::unpack<29>(const uint8_t* in, uint32_t* out) { constexpr uint32_t kMask = 0x1fffffff; simd_batch masks(kMask); @@ -3822,7 +3874,7 @@ const uint8_t* unpack<29>(const uint8_t* in, uint32_t* out) { } template<> -const uint8_t* unpack<30>(const uint8_t* in, uint32_t* out) { +const uint8_t* Simd128Unpacker::unpack<30>(const uint8_t* in, uint32_t* out) { constexpr uint32_t kMask = 0x3fffffff; simd_batch masks(kMask); @@ -3930,7 +3982,7 @@ const uint8_t* unpack<30>(const uint8_t* in, uint32_t* out) { } template<> -const uint8_t* unpack<31>(const uint8_t* in, uint32_t* out) { +const uint8_t* Simd128Unpacker::unpack<31>(const uint8_t* in, uint32_t* out) { constexpr uint32_t kMask = 0x7fffffff; simd_batch masks(kMask); @@ -4038,12 +4090,11 @@ const uint8_t* unpack<31>(const uint8_t* in, uint32_t* out) { } template<> -const uint8_t* unpack<32>(const uint8_t* in, uint32_t* out) { +const uint8_t* Simd128Unpacker::unpack<32>(const uint8_t* in, uint32_t* out) { std::memcpy(out, in, 32 * sizeof(*out)); in += 4 * 32; out += 32; return in; } -}; // struct Unpacker } // namespace arrow::internal diff --git a/cpp/src/arrow/util/bpacking_simd256_generated_internal.h b/cpp/src/arrow/util/bpacking_simd256_generated_internal.h index 3d7daafe35c..1542fa8cb8d 100644 --- a/cpp/src/arrow/util/bpacking_simd256_generated_internal.h +++ b/cpp/src/arrow/util/bpacking_simd256_generated_internal.h @@ -45,16 +45,17 @@ static constexpr int kValuesUnpacked = 16; template static const uint8_t* unpack(const uint8_t* in, uint16_t* out); +}; template<> -const uint8_t* unpack<0>(const uint8_t* in, uint16_t* out) { +const uint8_t* Simd256Unpacker::unpack<0>(const uint8_t* in, uint16_t* out) { std::memset(out, 0x0, 16 * sizeof(*out)); out += 16; return in; } template<> -const uint8_t* unpack<1>(const uint8_t* in, uint16_t* out) { +const uint8_t* Simd256Unpacker::unpack<1>(const uint8_t* in, uint16_t* out) { constexpr uint16_t kMask = 0x1; simd_batch masks(kMask); @@ -90,7 +91,7 @@ const uint8_t* unpack<1>(const uint8_t* in, uint16_t* out) { } template<> -const uint8_t* unpack<2>(const uint8_t* in, uint16_t* out) { +const uint8_t* Simd256Unpacker::unpack<2>(const uint8_t* in, uint16_t* out) { constexpr uint16_t kMask = 0x3; simd_batch masks(kMask); @@ -126,7 +127,7 @@ const uint8_t* unpack<2>(const uint8_t* in, uint16_t* out) { } template<> -const uint8_t* unpack<3>(const uint8_t* in, uint16_t* out) { +const uint8_t* Simd256Unpacker::unpack<3>(const uint8_t* in, uint16_t* out) { constexpr uint16_t kMask = 0x7; simd_batch masks(kMask); @@ -162,7 +163,7 @@ const uint8_t* unpack<3>(const uint8_t* in, uint16_t* out) { } template<> -const uint8_t* unpack<4>(const uint8_t* in, uint16_t* out) { +const uint8_t* Simd256Unpacker::unpack<4>(const uint8_t* in, uint16_t* out) { constexpr uint16_t kMask = 0xf; simd_batch masks(kMask); @@ -198,7 +199,7 @@ const uint8_t* unpack<4>(const uint8_t* in, uint16_t* out) { } template<> -const uint8_t* unpack<5>(const uint8_t* in, uint16_t* out) { +const uint8_t* Simd256Unpacker::unpack<5>(const uint8_t* in, uint16_t* out) { constexpr uint16_t kMask = 0x1f; simd_batch masks(kMask); @@ -234,7 +235,7 @@ const uint8_t* unpack<5>(const uint8_t* in, uint16_t* out) { } template<> -const uint8_t* unpack<6>(const uint8_t* in, uint16_t* out) { +const uint8_t* Simd256Unpacker::unpack<6>(const uint8_t* in, uint16_t* out) { constexpr uint16_t kMask = 0x3f; simd_batch masks(kMask); @@ -270,7 +271,7 @@ const uint8_t* unpack<6>(const uint8_t* in, uint16_t* out) { } template<> -const uint8_t* unpack<7>(const uint8_t* in, uint16_t* out) { +const uint8_t* Simd256Unpacker::unpack<7>(const uint8_t* in, uint16_t* out) { constexpr uint16_t kMask = 0x7f; simd_batch masks(kMask); @@ -306,7 +307,7 @@ const uint8_t* unpack<7>(const uint8_t* in, uint16_t* out) { } template<> -const uint8_t* unpack<8>(const uint8_t* in, uint16_t* out) { +const uint8_t* Simd256Unpacker::unpack<8>(const uint8_t* in, uint16_t* out) { constexpr uint16_t kMask = 0xff; simd_batch masks(kMask); @@ -342,7 +343,7 @@ const uint8_t* unpack<8>(const uint8_t* in, uint16_t* out) { } template<> -const uint8_t* unpack<9>(const uint8_t* in, uint16_t* out) { +const uint8_t* Simd256Unpacker::unpack<9>(const uint8_t* in, uint16_t* out) { constexpr uint16_t kMask = 0x1ff; simd_batch masks(kMask); @@ -378,7 +379,7 @@ const uint8_t* unpack<9>(const uint8_t* in, uint16_t* out) { } template<> -const uint8_t* unpack<10>(const uint8_t* in, uint16_t* out) { +const uint8_t* Simd256Unpacker::unpack<10>(const uint8_t* in, uint16_t* out) { constexpr uint16_t kMask = 0x3ff; simd_batch masks(kMask); @@ -414,7 +415,7 @@ const uint8_t* unpack<10>(const uint8_t* in, uint16_t* out) { } template<> -const uint8_t* unpack<11>(const uint8_t* in, uint16_t* out) { +const uint8_t* Simd256Unpacker::unpack<11>(const uint8_t* in, uint16_t* out) { constexpr uint16_t kMask = 0x7ff; simd_batch masks(kMask); @@ -450,7 +451,7 @@ const uint8_t* unpack<11>(const uint8_t* in, uint16_t* out) { } template<> -const uint8_t* unpack<12>(const uint8_t* in, uint16_t* out) { +const uint8_t* Simd256Unpacker::unpack<12>(const uint8_t* in, uint16_t* out) { constexpr uint16_t kMask = 0xfff; simd_batch masks(kMask); @@ -486,7 +487,7 @@ const uint8_t* unpack<12>(const uint8_t* in, uint16_t* out) { } template<> -const uint8_t* unpack<13>(const uint8_t* in, uint16_t* out) { +const uint8_t* Simd256Unpacker::unpack<13>(const uint8_t* in, uint16_t* out) { constexpr uint16_t kMask = 0x1fff; simd_batch masks(kMask); @@ -522,7 +523,7 @@ const uint8_t* unpack<13>(const uint8_t* in, uint16_t* out) { } template<> -const uint8_t* unpack<14>(const uint8_t* in, uint16_t* out) { +const uint8_t* Simd256Unpacker::unpack<14>(const uint8_t* in, uint16_t* out) { constexpr uint16_t kMask = 0x3fff; simd_batch masks(kMask); @@ -558,7 +559,7 @@ const uint8_t* unpack<14>(const uint8_t* in, uint16_t* out) { } template<> -const uint8_t* unpack<15>(const uint8_t* in, uint16_t* out) { +const uint8_t* Simd256Unpacker::unpack<15>(const uint8_t* in, uint16_t* out) { constexpr uint16_t kMask = 0x7fff; simd_batch masks(kMask); @@ -594,86 +595,110 @@ const uint8_t* unpack<15>(const uint8_t* in, uint16_t* out) { } template<> -const uint8_t* unpack<16>(const uint8_t* in, uint16_t* out) { +const uint8_t* Simd256Unpacker::unpack<16>(const uint8_t* in, uint16_t* out) { std::memcpy(out, in, 16 * sizeof(*out)); in += 2 * 16; out += 16; return in; } -}; // struct Unpacker template<> struct Simd256Unpacker { using out_type = uint32_t; using simd_batch = xsimd::make_sized_batch_t; -template -using simd_batch_constants = - xsimd::batch_constant; -using simd_bytes = - xsimd::make_sized_batch_t; -template -using simd_bytes_constants = - xsimd::batch_constant; - -template -struct Reorder { - static constexpr unsigned get(unsigned i, unsigned n) { - if (i % 4 == 0) { - return K; - } - return 128; - } -}; static constexpr int kValuesUnpacked = 32; template static const uint8_t* unpack(const uint8_t* in, uint32_t* out); +}; template<> -const uint8_t* unpack<0>(const uint8_t* in, uint32_t* out) { +const uint8_t* Simd256Unpacker::unpack<0>(const uint8_t* in, uint32_t* out) { std::memset(out, 0x0, 32 * sizeof(*out)); out += 32; return in; } template<> -const uint8_t* unpack<1>(const uint8_t* in, uint32_t* out) { +const uint8_t* Simd256Unpacker::unpack<1>(const uint8_t* in, uint32_t* out) { constexpr uint32_t kMask = 0x1; - constexpr auto kShifts1 = simd_batch_constants<0, 1, 2, 3, 4, 5, 6, 7>{}; - - { - auto bytes = simd_bytes::load_unaligned(in + 4 * 0); - // TODO var shifts no avail on SSE - { - constexpr auto kReorder = xsimd::make_batch_constant>(); - auto numbers = xsimd::bitwise_cast(xsimd::swizzle(bytes, kReorder)); - ((numbers >> kShifts1) & kMask).store_unaligned(out + 0 * 8); - } - { - constexpr auto kReorder = xsimd::make_batch_constant>(); - auto numbers = xsimd::bitwise_cast(xsimd::swizzle(bytes, kReorder)); - ((numbers >> kShifts1) & kMask).store_unaligned(out + 1 * 8); - } - { - constexpr auto kReorder = xsimd::make_batch_constant>(); - auto numbers = xsimd::bitwise_cast(xsimd::swizzle(bytes, kReorder)); - ((numbers >> kShifts1) & kMask).store_unaligned(out + 2 * 8); - } - { - constexpr auto kReorder = xsimd::make_batch_constant>(); - auto numbers = xsimd::bitwise_cast(xsimd::swizzle(bytes, kReorder)); - ((numbers >> kShifts1) & kMask).store_unaligned(out + 3 * 8); - } - } - - return in + 4 ; + simd_batch masks(kMask); + simd_batch words, shifts; + simd_batch results; + + // extract 1-bit bundles 0 to 7 + words = simd_batch{ + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0), + }; + shifts = simd_batch{ 0, 1, 2, 3, 4, 5, 6, 7 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 8; + + // extract 1-bit bundles 8 to 15 + words = simd_batch{ + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0), + }; + shifts = simd_batch{ 8, 9, 10, 11, 12, 13, 14, 15 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 8; + + // extract 1-bit bundles 16 to 23 + words = simd_batch{ + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0), + }; + shifts = simd_batch{ 16, 17, 18, 19, 20, 21, 22, 23 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 8; + + // extract 1-bit bundles 24 to 31 + words = simd_batch{ + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0), + }; + shifts = simd_batch{ 24, 25, 26, 27, 28, 29, 30, 31 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 8; + + in += 1 * 4; + return in; } template<> -const uint8_t* unpack<2>(const uint8_t* in, uint32_t* out) { +const uint8_t* Simd256Unpacker::unpack<2>(const uint8_t* in, uint32_t* out) { constexpr uint32_t kMask = 0x3; simd_batch masks(kMask); @@ -749,7 +774,7 @@ const uint8_t* unpack<2>(const uint8_t* in, uint32_t* out) { } template<> -const uint8_t* unpack<3>(const uint8_t* in, uint32_t* out) { +const uint8_t* Simd256Unpacker::unpack<3>(const uint8_t* in, uint32_t* out) { constexpr uint32_t kMask = 0x7; simd_batch masks(kMask); @@ -825,7 +850,7 @@ const uint8_t* unpack<3>(const uint8_t* in, uint32_t* out) { } template<> -const uint8_t* unpack<4>(const uint8_t* in, uint32_t* out) { +const uint8_t* Simd256Unpacker::unpack<4>(const uint8_t* in, uint32_t* out) { constexpr uint32_t kMask = 0xf; simd_batch masks(kMask); @@ -901,7 +926,7 @@ const uint8_t* unpack<4>(const uint8_t* in, uint32_t* out) { } template<> -const uint8_t* unpack<5>(const uint8_t* in, uint32_t* out) { +const uint8_t* Simd256Unpacker::unpack<5>(const uint8_t* in, uint32_t* out) { constexpr uint32_t kMask = 0x1f; simd_batch masks(kMask); @@ -977,7 +1002,7 @@ const uint8_t* unpack<5>(const uint8_t* in, uint32_t* out) { } template<> -const uint8_t* unpack<6>(const uint8_t* in, uint32_t* out) { +const uint8_t* Simd256Unpacker::unpack<6>(const uint8_t* in, uint32_t* out) { constexpr uint32_t kMask = 0x3f; simd_batch masks(kMask); @@ -1053,7 +1078,7 @@ const uint8_t* unpack<6>(const uint8_t* in, uint32_t* out) { } template<> -const uint8_t* unpack<7>(const uint8_t* in, uint32_t* out) { +const uint8_t* Simd256Unpacker::unpack<7>(const uint8_t* in, uint32_t* out) { constexpr uint32_t kMask = 0x7f; simd_batch masks(kMask); @@ -1129,7 +1154,7 @@ const uint8_t* unpack<7>(const uint8_t* in, uint32_t* out) { } template<> -const uint8_t* unpack<8>(const uint8_t* in, uint32_t* out) { +const uint8_t* Simd256Unpacker::unpack<8>(const uint8_t* in, uint32_t* out) { constexpr uint32_t kMask = 0xff; simd_batch masks(kMask); @@ -1205,7 +1230,7 @@ const uint8_t* unpack<8>(const uint8_t* in, uint32_t* out) { } template<> -const uint8_t* unpack<9>(const uint8_t* in, uint32_t* out) { +const uint8_t* Simd256Unpacker::unpack<9>(const uint8_t* in, uint32_t* out) { constexpr uint32_t kMask = 0x1ff; simd_batch masks(kMask); @@ -1281,7 +1306,7 @@ const uint8_t* unpack<9>(const uint8_t* in, uint32_t* out) { } template<> -const uint8_t* unpack<10>(const uint8_t* in, uint32_t* out) { +const uint8_t* Simd256Unpacker::unpack<10>(const uint8_t* in, uint32_t* out) { constexpr uint32_t kMask = 0x3ff; simd_batch masks(kMask); @@ -1357,7 +1382,7 @@ const uint8_t* unpack<10>(const uint8_t* in, uint32_t* out) { } template<> -const uint8_t* unpack<11>(const uint8_t* in, uint32_t* out) { +const uint8_t* Simd256Unpacker::unpack<11>(const uint8_t* in, uint32_t* out) { constexpr uint32_t kMask = 0x7ff; simd_batch masks(kMask); @@ -1433,7 +1458,7 @@ const uint8_t* unpack<11>(const uint8_t* in, uint32_t* out) { } template<> -const uint8_t* unpack<12>(const uint8_t* in, uint32_t* out) { +const uint8_t* Simd256Unpacker::unpack<12>(const uint8_t* in, uint32_t* out) { constexpr uint32_t kMask = 0xfff; simd_batch masks(kMask); @@ -1509,7 +1534,7 @@ const uint8_t* unpack<12>(const uint8_t* in, uint32_t* out) { } template<> -const uint8_t* unpack<13>(const uint8_t* in, uint32_t* out) { +const uint8_t* Simd256Unpacker::unpack<13>(const uint8_t* in, uint32_t* out) { constexpr uint32_t kMask = 0x1fff; simd_batch masks(kMask); @@ -1585,7 +1610,7 @@ const uint8_t* unpack<13>(const uint8_t* in, uint32_t* out) { } template<> -const uint8_t* unpack<14>(const uint8_t* in, uint32_t* out) { +const uint8_t* Simd256Unpacker::unpack<14>(const uint8_t* in, uint32_t* out) { constexpr uint32_t kMask = 0x3fff; simd_batch masks(kMask); @@ -1661,7 +1686,7 @@ const uint8_t* unpack<14>(const uint8_t* in, uint32_t* out) { } template<> -const uint8_t* unpack<15>(const uint8_t* in, uint32_t* out) { +const uint8_t* Simd256Unpacker::unpack<15>(const uint8_t* in, uint32_t* out) { constexpr uint32_t kMask = 0x7fff; simd_batch masks(kMask); @@ -1737,7 +1762,7 @@ const uint8_t* unpack<15>(const uint8_t* in, uint32_t* out) { } template<> -const uint8_t* unpack<16>(const uint8_t* in, uint32_t* out) { +const uint8_t* Simd256Unpacker::unpack<16>(const uint8_t* in, uint32_t* out) { constexpr uint32_t kMask = 0xffff; simd_batch masks(kMask); @@ -1813,7 +1838,7 @@ const uint8_t* unpack<16>(const uint8_t* in, uint32_t* out) { } template<> -const uint8_t* unpack<17>(const uint8_t* in, uint32_t* out) { +const uint8_t* Simd256Unpacker::unpack<17>(const uint8_t* in, uint32_t* out) { constexpr uint32_t kMask = 0x1ffff; simd_batch masks(kMask); @@ -1889,7 +1914,7 @@ const uint8_t* unpack<17>(const uint8_t* in, uint32_t* out) { } template<> -const uint8_t* unpack<18>(const uint8_t* in, uint32_t* out) { +const uint8_t* Simd256Unpacker::unpack<18>(const uint8_t* in, uint32_t* out) { constexpr uint32_t kMask = 0x3ffff; simd_batch masks(kMask); @@ -1965,7 +1990,7 @@ const uint8_t* unpack<18>(const uint8_t* in, uint32_t* out) { } template<> -const uint8_t* unpack<19>(const uint8_t* in, uint32_t* out) { +const uint8_t* Simd256Unpacker::unpack<19>(const uint8_t* in, uint32_t* out) { constexpr uint32_t kMask = 0x7ffff; simd_batch masks(kMask); @@ -2041,7 +2066,7 @@ const uint8_t* unpack<19>(const uint8_t* in, uint32_t* out) { } template<> -const uint8_t* unpack<20>(const uint8_t* in, uint32_t* out) { +const uint8_t* Simd256Unpacker::unpack<20>(const uint8_t* in, uint32_t* out) { constexpr uint32_t kMask = 0xfffff; simd_batch masks(kMask); @@ -2117,7 +2142,7 @@ const uint8_t* unpack<20>(const uint8_t* in, uint32_t* out) { } template<> -const uint8_t* unpack<21>(const uint8_t* in, uint32_t* out) { +const uint8_t* Simd256Unpacker::unpack<21>(const uint8_t* in, uint32_t* out) { constexpr uint32_t kMask = 0x1fffff; simd_batch masks(kMask); @@ -2193,7 +2218,7 @@ const uint8_t* unpack<21>(const uint8_t* in, uint32_t* out) { } template<> -const uint8_t* unpack<22>(const uint8_t* in, uint32_t* out) { +const uint8_t* Simd256Unpacker::unpack<22>(const uint8_t* in, uint32_t* out) { constexpr uint32_t kMask = 0x3fffff; simd_batch masks(kMask); @@ -2269,7 +2294,7 @@ const uint8_t* unpack<22>(const uint8_t* in, uint32_t* out) { } template<> -const uint8_t* unpack<23>(const uint8_t* in, uint32_t* out) { +const uint8_t* Simd256Unpacker::unpack<23>(const uint8_t* in, uint32_t* out) { constexpr uint32_t kMask = 0x7fffff; simd_batch masks(kMask); @@ -2345,7 +2370,7 @@ const uint8_t* unpack<23>(const uint8_t* in, uint32_t* out) { } template<> -const uint8_t* unpack<24>(const uint8_t* in, uint32_t* out) { +const uint8_t* Simd256Unpacker::unpack<24>(const uint8_t* in, uint32_t* out) { constexpr uint32_t kMask = 0xffffff; simd_batch masks(kMask); @@ -2421,7 +2446,7 @@ const uint8_t* unpack<24>(const uint8_t* in, uint32_t* out) { } template<> -const uint8_t* unpack<25>(const uint8_t* in, uint32_t* out) { +const uint8_t* Simd256Unpacker::unpack<25>(const uint8_t* in, uint32_t* out) { constexpr uint32_t kMask = 0x1ffffff; simd_batch masks(kMask); @@ -2497,7 +2522,7 @@ const uint8_t* unpack<25>(const uint8_t* in, uint32_t* out) { } template<> -const uint8_t* unpack<26>(const uint8_t* in, uint32_t* out) { +const uint8_t* Simd256Unpacker::unpack<26>(const uint8_t* in, uint32_t* out) { constexpr uint32_t kMask = 0x3ffffff; simd_batch masks(kMask); @@ -2573,7 +2598,7 @@ const uint8_t* unpack<26>(const uint8_t* in, uint32_t* out) { } template<> -const uint8_t* unpack<27>(const uint8_t* in, uint32_t* out) { +const uint8_t* Simd256Unpacker::unpack<27>(const uint8_t* in, uint32_t* out) { constexpr uint32_t kMask = 0x7ffffff; simd_batch masks(kMask); @@ -2649,7 +2674,7 @@ const uint8_t* unpack<27>(const uint8_t* in, uint32_t* out) { } template<> -const uint8_t* unpack<28>(const uint8_t* in, uint32_t* out) { +const uint8_t* Simd256Unpacker::unpack<28>(const uint8_t* in, uint32_t* out) { constexpr uint32_t kMask = 0xfffffff; simd_batch masks(kMask); @@ -2725,7 +2750,7 @@ const uint8_t* unpack<28>(const uint8_t* in, uint32_t* out) { } template<> -const uint8_t* unpack<29>(const uint8_t* in, uint32_t* out) { +const uint8_t* Simd256Unpacker::unpack<29>(const uint8_t* in, uint32_t* out) { constexpr uint32_t kMask = 0x1fffffff; simd_batch masks(kMask); @@ -2801,7 +2826,7 @@ const uint8_t* unpack<29>(const uint8_t* in, uint32_t* out) { } template<> -const uint8_t* unpack<30>(const uint8_t* in, uint32_t* out) { +const uint8_t* Simd256Unpacker::unpack<30>(const uint8_t* in, uint32_t* out) { constexpr uint32_t kMask = 0x3fffffff; simd_batch masks(kMask); @@ -2877,7 +2902,7 @@ const uint8_t* unpack<30>(const uint8_t* in, uint32_t* out) { } template<> -const uint8_t* unpack<31>(const uint8_t* in, uint32_t* out) { +const uint8_t* Simd256Unpacker::unpack<31>(const uint8_t* in, uint32_t* out) { constexpr uint32_t kMask = 0x7fffffff; simd_batch masks(kMask); @@ -2953,12 +2978,11 @@ const uint8_t* unpack<31>(const uint8_t* in, uint32_t* out) { } template<> -const uint8_t* unpack<32>(const uint8_t* in, uint32_t* out) { +const uint8_t* Simd256Unpacker::unpack<32>(const uint8_t* in, uint32_t* out) { std::memcpy(out, in, 32 * sizeof(*out)); in += 4 * 32; out += 32; return in; } -}; // struct Unpacker } // namespace arrow::internal diff --git a/cpp/src/arrow/util/bpacking_simd512_generated_internal.h b/cpp/src/arrow/util/bpacking_simd512_generated_internal.h index 3fb06b5709c..5b2df999023 100644 --- a/cpp/src/arrow/util/bpacking_simd512_generated_internal.h +++ b/cpp/src/arrow/util/bpacking_simd512_generated_internal.h @@ -45,16 +45,17 @@ static constexpr int kValuesUnpacked = 16; template static const uint8_t* unpack(const uint8_t* in, uint16_t* out); +}; template<> -const uint8_t* unpack<0>(const uint8_t* in, uint16_t* out) { +const uint8_t* Simd512Unpacker::unpack<0>(const uint8_t* in, uint16_t* out) { std::memset(out, 0x0, 16 * sizeof(*out)); out += 16; return in; } template<> -const uint8_t* unpack<1>(const uint8_t* in, uint16_t* out) { +const uint8_t* Simd512Unpacker::unpack<1>(const uint8_t* in, uint16_t* out) { constexpr uint16_t kMask = 0x1; simd_batch masks(kMask); @@ -90,7 +91,7 @@ const uint8_t* unpack<1>(const uint8_t* in, uint16_t* out) { } template<> -const uint8_t* unpack<2>(const uint8_t* in, uint16_t* out) { +const uint8_t* Simd512Unpacker::unpack<2>(const uint8_t* in, uint16_t* out) { constexpr uint16_t kMask = 0x3; simd_batch masks(kMask); @@ -126,7 +127,7 @@ const uint8_t* unpack<2>(const uint8_t* in, uint16_t* out) { } template<> -const uint8_t* unpack<3>(const uint8_t* in, uint16_t* out) { +const uint8_t* Simd512Unpacker::unpack<3>(const uint8_t* in, uint16_t* out) { constexpr uint16_t kMask = 0x7; simd_batch masks(kMask); @@ -162,7 +163,7 @@ const uint8_t* unpack<3>(const uint8_t* in, uint16_t* out) { } template<> -const uint8_t* unpack<4>(const uint8_t* in, uint16_t* out) { +const uint8_t* Simd512Unpacker::unpack<4>(const uint8_t* in, uint16_t* out) { constexpr uint16_t kMask = 0xf; simd_batch masks(kMask); @@ -198,7 +199,7 @@ const uint8_t* unpack<4>(const uint8_t* in, uint16_t* out) { } template<> -const uint8_t* unpack<5>(const uint8_t* in, uint16_t* out) { +const uint8_t* Simd512Unpacker::unpack<5>(const uint8_t* in, uint16_t* out) { constexpr uint16_t kMask = 0x1f; simd_batch masks(kMask); @@ -234,7 +235,7 @@ const uint8_t* unpack<5>(const uint8_t* in, uint16_t* out) { } template<> -const uint8_t* unpack<6>(const uint8_t* in, uint16_t* out) { +const uint8_t* Simd512Unpacker::unpack<6>(const uint8_t* in, uint16_t* out) { constexpr uint16_t kMask = 0x3f; simd_batch masks(kMask); @@ -270,7 +271,7 @@ const uint8_t* unpack<6>(const uint8_t* in, uint16_t* out) { } template<> -const uint8_t* unpack<7>(const uint8_t* in, uint16_t* out) { +const uint8_t* Simd512Unpacker::unpack<7>(const uint8_t* in, uint16_t* out) { constexpr uint16_t kMask = 0x7f; simd_batch masks(kMask); @@ -306,7 +307,7 @@ const uint8_t* unpack<7>(const uint8_t* in, uint16_t* out) { } template<> -const uint8_t* unpack<8>(const uint8_t* in, uint16_t* out) { +const uint8_t* Simd512Unpacker::unpack<8>(const uint8_t* in, uint16_t* out) { constexpr uint16_t kMask = 0xff; simd_batch masks(kMask); @@ -342,7 +343,7 @@ const uint8_t* unpack<8>(const uint8_t* in, uint16_t* out) { } template<> -const uint8_t* unpack<9>(const uint8_t* in, uint16_t* out) { +const uint8_t* Simd512Unpacker::unpack<9>(const uint8_t* in, uint16_t* out) { constexpr uint16_t kMask = 0x1ff; simd_batch masks(kMask); @@ -378,7 +379,7 @@ const uint8_t* unpack<9>(const uint8_t* in, uint16_t* out) { } template<> -const uint8_t* unpack<10>(const uint8_t* in, uint16_t* out) { +const uint8_t* Simd512Unpacker::unpack<10>(const uint8_t* in, uint16_t* out) { constexpr uint16_t kMask = 0x3ff; simd_batch masks(kMask); @@ -414,7 +415,7 @@ const uint8_t* unpack<10>(const uint8_t* in, uint16_t* out) { } template<> -const uint8_t* unpack<11>(const uint8_t* in, uint16_t* out) { +const uint8_t* Simd512Unpacker::unpack<11>(const uint8_t* in, uint16_t* out) { constexpr uint16_t kMask = 0x7ff; simd_batch masks(kMask); @@ -450,7 +451,7 @@ const uint8_t* unpack<11>(const uint8_t* in, uint16_t* out) { } template<> -const uint8_t* unpack<12>(const uint8_t* in, uint16_t* out) { +const uint8_t* Simd512Unpacker::unpack<12>(const uint8_t* in, uint16_t* out) { constexpr uint16_t kMask = 0xfff; simd_batch masks(kMask); @@ -486,7 +487,7 @@ const uint8_t* unpack<12>(const uint8_t* in, uint16_t* out) { } template<> -const uint8_t* unpack<13>(const uint8_t* in, uint16_t* out) { +const uint8_t* Simd512Unpacker::unpack<13>(const uint8_t* in, uint16_t* out) { constexpr uint16_t kMask = 0x1fff; simd_batch masks(kMask); @@ -522,7 +523,7 @@ const uint8_t* unpack<13>(const uint8_t* in, uint16_t* out) { } template<> -const uint8_t* unpack<14>(const uint8_t* in, uint16_t* out) { +const uint8_t* Simd512Unpacker::unpack<14>(const uint8_t* in, uint16_t* out) { constexpr uint16_t kMask = 0x3fff; simd_batch masks(kMask); @@ -558,7 +559,7 @@ const uint8_t* unpack<14>(const uint8_t* in, uint16_t* out) { } template<> -const uint8_t* unpack<15>(const uint8_t* in, uint16_t* out) { +const uint8_t* Simd512Unpacker::unpack<15>(const uint8_t* in, uint16_t* out) { constexpr uint16_t kMask = 0x7fff; simd_batch masks(kMask); @@ -594,13 +595,12 @@ const uint8_t* unpack<15>(const uint8_t* in, uint16_t* out) { } template<> -const uint8_t* unpack<16>(const uint8_t* in, uint16_t* out) { +const uint8_t* Simd512Unpacker::unpack<16>(const uint8_t* in, uint16_t* out) { std::memcpy(out, in, 16 * sizeof(*out)); in += 2 * 16; out += 16; return in; } -}; // struct Unpacker template<> struct Simd512Unpacker { @@ -612,16 +612,17 @@ static constexpr int kValuesUnpacked = 32; template static const uint8_t* unpack(const uint8_t* in, uint32_t* out); +}; template<> -const uint8_t* unpack<0>(const uint8_t* in, uint32_t* out) { +const uint8_t* Simd512Unpacker::unpack<0>(const uint8_t* in, uint32_t* out) { std::memset(out, 0x0, 32 * sizeof(*out)); out += 32; return in; } template<> -const uint8_t* unpack<1>(const uint8_t* in, uint32_t* out) { +const uint8_t* Simd512Unpacker::unpack<1>(const uint8_t* in, uint32_t* out) { constexpr uint32_t kMask = 0x1; simd_batch masks(kMask); @@ -681,7 +682,7 @@ const uint8_t* unpack<1>(const uint8_t* in, uint32_t* out) { } template<> -const uint8_t* unpack<2>(const uint8_t* in, uint32_t* out) { +const uint8_t* Simd512Unpacker::unpack<2>(const uint8_t* in, uint32_t* out) { constexpr uint32_t kMask = 0x3; simd_batch masks(kMask); @@ -741,7 +742,7 @@ const uint8_t* unpack<2>(const uint8_t* in, uint32_t* out) { } template<> -const uint8_t* unpack<3>(const uint8_t* in, uint32_t* out) { +const uint8_t* Simd512Unpacker::unpack<3>(const uint8_t* in, uint32_t* out) { constexpr uint32_t kMask = 0x7; simd_batch masks(kMask); @@ -801,7 +802,7 @@ const uint8_t* unpack<3>(const uint8_t* in, uint32_t* out) { } template<> -const uint8_t* unpack<4>(const uint8_t* in, uint32_t* out) { +const uint8_t* Simd512Unpacker::unpack<4>(const uint8_t* in, uint32_t* out) { constexpr uint32_t kMask = 0xf; simd_batch masks(kMask); @@ -861,7 +862,7 @@ const uint8_t* unpack<4>(const uint8_t* in, uint32_t* out) { } template<> -const uint8_t* unpack<5>(const uint8_t* in, uint32_t* out) { +const uint8_t* Simd512Unpacker::unpack<5>(const uint8_t* in, uint32_t* out) { constexpr uint32_t kMask = 0x1f; simd_batch masks(kMask); @@ -921,7 +922,7 @@ const uint8_t* unpack<5>(const uint8_t* in, uint32_t* out) { } template<> -const uint8_t* unpack<6>(const uint8_t* in, uint32_t* out) { +const uint8_t* Simd512Unpacker::unpack<6>(const uint8_t* in, uint32_t* out) { constexpr uint32_t kMask = 0x3f; simd_batch masks(kMask); @@ -981,7 +982,7 @@ const uint8_t* unpack<6>(const uint8_t* in, uint32_t* out) { } template<> -const uint8_t* unpack<7>(const uint8_t* in, uint32_t* out) { +const uint8_t* Simd512Unpacker::unpack<7>(const uint8_t* in, uint32_t* out) { constexpr uint32_t kMask = 0x7f; simd_batch masks(kMask); @@ -1041,7 +1042,7 @@ const uint8_t* unpack<7>(const uint8_t* in, uint32_t* out) { } template<> -const uint8_t* unpack<8>(const uint8_t* in, uint32_t* out) { +const uint8_t* Simd512Unpacker::unpack<8>(const uint8_t* in, uint32_t* out) { constexpr uint32_t kMask = 0xff; simd_batch masks(kMask); @@ -1101,7 +1102,7 @@ const uint8_t* unpack<8>(const uint8_t* in, uint32_t* out) { } template<> -const uint8_t* unpack<9>(const uint8_t* in, uint32_t* out) { +const uint8_t* Simd512Unpacker::unpack<9>(const uint8_t* in, uint32_t* out) { constexpr uint32_t kMask = 0x1ff; simd_batch masks(kMask); @@ -1161,7 +1162,7 @@ const uint8_t* unpack<9>(const uint8_t* in, uint32_t* out) { } template<> -const uint8_t* unpack<10>(const uint8_t* in, uint32_t* out) { +const uint8_t* Simd512Unpacker::unpack<10>(const uint8_t* in, uint32_t* out) { constexpr uint32_t kMask = 0x3ff; simd_batch masks(kMask); @@ -1221,7 +1222,7 @@ const uint8_t* unpack<10>(const uint8_t* in, uint32_t* out) { } template<> -const uint8_t* unpack<11>(const uint8_t* in, uint32_t* out) { +const uint8_t* Simd512Unpacker::unpack<11>(const uint8_t* in, uint32_t* out) { constexpr uint32_t kMask = 0x7ff; simd_batch masks(kMask); @@ -1281,7 +1282,7 @@ const uint8_t* unpack<11>(const uint8_t* in, uint32_t* out) { } template<> -const uint8_t* unpack<12>(const uint8_t* in, uint32_t* out) { +const uint8_t* Simd512Unpacker::unpack<12>(const uint8_t* in, uint32_t* out) { constexpr uint32_t kMask = 0xfff; simd_batch masks(kMask); @@ -1341,7 +1342,7 @@ const uint8_t* unpack<12>(const uint8_t* in, uint32_t* out) { } template<> -const uint8_t* unpack<13>(const uint8_t* in, uint32_t* out) { +const uint8_t* Simd512Unpacker::unpack<13>(const uint8_t* in, uint32_t* out) { constexpr uint32_t kMask = 0x1fff; simd_batch masks(kMask); @@ -1401,7 +1402,7 @@ const uint8_t* unpack<13>(const uint8_t* in, uint32_t* out) { } template<> -const uint8_t* unpack<14>(const uint8_t* in, uint32_t* out) { +const uint8_t* Simd512Unpacker::unpack<14>(const uint8_t* in, uint32_t* out) { constexpr uint32_t kMask = 0x3fff; simd_batch masks(kMask); @@ -1461,7 +1462,7 @@ const uint8_t* unpack<14>(const uint8_t* in, uint32_t* out) { } template<> -const uint8_t* unpack<15>(const uint8_t* in, uint32_t* out) { +const uint8_t* Simd512Unpacker::unpack<15>(const uint8_t* in, uint32_t* out) { constexpr uint32_t kMask = 0x7fff; simd_batch masks(kMask); @@ -1521,7 +1522,7 @@ const uint8_t* unpack<15>(const uint8_t* in, uint32_t* out) { } template<> -const uint8_t* unpack<16>(const uint8_t* in, uint32_t* out) { +const uint8_t* Simd512Unpacker::unpack<16>(const uint8_t* in, uint32_t* out) { constexpr uint32_t kMask = 0xffff; simd_batch masks(kMask); @@ -1581,7 +1582,7 @@ const uint8_t* unpack<16>(const uint8_t* in, uint32_t* out) { } template<> -const uint8_t* unpack<17>(const uint8_t* in, uint32_t* out) { +const uint8_t* Simd512Unpacker::unpack<17>(const uint8_t* in, uint32_t* out) { constexpr uint32_t kMask = 0x1ffff; simd_batch masks(kMask); @@ -1641,7 +1642,7 @@ const uint8_t* unpack<17>(const uint8_t* in, uint32_t* out) { } template<> -const uint8_t* unpack<18>(const uint8_t* in, uint32_t* out) { +const uint8_t* Simd512Unpacker::unpack<18>(const uint8_t* in, uint32_t* out) { constexpr uint32_t kMask = 0x3ffff; simd_batch masks(kMask); @@ -1701,7 +1702,7 @@ const uint8_t* unpack<18>(const uint8_t* in, uint32_t* out) { } template<> -const uint8_t* unpack<19>(const uint8_t* in, uint32_t* out) { +const uint8_t* Simd512Unpacker::unpack<19>(const uint8_t* in, uint32_t* out) { constexpr uint32_t kMask = 0x7ffff; simd_batch masks(kMask); @@ -1761,7 +1762,7 @@ const uint8_t* unpack<19>(const uint8_t* in, uint32_t* out) { } template<> -const uint8_t* unpack<20>(const uint8_t* in, uint32_t* out) { +const uint8_t* Simd512Unpacker::unpack<20>(const uint8_t* in, uint32_t* out) { constexpr uint32_t kMask = 0xfffff; simd_batch masks(kMask); @@ -1821,7 +1822,7 @@ const uint8_t* unpack<20>(const uint8_t* in, uint32_t* out) { } template<> -const uint8_t* unpack<21>(const uint8_t* in, uint32_t* out) { +const uint8_t* Simd512Unpacker::unpack<21>(const uint8_t* in, uint32_t* out) { constexpr uint32_t kMask = 0x1fffff; simd_batch masks(kMask); @@ -1881,7 +1882,7 @@ const uint8_t* unpack<21>(const uint8_t* in, uint32_t* out) { } template<> -const uint8_t* unpack<22>(const uint8_t* in, uint32_t* out) { +const uint8_t* Simd512Unpacker::unpack<22>(const uint8_t* in, uint32_t* out) { constexpr uint32_t kMask = 0x3fffff; simd_batch masks(kMask); @@ -1941,7 +1942,7 @@ const uint8_t* unpack<22>(const uint8_t* in, uint32_t* out) { } template<> -const uint8_t* unpack<23>(const uint8_t* in, uint32_t* out) { +const uint8_t* Simd512Unpacker::unpack<23>(const uint8_t* in, uint32_t* out) { constexpr uint32_t kMask = 0x7fffff; simd_batch masks(kMask); @@ -2001,7 +2002,7 @@ const uint8_t* unpack<23>(const uint8_t* in, uint32_t* out) { } template<> -const uint8_t* unpack<24>(const uint8_t* in, uint32_t* out) { +const uint8_t* Simd512Unpacker::unpack<24>(const uint8_t* in, uint32_t* out) { constexpr uint32_t kMask = 0xffffff; simd_batch masks(kMask); @@ -2061,7 +2062,7 @@ const uint8_t* unpack<24>(const uint8_t* in, uint32_t* out) { } template<> -const uint8_t* unpack<25>(const uint8_t* in, uint32_t* out) { +const uint8_t* Simd512Unpacker::unpack<25>(const uint8_t* in, uint32_t* out) { constexpr uint32_t kMask = 0x1ffffff; simd_batch masks(kMask); @@ -2121,7 +2122,7 @@ const uint8_t* unpack<25>(const uint8_t* in, uint32_t* out) { } template<> -const uint8_t* unpack<26>(const uint8_t* in, uint32_t* out) { +const uint8_t* Simd512Unpacker::unpack<26>(const uint8_t* in, uint32_t* out) { constexpr uint32_t kMask = 0x3ffffff; simd_batch masks(kMask); @@ -2181,7 +2182,7 @@ const uint8_t* unpack<26>(const uint8_t* in, uint32_t* out) { } template<> -const uint8_t* unpack<27>(const uint8_t* in, uint32_t* out) { +const uint8_t* Simd512Unpacker::unpack<27>(const uint8_t* in, uint32_t* out) { constexpr uint32_t kMask = 0x7ffffff; simd_batch masks(kMask); @@ -2241,7 +2242,7 @@ const uint8_t* unpack<27>(const uint8_t* in, uint32_t* out) { } template<> -const uint8_t* unpack<28>(const uint8_t* in, uint32_t* out) { +const uint8_t* Simd512Unpacker::unpack<28>(const uint8_t* in, uint32_t* out) { constexpr uint32_t kMask = 0xfffffff; simd_batch masks(kMask); @@ -2301,7 +2302,7 @@ const uint8_t* unpack<28>(const uint8_t* in, uint32_t* out) { } template<> -const uint8_t* unpack<29>(const uint8_t* in, uint32_t* out) { +const uint8_t* Simd512Unpacker::unpack<29>(const uint8_t* in, uint32_t* out) { constexpr uint32_t kMask = 0x1fffffff; simd_batch masks(kMask); @@ -2361,7 +2362,7 @@ const uint8_t* unpack<29>(const uint8_t* in, uint32_t* out) { } template<> -const uint8_t* unpack<30>(const uint8_t* in, uint32_t* out) { +const uint8_t* Simd512Unpacker::unpack<30>(const uint8_t* in, uint32_t* out) { constexpr uint32_t kMask = 0x3fffffff; simd_batch masks(kMask); @@ -2421,7 +2422,7 @@ const uint8_t* unpack<30>(const uint8_t* in, uint32_t* out) { } template<> -const uint8_t* unpack<31>(const uint8_t* in, uint32_t* out) { +const uint8_t* Simd512Unpacker::unpack<31>(const uint8_t* in, uint32_t* out) { constexpr uint32_t kMask = 0x7fffffff; simd_batch masks(kMask); @@ -2481,12 +2482,11 @@ const uint8_t* unpack<31>(const uint8_t* in, uint32_t* out) { } template<> -const uint8_t* unpack<32>(const uint8_t* in, uint32_t* out) { +const uint8_t* Simd512Unpacker::unpack<32>(const uint8_t* in, uint32_t* out) { std::memcpy(out, in, 32 * sizeof(*out)); in += 4 * 32; out += 32; return in; } -}; // struct Unpacker } // namespace arrow::internal From 2913f5e616d3f0b2ecc3b08f02d486914acf3b01 Mon Sep 17 00:00:00 2001 From: AntoinePrv Date: Fri, 19 Sep 2025 12:34:13 +0200 Subject: [PATCH 22/76] Only generate simd 32 --- cpp/src/arrow/util/bpacking_simd_codegen.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/arrow/util/bpacking_simd_codegen.py b/cpp/src/arrow/util/bpacking_simd_codegen.py index 84a741bb690..f4a3cf57ec4 100755 --- a/cpp/src/arrow/util/bpacking_simd_codegen.py +++ b/cpp/src/arrow/util/bpacking_simd_codegen.py @@ -289,4 +289,4 @@ def main(simd_width, outputs): except ValueError: raise ValueError(usage) - main(simd_width, [16, 32]) + main(simd_width, [32]) From 44dead1f1f39eb378e869f19bb30f235733b0ffd Mon Sep 17 00:00:00 2001 From: AntoinePrv Date: Fri, 19 Sep 2025 12:34:52 +0200 Subject: [PATCH 23/76] Gen: regenerate unpack files --- .../bpacking_simd128_generated_internal.h | 687 ------------------ .../bpacking_simd256_generated_internal.h | 567 --------------- .../bpacking_simd512_generated_internal.h | 567 --------------- 3 files changed, 1821 deletions(-) diff --git a/cpp/src/arrow/util/bpacking_simd128_generated_internal.h b/cpp/src/arrow/util/bpacking_simd128_generated_internal.h index b35d8f80923..ca78d37c4ff 100644 --- a/cpp/src/arrow/util/bpacking_simd128_generated_internal.h +++ b/cpp/src/arrow/util/bpacking_simd128_generated_internal.h @@ -35,693 +35,6 @@ using ::arrow::util::SafeLoadAs; template struct Simd128Unpacker; -template<> -struct Simd128Unpacker { - -using out_type = uint16_t; -using simd_batch = xsimd::make_sized_batch_t; - -static constexpr int kValuesUnpacked = 16; - -template -static const uint8_t* unpack(const uint8_t* in, uint16_t* out); -}; - -template<> -const uint8_t* Simd128Unpacker::unpack<0>(const uint8_t* in, uint16_t* out) { - std::memset(out, 0x0, 16 * sizeof(*out)); - out += 16; - return in; -} - -template<> -const uint8_t* Simd128Unpacker::unpack<1>(const uint8_t* in, uint16_t* out) { - constexpr uint16_t kMask = 0x1; - - simd_batch masks(kMask); - simd_batch words, shifts; - simd_batch results; - - // extract 1-bit bundles 0 to 7 - words = simd_batch{ - SafeLoadAs(in + 2 * 0), - SafeLoadAs(in + 2 * 0), - SafeLoadAs(in + 2 * 0), - SafeLoadAs(in + 2 * 0), - SafeLoadAs(in + 2 * 0), - SafeLoadAs(in + 2 * 0), - SafeLoadAs(in + 2 * 0), - SafeLoadAs(in + 2 * 0), - }; - shifts = simd_batch{ 0, 1, 2, 3, 4, 5, 6, 7 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 8; - - // extract 1-bit bundles 8 to 15 - words = simd_batch{ - SafeLoadAs(in + 2 * 0), - SafeLoadAs(in + 2 * 0), - SafeLoadAs(in + 2 * 0), - SafeLoadAs(in + 2 * 0), - SafeLoadAs(in + 2 * 0), - SafeLoadAs(in + 2 * 0), - SafeLoadAs(in + 2 * 0), - SafeLoadAs(in + 2 * 0), - }; - shifts = simd_batch{ 8, 9, 10, 11, 12, 13, 14, 15 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 8; - - in += 1 * 2; - return in; -} - -template<> -const uint8_t* Simd128Unpacker::unpack<2>(const uint8_t* in, uint16_t* out) { - constexpr uint16_t kMask = 0x3; - - simd_batch masks(kMask); - simd_batch words, shifts; - simd_batch results; - - // extract 2-bit bundles 0 to 7 - words = simd_batch{ - SafeLoadAs(in + 2 * 0), - SafeLoadAs(in + 2 * 0), - SafeLoadAs(in + 2 * 0), - SafeLoadAs(in + 2 * 0), - SafeLoadAs(in + 2 * 0), - SafeLoadAs(in + 2 * 0), - SafeLoadAs(in + 2 * 0), - SafeLoadAs(in + 2 * 0), - }; - shifts = simd_batch{ 0, 2, 4, 6, 8, 10, 12, 14 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 8; - - // extract 2-bit bundles 8 to 15 - words = simd_batch{ - SafeLoadAs(in + 2 * 1), - SafeLoadAs(in + 2 * 1), - SafeLoadAs(in + 2 * 1), - SafeLoadAs(in + 2 * 1), - SafeLoadAs(in + 2 * 1), - SafeLoadAs(in + 2 * 1), - SafeLoadAs(in + 2 * 1), - SafeLoadAs(in + 2 * 1), - }; - shifts = simd_batch{ 0, 2, 4, 6, 8, 10, 12, 14 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 8; - - in += 2 * 2; - return in; -} - -template<> -const uint8_t* Simd128Unpacker::unpack<3>(const uint8_t* in, uint16_t* out) { - constexpr uint16_t kMask = 0x7; - - simd_batch masks(kMask); - simd_batch words, shifts; - simd_batch results; - - // extract 3-bit bundles 0 to 7 - words = simd_batch{ - SafeLoadAs(in + 2 * 0), - SafeLoadAs(in + 2 * 0), - SafeLoadAs(in + 2 * 0), - SafeLoadAs(in + 2 * 0), - SafeLoadAs(in + 2 * 0), - static_cast(SafeLoadAs(in + 2 * 0) >> 15 | SafeLoadAs(in + 2 * 1) << 1), - SafeLoadAs(in + 2 * 1), - SafeLoadAs(in + 2 * 1), - }; - shifts = simd_batch{ 0, 3, 6, 9, 12, 0, 2, 5 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 8; - - // extract 3-bit bundles 8 to 15 - words = simd_batch{ - SafeLoadAs(in + 2 * 1), - SafeLoadAs(in + 2 * 1), - static_cast(SafeLoadAs(in + 2 * 1) >> 14 | SafeLoadAs(in + 2 * 2) << 2), - SafeLoadAs(in + 2 * 2), - SafeLoadAs(in + 2 * 2), - SafeLoadAs(in + 2 * 2), - SafeLoadAs(in + 2 * 2), - SafeLoadAs(in + 2 * 2), - }; - shifts = simd_batch{ 8, 11, 0, 1, 4, 7, 10, 13 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 8; - - in += 3 * 2; - return in; -} - -template<> -const uint8_t* Simd128Unpacker::unpack<4>(const uint8_t* in, uint16_t* out) { - constexpr uint16_t kMask = 0xf; - - simd_batch masks(kMask); - simd_batch words, shifts; - simd_batch results; - - // extract 4-bit bundles 0 to 7 - words = simd_batch{ - SafeLoadAs(in + 2 * 0), - SafeLoadAs(in + 2 * 0), - SafeLoadAs(in + 2 * 0), - SafeLoadAs(in + 2 * 0), - SafeLoadAs(in + 2 * 1), - SafeLoadAs(in + 2 * 1), - SafeLoadAs(in + 2 * 1), - SafeLoadAs(in + 2 * 1), - }; - shifts = simd_batch{ 0, 4, 8, 12, 0, 4, 8, 12 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 8; - - // extract 4-bit bundles 8 to 15 - words = simd_batch{ - SafeLoadAs(in + 2 * 2), - SafeLoadAs(in + 2 * 2), - SafeLoadAs(in + 2 * 2), - SafeLoadAs(in + 2 * 2), - SafeLoadAs(in + 2 * 3), - SafeLoadAs(in + 2 * 3), - SafeLoadAs(in + 2 * 3), - SafeLoadAs(in + 2 * 3), - }; - shifts = simd_batch{ 0, 4, 8, 12, 0, 4, 8, 12 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 8; - - in += 4 * 2; - return in; -} - -template<> -const uint8_t* Simd128Unpacker::unpack<5>(const uint8_t* in, uint16_t* out) { - constexpr uint16_t kMask = 0x1f; - - simd_batch masks(kMask); - simd_batch words, shifts; - simd_batch results; - - // extract 5-bit bundles 0 to 7 - words = simd_batch{ - SafeLoadAs(in + 2 * 0), - SafeLoadAs(in + 2 * 0), - SafeLoadAs(in + 2 * 0), - static_cast(SafeLoadAs(in + 2 * 0) >> 15 | SafeLoadAs(in + 2 * 1) << 1), - SafeLoadAs(in + 2 * 1), - SafeLoadAs(in + 2 * 1), - static_cast(SafeLoadAs(in + 2 * 1) >> 14 | SafeLoadAs(in + 2 * 2) << 2), - SafeLoadAs(in + 2 * 2), - }; - shifts = simd_batch{ 0, 5, 10, 0, 4, 9, 0, 3 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 8; - - // extract 5-bit bundles 8 to 15 - words = simd_batch{ - SafeLoadAs(in + 2 * 2), - static_cast(SafeLoadAs(in + 2 * 2) >> 13 | SafeLoadAs(in + 2 * 3) << 3), - SafeLoadAs(in + 2 * 3), - SafeLoadAs(in + 2 * 3), - static_cast(SafeLoadAs(in + 2 * 3) >> 12 | SafeLoadAs(in + 2 * 4) << 4), - SafeLoadAs(in + 2 * 4), - SafeLoadAs(in + 2 * 4), - SafeLoadAs(in + 2 * 4), - }; - shifts = simd_batch{ 8, 0, 2, 7, 0, 1, 6, 11 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 8; - - in += 5 * 2; - return in; -} - -template<> -const uint8_t* Simd128Unpacker::unpack<6>(const uint8_t* in, uint16_t* out) { - constexpr uint16_t kMask = 0x3f; - - simd_batch masks(kMask); - simd_batch words, shifts; - simd_batch results; - - // extract 6-bit bundles 0 to 7 - words = simd_batch{ - SafeLoadAs(in + 2 * 0), - SafeLoadAs(in + 2 * 0), - static_cast(SafeLoadAs(in + 2 * 0) >> 12 | SafeLoadAs(in + 2 * 1) << 4), - SafeLoadAs(in + 2 * 1), - SafeLoadAs(in + 2 * 1), - static_cast(SafeLoadAs(in + 2 * 1) >> 14 | SafeLoadAs(in + 2 * 2) << 2), - SafeLoadAs(in + 2 * 2), - SafeLoadAs(in + 2 * 2), - }; - shifts = simd_batch{ 0, 6, 0, 2, 8, 0, 4, 10 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 8; - - // extract 6-bit bundles 8 to 15 - words = simd_batch{ - SafeLoadAs(in + 2 * 3), - SafeLoadAs(in + 2 * 3), - static_cast(SafeLoadAs(in + 2 * 3) >> 12 | SafeLoadAs(in + 2 * 4) << 4), - SafeLoadAs(in + 2 * 4), - SafeLoadAs(in + 2 * 4), - static_cast(SafeLoadAs(in + 2 * 4) >> 14 | SafeLoadAs(in + 2 * 5) << 2), - SafeLoadAs(in + 2 * 5), - SafeLoadAs(in + 2 * 5), - }; - shifts = simd_batch{ 0, 6, 0, 2, 8, 0, 4, 10 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 8; - - in += 6 * 2; - return in; -} - -template<> -const uint8_t* Simd128Unpacker::unpack<7>(const uint8_t* in, uint16_t* out) { - constexpr uint16_t kMask = 0x7f; - - simd_batch masks(kMask); - simd_batch words, shifts; - simd_batch results; - - // extract 7-bit bundles 0 to 7 - words = simd_batch{ - SafeLoadAs(in + 2 * 0), - SafeLoadAs(in + 2 * 0), - static_cast(SafeLoadAs(in + 2 * 0) >> 14 | SafeLoadAs(in + 2 * 1) << 2), - SafeLoadAs(in + 2 * 1), - static_cast(SafeLoadAs(in + 2 * 1) >> 12 | SafeLoadAs(in + 2 * 2) << 4), - SafeLoadAs(in + 2 * 2), - static_cast(SafeLoadAs(in + 2 * 2) >> 10 | SafeLoadAs(in + 2 * 3) << 6), - SafeLoadAs(in + 2 * 3), - }; - shifts = simd_batch{ 0, 7, 0, 5, 0, 3, 0, 1 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 8; - - // extract 7-bit bundles 8 to 15 - words = simd_batch{ - SafeLoadAs(in + 2 * 3), - static_cast(SafeLoadAs(in + 2 * 3) >> 15 | SafeLoadAs(in + 2 * 4) << 1), - SafeLoadAs(in + 2 * 4), - static_cast(SafeLoadAs(in + 2 * 4) >> 13 | SafeLoadAs(in + 2 * 5) << 3), - SafeLoadAs(in + 2 * 5), - static_cast(SafeLoadAs(in + 2 * 5) >> 11 | SafeLoadAs(in + 2 * 6) << 5), - SafeLoadAs(in + 2 * 6), - SafeLoadAs(in + 2 * 6), - }; - shifts = simd_batch{ 8, 0, 6, 0, 4, 0, 2, 9 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 8; - - in += 7 * 2; - return in; -} - -template<> -const uint8_t* Simd128Unpacker::unpack<8>(const uint8_t* in, uint16_t* out) { - constexpr uint16_t kMask = 0xff; - - simd_batch masks(kMask); - simd_batch words, shifts; - simd_batch results; - - // extract 8-bit bundles 0 to 7 - words = simd_batch{ - SafeLoadAs(in + 2 * 0), - SafeLoadAs(in + 2 * 0), - SafeLoadAs(in + 2 * 1), - SafeLoadAs(in + 2 * 1), - SafeLoadAs(in + 2 * 2), - SafeLoadAs(in + 2 * 2), - SafeLoadAs(in + 2 * 3), - SafeLoadAs(in + 2 * 3), - }; - shifts = simd_batch{ 0, 8, 0, 8, 0, 8, 0, 8 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 8; - - // extract 8-bit bundles 8 to 15 - words = simd_batch{ - SafeLoadAs(in + 2 * 4), - SafeLoadAs(in + 2 * 4), - SafeLoadAs(in + 2 * 5), - SafeLoadAs(in + 2 * 5), - SafeLoadAs(in + 2 * 6), - SafeLoadAs(in + 2 * 6), - SafeLoadAs(in + 2 * 7), - SafeLoadAs(in + 2 * 7), - }; - shifts = simd_batch{ 0, 8, 0, 8, 0, 8, 0, 8 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 8; - - in += 8 * 2; - return in; -} - -template<> -const uint8_t* Simd128Unpacker::unpack<9>(const uint8_t* in, uint16_t* out) { - constexpr uint16_t kMask = 0x1ff; - - simd_batch masks(kMask); - simd_batch words, shifts; - simd_batch results; - - // extract 9-bit bundles 0 to 7 - words = simd_batch{ - SafeLoadAs(in + 2 * 0), - static_cast(SafeLoadAs(in + 2 * 0) >> 9 | SafeLoadAs(in + 2 * 1) << 7), - SafeLoadAs(in + 2 * 1), - static_cast(SafeLoadAs(in + 2 * 1) >> 11 | SafeLoadAs(in + 2 * 2) << 5), - SafeLoadAs(in + 2 * 2), - static_cast(SafeLoadAs(in + 2 * 2) >> 13 | SafeLoadAs(in + 2 * 3) << 3), - SafeLoadAs(in + 2 * 3), - static_cast(SafeLoadAs(in + 2 * 3) >> 15 | SafeLoadAs(in + 2 * 4) << 1), - }; - shifts = simd_batch{ 0, 0, 2, 0, 4, 0, 6, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 8; - - // extract 9-bit bundles 8 to 15 - words = simd_batch{ - static_cast(SafeLoadAs(in + 2 * 4) >> 8 | SafeLoadAs(in + 2 * 5) << 8), - SafeLoadAs(in + 2 * 5), - static_cast(SafeLoadAs(in + 2 * 5) >> 10 | SafeLoadAs(in + 2 * 6) << 6), - SafeLoadAs(in + 2 * 6), - static_cast(SafeLoadAs(in + 2 * 6) >> 12 | SafeLoadAs(in + 2 * 7) << 4), - SafeLoadAs(in + 2 * 7), - static_cast(SafeLoadAs(in + 2 * 7) >> 14 | SafeLoadAs(in + 2 * 8) << 2), - SafeLoadAs(in + 2 * 8), - }; - shifts = simd_batch{ 0, 1, 0, 3, 0, 5, 0, 7 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 8; - - in += 9 * 2; - return in; -} - -template<> -const uint8_t* Simd128Unpacker::unpack<10>(const uint8_t* in, uint16_t* out) { - constexpr uint16_t kMask = 0x3ff; - - simd_batch masks(kMask); - simd_batch words, shifts; - simd_batch results; - - // extract 10-bit bundles 0 to 7 - words = simd_batch{ - SafeLoadAs(in + 2 * 0), - static_cast(SafeLoadAs(in + 2 * 0) >> 10 | SafeLoadAs(in + 2 * 1) << 6), - SafeLoadAs(in + 2 * 1), - static_cast(SafeLoadAs(in + 2 * 1) >> 14 | SafeLoadAs(in + 2 * 2) << 2), - static_cast(SafeLoadAs(in + 2 * 2) >> 8 | SafeLoadAs(in + 2 * 3) << 8), - SafeLoadAs(in + 2 * 3), - static_cast(SafeLoadAs(in + 2 * 3) >> 12 | SafeLoadAs(in + 2 * 4) << 4), - SafeLoadAs(in + 2 * 4), - }; - shifts = simd_batch{ 0, 0, 4, 0, 0, 2, 0, 6 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 8; - - // extract 10-bit bundles 8 to 15 - words = simd_batch{ - SafeLoadAs(in + 2 * 5), - static_cast(SafeLoadAs(in + 2 * 5) >> 10 | SafeLoadAs(in + 2 * 6) << 6), - SafeLoadAs(in + 2 * 6), - static_cast(SafeLoadAs(in + 2 * 6) >> 14 | SafeLoadAs(in + 2 * 7) << 2), - static_cast(SafeLoadAs(in + 2 * 7) >> 8 | SafeLoadAs(in + 2 * 8) << 8), - SafeLoadAs(in + 2 * 8), - static_cast(SafeLoadAs(in + 2 * 8) >> 12 | SafeLoadAs(in + 2 * 9) << 4), - SafeLoadAs(in + 2 * 9), - }; - shifts = simd_batch{ 0, 0, 4, 0, 0, 2, 0, 6 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 8; - - in += 10 * 2; - return in; -} - -template<> -const uint8_t* Simd128Unpacker::unpack<11>(const uint8_t* in, uint16_t* out) { - constexpr uint16_t kMask = 0x7ff; - - simd_batch masks(kMask); - simd_batch words, shifts; - simd_batch results; - - // extract 11-bit bundles 0 to 7 - words = simd_batch{ - SafeLoadAs(in + 2 * 0), - static_cast(SafeLoadAs(in + 2 * 0) >> 11 | SafeLoadAs(in + 2 * 1) << 5), - static_cast(SafeLoadAs(in + 2 * 1) >> 6 | SafeLoadAs(in + 2 * 2) << 10), - SafeLoadAs(in + 2 * 2), - static_cast(SafeLoadAs(in + 2 * 2) >> 12 | SafeLoadAs(in + 2 * 3) << 4), - static_cast(SafeLoadAs(in + 2 * 3) >> 7 | SafeLoadAs(in + 2 * 4) << 9), - SafeLoadAs(in + 2 * 4), - static_cast(SafeLoadAs(in + 2 * 4) >> 13 | SafeLoadAs(in + 2 * 5) << 3), - }; - shifts = simd_batch{ 0, 0, 0, 1, 0, 0, 2, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 8; - - // extract 11-bit bundles 8 to 15 - words = simd_batch{ - static_cast(SafeLoadAs(in + 2 * 5) >> 8 | SafeLoadAs(in + 2 * 6) << 8), - SafeLoadAs(in + 2 * 6), - static_cast(SafeLoadAs(in + 2 * 6) >> 14 | SafeLoadAs(in + 2 * 7) << 2), - static_cast(SafeLoadAs(in + 2 * 7) >> 9 | SafeLoadAs(in + 2 * 8) << 7), - SafeLoadAs(in + 2 * 8), - static_cast(SafeLoadAs(in + 2 * 8) >> 15 | SafeLoadAs(in + 2 * 9) << 1), - static_cast(SafeLoadAs(in + 2 * 9) >> 10 | SafeLoadAs(in + 2 * 10) << 6), - SafeLoadAs(in + 2 * 10), - }; - shifts = simd_batch{ 0, 3, 0, 0, 4, 0, 0, 5 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 8; - - in += 11 * 2; - return in; -} - -template<> -const uint8_t* Simd128Unpacker::unpack<12>(const uint8_t* in, uint16_t* out) { - constexpr uint16_t kMask = 0xfff; - - simd_batch masks(kMask); - simd_batch words, shifts; - simd_batch results; - - // extract 12-bit bundles 0 to 7 - words = simd_batch{ - SafeLoadAs(in + 2 * 0), - static_cast(SafeLoadAs(in + 2 * 0) >> 12 | SafeLoadAs(in + 2 * 1) << 4), - static_cast(SafeLoadAs(in + 2 * 1) >> 8 | SafeLoadAs(in + 2 * 2) << 8), - SafeLoadAs(in + 2 * 2), - SafeLoadAs(in + 2 * 3), - static_cast(SafeLoadAs(in + 2 * 3) >> 12 | SafeLoadAs(in + 2 * 4) << 4), - static_cast(SafeLoadAs(in + 2 * 4) >> 8 | SafeLoadAs(in + 2 * 5) << 8), - SafeLoadAs(in + 2 * 5), - }; - shifts = simd_batch{ 0, 0, 0, 4, 0, 0, 0, 4 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 8; - - // extract 12-bit bundles 8 to 15 - words = simd_batch{ - SafeLoadAs(in + 2 * 6), - static_cast(SafeLoadAs(in + 2 * 6) >> 12 | SafeLoadAs(in + 2 * 7) << 4), - static_cast(SafeLoadAs(in + 2 * 7) >> 8 | SafeLoadAs(in + 2 * 8) << 8), - SafeLoadAs(in + 2 * 8), - SafeLoadAs(in + 2 * 9), - static_cast(SafeLoadAs(in + 2 * 9) >> 12 | SafeLoadAs(in + 2 * 10) << 4), - static_cast(SafeLoadAs(in + 2 * 10) >> 8 | SafeLoadAs(in + 2 * 11) << 8), - SafeLoadAs(in + 2 * 11), - }; - shifts = simd_batch{ 0, 0, 0, 4, 0, 0, 0, 4 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 8; - - in += 12 * 2; - return in; -} - -template<> -const uint8_t* Simd128Unpacker::unpack<13>(const uint8_t* in, uint16_t* out) { - constexpr uint16_t kMask = 0x1fff; - - simd_batch masks(kMask); - simd_batch words, shifts; - simd_batch results; - - // extract 13-bit bundles 0 to 7 - words = simd_batch{ - SafeLoadAs(in + 2 * 0), - static_cast(SafeLoadAs(in + 2 * 0) >> 13 | SafeLoadAs(in + 2 * 1) << 3), - static_cast(SafeLoadAs(in + 2 * 1) >> 10 | SafeLoadAs(in + 2 * 2) << 6), - static_cast(SafeLoadAs(in + 2 * 2) >> 7 | SafeLoadAs(in + 2 * 3) << 9), - static_cast(SafeLoadAs(in + 2 * 3) >> 4 | SafeLoadAs(in + 2 * 4) << 12), - SafeLoadAs(in + 2 * 4), - static_cast(SafeLoadAs(in + 2 * 4) >> 14 | SafeLoadAs(in + 2 * 5) << 2), - static_cast(SafeLoadAs(in + 2 * 5) >> 11 | SafeLoadAs(in + 2 * 6) << 5), - }; - shifts = simd_batch{ 0, 0, 0, 0, 0, 1, 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 8; - - // extract 13-bit bundles 8 to 15 - words = simd_batch{ - static_cast(SafeLoadAs(in + 2 * 6) >> 8 | SafeLoadAs(in + 2 * 7) << 8), - static_cast(SafeLoadAs(in + 2 * 7) >> 5 | SafeLoadAs(in + 2 * 8) << 11), - SafeLoadAs(in + 2 * 8), - static_cast(SafeLoadAs(in + 2 * 8) >> 15 | SafeLoadAs(in + 2 * 9) << 1), - static_cast(SafeLoadAs(in + 2 * 9) >> 12 | SafeLoadAs(in + 2 * 10) << 4), - static_cast(SafeLoadAs(in + 2 * 10) >> 9 | SafeLoadAs(in + 2 * 11) << 7), - static_cast(SafeLoadAs(in + 2 * 11) >> 6 | SafeLoadAs(in + 2 * 12) << 10), - SafeLoadAs(in + 2 * 12), - }; - shifts = simd_batch{ 0, 0, 2, 0, 0, 0, 0, 3 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 8; - - in += 13 * 2; - return in; -} - -template<> -const uint8_t* Simd128Unpacker::unpack<14>(const uint8_t* in, uint16_t* out) { - constexpr uint16_t kMask = 0x3fff; - - simd_batch masks(kMask); - simd_batch words, shifts; - simd_batch results; - - // extract 14-bit bundles 0 to 7 - words = simd_batch{ - SafeLoadAs(in + 2 * 0), - static_cast(SafeLoadAs(in + 2 * 0) >> 14 | SafeLoadAs(in + 2 * 1) << 2), - static_cast(SafeLoadAs(in + 2 * 1) >> 12 | SafeLoadAs(in + 2 * 2) << 4), - static_cast(SafeLoadAs(in + 2 * 2) >> 10 | SafeLoadAs(in + 2 * 3) << 6), - static_cast(SafeLoadAs(in + 2 * 3) >> 8 | SafeLoadAs(in + 2 * 4) << 8), - static_cast(SafeLoadAs(in + 2 * 4) >> 6 | SafeLoadAs(in + 2 * 5) << 10), - static_cast(SafeLoadAs(in + 2 * 5) >> 4 | SafeLoadAs(in + 2 * 6) << 12), - SafeLoadAs(in + 2 * 6), - }; - shifts = simd_batch{ 0, 0, 0, 0, 0, 0, 0, 2 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 8; - - // extract 14-bit bundles 8 to 15 - words = simd_batch{ - SafeLoadAs(in + 2 * 7), - static_cast(SafeLoadAs(in + 2 * 7) >> 14 | SafeLoadAs(in + 2 * 8) << 2), - static_cast(SafeLoadAs(in + 2 * 8) >> 12 | SafeLoadAs(in + 2 * 9) << 4), - static_cast(SafeLoadAs(in + 2 * 9) >> 10 | SafeLoadAs(in + 2 * 10) << 6), - static_cast(SafeLoadAs(in + 2 * 10) >> 8 | SafeLoadAs(in + 2 * 11) << 8), - static_cast(SafeLoadAs(in + 2 * 11) >> 6 | SafeLoadAs(in + 2 * 12) << 10), - static_cast(SafeLoadAs(in + 2 * 12) >> 4 | SafeLoadAs(in + 2 * 13) << 12), - SafeLoadAs(in + 2 * 13), - }; - shifts = simd_batch{ 0, 0, 0, 0, 0, 0, 0, 2 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 8; - - in += 14 * 2; - return in; -} - -template<> -const uint8_t* Simd128Unpacker::unpack<15>(const uint8_t* in, uint16_t* out) { - constexpr uint16_t kMask = 0x7fff; - - simd_batch masks(kMask); - simd_batch words, shifts; - simd_batch results; - - // extract 15-bit bundles 0 to 7 - words = simd_batch{ - SafeLoadAs(in + 2 * 0), - static_cast(SafeLoadAs(in + 2 * 0) >> 15 | SafeLoadAs(in + 2 * 1) << 1), - static_cast(SafeLoadAs(in + 2 * 1) >> 14 | SafeLoadAs(in + 2 * 2) << 2), - static_cast(SafeLoadAs(in + 2 * 2) >> 13 | SafeLoadAs(in + 2 * 3) << 3), - static_cast(SafeLoadAs(in + 2 * 3) >> 12 | SafeLoadAs(in + 2 * 4) << 4), - static_cast(SafeLoadAs(in + 2 * 4) >> 11 | SafeLoadAs(in + 2 * 5) << 5), - static_cast(SafeLoadAs(in + 2 * 5) >> 10 | SafeLoadAs(in + 2 * 6) << 6), - static_cast(SafeLoadAs(in + 2 * 6) >> 9 | SafeLoadAs(in + 2 * 7) << 7), - }; - shifts = simd_batch{ 0, 0, 0, 0, 0, 0, 0, 0 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 8; - - // extract 15-bit bundles 8 to 15 - words = simd_batch{ - static_cast(SafeLoadAs(in + 2 * 7) >> 8 | SafeLoadAs(in + 2 * 8) << 8), - static_cast(SafeLoadAs(in + 2 * 8) >> 7 | SafeLoadAs(in + 2 * 9) << 9), - static_cast(SafeLoadAs(in + 2 * 9) >> 6 | SafeLoadAs(in + 2 * 10) << 10), - static_cast(SafeLoadAs(in + 2 * 10) >> 5 | SafeLoadAs(in + 2 * 11) << 11), - static_cast(SafeLoadAs(in + 2 * 11) >> 4 | SafeLoadAs(in + 2 * 12) << 12), - static_cast(SafeLoadAs(in + 2 * 12) >> 3 | SafeLoadAs(in + 2 * 13) << 13), - static_cast(SafeLoadAs(in + 2 * 13) >> 2 | SafeLoadAs(in + 2 * 14) << 14), - SafeLoadAs(in + 2 * 14), - }; - shifts = simd_batch{ 0, 0, 0, 0, 0, 0, 0, 1 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 8; - - in += 15 * 2; - return in; -} - -template<> -const uint8_t* Simd128Unpacker::unpack<16>(const uint8_t* in, uint16_t* out) { - std::memcpy(out, in, 16 * sizeof(*out)); - in += 2 * 16; - out += 16; - return in; -} - template<> struct Simd128Unpacker { diff --git a/cpp/src/arrow/util/bpacking_simd256_generated_internal.h b/cpp/src/arrow/util/bpacking_simd256_generated_internal.h index 1542fa8cb8d..a753ead6782 100644 --- a/cpp/src/arrow/util/bpacking_simd256_generated_internal.h +++ b/cpp/src/arrow/util/bpacking_simd256_generated_internal.h @@ -35,573 +35,6 @@ using ::arrow::util::SafeLoadAs; template struct Simd256Unpacker; -template<> -struct Simd256Unpacker { - -using out_type = uint16_t; -using simd_batch = xsimd::make_sized_batch_t; - -static constexpr int kValuesUnpacked = 16; - -template -static const uint8_t* unpack(const uint8_t* in, uint16_t* out); -}; - -template<> -const uint8_t* Simd256Unpacker::unpack<0>(const uint8_t* in, uint16_t* out) { - std::memset(out, 0x0, 16 * sizeof(*out)); - out += 16; - return in; -} - -template<> -const uint8_t* Simd256Unpacker::unpack<1>(const uint8_t* in, uint16_t* out) { - constexpr uint16_t kMask = 0x1; - - simd_batch masks(kMask); - simd_batch words, shifts; - simd_batch results; - - // extract 1-bit bundles 0 to 15 - words = simd_batch{ - SafeLoadAs(in + 2 * 0), - SafeLoadAs(in + 2 * 0), - SafeLoadAs(in + 2 * 0), - SafeLoadAs(in + 2 * 0), - SafeLoadAs(in + 2 * 0), - SafeLoadAs(in + 2 * 0), - SafeLoadAs(in + 2 * 0), - SafeLoadAs(in + 2 * 0), - SafeLoadAs(in + 2 * 0), - SafeLoadAs(in + 2 * 0), - SafeLoadAs(in + 2 * 0), - SafeLoadAs(in + 2 * 0), - SafeLoadAs(in + 2 * 0), - SafeLoadAs(in + 2 * 0), - SafeLoadAs(in + 2 * 0), - SafeLoadAs(in + 2 * 0), - }; - shifts = simd_batch{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 16; - - in += 1 * 2; - return in; -} - -template<> -const uint8_t* Simd256Unpacker::unpack<2>(const uint8_t* in, uint16_t* out) { - constexpr uint16_t kMask = 0x3; - - simd_batch masks(kMask); - simd_batch words, shifts; - simd_batch results; - - // extract 2-bit bundles 0 to 15 - words = simd_batch{ - SafeLoadAs(in + 2 * 0), - SafeLoadAs(in + 2 * 0), - SafeLoadAs(in + 2 * 0), - SafeLoadAs(in + 2 * 0), - SafeLoadAs(in + 2 * 0), - SafeLoadAs(in + 2 * 0), - SafeLoadAs(in + 2 * 0), - SafeLoadAs(in + 2 * 0), - SafeLoadAs(in + 2 * 1), - SafeLoadAs(in + 2 * 1), - SafeLoadAs(in + 2 * 1), - SafeLoadAs(in + 2 * 1), - SafeLoadAs(in + 2 * 1), - SafeLoadAs(in + 2 * 1), - SafeLoadAs(in + 2 * 1), - SafeLoadAs(in + 2 * 1), - }; - shifts = simd_batch{ 0, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 16; - - in += 2 * 2; - return in; -} - -template<> -const uint8_t* Simd256Unpacker::unpack<3>(const uint8_t* in, uint16_t* out) { - constexpr uint16_t kMask = 0x7; - - simd_batch masks(kMask); - simd_batch words, shifts; - simd_batch results; - - // extract 3-bit bundles 0 to 15 - words = simd_batch{ - SafeLoadAs(in + 2 * 0), - SafeLoadAs(in + 2 * 0), - SafeLoadAs(in + 2 * 0), - SafeLoadAs(in + 2 * 0), - SafeLoadAs(in + 2 * 0), - static_cast(SafeLoadAs(in + 2 * 0) >> 15 | SafeLoadAs(in + 2 * 1) << 1), - SafeLoadAs(in + 2 * 1), - SafeLoadAs(in + 2 * 1), - SafeLoadAs(in + 2 * 1), - SafeLoadAs(in + 2 * 1), - static_cast(SafeLoadAs(in + 2 * 1) >> 14 | SafeLoadAs(in + 2 * 2) << 2), - SafeLoadAs(in + 2 * 2), - SafeLoadAs(in + 2 * 2), - SafeLoadAs(in + 2 * 2), - SafeLoadAs(in + 2 * 2), - SafeLoadAs(in + 2 * 2), - }; - shifts = simd_batch{ 0, 3, 6, 9, 12, 0, 2, 5, 8, 11, 0, 1, 4, 7, 10, 13 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 16; - - in += 3 * 2; - return in; -} - -template<> -const uint8_t* Simd256Unpacker::unpack<4>(const uint8_t* in, uint16_t* out) { - constexpr uint16_t kMask = 0xf; - - simd_batch masks(kMask); - simd_batch words, shifts; - simd_batch results; - - // extract 4-bit bundles 0 to 15 - words = simd_batch{ - SafeLoadAs(in + 2 * 0), - SafeLoadAs(in + 2 * 0), - SafeLoadAs(in + 2 * 0), - SafeLoadAs(in + 2 * 0), - SafeLoadAs(in + 2 * 1), - SafeLoadAs(in + 2 * 1), - SafeLoadAs(in + 2 * 1), - SafeLoadAs(in + 2 * 1), - SafeLoadAs(in + 2 * 2), - SafeLoadAs(in + 2 * 2), - SafeLoadAs(in + 2 * 2), - SafeLoadAs(in + 2 * 2), - SafeLoadAs(in + 2 * 3), - SafeLoadAs(in + 2 * 3), - SafeLoadAs(in + 2 * 3), - SafeLoadAs(in + 2 * 3), - }; - shifts = simd_batch{ 0, 4, 8, 12, 0, 4, 8, 12, 0, 4, 8, 12, 0, 4, 8, 12 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 16; - - in += 4 * 2; - return in; -} - -template<> -const uint8_t* Simd256Unpacker::unpack<5>(const uint8_t* in, uint16_t* out) { - constexpr uint16_t kMask = 0x1f; - - simd_batch masks(kMask); - simd_batch words, shifts; - simd_batch results; - - // extract 5-bit bundles 0 to 15 - words = simd_batch{ - SafeLoadAs(in + 2 * 0), - SafeLoadAs(in + 2 * 0), - SafeLoadAs(in + 2 * 0), - static_cast(SafeLoadAs(in + 2 * 0) >> 15 | SafeLoadAs(in + 2 * 1) << 1), - SafeLoadAs(in + 2 * 1), - SafeLoadAs(in + 2 * 1), - static_cast(SafeLoadAs(in + 2 * 1) >> 14 | SafeLoadAs(in + 2 * 2) << 2), - SafeLoadAs(in + 2 * 2), - SafeLoadAs(in + 2 * 2), - static_cast(SafeLoadAs(in + 2 * 2) >> 13 | SafeLoadAs(in + 2 * 3) << 3), - SafeLoadAs(in + 2 * 3), - SafeLoadAs(in + 2 * 3), - static_cast(SafeLoadAs(in + 2 * 3) >> 12 | SafeLoadAs(in + 2 * 4) << 4), - SafeLoadAs(in + 2 * 4), - SafeLoadAs(in + 2 * 4), - SafeLoadAs(in + 2 * 4), - }; - shifts = simd_batch{ 0, 5, 10, 0, 4, 9, 0, 3, 8, 0, 2, 7, 0, 1, 6, 11 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 16; - - in += 5 * 2; - return in; -} - -template<> -const uint8_t* Simd256Unpacker::unpack<6>(const uint8_t* in, uint16_t* out) { - constexpr uint16_t kMask = 0x3f; - - simd_batch masks(kMask); - simd_batch words, shifts; - simd_batch results; - - // extract 6-bit bundles 0 to 15 - words = simd_batch{ - SafeLoadAs(in + 2 * 0), - SafeLoadAs(in + 2 * 0), - static_cast(SafeLoadAs(in + 2 * 0) >> 12 | SafeLoadAs(in + 2 * 1) << 4), - SafeLoadAs(in + 2 * 1), - SafeLoadAs(in + 2 * 1), - static_cast(SafeLoadAs(in + 2 * 1) >> 14 | SafeLoadAs(in + 2 * 2) << 2), - SafeLoadAs(in + 2 * 2), - SafeLoadAs(in + 2 * 2), - SafeLoadAs(in + 2 * 3), - SafeLoadAs(in + 2 * 3), - static_cast(SafeLoadAs(in + 2 * 3) >> 12 | SafeLoadAs(in + 2 * 4) << 4), - SafeLoadAs(in + 2 * 4), - SafeLoadAs(in + 2 * 4), - static_cast(SafeLoadAs(in + 2 * 4) >> 14 | SafeLoadAs(in + 2 * 5) << 2), - SafeLoadAs(in + 2 * 5), - SafeLoadAs(in + 2 * 5), - }; - shifts = simd_batch{ 0, 6, 0, 2, 8, 0, 4, 10, 0, 6, 0, 2, 8, 0, 4, 10 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 16; - - in += 6 * 2; - return in; -} - -template<> -const uint8_t* Simd256Unpacker::unpack<7>(const uint8_t* in, uint16_t* out) { - constexpr uint16_t kMask = 0x7f; - - simd_batch masks(kMask); - simd_batch words, shifts; - simd_batch results; - - // extract 7-bit bundles 0 to 15 - words = simd_batch{ - SafeLoadAs(in + 2 * 0), - SafeLoadAs(in + 2 * 0), - static_cast(SafeLoadAs(in + 2 * 0) >> 14 | SafeLoadAs(in + 2 * 1) << 2), - SafeLoadAs(in + 2 * 1), - static_cast(SafeLoadAs(in + 2 * 1) >> 12 | SafeLoadAs(in + 2 * 2) << 4), - SafeLoadAs(in + 2 * 2), - static_cast(SafeLoadAs(in + 2 * 2) >> 10 | SafeLoadAs(in + 2 * 3) << 6), - SafeLoadAs(in + 2 * 3), - SafeLoadAs(in + 2 * 3), - static_cast(SafeLoadAs(in + 2 * 3) >> 15 | SafeLoadAs(in + 2 * 4) << 1), - SafeLoadAs(in + 2 * 4), - static_cast(SafeLoadAs(in + 2 * 4) >> 13 | SafeLoadAs(in + 2 * 5) << 3), - SafeLoadAs(in + 2 * 5), - static_cast(SafeLoadAs(in + 2 * 5) >> 11 | SafeLoadAs(in + 2 * 6) << 5), - SafeLoadAs(in + 2 * 6), - SafeLoadAs(in + 2 * 6), - }; - shifts = simd_batch{ 0, 7, 0, 5, 0, 3, 0, 1, 8, 0, 6, 0, 4, 0, 2, 9 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 16; - - in += 7 * 2; - return in; -} - -template<> -const uint8_t* Simd256Unpacker::unpack<8>(const uint8_t* in, uint16_t* out) { - constexpr uint16_t kMask = 0xff; - - simd_batch masks(kMask); - simd_batch words, shifts; - simd_batch results; - - // extract 8-bit bundles 0 to 15 - words = simd_batch{ - SafeLoadAs(in + 2 * 0), - SafeLoadAs(in + 2 * 0), - SafeLoadAs(in + 2 * 1), - SafeLoadAs(in + 2 * 1), - SafeLoadAs(in + 2 * 2), - SafeLoadAs(in + 2 * 2), - SafeLoadAs(in + 2 * 3), - SafeLoadAs(in + 2 * 3), - SafeLoadAs(in + 2 * 4), - SafeLoadAs(in + 2 * 4), - SafeLoadAs(in + 2 * 5), - SafeLoadAs(in + 2 * 5), - SafeLoadAs(in + 2 * 6), - SafeLoadAs(in + 2 * 6), - SafeLoadAs(in + 2 * 7), - SafeLoadAs(in + 2 * 7), - }; - shifts = simd_batch{ 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 16; - - in += 8 * 2; - return in; -} - -template<> -const uint8_t* Simd256Unpacker::unpack<9>(const uint8_t* in, uint16_t* out) { - constexpr uint16_t kMask = 0x1ff; - - simd_batch masks(kMask); - simd_batch words, shifts; - simd_batch results; - - // extract 9-bit bundles 0 to 15 - words = simd_batch{ - SafeLoadAs(in + 2 * 0), - static_cast(SafeLoadAs(in + 2 * 0) >> 9 | SafeLoadAs(in + 2 * 1) << 7), - SafeLoadAs(in + 2 * 1), - static_cast(SafeLoadAs(in + 2 * 1) >> 11 | SafeLoadAs(in + 2 * 2) << 5), - SafeLoadAs(in + 2 * 2), - static_cast(SafeLoadAs(in + 2 * 2) >> 13 | SafeLoadAs(in + 2 * 3) << 3), - SafeLoadAs(in + 2 * 3), - static_cast(SafeLoadAs(in + 2 * 3) >> 15 | SafeLoadAs(in + 2 * 4) << 1), - static_cast(SafeLoadAs(in + 2 * 4) >> 8 | SafeLoadAs(in + 2 * 5) << 8), - SafeLoadAs(in + 2 * 5), - static_cast(SafeLoadAs(in + 2 * 5) >> 10 | SafeLoadAs(in + 2 * 6) << 6), - SafeLoadAs(in + 2 * 6), - static_cast(SafeLoadAs(in + 2 * 6) >> 12 | SafeLoadAs(in + 2 * 7) << 4), - SafeLoadAs(in + 2 * 7), - static_cast(SafeLoadAs(in + 2 * 7) >> 14 | SafeLoadAs(in + 2 * 8) << 2), - SafeLoadAs(in + 2 * 8), - }; - shifts = simd_batch{ 0, 0, 2, 0, 4, 0, 6, 0, 0, 1, 0, 3, 0, 5, 0, 7 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 16; - - in += 9 * 2; - return in; -} - -template<> -const uint8_t* Simd256Unpacker::unpack<10>(const uint8_t* in, uint16_t* out) { - constexpr uint16_t kMask = 0x3ff; - - simd_batch masks(kMask); - simd_batch words, shifts; - simd_batch results; - - // extract 10-bit bundles 0 to 15 - words = simd_batch{ - SafeLoadAs(in + 2 * 0), - static_cast(SafeLoadAs(in + 2 * 0) >> 10 | SafeLoadAs(in + 2 * 1) << 6), - SafeLoadAs(in + 2 * 1), - static_cast(SafeLoadAs(in + 2 * 1) >> 14 | SafeLoadAs(in + 2 * 2) << 2), - static_cast(SafeLoadAs(in + 2 * 2) >> 8 | SafeLoadAs(in + 2 * 3) << 8), - SafeLoadAs(in + 2 * 3), - static_cast(SafeLoadAs(in + 2 * 3) >> 12 | SafeLoadAs(in + 2 * 4) << 4), - SafeLoadAs(in + 2 * 4), - SafeLoadAs(in + 2 * 5), - static_cast(SafeLoadAs(in + 2 * 5) >> 10 | SafeLoadAs(in + 2 * 6) << 6), - SafeLoadAs(in + 2 * 6), - static_cast(SafeLoadAs(in + 2 * 6) >> 14 | SafeLoadAs(in + 2 * 7) << 2), - static_cast(SafeLoadAs(in + 2 * 7) >> 8 | SafeLoadAs(in + 2 * 8) << 8), - SafeLoadAs(in + 2 * 8), - static_cast(SafeLoadAs(in + 2 * 8) >> 12 | SafeLoadAs(in + 2 * 9) << 4), - SafeLoadAs(in + 2 * 9), - }; - shifts = simd_batch{ 0, 0, 4, 0, 0, 2, 0, 6, 0, 0, 4, 0, 0, 2, 0, 6 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 16; - - in += 10 * 2; - return in; -} - -template<> -const uint8_t* Simd256Unpacker::unpack<11>(const uint8_t* in, uint16_t* out) { - constexpr uint16_t kMask = 0x7ff; - - simd_batch masks(kMask); - simd_batch words, shifts; - simd_batch results; - - // extract 11-bit bundles 0 to 15 - words = simd_batch{ - SafeLoadAs(in + 2 * 0), - static_cast(SafeLoadAs(in + 2 * 0) >> 11 | SafeLoadAs(in + 2 * 1) << 5), - static_cast(SafeLoadAs(in + 2 * 1) >> 6 | SafeLoadAs(in + 2 * 2) << 10), - SafeLoadAs(in + 2 * 2), - static_cast(SafeLoadAs(in + 2 * 2) >> 12 | SafeLoadAs(in + 2 * 3) << 4), - static_cast(SafeLoadAs(in + 2 * 3) >> 7 | SafeLoadAs(in + 2 * 4) << 9), - SafeLoadAs(in + 2 * 4), - static_cast(SafeLoadAs(in + 2 * 4) >> 13 | SafeLoadAs(in + 2 * 5) << 3), - static_cast(SafeLoadAs(in + 2 * 5) >> 8 | SafeLoadAs(in + 2 * 6) << 8), - SafeLoadAs(in + 2 * 6), - static_cast(SafeLoadAs(in + 2 * 6) >> 14 | SafeLoadAs(in + 2 * 7) << 2), - static_cast(SafeLoadAs(in + 2 * 7) >> 9 | SafeLoadAs(in + 2 * 8) << 7), - SafeLoadAs(in + 2 * 8), - static_cast(SafeLoadAs(in + 2 * 8) >> 15 | SafeLoadAs(in + 2 * 9) << 1), - static_cast(SafeLoadAs(in + 2 * 9) >> 10 | SafeLoadAs(in + 2 * 10) << 6), - SafeLoadAs(in + 2 * 10), - }; - shifts = simd_batch{ 0, 0, 0, 1, 0, 0, 2, 0, 0, 3, 0, 0, 4, 0, 0, 5 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 16; - - in += 11 * 2; - return in; -} - -template<> -const uint8_t* Simd256Unpacker::unpack<12>(const uint8_t* in, uint16_t* out) { - constexpr uint16_t kMask = 0xfff; - - simd_batch masks(kMask); - simd_batch words, shifts; - simd_batch results; - - // extract 12-bit bundles 0 to 15 - words = simd_batch{ - SafeLoadAs(in + 2 * 0), - static_cast(SafeLoadAs(in + 2 * 0) >> 12 | SafeLoadAs(in + 2 * 1) << 4), - static_cast(SafeLoadAs(in + 2 * 1) >> 8 | SafeLoadAs(in + 2 * 2) << 8), - SafeLoadAs(in + 2 * 2), - SafeLoadAs(in + 2 * 3), - static_cast(SafeLoadAs(in + 2 * 3) >> 12 | SafeLoadAs(in + 2 * 4) << 4), - static_cast(SafeLoadAs(in + 2 * 4) >> 8 | SafeLoadAs(in + 2 * 5) << 8), - SafeLoadAs(in + 2 * 5), - SafeLoadAs(in + 2 * 6), - static_cast(SafeLoadAs(in + 2 * 6) >> 12 | SafeLoadAs(in + 2 * 7) << 4), - static_cast(SafeLoadAs(in + 2 * 7) >> 8 | SafeLoadAs(in + 2 * 8) << 8), - SafeLoadAs(in + 2 * 8), - SafeLoadAs(in + 2 * 9), - static_cast(SafeLoadAs(in + 2 * 9) >> 12 | SafeLoadAs(in + 2 * 10) << 4), - static_cast(SafeLoadAs(in + 2 * 10) >> 8 | SafeLoadAs(in + 2 * 11) << 8), - SafeLoadAs(in + 2 * 11), - }; - shifts = simd_batch{ 0, 0, 0, 4, 0, 0, 0, 4, 0, 0, 0, 4, 0, 0, 0, 4 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 16; - - in += 12 * 2; - return in; -} - -template<> -const uint8_t* Simd256Unpacker::unpack<13>(const uint8_t* in, uint16_t* out) { - constexpr uint16_t kMask = 0x1fff; - - simd_batch masks(kMask); - simd_batch words, shifts; - simd_batch results; - - // extract 13-bit bundles 0 to 15 - words = simd_batch{ - SafeLoadAs(in + 2 * 0), - static_cast(SafeLoadAs(in + 2 * 0) >> 13 | SafeLoadAs(in + 2 * 1) << 3), - static_cast(SafeLoadAs(in + 2 * 1) >> 10 | SafeLoadAs(in + 2 * 2) << 6), - static_cast(SafeLoadAs(in + 2 * 2) >> 7 | SafeLoadAs(in + 2 * 3) << 9), - static_cast(SafeLoadAs(in + 2 * 3) >> 4 | SafeLoadAs(in + 2 * 4) << 12), - SafeLoadAs(in + 2 * 4), - static_cast(SafeLoadAs(in + 2 * 4) >> 14 | SafeLoadAs(in + 2 * 5) << 2), - static_cast(SafeLoadAs(in + 2 * 5) >> 11 | SafeLoadAs(in + 2 * 6) << 5), - static_cast(SafeLoadAs(in + 2 * 6) >> 8 | SafeLoadAs(in + 2 * 7) << 8), - static_cast(SafeLoadAs(in + 2 * 7) >> 5 | SafeLoadAs(in + 2 * 8) << 11), - SafeLoadAs(in + 2 * 8), - static_cast(SafeLoadAs(in + 2 * 8) >> 15 | SafeLoadAs(in + 2 * 9) << 1), - static_cast(SafeLoadAs(in + 2 * 9) >> 12 | SafeLoadAs(in + 2 * 10) << 4), - static_cast(SafeLoadAs(in + 2 * 10) >> 9 | SafeLoadAs(in + 2 * 11) << 7), - static_cast(SafeLoadAs(in + 2 * 11) >> 6 | SafeLoadAs(in + 2 * 12) << 10), - SafeLoadAs(in + 2 * 12), - }; - shifts = simd_batch{ 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 2, 0, 0, 0, 0, 3 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 16; - - in += 13 * 2; - return in; -} - -template<> -const uint8_t* Simd256Unpacker::unpack<14>(const uint8_t* in, uint16_t* out) { - constexpr uint16_t kMask = 0x3fff; - - simd_batch masks(kMask); - simd_batch words, shifts; - simd_batch results; - - // extract 14-bit bundles 0 to 15 - words = simd_batch{ - SafeLoadAs(in + 2 * 0), - static_cast(SafeLoadAs(in + 2 * 0) >> 14 | SafeLoadAs(in + 2 * 1) << 2), - static_cast(SafeLoadAs(in + 2 * 1) >> 12 | SafeLoadAs(in + 2 * 2) << 4), - static_cast(SafeLoadAs(in + 2 * 2) >> 10 | SafeLoadAs(in + 2 * 3) << 6), - static_cast(SafeLoadAs(in + 2 * 3) >> 8 | SafeLoadAs(in + 2 * 4) << 8), - static_cast(SafeLoadAs(in + 2 * 4) >> 6 | SafeLoadAs(in + 2 * 5) << 10), - static_cast(SafeLoadAs(in + 2 * 5) >> 4 | SafeLoadAs(in + 2 * 6) << 12), - SafeLoadAs(in + 2 * 6), - SafeLoadAs(in + 2 * 7), - static_cast(SafeLoadAs(in + 2 * 7) >> 14 | SafeLoadAs(in + 2 * 8) << 2), - static_cast(SafeLoadAs(in + 2 * 8) >> 12 | SafeLoadAs(in + 2 * 9) << 4), - static_cast(SafeLoadAs(in + 2 * 9) >> 10 | SafeLoadAs(in + 2 * 10) << 6), - static_cast(SafeLoadAs(in + 2 * 10) >> 8 | SafeLoadAs(in + 2 * 11) << 8), - static_cast(SafeLoadAs(in + 2 * 11) >> 6 | SafeLoadAs(in + 2 * 12) << 10), - static_cast(SafeLoadAs(in + 2 * 12) >> 4 | SafeLoadAs(in + 2 * 13) << 12), - SafeLoadAs(in + 2 * 13), - }; - shifts = simd_batch{ 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 2 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 16; - - in += 14 * 2; - return in; -} - -template<> -const uint8_t* Simd256Unpacker::unpack<15>(const uint8_t* in, uint16_t* out) { - constexpr uint16_t kMask = 0x7fff; - - simd_batch masks(kMask); - simd_batch words, shifts; - simd_batch results; - - // extract 15-bit bundles 0 to 15 - words = simd_batch{ - SafeLoadAs(in + 2 * 0), - static_cast(SafeLoadAs(in + 2 * 0) >> 15 | SafeLoadAs(in + 2 * 1) << 1), - static_cast(SafeLoadAs(in + 2 * 1) >> 14 | SafeLoadAs(in + 2 * 2) << 2), - static_cast(SafeLoadAs(in + 2 * 2) >> 13 | SafeLoadAs(in + 2 * 3) << 3), - static_cast(SafeLoadAs(in + 2 * 3) >> 12 | SafeLoadAs(in + 2 * 4) << 4), - static_cast(SafeLoadAs(in + 2 * 4) >> 11 | SafeLoadAs(in + 2 * 5) << 5), - static_cast(SafeLoadAs(in + 2 * 5) >> 10 | SafeLoadAs(in + 2 * 6) << 6), - static_cast(SafeLoadAs(in + 2 * 6) >> 9 | SafeLoadAs(in + 2 * 7) << 7), - static_cast(SafeLoadAs(in + 2 * 7) >> 8 | SafeLoadAs(in + 2 * 8) << 8), - static_cast(SafeLoadAs(in + 2 * 8) >> 7 | SafeLoadAs(in + 2 * 9) << 9), - static_cast(SafeLoadAs(in + 2 * 9) >> 6 | SafeLoadAs(in + 2 * 10) << 10), - static_cast(SafeLoadAs(in + 2 * 10) >> 5 | SafeLoadAs(in + 2 * 11) << 11), - static_cast(SafeLoadAs(in + 2 * 11) >> 4 | SafeLoadAs(in + 2 * 12) << 12), - static_cast(SafeLoadAs(in + 2 * 12) >> 3 | SafeLoadAs(in + 2 * 13) << 13), - static_cast(SafeLoadAs(in + 2 * 13) >> 2 | SafeLoadAs(in + 2 * 14) << 14), - SafeLoadAs(in + 2 * 14), - }; - shifts = simd_batch{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 16; - - in += 15 * 2; - return in; -} - -template<> -const uint8_t* Simd256Unpacker::unpack<16>(const uint8_t* in, uint16_t* out) { - std::memcpy(out, in, 16 * sizeof(*out)); - in += 2 * 16; - out += 16; - return in; -} - template<> struct Simd256Unpacker { diff --git a/cpp/src/arrow/util/bpacking_simd512_generated_internal.h b/cpp/src/arrow/util/bpacking_simd512_generated_internal.h index 5b2df999023..d5cdbdc987f 100644 --- a/cpp/src/arrow/util/bpacking_simd512_generated_internal.h +++ b/cpp/src/arrow/util/bpacking_simd512_generated_internal.h @@ -35,573 +35,6 @@ using ::arrow::util::SafeLoadAs; template struct Simd512Unpacker; -template<> -struct Simd512Unpacker { - -using out_type = uint16_t; -using simd_batch = xsimd::make_sized_batch_t; - -static constexpr int kValuesUnpacked = 16; - -template -static const uint8_t* unpack(const uint8_t* in, uint16_t* out); -}; - -template<> -const uint8_t* Simd512Unpacker::unpack<0>(const uint8_t* in, uint16_t* out) { - std::memset(out, 0x0, 16 * sizeof(*out)); - out += 16; - return in; -} - -template<> -const uint8_t* Simd512Unpacker::unpack<1>(const uint8_t* in, uint16_t* out) { - constexpr uint16_t kMask = 0x1; - - simd_batch masks(kMask); - simd_batch words, shifts; - simd_batch results; - - // extract 1-bit bundles 0 to 31 - words = simd_batch{ - SafeLoadAs(in + 2 * 0), - SafeLoadAs(in + 2 * 0), - SafeLoadAs(in + 2 * 0), - SafeLoadAs(in + 2 * 0), - SafeLoadAs(in + 2 * 0), - SafeLoadAs(in + 2 * 0), - SafeLoadAs(in + 2 * 0), - SafeLoadAs(in + 2 * 0), - SafeLoadAs(in + 2 * 0), - SafeLoadAs(in + 2 * 0), - SafeLoadAs(in + 2 * 0), - SafeLoadAs(in + 2 * 0), - SafeLoadAs(in + 2 * 0), - SafeLoadAs(in + 2 * 0), - SafeLoadAs(in + 2 * 0), - SafeLoadAs(in + 2 * 0), - }; - shifts = simd_batch{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 32; - - in += 1 * 2; - return in; -} - -template<> -const uint8_t* Simd512Unpacker::unpack<2>(const uint8_t* in, uint16_t* out) { - constexpr uint16_t kMask = 0x3; - - simd_batch masks(kMask); - simd_batch words, shifts; - simd_batch results; - - // extract 2-bit bundles 0 to 31 - words = simd_batch{ - SafeLoadAs(in + 2 * 0), - SafeLoadAs(in + 2 * 0), - SafeLoadAs(in + 2 * 0), - SafeLoadAs(in + 2 * 0), - SafeLoadAs(in + 2 * 0), - SafeLoadAs(in + 2 * 0), - SafeLoadAs(in + 2 * 0), - SafeLoadAs(in + 2 * 0), - SafeLoadAs(in + 2 * 1), - SafeLoadAs(in + 2 * 1), - SafeLoadAs(in + 2 * 1), - SafeLoadAs(in + 2 * 1), - SafeLoadAs(in + 2 * 1), - SafeLoadAs(in + 2 * 1), - SafeLoadAs(in + 2 * 1), - SafeLoadAs(in + 2 * 1), - }; - shifts = simd_batch{ 0, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 32; - - in += 2 * 2; - return in; -} - -template<> -const uint8_t* Simd512Unpacker::unpack<3>(const uint8_t* in, uint16_t* out) { - constexpr uint16_t kMask = 0x7; - - simd_batch masks(kMask); - simd_batch words, shifts; - simd_batch results; - - // extract 3-bit bundles 0 to 31 - words = simd_batch{ - SafeLoadAs(in + 2 * 0), - SafeLoadAs(in + 2 * 0), - SafeLoadAs(in + 2 * 0), - SafeLoadAs(in + 2 * 0), - SafeLoadAs(in + 2 * 0), - static_cast(SafeLoadAs(in + 2 * 0) >> 15 | SafeLoadAs(in + 2 * 1) << 1), - SafeLoadAs(in + 2 * 1), - SafeLoadAs(in + 2 * 1), - SafeLoadAs(in + 2 * 1), - SafeLoadAs(in + 2 * 1), - static_cast(SafeLoadAs(in + 2 * 1) >> 14 | SafeLoadAs(in + 2 * 2) << 2), - SafeLoadAs(in + 2 * 2), - SafeLoadAs(in + 2 * 2), - SafeLoadAs(in + 2 * 2), - SafeLoadAs(in + 2 * 2), - SafeLoadAs(in + 2 * 2), - }; - shifts = simd_batch{ 0, 3, 6, 9, 12, 0, 2, 5, 8, 11, 0, 1, 4, 7, 10, 13 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 32; - - in += 3 * 2; - return in; -} - -template<> -const uint8_t* Simd512Unpacker::unpack<4>(const uint8_t* in, uint16_t* out) { - constexpr uint16_t kMask = 0xf; - - simd_batch masks(kMask); - simd_batch words, shifts; - simd_batch results; - - // extract 4-bit bundles 0 to 31 - words = simd_batch{ - SafeLoadAs(in + 2 * 0), - SafeLoadAs(in + 2 * 0), - SafeLoadAs(in + 2 * 0), - SafeLoadAs(in + 2 * 0), - SafeLoadAs(in + 2 * 1), - SafeLoadAs(in + 2 * 1), - SafeLoadAs(in + 2 * 1), - SafeLoadAs(in + 2 * 1), - SafeLoadAs(in + 2 * 2), - SafeLoadAs(in + 2 * 2), - SafeLoadAs(in + 2 * 2), - SafeLoadAs(in + 2 * 2), - SafeLoadAs(in + 2 * 3), - SafeLoadAs(in + 2 * 3), - SafeLoadAs(in + 2 * 3), - SafeLoadAs(in + 2 * 3), - }; - shifts = simd_batch{ 0, 4, 8, 12, 0, 4, 8, 12, 0, 4, 8, 12, 0, 4, 8, 12 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 32; - - in += 4 * 2; - return in; -} - -template<> -const uint8_t* Simd512Unpacker::unpack<5>(const uint8_t* in, uint16_t* out) { - constexpr uint16_t kMask = 0x1f; - - simd_batch masks(kMask); - simd_batch words, shifts; - simd_batch results; - - // extract 5-bit bundles 0 to 31 - words = simd_batch{ - SafeLoadAs(in + 2 * 0), - SafeLoadAs(in + 2 * 0), - SafeLoadAs(in + 2 * 0), - static_cast(SafeLoadAs(in + 2 * 0) >> 15 | SafeLoadAs(in + 2 * 1) << 1), - SafeLoadAs(in + 2 * 1), - SafeLoadAs(in + 2 * 1), - static_cast(SafeLoadAs(in + 2 * 1) >> 14 | SafeLoadAs(in + 2 * 2) << 2), - SafeLoadAs(in + 2 * 2), - SafeLoadAs(in + 2 * 2), - static_cast(SafeLoadAs(in + 2 * 2) >> 13 | SafeLoadAs(in + 2 * 3) << 3), - SafeLoadAs(in + 2 * 3), - SafeLoadAs(in + 2 * 3), - static_cast(SafeLoadAs(in + 2 * 3) >> 12 | SafeLoadAs(in + 2 * 4) << 4), - SafeLoadAs(in + 2 * 4), - SafeLoadAs(in + 2 * 4), - SafeLoadAs(in + 2 * 4), - }; - shifts = simd_batch{ 0, 5, 10, 0, 4, 9, 0, 3, 8, 0, 2, 7, 0, 1, 6, 11 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 32; - - in += 5 * 2; - return in; -} - -template<> -const uint8_t* Simd512Unpacker::unpack<6>(const uint8_t* in, uint16_t* out) { - constexpr uint16_t kMask = 0x3f; - - simd_batch masks(kMask); - simd_batch words, shifts; - simd_batch results; - - // extract 6-bit bundles 0 to 31 - words = simd_batch{ - SafeLoadAs(in + 2 * 0), - SafeLoadAs(in + 2 * 0), - static_cast(SafeLoadAs(in + 2 * 0) >> 12 | SafeLoadAs(in + 2 * 1) << 4), - SafeLoadAs(in + 2 * 1), - SafeLoadAs(in + 2 * 1), - static_cast(SafeLoadAs(in + 2 * 1) >> 14 | SafeLoadAs(in + 2 * 2) << 2), - SafeLoadAs(in + 2 * 2), - SafeLoadAs(in + 2 * 2), - SafeLoadAs(in + 2 * 3), - SafeLoadAs(in + 2 * 3), - static_cast(SafeLoadAs(in + 2 * 3) >> 12 | SafeLoadAs(in + 2 * 4) << 4), - SafeLoadAs(in + 2 * 4), - SafeLoadAs(in + 2 * 4), - static_cast(SafeLoadAs(in + 2 * 4) >> 14 | SafeLoadAs(in + 2 * 5) << 2), - SafeLoadAs(in + 2 * 5), - SafeLoadAs(in + 2 * 5), - }; - shifts = simd_batch{ 0, 6, 0, 2, 8, 0, 4, 10, 0, 6, 0, 2, 8, 0, 4, 10 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 32; - - in += 6 * 2; - return in; -} - -template<> -const uint8_t* Simd512Unpacker::unpack<7>(const uint8_t* in, uint16_t* out) { - constexpr uint16_t kMask = 0x7f; - - simd_batch masks(kMask); - simd_batch words, shifts; - simd_batch results; - - // extract 7-bit bundles 0 to 31 - words = simd_batch{ - SafeLoadAs(in + 2 * 0), - SafeLoadAs(in + 2 * 0), - static_cast(SafeLoadAs(in + 2 * 0) >> 14 | SafeLoadAs(in + 2 * 1) << 2), - SafeLoadAs(in + 2 * 1), - static_cast(SafeLoadAs(in + 2 * 1) >> 12 | SafeLoadAs(in + 2 * 2) << 4), - SafeLoadAs(in + 2 * 2), - static_cast(SafeLoadAs(in + 2 * 2) >> 10 | SafeLoadAs(in + 2 * 3) << 6), - SafeLoadAs(in + 2 * 3), - SafeLoadAs(in + 2 * 3), - static_cast(SafeLoadAs(in + 2 * 3) >> 15 | SafeLoadAs(in + 2 * 4) << 1), - SafeLoadAs(in + 2 * 4), - static_cast(SafeLoadAs(in + 2 * 4) >> 13 | SafeLoadAs(in + 2 * 5) << 3), - SafeLoadAs(in + 2 * 5), - static_cast(SafeLoadAs(in + 2 * 5) >> 11 | SafeLoadAs(in + 2 * 6) << 5), - SafeLoadAs(in + 2 * 6), - SafeLoadAs(in + 2 * 6), - }; - shifts = simd_batch{ 0, 7, 0, 5, 0, 3, 0, 1, 8, 0, 6, 0, 4, 0, 2, 9 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 32; - - in += 7 * 2; - return in; -} - -template<> -const uint8_t* Simd512Unpacker::unpack<8>(const uint8_t* in, uint16_t* out) { - constexpr uint16_t kMask = 0xff; - - simd_batch masks(kMask); - simd_batch words, shifts; - simd_batch results; - - // extract 8-bit bundles 0 to 31 - words = simd_batch{ - SafeLoadAs(in + 2 * 0), - SafeLoadAs(in + 2 * 0), - SafeLoadAs(in + 2 * 1), - SafeLoadAs(in + 2 * 1), - SafeLoadAs(in + 2 * 2), - SafeLoadAs(in + 2 * 2), - SafeLoadAs(in + 2 * 3), - SafeLoadAs(in + 2 * 3), - SafeLoadAs(in + 2 * 4), - SafeLoadAs(in + 2 * 4), - SafeLoadAs(in + 2 * 5), - SafeLoadAs(in + 2 * 5), - SafeLoadAs(in + 2 * 6), - SafeLoadAs(in + 2 * 6), - SafeLoadAs(in + 2 * 7), - SafeLoadAs(in + 2 * 7), - }; - shifts = simd_batch{ 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 32; - - in += 8 * 2; - return in; -} - -template<> -const uint8_t* Simd512Unpacker::unpack<9>(const uint8_t* in, uint16_t* out) { - constexpr uint16_t kMask = 0x1ff; - - simd_batch masks(kMask); - simd_batch words, shifts; - simd_batch results; - - // extract 9-bit bundles 0 to 31 - words = simd_batch{ - SafeLoadAs(in + 2 * 0), - static_cast(SafeLoadAs(in + 2 * 0) >> 9 | SafeLoadAs(in + 2 * 1) << 7), - SafeLoadAs(in + 2 * 1), - static_cast(SafeLoadAs(in + 2 * 1) >> 11 | SafeLoadAs(in + 2 * 2) << 5), - SafeLoadAs(in + 2 * 2), - static_cast(SafeLoadAs(in + 2 * 2) >> 13 | SafeLoadAs(in + 2 * 3) << 3), - SafeLoadAs(in + 2 * 3), - static_cast(SafeLoadAs(in + 2 * 3) >> 15 | SafeLoadAs(in + 2 * 4) << 1), - static_cast(SafeLoadAs(in + 2 * 4) >> 8 | SafeLoadAs(in + 2 * 5) << 8), - SafeLoadAs(in + 2 * 5), - static_cast(SafeLoadAs(in + 2 * 5) >> 10 | SafeLoadAs(in + 2 * 6) << 6), - SafeLoadAs(in + 2 * 6), - static_cast(SafeLoadAs(in + 2 * 6) >> 12 | SafeLoadAs(in + 2 * 7) << 4), - SafeLoadAs(in + 2 * 7), - static_cast(SafeLoadAs(in + 2 * 7) >> 14 | SafeLoadAs(in + 2 * 8) << 2), - SafeLoadAs(in + 2 * 8), - }; - shifts = simd_batch{ 0, 0, 2, 0, 4, 0, 6, 0, 0, 1, 0, 3, 0, 5, 0, 7 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 32; - - in += 9 * 2; - return in; -} - -template<> -const uint8_t* Simd512Unpacker::unpack<10>(const uint8_t* in, uint16_t* out) { - constexpr uint16_t kMask = 0x3ff; - - simd_batch masks(kMask); - simd_batch words, shifts; - simd_batch results; - - // extract 10-bit bundles 0 to 31 - words = simd_batch{ - SafeLoadAs(in + 2 * 0), - static_cast(SafeLoadAs(in + 2 * 0) >> 10 | SafeLoadAs(in + 2 * 1) << 6), - SafeLoadAs(in + 2 * 1), - static_cast(SafeLoadAs(in + 2 * 1) >> 14 | SafeLoadAs(in + 2 * 2) << 2), - static_cast(SafeLoadAs(in + 2 * 2) >> 8 | SafeLoadAs(in + 2 * 3) << 8), - SafeLoadAs(in + 2 * 3), - static_cast(SafeLoadAs(in + 2 * 3) >> 12 | SafeLoadAs(in + 2 * 4) << 4), - SafeLoadAs(in + 2 * 4), - SafeLoadAs(in + 2 * 5), - static_cast(SafeLoadAs(in + 2 * 5) >> 10 | SafeLoadAs(in + 2 * 6) << 6), - SafeLoadAs(in + 2 * 6), - static_cast(SafeLoadAs(in + 2 * 6) >> 14 | SafeLoadAs(in + 2 * 7) << 2), - static_cast(SafeLoadAs(in + 2 * 7) >> 8 | SafeLoadAs(in + 2 * 8) << 8), - SafeLoadAs(in + 2 * 8), - static_cast(SafeLoadAs(in + 2 * 8) >> 12 | SafeLoadAs(in + 2 * 9) << 4), - SafeLoadAs(in + 2 * 9), - }; - shifts = simd_batch{ 0, 0, 4, 0, 0, 2, 0, 6, 0, 0, 4, 0, 0, 2, 0, 6 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 32; - - in += 10 * 2; - return in; -} - -template<> -const uint8_t* Simd512Unpacker::unpack<11>(const uint8_t* in, uint16_t* out) { - constexpr uint16_t kMask = 0x7ff; - - simd_batch masks(kMask); - simd_batch words, shifts; - simd_batch results; - - // extract 11-bit bundles 0 to 31 - words = simd_batch{ - SafeLoadAs(in + 2 * 0), - static_cast(SafeLoadAs(in + 2 * 0) >> 11 | SafeLoadAs(in + 2 * 1) << 5), - static_cast(SafeLoadAs(in + 2 * 1) >> 6 | SafeLoadAs(in + 2 * 2) << 10), - SafeLoadAs(in + 2 * 2), - static_cast(SafeLoadAs(in + 2 * 2) >> 12 | SafeLoadAs(in + 2 * 3) << 4), - static_cast(SafeLoadAs(in + 2 * 3) >> 7 | SafeLoadAs(in + 2 * 4) << 9), - SafeLoadAs(in + 2 * 4), - static_cast(SafeLoadAs(in + 2 * 4) >> 13 | SafeLoadAs(in + 2 * 5) << 3), - static_cast(SafeLoadAs(in + 2 * 5) >> 8 | SafeLoadAs(in + 2 * 6) << 8), - SafeLoadAs(in + 2 * 6), - static_cast(SafeLoadAs(in + 2 * 6) >> 14 | SafeLoadAs(in + 2 * 7) << 2), - static_cast(SafeLoadAs(in + 2 * 7) >> 9 | SafeLoadAs(in + 2 * 8) << 7), - SafeLoadAs(in + 2 * 8), - static_cast(SafeLoadAs(in + 2 * 8) >> 15 | SafeLoadAs(in + 2 * 9) << 1), - static_cast(SafeLoadAs(in + 2 * 9) >> 10 | SafeLoadAs(in + 2 * 10) << 6), - SafeLoadAs(in + 2 * 10), - }; - shifts = simd_batch{ 0, 0, 0, 1, 0, 0, 2, 0, 0, 3, 0, 0, 4, 0, 0, 5 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 32; - - in += 11 * 2; - return in; -} - -template<> -const uint8_t* Simd512Unpacker::unpack<12>(const uint8_t* in, uint16_t* out) { - constexpr uint16_t kMask = 0xfff; - - simd_batch masks(kMask); - simd_batch words, shifts; - simd_batch results; - - // extract 12-bit bundles 0 to 31 - words = simd_batch{ - SafeLoadAs(in + 2 * 0), - static_cast(SafeLoadAs(in + 2 * 0) >> 12 | SafeLoadAs(in + 2 * 1) << 4), - static_cast(SafeLoadAs(in + 2 * 1) >> 8 | SafeLoadAs(in + 2 * 2) << 8), - SafeLoadAs(in + 2 * 2), - SafeLoadAs(in + 2 * 3), - static_cast(SafeLoadAs(in + 2 * 3) >> 12 | SafeLoadAs(in + 2 * 4) << 4), - static_cast(SafeLoadAs(in + 2 * 4) >> 8 | SafeLoadAs(in + 2 * 5) << 8), - SafeLoadAs(in + 2 * 5), - SafeLoadAs(in + 2 * 6), - static_cast(SafeLoadAs(in + 2 * 6) >> 12 | SafeLoadAs(in + 2 * 7) << 4), - static_cast(SafeLoadAs(in + 2 * 7) >> 8 | SafeLoadAs(in + 2 * 8) << 8), - SafeLoadAs(in + 2 * 8), - SafeLoadAs(in + 2 * 9), - static_cast(SafeLoadAs(in + 2 * 9) >> 12 | SafeLoadAs(in + 2 * 10) << 4), - static_cast(SafeLoadAs(in + 2 * 10) >> 8 | SafeLoadAs(in + 2 * 11) << 8), - SafeLoadAs(in + 2 * 11), - }; - shifts = simd_batch{ 0, 0, 0, 4, 0, 0, 0, 4, 0, 0, 0, 4, 0, 0, 0, 4 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 32; - - in += 12 * 2; - return in; -} - -template<> -const uint8_t* Simd512Unpacker::unpack<13>(const uint8_t* in, uint16_t* out) { - constexpr uint16_t kMask = 0x1fff; - - simd_batch masks(kMask); - simd_batch words, shifts; - simd_batch results; - - // extract 13-bit bundles 0 to 31 - words = simd_batch{ - SafeLoadAs(in + 2 * 0), - static_cast(SafeLoadAs(in + 2 * 0) >> 13 | SafeLoadAs(in + 2 * 1) << 3), - static_cast(SafeLoadAs(in + 2 * 1) >> 10 | SafeLoadAs(in + 2 * 2) << 6), - static_cast(SafeLoadAs(in + 2 * 2) >> 7 | SafeLoadAs(in + 2 * 3) << 9), - static_cast(SafeLoadAs(in + 2 * 3) >> 4 | SafeLoadAs(in + 2 * 4) << 12), - SafeLoadAs(in + 2 * 4), - static_cast(SafeLoadAs(in + 2 * 4) >> 14 | SafeLoadAs(in + 2 * 5) << 2), - static_cast(SafeLoadAs(in + 2 * 5) >> 11 | SafeLoadAs(in + 2 * 6) << 5), - static_cast(SafeLoadAs(in + 2 * 6) >> 8 | SafeLoadAs(in + 2 * 7) << 8), - static_cast(SafeLoadAs(in + 2 * 7) >> 5 | SafeLoadAs(in + 2 * 8) << 11), - SafeLoadAs(in + 2 * 8), - static_cast(SafeLoadAs(in + 2 * 8) >> 15 | SafeLoadAs(in + 2 * 9) << 1), - static_cast(SafeLoadAs(in + 2 * 9) >> 12 | SafeLoadAs(in + 2 * 10) << 4), - static_cast(SafeLoadAs(in + 2 * 10) >> 9 | SafeLoadAs(in + 2 * 11) << 7), - static_cast(SafeLoadAs(in + 2 * 11) >> 6 | SafeLoadAs(in + 2 * 12) << 10), - SafeLoadAs(in + 2 * 12), - }; - shifts = simd_batch{ 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 2, 0, 0, 0, 0, 3 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 32; - - in += 13 * 2; - return in; -} - -template<> -const uint8_t* Simd512Unpacker::unpack<14>(const uint8_t* in, uint16_t* out) { - constexpr uint16_t kMask = 0x3fff; - - simd_batch masks(kMask); - simd_batch words, shifts; - simd_batch results; - - // extract 14-bit bundles 0 to 31 - words = simd_batch{ - SafeLoadAs(in + 2 * 0), - static_cast(SafeLoadAs(in + 2 * 0) >> 14 | SafeLoadAs(in + 2 * 1) << 2), - static_cast(SafeLoadAs(in + 2 * 1) >> 12 | SafeLoadAs(in + 2 * 2) << 4), - static_cast(SafeLoadAs(in + 2 * 2) >> 10 | SafeLoadAs(in + 2 * 3) << 6), - static_cast(SafeLoadAs(in + 2 * 3) >> 8 | SafeLoadAs(in + 2 * 4) << 8), - static_cast(SafeLoadAs(in + 2 * 4) >> 6 | SafeLoadAs(in + 2 * 5) << 10), - static_cast(SafeLoadAs(in + 2 * 5) >> 4 | SafeLoadAs(in + 2 * 6) << 12), - SafeLoadAs(in + 2 * 6), - SafeLoadAs(in + 2 * 7), - static_cast(SafeLoadAs(in + 2 * 7) >> 14 | SafeLoadAs(in + 2 * 8) << 2), - static_cast(SafeLoadAs(in + 2 * 8) >> 12 | SafeLoadAs(in + 2 * 9) << 4), - static_cast(SafeLoadAs(in + 2 * 9) >> 10 | SafeLoadAs(in + 2 * 10) << 6), - static_cast(SafeLoadAs(in + 2 * 10) >> 8 | SafeLoadAs(in + 2 * 11) << 8), - static_cast(SafeLoadAs(in + 2 * 11) >> 6 | SafeLoadAs(in + 2 * 12) << 10), - static_cast(SafeLoadAs(in + 2 * 12) >> 4 | SafeLoadAs(in + 2 * 13) << 12), - SafeLoadAs(in + 2 * 13), - }; - shifts = simd_batch{ 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 2 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 32; - - in += 14 * 2; - return in; -} - -template<> -const uint8_t* Simd512Unpacker::unpack<15>(const uint8_t* in, uint16_t* out) { - constexpr uint16_t kMask = 0x7fff; - - simd_batch masks(kMask); - simd_batch words, shifts; - simd_batch results; - - // extract 15-bit bundles 0 to 31 - words = simd_batch{ - SafeLoadAs(in + 2 * 0), - static_cast(SafeLoadAs(in + 2 * 0) >> 15 | SafeLoadAs(in + 2 * 1) << 1), - static_cast(SafeLoadAs(in + 2 * 1) >> 14 | SafeLoadAs(in + 2 * 2) << 2), - static_cast(SafeLoadAs(in + 2 * 2) >> 13 | SafeLoadAs(in + 2 * 3) << 3), - static_cast(SafeLoadAs(in + 2 * 3) >> 12 | SafeLoadAs(in + 2 * 4) << 4), - static_cast(SafeLoadAs(in + 2 * 4) >> 11 | SafeLoadAs(in + 2 * 5) << 5), - static_cast(SafeLoadAs(in + 2 * 5) >> 10 | SafeLoadAs(in + 2 * 6) << 6), - static_cast(SafeLoadAs(in + 2 * 6) >> 9 | SafeLoadAs(in + 2 * 7) << 7), - static_cast(SafeLoadAs(in + 2 * 7) >> 8 | SafeLoadAs(in + 2 * 8) << 8), - static_cast(SafeLoadAs(in + 2 * 8) >> 7 | SafeLoadAs(in + 2 * 9) << 9), - static_cast(SafeLoadAs(in + 2 * 9) >> 6 | SafeLoadAs(in + 2 * 10) << 10), - static_cast(SafeLoadAs(in + 2 * 10) >> 5 | SafeLoadAs(in + 2 * 11) << 11), - static_cast(SafeLoadAs(in + 2 * 11) >> 4 | SafeLoadAs(in + 2 * 12) << 12), - static_cast(SafeLoadAs(in + 2 * 12) >> 3 | SafeLoadAs(in + 2 * 13) << 13), - static_cast(SafeLoadAs(in + 2 * 13) >> 2 | SafeLoadAs(in + 2 * 14) << 14), - SafeLoadAs(in + 2 * 14), - }; - shifts = simd_batch{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 32; - - in += 15 * 2; - return in; -} - -template<> -const uint8_t* Simd512Unpacker::unpack<16>(const uint8_t* in, uint16_t* out) { - std::memcpy(out, in, 16 * sizeof(*out)); - in += 2 * 16; - out += 16; - return in; -} - template<> struct Simd512Unpacker { From 642d252f2c3aa6bab33628396606d06937228aad Mon Sep 17 00:00:00 2001 From: AntoinePrv Date: Fri, 19 Sep 2025 14:35:33 +0200 Subject: [PATCH 24/76] Add SSE4.2 instantiation to bpacking --- cpp/src/arrow/util/bpacking.cc | 11 +++++++++++ cpp/src/arrow/util/bpacking_benchmark.cc | 5 +++++ cpp/src/arrow/util/bpacking_internal.h | 6 ++++++ cpp/src/arrow/util/bpacking_test.cc | 4 ++++ 4 files changed, 26 insertions(+) diff --git a/cpp/src/arrow/util/bpacking.cc b/cpp/src/arrow/util/bpacking.cc index 903f5e0c144..286614156a2 100644 --- a/cpp/src/arrow/util/bpacking.cc +++ b/cpp/src/arrow/util/bpacking.cc @@ -21,6 +21,10 @@ #include "arrow/util/cpu_info.h" #include "arrow/util/dispatch_internal.h" +#if defined(ARROW_HAVE_SSE4_2) +# include "arrow/util/bpacking_simd128_generated_internal.h" +#endif + #if defined(ARROW_HAVE_RUNTIME_AVX2) # include "arrow/util/bpacking_avx2_internal.h" #endif @@ -34,6 +38,13 @@ namespace arrow { namespace internal { +// TODO probably better in its own file +#if defined(ARROW_HAVE_SSE4_2) +int unpack32_sse4_2(const uint8_t* in, uint32_t* out, int batch_size, int num_bits) { + return unpack_jump32>(in, out, batch_size, num_bits); +} +#endif + int unpack32_scalar(const uint8_t* in, uint32_t* out, int batch_size, int num_bits) { return unpack_jump32>(in, out, batch_size, num_bits); } diff --git a/cpp/src/arrow/util/bpacking_benchmark.cc b/cpp/src/arrow/util/bpacking_benchmark.cc index f0ac22910c6..20e2d065c64 100644 --- a/cpp/src/arrow/util/bpacking_benchmark.cc +++ b/cpp/src/arrow/util/bpacking_benchmark.cc @@ -129,6 +129,11 @@ BENCHMARK_CAPTURE(BM_UnpackUint32, ScalarUnaligned, false, unpack32_scalar) BENCHMARK_CAPTURE(BM_UnpackUint64, ScalarUnaligned, false, unpack64_scalar) ->ArgsProduct(kBitWidthsNumValues64); +#if defined(ARROW_HAVE_SSE4_2) +BENCHMARK_CAPTURE(BM_UnpackUint32, Sse42Unaligned, false, unpack32_sse4_2) + ->ArgsProduct(kBitWidthsNumValues32); +#endif + #if defined(ARROW_HAVE_RUNTIME_AVX2) BENCHMARK_CAPTURE(BM_UnpackUint32, Avx2Unaligned, false, unpack32_avx2, !CpuInfo::GetInstance()->IsSupported(CpuInfo::AVX2), diff --git a/cpp/src/arrow/util/bpacking_internal.h b/cpp/src/arrow/util/bpacking_internal.h index e003cd8c0c6..e1618b44821 100644 --- a/cpp/src/arrow/util/bpacking_internal.h +++ b/cpp/src/arrow/util/bpacking_internal.h @@ -23,6 +23,12 @@ namespace arrow::internal { +#if defined(ARROW_HAVE_SSE4_2) +/// The 32 bit unpacking with SSE 4.2 +ARROW_EXPORT int unpack32_sse4_2(const uint8_t* in, uint32_t* out, int batch_size, + int num_bits); +#endif + /// The scalar 32 bit unpacking. ARROW_EXPORT int unpack32_scalar(const uint8_t* in, uint32_t* out, int batch_size, int num_bits); diff --git a/cpp/src/arrow/util/bpacking_test.cc b/cpp/src/arrow/util/bpacking_test.cc index a5f8b5ea1e1..728f9048355 100644 --- a/cpp/src/arrow/util/bpacking_test.cc +++ b/cpp/src/arrow/util/bpacking_test.cc @@ -230,6 +230,10 @@ INSTANTIATE_TEST_SUITE_P( TEST_P(TestUnpack, Unpack32Scalar) { this->TestAll(&unpack32_scalar); } TEST_P(TestUnpack, Unpack64Scalar) { this->TestAll(&unpack64_scalar); } +#if defined(ARROW_HAVE_SSE4_2) +TEST_P(TestUnpack, Unpack32Sse42) { this->TestAll(&unpack32_sse4_2); } +#endif + #if defined(ARROW_HAVE_RUNTIME_AVX2) TEST_P(TestUnpack, Unpack32Avx2) { if (!CpuInfo::GetInstance()->IsSupported(CpuInfo::AVX2)) { From ca9ce76be8cd05f28d35c2db00080df9d5f36a07 Mon Sep 17 00:00:00 2001 From: AntoinePrv Date: Fri, 19 Sep 2025 14:51:16 +0200 Subject: [PATCH 25/76] Try: new simd scheme --- .../bpacking_simd128_generated_internal.h | 2642 ++++++++--------- 1 file changed, 1297 insertions(+), 1345 deletions(-) diff --git a/cpp/src/arrow/util/bpacking_simd128_generated_internal.h b/cpp/src/arrow/util/bpacking_simd128_generated_internal.h index ca78d37c4ff..e2ce7e6671a 100644 --- a/cpp/src/arrow/util/bpacking_simd128_generated_internal.h +++ b/cpp/src/arrow/util/bpacking_simd128_generated_internal.h @@ -32,137 +32,89 @@ namespace arrow::internal { using ::arrow::util::SafeLoadAs; -template +template struct Simd128Unpacker; -template<> +template <> struct Simd128Unpacker { - -using out_type = uint32_t; -using simd_batch = xsimd::make_sized_batch_t; - -static constexpr int kValuesUnpacked = 32; - -template -static const uint8_t* unpack(const uint8_t* in, uint32_t* out); + using out_type = uint32_t; + using simd_batch = xsimd::make_sized_batch_t; + template + using simd_batch_constants = + xsimd::batch_constant; + using simd_bytes = + xsimd::make_sized_batch_t; + template + using simd_bytes_constants = + xsimd::batch_constant; + + static constexpr int kValuesUnpacked = 32; + + template + static const uint8_t* unpack(const uint8_t* in, uint32_t* out); }; -template<> +template <> const uint8_t* Simd128Unpacker::unpack<0>(const uint8_t* in, uint32_t* out) { std::memset(out, 0x0, 32 * sizeof(*out)); out += 32; return in; } -template<> +template +struct Reorder { + static constexpr unsigned get(unsigned i, unsigned n) { + if (i % 4 == 0) { + return K; + } + return 128; + } +}; + +template <> const uint8_t* Simd128Unpacker::unpack<1>(const uint8_t* in, uint32_t* out) { constexpr uint32_t kMask = 0x1; - simd_batch masks(kMask); - simd_batch words, shifts; - simd_batch results; - - // extract 1-bit bundles 0 to 3 - words = simd_batch{ - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0), - }; - shifts = simd_batch{ 0, 1, 2, 3 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 1-bit bundles 4 to 7 - words = simd_batch{ - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0), - }; - shifts = simd_batch{ 4, 5, 6, 7 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 1-bit bundles 8 to 11 - words = simd_batch{ - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0), - }; - shifts = simd_batch{ 8, 9, 10, 11 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 1-bit bundles 12 to 15 - words = simd_batch{ - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0), - }; - shifts = simd_batch{ 12, 13, 14, 15 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 1-bit bundles 16 to 19 - words = simd_batch{ - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0), - }; - shifts = simd_batch{ 16, 17, 18, 19 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 1-bit bundles 20 to 23 - words = simd_batch{ - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0), - }; - shifts = simd_batch{ 20, 21, 22, 23 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 1-bit bundles 24 to 27 - words = simd_batch{ - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0), - }; - shifts = simd_batch{ 24, 25, 26, 27 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 1-bit bundles 28 to 31 - words = simd_batch{ - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0), - }; - shifts = simd_batch{ 28, 29, 30, 31 }; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - in += 1 * 4; - return in; + constexpr auto kShifts1 = simd_batch_constants<0, 1, 2, 3>{}; + constexpr auto kShifts2 = simd_batch_constants<4, 5, 6, 7>{}; + + { + auto bytes = simd_bytes::load_unaligned(in + 4 * 0); + // TODO var shifts no avail on SSE + { + constexpr auto kReorder = + xsimd::make_batch_constant>(); + auto numbers = xsimd::bitwise_cast(xsimd::swizzle(bytes, kReorder)); + ((numbers >> kShifts1) & kMask).store_unaligned(out + 0 * 4); + ((numbers >> kShifts2) & kMask).store_unaligned(out + 1 * 4); + } + { + constexpr auto kReorder = + xsimd::make_batch_constant>(); + auto numbers = xsimd::bitwise_cast(xsimd::swizzle(bytes, kReorder)); + ((numbers >> kShifts1) & kMask).store_unaligned(out + 2 * 4); + ((numbers >> kShifts2) & kMask).store_unaligned(out + 3 * 4); + } + { + constexpr auto kReorder = + xsimd::make_batch_constant>(); + auto numbers = xsimd::bitwise_cast(xsimd::swizzle(bytes, kReorder)); + ((numbers >> kShifts1) & kMask).store_unaligned(out + 4 * 4); + ((numbers >> kShifts2) & kMask).store_unaligned(out + 5 * 4); + } + { + constexpr auto kReorder = + xsimd::make_batch_constant>(); + auto numbers = xsimd::bitwise_cast(xsimd::swizzle(bytes, kReorder)); + ((numbers >> kShifts1) & kMask).store_unaligned(out + 6 * 4); + ((numbers >> kShifts2) & kMask).store_unaligned(out + 7 * 4); + } + } + + return in + 4; } -template<> +template <> const uint8_t* Simd128Unpacker::unpack<2>(const uint8_t* in, uint32_t* out) { constexpr uint32_t kMask = 0x3; @@ -172,96 +124,96 @@ const uint8_t* Simd128Unpacker::unpack<2>(const uint8_t* in, uint32_t* // extract 2-bit bundles 0 to 3 words = simd_batch{ - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0), }; - shifts = simd_batch{ 0, 2, 4, 6 }; + shifts = simd_batch{0, 2, 4, 6}; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 2-bit bundles 4 to 7 words = simd_batch{ - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0), }; - shifts = simd_batch{ 8, 10, 12, 14 }; + shifts = simd_batch{8, 10, 12, 14}; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 2-bit bundles 8 to 11 words = simd_batch{ - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0), }; - shifts = simd_batch{ 16, 18, 20, 22 }; + shifts = simd_batch{16, 18, 20, 22}; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 2-bit bundles 12 to 15 words = simd_batch{ - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0), }; - shifts = simd_batch{ 24, 26, 28, 30 }; + shifts = simd_batch{24, 26, 28, 30}; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 2-bit bundles 16 to 19 words = simd_batch{ - SafeLoadAs(in + 4 * 1), - SafeLoadAs(in + 4 * 1), - SafeLoadAs(in + 4 * 1), - SafeLoadAs(in + 4 * 1), + SafeLoadAs(in + 4 * 1), + SafeLoadAs(in + 4 * 1), + SafeLoadAs(in + 4 * 1), + SafeLoadAs(in + 4 * 1), }; - shifts = simd_batch{ 0, 2, 4, 6 }; + shifts = simd_batch{0, 2, 4, 6}; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 2-bit bundles 20 to 23 words = simd_batch{ - SafeLoadAs(in + 4 * 1), - SafeLoadAs(in + 4 * 1), - SafeLoadAs(in + 4 * 1), - SafeLoadAs(in + 4 * 1), + SafeLoadAs(in + 4 * 1), + SafeLoadAs(in + 4 * 1), + SafeLoadAs(in + 4 * 1), + SafeLoadAs(in + 4 * 1), }; - shifts = simd_batch{ 8, 10, 12, 14 }; + shifts = simd_batch{8, 10, 12, 14}; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 2-bit bundles 24 to 27 words = simd_batch{ - SafeLoadAs(in + 4 * 1), - SafeLoadAs(in + 4 * 1), - SafeLoadAs(in + 4 * 1), - SafeLoadAs(in + 4 * 1), + SafeLoadAs(in + 4 * 1), + SafeLoadAs(in + 4 * 1), + SafeLoadAs(in + 4 * 1), + SafeLoadAs(in + 4 * 1), }; - shifts = simd_batch{ 16, 18, 20, 22 }; + shifts = simd_batch{16, 18, 20, 22}; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 2-bit bundles 28 to 31 words = simd_batch{ - SafeLoadAs(in + 4 * 1), - SafeLoadAs(in + 4 * 1), - SafeLoadAs(in + 4 * 1), - SafeLoadAs(in + 4 * 1), + SafeLoadAs(in + 4 * 1), + SafeLoadAs(in + 4 * 1), + SafeLoadAs(in + 4 * 1), + SafeLoadAs(in + 4 * 1), }; - shifts = simd_batch{ 24, 26, 28, 30 }; + shifts = simd_batch{24, 26, 28, 30}; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; @@ -270,7 +222,7 @@ const uint8_t* Simd128Unpacker::unpack<2>(const uint8_t* in, uint32_t* return in; } -template<> +template <> const uint8_t* Simd128Unpacker::unpack<3>(const uint8_t* in, uint32_t* out) { constexpr uint32_t kMask = 0x7; @@ -280,96 +232,96 @@ const uint8_t* Simd128Unpacker::unpack<3>(const uint8_t* in, uint32_t* // extract 3-bit bundles 0 to 3 words = simd_batch{ - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0), }; - shifts = simd_batch{ 0, 3, 6, 9 }; + shifts = simd_batch{0, 3, 6, 9}; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 3-bit bundles 4 to 7 words = simd_batch{ - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0), }; - shifts = simd_batch{ 12, 15, 18, 21 }; + shifts = simd_batch{12, 15, 18, 21}; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 3-bit bundles 8 to 11 words = simd_batch{ - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0) >> 30 | SafeLoadAs(in + 4 * 1) << 2, - SafeLoadAs(in + 4 * 1), + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0) >> 30 | SafeLoadAs(in + 4 * 1) << 2, + SafeLoadAs(in + 4 * 1), }; - shifts = simd_batch{ 24, 27, 0, 1 }; + shifts = simd_batch{24, 27, 0, 1}; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 3-bit bundles 12 to 15 words = simd_batch{ - SafeLoadAs(in + 4 * 1), - SafeLoadAs(in + 4 * 1), - SafeLoadAs(in + 4 * 1), - SafeLoadAs(in + 4 * 1), + SafeLoadAs(in + 4 * 1), + SafeLoadAs(in + 4 * 1), + SafeLoadAs(in + 4 * 1), + SafeLoadAs(in + 4 * 1), }; - shifts = simd_batch{ 4, 7, 10, 13 }; + shifts = simd_batch{4, 7, 10, 13}; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 3-bit bundles 16 to 19 words = simd_batch{ - SafeLoadAs(in + 4 * 1), - SafeLoadAs(in + 4 * 1), - SafeLoadAs(in + 4 * 1), - SafeLoadAs(in + 4 * 1), + SafeLoadAs(in + 4 * 1), + SafeLoadAs(in + 4 * 1), + SafeLoadAs(in + 4 * 1), + SafeLoadAs(in + 4 * 1), }; - shifts = simd_batch{ 16, 19, 22, 25 }; + shifts = simd_batch{16, 19, 22, 25}; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 3-bit bundles 20 to 23 words = simd_batch{ - SafeLoadAs(in + 4 * 1), - SafeLoadAs(in + 4 * 1) >> 31 | SafeLoadAs(in + 4 * 2) << 1, - SafeLoadAs(in + 4 * 2), - SafeLoadAs(in + 4 * 2), + SafeLoadAs(in + 4 * 1), + SafeLoadAs(in + 4 * 1) >> 31 | SafeLoadAs(in + 4 * 2) << 1, + SafeLoadAs(in + 4 * 2), + SafeLoadAs(in + 4 * 2), }; - shifts = simd_batch{ 28, 0, 2, 5 }; + shifts = simd_batch{28, 0, 2, 5}; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 3-bit bundles 24 to 27 words = simd_batch{ - SafeLoadAs(in + 4 * 2), - SafeLoadAs(in + 4 * 2), - SafeLoadAs(in + 4 * 2), - SafeLoadAs(in + 4 * 2), + SafeLoadAs(in + 4 * 2), + SafeLoadAs(in + 4 * 2), + SafeLoadAs(in + 4 * 2), + SafeLoadAs(in + 4 * 2), }; - shifts = simd_batch{ 8, 11, 14, 17 }; + shifts = simd_batch{8, 11, 14, 17}; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 3-bit bundles 28 to 31 words = simd_batch{ - SafeLoadAs(in + 4 * 2), - SafeLoadAs(in + 4 * 2), - SafeLoadAs(in + 4 * 2), - SafeLoadAs(in + 4 * 2), + SafeLoadAs(in + 4 * 2), + SafeLoadAs(in + 4 * 2), + SafeLoadAs(in + 4 * 2), + SafeLoadAs(in + 4 * 2), }; - shifts = simd_batch{ 20, 23, 26, 29 }; + shifts = simd_batch{20, 23, 26, 29}; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; @@ -378,7 +330,7 @@ const uint8_t* Simd128Unpacker::unpack<3>(const uint8_t* in, uint32_t* return in; } -template<> +template <> const uint8_t* Simd128Unpacker::unpack<4>(const uint8_t* in, uint32_t* out) { constexpr uint32_t kMask = 0xf; @@ -388,96 +340,96 @@ const uint8_t* Simd128Unpacker::unpack<4>(const uint8_t* in, uint32_t* // extract 4-bit bundles 0 to 3 words = simd_batch{ - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0), }; - shifts = simd_batch{ 0, 4, 8, 12 }; + shifts = simd_batch{0, 4, 8, 12}; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 4-bit bundles 4 to 7 words = simd_batch{ - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0), }; - shifts = simd_batch{ 16, 20, 24, 28 }; + shifts = simd_batch{16, 20, 24, 28}; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 4-bit bundles 8 to 11 words = simd_batch{ - SafeLoadAs(in + 4 * 1), - SafeLoadAs(in + 4 * 1), - SafeLoadAs(in + 4 * 1), - SafeLoadAs(in + 4 * 1), + SafeLoadAs(in + 4 * 1), + SafeLoadAs(in + 4 * 1), + SafeLoadAs(in + 4 * 1), + SafeLoadAs(in + 4 * 1), }; - shifts = simd_batch{ 0, 4, 8, 12 }; + shifts = simd_batch{0, 4, 8, 12}; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 4-bit bundles 12 to 15 words = simd_batch{ - SafeLoadAs(in + 4 * 1), - SafeLoadAs(in + 4 * 1), - SafeLoadAs(in + 4 * 1), - SafeLoadAs(in + 4 * 1), + SafeLoadAs(in + 4 * 1), + SafeLoadAs(in + 4 * 1), + SafeLoadAs(in + 4 * 1), + SafeLoadAs(in + 4 * 1), }; - shifts = simd_batch{ 16, 20, 24, 28 }; + shifts = simd_batch{16, 20, 24, 28}; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 4-bit bundles 16 to 19 words = simd_batch{ - SafeLoadAs(in + 4 * 2), - SafeLoadAs(in + 4 * 2), - SafeLoadAs(in + 4 * 2), - SafeLoadAs(in + 4 * 2), + SafeLoadAs(in + 4 * 2), + SafeLoadAs(in + 4 * 2), + SafeLoadAs(in + 4 * 2), + SafeLoadAs(in + 4 * 2), }; - shifts = simd_batch{ 0, 4, 8, 12 }; + shifts = simd_batch{0, 4, 8, 12}; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 4-bit bundles 20 to 23 words = simd_batch{ - SafeLoadAs(in + 4 * 2), - SafeLoadAs(in + 4 * 2), - SafeLoadAs(in + 4 * 2), - SafeLoadAs(in + 4 * 2), + SafeLoadAs(in + 4 * 2), + SafeLoadAs(in + 4 * 2), + SafeLoadAs(in + 4 * 2), + SafeLoadAs(in + 4 * 2), }; - shifts = simd_batch{ 16, 20, 24, 28 }; + shifts = simd_batch{16, 20, 24, 28}; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 4-bit bundles 24 to 27 words = simd_batch{ - SafeLoadAs(in + 4 * 3), - SafeLoadAs(in + 4 * 3), - SafeLoadAs(in + 4 * 3), - SafeLoadAs(in + 4 * 3), + SafeLoadAs(in + 4 * 3), + SafeLoadAs(in + 4 * 3), + SafeLoadAs(in + 4 * 3), + SafeLoadAs(in + 4 * 3), }; - shifts = simd_batch{ 0, 4, 8, 12 }; + shifts = simd_batch{0, 4, 8, 12}; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 4-bit bundles 28 to 31 words = simd_batch{ - SafeLoadAs(in + 4 * 3), - SafeLoadAs(in + 4 * 3), - SafeLoadAs(in + 4 * 3), - SafeLoadAs(in + 4 * 3), + SafeLoadAs(in + 4 * 3), + SafeLoadAs(in + 4 * 3), + SafeLoadAs(in + 4 * 3), + SafeLoadAs(in + 4 * 3), }; - shifts = simd_batch{ 16, 20, 24, 28 }; + shifts = simd_batch{16, 20, 24, 28}; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; @@ -486,7 +438,7 @@ const uint8_t* Simd128Unpacker::unpack<4>(const uint8_t* in, uint32_t* return in; } -template<> +template <> const uint8_t* Simd128Unpacker::unpack<5>(const uint8_t* in, uint32_t* out) { constexpr uint32_t kMask = 0x1f; @@ -496,96 +448,96 @@ const uint8_t* Simd128Unpacker::unpack<5>(const uint8_t* in, uint32_t* // extract 5-bit bundles 0 to 3 words = simd_batch{ - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0), }; - shifts = simd_batch{ 0, 5, 10, 15 }; + shifts = simd_batch{0, 5, 10, 15}; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 5-bit bundles 4 to 7 words = simd_batch{ - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0) >> 30 | SafeLoadAs(in + 4 * 1) << 2, - SafeLoadAs(in + 4 * 1), + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0) >> 30 | SafeLoadAs(in + 4 * 1) << 2, + SafeLoadAs(in + 4 * 1), }; - shifts = simd_batch{ 20, 25, 0, 3 }; + shifts = simd_batch{20, 25, 0, 3}; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 5-bit bundles 8 to 11 words = simd_batch{ - SafeLoadAs(in + 4 * 1), - SafeLoadAs(in + 4 * 1), - SafeLoadAs(in + 4 * 1), - SafeLoadAs(in + 4 * 1), + SafeLoadAs(in + 4 * 1), + SafeLoadAs(in + 4 * 1), + SafeLoadAs(in + 4 * 1), + SafeLoadAs(in + 4 * 1), }; - shifts = simd_batch{ 8, 13, 18, 23 }; + shifts = simd_batch{8, 13, 18, 23}; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 5-bit bundles 12 to 15 words = simd_batch{ - SafeLoadAs(in + 4 * 1) >> 28 | SafeLoadAs(in + 4 * 2) << 4, - SafeLoadAs(in + 4 * 2), - SafeLoadAs(in + 4 * 2), - SafeLoadAs(in + 4 * 2), + SafeLoadAs(in + 4 * 1) >> 28 | SafeLoadAs(in + 4 * 2) << 4, + SafeLoadAs(in + 4 * 2), + SafeLoadAs(in + 4 * 2), + SafeLoadAs(in + 4 * 2), }; - shifts = simd_batch{ 0, 1, 6, 11 }; + shifts = simd_batch{0, 1, 6, 11}; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 5-bit bundles 16 to 19 words = simd_batch{ - SafeLoadAs(in + 4 * 2), - SafeLoadAs(in + 4 * 2), - SafeLoadAs(in + 4 * 2), - SafeLoadAs(in + 4 * 2) >> 31 | SafeLoadAs(in + 4 * 3) << 1, + SafeLoadAs(in + 4 * 2), + SafeLoadAs(in + 4 * 2), + SafeLoadAs(in + 4 * 2), + SafeLoadAs(in + 4 * 2) >> 31 | SafeLoadAs(in + 4 * 3) << 1, }; - shifts = simd_batch{ 16, 21, 26, 0 }; + shifts = simd_batch{16, 21, 26, 0}; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 5-bit bundles 20 to 23 words = simd_batch{ - SafeLoadAs(in + 4 * 3), - SafeLoadAs(in + 4 * 3), - SafeLoadAs(in + 4 * 3), - SafeLoadAs(in + 4 * 3), + SafeLoadAs(in + 4 * 3), + SafeLoadAs(in + 4 * 3), + SafeLoadAs(in + 4 * 3), + SafeLoadAs(in + 4 * 3), }; - shifts = simd_batch{ 4, 9, 14, 19 }; + shifts = simd_batch{4, 9, 14, 19}; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 5-bit bundles 24 to 27 words = simd_batch{ - SafeLoadAs(in + 4 * 3), - SafeLoadAs(in + 4 * 3) >> 29 | SafeLoadAs(in + 4 * 4) << 3, - SafeLoadAs(in + 4 * 4), - SafeLoadAs(in + 4 * 4), + SafeLoadAs(in + 4 * 3), + SafeLoadAs(in + 4 * 3) >> 29 | SafeLoadAs(in + 4 * 4) << 3, + SafeLoadAs(in + 4 * 4), + SafeLoadAs(in + 4 * 4), }; - shifts = simd_batch{ 24, 0, 2, 7 }; + shifts = simd_batch{24, 0, 2, 7}; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 5-bit bundles 28 to 31 words = simd_batch{ - SafeLoadAs(in + 4 * 4), - SafeLoadAs(in + 4 * 4), - SafeLoadAs(in + 4 * 4), - SafeLoadAs(in + 4 * 4), + SafeLoadAs(in + 4 * 4), + SafeLoadAs(in + 4 * 4), + SafeLoadAs(in + 4 * 4), + SafeLoadAs(in + 4 * 4), }; - shifts = simd_batch{ 12, 17, 22, 27 }; + shifts = simd_batch{12, 17, 22, 27}; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; @@ -594,7 +546,7 @@ const uint8_t* Simd128Unpacker::unpack<5>(const uint8_t* in, uint32_t* return in; } -template<> +template <> const uint8_t* Simd128Unpacker::unpack<6>(const uint8_t* in, uint32_t* out) { constexpr uint32_t kMask = 0x3f; @@ -604,96 +556,96 @@ const uint8_t* Simd128Unpacker::unpack<6>(const uint8_t* in, uint32_t* // extract 6-bit bundles 0 to 3 words = simd_batch{ - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0), }; - shifts = simd_batch{ 0, 6, 12, 18 }; + shifts = simd_batch{0, 6, 12, 18}; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 6-bit bundles 4 to 7 words = simd_batch{ - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0) >> 30 | SafeLoadAs(in + 4 * 1) << 2, - SafeLoadAs(in + 4 * 1), - SafeLoadAs(in + 4 * 1), + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0) >> 30 | SafeLoadAs(in + 4 * 1) << 2, + SafeLoadAs(in + 4 * 1), + SafeLoadAs(in + 4 * 1), }; - shifts = simd_batch{ 24, 0, 4, 10 }; + shifts = simd_batch{24, 0, 4, 10}; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 6-bit bundles 8 to 11 words = simd_batch{ - SafeLoadAs(in + 4 * 1), - SafeLoadAs(in + 4 * 1), - SafeLoadAs(in + 4 * 1) >> 28 | SafeLoadAs(in + 4 * 2) << 4, - SafeLoadAs(in + 4 * 2), + SafeLoadAs(in + 4 * 1), + SafeLoadAs(in + 4 * 1), + SafeLoadAs(in + 4 * 1) >> 28 | SafeLoadAs(in + 4 * 2) << 4, + SafeLoadAs(in + 4 * 2), }; - shifts = simd_batch{ 16, 22, 0, 2 }; + shifts = simd_batch{16, 22, 0, 2}; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 6-bit bundles 12 to 15 words = simd_batch{ - SafeLoadAs(in + 4 * 2), - SafeLoadAs(in + 4 * 2), - SafeLoadAs(in + 4 * 2), - SafeLoadAs(in + 4 * 2), + SafeLoadAs(in + 4 * 2), + SafeLoadAs(in + 4 * 2), + SafeLoadAs(in + 4 * 2), + SafeLoadAs(in + 4 * 2), }; - shifts = simd_batch{ 8, 14, 20, 26 }; + shifts = simd_batch{8, 14, 20, 26}; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 6-bit bundles 16 to 19 words = simd_batch{ - SafeLoadAs(in + 4 * 3), - SafeLoadAs(in + 4 * 3), - SafeLoadAs(in + 4 * 3), - SafeLoadAs(in + 4 * 3), + SafeLoadAs(in + 4 * 3), + SafeLoadAs(in + 4 * 3), + SafeLoadAs(in + 4 * 3), + SafeLoadAs(in + 4 * 3), }; - shifts = simd_batch{ 0, 6, 12, 18 }; + shifts = simd_batch{0, 6, 12, 18}; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 6-bit bundles 20 to 23 words = simd_batch{ - SafeLoadAs(in + 4 * 3), - SafeLoadAs(in + 4 * 3) >> 30 | SafeLoadAs(in + 4 * 4) << 2, - SafeLoadAs(in + 4 * 4), - SafeLoadAs(in + 4 * 4), + SafeLoadAs(in + 4 * 3), + SafeLoadAs(in + 4 * 3) >> 30 | SafeLoadAs(in + 4 * 4) << 2, + SafeLoadAs(in + 4 * 4), + SafeLoadAs(in + 4 * 4), }; - shifts = simd_batch{ 24, 0, 4, 10 }; + shifts = simd_batch{24, 0, 4, 10}; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 6-bit bundles 24 to 27 words = simd_batch{ - SafeLoadAs(in + 4 * 4), - SafeLoadAs(in + 4 * 4), - SafeLoadAs(in + 4 * 4) >> 28 | SafeLoadAs(in + 4 * 5) << 4, - SafeLoadAs(in + 4 * 5), + SafeLoadAs(in + 4 * 4), + SafeLoadAs(in + 4 * 4), + SafeLoadAs(in + 4 * 4) >> 28 | SafeLoadAs(in + 4 * 5) << 4, + SafeLoadAs(in + 4 * 5), }; - shifts = simd_batch{ 16, 22, 0, 2 }; + shifts = simd_batch{16, 22, 0, 2}; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 6-bit bundles 28 to 31 words = simd_batch{ - SafeLoadAs(in + 4 * 5), - SafeLoadAs(in + 4 * 5), - SafeLoadAs(in + 4 * 5), - SafeLoadAs(in + 4 * 5), + SafeLoadAs(in + 4 * 5), + SafeLoadAs(in + 4 * 5), + SafeLoadAs(in + 4 * 5), + SafeLoadAs(in + 4 * 5), }; - shifts = simd_batch{ 8, 14, 20, 26 }; + shifts = simd_batch{8, 14, 20, 26}; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; @@ -702,7 +654,7 @@ const uint8_t* Simd128Unpacker::unpack<6>(const uint8_t* in, uint32_t* return in; } -template<> +template <> const uint8_t* Simd128Unpacker::unpack<7>(const uint8_t* in, uint32_t* out) { constexpr uint32_t kMask = 0x7f; @@ -712,96 +664,96 @@ const uint8_t* Simd128Unpacker::unpack<7>(const uint8_t* in, uint32_t* // extract 7-bit bundles 0 to 3 words = simd_batch{ - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0), }; - shifts = simd_batch{ 0, 7, 14, 21 }; + shifts = simd_batch{0, 7, 14, 21}; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 7-bit bundles 4 to 7 words = simd_batch{ - SafeLoadAs(in + 4 * 0) >> 28 | SafeLoadAs(in + 4 * 1) << 4, - SafeLoadAs(in + 4 * 1), - SafeLoadAs(in + 4 * 1), - SafeLoadAs(in + 4 * 1), + SafeLoadAs(in + 4 * 0) >> 28 | SafeLoadAs(in + 4 * 1) << 4, + SafeLoadAs(in + 4 * 1), + SafeLoadAs(in + 4 * 1), + SafeLoadAs(in + 4 * 1), }; - shifts = simd_batch{ 0, 3, 10, 17 }; + shifts = simd_batch{0, 3, 10, 17}; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 7-bit bundles 8 to 11 words = simd_batch{ - SafeLoadAs(in + 4 * 1), - SafeLoadAs(in + 4 * 1) >> 31 | SafeLoadAs(in + 4 * 2) << 1, - SafeLoadAs(in + 4 * 2), - SafeLoadAs(in + 4 * 2), + SafeLoadAs(in + 4 * 1), + SafeLoadAs(in + 4 * 1) >> 31 | SafeLoadAs(in + 4 * 2) << 1, + SafeLoadAs(in + 4 * 2), + SafeLoadAs(in + 4 * 2), }; - shifts = simd_batch{ 24, 0, 6, 13 }; + shifts = simd_batch{24, 0, 6, 13}; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 7-bit bundles 12 to 15 words = simd_batch{ - SafeLoadAs(in + 4 * 2), - SafeLoadAs(in + 4 * 2) >> 27 | SafeLoadAs(in + 4 * 3) << 5, - SafeLoadAs(in + 4 * 3), - SafeLoadAs(in + 4 * 3), + SafeLoadAs(in + 4 * 2), + SafeLoadAs(in + 4 * 2) >> 27 | SafeLoadAs(in + 4 * 3) << 5, + SafeLoadAs(in + 4 * 3), + SafeLoadAs(in + 4 * 3), }; - shifts = simd_batch{ 20, 0, 2, 9 }; + shifts = simd_batch{20, 0, 2, 9}; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 7-bit bundles 16 to 19 words = simd_batch{ - SafeLoadAs(in + 4 * 3), - SafeLoadAs(in + 4 * 3), - SafeLoadAs(in + 4 * 3) >> 30 | SafeLoadAs(in + 4 * 4) << 2, - SafeLoadAs(in + 4 * 4), + SafeLoadAs(in + 4 * 3), + SafeLoadAs(in + 4 * 3), + SafeLoadAs(in + 4 * 3) >> 30 | SafeLoadAs(in + 4 * 4) << 2, + SafeLoadAs(in + 4 * 4), }; - shifts = simd_batch{ 16, 23, 0, 5 }; + shifts = simd_batch{16, 23, 0, 5}; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 7-bit bundles 20 to 23 words = simd_batch{ - SafeLoadAs(in + 4 * 4), - SafeLoadAs(in + 4 * 4), - SafeLoadAs(in + 4 * 4) >> 26 | SafeLoadAs(in + 4 * 5) << 6, - SafeLoadAs(in + 4 * 5), + SafeLoadAs(in + 4 * 4), + SafeLoadAs(in + 4 * 4), + SafeLoadAs(in + 4 * 4) >> 26 | SafeLoadAs(in + 4 * 5) << 6, + SafeLoadAs(in + 4 * 5), }; - shifts = simd_batch{ 12, 19, 0, 1 }; + shifts = simd_batch{12, 19, 0, 1}; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 7-bit bundles 24 to 27 words = simd_batch{ - SafeLoadAs(in + 4 * 5), - SafeLoadAs(in + 4 * 5), - SafeLoadAs(in + 4 * 5), - SafeLoadAs(in + 4 * 5) >> 29 | SafeLoadAs(in + 4 * 6) << 3, + SafeLoadAs(in + 4 * 5), + SafeLoadAs(in + 4 * 5), + SafeLoadAs(in + 4 * 5), + SafeLoadAs(in + 4 * 5) >> 29 | SafeLoadAs(in + 4 * 6) << 3, }; - shifts = simd_batch{ 8, 15, 22, 0 }; + shifts = simd_batch{8, 15, 22, 0}; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 7-bit bundles 28 to 31 words = simd_batch{ - SafeLoadAs(in + 4 * 6), - SafeLoadAs(in + 4 * 6), - SafeLoadAs(in + 4 * 6), - SafeLoadAs(in + 4 * 6), + SafeLoadAs(in + 4 * 6), + SafeLoadAs(in + 4 * 6), + SafeLoadAs(in + 4 * 6), + SafeLoadAs(in + 4 * 6), }; - shifts = simd_batch{ 4, 11, 18, 25 }; + shifts = simd_batch{4, 11, 18, 25}; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; @@ -810,7 +762,7 @@ const uint8_t* Simd128Unpacker::unpack<7>(const uint8_t* in, uint32_t* return in; } -template<> +template <> const uint8_t* Simd128Unpacker::unpack<8>(const uint8_t* in, uint32_t* out) { constexpr uint32_t kMask = 0xff; @@ -820,96 +772,96 @@ const uint8_t* Simd128Unpacker::unpack<8>(const uint8_t* in, uint32_t* // extract 8-bit bundles 0 to 3 words = simd_batch{ - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0), }; - shifts = simd_batch{ 0, 8, 16, 24 }; + shifts = simd_batch{0, 8, 16, 24}; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 8-bit bundles 4 to 7 words = simd_batch{ - SafeLoadAs(in + 4 * 1), - SafeLoadAs(in + 4 * 1), - SafeLoadAs(in + 4 * 1), - SafeLoadAs(in + 4 * 1), + SafeLoadAs(in + 4 * 1), + SafeLoadAs(in + 4 * 1), + SafeLoadAs(in + 4 * 1), + SafeLoadAs(in + 4 * 1), }; - shifts = simd_batch{ 0, 8, 16, 24 }; + shifts = simd_batch{0, 8, 16, 24}; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 8-bit bundles 8 to 11 words = simd_batch{ - SafeLoadAs(in + 4 * 2), - SafeLoadAs(in + 4 * 2), - SafeLoadAs(in + 4 * 2), - SafeLoadAs(in + 4 * 2), + SafeLoadAs(in + 4 * 2), + SafeLoadAs(in + 4 * 2), + SafeLoadAs(in + 4 * 2), + SafeLoadAs(in + 4 * 2), }; - shifts = simd_batch{ 0, 8, 16, 24 }; + shifts = simd_batch{0, 8, 16, 24}; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 8-bit bundles 12 to 15 words = simd_batch{ - SafeLoadAs(in + 4 * 3), - SafeLoadAs(in + 4 * 3), - SafeLoadAs(in + 4 * 3), - SafeLoadAs(in + 4 * 3), + SafeLoadAs(in + 4 * 3), + SafeLoadAs(in + 4 * 3), + SafeLoadAs(in + 4 * 3), + SafeLoadAs(in + 4 * 3), }; - shifts = simd_batch{ 0, 8, 16, 24 }; + shifts = simd_batch{0, 8, 16, 24}; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 8-bit bundles 16 to 19 words = simd_batch{ - SafeLoadAs(in + 4 * 4), - SafeLoadAs(in + 4 * 4), - SafeLoadAs(in + 4 * 4), - SafeLoadAs(in + 4 * 4), + SafeLoadAs(in + 4 * 4), + SafeLoadAs(in + 4 * 4), + SafeLoadAs(in + 4 * 4), + SafeLoadAs(in + 4 * 4), }; - shifts = simd_batch{ 0, 8, 16, 24 }; + shifts = simd_batch{0, 8, 16, 24}; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 8-bit bundles 20 to 23 words = simd_batch{ - SafeLoadAs(in + 4 * 5), - SafeLoadAs(in + 4 * 5), - SafeLoadAs(in + 4 * 5), - SafeLoadAs(in + 4 * 5), + SafeLoadAs(in + 4 * 5), + SafeLoadAs(in + 4 * 5), + SafeLoadAs(in + 4 * 5), + SafeLoadAs(in + 4 * 5), }; - shifts = simd_batch{ 0, 8, 16, 24 }; + shifts = simd_batch{0, 8, 16, 24}; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 8-bit bundles 24 to 27 words = simd_batch{ - SafeLoadAs(in + 4 * 6), - SafeLoadAs(in + 4 * 6), - SafeLoadAs(in + 4 * 6), - SafeLoadAs(in + 4 * 6), + SafeLoadAs(in + 4 * 6), + SafeLoadAs(in + 4 * 6), + SafeLoadAs(in + 4 * 6), + SafeLoadAs(in + 4 * 6), }; - shifts = simd_batch{ 0, 8, 16, 24 }; + shifts = simd_batch{0, 8, 16, 24}; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 8-bit bundles 28 to 31 words = simd_batch{ - SafeLoadAs(in + 4 * 7), - SafeLoadAs(in + 4 * 7), - SafeLoadAs(in + 4 * 7), - SafeLoadAs(in + 4 * 7), + SafeLoadAs(in + 4 * 7), + SafeLoadAs(in + 4 * 7), + SafeLoadAs(in + 4 * 7), + SafeLoadAs(in + 4 * 7), }; - shifts = simd_batch{ 0, 8, 16, 24 }; + shifts = simd_batch{0, 8, 16, 24}; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; @@ -918,7 +870,7 @@ const uint8_t* Simd128Unpacker::unpack<8>(const uint8_t* in, uint32_t* return in; } -template<> +template <> const uint8_t* Simd128Unpacker::unpack<9>(const uint8_t* in, uint32_t* out) { constexpr uint32_t kMask = 0x1ff; @@ -928,96 +880,96 @@ const uint8_t* Simd128Unpacker::unpack<9>(const uint8_t* in, uint32_t* // extract 9-bit bundles 0 to 3 words = simd_batch{ - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0) >> 27 | SafeLoadAs(in + 4 * 1) << 5, + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0) >> 27 | SafeLoadAs(in + 4 * 1) << 5, }; - shifts = simd_batch{ 0, 9, 18, 0 }; + shifts = simd_batch{0, 9, 18, 0}; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 9-bit bundles 4 to 7 words = simd_batch{ - SafeLoadAs(in + 4 * 1), - SafeLoadAs(in + 4 * 1), - SafeLoadAs(in + 4 * 1), - SafeLoadAs(in + 4 * 1) >> 31 | SafeLoadAs(in + 4 * 2) << 1, + SafeLoadAs(in + 4 * 1), + SafeLoadAs(in + 4 * 1), + SafeLoadAs(in + 4 * 1), + SafeLoadAs(in + 4 * 1) >> 31 | SafeLoadAs(in + 4 * 2) << 1, }; - shifts = simd_batch{ 4, 13, 22, 0 }; + shifts = simd_batch{4, 13, 22, 0}; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 9-bit bundles 8 to 11 words = simd_batch{ - SafeLoadAs(in + 4 * 2), - SafeLoadAs(in + 4 * 2), - SafeLoadAs(in + 4 * 2) >> 26 | SafeLoadAs(in + 4 * 3) << 6, - SafeLoadAs(in + 4 * 3), + SafeLoadAs(in + 4 * 2), + SafeLoadAs(in + 4 * 2), + SafeLoadAs(in + 4 * 2) >> 26 | SafeLoadAs(in + 4 * 3) << 6, + SafeLoadAs(in + 4 * 3), }; - shifts = simd_batch{ 8, 17, 0, 3 }; + shifts = simd_batch{8, 17, 0, 3}; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 9-bit bundles 12 to 15 words = simd_batch{ - SafeLoadAs(in + 4 * 3), - SafeLoadAs(in + 4 * 3), - SafeLoadAs(in + 4 * 3) >> 30 | SafeLoadAs(in + 4 * 4) << 2, - SafeLoadAs(in + 4 * 4), + SafeLoadAs(in + 4 * 3), + SafeLoadAs(in + 4 * 3), + SafeLoadAs(in + 4 * 3) >> 30 | SafeLoadAs(in + 4 * 4) << 2, + SafeLoadAs(in + 4 * 4), }; - shifts = simd_batch{ 12, 21, 0, 7 }; + shifts = simd_batch{12, 21, 0, 7}; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 9-bit bundles 16 to 19 words = simd_batch{ - SafeLoadAs(in + 4 * 4), - SafeLoadAs(in + 4 * 4) >> 25 | SafeLoadAs(in + 4 * 5) << 7, - SafeLoadAs(in + 4 * 5), - SafeLoadAs(in + 4 * 5), + SafeLoadAs(in + 4 * 4), + SafeLoadAs(in + 4 * 4) >> 25 | SafeLoadAs(in + 4 * 5) << 7, + SafeLoadAs(in + 4 * 5), + SafeLoadAs(in + 4 * 5), }; - shifts = simd_batch{ 16, 0, 2, 11 }; + shifts = simd_batch{16, 0, 2, 11}; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 9-bit bundles 20 to 23 words = simd_batch{ - SafeLoadAs(in + 4 * 5), - SafeLoadAs(in + 4 * 5) >> 29 | SafeLoadAs(in + 4 * 6) << 3, - SafeLoadAs(in + 4 * 6), - SafeLoadAs(in + 4 * 6), + SafeLoadAs(in + 4 * 5), + SafeLoadAs(in + 4 * 5) >> 29 | SafeLoadAs(in + 4 * 6) << 3, + SafeLoadAs(in + 4 * 6), + SafeLoadAs(in + 4 * 6), }; - shifts = simd_batch{ 20, 0, 6, 15 }; + shifts = simd_batch{20, 0, 6, 15}; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 9-bit bundles 24 to 27 words = simd_batch{ - SafeLoadAs(in + 4 * 6) >> 24 | SafeLoadAs(in + 4 * 7) << 8, - SafeLoadAs(in + 4 * 7), - SafeLoadAs(in + 4 * 7), - SafeLoadAs(in + 4 * 7), + SafeLoadAs(in + 4 * 6) >> 24 | SafeLoadAs(in + 4 * 7) << 8, + SafeLoadAs(in + 4 * 7), + SafeLoadAs(in + 4 * 7), + SafeLoadAs(in + 4 * 7), }; - shifts = simd_batch{ 0, 1, 10, 19 }; + shifts = simd_batch{0, 1, 10, 19}; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 9-bit bundles 28 to 31 words = simd_batch{ - SafeLoadAs(in + 4 * 7) >> 28 | SafeLoadAs(in + 4 * 8) << 4, - SafeLoadAs(in + 4 * 8), - SafeLoadAs(in + 4 * 8), - SafeLoadAs(in + 4 * 8), + SafeLoadAs(in + 4 * 7) >> 28 | SafeLoadAs(in + 4 * 8) << 4, + SafeLoadAs(in + 4 * 8), + SafeLoadAs(in + 4 * 8), + SafeLoadAs(in + 4 * 8), }; - shifts = simd_batch{ 0, 5, 14, 23 }; + shifts = simd_batch{0, 5, 14, 23}; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; @@ -1026,7 +978,7 @@ const uint8_t* Simd128Unpacker::unpack<9>(const uint8_t* in, uint32_t* return in; } -template<> +template <> const uint8_t* Simd128Unpacker::unpack<10>(const uint8_t* in, uint32_t* out) { constexpr uint32_t kMask = 0x3ff; @@ -1036,96 +988,96 @@ const uint8_t* Simd128Unpacker::unpack<10>(const uint8_t* in, uint32_t // extract 10-bit bundles 0 to 3 words = simd_batch{ - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0) >> 30 | SafeLoadAs(in + 4 * 1) << 2, + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0) >> 30 | SafeLoadAs(in + 4 * 1) << 2, }; - shifts = simd_batch{ 0, 10, 20, 0 }; + shifts = simd_batch{0, 10, 20, 0}; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 10-bit bundles 4 to 7 words = simd_batch{ - SafeLoadAs(in + 4 * 1), - SafeLoadAs(in + 4 * 1), - SafeLoadAs(in + 4 * 1) >> 28 | SafeLoadAs(in + 4 * 2) << 4, - SafeLoadAs(in + 4 * 2), + SafeLoadAs(in + 4 * 1), + SafeLoadAs(in + 4 * 1), + SafeLoadAs(in + 4 * 1) >> 28 | SafeLoadAs(in + 4 * 2) << 4, + SafeLoadAs(in + 4 * 2), }; - shifts = simd_batch{ 8, 18, 0, 6 }; + shifts = simd_batch{8, 18, 0, 6}; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 10-bit bundles 8 to 11 words = simd_batch{ - SafeLoadAs(in + 4 * 2), - SafeLoadAs(in + 4 * 2) >> 26 | SafeLoadAs(in + 4 * 3) << 6, - SafeLoadAs(in + 4 * 3), - SafeLoadAs(in + 4 * 3), + SafeLoadAs(in + 4 * 2), + SafeLoadAs(in + 4 * 2) >> 26 | SafeLoadAs(in + 4 * 3) << 6, + SafeLoadAs(in + 4 * 3), + SafeLoadAs(in + 4 * 3), }; - shifts = simd_batch{ 16, 0, 4, 14 }; + shifts = simd_batch{16, 0, 4, 14}; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 10-bit bundles 12 to 15 words = simd_batch{ - SafeLoadAs(in + 4 * 3) >> 24 | SafeLoadAs(in + 4 * 4) << 8, - SafeLoadAs(in + 4 * 4), - SafeLoadAs(in + 4 * 4), - SafeLoadAs(in + 4 * 4), + SafeLoadAs(in + 4 * 3) >> 24 | SafeLoadAs(in + 4 * 4) << 8, + SafeLoadAs(in + 4 * 4), + SafeLoadAs(in + 4 * 4), + SafeLoadAs(in + 4 * 4), }; - shifts = simd_batch{ 0, 2, 12, 22 }; + shifts = simd_batch{0, 2, 12, 22}; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 10-bit bundles 16 to 19 words = simd_batch{ - SafeLoadAs(in + 4 * 5), - SafeLoadAs(in + 4 * 5), - SafeLoadAs(in + 4 * 5), - SafeLoadAs(in + 4 * 5) >> 30 | SafeLoadAs(in + 4 * 6) << 2, + SafeLoadAs(in + 4 * 5), + SafeLoadAs(in + 4 * 5), + SafeLoadAs(in + 4 * 5), + SafeLoadAs(in + 4 * 5) >> 30 | SafeLoadAs(in + 4 * 6) << 2, }; - shifts = simd_batch{ 0, 10, 20, 0 }; + shifts = simd_batch{0, 10, 20, 0}; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 10-bit bundles 20 to 23 words = simd_batch{ - SafeLoadAs(in + 4 * 6), - SafeLoadAs(in + 4 * 6), - SafeLoadAs(in + 4 * 6) >> 28 | SafeLoadAs(in + 4 * 7) << 4, - SafeLoadAs(in + 4 * 7), + SafeLoadAs(in + 4 * 6), + SafeLoadAs(in + 4 * 6), + SafeLoadAs(in + 4 * 6) >> 28 | SafeLoadAs(in + 4 * 7) << 4, + SafeLoadAs(in + 4 * 7), }; - shifts = simd_batch{ 8, 18, 0, 6 }; + shifts = simd_batch{8, 18, 0, 6}; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 10-bit bundles 24 to 27 words = simd_batch{ - SafeLoadAs(in + 4 * 7), - SafeLoadAs(in + 4 * 7) >> 26 | SafeLoadAs(in + 4 * 8) << 6, - SafeLoadAs(in + 4 * 8), - SafeLoadAs(in + 4 * 8), + SafeLoadAs(in + 4 * 7), + SafeLoadAs(in + 4 * 7) >> 26 | SafeLoadAs(in + 4 * 8) << 6, + SafeLoadAs(in + 4 * 8), + SafeLoadAs(in + 4 * 8), }; - shifts = simd_batch{ 16, 0, 4, 14 }; + shifts = simd_batch{16, 0, 4, 14}; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 10-bit bundles 28 to 31 words = simd_batch{ - SafeLoadAs(in + 4 * 8) >> 24 | SafeLoadAs(in + 4 * 9) << 8, - SafeLoadAs(in + 4 * 9), - SafeLoadAs(in + 4 * 9), - SafeLoadAs(in + 4 * 9), + SafeLoadAs(in + 4 * 8) >> 24 | SafeLoadAs(in + 4 * 9) << 8, + SafeLoadAs(in + 4 * 9), + SafeLoadAs(in + 4 * 9), + SafeLoadAs(in + 4 * 9), }; - shifts = simd_batch{ 0, 2, 12, 22 }; + shifts = simd_batch{0, 2, 12, 22}; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; @@ -1134,7 +1086,7 @@ const uint8_t* Simd128Unpacker::unpack<10>(const uint8_t* in, uint32_t return in; } -template<> +template <> const uint8_t* Simd128Unpacker::unpack<11>(const uint8_t* in, uint32_t* out) { constexpr uint32_t kMask = 0x7ff; @@ -1144,96 +1096,96 @@ const uint8_t* Simd128Unpacker::unpack<11>(const uint8_t* in, uint32_t // extract 11-bit bundles 0 to 3 words = simd_batch{ - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0) >> 22 | SafeLoadAs(in + 4 * 1) << 10, - SafeLoadAs(in + 4 * 1), + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0) >> 22 | SafeLoadAs(in + 4 * 1) << 10, + SafeLoadAs(in + 4 * 1), }; - shifts = simd_batch{ 0, 11, 0, 1 }; + shifts = simd_batch{0, 11, 0, 1}; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 11-bit bundles 4 to 7 words = simd_batch{ - SafeLoadAs(in + 4 * 1), - SafeLoadAs(in + 4 * 1) >> 23 | SafeLoadAs(in + 4 * 2) << 9, - SafeLoadAs(in + 4 * 2), - SafeLoadAs(in + 4 * 2), + SafeLoadAs(in + 4 * 1), + SafeLoadAs(in + 4 * 1) >> 23 | SafeLoadAs(in + 4 * 2) << 9, + SafeLoadAs(in + 4 * 2), + SafeLoadAs(in + 4 * 2), }; - shifts = simd_batch{ 12, 0, 2, 13 }; + shifts = simd_batch{12, 0, 2, 13}; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 11-bit bundles 8 to 11 words = simd_batch{ - SafeLoadAs(in + 4 * 2) >> 24 | SafeLoadAs(in + 4 * 3) << 8, - SafeLoadAs(in + 4 * 3), - SafeLoadAs(in + 4 * 3), - SafeLoadAs(in + 4 * 3) >> 25 | SafeLoadAs(in + 4 * 4) << 7, + SafeLoadAs(in + 4 * 2) >> 24 | SafeLoadAs(in + 4 * 3) << 8, + SafeLoadAs(in + 4 * 3), + SafeLoadAs(in + 4 * 3), + SafeLoadAs(in + 4 * 3) >> 25 | SafeLoadAs(in + 4 * 4) << 7, }; - shifts = simd_batch{ 0, 3, 14, 0 }; + shifts = simd_batch{0, 3, 14, 0}; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 11-bit bundles 12 to 15 words = simd_batch{ - SafeLoadAs(in + 4 * 4), - SafeLoadAs(in + 4 * 4), - SafeLoadAs(in + 4 * 4) >> 26 | SafeLoadAs(in + 4 * 5) << 6, - SafeLoadAs(in + 4 * 5), + SafeLoadAs(in + 4 * 4), + SafeLoadAs(in + 4 * 4), + SafeLoadAs(in + 4 * 4) >> 26 | SafeLoadAs(in + 4 * 5) << 6, + SafeLoadAs(in + 4 * 5), }; - shifts = simd_batch{ 4, 15, 0, 5 }; + shifts = simd_batch{4, 15, 0, 5}; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 11-bit bundles 16 to 19 words = simd_batch{ - SafeLoadAs(in + 4 * 5), - SafeLoadAs(in + 4 * 5) >> 27 | SafeLoadAs(in + 4 * 6) << 5, - SafeLoadAs(in + 4 * 6), - SafeLoadAs(in + 4 * 6), + SafeLoadAs(in + 4 * 5), + SafeLoadAs(in + 4 * 5) >> 27 | SafeLoadAs(in + 4 * 6) << 5, + SafeLoadAs(in + 4 * 6), + SafeLoadAs(in + 4 * 6), }; - shifts = simd_batch{ 16, 0, 6, 17 }; + shifts = simd_batch{16, 0, 6, 17}; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 11-bit bundles 20 to 23 words = simd_batch{ - SafeLoadAs(in + 4 * 6) >> 28 | SafeLoadAs(in + 4 * 7) << 4, - SafeLoadAs(in + 4 * 7), - SafeLoadAs(in + 4 * 7), - SafeLoadAs(in + 4 * 7) >> 29 | SafeLoadAs(in + 4 * 8) << 3, + SafeLoadAs(in + 4 * 6) >> 28 | SafeLoadAs(in + 4 * 7) << 4, + SafeLoadAs(in + 4 * 7), + SafeLoadAs(in + 4 * 7), + SafeLoadAs(in + 4 * 7) >> 29 | SafeLoadAs(in + 4 * 8) << 3, }; - shifts = simd_batch{ 0, 7, 18, 0 }; + shifts = simd_batch{0, 7, 18, 0}; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 11-bit bundles 24 to 27 words = simd_batch{ - SafeLoadAs(in + 4 * 8), - SafeLoadAs(in + 4 * 8), - SafeLoadAs(in + 4 * 8) >> 30 | SafeLoadAs(in + 4 * 9) << 2, - SafeLoadAs(in + 4 * 9), + SafeLoadAs(in + 4 * 8), + SafeLoadAs(in + 4 * 8), + SafeLoadAs(in + 4 * 8) >> 30 | SafeLoadAs(in + 4 * 9) << 2, + SafeLoadAs(in + 4 * 9), }; - shifts = simd_batch{ 8, 19, 0, 9 }; + shifts = simd_batch{8, 19, 0, 9}; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 11-bit bundles 28 to 31 words = simd_batch{ - SafeLoadAs(in + 4 * 9), - SafeLoadAs(in + 4 * 9) >> 31 | SafeLoadAs(in + 4 * 10) << 1, - SafeLoadAs(in + 4 * 10), - SafeLoadAs(in + 4 * 10), + SafeLoadAs(in + 4 * 9), + SafeLoadAs(in + 4 * 9) >> 31 | SafeLoadAs(in + 4 * 10) << 1, + SafeLoadAs(in + 4 * 10), + SafeLoadAs(in + 4 * 10), }; - shifts = simd_batch{ 20, 0, 10, 21 }; + shifts = simd_batch{20, 0, 10, 21}; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; @@ -1242,7 +1194,7 @@ const uint8_t* Simd128Unpacker::unpack<11>(const uint8_t* in, uint32_t return in; } -template<> +template <> const uint8_t* Simd128Unpacker::unpack<12>(const uint8_t* in, uint32_t* out) { constexpr uint32_t kMask = 0xfff; @@ -1252,96 +1204,96 @@ const uint8_t* Simd128Unpacker::unpack<12>(const uint8_t* in, uint32_t // extract 12-bit bundles 0 to 3 words = simd_batch{ - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0) >> 24 | SafeLoadAs(in + 4 * 1) << 8, - SafeLoadAs(in + 4 * 1), + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0) >> 24 | SafeLoadAs(in + 4 * 1) << 8, + SafeLoadAs(in + 4 * 1), }; - shifts = simd_batch{ 0, 12, 0, 4 }; + shifts = simd_batch{0, 12, 0, 4}; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 12-bit bundles 4 to 7 words = simd_batch{ - SafeLoadAs(in + 4 * 1), - SafeLoadAs(in + 4 * 1) >> 28 | SafeLoadAs(in + 4 * 2) << 4, - SafeLoadAs(in + 4 * 2), - SafeLoadAs(in + 4 * 2), + SafeLoadAs(in + 4 * 1), + SafeLoadAs(in + 4 * 1) >> 28 | SafeLoadAs(in + 4 * 2) << 4, + SafeLoadAs(in + 4 * 2), + SafeLoadAs(in + 4 * 2), }; - shifts = simd_batch{ 16, 0, 8, 20 }; + shifts = simd_batch{16, 0, 8, 20}; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 12-bit bundles 8 to 11 words = simd_batch{ - SafeLoadAs(in + 4 * 3), - SafeLoadAs(in + 4 * 3), - SafeLoadAs(in + 4 * 3) >> 24 | SafeLoadAs(in + 4 * 4) << 8, - SafeLoadAs(in + 4 * 4), + SafeLoadAs(in + 4 * 3), + SafeLoadAs(in + 4 * 3), + SafeLoadAs(in + 4 * 3) >> 24 | SafeLoadAs(in + 4 * 4) << 8, + SafeLoadAs(in + 4 * 4), }; - shifts = simd_batch{ 0, 12, 0, 4 }; + shifts = simd_batch{0, 12, 0, 4}; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 12-bit bundles 12 to 15 words = simd_batch{ - SafeLoadAs(in + 4 * 4), - SafeLoadAs(in + 4 * 4) >> 28 | SafeLoadAs(in + 4 * 5) << 4, - SafeLoadAs(in + 4 * 5), - SafeLoadAs(in + 4 * 5), + SafeLoadAs(in + 4 * 4), + SafeLoadAs(in + 4 * 4) >> 28 | SafeLoadAs(in + 4 * 5) << 4, + SafeLoadAs(in + 4 * 5), + SafeLoadAs(in + 4 * 5), }; - shifts = simd_batch{ 16, 0, 8, 20 }; + shifts = simd_batch{16, 0, 8, 20}; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 12-bit bundles 16 to 19 words = simd_batch{ - SafeLoadAs(in + 4 * 6), - SafeLoadAs(in + 4 * 6), - SafeLoadAs(in + 4 * 6) >> 24 | SafeLoadAs(in + 4 * 7) << 8, - SafeLoadAs(in + 4 * 7), + SafeLoadAs(in + 4 * 6), + SafeLoadAs(in + 4 * 6), + SafeLoadAs(in + 4 * 6) >> 24 | SafeLoadAs(in + 4 * 7) << 8, + SafeLoadAs(in + 4 * 7), }; - shifts = simd_batch{ 0, 12, 0, 4 }; + shifts = simd_batch{0, 12, 0, 4}; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 12-bit bundles 20 to 23 words = simd_batch{ - SafeLoadAs(in + 4 * 7), - SafeLoadAs(in + 4 * 7) >> 28 | SafeLoadAs(in + 4 * 8) << 4, - SafeLoadAs(in + 4 * 8), - SafeLoadAs(in + 4 * 8), + SafeLoadAs(in + 4 * 7), + SafeLoadAs(in + 4 * 7) >> 28 | SafeLoadAs(in + 4 * 8) << 4, + SafeLoadAs(in + 4 * 8), + SafeLoadAs(in + 4 * 8), }; - shifts = simd_batch{ 16, 0, 8, 20 }; + shifts = simd_batch{16, 0, 8, 20}; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 12-bit bundles 24 to 27 words = simd_batch{ - SafeLoadAs(in + 4 * 9), - SafeLoadAs(in + 4 * 9), - SafeLoadAs(in + 4 * 9) >> 24 | SafeLoadAs(in + 4 * 10) << 8, - SafeLoadAs(in + 4 * 10), + SafeLoadAs(in + 4 * 9), + SafeLoadAs(in + 4 * 9), + SafeLoadAs(in + 4 * 9) >> 24 | SafeLoadAs(in + 4 * 10) << 8, + SafeLoadAs(in + 4 * 10), }; - shifts = simd_batch{ 0, 12, 0, 4 }; + shifts = simd_batch{0, 12, 0, 4}; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 12-bit bundles 28 to 31 words = simd_batch{ - SafeLoadAs(in + 4 * 10), - SafeLoadAs(in + 4 * 10) >> 28 | SafeLoadAs(in + 4 * 11) << 4, - SafeLoadAs(in + 4 * 11), - SafeLoadAs(in + 4 * 11), + SafeLoadAs(in + 4 * 10), + SafeLoadAs(in + 4 * 10) >> 28 | SafeLoadAs(in + 4 * 11) << 4, + SafeLoadAs(in + 4 * 11), + SafeLoadAs(in + 4 * 11), }; - shifts = simd_batch{ 16, 0, 8, 20 }; + shifts = simd_batch{16, 0, 8, 20}; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; @@ -1350,7 +1302,7 @@ const uint8_t* Simd128Unpacker::unpack<12>(const uint8_t* in, uint32_t return in; } -template<> +template <> const uint8_t* Simd128Unpacker::unpack<13>(const uint8_t* in, uint32_t* out) { constexpr uint32_t kMask = 0x1fff; @@ -1360,96 +1312,96 @@ const uint8_t* Simd128Unpacker::unpack<13>(const uint8_t* in, uint32_t // extract 13-bit bundles 0 to 3 words = simd_batch{ - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0) >> 26 | SafeLoadAs(in + 4 * 1) << 6, - SafeLoadAs(in + 4 * 1), + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0) >> 26 | SafeLoadAs(in + 4 * 1) << 6, + SafeLoadAs(in + 4 * 1), }; - shifts = simd_batch{ 0, 13, 0, 7 }; + shifts = simd_batch{0, 13, 0, 7}; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 13-bit bundles 4 to 7 words = simd_batch{ - SafeLoadAs(in + 4 * 1) >> 20 | SafeLoadAs(in + 4 * 2) << 12, - SafeLoadAs(in + 4 * 2), - SafeLoadAs(in + 4 * 2), - SafeLoadAs(in + 4 * 2) >> 27 | SafeLoadAs(in + 4 * 3) << 5, + SafeLoadAs(in + 4 * 1) >> 20 | SafeLoadAs(in + 4 * 2) << 12, + SafeLoadAs(in + 4 * 2), + SafeLoadAs(in + 4 * 2), + SafeLoadAs(in + 4 * 2) >> 27 | SafeLoadAs(in + 4 * 3) << 5, }; - shifts = simd_batch{ 0, 1, 14, 0 }; + shifts = simd_batch{0, 1, 14, 0}; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 13-bit bundles 8 to 11 words = simd_batch{ - SafeLoadAs(in + 4 * 3), - SafeLoadAs(in + 4 * 3) >> 21 | SafeLoadAs(in + 4 * 4) << 11, - SafeLoadAs(in + 4 * 4), - SafeLoadAs(in + 4 * 4), + SafeLoadAs(in + 4 * 3), + SafeLoadAs(in + 4 * 3) >> 21 | SafeLoadAs(in + 4 * 4) << 11, + SafeLoadAs(in + 4 * 4), + SafeLoadAs(in + 4 * 4), }; - shifts = simd_batch{ 8, 0, 2, 15 }; + shifts = simd_batch{8, 0, 2, 15}; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 13-bit bundles 12 to 15 words = simd_batch{ - SafeLoadAs(in + 4 * 4) >> 28 | SafeLoadAs(in + 4 * 5) << 4, - SafeLoadAs(in + 4 * 5), - SafeLoadAs(in + 4 * 5) >> 22 | SafeLoadAs(in + 4 * 6) << 10, - SafeLoadAs(in + 4 * 6), + SafeLoadAs(in + 4 * 4) >> 28 | SafeLoadAs(in + 4 * 5) << 4, + SafeLoadAs(in + 4 * 5), + SafeLoadAs(in + 4 * 5) >> 22 | SafeLoadAs(in + 4 * 6) << 10, + SafeLoadAs(in + 4 * 6), }; - shifts = simd_batch{ 0, 9, 0, 3 }; + shifts = simd_batch{0, 9, 0, 3}; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 13-bit bundles 16 to 19 words = simd_batch{ - SafeLoadAs(in + 4 * 6), - SafeLoadAs(in + 4 * 6) >> 29 | SafeLoadAs(in + 4 * 7) << 3, - SafeLoadAs(in + 4 * 7), - SafeLoadAs(in + 4 * 7) >> 23 | SafeLoadAs(in + 4 * 8) << 9, + SafeLoadAs(in + 4 * 6), + SafeLoadAs(in + 4 * 6) >> 29 | SafeLoadAs(in + 4 * 7) << 3, + SafeLoadAs(in + 4 * 7), + SafeLoadAs(in + 4 * 7) >> 23 | SafeLoadAs(in + 4 * 8) << 9, }; - shifts = simd_batch{ 16, 0, 10, 0 }; + shifts = simd_batch{16, 0, 10, 0}; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 13-bit bundles 20 to 23 words = simd_batch{ - SafeLoadAs(in + 4 * 8), - SafeLoadAs(in + 4 * 8), - SafeLoadAs(in + 4 * 8) >> 30 | SafeLoadAs(in + 4 * 9) << 2, - SafeLoadAs(in + 4 * 9), + SafeLoadAs(in + 4 * 8), + SafeLoadAs(in + 4 * 8), + SafeLoadAs(in + 4 * 8) >> 30 | SafeLoadAs(in + 4 * 9) << 2, + SafeLoadAs(in + 4 * 9), }; - shifts = simd_batch{ 4, 17, 0, 11 }; + shifts = simd_batch{4, 17, 0, 11}; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 13-bit bundles 24 to 27 words = simd_batch{ - SafeLoadAs(in + 4 * 9) >> 24 | SafeLoadAs(in + 4 * 10) << 8, - SafeLoadAs(in + 4 * 10), - SafeLoadAs(in + 4 * 10), - SafeLoadAs(in + 4 * 10) >> 31 | SafeLoadAs(in + 4 * 11) << 1, + SafeLoadAs(in + 4 * 9) >> 24 | SafeLoadAs(in + 4 * 10) << 8, + SafeLoadAs(in + 4 * 10), + SafeLoadAs(in + 4 * 10), + SafeLoadAs(in + 4 * 10) >> 31 | SafeLoadAs(in + 4 * 11) << 1, }; - shifts = simd_batch{ 0, 5, 18, 0 }; + shifts = simd_batch{0, 5, 18, 0}; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 13-bit bundles 28 to 31 words = simd_batch{ - SafeLoadAs(in + 4 * 11), - SafeLoadAs(in + 4 * 11) >> 25 | SafeLoadAs(in + 4 * 12) << 7, - SafeLoadAs(in + 4 * 12), - SafeLoadAs(in + 4 * 12), + SafeLoadAs(in + 4 * 11), + SafeLoadAs(in + 4 * 11) >> 25 | SafeLoadAs(in + 4 * 12) << 7, + SafeLoadAs(in + 4 * 12), + SafeLoadAs(in + 4 * 12), }; - shifts = simd_batch{ 12, 0, 6, 19 }; + shifts = simd_batch{12, 0, 6, 19}; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; @@ -1458,7 +1410,7 @@ const uint8_t* Simd128Unpacker::unpack<13>(const uint8_t* in, uint32_t return in; } -template<> +template <> const uint8_t* Simd128Unpacker::unpack<14>(const uint8_t* in, uint32_t* out) { constexpr uint32_t kMask = 0x3fff; @@ -1468,96 +1420,96 @@ const uint8_t* Simd128Unpacker::unpack<14>(const uint8_t* in, uint32_t // extract 14-bit bundles 0 to 3 words = simd_batch{ - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0) >> 28 | SafeLoadAs(in + 4 * 1) << 4, - SafeLoadAs(in + 4 * 1), + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0) >> 28 | SafeLoadAs(in + 4 * 1) << 4, + SafeLoadAs(in + 4 * 1), }; - shifts = simd_batch{ 0, 14, 0, 10 }; + shifts = simd_batch{0, 14, 0, 10}; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 14-bit bundles 4 to 7 words = simd_batch{ - SafeLoadAs(in + 4 * 1) >> 24 | SafeLoadAs(in + 4 * 2) << 8, - SafeLoadAs(in + 4 * 2), - SafeLoadAs(in + 4 * 2) >> 20 | SafeLoadAs(in + 4 * 3) << 12, - SafeLoadAs(in + 4 * 3), + SafeLoadAs(in + 4 * 1) >> 24 | SafeLoadAs(in + 4 * 2) << 8, + SafeLoadAs(in + 4 * 2), + SafeLoadAs(in + 4 * 2) >> 20 | SafeLoadAs(in + 4 * 3) << 12, + SafeLoadAs(in + 4 * 3), }; - shifts = simd_batch{ 0, 6, 0, 2 }; + shifts = simd_batch{0, 6, 0, 2}; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 14-bit bundles 8 to 11 words = simd_batch{ - SafeLoadAs(in + 4 * 3), - SafeLoadAs(in + 4 * 3) >> 30 | SafeLoadAs(in + 4 * 4) << 2, - SafeLoadAs(in + 4 * 4), - SafeLoadAs(in + 4 * 4) >> 26 | SafeLoadAs(in + 4 * 5) << 6, + SafeLoadAs(in + 4 * 3), + SafeLoadAs(in + 4 * 3) >> 30 | SafeLoadAs(in + 4 * 4) << 2, + SafeLoadAs(in + 4 * 4), + SafeLoadAs(in + 4 * 4) >> 26 | SafeLoadAs(in + 4 * 5) << 6, }; - shifts = simd_batch{ 16, 0, 12, 0 }; + shifts = simd_batch{16, 0, 12, 0}; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 14-bit bundles 12 to 15 words = simd_batch{ - SafeLoadAs(in + 4 * 5), - SafeLoadAs(in + 4 * 5) >> 22 | SafeLoadAs(in + 4 * 6) << 10, - SafeLoadAs(in + 4 * 6), - SafeLoadAs(in + 4 * 6), + SafeLoadAs(in + 4 * 5), + SafeLoadAs(in + 4 * 5) >> 22 | SafeLoadAs(in + 4 * 6) << 10, + SafeLoadAs(in + 4 * 6), + SafeLoadAs(in + 4 * 6), }; - shifts = simd_batch{ 8, 0, 4, 18 }; + shifts = simd_batch{8, 0, 4, 18}; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 14-bit bundles 16 to 19 words = simd_batch{ - SafeLoadAs(in + 4 * 7), - SafeLoadAs(in + 4 * 7), - SafeLoadAs(in + 4 * 7) >> 28 | SafeLoadAs(in + 4 * 8) << 4, - SafeLoadAs(in + 4 * 8), + SafeLoadAs(in + 4 * 7), + SafeLoadAs(in + 4 * 7), + SafeLoadAs(in + 4 * 7) >> 28 | SafeLoadAs(in + 4 * 8) << 4, + SafeLoadAs(in + 4 * 8), }; - shifts = simd_batch{ 0, 14, 0, 10 }; + shifts = simd_batch{0, 14, 0, 10}; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 14-bit bundles 20 to 23 words = simd_batch{ - SafeLoadAs(in + 4 * 8) >> 24 | SafeLoadAs(in + 4 * 9) << 8, - SafeLoadAs(in + 4 * 9), - SafeLoadAs(in + 4 * 9) >> 20 | SafeLoadAs(in + 4 * 10) << 12, - SafeLoadAs(in + 4 * 10), + SafeLoadAs(in + 4 * 8) >> 24 | SafeLoadAs(in + 4 * 9) << 8, + SafeLoadAs(in + 4 * 9), + SafeLoadAs(in + 4 * 9) >> 20 | SafeLoadAs(in + 4 * 10) << 12, + SafeLoadAs(in + 4 * 10), }; - shifts = simd_batch{ 0, 6, 0, 2 }; + shifts = simd_batch{0, 6, 0, 2}; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 14-bit bundles 24 to 27 words = simd_batch{ - SafeLoadAs(in + 4 * 10), - SafeLoadAs(in + 4 * 10) >> 30 | SafeLoadAs(in + 4 * 11) << 2, - SafeLoadAs(in + 4 * 11), - SafeLoadAs(in + 4 * 11) >> 26 | SafeLoadAs(in + 4 * 12) << 6, + SafeLoadAs(in + 4 * 10), + SafeLoadAs(in + 4 * 10) >> 30 | SafeLoadAs(in + 4 * 11) << 2, + SafeLoadAs(in + 4 * 11), + SafeLoadAs(in + 4 * 11) >> 26 | SafeLoadAs(in + 4 * 12) << 6, }; - shifts = simd_batch{ 16, 0, 12, 0 }; + shifts = simd_batch{16, 0, 12, 0}; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 14-bit bundles 28 to 31 words = simd_batch{ - SafeLoadAs(in + 4 * 12), - SafeLoadAs(in + 4 * 12) >> 22 | SafeLoadAs(in + 4 * 13) << 10, - SafeLoadAs(in + 4 * 13), - SafeLoadAs(in + 4 * 13), + SafeLoadAs(in + 4 * 12), + SafeLoadAs(in + 4 * 12) >> 22 | SafeLoadAs(in + 4 * 13) << 10, + SafeLoadAs(in + 4 * 13), + SafeLoadAs(in + 4 * 13), }; - shifts = simd_batch{ 8, 0, 4, 18 }; + shifts = simd_batch{8, 0, 4, 18}; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; @@ -1566,7 +1518,7 @@ const uint8_t* Simd128Unpacker::unpack<14>(const uint8_t* in, uint32_t return in; } -template<> +template <> const uint8_t* Simd128Unpacker::unpack<15>(const uint8_t* in, uint32_t* out) { constexpr uint32_t kMask = 0x7fff; @@ -1576,96 +1528,96 @@ const uint8_t* Simd128Unpacker::unpack<15>(const uint8_t* in, uint32_t // extract 15-bit bundles 0 to 3 words = simd_batch{ - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0) >> 30 | SafeLoadAs(in + 4 * 1) << 2, - SafeLoadAs(in + 4 * 1), + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0) >> 30 | SafeLoadAs(in + 4 * 1) << 2, + SafeLoadAs(in + 4 * 1), }; - shifts = simd_batch{ 0, 15, 0, 13 }; + shifts = simd_batch{0, 15, 0, 13}; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 15-bit bundles 4 to 7 words = simd_batch{ - SafeLoadAs(in + 4 * 1) >> 28 | SafeLoadAs(in + 4 * 2) << 4, - SafeLoadAs(in + 4 * 2), - SafeLoadAs(in + 4 * 2) >> 26 | SafeLoadAs(in + 4 * 3) << 6, - SafeLoadAs(in + 4 * 3), + SafeLoadAs(in + 4 * 1) >> 28 | SafeLoadAs(in + 4 * 2) << 4, + SafeLoadAs(in + 4 * 2), + SafeLoadAs(in + 4 * 2) >> 26 | SafeLoadAs(in + 4 * 3) << 6, + SafeLoadAs(in + 4 * 3), }; - shifts = simd_batch{ 0, 11, 0, 9 }; + shifts = simd_batch{0, 11, 0, 9}; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 15-bit bundles 8 to 11 words = simd_batch{ - SafeLoadAs(in + 4 * 3) >> 24 | SafeLoadAs(in + 4 * 4) << 8, - SafeLoadAs(in + 4 * 4), - SafeLoadAs(in + 4 * 4) >> 22 | SafeLoadAs(in + 4 * 5) << 10, - SafeLoadAs(in + 4 * 5), + SafeLoadAs(in + 4 * 3) >> 24 | SafeLoadAs(in + 4 * 4) << 8, + SafeLoadAs(in + 4 * 4), + SafeLoadAs(in + 4 * 4) >> 22 | SafeLoadAs(in + 4 * 5) << 10, + SafeLoadAs(in + 4 * 5), }; - shifts = simd_batch{ 0, 7, 0, 5 }; + shifts = simd_batch{0, 7, 0, 5}; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 15-bit bundles 12 to 15 words = simd_batch{ - SafeLoadAs(in + 4 * 5) >> 20 | SafeLoadAs(in + 4 * 6) << 12, - SafeLoadAs(in + 4 * 6), - SafeLoadAs(in + 4 * 6) >> 18 | SafeLoadAs(in + 4 * 7) << 14, - SafeLoadAs(in + 4 * 7), + SafeLoadAs(in + 4 * 5) >> 20 | SafeLoadAs(in + 4 * 6) << 12, + SafeLoadAs(in + 4 * 6), + SafeLoadAs(in + 4 * 6) >> 18 | SafeLoadAs(in + 4 * 7) << 14, + SafeLoadAs(in + 4 * 7), }; - shifts = simd_batch{ 0, 3, 0, 1 }; + shifts = simd_batch{0, 3, 0, 1}; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 15-bit bundles 16 to 19 words = simd_batch{ - SafeLoadAs(in + 4 * 7), - SafeLoadAs(in + 4 * 7) >> 31 | SafeLoadAs(in + 4 * 8) << 1, - SafeLoadAs(in + 4 * 8), - SafeLoadAs(in + 4 * 8) >> 29 | SafeLoadAs(in + 4 * 9) << 3, + SafeLoadAs(in + 4 * 7), + SafeLoadAs(in + 4 * 7) >> 31 | SafeLoadAs(in + 4 * 8) << 1, + SafeLoadAs(in + 4 * 8), + SafeLoadAs(in + 4 * 8) >> 29 | SafeLoadAs(in + 4 * 9) << 3, }; - shifts = simd_batch{ 16, 0, 14, 0 }; + shifts = simd_batch{16, 0, 14, 0}; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 15-bit bundles 20 to 23 words = simd_batch{ - SafeLoadAs(in + 4 * 9), - SafeLoadAs(in + 4 * 9) >> 27 | SafeLoadAs(in + 4 * 10) << 5, - SafeLoadAs(in + 4 * 10), - SafeLoadAs(in + 4 * 10) >> 25 | SafeLoadAs(in + 4 * 11) << 7, + SafeLoadAs(in + 4 * 9), + SafeLoadAs(in + 4 * 9) >> 27 | SafeLoadAs(in + 4 * 10) << 5, + SafeLoadAs(in + 4 * 10), + SafeLoadAs(in + 4 * 10) >> 25 | SafeLoadAs(in + 4 * 11) << 7, }; - shifts = simd_batch{ 12, 0, 10, 0 }; + shifts = simd_batch{12, 0, 10, 0}; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 15-bit bundles 24 to 27 words = simd_batch{ - SafeLoadAs(in + 4 * 11), - SafeLoadAs(in + 4 * 11) >> 23 | SafeLoadAs(in + 4 * 12) << 9, - SafeLoadAs(in + 4 * 12), - SafeLoadAs(in + 4 * 12) >> 21 | SafeLoadAs(in + 4 * 13) << 11, + SafeLoadAs(in + 4 * 11), + SafeLoadAs(in + 4 * 11) >> 23 | SafeLoadAs(in + 4 * 12) << 9, + SafeLoadAs(in + 4 * 12), + SafeLoadAs(in + 4 * 12) >> 21 | SafeLoadAs(in + 4 * 13) << 11, }; - shifts = simd_batch{ 8, 0, 6, 0 }; + shifts = simd_batch{8, 0, 6, 0}; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 15-bit bundles 28 to 31 words = simd_batch{ - SafeLoadAs(in + 4 * 13), - SafeLoadAs(in + 4 * 13) >> 19 | SafeLoadAs(in + 4 * 14) << 13, - SafeLoadAs(in + 4 * 14), - SafeLoadAs(in + 4 * 14), + SafeLoadAs(in + 4 * 13), + SafeLoadAs(in + 4 * 13) >> 19 | SafeLoadAs(in + 4 * 14) << 13, + SafeLoadAs(in + 4 * 14), + SafeLoadAs(in + 4 * 14), }; - shifts = simd_batch{ 4, 0, 2, 17 }; + shifts = simd_batch{4, 0, 2, 17}; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; @@ -1674,7 +1626,7 @@ const uint8_t* Simd128Unpacker::unpack<15>(const uint8_t* in, uint32_t return in; } -template<> +template <> const uint8_t* Simd128Unpacker::unpack<16>(const uint8_t* in, uint32_t* out) { constexpr uint32_t kMask = 0xffff; @@ -1684,96 +1636,96 @@ const uint8_t* Simd128Unpacker::unpack<16>(const uint8_t* in, uint32_t // extract 16-bit bundles 0 to 3 words = simd_batch{ - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 1), - SafeLoadAs(in + 4 * 1), + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 1), + SafeLoadAs(in + 4 * 1), }; - shifts = simd_batch{ 0, 16, 0, 16 }; + shifts = simd_batch{0, 16, 0, 16}; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 16-bit bundles 4 to 7 words = simd_batch{ - SafeLoadAs(in + 4 * 2), - SafeLoadAs(in + 4 * 2), - SafeLoadAs(in + 4 * 3), - SafeLoadAs(in + 4 * 3), + SafeLoadAs(in + 4 * 2), + SafeLoadAs(in + 4 * 2), + SafeLoadAs(in + 4 * 3), + SafeLoadAs(in + 4 * 3), }; - shifts = simd_batch{ 0, 16, 0, 16 }; + shifts = simd_batch{0, 16, 0, 16}; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 16-bit bundles 8 to 11 words = simd_batch{ - SafeLoadAs(in + 4 * 4), - SafeLoadAs(in + 4 * 4), - SafeLoadAs(in + 4 * 5), - SafeLoadAs(in + 4 * 5), + SafeLoadAs(in + 4 * 4), + SafeLoadAs(in + 4 * 4), + SafeLoadAs(in + 4 * 5), + SafeLoadAs(in + 4 * 5), }; - shifts = simd_batch{ 0, 16, 0, 16 }; + shifts = simd_batch{0, 16, 0, 16}; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 16-bit bundles 12 to 15 words = simd_batch{ - SafeLoadAs(in + 4 * 6), - SafeLoadAs(in + 4 * 6), - SafeLoadAs(in + 4 * 7), - SafeLoadAs(in + 4 * 7), + SafeLoadAs(in + 4 * 6), + SafeLoadAs(in + 4 * 6), + SafeLoadAs(in + 4 * 7), + SafeLoadAs(in + 4 * 7), }; - shifts = simd_batch{ 0, 16, 0, 16 }; + shifts = simd_batch{0, 16, 0, 16}; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 16-bit bundles 16 to 19 words = simd_batch{ - SafeLoadAs(in + 4 * 8), - SafeLoadAs(in + 4 * 8), - SafeLoadAs(in + 4 * 9), - SafeLoadAs(in + 4 * 9), + SafeLoadAs(in + 4 * 8), + SafeLoadAs(in + 4 * 8), + SafeLoadAs(in + 4 * 9), + SafeLoadAs(in + 4 * 9), }; - shifts = simd_batch{ 0, 16, 0, 16 }; + shifts = simd_batch{0, 16, 0, 16}; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 16-bit bundles 20 to 23 words = simd_batch{ - SafeLoadAs(in + 4 * 10), - SafeLoadAs(in + 4 * 10), - SafeLoadAs(in + 4 * 11), - SafeLoadAs(in + 4 * 11), + SafeLoadAs(in + 4 * 10), + SafeLoadAs(in + 4 * 10), + SafeLoadAs(in + 4 * 11), + SafeLoadAs(in + 4 * 11), }; - shifts = simd_batch{ 0, 16, 0, 16 }; + shifts = simd_batch{0, 16, 0, 16}; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 16-bit bundles 24 to 27 words = simd_batch{ - SafeLoadAs(in + 4 * 12), - SafeLoadAs(in + 4 * 12), - SafeLoadAs(in + 4 * 13), - SafeLoadAs(in + 4 * 13), + SafeLoadAs(in + 4 * 12), + SafeLoadAs(in + 4 * 12), + SafeLoadAs(in + 4 * 13), + SafeLoadAs(in + 4 * 13), }; - shifts = simd_batch{ 0, 16, 0, 16 }; + shifts = simd_batch{0, 16, 0, 16}; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 16-bit bundles 28 to 31 words = simd_batch{ - SafeLoadAs(in + 4 * 14), - SafeLoadAs(in + 4 * 14), - SafeLoadAs(in + 4 * 15), - SafeLoadAs(in + 4 * 15), + SafeLoadAs(in + 4 * 14), + SafeLoadAs(in + 4 * 14), + SafeLoadAs(in + 4 * 15), + SafeLoadAs(in + 4 * 15), }; - shifts = simd_batch{ 0, 16, 0, 16 }; + shifts = simd_batch{0, 16, 0, 16}; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; @@ -1782,7 +1734,7 @@ const uint8_t* Simd128Unpacker::unpack<16>(const uint8_t* in, uint32_t return in; } -template<> +template <> const uint8_t* Simd128Unpacker::unpack<17>(const uint8_t* in, uint32_t* out) { constexpr uint32_t kMask = 0x1ffff; @@ -1792,96 +1744,96 @@ const uint8_t* Simd128Unpacker::unpack<17>(const uint8_t* in, uint32_t // extract 17-bit bundles 0 to 3 words = simd_batch{ - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0) >> 17 | SafeLoadAs(in + 4 * 1) << 15, - SafeLoadAs(in + 4 * 1), - SafeLoadAs(in + 4 * 1) >> 19 | SafeLoadAs(in + 4 * 2) << 13, + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0) >> 17 | SafeLoadAs(in + 4 * 1) << 15, + SafeLoadAs(in + 4 * 1), + SafeLoadAs(in + 4 * 1) >> 19 | SafeLoadAs(in + 4 * 2) << 13, }; - shifts = simd_batch{ 0, 0, 2, 0 }; + shifts = simd_batch{0, 0, 2, 0}; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 17-bit bundles 4 to 7 words = simd_batch{ - SafeLoadAs(in + 4 * 2), - SafeLoadAs(in + 4 * 2) >> 21 | SafeLoadAs(in + 4 * 3) << 11, - SafeLoadAs(in + 4 * 3), - SafeLoadAs(in + 4 * 3) >> 23 | SafeLoadAs(in + 4 * 4) << 9, + SafeLoadAs(in + 4 * 2), + SafeLoadAs(in + 4 * 2) >> 21 | SafeLoadAs(in + 4 * 3) << 11, + SafeLoadAs(in + 4 * 3), + SafeLoadAs(in + 4 * 3) >> 23 | SafeLoadAs(in + 4 * 4) << 9, }; - shifts = simd_batch{ 4, 0, 6, 0 }; + shifts = simd_batch{4, 0, 6, 0}; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 17-bit bundles 8 to 11 words = simd_batch{ - SafeLoadAs(in + 4 * 4), - SafeLoadAs(in + 4 * 4) >> 25 | SafeLoadAs(in + 4 * 5) << 7, - SafeLoadAs(in + 4 * 5), - SafeLoadAs(in + 4 * 5) >> 27 | SafeLoadAs(in + 4 * 6) << 5, + SafeLoadAs(in + 4 * 4), + SafeLoadAs(in + 4 * 4) >> 25 | SafeLoadAs(in + 4 * 5) << 7, + SafeLoadAs(in + 4 * 5), + SafeLoadAs(in + 4 * 5) >> 27 | SafeLoadAs(in + 4 * 6) << 5, }; - shifts = simd_batch{ 8, 0, 10, 0 }; + shifts = simd_batch{8, 0, 10, 0}; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 17-bit bundles 12 to 15 words = simd_batch{ - SafeLoadAs(in + 4 * 6), - SafeLoadAs(in + 4 * 6) >> 29 | SafeLoadAs(in + 4 * 7) << 3, - SafeLoadAs(in + 4 * 7), - SafeLoadAs(in + 4 * 7) >> 31 | SafeLoadAs(in + 4 * 8) << 1, + SafeLoadAs(in + 4 * 6), + SafeLoadAs(in + 4 * 6) >> 29 | SafeLoadAs(in + 4 * 7) << 3, + SafeLoadAs(in + 4 * 7), + SafeLoadAs(in + 4 * 7) >> 31 | SafeLoadAs(in + 4 * 8) << 1, }; - shifts = simd_batch{ 12, 0, 14, 0 }; + shifts = simd_batch{12, 0, 14, 0}; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 17-bit bundles 16 to 19 words = simd_batch{ - SafeLoadAs(in + 4 * 8) >> 16 | SafeLoadAs(in + 4 * 9) << 16, - SafeLoadAs(in + 4 * 9), - SafeLoadAs(in + 4 * 9) >> 18 | SafeLoadAs(in + 4 * 10) << 14, - SafeLoadAs(in + 4 * 10), + SafeLoadAs(in + 4 * 8) >> 16 | SafeLoadAs(in + 4 * 9) << 16, + SafeLoadAs(in + 4 * 9), + SafeLoadAs(in + 4 * 9) >> 18 | SafeLoadAs(in + 4 * 10) << 14, + SafeLoadAs(in + 4 * 10), }; - shifts = simd_batch{ 0, 1, 0, 3 }; + shifts = simd_batch{0, 1, 0, 3}; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 17-bit bundles 20 to 23 words = simd_batch{ - SafeLoadAs(in + 4 * 10) >> 20 | SafeLoadAs(in + 4 * 11) << 12, - SafeLoadAs(in + 4 * 11), - SafeLoadAs(in + 4 * 11) >> 22 | SafeLoadAs(in + 4 * 12) << 10, - SafeLoadAs(in + 4 * 12), + SafeLoadAs(in + 4 * 10) >> 20 | SafeLoadAs(in + 4 * 11) << 12, + SafeLoadAs(in + 4 * 11), + SafeLoadAs(in + 4 * 11) >> 22 | SafeLoadAs(in + 4 * 12) << 10, + SafeLoadAs(in + 4 * 12), }; - shifts = simd_batch{ 0, 5, 0, 7 }; + shifts = simd_batch{0, 5, 0, 7}; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 17-bit bundles 24 to 27 words = simd_batch{ - SafeLoadAs(in + 4 * 12) >> 24 | SafeLoadAs(in + 4 * 13) << 8, - SafeLoadAs(in + 4 * 13), - SafeLoadAs(in + 4 * 13) >> 26 | SafeLoadAs(in + 4 * 14) << 6, - SafeLoadAs(in + 4 * 14), + SafeLoadAs(in + 4 * 12) >> 24 | SafeLoadAs(in + 4 * 13) << 8, + SafeLoadAs(in + 4 * 13), + SafeLoadAs(in + 4 * 13) >> 26 | SafeLoadAs(in + 4 * 14) << 6, + SafeLoadAs(in + 4 * 14), }; - shifts = simd_batch{ 0, 9, 0, 11 }; + shifts = simd_batch{0, 9, 0, 11}; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 17-bit bundles 28 to 31 words = simd_batch{ - SafeLoadAs(in + 4 * 14) >> 28 | SafeLoadAs(in + 4 * 15) << 4, - SafeLoadAs(in + 4 * 15), - SafeLoadAs(in + 4 * 15) >> 30 | SafeLoadAs(in + 4 * 16) << 2, - SafeLoadAs(in + 4 * 16), + SafeLoadAs(in + 4 * 14) >> 28 | SafeLoadAs(in + 4 * 15) << 4, + SafeLoadAs(in + 4 * 15), + SafeLoadAs(in + 4 * 15) >> 30 | SafeLoadAs(in + 4 * 16) << 2, + SafeLoadAs(in + 4 * 16), }; - shifts = simd_batch{ 0, 13, 0, 15 }; + shifts = simd_batch{0, 13, 0, 15}; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; @@ -1890,7 +1842,7 @@ const uint8_t* Simd128Unpacker::unpack<17>(const uint8_t* in, uint32_t return in; } -template<> +template <> const uint8_t* Simd128Unpacker::unpack<18>(const uint8_t* in, uint32_t* out) { constexpr uint32_t kMask = 0x3ffff; @@ -1900,96 +1852,96 @@ const uint8_t* Simd128Unpacker::unpack<18>(const uint8_t* in, uint32_t // extract 18-bit bundles 0 to 3 words = simd_batch{ - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0) >> 18 | SafeLoadAs(in + 4 * 1) << 14, - SafeLoadAs(in + 4 * 1), - SafeLoadAs(in + 4 * 1) >> 22 | SafeLoadAs(in + 4 * 2) << 10, + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0) >> 18 | SafeLoadAs(in + 4 * 1) << 14, + SafeLoadAs(in + 4 * 1), + SafeLoadAs(in + 4 * 1) >> 22 | SafeLoadAs(in + 4 * 2) << 10, }; - shifts = simd_batch{ 0, 0, 4, 0 }; + shifts = simd_batch{0, 0, 4, 0}; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 18-bit bundles 4 to 7 words = simd_batch{ - SafeLoadAs(in + 4 * 2), - SafeLoadAs(in + 4 * 2) >> 26 | SafeLoadAs(in + 4 * 3) << 6, - SafeLoadAs(in + 4 * 3), - SafeLoadAs(in + 4 * 3) >> 30 | SafeLoadAs(in + 4 * 4) << 2, + SafeLoadAs(in + 4 * 2), + SafeLoadAs(in + 4 * 2) >> 26 | SafeLoadAs(in + 4 * 3) << 6, + SafeLoadAs(in + 4 * 3), + SafeLoadAs(in + 4 * 3) >> 30 | SafeLoadAs(in + 4 * 4) << 2, }; - shifts = simd_batch{ 8, 0, 12, 0 }; + shifts = simd_batch{8, 0, 12, 0}; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 18-bit bundles 8 to 11 words = simd_batch{ - SafeLoadAs(in + 4 * 4) >> 16 | SafeLoadAs(in + 4 * 5) << 16, - SafeLoadAs(in + 4 * 5), - SafeLoadAs(in + 4 * 5) >> 20 | SafeLoadAs(in + 4 * 6) << 12, - SafeLoadAs(in + 4 * 6), + SafeLoadAs(in + 4 * 4) >> 16 | SafeLoadAs(in + 4 * 5) << 16, + SafeLoadAs(in + 4 * 5), + SafeLoadAs(in + 4 * 5) >> 20 | SafeLoadAs(in + 4 * 6) << 12, + SafeLoadAs(in + 4 * 6), }; - shifts = simd_batch{ 0, 2, 0, 6 }; + shifts = simd_batch{0, 2, 0, 6}; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 18-bit bundles 12 to 15 words = simd_batch{ - SafeLoadAs(in + 4 * 6) >> 24 | SafeLoadAs(in + 4 * 7) << 8, - SafeLoadAs(in + 4 * 7), - SafeLoadAs(in + 4 * 7) >> 28 | SafeLoadAs(in + 4 * 8) << 4, - SafeLoadAs(in + 4 * 8), + SafeLoadAs(in + 4 * 6) >> 24 | SafeLoadAs(in + 4 * 7) << 8, + SafeLoadAs(in + 4 * 7), + SafeLoadAs(in + 4 * 7) >> 28 | SafeLoadAs(in + 4 * 8) << 4, + SafeLoadAs(in + 4 * 8), }; - shifts = simd_batch{ 0, 10, 0, 14 }; + shifts = simd_batch{0, 10, 0, 14}; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 18-bit bundles 16 to 19 words = simd_batch{ - SafeLoadAs(in + 4 * 9), - SafeLoadAs(in + 4 * 9) >> 18 | SafeLoadAs(in + 4 * 10) << 14, - SafeLoadAs(in + 4 * 10), - SafeLoadAs(in + 4 * 10) >> 22 | SafeLoadAs(in + 4 * 11) << 10, + SafeLoadAs(in + 4 * 9), + SafeLoadAs(in + 4 * 9) >> 18 | SafeLoadAs(in + 4 * 10) << 14, + SafeLoadAs(in + 4 * 10), + SafeLoadAs(in + 4 * 10) >> 22 | SafeLoadAs(in + 4 * 11) << 10, }; - shifts = simd_batch{ 0, 0, 4, 0 }; + shifts = simd_batch{0, 0, 4, 0}; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 18-bit bundles 20 to 23 words = simd_batch{ - SafeLoadAs(in + 4 * 11), - SafeLoadAs(in + 4 * 11) >> 26 | SafeLoadAs(in + 4 * 12) << 6, - SafeLoadAs(in + 4 * 12), - SafeLoadAs(in + 4 * 12) >> 30 | SafeLoadAs(in + 4 * 13) << 2, + SafeLoadAs(in + 4 * 11), + SafeLoadAs(in + 4 * 11) >> 26 | SafeLoadAs(in + 4 * 12) << 6, + SafeLoadAs(in + 4 * 12), + SafeLoadAs(in + 4 * 12) >> 30 | SafeLoadAs(in + 4 * 13) << 2, }; - shifts = simd_batch{ 8, 0, 12, 0 }; + shifts = simd_batch{8, 0, 12, 0}; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 18-bit bundles 24 to 27 words = simd_batch{ - SafeLoadAs(in + 4 * 13) >> 16 | SafeLoadAs(in + 4 * 14) << 16, - SafeLoadAs(in + 4 * 14), - SafeLoadAs(in + 4 * 14) >> 20 | SafeLoadAs(in + 4 * 15) << 12, - SafeLoadAs(in + 4 * 15), + SafeLoadAs(in + 4 * 13) >> 16 | SafeLoadAs(in + 4 * 14) << 16, + SafeLoadAs(in + 4 * 14), + SafeLoadAs(in + 4 * 14) >> 20 | SafeLoadAs(in + 4 * 15) << 12, + SafeLoadAs(in + 4 * 15), }; - shifts = simd_batch{ 0, 2, 0, 6 }; + shifts = simd_batch{0, 2, 0, 6}; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 18-bit bundles 28 to 31 words = simd_batch{ - SafeLoadAs(in + 4 * 15) >> 24 | SafeLoadAs(in + 4 * 16) << 8, - SafeLoadAs(in + 4 * 16), - SafeLoadAs(in + 4 * 16) >> 28 | SafeLoadAs(in + 4 * 17) << 4, - SafeLoadAs(in + 4 * 17), + SafeLoadAs(in + 4 * 15) >> 24 | SafeLoadAs(in + 4 * 16) << 8, + SafeLoadAs(in + 4 * 16), + SafeLoadAs(in + 4 * 16) >> 28 | SafeLoadAs(in + 4 * 17) << 4, + SafeLoadAs(in + 4 * 17), }; - shifts = simd_batch{ 0, 10, 0, 14 }; + shifts = simd_batch{0, 10, 0, 14}; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; @@ -1998,7 +1950,7 @@ const uint8_t* Simd128Unpacker::unpack<18>(const uint8_t* in, uint32_t return in; } -template<> +template <> const uint8_t* Simd128Unpacker::unpack<19>(const uint8_t* in, uint32_t* out) { constexpr uint32_t kMask = 0x7ffff; @@ -2008,96 +1960,96 @@ const uint8_t* Simd128Unpacker::unpack<19>(const uint8_t* in, uint32_t // extract 19-bit bundles 0 to 3 words = simd_batch{ - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0) >> 19 | SafeLoadAs(in + 4 * 1) << 13, - SafeLoadAs(in + 4 * 1), - SafeLoadAs(in + 4 * 1) >> 25 | SafeLoadAs(in + 4 * 2) << 7, + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0) >> 19 | SafeLoadAs(in + 4 * 1) << 13, + SafeLoadAs(in + 4 * 1), + SafeLoadAs(in + 4 * 1) >> 25 | SafeLoadAs(in + 4 * 2) << 7, }; - shifts = simd_batch{ 0, 0, 6, 0 }; + shifts = simd_batch{0, 0, 6, 0}; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 19-bit bundles 4 to 7 words = simd_batch{ - SafeLoadAs(in + 4 * 2), - SafeLoadAs(in + 4 * 2) >> 31 | SafeLoadAs(in + 4 * 3) << 1, - SafeLoadAs(in + 4 * 3) >> 18 | SafeLoadAs(in + 4 * 4) << 14, - SafeLoadAs(in + 4 * 4), + SafeLoadAs(in + 4 * 2), + SafeLoadAs(in + 4 * 2) >> 31 | SafeLoadAs(in + 4 * 3) << 1, + SafeLoadAs(in + 4 * 3) >> 18 | SafeLoadAs(in + 4 * 4) << 14, + SafeLoadAs(in + 4 * 4), }; - shifts = simd_batch{ 12, 0, 0, 5 }; + shifts = simd_batch{12, 0, 0, 5}; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 19-bit bundles 8 to 11 words = simd_batch{ - SafeLoadAs(in + 4 * 4) >> 24 | SafeLoadAs(in + 4 * 5) << 8, - SafeLoadAs(in + 4 * 5), - SafeLoadAs(in + 4 * 5) >> 30 | SafeLoadAs(in + 4 * 6) << 2, - SafeLoadAs(in + 4 * 6) >> 17 | SafeLoadAs(in + 4 * 7) << 15, + SafeLoadAs(in + 4 * 4) >> 24 | SafeLoadAs(in + 4 * 5) << 8, + SafeLoadAs(in + 4 * 5), + SafeLoadAs(in + 4 * 5) >> 30 | SafeLoadAs(in + 4 * 6) << 2, + SafeLoadAs(in + 4 * 6) >> 17 | SafeLoadAs(in + 4 * 7) << 15, }; - shifts = simd_batch{ 0, 11, 0, 0 }; + shifts = simd_batch{0, 11, 0, 0}; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 19-bit bundles 12 to 15 words = simd_batch{ - SafeLoadAs(in + 4 * 7), - SafeLoadAs(in + 4 * 7) >> 23 | SafeLoadAs(in + 4 * 8) << 9, - SafeLoadAs(in + 4 * 8), - SafeLoadAs(in + 4 * 8) >> 29 | SafeLoadAs(in + 4 * 9) << 3, + SafeLoadAs(in + 4 * 7), + SafeLoadAs(in + 4 * 7) >> 23 | SafeLoadAs(in + 4 * 8) << 9, + SafeLoadAs(in + 4 * 8), + SafeLoadAs(in + 4 * 8) >> 29 | SafeLoadAs(in + 4 * 9) << 3, }; - shifts = simd_batch{ 4, 0, 10, 0 }; + shifts = simd_batch{4, 0, 10, 0}; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 19-bit bundles 16 to 19 words = simd_batch{ - SafeLoadAs(in + 4 * 9) >> 16 | SafeLoadAs(in + 4 * 10) << 16, - SafeLoadAs(in + 4 * 10), - SafeLoadAs(in + 4 * 10) >> 22 | SafeLoadAs(in + 4 * 11) << 10, - SafeLoadAs(in + 4 * 11), + SafeLoadAs(in + 4 * 9) >> 16 | SafeLoadAs(in + 4 * 10) << 16, + SafeLoadAs(in + 4 * 10), + SafeLoadAs(in + 4 * 10) >> 22 | SafeLoadAs(in + 4 * 11) << 10, + SafeLoadAs(in + 4 * 11), }; - shifts = simd_batch{ 0, 3, 0, 9 }; + shifts = simd_batch{0, 3, 0, 9}; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 19-bit bundles 20 to 23 words = simd_batch{ - SafeLoadAs(in + 4 * 11) >> 28 | SafeLoadAs(in + 4 * 12) << 4, - SafeLoadAs(in + 4 * 12) >> 15 | SafeLoadAs(in + 4 * 13) << 17, - SafeLoadAs(in + 4 * 13), - SafeLoadAs(in + 4 * 13) >> 21 | SafeLoadAs(in + 4 * 14) << 11, + SafeLoadAs(in + 4 * 11) >> 28 | SafeLoadAs(in + 4 * 12) << 4, + SafeLoadAs(in + 4 * 12) >> 15 | SafeLoadAs(in + 4 * 13) << 17, + SafeLoadAs(in + 4 * 13), + SafeLoadAs(in + 4 * 13) >> 21 | SafeLoadAs(in + 4 * 14) << 11, }; - shifts = simd_batch{ 0, 0, 2, 0 }; + shifts = simd_batch{0, 0, 2, 0}; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 19-bit bundles 24 to 27 words = simd_batch{ - SafeLoadAs(in + 4 * 14), - SafeLoadAs(in + 4 * 14) >> 27 | SafeLoadAs(in + 4 * 15) << 5, - SafeLoadAs(in + 4 * 15) >> 14 | SafeLoadAs(in + 4 * 16) << 18, - SafeLoadAs(in + 4 * 16), + SafeLoadAs(in + 4 * 14), + SafeLoadAs(in + 4 * 14) >> 27 | SafeLoadAs(in + 4 * 15) << 5, + SafeLoadAs(in + 4 * 15) >> 14 | SafeLoadAs(in + 4 * 16) << 18, + SafeLoadAs(in + 4 * 16), }; - shifts = simd_batch{ 8, 0, 0, 1 }; + shifts = simd_batch{8, 0, 0, 1}; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 19-bit bundles 28 to 31 words = simd_batch{ - SafeLoadAs(in + 4 * 16) >> 20 | SafeLoadAs(in + 4 * 17) << 12, - SafeLoadAs(in + 4 * 17), - SafeLoadAs(in + 4 * 17) >> 26 | SafeLoadAs(in + 4 * 18) << 6, - SafeLoadAs(in + 4 * 18), + SafeLoadAs(in + 4 * 16) >> 20 | SafeLoadAs(in + 4 * 17) << 12, + SafeLoadAs(in + 4 * 17), + SafeLoadAs(in + 4 * 17) >> 26 | SafeLoadAs(in + 4 * 18) << 6, + SafeLoadAs(in + 4 * 18), }; - shifts = simd_batch{ 0, 7, 0, 13 }; + shifts = simd_batch{0, 7, 0, 13}; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; @@ -2106,7 +2058,7 @@ const uint8_t* Simd128Unpacker::unpack<19>(const uint8_t* in, uint32_t return in; } -template<> +template <> const uint8_t* Simd128Unpacker::unpack<20>(const uint8_t* in, uint32_t* out) { constexpr uint32_t kMask = 0xfffff; @@ -2116,96 +2068,96 @@ const uint8_t* Simd128Unpacker::unpack<20>(const uint8_t* in, uint32_t // extract 20-bit bundles 0 to 3 words = simd_batch{ - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0) >> 20 | SafeLoadAs(in + 4 * 1) << 12, - SafeLoadAs(in + 4 * 1), - SafeLoadAs(in + 4 * 1) >> 28 | SafeLoadAs(in + 4 * 2) << 4, + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0) >> 20 | SafeLoadAs(in + 4 * 1) << 12, + SafeLoadAs(in + 4 * 1), + SafeLoadAs(in + 4 * 1) >> 28 | SafeLoadAs(in + 4 * 2) << 4, }; - shifts = simd_batch{ 0, 0, 8, 0 }; + shifts = simd_batch{0, 0, 8, 0}; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 20-bit bundles 4 to 7 words = simd_batch{ - SafeLoadAs(in + 4 * 2) >> 16 | SafeLoadAs(in + 4 * 3) << 16, - SafeLoadAs(in + 4 * 3), - SafeLoadAs(in + 4 * 3) >> 24 | SafeLoadAs(in + 4 * 4) << 8, - SafeLoadAs(in + 4 * 4), + SafeLoadAs(in + 4 * 2) >> 16 | SafeLoadAs(in + 4 * 3) << 16, + SafeLoadAs(in + 4 * 3), + SafeLoadAs(in + 4 * 3) >> 24 | SafeLoadAs(in + 4 * 4) << 8, + SafeLoadAs(in + 4 * 4), }; - shifts = simd_batch{ 0, 4, 0, 12 }; + shifts = simd_batch{0, 4, 0, 12}; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 20-bit bundles 8 to 11 words = simd_batch{ - SafeLoadAs(in + 4 * 5), - SafeLoadAs(in + 4 * 5) >> 20 | SafeLoadAs(in + 4 * 6) << 12, - SafeLoadAs(in + 4 * 6), - SafeLoadAs(in + 4 * 6) >> 28 | SafeLoadAs(in + 4 * 7) << 4, + SafeLoadAs(in + 4 * 5), + SafeLoadAs(in + 4 * 5) >> 20 | SafeLoadAs(in + 4 * 6) << 12, + SafeLoadAs(in + 4 * 6), + SafeLoadAs(in + 4 * 6) >> 28 | SafeLoadAs(in + 4 * 7) << 4, }; - shifts = simd_batch{ 0, 0, 8, 0 }; + shifts = simd_batch{0, 0, 8, 0}; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 20-bit bundles 12 to 15 words = simd_batch{ - SafeLoadAs(in + 4 * 7) >> 16 | SafeLoadAs(in + 4 * 8) << 16, - SafeLoadAs(in + 4 * 8), - SafeLoadAs(in + 4 * 8) >> 24 | SafeLoadAs(in + 4 * 9) << 8, - SafeLoadAs(in + 4 * 9), + SafeLoadAs(in + 4 * 7) >> 16 | SafeLoadAs(in + 4 * 8) << 16, + SafeLoadAs(in + 4 * 8), + SafeLoadAs(in + 4 * 8) >> 24 | SafeLoadAs(in + 4 * 9) << 8, + SafeLoadAs(in + 4 * 9), }; - shifts = simd_batch{ 0, 4, 0, 12 }; + shifts = simd_batch{0, 4, 0, 12}; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 20-bit bundles 16 to 19 words = simd_batch{ - SafeLoadAs(in + 4 * 10), - SafeLoadAs(in + 4 * 10) >> 20 | SafeLoadAs(in + 4 * 11) << 12, - SafeLoadAs(in + 4 * 11), - SafeLoadAs(in + 4 * 11) >> 28 | SafeLoadAs(in + 4 * 12) << 4, + SafeLoadAs(in + 4 * 10), + SafeLoadAs(in + 4 * 10) >> 20 | SafeLoadAs(in + 4 * 11) << 12, + SafeLoadAs(in + 4 * 11), + SafeLoadAs(in + 4 * 11) >> 28 | SafeLoadAs(in + 4 * 12) << 4, }; - shifts = simd_batch{ 0, 0, 8, 0 }; + shifts = simd_batch{0, 0, 8, 0}; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 20-bit bundles 20 to 23 words = simd_batch{ - SafeLoadAs(in + 4 * 12) >> 16 | SafeLoadAs(in + 4 * 13) << 16, - SafeLoadAs(in + 4 * 13), - SafeLoadAs(in + 4 * 13) >> 24 | SafeLoadAs(in + 4 * 14) << 8, - SafeLoadAs(in + 4 * 14), + SafeLoadAs(in + 4 * 12) >> 16 | SafeLoadAs(in + 4 * 13) << 16, + SafeLoadAs(in + 4 * 13), + SafeLoadAs(in + 4 * 13) >> 24 | SafeLoadAs(in + 4 * 14) << 8, + SafeLoadAs(in + 4 * 14), }; - shifts = simd_batch{ 0, 4, 0, 12 }; + shifts = simd_batch{0, 4, 0, 12}; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 20-bit bundles 24 to 27 words = simd_batch{ - SafeLoadAs(in + 4 * 15), - SafeLoadAs(in + 4 * 15) >> 20 | SafeLoadAs(in + 4 * 16) << 12, - SafeLoadAs(in + 4 * 16), - SafeLoadAs(in + 4 * 16) >> 28 | SafeLoadAs(in + 4 * 17) << 4, + SafeLoadAs(in + 4 * 15), + SafeLoadAs(in + 4 * 15) >> 20 | SafeLoadAs(in + 4 * 16) << 12, + SafeLoadAs(in + 4 * 16), + SafeLoadAs(in + 4 * 16) >> 28 | SafeLoadAs(in + 4 * 17) << 4, }; - shifts = simd_batch{ 0, 0, 8, 0 }; + shifts = simd_batch{0, 0, 8, 0}; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 20-bit bundles 28 to 31 words = simd_batch{ - SafeLoadAs(in + 4 * 17) >> 16 | SafeLoadAs(in + 4 * 18) << 16, - SafeLoadAs(in + 4 * 18), - SafeLoadAs(in + 4 * 18) >> 24 | SafeLoadAs(in + 4 * 19) << 8, - SafeLoadAs(in + 4 * 19), + SafeLoadAs(in + 4 * 17) >> 16 | SafeLoadAs(in + 4 * 18) << 16, + SafeLoadAs(in + 4 * 18), + SafeLoadAs(in + 4 * 18) >> 24 | SafeLoadAs(in + 4 * 19) << 8, + SafeLoadAs(in + 4 * 19), }; - shifts = simd_batch{ 0, 4, 0, 12 }; + shifts = simd_batch{0, 4, 0, 12}; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; @@ -2214,7 +2166,7 @@ const uint8_t* Simd128Unpacker::unpack<20>(const uint8_t* in, uint32_t return in; } -template<> +template <> const uint8_t* Simd128Unpacker::unpack<21>(const uint8_t* in, uint32_t* out) { constexpr uint32_t kMask = 0x1fffff; @@ -2224,96 +2176,96 @@ const uint8_t* Simd128Unpacker::unpack<21>(const uint8_t* in, uint32_t // extract 21-bit bundles 0 to 3 words = simd_batch{ - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0) >> 21 | SafeLoadAs(in + 4 * 1) << 11, - SafeLoadAs(in + 4 * 1), - SafeLoadAs(in + 4 * 1) >> 31 | SafeLoadAs(in + 4 * 2) << 1, + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0) >> 21 | SafeLoadAs(in + 4 * 1) << 11, + SafeLoadAs(in + 4 * 1), + SafeLoadAs(in + 4 * 1) >> 31 | SafeLoadAs(in + 4 * 2) << 1, }; - shifts = simd_batch{ 0, 0, 10, 0 }; + shifts = simd_batch{0, 0, 10, 0}; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 21-bit bundles 4 to 7 words = simd_batch{ - SafeLoadAs(in + 4 * 2) >> 20 | SafeLoadAs(in + 4 * 3) << 12, - SafeLoadAs(in + 4 * 3), - SafeLoadAs(in + 4 * 3) >> 30 | SafeLoadAs(in + 4 * 4) << 2, - SafeLoadAs(in + 4 * 4) >> 19 | SafeLoadAs(in + 4 * 5) << 13, + SafeLoadAs(in + 4 * 2) >> 20 | SafeLoadAs(in + 4 * 3) << 12, + SafeLoadAs(in + 4 * 3), + SafeLoadAs(in + 4 * 3) >> 30 | SafeLoadAs(in + 4 * 4) << 2, + SafeLoadAs(in + 4 * 4) >> 19 | SafeLoadAs(in + 4 * 5) << 13, }; - shifts = simd_batch{ 0, 9, 0, 0 }; + shifts = simd_batch{0, 9, 0, 0}; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 21-bit bundles 8 to 11 words = simd_batch{ - SafeLoadAs(in + 4 * 5), - SafeLoadAs(in + 4 * 5) >> 29 | SafeLoadAs(in + 4 * 6) << 3, - SafeLoadAs(in + 4 * 6) >> 18 | SafeLoadAs(in + 4 * 7) << 14, - SafeLoadAs(in + 4 * 7), + SafeLoadAs(in + 4 * 5), + SafeLoadAs(in + 4 * 5) >> 29 | SafeLoadAs(in + 4 * 6) << 3, + SafeLoadAs(in + 4 * 6) >> 18 | SafeLoadAs(in + 4 * 7) << 14, + SafeLoadAs(in + 4 * 7), }; - shifts = simd_batch{ 8, 0, 0, 7 }; + shifts = simd_batch{8, 0, 0, 7}; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 21-bit bundles 12 to 15 words = simd_batch{ - SafeLoadAs(in + 4 * 7) >> 28 | SafeLoadAs(in + 4 * 8) << 4, - SafeLoadAs(in + 4 * 8) >> 17 | SafeLoadAs(in + 4 * 9) << 15, - SafeLoadAs(in + 4 * 9), - SafeLoadAs(in + 4 * 9) >> 27 | SafeLoadAs(in + 4 * 10) << 5, + SafeLoadAs(in + 4 * 7) >> 28 | SafeLoadAs(in + 4 * 8) << 4, + SafeLoadAs(in + 4 * 8) >> 17 | SafeLoadAs(in + 4 * 9) << 15, + SafeLoadAs(in + 4 * 9), + SafeLoadAs(in + 4 * 9) >> 27 | SafeLoadAs(in + 4 * 10) << 5, }; - shifts = simd_batch{ 0, 0, 6, 0 }; + shifts = simd_batch{0, 0, 6, 0}; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 21-bit bundles 16 to 19 words = simd_batch{ - SafeLoadAs(in + 4 * 10) >> 16 | SafeLoadAs(in + 4 * 11) << 16, - SafeLoadAs(in + 4 * 11), - SafeLoadAs(in + 4 * 11) >> 26 | SafeLoadAs(in + 4 * 12) << 6, - SafeLoadAs(in + 4 * 12) >> 15 | SafeLoadAs(in + 4 * 13) << 17, + SafeLoadAs(in + 4 * 10) >> 16 | SafeLoadAs(in + 4 * 11) << 16, + SafeLoadAs(in + 4 * 11), + SafeLoadAs(in + 4 * 11) >> 26 | SafeLoadAs(in + 4 * 12) << 6, + SafeLoadAs(in + 4 * 12) >> 15 | SafeLoadAs(in + 4 * 13) << 17, }; - shifts = simd_batch{ 0, 5, 0, 0 }; + shifts = simd_batch{0, 5, 0, 0}; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 21-bit bundles 20 to 23 words = simd_batch{ - SafeLoadAs(in + 4 * 13), - SafeLoadAs(in + 4 * 13) >> 25 | SafeLoadAs(in + 4 * 14) << 7, - SafeLoadAs(in + 4 * 14) >> 14 | SafeLoadAs(in + 4 * 15) << 18, - SafeLoadAs(in + 4 * 15), + SafeLoadAs(in + 4 * 13), + SafeLoadAs(in + 4 * 13) >> 25 | SafeLoadAs(in + 4 * 14) << 7, + SafeLoadAs(in + 4 * 14) >> 14 | SafeLoadAs(in + 4 * 15) << 18, + SafeLoadAs(in + 4 * 15), }; - shifts = simd_batch{ 4, 0, 0, 3 }; + shifts = simd_batch{4, 0, 0, 3}; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 21-bit bundles 24 to 27 words = simd_batch{ - SafeLoadAs(in + 4 * 15) >> 24 | SafeLoadAs(in + 4 * 16) << 8, - SafeLoadAs(in + 4 * 16) >> 13 | SafeLoadAs(in + 4 * 17) << 19, - SafeLoadAs(in + 4 * 17), - SafeLoadAs(in + 4 * 17) >> 23 | SafeLoadAs(in + 4 * 18) << 9, + SafeLoadAs(in + 4 * 15) >> 24 | SafeLoadAs(in + 4 * 16) << 8, + SafeLoadAs(in + 4 * 16) >> 13 | SafeLoadAs(in + 4 * 17) << 19, + SafeLoadAs(in + 4 * 17), + SafeLoadAs(in + 4 * 17) >> 23 | SafeLoadAs(in + 4 * 18) << 9, }; - shifts = simd_batch{ 0, 0, 2, 0 }; + shifts = simd_batch{0, 0, 2, 0}; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 21-bit bundles 28 to 31 words = simd_batch{ - SafeLoadAs(in + 4 * 18) >> 12 | SafeLoadAs(in + 4 * 19) << 20, - SafeLoadAs(in + 4 * 19), - SafeLoadAs(in + 4 * 19) >> 22 | SafeLoadAs(in + 4 * 20) << 10, - SafeLoadAs(in + 4 * 20), + SafeLoadAs(in + 4 * 18) >> 12 | SafeLoadAs(in + 4 * 19) << 20, + SafeLoadAs(in + 4 * 19), + SafeLoadAs(in + 4 * 19) >> 22 | SafeLoadAs(in + 4 * 20) << 10, + SafeLoadAs(in + 4 * 20), }; - shifts = simd_batch{ 0, 1, 0, 11 }; + shifts = simd_batch{0, 1, 0, 11}; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; @@ -2322,7 +2274,7 @@ const uint8_t* Simd128Unpacker::unpack<21>(const uint8_t* in, uint32_t return in; } -template<> +template <> const uint8_t* Simd128Unpacker::unpack<22>(const uint8_t* in, uint32_t* out) { constexpr uint32_t kMask = 0x3fffff; @@ -2332,96 +2284,96 @@ const uint8_t* Simd128Unpacker::unpack<22>(const uint8_t* in, uint32_t // extract 22-bit bundles 0 to 3 words = simd_batch{ - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0) >> 22 | SafeLoadAs(in + 4 * 1) << 10, - SafeLoadAs(in + 4 * 1) >> 12 | SafeLoadAs(in + 4 * 2) << 20, - SafeLoadAs(in + 4 * 2), + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0) >> 22 | SafeLoadAs(in + 4 * 1) << 10, + SafeLoadAs(in + 4 * 1) >> 12 | SafeLoadAs(in + 4 * 2) << 20, + SafeLoadAs(in + 4 * 2), }; - shifts = simd_batch{ 0, 0, 0, 2 }; + shifts = simd_batch{0, 0, 0, 2}; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 22-bit bundles 4 to 7 words = simd_batch{ - SafeLoadAs(in + 4 * 2) >> 24 | SafeLoadAs(in + 4 * 3) << 8, - SafeLoadAs(in + 4 * 3) >> 14 | SafeLoadAs(in + 4 * 4) << 18, - SafeLoadAs(in + 4 * 4), - SafeLoadAs(in + 4 * 4) >> 26 | SafeLoadAs(in + 4 * 5) << 6, + SafeLoadAs(in + 4 * 2) >> 24 | SafeLoadAs(in + 4 * 3) << 8, + SafeLoadAs(in + 4 * 3) >> 14 | SafeLoadAs(in + 4 * 4) << 18, + SafeLoadAs(in + 4 * 4), + SafeLoadAs(in + 4 * 4) >> 26 | SafeLoadAs(in + 4 * 5) << 6, }; - shifts = simd_batch{ 0, 0, 4, 0 }; + shifts = simd_batch{0, 0, 4, 0}; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 22-bit bundles 8 to 11 words = simd_batch{ - SafeLoadAs(in + 4 * 5) >> 16 | SafeLoadAs(in + 4 * 6) << 16, - SafeLoadAs(in + 4 * 6), - SafeLoadAs(in + 4 * 6) >> 28 | SafeLoadAs(in + 4 * 7) << 4, - SafeLoadAs(in + 4 * 7) >> 18 | SafeLoadAs(in + 4 * 8) << 14, + SafeLoadAs(in + 4 * 5) >> 16 | SafeLoadAs(in + 4 * 6) << 16, + SafeLoadAs(in + 4 * 6), + SafeLoadAs(in + 4 * 6) >> 28 | SafeLoadAs(in + 4 * 7) << 4, + SafeLoadAs(in + 4 * 7) >> 18 | SafeLoadAs(in + 4 * 8) << 14, }; - shifts = simd_batch{ 0, 6, 0, 0 }; + shifts = simd_batch{0, 6, 0, 0}; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 22-bit bundles 12 to 15 words = simd_batch{ - SafeLoadAs(in + 4 * 8), - SafeLoadAs(in + 4 * 8) >> 30 | SafeLoadAs(in + 4 * 9) << 2, - SafeLoadAs(in + 4 * 9) >> 20 | SafeLoadAs(in + 4 * 10) << 12, - SafeLoadAs(in + 4 * 10), + SafeLoadAs(in + 4 * 8), + SafeLoadAs(in + 4 * 8) >> 30 | SafeLoadAs(in + 4 * 9) << 2, + SafeLoadAs(in + 4 * 9) >> 20 | SafeLoadAs(in + 4 * 10) << 12, + SafeLoadAs(in + 4 * 10), }; - shifts = simd_batch{ 8, 0, 0, 10 }; + shifts = simd_batch{8, 0, 0, 10}; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 22-bit bundles 16 to 19 words = simd_batch{ - SafeLoadAs(in + 4 * 11), - SafeLoadAs(in + 4 * 11) >> 22 | SafeLoadAs(in + 4 * 12) << 10, - SafeLoadAs(in + 4 * 12) >> 12 | SafeLoadAs(in + 4 * 13) << 20, - SafeLoadAs(in + 4 * 13), + SafeLoadAs(in + 4 * 11), + SafeLoadAs(in + 4 * 11) >> 22 | SafeLoadAs(in + 4 * 12) << 10, + SafeLoadAs(in + 4 * 12) >> 12 | SafeLoadAs(in + 4 * 13) << 20, + SafeLoadAs(in + 4 * 13), }; - shifts = simd_batch{ 0, 0, 0, 2 }; + shifts = simd_batch{0, 0, 0, 2}; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 22-bit bundles 20 to 23 words = simd_batch{ - SafeLoadAs(in + 4 * 13) >> 24 | SafeLoadAs(in + 4 * 14) << 8, - SafeLoadAs(in + 4 * 14) >> 14 | SafeLoadAs(in + 4 * 15) << 18, - SafeLoadAs(in + 4 * 15), - SafeLoadAs(in + 4 * 15) >> 26 | SafeLoadAs(in + 4 * 16) << 6, + SafeLoadAs(in + 4 * 13) >> 24 | SafeLoadAs(in + 4 * 14) << 8, + SafeLoadAs(in + 4 * 14) >> 14 | SafeLoadAs(in + 4 * 15) << 18, + SafeLoadAs(in + 4 * 15), + SafeLoadAs(in + 4 * 15) >> 26 | SafeLoadAs(in + 4 * 16) << 6, }; - shifts = simd_batch{ 0, 0, 4, 0 }; + shifts = simd_batch{0, 0, 4, 0}; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 22-bit bundles 24 to 27 words = simd_batch{ - SafeLoadAs(in + 4 * 16) >> 16 | SafeLoadAs(in + 4 * 17) << 16, - SafeLoadAs(in + 4 * 17), - SafeLoadAs(in + 4 * 17) >> 28 | SafeLoadAs(in + 4 * 18) << 4, - SafeLoadAs(in + 4 * 18) >> 18 | SafeLoadAs(in + 4 * 19) << 14, + SafeLoadAs(in + 4 * 16) >> 16 | SafeLoadAs(in + 4 * 17) << 16, + SafeLoadAs(in + 4 * 17), + SafeLoadAs(in + 4 * 17) >> 28 | SafeLoadAs(in + 4 * 18) << 4, + SafeLoadAs(in + 4 * 18) >> 18 | SafeLoadAs(in + 4 * 19) << 14, }; - shifts = simd_batch{ 0, 6, 0, 0 }; + shifts = simd_batch{0, 6, 0, 0}; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 22-bit bundles 28 to 31 words = simd_batch{ - SafeLoadAs(in + 4 * 19), - SafeLoadAs(in + 4 * 19) >> 30 | SafeLoadAs(in + 4 * 20) << 2, - SafeLoadAs(in + 4 * 20) >> 20 | SafeLoadAs(in + 4 * 21) << 12, - SafeLoadAs(in + 4 * 21), + SafeLoadAs(in + 4 * 19), + SafeLoadAs(in + 4 * 19) >> 30 | SafeLoadAs(in + 4 * 20) << 2, + SafeLoadAs(in + 4 * 20) >> 20 | SafeLoadAs(in + 4 * 21) << 12, + SafeLoadAs(in + 4 * 21), }; - shifts = simd_batch{ 8, 0, 0, 10 }; + shifts = simd_batch{8, 0, 0, 10}; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; @@ -2430,7 +2382,7 @@ const uint8_t* Simd128Unpacker::unpack<22>(const uint8_t* in, uint32_t return in; } -template<> +template <> const uint8_t* Simd128Unpacker::unpack<23>(const uint8_t* in, uint32_t* out) { constexpr uint32_t kMask = 0x7fffff; @@ -2440,96 +2392,96 @@ const uint8_t* Simd128Unpacker::unpack<23>(const uint8_t* in, uint32_t // extract 23-bit bundles 0 to 3 words = simd_batch{ - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0) >> 23 | SafeLoadAs(in + 4 * 1) << 9, - SafeLoadAs(in + 4 * 1) >> 14 | SafeLoadAs(in + 4 * 2) << 18, - SafeLoadAs(in + 4 * 2), + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0) >> 23 | SafeLoadAs(in + 4 * 1) << 9, + SafeLoadAs(in + 4 * 1) >> 14 | SafeLoadAs(in + 4 * 2) << 18, + SafeLoadAs(in + 4 * 2), }; - shifts = simd_batch{ 0, 0, 0, 5 }; + shifts = simd_batch{0, 0, 0, 5}; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 23-bit bundles 4 to 7 words = simd_batch{ - SafeLoadAs(in + 4 * 2) >> 28 | SafeLoadAs(in + 4 * 3) << 4, - SafeLoadAs(in + 4 * 3) >> 19 | SafeLoadAs(in + 4 * 4) << 13, - SafeLoadAs(in + 4 * 4) >> 10 | SafeLoadAs(in + 4 * 5) << 22, - SafeLoadAs(in + 4 * 5), + SafeLoadAs(in + 4 * 2) >> 28 | SafeLoadAs(in + 4 * 3) << 4, + SafeLoadAs(in + 4 * 3) >> 19 | SafeLoadAs(in + 4 * 4) << 13, + SafeLoadAs(in + 4 * 4) >> 10 | SafeLoadAs(in + 4 * 5) << 22, + SafeLoadAs(in + 4 * 5), }; - shifts = simd_batch{ 0, 0, 0, 1 }; + shifts = simd_batch{0, 0, 0, 1}; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 23-bit bundles 8 to 11 words = simd_batch{ - SafeLoadAs(in + 4 * 5) >> 24 | SafeLoadAs(in + 4 * 6) << 8, - SafeLoadAs(in + 4 * 6) >> 15 | SafeLoadAs(in + 4 * 7) << 17, - SafeLoadAs(in + 4 * 7), - SafeLoadAs(in + 4 * 7) >> 29 | SafeLoadAs(in + 4 * 8) << 3, + SafeLoadAs(in + 4 * 5) >> 24 | SafeLoadAs(in + 4 * 6) << 8, + SafeLoadAs(in + 4 * 6) >> 15 | SafeLoadAs(in + 4 * 7) << 17, + SafeLoadAs(in + 4 * 7), + SafeLoadAs(in + 4 * 7) >> 29 | SafeLoadAs(in + 4 * 8) << 3, }; - shifts = simd_batch{ 0, 0, 6, 0 }; + shifts = simd_batch{0, 0, 6, 0}; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 23-bit bundles 12 to 15 words = simd_batch{ - SafeLoadAs(in + 4 * 8) >> 20 | SafeLoadAs(in + 4 * 9) << 12, - SafeLoadAs(in + 4 * 9) >> 11 | SafeLoadAs(in + 4 * 10) << 21, - SafeLoadAs(in + 4 * 10), - SafeLoadAs(in + 4 * 10) >> 25 | SafeLoadAs(in + 4 * 11) << 7, + SafeLoadAs(in + 4 * 8) >> 20 | SafeLoadAs(in + 4 * 9) << 12, + SafeLoadAs(in + 4 * 9) >> 11 | SafeLoadAs(in + 4 * 10) << 21, + SafeLoadAs(in + 4 * 10), + SafeLoadAs(in + 4 * 10) >> 25 | SafeLoadAs(in + 4 * 11) << 7, }; - shifts = simd_batch{ 0, 0, 2, 0 }; + shifts = simd_batch{0, 0, 2, 0}; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 23-bit bundles 16 to 19 words = simd_batch{ - SafeLoadAs(in + 4 * 11) >> 16 | SafeLoadAs(in + 4 * 12) << 16, - SafeLoadAs(in + 4 * 12), - SafeLoadAs(in + 4 * 12) >> 30 | SafeLoadAs(in + 4 * 13) << 2, - SafeLoadAs(in + 4 * 13) >> 21 | SafeLoadAs(in + 4 * 14) << 11, + SafeLoadAs(in + 4 * 11) >> 16 | SafeLoadAs(in + 4 * 12) << 16, + SafeLoadAs(in + 4 * 12), + SafeLoadAs(in + 4 * 12) >> 30 | SafeLoadAs(in + 4 * 13) << 2, + SafeLoadAs(in + 4 * 13) >> 21 | SafeLoadAs(in + 4 * 14) << 11, }; - shifts = simd_batch{ 0, 7, 0, 0 }; + shifts = simd_batch{0, 7, 0, 0}; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 23-bit bundles 20 to 23 words = simd_batch{ - SafeLoadAs(in + 4 * 14) >> 12 | SafeLoadAs(in + 4 * 15) << 20, - SafeLoadAs(in + 4 * 15), - SafeLoadAs(in + 4 * 15) >> 26 | SafeLoadAs(in + 4 * 16) << 6, - SafeLoadAs(in + 4 * 16) >> 17 | SafeLoadAs(in + 4 * 17) << 15, + SafeLoadAs(in + 4 * 14) >> 12 | SafeLoadAs(in + 4 * 15) << 20, + SafeLoadAs(in + 4 * 15), + SafeLoadAs(in + 4 * 15) >> 26 | SafeLoadAs(in + 4 * 16) << 6, + SafeLoadAs(in + 4 * 16) >> 17 | SafeLoadAs(in + 4 * 17) << 15, }; - shifts = simd_batch{ 0, 3, 0, 0 }; + shifts = simd_batch{0, 3, 0, 0}; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 23-bit bundles 24 to 27 words = simd_batch{ - SafeLoadAs(in + 4 * 17), - SafeLoadAs(in + 4 * 17) >> 31 | SafeLoadAs(in + 4 * 18) << 1, - SafeLoadAs(in + 4 * 18) >> 22 | SafeLoadAs(in + 4 * 19) << 10, - SafeLoadAs(in + 4 * 19) >> 13 | SafeLoadAs(in + 4 * 20) << 19, + SafeLoadAs(in + 4 * 17), + SafeLoadAs(in + 4 * 17) >> 31 | SafeLoadAs(in + 4 * 18) << 1, + SafeLoadAs(in + 4 * 18) >> 22 | SafeLoadAs(in + 4 * 19) << 10, + SafeLoadAs(in + 4 * 19) >> 13 | SafeLoadAs(in + 4 * 20) << 19, }; - shifts = simd_batch{ 8, 0, 0, 0 }; + shifts = simd_batch{8, 0, 0, 0}; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 23-bit bundles 28 to 31 words = simd_batch{ - SafeLoadAs(in + 4 * 20), - SafeLoadAs(in + 4 * 20) >> 27 | SafeLoadAs(in + 4 * 21) << 5, - SafeLoadAs(in + 4 * 21) >> 18 | SafeLoadAs(in + 4 * 22) << 14, - SafeLoadAs(in + 4 * 22), + SafeLoadAs(in + 4 * 20), + SafeLoadAs(in + 4 * 20) >> 27 | SafeLoadAs(in + 4 * 21) << 5, + SafeLoadAs(in + 4 * 21) >> 18 | SafeLoadAs(in + 4 * 22) << 14, + SafeLoadAs(in + 4 * 22), }; - shifts = simd_batch{ 4, 0, 0, 9 }; + shifts = simd_batch{4, 0, 0, 9}; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; @@ -2538,7 +2490,7 @@ const uint8_t* Simd128Unpacker::unpack<23>(const uint8_t* in, uint32_t return in; } -template<> +template <> const uint8_t* Simd128Unpacker::unpack<24>(const uint8_t* in, uint32_t* out) { constexpr uint32_t kMask = 0xffffff; @@ -2548,96 +2500,96 @@ const uint8_t* Simd128Unpacker::unpack<24>(const uint8_t* in, uint32_t // extract 24-bit bundles 0 to 3 words = simd_batch{ - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0) >> 24 | SafeLoadAs(in + 4 * 1) << 8, - SafeLoadAs(in + 4 * 1) >> 16 | SafeLoadAs(in + 4 * 2) << 16, - SafeLoadAs(in + 4 * 2), + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0) >> 24 | SafeLoadAs(in + 4 * 1) << 8, + SafeLoadAs(in + 4 * 1) >> 16 | SafeLoadAs(in + 4 * 2) << 16, + SafeLoadAs(in + 4 * 2), }; - shifts = simd_batch{ 0, 0, 0, 8 }; + shifts = simd_batch{0, 0, 0, 8}; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 24-bit bundles 4 to 7 words = simd_batch{ - SafeLoadAs(in + 4 * 3), - SafeLoadAs(in + 4 * 3) >> 24 | SafeLoadAs(in + 4 * 4) << 8, - SafeLoadAs(in + 4 * 4) >> 16 | SafeLoadAs(in + 4 * 5) << 16, - SafeLoadAs(in + 4 * 5), + SafeLoadAs(in + 4 * 3), + SafeLoadAs(in + 4 * 3) >> 24 | SafeLoadAs(in + 4 * 4) << 8, + SafeLoadAs(in + 4 * 4) >> 16 | SafeLoadAs(in + 4 * 5) << 16, + SafeLoadAs(in + 4 * 5), }; - shifts = simd_batch{ 0, 0, 0, 8 }; + shifts = simd_batch{0, 0, 0, 8}; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 24-bit bundles 8 to 11 words = simd_batch{ - SafeLoadAs(in + 4 * 6), - SafeLoadAs(in + 4 * 6) >> 24 | SafeLoadAs(in + 4 * 7) << 8, - SafeLoadAs(in + 4 * 7) >> 16 | SafeLoadAs(in + 4 * 8) << 16, - SafeLoadAs(in + 4 * 8), + SafeLoadAs(in + 4 * 6), + SafeLoadAs(in + 4 * 6) >> 24 | SafeLoadAs(in + 4 * 7) << 8, + SafeLoadAs(in + 4 * 7) >> 16 | SafeLoadAs(in + 4 * 8) << 16, + SafeLoadAs(in + 4 * 8), }; - shifts = simd_batch{ 0, 0, 0, 8 }; + shifts = simd_batch{0, 0, 0, 8}; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 24-bit bundles 12 to 15 words = simd_batch{ - SafeLoadAs(in + 4 * 9), - SafeLoadAs(in + 4 * 9) >> 24 | SafeLoadAs(in + 4 * 10) << 8, - SafeLoadAs(in + 4 * 10) >> 16 | SafeLoadAs(in + 4 * 11) << 16, - SafeLoadAs(in + 4 * 11), + SafeLoadAs(in + 4 * 9), + SafeLoadAs(in + 4 * 9) >> 24 | SafeLoadAs(in + 4 * 10) << 8, + SafeLoadAs(in + 4 * 10) >> 16 | SafeLoadAs(in + 4 * 11) << 16, + SafeLoadAs(in + 4 * 11), }; - shifts = simd_batch{ 0, 0, 0, 8 }; + shifts = simd_batch{0, 0, 0, 8}; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 24-bit bundles 16 to 19 words = simd_batch{ - SafeLoadAs(in + 4 * 12), - SafeLoadAs(in + 4 * 12) >> 24 | SafeLoadAs(in + 4 * 13) << 8, - SafeLoadAs(in + 4 * 13) >> 16 | SafeLoadAs(in + 4 * 14) << 16, - SafeLoadAs(in + 4 * 14), + SafeLoadAs(in + 4 * 12), + SafeLoadAs(in + 4 * 12) >> 24 | SafeLoadAs(in + 4 * 13) << 8, + SafeLoadAs(in + 4 * 13) >> 16 | SafeLoadAs(in + 4 * 14) << 16, + SafeLoadAs(in + 4 * 14), }; - shifts = simd_batch{ 0, 0, 0, 8 }; + shifts = simd_batch{0, 0, 0, 8}; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 24-bit bundles 20 to 23 words = simd_batch{ - SafeLoadAs(in + 4 * 15), - SafeLoadAs(in + 4 * 15) >> 24 | SafeLoadAs(in + 4 * 16) << 8, - SafeLoadAs(in + 4 * 16) >> 16 | SafeLoadAs(in + 4 * 17) << 16, - SafeLoadAs(in + 4 * 17), + SafeLoadAs(in + 4 * 15), + SafeLoadAs(in + 4 * 15) >> 24 | SafeLoadAs(in + 4 * 16) << 8, + SafeLoadAs(in + 4 * 16) >> 16 | SafeLoadAs(in + 4 * 17) << 16, + SafeLoadAs(in + 4 * 17), }; - shifts = simd_batch{ 0, 0, 0, 8 }; + shifts = simd_batch{0, 0, 0, 8}; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 24-bit bundles 24 to 27 words = simd_batch{ - SafeLoadAs(in + 4 * 18), - SafeLoadAs(in + 4 * 18) >> 24 | SafeLoadAs(in + 4 * 19) << 8, - SafeLoadAs(in + 4 * 19) >> 16 | SafeLoadAs(in + 4 * 20) << 16, - SafeLoadAs(in + 4 * 20), + SafeLoadAs(in + 4 * 18), + SafeLoadAs(in + 4 * 18) >> 24 | SafeLoadAs(in + 4 * 19) << 8, + SafeLoadAs(in + 4 * 19) >> 16 | SafeLoadAs(in + 4 * 20) << 16, + SafeLoadAs(in + 4 * 20), }; - shifts = simd_batch{ 0, 0, 0, 8 }; + shifts = simd_batch{0, 0, 0, 8}; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 24-bit bundles 28 to 31 words = simd_batch{ - SafeLoadAs(in + 4 * 21), - SafeLoadAs(in + 4 * 21) >> 24 | SafeLoadAs(in + 4 * 22) << 8, - SafeLoadAs(in + 4 * 22) >> 16 | SafeLoadAs(in + 4 * 23) << 16, - SafeLoadAs(in + 4 * 23), + SafeLoadAs(in + 4 * 21), + SafeLoadAs(in + 4 * 21) >> 24 | SafeLoadAs(in + 4 * 22) << 8, + SafeLoadAs(in + 4 * 22) >> 16 | SafeLoadAs(in + 4 * 23) << 16, + SafeLoadAs(in + 4 * 23), }; - shifts = simd_batch{ 0, 0, 0, 8 }; + shifts = simd_batch{0, 0, 0, 8}; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; @@ -2646,7 +2598,7 @@ const uint8_t* Simd128Unpacker::unpack<24>(const uint8_t* in, uint32_t return in; } -template<> +template <> const uint8_t* Simd128Unpacker::unpack<25>(const uint8_t* in, uint32_t* out) { constexpr uint32_t kMask = 0x1ffffff; @@ -2656,96 +2608,96 @@ const uint8_t* Simd128Unpacker::unpack<25>(const uint8_t* in, uint32_t // extract 25-bit bundles 0 to 3 words = simd_batch{ - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0) >> 25 | SafeLoadAs(in + 4 * 1) << 7, - SafeLoadAs(in + 4 * 1) >> 18 | SafeLoadAs(in + 4 * 2) << 14, - SafeLoadAs(in + 4 * 2) >> 11 | SafeLoadAs(in + 4 * 3) << 21, + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0) >> 25 | SafeLoadAs(in + 4 * 1) << 7, + SafeLoadAs(in + 4 * 1) >> 18 | SafeLoadAs(in + 4 * 2) << 14, + SafeLoadAs(in + 4 * 2) >> 11 | SafeLoadAs(in + 4 * 3) << 21, }; - shifts = simd_batch{ 0, 0, 0, 0 }; + shifts = simd_batch{0, 0, 0, 0}; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 25-bit bundles 4 to 7 words = simd_batch{ - SafeLoadAs(in + 4 * 3), - SafeLoadAs(in + 4 * 3) >> 29 | SafeLoadAs(in + 4 * 4) << 3, - SafeLoadAs(in + 4 * 4) >> 22 | SafeLoadAs(in + 4 * 5) << 10, - SafeLoadAs(in + 4 * 5) >> 15 | SafeLoadAs(in + 4 * 6) << 17, + SafeLoadAs(in + 4 * 3), + SafeLoadAs(in + 4 * 3) >> 29 | SafeLoadAs(in + 4 * 4) << 3, + SafeLoadAs(in + 4 * 4) >> 22 | SafeLoadAs(in + 4 * 5) << 10, + SafeLoadAs(in + 4 * 5) >> 15 | SafeLoadAs(in + 4 * 6) << 17, }; - shifts = simd_batch{ 4, 0, 0, 0 }; + shifts = simd_batch{4, 0, 0, 0}; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 25-bit bundles 8 to 11 words = simd_batch{ - SafeLoadAs(in + 4 * 6) >> 8 | SafeLoadAs(in + 4 * 7) << 24, - SafeLoadAs(in + 4 * 7), - SafeLoadAs(in + 4 * 7) >> 26 | SafeLoadAs(in + 4 * 8) << 6, - SafeLoadAs(in + 4 * 8) >> 19 | SafeLoadAs(in + 4 * 9) << 13, + SafeLoadAs(in + 4 * 6) >> 8 | SafeLoadAs(in + 4 * 7) << 24, + SafeLoadAs(in + 4 * 7), + SafeLoadAs(in + 4 * 7) >> 26 | SafeLoadAs(in + 4 * 8) << 6, + SafeLoadAs(in + 4 * 8) >> 19 | SafeLoadAs(in + 4 * 9) << 13, }; - shifts = simd_batch{ 0, 1, 0, 0 }; + shifts = simd_batch{0, 1, 0, 0}; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 25-bit bundles 12 to 15 words = simd_batch{ - SafeLoadAs(in + 4 * 9) >> 12 | SafeLoadAs(in + 4 * 10) << 20, - SafeLoadAs(in + 4 * 10), - SafeLoadAs(in + 4 * 10) >> 30 | SafeLoadAs(in + 4 * 11) << 2, - SafeLoadAs(in + 4 * 11) >> 23 | SafeLoadAs(in + 4 * 12) << 9, + SafeLoadAs(in + 4 * 9) >> 12 | SafeLoadAs(in + 4 * 10) << 20, + SafeLoadAs(in + 4 * 10), + SafeLoadAs(in + 4 * 10) >> 30 | SafeLoadAs(in + 4 * 11) << 2, + SafeLoadAs(in + 4 * 11) >> 23 | SafeLoadAs(in + 4 * 12) << 9, }; - shifts = simd_batch{ 0, 5, 0, 0 }; + shifts = simd_batch{0, 5, 0, 0}; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 25-bit bundles 16 to 19 words = simd_batch{ - SafeLoadAs(in + 4 * 12) >> 16 | SafeLoadAs(in + 4 * 13) << 16, - SafeLoadAs(in + 4 * 13) >> 9 | SafeLoadAs(in + 4 * 14) << 23, - SafeLoadAs(in + 4 * 14), - SafeLoadAs(in + 4 * 14) >> 27 | SafeLoadAs(in + 4 * 15) << 5, + SafeLoadAs(in + 4 * 12) >> 16 | SafeLoadAs(in + 4 * 13) << 16, + SafeLoadAs(in + 4 * 13) >> 9 | SafeLoadAs(in + 4 * 14) << 23, + SafeLoadAs(in + 4 * 14), + SafeLoadAs(in + 4 * 14) >> 27 | SafeLoadAs(in + 4 * 15) << 5, }; - shifts = simd_batch{ 0, 0, 2, 0 }; + shifts = simd_batch{0, 0, 2, 0}; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 25-bit bundles 20 to 23 words = simd_batch{ - SafeLoadAs(in + 4 * 15) >> 20 | SafeLoadAs(in + 4 * 16) << 12, - SafeLoadAs(in + 4 * 16) >> 13 | SafeLoadAs(in + 4 * 17) << 19, - SafeLoadAs(in + 4 * 17), - SafeLoadAs(in + 4 * 17) >> 31 | SafeLoadAs(in + 4 * 18) << 1, + SafeLoadAs(in + 4 * 15) >> 20 | SafeLoadAs(in + 4 * 16) << 12, + SafeLoadAs(in + 4 * 16) >> 13 | SafeLoadAs(in + 4 * 17) << 19, + SafeLoadAs(in + 4 * 17), + SafeLoadAs(in + 4 * 17) >> 31 | SafeLoadAs(in + 4 * 18) << 1, }; - shifts = simd_batch{ 0, 0, 6, 0 }; + shifts = simd_batch{0, 0, 6, 0}; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 25-bit bundles 24 to 27 words = simd_batch{ - SafeLoadAs(in + 4 * 18) >> 24 | SafeLoadAs(in + 4 * 19) << 8, - SafeLoadAs(in + 4 * 19) >> 17 | SafeLoadAs(in + 4 * 20) << 15, - SafeLoadAs(in + 4 * 20) >> 10 | SafeLoadAs(in + 4 * 21) << 22, - SafeLoadAs(in + 4 * 21), + SafeLoadAs(in + 4 * 18) >> 24 | SafeLoadAs(in + 4 * 19) << 8, + SafeLoadAs(in + 4 * 19) >> 17 | SafeLoadAs(in + 4 * 20) << 15, + SafeLoadAs(in + 4 * 20) >> 10 | SafeLoadAs(in + 4 * 21) << 22, + SafeLoadAs(in + 4 * 21), }; - shifts = simd_batch{ 0, 0, 0, 3 }; + shifts = simd_batch{0, 0, 0, 3}; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 25-bit bundles 28 to 31 words = simd_batch{ - SafeLoadAs(in + 4 * 21) >> 28 | SafeLoadAs(in + 4 * 22) << 4, - SafeLoadAs(in + 4 * 22) >> 21 | SafeLoadAs(in + 4 * 23) << 11, - SafeLoadAs(in + 4 * 23) >> 14 | SafeLoadAs(in + 4 * 24) << 18, - SafeLoadAs(in + 4 * 24), + SafeLoadAs(in + 4 * 21) >> 28 | SafeLoadAs(in + 4 * 22) << 4, + SafeLoadAs(in + 4 * 22) >> 21 | SafeLoadAs(in + 4 * 23) << 11, + SafeLoadAs(in + 4 * 23) >> 14 | SafeLoadAs(in + 4 * 24) << 18, + SafeLoadAs(in + 4 * 24), }; - shifts = simd_batch{ 0, 0, 0, 7 }; + shifts = simd_batch{0, 0, 0, 7}; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; @@ -2754,7 +2706,7 @@ const uint8_t* Simd128Unpacker::unpack<25>(const uint8_t* in, uint32_t return in; } -template<> +template <> const uint8_t* Simd128Unpacker::unpack<26>(const uint8_t* in, uint32_t* out) { constexpr uint32_t kMask = 0x3ffffff; @@ -2764,96 +2716,96 @@ const uint8_t* Simd128Unpacker::unpack<26>(const uint8_t* in, uint32_t // extract 26-bit bundles 0 to 3 words = simd_batch{ - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0) >> 26 | SafeLoadAs(in + 4 * 1) << 6, - SafeLoadAs(in + 4 * 1) >> 20 | SafeLoadAs(in + 4 * 2) << 12, - SafeLoadAs(in + 4 * 2) >> 14 | SafeLoadAs(in + 4 * 3) << 18, + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0) >> 26 | SafeLoadAs(in + 4 * 1) << 6, + SafeLoadAs(in + 4 * 1) >> 20 | SafeLoadAs(in + 4 * 2) << 12, + SafeLoadAs(in + 4 * 2) >> 14 | SafeLoadAs(in + 4 * 3) << 18, }; - shifts = simd_batch{ 0, 0, 0, 0 }; + shifts = simd_batch{0, 0, 0, 0}; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 26-bit bundles 4 to 7 words = simd_batch{ - SafeLoadAs(in + 4 * 3) >> 8 | SafeLoadAs(in + 4 * 4) << 24, - SafeLoadAs(in + 4 * 4), - SafeLoadAs(in + 4 * 4) >> 28 | SafeLoadAs(in + 4 * 5) << 4, - SafeLoadAs(in + 4 * 5) >> 22 | SafeLoadAs(in + 4 * 6) << 10, + SafeLoadAs(in + 4 * 3) >> 8 | SafeLoadAs(in + 4 * 4) << 24, + SafeLoadAs(in + 4 * 4), + SafeLoadAs(in + 4 * 4) >> 28 | SafeLoadAs(in + 4 * 5) << 4, + SafeLoadAs(in + 4 * 5) >> 22 | SafeLoadAs(in + 4 * 6) << 10, }; - shifts = simd_batch{ 0, 2, 0, 0 }; + shifts = simd_batch{0, 2, 0, 0}; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 26-bit bundles 8 to 11 words = simd_batch{ - SafeLoadAs(in + 4 * 6) >> 16 | SafeLoadAs(in + 4 * 7) << 16, - SafeLoadAs(in + 4 * 7) >> 10 | SafeLoadAs(in + 4 * 8) << 22, - SafeLoadAs(in + 4 * 8), - SafeLoadAs(in + 4 * 8) >> 30 | SafeLoadAs(in + 4 * 9) << 2, + SafeLoadAs(in + 4 * 6) >> 16 | SafeLoadAs(in + 4 * 7) << 16, + SafeLoadAs(in + 4 * 7) >> 10 | SafeLoadAs(in + 4 * 8) << 22, + SafeLoadAs(in + 4 * 8), + SafeLoadAs(in + 4 * 8) >> 30 | SafeLoadAs(in + 4 * 9) << 2, }; - shifts = simd_batch{ 0, 0, 4, 0 }; + shifts = simd_batch{0, 0, 4, 0}; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 26-bit bundles 12 to 15 words = simd_batch{ - SafeLoadAs(in + 4 * 9) >> 24 | SafeLoadAs(in + 4 * 10) << 8, - SafeLoadAs(in + 4 * 10) >> 18 | SafeLoadAs(in + 4 * 11) << 14, - SafeLoadAs(in + 4 * 11) >> 12 | SafeLoadAs(in + 4 * 12) << 20, - SafeLoadAs(in + 4 * 12), + SafeLoadAs(in + 4 * 9) >> 24 | SafeLoadAs(in + 4 * 10) << 8, + SafeLoadAs(in + 4 * 10) >> 18 | SafeLoadAs(in + 4 * 11) << 14, + SafeLoadAs(in + 4 * 11) >> 12 | SafeLoadAs(in + 4 * 12) << 20, + SafeLoadAs(in + 4 * 12), }; - shifts = simd_batch{ 0, 0, 0, 6 }; + shifts = simd_batch{0, 0, 0, 6}; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 26-bit bundles 16 to 19 words = simd_batch{ - SafeLoadAs(in + 4 * 13), - SafeLoadAs(in + 4 * 13) >> 26 | SafeLoadAs(in + 4 * 14) << 6, - SafeLoadAs(in + 4 * 14) >> 20 | SafeLoadAs(in + 4 * 15) << 12, - SafeLoadAs(in + 4 * 15) >> 14 | SafeLoadAs(in + 4 * 16) << 18, + SafeLoadAs(in + 4 * 13), + SafeLoadAs(in + 4 * 13) >> 26 | SafeLoadAs(in + 4 * 14) << 6, + SafeLoadAs(in + 4 * 14) >> 20 | SafeLoadAs(in + 4 * 15) << 12, + SafeLoadAs(in + 4 * 15) >> 14 | SafeLoadAs(in + 4 * 16) << 18, }; - shifts = simd_batch{ 0, 0, 0, 0 }; + shifts = simd_batch{0, 0, 0, 0}; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 26-bit bundles 20 to 23 words = simd_batch{ - SafeLoadAs(in + 4 * 16) >> 8 | SafeLoadAs(in + 4 * 17) << 24, - SafeLoadAs(in + 4 * 17), - SafeLoadAs(in + 4 * 17) >> 28 | SafeLoadAs(in + 4 * 18) << 4, - SafeLoadAs(in + 4 * 18) >> 22 | SafeLoadAs(in + 4 * 19) << 10, + SafeLoadAs(in + 4 * 16) >> 8 | SafeLoadAs(in + 4 * 17) << 24, + SafeLoadAs(in + 4 * 17), + SafeLoadAs(in + 4 * 17) >> 28 | SafeLoadAs(in + 4 * 18) << 4, + SafeLoadAs(in + 4 * 18) >> 22 | SafeLoadAs(in + 4 * 19) << 10, }; - shifts = simd_batch{ 0, 2, 0, 0 }; + shifts = simd_batch{0, 2, 0, 0}; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 26-bit bundles 24 to 27 words = simd_batch{ - SafeLoadAs(in + 4 * 19) >> 16 | SafeLoadAs(in + 4 * 20) << 16, - SafeLoadAs(in + 4 * 20) >> 10 | SafeLoadAs(in + 4 * 21) << 22, - SafeLoadAs(in + 4 * 21), - SafeLoadAs(in + 4 * 21) >> 30 | SafeLoadAs(in + 4 * 22) << 2, + SafeLoadAs(in + 4 * 19) >> 16 | SafeLoadAs(in + 4 * 20) << 16, + SafeLoadAs(in + 4 * 20) >> 10 | SafeLoadAs(in + 4 * 21) << 22, + SafeLoadAs(in + 4 * 21), + SafeLoadAs(in + 4 * 21) >> 30 | SafeLoadAs(in + 4 * 22) << 2, }; - shifts = simd_batch{ 0, 0, 4, 0 }; + shifts = simd_batch{0, 0, 4, 0}; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 26-bit bundles 28 to 31 words = simd_batch{ - SafeLoadAs(in + 4 * 22) >> 24 | SafeLoadAs(in + 4 * 23) << 8, - SafeLoadAs(in + 4 * 23) >> 18 | SafeLoadAs(in + 4 * 24) << 14, - SafeLoadAs(in + 4 * 24) >> 12 | SafeLoadAs(in + 4 * 25) << 20, - SafeLoadAs(in + 4 * 25), + SafeLoadAs(in + 4 * 22) >> 24 | SafeLoadAs(in + 4 * 23) << 8, + SafeLoadAs(in + 4 * 23) >> 18 | SafeLoadAs(in + 4 * 24) << 14, + SafeLoadAs(in + 4 * 24) >> 12 | SafeLoadAs(in + 4 * 25) << 20, + SafeLoadAs(in + 4 * 25), }; - shifts = simd_batch{ 0, 0, 0, 6 }; + shifts = simd_batch{0, 0, 0, 6}; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; @@ -2862,7 +2814,7 @@ const uint8_t* Simd128Unpacker::unpack<26>(const uint8_t* in, uint32_t return in; } -template<> +template <> const uint8_t* Simd128Unpacker::unpack<27>(const uint8_t* in, uint32_t* out) { constexpr uint32_t kMask = 0x7ffffff; @@ -2872,96 +2824,96 @@ const uint8_t* Simd128Unpacker::unpack<27>(const uint8_t* in, uint32_t // extract 27-bit bundles 0 to 3 words = simd_batch{ - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0) >> 27 | SafeLoadAs(in + 4 * 1) << 5, - SafeLoadAs(in + 4 * 1) >> 22 | SafeLoadAs(in + 4 * 2) << 10, - SafeLoadAs(in + 4 * 2) >> 17 | SafeLoadAs(in + 4 * 3) << 15, + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0) >> 27 | SafeLoadAs(in + 4 * 1) << 5, + SafeLoadAs(in + 4 * 1) >> 22 | SafeLoadAs(in + 4 * 2) << 10, + SafeLoadAs(in + 4 * 2) >> 17 | SafeLoadAs(in + 4 * 3) << 15, }; - shifts = simd_batch{ 0, 0, 0, 0 }; + shifts = simd_batch{0, 0, 0, 0}; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 27-bit bundles 4 to 7 words = simd_batch{ - SafeLoadAs(in + 4 * 3) >> 12 | SafeLoadAs(in + 4 * 4) << 20, - SafeLoadAs(in + 4 * 4) >> 7 | SafeLoadAs(in + 4 * 5) << 25, - SafeLoadAs(in + 4 * 5), - SafeLoadAs(in + 4 * 5) >> 29 | SafeLoadAs(in + 4 * 6) << 3, + SafeLoadAs(in + 4 * 3) >> 12 | SafeLoadAs(in + 4 * 4) << 20, + SafeLoadAs(in + 4 * 4) >> 7 | SafeLoadAs(in + 4 * 5) << 25, + SafeLoadAs(in + 4 * 5), + SafeLoadAs(in + 4 * 5) >> 29 | SafeLoadAs(in + 4 * 6) << 3, }; - shifts = simd_batch{ 0, 0, 2, 0 }; + shifts = simd_batch{0, 0, 2, 0}; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 27-bit bundles 8 to 11 words = simd_batch{ - SafeLoadAs(in + 4 * 6) >> 24 | SafeLoadAs(in + 4 * 7) << 8, - SafeLoadAs(in + 4 * 7) >> 19 | SafeLoadAs(in + 4 * 8) << 13, - SafeLoadAs(in + 4 * 8) >> 14 | SafeLoadAs(in + 4 * 9) << 18, - SafeLoadAs(in + 4 * 9) >> 9 | SafeLoadAs(in + 4 * 10) << 23, + SafeLoadAs(in + 4 * 6) >> 24 | SafeLoadAs(in + 4 * 7) << 8, + SafeLoadAs(in + 4 * 7) >> 19 | SafeLoadAs(in + 4 * 8) << 13, + SafeLoadAs(in + 4 * 8) >> 14 | SafeLoadAs(in + 4 * 9) << 18, + SafeLoadAs(in + 4 * 9) >> 9 | SafeLoadAs(in + 4 * 10) << 23, }; - shifts = simd_batch{ 0, 0, 0, 0 }; + shifts = simd_batch{0, 0, 0, 0}; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 27-bit bundles 12 to 15 words = simd_batch{ - SafeLoadAs(in + 4 * 10), - SafeLoadAs(in + 4 * 10) >> 31 | SafeLoadAs(in + 4 * 11) << 1, - SafeLoadAs(in + 4 * 11) >> 26 | SafeLoadAs(in + 4 * 12) << 6, - SafeLoadAs(in + 4 * 12) >> 21 | SafeLoadAs(in + 4 * 13) << 11, + SafeLoadAs(in + 4 * 10), + SafeLoadAs(in + 4 * 10) >> 31 | SafeLoadAs(in + 4 * 11) << 1, + SafeLoadAs(in + 4 * 11) >> 26 | SafeLoadAs(in + 4 * 12) << 6, + SafeLoadAs(in + 4 * 12) >> 21 | SafeLoadAs(in + 4 * 13) << 11, }; - shifts = simd_batch{ 4, 0, 0, 0 }; + shifts = simd_batch{4, 0, 0, 0}; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 27-bit bundles 16 to 19 words = simd_batch{ - SafeLoadAs(in + 4 * 13) >> 16 | SafeLoadAs(in + 4 * 14) << 16, - SafeLoadAs(in + 4 * 14) >> 11 | SafeLoadAs(in + 4 * 15) << 21, - SafeLoadAs(in + 4 * 15) >> 6 | SafeLoadAs(in + 4 * 16) << 26, - SafeLoadAs(in + 4 * 16), + SafeLoadAs(in + 4 * 13) >> 16 | SafeLoadAs(in + 4 * 14) << 16, + SafeLoadAs(in + 4 * 14) >> 11 | SafeLoadAs(in + 4 * 15) << 21, + SafeLoadAs(in + 4 * 15) >> 6 | SafeLoadAs(in + 4 * 16) << 26, + SafeLoadAs(in + 4 * 16), }; - shifts = simd_batch{ 0, 0, 0, 1 }; + shifts = simd_batch{0, 0, 0, 1}; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 27-bit bundles 20 to 23 words = simd_batch{ - SafeLoadAs(in + 4 * 16) >> 28 | SafeLoadAs(in + 4 * 17) << 4, - SafeLoadAs(in + 4 * 17) >> 23 | SafeLoadAs(in + 4 * 18) << 9, - SafeLoadAs(in + 4 * 18) >> 18 | SafeLoadAs(in + 4 * 19) << 14, - SafeLoadAs(in + 4 * 19) >> 13 | SafeLoadAs(in + 4 * 20) << 19, + SafeLoadAs(in + 4 * 16) >> 28 | SafeLoadAs(in + 4 * 17) << 4, + SafeLoadAs(in + 4 * 17) >> 23 | SafeLoadAs(in + 4 * 18) << 9, + SafeLoadAs(in + 4 * 18) >> 18 | SafeLoadAs(in + 4 * 19) << 14, + SafeLoadAs(in + 4 * 19) >> 13 | SafeLoadAs(in + 4 * 20) << 19, }; - shifts = simd_batch{ 0, 0, 0, 0 }; + shifts = simd_batch{0, 0, 0, 0}; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 27-bit bundles 24 to 27 words = simd_batch{ - SafeLoadAs(in + 4 * 20) >> 8 | SafeLoadAs(in + 4 * 21) << 24, - SafeLoadAs(in + 4 * 21), - SafeLoadAs(in + 4 * 21) >> 30 | SafeLoadAs(in + 4 * 22) << 2, - SafeLoadAs(in + 4 * 22) >> 25 | SafeLoadAs(in + 4 * 23) << 7, + SafeLoadAs(in + 4 * 20) >> 8 | SafeLoadAs(in + 4 * 21) << 24, + SafeLoadAs(in + 4 * 21), + SafeLoadAs(in + 4 * 21) >> 30 | SafeLoadAs(in + 4 * 22) << 2, + SafeLoadAs(in + 4 * 22) >> 25 | SafeLoadAs(in + 4 * 23) << 7, }; - shifts = simd_batch{ 0, 3, 0, 0 }; + shifts = simd_batch{0, 3, 0, 0}; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 27-bit bundles 28 to 31 words = simd_batch{ - SafeLoadAs(in + 4 * 23) >> 20 | SafeLoadAs(in + 4 * 24) << 12, - SafeLoadAs(in + 4 * 24) >> 15 | SafeLoadAs(in + 4 * 25) << 17, - SafeLoadAs(in + 4 * 25) >> 10 | SafeLoadAs(in + 4 * 26) << 22, - SafeLoadAs(in + 4 * 26), + SafeLoadAs(in + 4 * 23) >> 20 | SafeLoadAs(in + 4 * 24) << 12, + SafeLoadAs(in + 4 * 24) >> 15 | SafeLoadAs(in + 4 * 25) << 17, + SafeLoadAs(in + 4 * 25) >> 10 | SafeLoadAs(in + 4 * 26) << 22, + SafeLoadAs(in + 4 * 26), }; - shifts = simd_batch{ 0, 0, 0, 5 }; + shifts = simd_batch{0, 0, 0, 5}; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; @@ -2970,7 +2922,7 @@ const uint8_t* Simd128Unpacker::unpack<27>(const uint8_t* in, uint32_t return in; } -template<> +template <> const uint8_t* Simd128Unpacker::unpack<28>(const uint8_t* in, uint32_t* out) { constexpr uint32_t kMask = 0xfffffff; @@ -2980,96 +2932,96 @@ const uint8_t* Simd128Unpacker::unpack<28>(const uint8_t* in, uint32_t // extract 28-bit bundles 0 to 3 words = simd_batch{ - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0) >> 28 | SafeLoadAs(in + 4 * 1) << 4, - SafeLoadAs(in + 4 * 1) >> 24 | SafeLoadAs(in + 4 * 2) << 8, - SafeLoadAs(in + 4 * 2) >> 20 | SafeLoadAs(in + 4 * 3) << 12, + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0) >> 28 | SafeLoadAs(in + 4 * 1) << 4, + SafeLoadAs(in + 4 * 1) >> 24 | SafeLoadAs(in + 4 * 2) << 8, + SafeLoadAs(in + 4 * 2) >> 20 | SafeLoadAs(in + 4 * 3) << 12, }; - shifts = simd_batch{ 0, 0, 0, 0 }; + shifts = simd_batch{0, 0, 0, 0}; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 28-bit bundles 4 to 7 words = simd_batch{ - SafeLoadAs(in + 4 * 3) >> 16 | SafeLoadAs(in + 4 * 4) << 16, - SafeLoadAs(in + 4 * 4) >> 12 | SafeLoadAs(in + 4 * 5) << 20, - SafeLoadAs(in + 4 * 5) >> 8 | SafeLoadAs(in + 4 * 6) << 24, - SafeLoadAs(in + 4 * 6), + SafeLoadAs(in + 4 * 3) >> 16 | SafeLoadAs(in + 4 * 4) << 16, + SafeLoadAs(in + 4 * 4) >> 12 | SafeLoadAs(in + 4 * 5) << 20, + SafeLoadAs(in + 4 * 5) >> 8 | SafeLoadAs(in + 4 * 6) << 24, + SafeLoadAs(in + 4 * 6), }; - shifts = simd_batch{ 0, 0, 0, 4 }; + shifts = simd_batch{0, 0, 0, 4}; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 28-bit bundles 8 to 11 words = simd_batch{ - SafeLoadAs(in + 4 * 7), - SafeLoadAs(in + 4 * 7) >> 28 | SafeLoadAs(in + 4 * 8) << 4, - SafeLoadAs(in + 4 * 8) >> 24 | SafeLoadAs(in + 4 * 9) << 8, - SafeLoadAs(in + 4 * 9) >> 20 | SafeLoadAs(in + 4 * 10) << 12, + SafeLoadAs(in + 4 * 7), + SafeLoadAs(in + 4 * 7) >> 28 | SafeLoadAs(in + 4 * 8) << 4, + SafeLoadAs(in + 4 * 8) >> 24 | SafeLoadAs(in + 4 * 9) << 8, + SafeLoadAs(in + 4 * 9) >> 20 | SafeLoadAs(in + 4 * 10) << 12, }; - shifts = simd_batch{ 0, 0, 0, 0 }; + shifts = simd_batch{0, 0, 0, 0}; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 28-bit bundles 12 to 15 words = simd_batch{ - SafeLoadAs(in + 4 * 10) >> 16 | SafeLoadAs(in + 4 * 11) << 16, - SafeLoadAs(in + 4 * 11) >> 12 | SafeLoadAs(in + 4 * 12) << 20, - SafeLoadAs(in + 4 * 12) >> 8 | SafeLoadAs(in + 4 * 13) << 24, - SafeLoadAs(in + 4 * 13), + SafeLoadAs(in + 4 * 10) >> 16 | SafeLoadAs(in + 4 * 11) << 16, + SafeLoadAs(in + 4 * 11) >> 12 | SafeLoadAs(in + 4 * 12) << 20, + SafeLoadAs(in + 4 * 12) >> 8 | SafeLoadAs(in + 4 * 13) << 24, + SafeLoadAs(in + 4 * 13), }; - shifts = simd_batch{ 0, 0, 0, 4 }; + shifts = simd_batch{0, 0, 0, 4}; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 28-bit bundles 16 to 19 words = simd_batch{ - SafeLoadAs(in + 4 * 14), - SafeLoadAs(in + 4 * 14) >> 28 | SafeLoadAs(in + 4 * 15) << 4, - SafeLoadAs(in + 4 * 15) >> 24 | SafeLoadAs(in + 4 * 16) << 8, - SafeLoadAs(in + 4 * 16) >> 20 | SafeLoadAs(in + 4 * 17) << 12, + SafeLoadAs(in + 4 * 14), + SafeLoadAs(in + 4 * 14) >> 28 | SafeLoadAs(in + 4 * 15) << 4, + SafeLoadAs(in + 4 * 15) >> 24 | SafeLoadAs(in + 4 * 16) << 8, + SafeLoadAs(in + 4 * 16) >> 20 | SafeLoadAs(in + 4 * 17) << 12, }; - shifts = simd_batch{ 0, 0, 0, 0 }; + shifts = simd_batch{0, 0, 0, 0}; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 28-bit bundles 20 to 23 words = simd_batch{ - SafeLoadAs(in + 4 * 17) >> 16 | SafeLoadAs(in + 4 * 18) << 16, - SafeLoadAs(in + 4 * 18) >> 12 | SafeLoadAs(in + 4 * 19) << 20, - SafeLoadAs(in + 4 * 19) >> 8 | SafeLoadAs(in + 4 * 20) << 24, - SafeLoadAs(in + 4 * 20), + SafeLoadAs(in + 4 * 17) >> 16 | SafeLoadAs(in + 4 * 18) << 16, + SafeLoadAs(in + 4 * 18) >> 12 | SafeLoadAs(in + 4 * 19) << 20, + SafeLoadAs(in + 4 * 19) >> 8 | SafeLoadAs(in + 4 * 20) << 24, + SafeLoadAs(in + 4 * 20), }; - shifts = simd_batch{ 0, 0, 0, 4 }; + shifts = simd_batch{0, 0, 0, 4}; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 28-bit bundles 24 to 27 words = simd_batch{ - SafeLoadAs(in + 4 * 21), - SafeLoadAs(in + 4 * 21) >> 28 | SafeLoadAs(in + 4 * 22) << 4, - SafeLoadAs(in + 4 * 22) >> 24 | SafeLoadAs(in + 4 * 23) << 8, - SafeLoadAs(in + 4 * 23) >> 20 | SafeLoadAs(in + 4 * 24) << 12, + SafeLoadAs(in + 4 * 21), + SafeLoadAs(in + 4 * 21) >> 28 | SafeLoadAs(in + 4 * 22) << 4, + SafeLoadAs(in + 4 * 22) >> 24 | SafeLoadAs(in + 4 * 23) << 8, + SafeLoadAs(in + 4 * 23) >> 20 | SafeLoadAs(in + 4 * 24) << 12, }; - shifts = simd_batch{ 0, 0, 0, 0 }; + shifts = simd_batch{0, 0, 0, 0}; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 28-bit bundles 28 to 31 words = simd_batch{ - SafeLoadAs(in + 4 * 24) >> 16 | SafeLoadAs(in + 4 * 25) << 16, - SafeLoadAs(in + 4 * 25) >> 12 | SafeLoadAs(in + 4 * 26) << 20, - SafeLoadAs(in + 4 * 26) >> 8 | SafeLoadAs(in + 4 * 27) << 24, - SafeLoadAs(in + 4 * 27), + SafeLoadAs(in + 4 * 24) >> 16 | SafeLoadAs(in + 4 * 25) << 16, + SafeLoadAs(in + 4 * 25) >> 12 | SafeLoadAs(in + 4 * 26) << 20, + SafeLoadAs(in + 4 * 26) >> 8 | SafeLoadAs(in + 4 * 27) << 24, + SafeLoadAs(in + 4 * 27), }; - shifts = simd_batch{ 0, 0, 0, 4 }; + shifts = simd_batch{0, 0, 0, 4}; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; @@ -3078,7 +3030,7 @@ const uint8_t* Simd128Unpacker::unpack<28>(const uint8_t* in, uint32_t return in; } -template<> +template <> const uint8_t* Simd128Unpacker::unpack<29>(const uint8_t* in, uint32_t* out) { constexpr uint32_t kMask = 0x1fffffff; @@ -3088,96 +3040,96 @@ const uint8_t* Simd128Unpacker::unpack<29>(const uint8_t* in, uint32_t // extract 29-bit bundles 0 to 3 words = simd_batch{ - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0) >> 29 | SafeLoadAs(in + 4 * 1) << 3, - SafeLoadAs(in + 4 * 1) >> 26 | SafeLoadAs(in + 4 * 2) << 6, - SafeLoadAs(in + 4 * 2) >> 23 | SafeLoadAs(in + 4 * 3) << 9, + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0) >> 29 | SafeLoadAs(in + 4 * 1) << 3, + SafeLoadAs(in + 4 * 1) >> 26 | SafeLoadAs(in + 4 * 2) << 6, + SafeLoadAs(in + 4 * 2) >> 23 | SafeLoadAs(in + 4 * 3) << 9, }; - shifts = simd_batch{ 0, 0, 0, 0 }; + shifts = simd_batch{0, 0, 0, 0}; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 29-bit bundles 4 to 7 words = simd_batch{ - SafeLoadAs(in + 4 * 3) >> 20 | SafeLoadAs(in + 4 * 4) << 12, - SafeLoadAs(in + 4 * 4) >> 17 | SafeLoadAs(in + 4 * 5) << 15, - SafeLoadAs(in + 4 * 5) >> 14 | SafeLoadAs(in + 4 * 6) << 18, - SafeLoadAs(in + 4 * 6) >> 11 | SafeLoadAs(in + 4 * 7) << 21, + SafeLoadAs(in + 4 * 3) >> 20 | SafeLoadAs(in + 4 * 4) << 12, + SafeLoadAs(in + 4 * 4) >> 17 | SafeLoadAs(in + 4 * 5) << 15, + SafeLoadAs(in + 4 * 5) >> 14 | SafeLoadAs(in + 4 * 6) << 18, + SafeLoadAs(in + 4 * 6) >> 11 | SafeLoadAs(in + 4 * 7) << 21, }; - shifts = simd_batch{ 0, 0, 0, 0 }; + shifts = simd_batch{0, 0, 0, 0}; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 29-bit bundles 8 to 11 words = simd_batch{ - SafeLoadAs(in + 4 * 7) >> 8 | SafeLoadAs(in + 4 * 8) << 24, - SafeLoadAs(in + 4 * 8) >> 5 | SafeLoadAs(in + 4 * 9) << 27, - SafeLoadAs(in + 4 * 9), - SafeLoadAs(in + 4 * 9) >> 31 | SafeLoadAs(in + 4 * 10) << 1, + SafeLoadAs(in + 4 * 7) >> 8 | SafeLoadAs(in + 4 * 8) << 24, + SafeLoadAs(in + 4 * 8) >> 5 | SafeLoadAs(in + 4 * 9) << 27, + SafeLoadAs(in + 4 * 9), + SafeLoadAs(in + 4 * 9) >> 31 | SafeLoadAs(in + 4 * 10) << 1, }; - shifts = simd_batch{ 0, 0, 2, 0 }; + shifts = simd_batch{0, 0, 2, 0}; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 29-bit bundles 12 to 15 words = simd_batch{ - SafeLoadAs(in + 4 * 10) >> 28 | SafeLoadAs(in + 4 * 11) << 4, - SafeLoadAs(in + 4 * 11) >> 25 | SafeLoadAs(in + 4 * 12) << 7, - SafeLoadAs(in + 4 * 12) >> 22 | SafeLoadAs(in + 4 * 13) << 10, - SafeLoadAs(in + 4 * 13) >> 19 | SafeLoadAs(in + 4 * 14) << 13, + SafeLoadAs(in + 4 * 10) >> 28 | SafeLoadAs(in + 4 * 11) << 4, + SafeLoadAs(in + 4 * 11) >> 25 | SafeLoadAs(in + 4 * 12) << 7, + SafeLoadAs(in + 4 * 12) >> 22 | SafeLoadAs(in + 4 * 13) << 10, + SafeLoadAs(in + 4 * 13) >> 19 | SafeLoadAs(in + 4 * 14) << 13, }; - shifts = simd_batch{ 0, 0, 0, 0 }; + shifts = simd_batch{0, 0, 0, 0}; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 29-bit bundles 16 to 19 words = simd_batch{ - SafeLoadAs(in + 4 * 14) >> 16 | SafeLoadAs(in + 4 * 15) << 16, - SafeLoadAs(in + 4 * 15) >> 13 | SafeLoadAs(in + 4 * 16) << 19, - SafeLoadAs(in + 4 * 16) >> 10 | SafeLoadAs(in + 4 * 17) << 22, - SafeLoadAs(in + 4 * 17) >> 7 | SafeLoadAs(in + 4 * 18) << 25, + SafeLoadAs(in + 4 * 14) >> 16 | SafeLoadAs(in + 4 * 15) << 16, + SafeLoadAs(in + 4 * 15) >> 13 | SafeLoadAs(in + 4 * 16) << 19, + SafeLoadAs(in + 4 * 16) >> 10 | SafeLoadAs(in + 4 * 17) << 22, + SafeLoadAs(in + 4 * 17) >> 7 | SafeLoadAs(in + 4 * 18) << 25, }; - shifts = simd_batch{ 0, 0, 0, 0 }; + shifts = simd_batch{0, 0, 0, 0}; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 29-bit bundles 20 to 23 words = simd_batch{ - SafeLoadAs(in + 4 * 18) >> 4 | SafeLoadAs(in + 4 * 19) << 28, - SafeLoadAs(in + 4 * 19), - SafeLoadAs(in + 4 * 19) >> 30 | SafeLoadAs(in + 4 * 20) << 2, - SafeLoadAs(in + 4 * 20) >> 27 | SafeLoadAs(in + 4 * 21) << 5, + SafeLoadAs(in + 4 * 18) >> 4 | SafeLoadAs(in + 4 * 19) << 28, + SafeLoadAs(in + 4 * 19), + SafeLoadAs(in + 4 * 19) >> 30 | SafeLoadAs(in + 4 * 20) << 2, + SafeLoadAs(in + 4 * 20) >> 27 | SafeLoadAs(in + 4 * 21) << 5, }; - shifts = simd_batch{ 0, 1, 0, 0 }; + shifts = simd_batch{0, 1, 0, 0}; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 29-bit bundles 24 to 27 words = simd_batch{ - SafeLoadAs(in + 4 * 21) >> 24 | SafeLoadAs(in + 4 * 22) << 8, - SafeLoadAs(in + 4 * 22) >> 21 | SafeLoadAs(in + 4 * 23) << 11, - SafeLoadAs(in + 4 * 23) >> 18 | SafeLoadAs(in + 4 * 24) << 14, - SafeLoadAs(in + 4 * 24) >> 15 | SafeLoadAs(in + 4 * 25) << 17, + SafeLoadAs(in + 4 * 21) >> 24 | SafeLoadAs(in + 4 * 22) << 8, + SafeLoadAs(in + 4 * 22) >> 21 | SafeLoadAs(in + 4 * 23) << 11, + SafeLoadAs(in + 4 * 23) >> 18 | SafeLoadAs(in + 4 * 24) << 14, + SafeLoadAs(in + 4 * 24) >> 15 | SafeLoadAs(in + 4 * 25) << 17, }; - shifts = simd_batch{ 0, 0, 0, 0 }; + shifts = simd_batch{0, 0, 0, 0}; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 29-bit bundles 28 to 31 words = simd_batch{ - SafeLoadAs(in + 4 * 25) >> 12 | SafeLoadAs(in + 4 * 26) << 20, - SafeLoadAs(in + 4 * 26) >> 9 | SafeLoadAs(in + 4 * 27) << 23, - SafeLoadAs(in + 4 * 27) >> 6 | SafeLoadAs(in + 4 * 28) << 26, - SafeLoadAs(in + 4 * 28), + SafeLoadAs(in + 4 * 25) >> 12 | SafeLoadAs(in + 4 * 26) << 20, + SafeLoadAs(in + 4 * 26) >> 9 | SafeLoadAs(in + 4 * 27) << 23, + SafeLoadAs(in + 4 * 27) >> 6 | SafeLoadAs(in + 4 * 28) << 26, + SafeLoadAs(in + 4 * 28), }; - shifts = simd_batch{ 0, 0, 0, 3 }; + shifts = simd_batch{0, 0, 0, 3}; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; @@ -3186,7 +3138,7 @@ const uint8_t* Simd128Unpacker::unpack<29>(const uint8_t* in, uint32_t return in; } -template<> +template <> const uint8_t* Simd128Unpacker::unpack<30>(const uint8_t* in, uint32_t* out) { constexpr uint32_t kMask = 0x3fffffff; @@ -3196,96 +3148,96 @@ const uint8_t* Simd128Unpacker::unpack<30>(const uint8_t* in, uint32_t // extract 30-bit bundles 0 to 3 words = simd_batch{ - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0) >> 30 | SafeLoadAs(in + 4 * 1) << 2, - SafeLoadAs(in + 4 * 1) >> 28 | SafeLoadAs(in + 4 * 2) << 4, - SafeLoadAs(in + 4 * 2) >> 26 | SafeLoadAs(in + 4 * 3) << 6, + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0) >> 30 | SafeLoadAs(in + 4 * 1) << 2, + SafeLoadAs(in + 4 * 1) >> 28 | SafeLoadAs(in + 4 * 2) << 4, + SafeLoadAs(in + 4 * 2) >> 26 | SafeLoadAs(in + 4 * 3) << 6, }; - shifts = simd_batch{ 0, 0, 0, 0 }; + shifts = simd_batch{0, 0, 0, 0}; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 30-bit bundles 4 to 7 words = simd_batch{ - SafeLoadAs(in + 4 * 3) >> 24 | SafeLoadAs(in + 4 * 4) << 8, - SafeLoadAs(in + 4 * 4) >> 22 | SafeLoadAs(in + 4 * 5) << 10, - SafeLoadAs(in + 4 * 5) >> 20 | SafeLoadAs(in + 4 * 6) << 12, - SafeLoadAs(in + 4 * 6) >> 18 | SafeLoadAs(in + 4 * 7) << 14, + SafeLoadAs(in + 4 * 3) >> 24 | SafeLoadAs(in + 4 * 4) << 8, + SafeLoadAs(in + 4 * 4) >> 22 | SafeLoadAs(in + 4 * 5) << 10, + SafeLoadAs(in + 4 * 5) >> 20 | SafeLoadAs(in + 4 * 6) << 12, + SafeLoadAs(in + 4 * 6) >> 18 | SafeLoadAs(in + 4 * 7) << 14, }; - shifts = simd_batch{ 0, 0, 0, 0 }; + shifts = simd_batch{0, 0, 0, 0}; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 30-bit bundles 8 to 11 words = simd_batch{ - SafeLoadAs(in + 4 * 7) >> 16 | SafeLoadAs(in + 4 * 8) << 16, - SafeLoadAs(in + 4 * 8) >> 14 | SafeLoadAs(in + 4 * 9) << 18, - SafeLoadAs(in + 4 * 9) >> 12 | SafeLoadAs(in + 4 * 10) << 20, - SafeLoadAs(in + 4 * 10) >> 10 | SafeLoadAs(in + 4 * 11) << 22, + SafeLoadAs(in + 4 * 7) >> 16 | SafeLoadAs(in + 4 * 8) << 16, + SafeLoadAs(in + 4 * 8) >> 14 | SafeLoadAs(in + 4 * 9) << 18, + SafeLoadAs(in + 4 * 9) >> 12 | SafeLoadAs(in + 4 * 10) << 20, + SafeLoadAs(in + 4 * 10) >> 10 | SafeLoadAs(in + 4 * 11) << 22, }; - shifts = simd_batch{ 0, 0, 0, 0 }; + shifts = simd_batch{0, 0, 0, 0}; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 30-bit bundles 12 to 15 words = simd_batch{ - SafeLoadAs(in + 4 * 11) >> 8 | SafeLoadAs(in + 4 * 12) << 24, - SafeLoadAs(in + 4 * 12) >> 6 | SafeLoadAs(in + 4 * 13) << 26, - SafeLoadAs(in + 4 * 13) >> 4 | SafeLoadAs(in + 4 * 14) << 28, - SafeLoadAs(in + 4 * 14), + SafeLoadAs(in + 4 * 11) >> 8 | SafeLoadAs(in + 4 * 12) << 24, + SafeLoadAs(in + 4 * 12) >> 6 | SafeLoadAs(in + 4 * 13) << 26, + SafeLoadAs(in + 4 * 13) >> 4 | SafeLoadAs(in + 4 * 14) << 28, + SafeLoadAs(in + 4 * 14), }; - shifts = simd_batch{ 0, 0, 0, 2 }; + shifts = simd_batch{0, 0, 0, 2}; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 30-bit bundles 16 to 19 words = simd_batch{ - SafeLoadAs(in + 4 * 15), - SafeLoadAs(in + 4 * 15) >> 30 | SafeLoadAs(in + 4 * 16) << 2, - SafeLoadAs(in + 4 * 16) >> 28 | SafeLoadAs(in + 4 * 17) << 4, - SafeLoadAs(in + 4 * 17) >> 26 | SafeLoadAs(in + 4 * 18) << 6, + SafeLoadAs(in + 4 * 15), + SafeLoadAs(in + 4 * 15) >> 30 | SafeLoadAs(in + 4 * 16) << 2, + SafeLoadAs(in + 4 * 16) >> 28 | SafeLoadAs(in + 4 * 17) << 4, + SafeLoadAs(in + 4 * 17) >> 26 | SafeLoadAs(in + 4 * 18) << 6, }; - shifts = simd_batch{ 0, 0, 0, 0 }; + shifts = simd_batch{0, 0, 0, 0}; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 30-bit bundles 20 to 23 words = simd_batch{ - SafeLoadAs(in + 4 * 18) >> 24 | SafeLoadAs(in + 4 * 19) << 8, - SafeLoadAs(in + 4 * 19) >> 22 | SafeLoadAs(in + 4 * 20) << 10, - SafeLoadAs(in + 4 * 20) >> 20 | SafeLoadAs(in + 4 * 21) << 12, - SafeLoadAs(in + 4 * 21) >> 18 | SafeLoadAs(in + 4 * 22) << 14, + SafeLoadAs(in + 4 * 18) >> 24 | SafeLoadAs(in + 4 * 19) << 8, + SafeLoadAs(in + 4 * 19) >> 22 | SafeLoadAs(in + 4 * 20) << 10, + SafeLoadAs(in + 4 * 20) >> 20 | SafeLoadAs(in + 4 * 21) << 12, + SafeLoadAs(in + 4 * 21) >> 18 | SafeLoadAs(in + 4 * 22) << 14, }; - shifts = simd_batch{ 0, 0, 0, 0 }; + shifts = simd_batch{0, 0, 0, 0}; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 30-bit bundles 24 to 27 words = simd_batch{ - SafeLoadAs(in + 4 * 22) >> 16 | SafeLoadAs(in + 4 * 23) << 16, - SafeLoadAs(in + 4 * 23) >> 14 | SafeLoadAs(in + 4 * 24) << 18, - SafeLoadAs(in + 4 * 24) >> 12 | SafeLoadAs(in + 4 * 25) << 20, - SafeLoadAs(in + 4 * 25) >> 10 | SafeLoadAs(in + 4 * 26) << 22, + SafeLoadAs(in + 4 * 22) >> 16 | SafeLoadAs(in + 4 * 23) << 16, + SafeLoadAs(in + 4 * 23) >> 14 | SafeLoadAs(in + 4 * 24) << 18, + SafeLoadAs(in + 4 * 24) >> 12 | SafeLoadAs(in + 4 * 25) << 20, + SafeLoadAs(in + 4 * 25) >> 10 | SafeLoadAs(in + 4 * 26) << 22, }; - shifts = simd_batch{ 0, 0, 0, 0 }; + shifts = simd_batch{0, 0, 0, 0}; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 30-bit bundles 28 to 31 words = simd_batch{ - SafeLoadAs(in + 4 * 26) >> 8 | SafeLoadAs(in + 4 * 27) << 24, - SafeLoadAs(in + 4 * 27) >> 6 | SafeLoadAs(in + 4 * 28) << 26, - SafeLoadAs(in + 4 * 28) >> 4 | SafeLoadAs(in + 4 * 29) << 28, - SafeLoadAs(in + 4 * 29), + SafeLoadAs(in + 4 * 26) >> 8 | SafeLoadAs(in + 4 * 27) << 24, + SafeLoadAs(in + 4 * 27) >> 6 | SafeLoadAs(in + 4 * 28) << 26, + SafeLoadAs(in + 4 * 28) >> 4 | SafeLoadAs(in + 4 * 29) << 28, + SafeLoadAs(in + 4 * 29), }; - shifts = simd_batch{ 0, 0, 0, 2 }; + shifts = simd_batch{0, 0, 0, 2}; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; @@ -3294,7 +3246,7 @@ const uint8_t* Simd128Unpacker::unpack<30>(const uint8_t* in, uint32_t return in; } -template<> +template <> const uint8_t* Simd128Unpacker::unpack<31>(const uint8_t* in, uint32_t* out) { constexpr uint32_t kMask = 0x7fffffff; @@ -3304,96 +3256,96 @@ const uint8_t* Simd128Unpacker::unpack<31>(const uint8_t* in, uint32_t // extract 31-bit bundles 0 to 3 words = simd_batch{ - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0) >> 31 | SafeLoadAs(in + 4 * 1) << 1, - SafeLoadAs(in + 4 * 1) >> 30 | SafeLoadAs(in + 4 * 2) << 2, - SafeLoadAs(in + 4 * 2) >> 29 | SafeLoadAs(in + 4 * 3) << 3, + SafeLoadAs(in + 4 * 0), + SafeLoadAs(in + 4 * 0) >> 31 | SafeLoadAs(in + 4 * 1) << 1, + SafeLoadAs(in + 4 * 1) >> 30 | SafeLoadAs(in + 4 * 2) << 2, + SafeLoadAs(in + 4 * 2) >> 29 | SafeLoadAs(in + 4 * 3) << 3, }; - shifts = simd_batch{ 0, 0, 0, 0 }; + shifts = simd_batch{0, 0, 0, 0}; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 31-bit bundles 4 to 7 words = simd_batch{ - SafeLoadAs(in + 4 * 3) >> 28 | SafeLoadAs(in + 4 * 4) << 4, - SafeLoadAs(in + 4 * 4) >> 27 | SafeLoadAs(in + 4 * 5) << 5, - SafeLoadAs(in + 4 * 5) >> 26 | SafeLoadAs(in + 4 * 6) << 6, - SafeLoadAs(in + 4 * 6) >> 25 | SafeLoadAs(in + 4 * 7) << 7, + SafeLoadAs(in + 4 * 3) >> 28 | SafeLoadAs(in + 4 * 4) << 4, + SafeLoadAs(in + 4 * 4) >> 27 | SafeLoadAs(in + 4 * 5) << 5, + SafeLoadAs(in + 4 * 5) >> 26 | SafeLoadAs(in + 4 * 6) << 6, + SafeLoadAs(in + 4 * 6) >> 25 | SafeLoadAs(in + 4 * 7) << 7, }; - shifts = simd_batch{ 0, 0, 0, 0 }; + shifts = simd_batch{0, 0, 0, 0}; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 31-bit bundles 8 to 11 words = simd_batch{ - SafeLoadAs(in + 4 * 7) >> 24 | SafeLoadAs(in + 4 * 8) << 8, - SafeLoadAs(in + 4 * 8) >> 23 | SafeLoadAs(in + 4 * 9) << 9, - SafeLoadAs(in + 4 * 9) >> 22 | SafeLoadAs(in + 4 * 10) << 10, - SafeLoadAs(in + 4 * 10) >> 21 | SafeLoadAs(in + 4 * 11) << 11, + SafeLoadAs(in + 4 * 7) >> 24 | SafeLoadAs(in + 4 * 8) << 8, + SafeLoadAs(in + 4 * 8) >> 23 | SafeLoadAs(in + 4 * 9) << 9, + SafeLoadAs(in + 4 * 9) >> 22 | SafeLoadAs(in + 4 * 10) << 10, + SafeLoadAs(in + 4 * 10) >> 21 | SafeLoadAs(in + 4 * 11) << 11, }; - shifts = simd_batch{ 0, 0, 0, 0 }; + shifts = simd_batch{0, 0, 0, 0}; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 31-bit bundles 12 to 15 words = simd_batch{ - SafeLoadAs(in + 4 * 11) >> 20 | SafeLoadAs(in + 4 * 12) << 12, - SafeLoadAs(in + 4 * 12) >> 19 | SafeLoadAs(in + 4 * 13) << 13, - SafeLoadAs(in + 4 * 13) >> 18 | SafeLoadAs(in + 4 * 14) << 14, - SafeLoadAs(in + 4 * 14) >> 17 | SafeLoadAs(in + 4 * 15) << 15, + SafeLoadAs(in + 4 * 11) >> 20 | SafeLoadAs(in + 4 * 12) << 12, + SafeLoadAs(in + 4 * 12) >> 19 | SafeLoadAs(in + 4 * 13) << 13, + SafeLoadAs(in + 4 * 13) >> 18 | SafeLoadAs(in + 4 * 14) << 14, + SafeLoadAs(in + 4 * 14) >> 17 | SafeLoadAs(in + 4 * 15) << 15, }; - shifts = simd_batch{ 0, 0, 0, 0 }; + shifts = simd_batch{0, 0, 0, 0}; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 31-bit bundles 16 to 19 words = simd_batch{ - SafeLoadAs(in + 4 * 15) >> 16 | SafeLoadAs(in + 4 * 16) << 16, - SafeLoadAs(in + 4 * 16) >> 15 | SafeLoadAs(in + 4 * 17) << 17, - SafeLoadAs(in + 4 * 17) >> 14 | SafeLoadAs(in + 4 * 18) << 18, - SafeLoadAs(in + 4 * 18) >> 13 | SafeLoadAs(in + 4 * 19) << 19, + SafeLoadAs(in + 4 * 15) >> 16 | SafeLoadAs(in + 4 * 16) << 16, + SafeLoadAs(in + 4 * 16) >> 15 | SafeLoadAs(in + 4 * 17) << 17, + SafeLoadAs(in + 4 * 17) >> 14 | SafeLoadAs(in + 4 * 18) << 18, + SafeLoadAs(in + 4 * 18) >> 13 | SafeLoadAs(in + 4 * 19) << 19, }; - shifts = simd_batch{ 0, 0, 0, 0 }; + shifts = simd_batch{0, 0, 0, 0}; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 31-bit bundles 20 to 23 words = simd_batch{ - SafeLoadAs(in + 4 * 19) >> 12 | SafeLoadAs(in + 4 * 20) << 20, - SafeLoadAs(in + 4 * 20) >> 11 | SafeLoadAs(in + 4 * 21) << 21, - SafeLoadAs(in + 4 * 21) >> 10 | SafeLoadAs(in + 4 * 22) << 22, - SafeLoadAs(in + 4 * 22) >> 9 | SafeLoadAs(in + 4 * 23) << 23, + SafeLoadAs(in + 4 * 19) >> 12 | SafeLoadAs(in + 4 * 20) << 20, + SafeLoadAs(in + 4 * 20) >> 11 | SafeLoadAs(in + 4 * 21) << 21, + SafeLoadAs(in + 4 * 21) >> 10 | SafeLoadAs(in + 4 * 22) << 22, + SafeLoadAs(in + 4 * 22) >> 9 | SafeLoadAs(in + 4 * 23) << 23, }; - shifts = simd_batch{ 0, 0, 0, 0 }; + shifts = simd_batch{0, 0, 0, 0}; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 31-bit bundles 24 to 27 words = simd_batch{ - SafeLoadAs(in + 4 * 23) >> 8 | SafeLoadAs(in + 4 * 24) << 24, - SafeLoadAs(in + 4 * 24) >> 7 | SafeLoadAs(in + 4 * 25) << 25, - SafeLoadAs(in + 4 * 25) >> 6 | SafeLoadAs(in + 4 * 26) << 26, - SafeLoadAs(in + 4 * 26) >> 5 | SafeLoadAs(in + 4 * 27) << 27, + SafeLoadAs(in + 4 * 23) >> 8 | SafeLoadAs(in + 4 * 24) << 24, + SafeLoadAs(in + 4 * 24) >> 7 | SafeLoadAs(in + 4 * 25) << 25, + SafeLoadAs(in + 4 * 25) >> 6 | SafeLoadAs(in + 4 * 26) << 26, + SafeLoadAs(in + 4 * 26) >> 5 | SafeLoadAs(in + 4 * 27) << 27, }; - shifts = simd_batch{ 0, 0, 0, 0 }; + shifts = simd_batch{0, 0, 0, 0}; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; // extract 31-bit bundles 28 to 31 words = simd_batch{ - SafeLoadAs(in + 4 * 27) >> 4 | SafeLoadAs(in + 4 * 28) << 28, - SafeLoadAs(in + 4 * 28) >> 3 | SafeLoadAs(in + 4 * 29) << 29, - SafeLoadAs(in + 4 * 29) >> 2 | SafeLoadAs(in + 4 * 30) << 30, - SafeLoadAs(in + 4 * 30), + SafeLoadAs(in + 4 * 27) >> 4 | SafeLoadAs(in + 4 * 28) << 28, + SafeLoadAs(in + 4 * 28) >> 3 | SafeLoadAs(in + 4 * 29) << 29, + SafeLoadAs(in + 4 * 29) >> 2 | SafeLoadAs(in + 4 * 30) << 30, + SafeLoadAs(in + 4 * 30), }; - shifts = simd_batch{ 0, 0, 0, 1 }; + shifts = simd_batch{0, 0, 0, 1}; results = (words >> shifts) & masks; results.store_unaligned(out); out += 4; @@ -3402,7 +3354,7 @@ const uint8_t* Simd128Unpacker::unpack<31>(const uint8_t* in, uint32_t return in; } -template<> +template <> const uint8_t* Simd128Unpacker::unpack<32>(const uint8_t* in, uint32_t* out) { std::memcpy(out, in, 32 * sizeof(*out)); in += 4 * 32; From 122ae9c85e8f3c255704aeda844d330727284c9b Mon Sep 17 00:00:00 2001 From: AntoinePrv Date: Fri, 19 Sep 2025 16:01:48 +0200 Subject: [PATCH 26/76] WIP: new simd algo --- .../bpacking_simd128_generated_internal.h | 51 ++++++++++++++++++- 1 file changed, 50 insertions(+), 1 deletion(-) diff --git a/cpp/src/arrow/util/bpacking_simd128_generated_internal.h b/cpp/src/arrow/util/bpacking_simd128_generated_internal.h index e2ce7e6671a..0fd9b3185d5 100644 --- a/cpp/src/arrow/util/bpacking_simd128_generated_internal.h +++ b/cpp/src/arrow/util/bpacking_simd128_generated_internal.h @@ -30,7 +30,56 @@ namespace arrow::internal { -using ::arrow::util::SafeLoadAs; +template +struct SimdUnpackerForWidthTraits { + static constexpr int kOutByteSize = sizeof(Uint); + static constexpr int kOutBitSize = 8 * kOutByteSize; + static constexpr int kSimdByteSize = SimdByteSize; + static constexpr int kSimdBitSize = 8 * kSimdByteSize; + static constexpr int kSimdOutCount = kSimdByteSize / kOutByteSize; + static constexpr int kBitWidth = BitWidth; + static constexpr int kOutCountUnpacked = 32; + static constexpr int kOutBytesUnpacked = kOutCountUnpacked * kOutByteSize; + + using out_type = Uint; + using simd_batch = xsimd::make_sized_batch_t; + using simd_bytes = xsimd::make_sized_batch_t; +}; + +template +struct SimdUnpackerForWidth; + +template +struct SimdUnpackerForWidth { + using Traits = SimdUnpackerForWidthTraits; + + const uint8_t* unpack(const uint8_t* in, typename Traits::out_type* out) { + std::memset(out, 0, Traits::kOutBytesUnpacked); + return in; + } +}; + +template +struct SimdUnpackerForWidth { + using Traits = SimdUnpackerForWidthTraits; + + const uint8_t* unpack(const uint8_t* in, typename Traits::out_type* out) { + std::memcpy(out, in, Traits::kOutBytesUnpacked); + return in + Traits::kOutBytesUnpacked; + } +}; + +template +struct SimdUnpackerForWidth { + using Traits = SimdUnpackerForWidthTraits; + + const uint8_t* unpack(const uint8_t* in, typename Traits::out_type* out) { + // TODO + // https://github.com/fast-pack/LittleIntPacker/blob/master/src/horizontalpacking32.c + } +}; + +using util::SafeLoadAs; template struct Simd128Unpacker; From 8bc3da923a88af8754c89e1f1bea5d9b28593570 Mon Sep 17 00:00:00 2001 From: AntoinePrv Date: Mon, 22 Sep 2025 15:00:15 +0200 Subject: [PATCH 27/76] WIP --- .../bpacking_simd128_generated_internal.h | 3427 +---------------- 1 file changed, 107 insertions(+), 3320 deletions(-) diff --git a/cpp/src/arrow/util/bpacking_simd128_generated_internal.h b/cpp/src/arrow/util/bpacking_simd128_generated_internal.h index 0fd9b3185d5..9c0812e8f47 100644 --- a/cpp/src/arrow/util/bpacking_simd128_generated_internal.h +++ b/cpp/src/arrow/util/bpacking_simd128_generated_internal.h @@ -26,10 +26,17 @@ #include +#include "arrow/util/bit_util.h" #include "arrow/util/ubsan.h" namespace arrow::internal { +// https://github.com/fast-pack/LittleIntPacker/blob/master/src/horizontalpacking32.c +// TODO +// - No zero and full size unpack here +// - _mm_cvtepi8_epi32 +// - var rshifts no avail on SSE + template struct SimdUnpackerForWidthTraits { static constexpr int kOutByteSize = sizeof(Uint); @@ -37,7 +44,9 @@ struct SimdUnpackerForWidthTraits { static constexpr int kSimdByteSize = SimdByteSize; static constexpr int kSimdBitSize = 8 * kSimdByteSize; static constexpr int kSimdOutCount = kSimdByteSize / kOutByteSize; - static constexpr int kBitWidth = BitWidth; + static constexpr int kPackedBitSize = BitWidth; + static constexpr int kPackedMinByteSize = bit_util::BytesForBits(kPackedBitSize); + // TODO should not be here static constexpr int kOutCountUnpacked = 32; static constexpr int kOutBytesUnpacked = kOutCountUnpacked * kOutByteSize; @@ -53,7 +62,7 @@ template struct SimdUnpackerForWidth { using Traits = SimdUnpackerForWidthTraits; - const uint8_t* unpack(const uint8_t* in, typename Traits::out_type* out) { + static const uint8_t* unpack(const uint8_t* in, typename Traits::out_type* out) { std::memset(out, 0, Traits::kOutBytesUnpacked); return in; } @@ -63,3352 +72,130 @@ template struct SimdUnpackerForWidth { using Traits = SimdUnpackerForWidthTraits; - const uint8_t* unpack(const uint8_t* in, typename Traits::out_type* out) { + static const uint8_t* unpack(const uint8_t* in, typename Traits::out_type* out) { std::memcpy(out, in, Traits::kOutBytesUnpacked); return in + Traits::kOutBytesUnpacked; } }; +template +constexpr auto LowBitMask(Int count) { + if (count == 8 * sizeof(Int)) { + return ~Int{0}; + } + return (Int{1} << count) - Int{1}; +} + template struct SimdUnpackerForWidth { using Traits = SimdUnpackerForWidthTraits; + using out_type = typename Traits::out_type; + using simd_batch = typename Traits::simd_batch; + using simd_bytes = typename Traits::simd_bytes; - const uint8_t* unpack(const uint8_t* in, typename Traits::out_type* out) { - // TODO - // https://github.com/fast-pack/LittleIntPacker/blob/master/src/horizontalpacking32.c - } -}; - -using util::SafeLoadAs; - -template -struct Simd128Unpacker; - -template <> -struct Simd128Unpacker { - using out_type = uint32_t; - using simd_batch = xsimd::make_sized_batch_t; - template - using simd_batch_constants = - xsimd::batch_constant; - using simd_bytes = - xsimd::make_sized_batch_t; - template - using simd_bytes_constants = - xsimd::batch_constant; + static_assert(Traits::kOutBitSize >= 1); - static constexpr int kValuesUnpacked = 32; + template + static constexpr auto make_batch_constant() { + return xsimd::make_batch_constant(); + } - template - static const uint8_t* unpack(const uint8_t* in, uint32_t* out); -}; + template + struct ByteSwizzle { + static constexpr int get(int byte_idx, int byte_count) { + // The byte index as if all simd batch iterations were contiguous. + const int out_iter_byte_idx = byte_idx + kIteration * Traits::kSimdByteSize; + // The index of the value we are unpacking in this byte. + const int out_iter_idx = out_iter_byte_idx / Traits::kOutByteSize; + // The index within the unpacked of the byte we are unpacking. + const int out_byte_offset = out_iter_byte_idx % Traits::kOutByteSize; + // Where the packed value starts in the input. + const int in_bit_start = out_iter_idx * Traits::kPackedBitSize; + // At which byte the packed value starts in the input. + const int in_byte_start = in_bit_start / 8; + + // This is the LSB byte there is always data + if (out_byte_offset == 0) { + return in_byte_start; + } + + // Number of bits in the LSB byte of the output value that actually contain data + // about the current value. + const int bits_in_offset_0 = (in_byte_start + 1) * 8 - in_bit_start; + // Bit capacity in all lesser output bytes, accounting for shit + const int bits_in_smaller_offset = bits_in_offset_0 + (out_byte_offset - 1) * 8; + + // No more data to extract for this output value, fill with zero. + if (bits_in_smaller_offset >= Traits::kPackedBitSize) { + // X86_64 looks at bit 8 and Arm for oversized index. + return LowBitMask(8U); + } + + return in_byte_start + out_byte_offset; + } + }; -template <> -const uint8_t* Simd128Unpacker::unpack<0>(const uint8_t* in, uint32_t* out) { - std::memset(out, 0x0, 32 * sizeof(*out)); - out += 32; - return in; -} + template + struct UnpackedShift { + static constexpr int get(int out_idx, int out_count) { + // The out value index as if all simd batch iterations were contiguous. + const int out_iter_idx = out_idx + kIteration * Traits::kSimdOutCount; + // The bit index in the input where the associated output values starts. + const int in_bit_start = out_iter_idx * Traits::kPackedBitSize; + // The bit index in the input where the value starts + const int in_byte_start = in_bit_start / 8; -template -struct Reorder { - static constexpr unsigned get(unsigned i, unsigned n) { - if (i % 4 == 0) { - return K; + return in_bit_start - (8 * in_byte_start); } - return 128; - } -}; + }; -template <> -const uint8_t* Simd128Unpacker::unpack<1>(const uint8_t* in, uint32_t* out) { - constexpr uint32_t kMask = 0x1; + static const uint8_t* unpack(const uint8_t* in, out_type* out) { + constexpr out_type kMask = LowBitMask(Traits::kPackedBitSize); - constexpr auto kShifts1 = simd_batch_constants<0, 1, 2, 3>{}; - constexpr auto kShifts2 = simd_batch_constants<4, 5, 6, 7>{}; + constexpr auto kShifts0 = make_batch_constant>(); + constexpr auto kShifts1 = make_batch_constant>(); - { - auto bytes = simd_bytes::load_unaligned(in + 4 * 0); - // TODO var shifts no avail on SSE + auto bytes = simd_bytes::load_unaligned(in + Traits::kSimdOutCount * 0); { - constexpr auto kReorder = - xsimd::make_batch_constant>(); - auto numbers = xsimd::bitwise_cast(xsimd::swizzle(bytes, kReorder)); - ((numbers >> kShifts1) & kMask).store_unaligned(out + 0 * 4); - ((numbers >> kShifts2) & kMask).store_unaligned(out + 1 * 4); + constexpr auto kReorder = make_batch_constant>(); + auto numbers = xsimd::bitwise_cast(xsimd::swizzle(bytes, kReorder)); + ((numbers >> kShifts0) & kMask).store_unaligned(out + 0 * Traits::kSimdOutCount); + ((numbers >> kShifts1) & kMask).store_unaligned(out + 1 * Traits::kSimdOutCount); } { - constexpr auto kReorder = - xsimd::make_batch_constant>(); - auto numbers = xsimd::bitwise_cast(xsimd::swizzle(bytes, kReorder)); - ((numbers >> kShifts1) & kMask).store_unaligned(out + 2 * 4); - ((numbers >> kShifts2) & kMask).store_unaligned(out + 3 * 4); + constexpr auto kReorder = make_batch_constant>(); + auto numbers = xsimd::bitwise_cast(xsimd::swizzle(bytes, kReorder)); + ((numbers >> kShifts0) & kMask).store_unaligned(out + 2 * Traits::kSimdOutCount); + ((numbers >> kShifts1) & kMask).store_unaligned(out + 3 * Traits::kSimdOutCount); } { - constexpr auto kReorder = - xsimd::make_batch_constant>(); - auto numbers = xsimd::bitwise_cast(xsimd::swizzle(bytes, kReorder)); - ((numbers >> kShifts1) & kMask).store_unaligned(out + 4 * 4); - ((numbers >> kShifts2) & kMask).store_unaligned(out + 5 * 4); + constexpr auto kReorder = make_batch_constant>(); + auto numbers = xsimd::bitwise_cast(xsimd::swizzle(bytes, kReorder)); + ((numbers >> kShifts0) & kMask).store_unaligned(out + 4 * Traits::kSimdOutCount); + ((numbers >> kShifts1) & kMask).store_unaligned(out + 5 * Traits::kSimdOutCount); } { - constexpr auto kReorder = - xsimd::make_batch_constant>(); - auto numbers = xsimd::bitwise_cast(xsimd::swizzle(bytes, kReorder)); - ((numbers >> kShifts1) & kMask).store_unaligned(out + 6 * 4); - ((numbers >> kShifts2) & kMask).store_unaligned(out + 7 * 4); + constexpr auto kReorder = make_batch_constant>(); + auto numbers = xsimd::bitwise_cast(xsimd::swizzle(bytes, kReorder)); + ((numbers >> kShifts0) & kMask).store_unaligned(out + 6 * Traits::kSimdOutCount); + ((numbers >> kShifts1) & kMask).store_unaligned(out + 7 * Traits::kSimdOutCount); } - } - - return in + 4; -} - -template <> -const uint8_t* Simd128Unpacker::unpack<2>(const uint8_t* in, uint32_t* out) { - constexpr uint32_t kMask = 0x3; - - simd_batch masks(kMask); - simd_batch words, shifts; - simd_batch results; - - // extract 2-bit bundles 0 to 3 - words = simd_batch{ - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0), - }; - shifts = simd_batch{0, 2, 4, 6}; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 2-bit bundles 4 to 7 - words = simd_batch{ - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0), - }; - shifts = simd_batch{8, 10, 12, 14}; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 2-bit bundles 8 to 11 - words = simd_batch{ - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0), - }; - shifts = simd_batch{16, 18, 20, 22}; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 2-bit bundles 12 to 15 - words = simd_batch{ - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0), - }; - shifts = simd_batch{24, 26, 28, 30}; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 2-bit bundles 16 to 19 - words = simd_batch{ - SafeLoadAs(in + 4 * 1), - SafeLoadAs(in + 4 * 1), - SafeLoadAs(in + 4 * 1), - SafeLoadAs(in + 4 * 1), - }; - shifts = simd_batch{0, 2, 4, 6}; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 2-bit bundles 20 to 23 - words = simd_batch{ - SafeLoadAs(in + 4 * 1), - SafeLoadAs(in + 4 * 1), - SafeLoadAs(in + 4 * 1), - SafeLoadAs(in + 4 * 1), - }; - shifts = simd_batch{8, 10, 12, 14}; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 2-bit bundles 24 to 27 - words = simd_batch{ - SafeLoadAs(in + 4 * 1), - SafeLoadAs(in + 4 * 1), - SafeLoadAs(in + 4 * 1), - SafeLoadAs(in + 4 * 1), - }; - shifts = simd_batch{16, 18, 20, 22}; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 2-bit bundles 28 to 31 - words = simd_batch{ - SafeLoadAs(in + 4 * 1), - SafeLoadAs(in + 4 * 1), - SafeLoadAs(in + 4 * 1), - SafeLoadAs(in + 4 * 1), - }; - shifts = simd_batch{24, 26, 28, 30}; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - in += 2 * 4; - return in; -} - -template <> -const uint8_t* Simd128Unpacker::unpack<3>(const uint8_t* in, uint32_t* out) { - constexpr uint32_t kMask = 0x7; - - simd_batch masks(kMask); - simd_batch words, shifts; - simd_batch results; - - // extract 3-bit bundles 0 to 3 - words = simd_batch{ - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0), - }; - shifts = simd_batch{0, 3, 6, 9}; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 3-bit bundles 4 to 7 - words = simd_batch{ - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0), - }; - shifts = simd_batch{12, 15, 18, 21}; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 3-bit bundles 8 to 11 - words = simd_batch{ - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0) >> 30 | SafeLoadAs(in + 4 * 1) << 2, - SafeLoadAs(in + 4 * 1), - }; - shifts = simd_batch{24, 27, 0, 1}; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 3-bit bundles 12 to 15 - words = simd_batch{ - SafeLoadAs(in + 4 * 1), - SafeLoadAs(in + 4 * 1), - SafeLoadAs(in + 4 * 1), - SafeLoadAs(in + 4 * 1), - }; - shifts = simd_batch{4, 7, 10, 13}; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 3-bit bundles 16 to 19 - words = simd_batch{ - SafeLoadAs(in + 4 * 1), - SafeLoadAs(in + 4 * 1), - SafeLoadAs(in + 4 * 1), - SafeLoadAs(in + 4 * 1), - }; - shifts = simd_batch{16, 19, 22, 25}; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 3-bit bundles 20 to 23 - words = simd_batch{ - SafeLoadAs(in + 4 * 1), - SafeLoadAs(in + 4 * 1) >> 31 | SafeLoadAs(in + 4 * 2) << 1, - SafeLoadAs(in + 4 * 2), - SafeLoadAs(in + 4 * 2), - }; - shifts = simd_batch{28, 0, 2, 5}; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 3-bit bundles 24 to 27 - words = simd_batch{ - SafeLoadAs(in + 4 * 2), - SafeLoadAs(in + 4 * 2), - SafeLoadAs(in + 4 * 2), - SafeLoadAs(in + 4 * 2), - }; - shifts = simd_batch{8, 11, 14, 17}; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 3-bit bundles 28 to 31 - words = simd_batch{ - SafeLoadAs(in + 4 * 2), - SafeLoadAs(in + 4 * 2), - SafeLoadAs(in + 4 * 2), - SafeLoadAs(in + 4 * 2), - }; - shifts = simd_batch{20, 23, 26, 29}; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - in += 3 * 4; - return in; -} - -template <> -const uint8_t* Simd128Unpacker::unpack<4>(const uint8_t* in, uint32_t* out) { - constexpr uint32_t kMask = 0xf; - - simd_batch masks(kMask); - simd_batch words, shifts; - simd_batch results; - - // extract 4-bit bundles 0 to 3 - words = simd_batch{ - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0), - }; - shifts = simd_batch{0, 4, 8, 12}; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 4-bit bundles 4 to 7 - words = simd_batch{ - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0), - }; - shifts = simd_batch{16, 20, 24, 28}; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 4-bit bundles 8 to 11 - words = simd_batch{ - SafeLoadAs(in + 4 * 1), - SafeLoadAs(in + 4 * 1), - SafeLoadAs(in + 4 * 1), - SafeLoadAs(in + 4 * 1), - }; - shifts = simd_batch{0, 4, 8, 12}; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 4-bit bundles 12 to 15 - words = simd_batch{ - SafeLoadAs(in + 4 * 1), - SafeLoadAs(in + 4 * 1), - SafeLoadAs(in + 4 * 1), - SafeLoadAs(in + 4 * 1), - }; - shifts = simd_batch{16, 20, 24, 28}; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 4-bit bundles 16 to 19 - words = simd_batch{ - SafeLoadAs(in + 4 * 2), - SafeLoadAs(in + 4 * 2), - SafeLoadAs(in + 4 * 2), - SafeLoadAs(in + 4 * 2), - }; - shifts = simd_batch{0, 4, 8, 12}; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 4-bit bundles 20 to 23 - words = simd_batch{ - SafeLoadAs(in + 4 * 2), - SafeLoadAs(in + 4 * 2), - SafeLoadAs(in + 4 * 2), - SafeLoadAs(in + 4 * 2), - }; - shifts = simd_batch{16, 20, 24, 28}; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 4-bit bundles 24 to 27 - words = simd_batch{ - SafeLoadAs(in + 4 * 3), - SafeLoadAs(in + 4 * 3), - SafeLoadAs(in + 4 * 3), - SafeLoadAs(in + 4 * 3), - }; - shifts = simd_batch{0, 4, 8, 12}; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 4-bit bundles 28 to 31 - words = simd_batch{ - SafeLoadAs(in + 4 * 3), - SafeLoadAs(in + 4 * 3), - SafeLoadAs(in + 4 * 3), - SafeLoadAs(in + 4 * 3), - }; - shifts = simd_batch{16, 20, 24, 28}; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - in += 4 * 4; - return in; -} - -template <> -const uint8_t* Simd128Unpacker::unpack<5>(const uint8_t* in, uint32_t* out) { - constexpr uint32_t kMask = 0x1f; - - simd_batch masks(kMask); - simd_batch words, shifts; - simd_batch results; - - // extract 5-bit bundles 0 to 3 - words = simd_batch{ - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0), - }; - shifts = simd_batch{0, 5, 10, 15}; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 5-bit bundles 4 to 7 - words = simd_batch{ - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0) >> 30 | SafeLoadAs(in + 4 * 1) << 2, - SafeLoadAs(in + 4 * 1), - }; - shifts = simd_batch{20, 25, 0, 3}; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 5-bit bundles 8 to 11 - words = simd_batch{ - SafeLoadAs(in + 4 * 1), - SafeLoadAs(in + 4 * 1), - SafeLoadAs(in + 4 * 1), - SafeLoadAs(in + 4 * 1), - }; - shifts = simd_batch{8, 13, 18, 23}; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 5-bit bundles 12 to 15 - words = simd_batch{ - SafeLoadAs(in + 4 * 1) >> 28 | SafeLoadAs(in + 4 * 2) << 4, - SafeLoadAs(in + 4 * 2), - SafeLoadAs(in + 4 * 2), - SafeLoadAs(in + 4 * 2), - }; - shifts = simd_batch{0, 1, 6, 11}; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 5-bit bundles 16 to 19 - words = simd_batch{ - SafeLoadAs(in + 4 * 2), - SafeLoadAs(in + 4 * 2), - SafeLoadAs(in + 4 * 2), - SafeLoadAs(in + 4 * 2) >> 31 | SafeLoadAs(in + 4 * 3) << 1, - }; - shifts = simd_batch{16, 21, 26, 0}; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 5-bit bundles 20 to 23 - words = simd_batch{ - SafeLoadAs(in + 4 * 3), - SafeLoadAs(in + 4 * 3), - SafeLoadAs(in + 4 * 3), - SafeLoadAs(in + 4 * 3), - }; - shifts = simd_batch{4, 9, 14, 19}; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 5-bit bundles 24 to 27 - words = simd_batch{ - SafeLoadAs(in + 4 * 3), - SafeLoadAs(in + 4 * 3) >> 29 | SafeLoadAs(in + 4 * 4) << 3, - SafeLoadAs(in + 4 * 4), - SafeLoadAs(in + 4 * 4), - }; - shifts = simd_batch{24, 0, 2, 7}; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 5-bit bundles 28 to 31 - words = simd_batch{ - SafeLoadAs(in + 4 * 4), - SafeLoadAs(in + 4 * 4), - SafeLoadAs(in + 4 * 4), - SafeLoadAs(in + 4 * 4), - }; - shifts = simd_batch{12, 17, 22, 27}; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - in += 5 * 4; - return in; -} - -template <> -const uint8_t* Simd128Unpacker::unpack<6>(const uint8_t* in, uint32_t* out) { - constexpr uint32_t kMask = 0x3f; - - simd_batch masks(kMask); - simd_batch words, shifts; - simd_batch results; - - // extract 6-bit bundles 0 to 3 - words = simd_batch{ - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0), - }; - shifts = simd_batch{0, 6, 12, 18}; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 6-bit bundles 4 to 7 - words = simd_batch{ - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0) >> 30 | SafeLoadAs(in + 4 * 1) << 2, - SafeLoadAs(in + 4 * 1), - SafeLoadAs(in + 4 * 1), - }; - shifts = simd_batch{24, 0, 4, 10}; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 6-bit bundles 8 to 11 - words = simd_batch{ - SafeLoadAs(in + 4 * 1), - SafeLoadAs(in + 4 * 1), - SafeLoadAs(in + 4 * 1) >> 28 | SafeLoadAs(in + 4 * 2) << 4, - SafeLoadAs(in + 4 * 2), - }; - shifts = simd_batch{16, 22, 0, 2}; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 6-bit bundles 12 to 15 - words = simd_batch{ - SafeLoadAs(in + 4 * 2), - SafeLoadAs(in + 4 * 2), - SafeLoadAs(in + 4 * 2), - SafeLoadAs(in + 4 * 2), - }; - shifts = simd_batch{8, 14, 20, 26}; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 6-bit bundles 16 to 19 - words = simd_batch{ - SafeLoadAs(in + 4 * 3), - SafeLoadAs(in + 4 * 3), - SafeLoadAs(in + 4 * 3), - SafeLoadAs(in + 4 * 3), - }; - shifts = simd_batch{0, 6, 12, 18}; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 6-bit bundles 20 to 23 - words = simd_batch{ - SafeLoadAs(in + 4 * 3), - SafeLoadAs(in + 4 * 3) >> 30 | SafeLoadAs(in + 4 * 4) << 2, - SafeLoadAs(in + 4 * 4), - SafeLoadAs(in + 4 * 4), - }; - shifts = simd_batch{24, 0, 4, 10}; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 6-bit bundles 24 to 27 - words = simd_batch{ - SafeLoadAs(in + 4 * 4), - SafeLoadAs(in + 4 * 4), - SafeLoadAs(in + 4 * 4) >> 28 | SafeLoadAs(in + 4 * 5) << 4, - SafeLoadAs(in + 4 * 5), - }; - shifts = simd_batch{16, 22, 0, 2}; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 6-bit bundles 28 to 31 - words = simd_batch{ - SafeLoadAs(in + 4 * 5), - SafeLoadAs(in + 4 * 5), - SafeLoadAs(in + 4 * 5), - SafeLoadAs(in + 4 * 5), - }; - shifts = simd_batch{8, 14, 20, 26}; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - in += 6 * 4; - return in; -} - -template <> -const uint8_t* Simd128Unpacker::unpack<7>(const uint8_t* in, uint32_t* out) { - constexpr uint32_t kMask = 0x7f; - - simd_batch masks(kMask); - simd_batch words, shifts; - simd_batch results; - - // extract 7-bit bundles 0 to 3 - words = simd_batch{ - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0), - }; - shifts = simd_batch{0, 7, 14, 21}; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 7-bit bundles 4 to 7 - words = simd_batch{ - SafeLoadAs(in + 4 * 0) >> 28 | SafeLoadAs(in + 4 * 1) << 4, - SafeLoadAs(in + 4 * 1), - SafeLoadAs(in + 4 * 1), - SafeLoadAs(in + 4 * 1), - }; - shifts = simd_batch{0, 3, 10, 17}; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 7-bit bundles 8 to 11 - words = simd_batch{ - SafeLoadAs(in + 4 * 1), - SafeLoadAs(in + 4 * 1) >> 31 | SafeLoadAs(in + 4 * 2) << 1, - SafeLoadAs(in + 4 * 2), - SafeLoadAs(in + 4 * 2), - }; - shifts = simd_batch{24, 0, 6, 13}; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 7-bit bundles 12 to 15 - words = simd_batch{ - SafeLoadAs(in + 4 * 2), - SafeLoadAs(in + 4 * 2) >> 27 | SafeLoadAs(in + 4 * 3) << 5, - SafeLoadAs(in + 4 * 3), - SafeLoadAs(in + 4 * 3), - }; - shifts = simd_batch{20, 0, 2, 9}; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 7-bit bundles 16 to 19 - words = simd_batch{ - SafeLoadAs(in + 4 * 3), - SafeLoadAs(in + 4 * 3), - SafeLoadAs(in + 4 * 3) >> 30 | SafeLoadAs(in + 4 * 4) << 2, - SafeLoadAs(in + 4 * 4), - }; - shifts = simd_batch{16, 23, 0, 5}; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 7-bit bundles 20 to 23 - words = simd_batch{ - SafeLoadAs(in + 4 * 4), - SafeLoadAs(in + 4 * 4), - SafeLoadAs(in + 4 * 4) >> 26 | SafeLoadAs(in + 4 * 5) << 6, - SafeLoadAs(in + 4 * 5), - }; - shifts = simd_batch{12, 19, 0, 1}; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 7-bit bundles 24 to 27 - words = simd_batch{ - SafeLoadAs(in + 4 * 5), - SafeLoadAs(in + 4 * 5), - SafeLoadAs(in + 4 * 5), - SafeLoadAs(in + 4 * 5) >> 29 | SafeLoadAs(in + 4 * 6) << 3, - }; - shifts = simd_batch{8, 15, 22, 0}; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 7-bit bundles 28 to 31 - words = simd_batch{ - SafeLoadAs(in + 4 * 6), - SafeLoadAs(in + 4 * 6), - SafeLoadAs(in + 4 * 6), - SafeLoadAs(in + 4 * 6), - }; - shifts = simd_batch{4, 11, 18, 25}; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - in += 7 * 4; - return in; -} - -template <> -const uint8_t* Simd128Unpacker::unpack<8>(const uint8_t* in, uint32_t* out) { - constexpr uint32_t kMask = 0xff; - - simd_batch masks(kMask); - simd_batch words, shifts; - simd_batch results; - - // extract 8-bit bundles 0 to 3 - words = simd_batch{ - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0), - }; - shifts = simd_batch{0, 8, 16, 24}; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 8-bit bundles 4 to 7 - words = simd_batch{ - SafeLoadAs(in + 4 * 1), - SafeLoadAs(in + 4 * 1), - SafeLoadAs(in + 4 * 1), - SafeLoadAs(in + 4 * 1), - }; - shifts = simd_batch{0, 8, 16, 24}; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 8-bit bundles 8 to 11 - words = simd_batch{ - SafeLoadAs(in + 4 * 2), - SafeLoadAs(in + 4 * 2), - SafeLoadAs(in + 4 * 2), - SafeLoadAs(in + 4 * 2), - }; - shifts = simd_batch{0, 8, 16, 24}; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 8-bit bundles 12 to 15 - words = simd_batch{ - SafeLoadAs(in + 4 * 3), - SafeLoadAs(in + 4 * 3), - SafeLoadAs(in + 4 * 3), - SafeLoadAs(in + 4 * 3), - }; - shifts = simd_batch{0, 8, 16, 24}; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 8-bit bundles 16 to 19 - words = simd_batch{ - SafeLoadAs(in + 4 * 4), - SafeLoadAs(in + 4 * 4), - SafeLoadAs(in + 4 * 4), - SafeLoadAs(in + 4 * 4), - }; - shifts = simd_batch{0, 8, 16, 24}; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 8-bit bundles 20 to 23 - words = simd_batch{ - SafeLoadAs(in + 4 * 5), - SafeLoadAs(in + 4 * 5), - SafeLoadAs(in + 4 * 5), - SafeLoadAs(in + 4 * 5), - }; - shifts = simd_batch{0, 8, 16, 24}; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 8-bit bundles 24 to 27 - words = simd_batch{ - SafeLoadAs(in + 4 * 6), - SafeLoadAs(in + 4 * 6), - SafeLoadAs(in + 4 * 6), - SafeLoadAs(in + 4 * 6), - }; - shifts = simd_batch{0, 8, 16, 24}; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 8-bit bundles 28 to 31 - words = simd_batch{ - SafeLoadAs(in + 4 * 7), - SafeLoadAs(in + 4 * 7), - SafeLoadAs(in + 4 * 7), - SafeLoadAs(in + 4 * 7), - }; - shifts = simd_batch{0, 8, 16, 24}; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - in += 8 * 4; - return in; -} - -template <> -const uint8_t* Simd128Unpacker::unpack<9>(const uint8_t* in, uint32_t* out) { - constexpr uint32_t kMask = 0x1ff; - - simd_batch masks(kMask); - simd_batch words, shifts; - simd_batch results; - - // extract 9-bit bundles 0 to 3 - words = simd_batch{ - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0) >> 27 | SafeLoadAs(in + 4 * 1) << 5, - }; - shifts = simd_batch{0, 9, 18, 0}; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 9-bit bundles 4 to 7 - words = simd_batch{ - SafeLoadAs(in + 4 * 1), - SafeLoadAs(in + 4 * 1), - SafeLoadAs(in + 4 * 1), - SafeLoadAs(in + 4 * 1) >> 31 | SafeLoadAs(in + 4 * 2) << 1, - }; - shifts = simd_batch{4, 13, 22, 0}; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 9-bit bundles 8 to 11 - words = simd_batch{ - SafeLoadAs(in + 4 * 2), - SafeLoadAs(in + 4 * 2), - SafeLoadAs(in + 4 * 2) >> 26 | SafeLoadAs(in + 4 * 3) << 6, - SafeLoadAs(in + 4 * 3), - }; - shifts = simd_batch{8, 17, 0, 3}; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 9-bit bundles 12 to 15 - words = simd_batch{ - SafeLoadAs(in + 4 * 3), - SafeLoadAs(in + 4 * 3), - SafeLoadAs(in + 4 * 3) >> 30 | SafeLoadAs(in + 4 * 4) << 2, - SafeLoadAs(in + 4 * 4), - }; - shifts = simd_batch{12, 21, 0, 7}; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 9-bit bundles 16 to 19 - words = simd_batch{ - SafeLoadAs(in + 4 * 4), - SafeLoadAs(in + 4 * 4) >> 25 | SafeLoadAs(in + 4 * 5) << 7, - SafeLoadAs(in + 4 * 5), - SafeLoadAs(in + 4 * 5), - }; - shifts = simd_batch{16, 0, 2, 11}; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 9-bit bundles 20 to 23 - words = simd_batch{ - SafeLoadAs(in + 4 * 5), - SafeLoadAs(in + 4 * 5) >> 29 | SafeLoadAs(in + 4 * 6) << 3, - SafeLoadAs(in + 4 * 6), - SafeLoadAs(in + 4 * 6), - }; - shifts = simd_batch{20, 0, 6, 15}; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 9-bit bundles 24 to 27 - words = simd_batch{ - SafeLoadAs(in + 4 * 6) >> 24 | SafeLoadAs(in + 4 * 7) << 8, - SafeLoadAs(in + 4 * 7), - SafeLoadAs(in + 4 * 7), - SafeLoadAs(in + 4 * 7), - }; - shifts = simd_batch{0, 1, 10, 19}; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 9-bit bundles 28 to 31 - words = simd_batch{ - SafeLoadAs(in + 4 * 7) >> 28 | SafeLoadAs(in + 4 * 8) << 4, - SafeLoadAs(in + 4 * 8), - SafeLoadAs(in + 4 * 8), - SafeLoadAs(in + 4 * 8), - }; - shifts = simd_batch{0, 5, 14, 23}; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - in += 9 * 4; - return in; -} - -template <> -const uint8_t* Simd128Unpacker::unpack<10>(const uint8_t* in, uint32_t* out) { - constexpr uint32_t kMask = 0x3ff; - - simd_batch masks(kMask); - simd_batch words, shifts; - simd_batch results; - - // extract 10-bit bundles 0 to 3 - words = simd_batch{ - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0) >> 30 | SafeLoadAs(in + 4 * 1) << 2, - }; - shifts = simd_batch{0, 10, 20, 0}; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 10-bit bundles 4 to 7 - words = simd_batch{ - SafeLoadAs(in + 4 * 1), - SafeLoadAs(in + 4 * 1), - SafeLoadAs(in + 4 * 1) >> 28 | SafeLoadAs(in + 4 * 2) << 4, - SafeLoadAs(in + 4 * 2), - }; - shifts = simd_batch{8, 18, 0, 6}; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 10-bit bundles 8 to 11 - words = simd_batch{ - SafeLoadAs(in + 4 * 2), - SafeLoadAs(in + 4 * 2) >> 26 | SafeLoadAs(in + 4 * 3) << 6, - SafeLoadAs(in + 4 * 3), - SafeLoadAs(in + 4 * 3), - }; - shifts = simd_batch{16, 0, 4, 14}; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 10-bit bundles 12 to 15 - words = simd_batch{ - SafeLoadAs(in + 4 * 3) >> 24 | SafeLoadAs(in + 4 * 4) << 8, - SafeLoadAs(in + 4 * 4), - SafeLoadAs(in + 4 * 4), - SafeLoadAs(in + 4 * 4), - }; - shifts = simd_batch{0, 2, 12, 22}; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 10-bit bundles 16 to 19 - words = simd_batch{ - SafeLoadAs(in + 4 * 5), - SafeLoadAs(in + 4 * 5), - SafeLoadAs(in + 4 * 5), - SafeLoadAs(in + 4 * 5) >> 30 | SafeLoadAs(in + 4 * 6) << 2, - }; - shifts = simd_batch{0, 10, 20, 0}; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 10-bit bundles 20 to 23 - words = simd_batch{ - SafeLoadAs(in + 4 * 6), - SafeLoadAs(in + 4 * 6), - SafeLoadAs(in + 4 * 6) >> 28 | SafeLoadAs(in + 4 * 7) << 4, - SafeLoadAs(in + 4 * 7), - }; - shifts = simd_batch{8, 18, 0, 6}; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 10-bit bundles 24 to 27 - words = simd_batch{ - SafeLoadAs(in + 4 * 7), - SafeLoadAs(in + 4 * 7) >> 26 | SafeLoadAs(in + 4 * 8) << 6, - SafeLoadAs(in + 4 * 8), - SafeLoadAs(in + 4 * 8), - }; - shifts = simd_batch{16, 0, 4, 14}; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 10-bit bundles 28 to 31 - words = simd_batch{ - SafeLoadAs(in + 4 * 8) >> 24 | SafeLoadAs(in + 4 * 9) << 8, - SafeLoadAs(in + 4 * 9), - SafeLoadAs(in + 4 * 9), - SafeLoadAs(in + 4 * 9), - }; - shifts = simd_batch{0, 2, 12, 22}; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - in += 10 * 4; - return in; -} - -template <> -const uint8_t* Simd128Unpacker::unpack<11>(const uint8_t* in, uint32_t* out) { - constexpr uint32_t kMask = 0x7ff; - - simd_batch masks(kMask); - simd_batch words, shifts; - simd_batch results; - - // extract 11-bit bundles 0 to 3 - words = simd_batch{ - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0) >> 22 | SafeLoadAs(in + 4 * 1) << 10, - SafeLoadAs(in + 4 * 1), - }; - shifts = simd_batch{0, 11, 0, 1}; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 11-bit bundles 4 to 7 - words = simd_batch{ - SafeLoadAs(in + 4 * 1), - SafeLoadAs(in + 4 * 1) >> 23 | SafeLoadAs(in + 4 * 2) << 9, - SafeLoadAs(in + 4 * 2), - SafeLoadAs(in + 4 * 2), - }; - shifts = simd_batch{12, 0, 2, 13}; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 11-bit bundles 8 to 11 - words = simd_batch{ - SafeLoadAs(in + 4 * 2) >> 24 | SafeLoadAs(in + 4 * 3) << 8, - SafeLoadAs(in + 4 * 3), - SafeLoadAs(in + 4 * 3), - SafeLoadAs(in + 4 * 3) >> 25 | SafeLoadAs(in + 4 * 4) << 7, - }; - shifts = simd_batch{0, 3, 14, 0}; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 11-bit bundles 12 to 15 - words = simd_batch{ - SafeLoadAs(in + 4 * 4), - SafeLoadAs(in + 4 * 4), - SafeLoadAs(in + 4 * 4) >> 26 | SafeLoadAs(in + 4 * 5) << 6, - SafeLoadAs(in + 4 * 5), - }; - shifts = simd_batch{4, 15, 0, 5}; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 11-bit bundles 16 to 19 - words = simd_batch{ - SafeLoadAs(in + 4 * 5), - SafeLoadAs(in + 4 * 5) >> 27 | SafeLoadAs(in + 4 * 6) << 5, - SafeLoadAs(in + 4 * 6), - SafeLoadAs(in + 4 * 6), - }; - shifts = simd_batch{16, 0, 6, 17}; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 11-bit bundles 20 to 23 - words = simd_batch{ - SafeLoadAs(in + 4 * 6) >> 28 | SafeLoadAs(in + 4 * 7) << 4, - SafeLoadAs(in + 4 * 7), - SafeLoadAs(in + 4 * 7), - SafeLoadAs(in + 4 * 7) >> 29 | SafeLoadAs(in + 4 * 8) << 3, - }; - shifts = simd_batch{0, 7, 18, 0}; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 11-bit bundles 24 to 27 - words = simd_batch{ - SafeLoadAs(in + 4 * 8), - SafeLoadAs(in + 4 * 8), - SafeLoadAs(in + 4 * 8) >> 30 | SafeLoadAs(in + 4 * 9) << 2, - SafeLoadAs(in + 4 * 9), - }; - shifts = simd_batch{8, 19, 0, 9}; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 11-bit bundles 28 to 31 - words = simd_batch{ - SafeLoadAs(in + 4 * 9), - SafeLoadAs(in + 4 * 9) >> 31 | SafeLoadAs(in + 4 * 10) << 1, - SafeLoadAs(in + 4 * 10), - SafeLoadAs(in + 4 * 10), - }; - shifts = simd_batch{20, 0, 10, 21}; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - in += 11 * 4; - return in; -} - -template <> -const uint8_t* Simd128Unpacker::unpack<12>(const uint8_t* in, uint32_t* out) { - constexpr uint32_t kMask = 0xfff; - simd_batch masks(kMask); - simd_batch words, shifts; - simd_batch results; - - // extract 12-bit bundles 0 to 3 - words = simd_batch{ - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0) >> 24 | SafeLoadAs(in + 4 * 1) << 8, - SafeLoadAs(in + 4 * 1), - }; - shifts = simd_batch{0, 12, 0, 4}; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 12-bit bundles 4 to 7 - words = simd_batch{ - SafeLoadAs(in + 4 * 1), - SafeLoadAs(in + 4 * 1) >> 28 | SafeLoadAs(in + 4 * 2) << 4, - SafeLoadAs(in + 4 * 2), - SafeLoadAs(in + 4 * 2), - }; - shifts = simd_batch{16, 0, 8, 20}; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 12-bit bundles 8 to 11 - words = simd_batch{ - SafeLoadAs(in + 4 * 3), - SafeLoadAs(in + 4 * 3), - SafeLoadAs(in + 4 * 3) >> 24 | SafeLoadAs(in + 4 * 4) << 8, - SafeLoadAs(in + 4 * 4), - }; - shifts = simd_batch{0, 12, 0, 4}; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 12-bit bundles 12 to 15 - words = simd_batch{ - SafeLoadAs(in + 4 * 4), - SafeLoadAs(in + 4 * 4) >> 28 | SafeLoadAs(in + 4 * 5) << 4, - SafeLoadAs(in + 4 * 5), - SafeLoadAs(in + 4 * 5), - }; - shifts = simd_batch{16, 0, 8, 20}; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 12-bit bundles 16 to 19 - words = simd_batch{ - SafeLoadAs(in + 4 * 6), - SafeLoadAs(in + 4 * 6), - SafeLoadAs(in + 4 * 6) >> 24 | SafeLoadAs(in + 4 * 7) << 8, - SafeLoadAs(in + 4 * 7), - }; - shifts = simd_batch{0, 12, 0, 4}; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 12-bit bundles 20 to 23 - words = simd_batch{ - SafeLoadAs(in + 4 * 7), - SafeLoadAs(in + 4 * 7) >> 28 | SafeLoadAs(in + 4 * 8) << 4, - SafeLoadAs(in + 4 * 8), - SafeLoadAs(in + 4 * 8), - }; - shifts = simd_batch{16, 0, 8, 20}; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 12-bit bundles 24 to 27 - words = simd_batch{ - SafeLoadAs(in + 4 * 9), - SafeLoadAs(in + 4 * 9), - SafeLoadAs(in + 4 * 9) >> 24 | SafeLoadAs(in + 4 * 10) << 8, - SafeLoadAs(in + 4 * 10), - }; - shifts = simd_batch{0, 12, 0, 4}; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 12-bit bundles 28 to 31 - words = simd_batch{ - SafeLoadAs(in + 4 * 10), - SafeLoadAs(in + 4 * 10) >> 28 | SafeLoadAs(in + 4 * 11) << 4, - SafeLoadAs(in + 4 * 11), - SafeLoadAs(in + 4 * 11), - }; - shifts = simd_batch{16, 0, 8, 20}; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - in += 12 * 4; - return in; -} - -template <> -const uint8_t* Simd128Unpacker::unpack<13>(const uint8_t* in, uint32_t* out) { - constexpr uint32_t kMask = 0x1fff; - - simd_batch masks(kMask); - simd_batch words, shifts; - simd_batch results; - - // extract 13-bit bundles 0 to 3 - words = simd_batch{ - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0) >> 26 | SafeLoadAs(in + 4 * 1) << 6, - SafeLoadAs(in + 4 * 1), - }; - shifts = simd_batch{0, 13, 0, 7}; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 13-bit bundles 4 to 7 - words = simd_batch{ - SafeLoadAs(in + 4 * 1) >> 20 | SafeLoadAs(in + 4 * 2) << 12, - SafeLoadAs(in + 4 * 2), - SafeLoadAs(in + 4 * 2), - SafeLoadAs(in + 4 * 2) >> 27 | SafeLoadAs(in + 4 * 3) << 5, - }; - shifts = simd_batch{0, 1, 14, 0}; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; + return in + 4; + } +}; - // extract 13-bit bundles 8 to 11 - words = simd_batch{ - SafeLoadAs(in + 4 * 3), - SafeLoadAs(in + 4 * 3) >> 21 | SafeLoadAs(in + 4 * 4) << 11, - SafeLoadAs(in + 4 * 4), - SafeLoadAs(in + 4 * 4), - }; - shifts = simd_batch{8, 0, 2, 15}; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; +// static_assert(SimdUnpackerForWidth::ByteSwizzle<0>::get(9, 16) == 1); - // extract 13-bit bundles 12 to 15 - words = simd_batch{ - SafeLoadAs(in + 4 * 4) >> 28 | SafeLoadAs(in + 4 * 5) << 4, - SafeLoadAs(in + 4 * 5), - SafeLoadAs(in + 4 * 5) >> 22 | SafeLoadAs(in + 4 * 6) << 10, - SafeLoadAs(in + 4 * 6), - }; - shifts = simd_batch{0, 9, 0, 3}; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; +template +struct Simd128Unpacker { + static constexpr int kValuesUnpacked = 32; + using out_type = Uint; - // extract 13-bit bundles 16 to 19 - words = simd_batch{ - SafeLoadAs(in + 4 * 6), - SafeLoadAs(in + 4 * 6) >> 29 | SafeLoadAs(in + 4 * 7) << 3, - SafeLoadAs(in + 4 * 7), - SafeLoadAs(in + 4 * 7) >> 23 | SafeLoadAs(in + 4 * 8) << 9, - }; - shifts = simd_batch{16, 0, 10, 0}; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 13-bit bundles 20 to 23 - words = simd_batch{ - SafeLoadAs(in + 4 * 8), - SafeLoadAs(in + 4 * 8), - SafeLoadAs(in + 4 * 8) >> 30 | SafeLoadAs(in + 4 * 9) << 2, - SafeLoadAs(in + 4 * 9), - }; - shifts = simd_batch{4, 17, 0, 11}; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 13-bit bundles 24 to 27 - words = simd_batch{ - SafeLoadAs(in + 4 * 9) >> 24 | SafeLoadAs(in + 4 * 10) << 8, - SafeLoadAs(in + 4 * 10), - SafeLoadAs(in + 4 * 10), - SafeLoadAs(in + 4 * 10) >> 31 | SafeLoadAs(in + 4 * 11) << 1, - }; - shifts = simd_batch{0, 5, 18, 0}; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 13-bit bundles 28 to 31 - words = simd_batch{ - SafeLoadAs(in + 4 * 11), - SafeLoadAs(in + 4 * 11) >> 25 | SafeLoadAs(in + 4 * 12) << 7, - SafeLoadAs(in + 4 * 12), - SafeLoadAs(in + 4 * 12), - }; - shifts = simd_batch{12, 0, 6, 19}; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - in += 13 * 4; - return in; -} - -template <> -const uint8_t* Simd128Unpacker::unpack<14>(const uint8_t* in, uint32_t* out) { - constexpr uint32_t kMask = 0x3fff; - - simd_batch masks(kMask); - simd_batch words, shifts; - simd_batch results; - - // extract 14-bit bundles 0 to 3 - words = simd_batch{ - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0) >> 28 | SafeLoadAs(in + 4 * 1) << 4, - SafeLoadAs(in + 4 * 1), - }; - shifts = simd_batch{0, 14, 0, 10}; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 14-bit bundles 4 to 7 - words = simd_batch{ - SafeLoadAs(in + 4 * 1) >> 24 | SafeLoadAs(in + 4 * 2) << 8, - SafeLoadAs(in + 4 * 2), - SafeLoadAs(in + 4 * 2) >> 20 | SafeLoadAs(in + 4 * 3) << 12, - SafeLoadAs(in + 4 * 3), - }; - shifts = simd_batch{0, 6, 0, 2}; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 14-bit bundles 8 to 11 - words = simd_batch{ - SafeLoadAs(in + 4 * 3), - SafeLoadAs(in + 4 * 3) >> 30 | SafeLoadAs(in + 4 * 4) << 2, - SafeLoadAs(in + 4 * 4), - SafeLoadAs(in + 4 * 4) >> 26 | SafeLoadAs(in + 4 * 5) << 6, - }; - shifts = simd_batch{16, 0, 12, 0}; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 14-bit bundles 12 to 15 - words = simd_batch{ - SafeLoadAs(in + 4 * 5), - SafeLoadAs(in + 4 * 5) >> 22 | SafeLoadAs(in + 4 * 6) << 10, - SafeLoadAs(in + 4 * 6), - SafeLoadAs(in + 4 * 6), - }; - shifts = simd_batch{8, 0, 4, 18}; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 14-bit bundles 16 to 19 - words = simd_batch{ - SafeLoadAs(in + 4 * 7), - SafeLoadAs(in + 4 * 7), - SafeLoadAs(in + 4 * 7) >> 28 | SafeLoadAs(in + 4 * 8) << 4, - SafeLoadAs(in + 4 * 8), - }; - shifts = simd_batch{0, 14, 0, 10}; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 14-bit bundles 20 to 23 - words = simd_batch{ - SafeLoadAs(in + 4 * 8) >> 24 | SafeLoadAs(in + 4 * 9) << 8, - SafeLoadAs(in + 4 * 9), - SafeLoadAs(in + 4 * 9) >> 20 | SafeLoadAs(in + 4 * 10) << 12, - SafeLoadAs(in + 4 * 10), - }; - shifts = simd_batch{0, 6, 0, 2}; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 14-bit bundles 24 to 27 - words = simd_batch{ - SafeLoadAs(in + 4 * 10), - SafeLoadAs(in + 4 * 10) >> 30 | SafeLoadAs(in + 4 * 11) << 2, - SafeLoadAs(in + 4 * 11), - SafeLoadAs(in + 4 * 11) >> 26 | SafeLoadAs(in + 4 * 12) << 6, - }; - shifts = simd_batch{16, 0, 12, 0}; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 14-bit bundles 28 to 31 - words = simd_batch{ - SafeLoadAs(in + 4 * 12), - SafeLoadAs(in + 4 * 12) >> 22 | SafeLoadAs(in + 4 * 13) << 10, - SafeLoadAs(in + 4 * 13), - SafeLoadAs(in + 4 * 13), - }; - shifts = simd_batch{8, 0, 4, 18}; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - in += 14 * 4; - return in; -} - -template <> -const uint8_t* Simd128Unpacker::unpack<15>(const uint8_t* in, uint32_t* out) { - constexpr uint32_t kMask = 0x7fff; - - simd_batch masks(kMask); - simd_batch words, shifts; - simd_batch results; - - // extract 15-bit bundles 0 to 3 - words = simd_batch{ - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0) >> 30 | SafeLoadAs(in + 4 * 1) << 2, - SafeLoadAs(in + 4 * 1), - }; - shifts = simd_batch{0, 15, 0, 13}; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 15-bit bundles 4 to 7 - words = simd_batch{ - SafeLoadAs(in + 4 * 1) >> 28 | SafeLoadAs(in + 4 * 2) << 4, - SafeLoadAs(in + 4 * 2), - SafeLoadAs(in + 4 * 2) >> 26 | SafeLoadAs(in + 4 * 3) << 6, - SafeLoadAs(in + 4 * 3), - }; - shifts = simd_batch{0, 11, 0, 9}; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 15-bit bundles 8 to 11 - words = simd_batch{ - SafeLoadAs(in + 4 * 3) >> 24 | SafeLoadAs(in + 4 * 4) << 8, - SafeLoadAs(in + 4 * 4), - SafeLoadAs(in + 4 * 4) >> 22 | SafeLoadAs(in + 4 * 5) << 10, - SafeLoadAs(in + 4 * 5), - }; - shifts = simd_batch{0, 7, 0, 5}; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 15-bit bundles 12 to 15 - words = simd_batch{ - SafeLoadAs(in + 4 * 5) >> 20 | SafeLoadAs(in + 4 * 6) << 12, - SafeLoadAs(in + 4 * 6), - SafeLoadAs(in + 4 * 6) >> 18 | SafeLoadAs(in + 4 * 7) << 14, - SafeLoadAs(in + 4 * 7), - }; - shifts = simd_batch{0, 3, 0, 1}; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 15-bit bundles 16 to 19 - words = simd_batch{ - SafeLoadAs(in + 4 * 7), - SafeLoadAs(in + 4 * 7) >> 31 | SafeLoadAs(in + 4 * 8) << 1, - SafeLoadAs(in + 4 * 8), - SafeLoadAs(in + 4 * 8) >> 29 | SafeLoadAs(in + 4 * 9) << 3, - }; - shifts = simd_batch{16, 0, 14, 0}; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 15-bit bundles 20 to 23 - words = simd_batch{ - SafeLoadAs(in + 4 * 9), - SafeLoadAs(in + 4 * 9) >> 27 | SafeLoadAs(in + 4 * 10) << 5, - SafeLoadAs(in + 4 * 10), - SafeLoadAs(in + 4 * 10) >> 25 | SafeLoadAs(in + 4 * 11) << 7, - }; - shifts = simd_batch{12, 0, 10, 0}; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 15-bit bundles 24 to 27 - words = simd_batch{ - SafeLoadAs(in + 4 * 11), - SafeLoadAs(in + 4 * 11) >> 23 | SafeLoadAs(in + 4 * 12) << 9, - SafeLoadAs(in + 4 * 12), - SafeLoadAs(in + 4 * 12) >> 21 | SafeLoadAs(in + 4 * 13) << 11, - }; - shifts = simd_batch{8, 0, 6, 0}; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 15-bit bundles 28 to 31 - words = simd_batch{ - SafeLoadAs(in + 4 * 13), - SafeLoadAs(in + 4 * 13) >> 19 | SafeLoadAs(in + 4 * 14) << 13, - SafeLoadAs(in + 4 * 14), - SafeLoadAs(in + 4 * 14), - }; - shifts = simd_batch{4, 0, 2, 17}; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - in += 15 * 4; - return in; -} - -template <> -const uint8_t* Simd128Unpacker::unpack<16>(const uint8_t* in, uint32_t* out) { - constexpr uint32_t kMask = 0xffff; - - simd_batch masks(kMask); - simd_batch words, shifts; - simd_batch results; - - // extract 16-bit bundles 0 to 3 - words = simd_batch{ - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 1), - SafeLoadAs(in + 4 * 1), - }; - shifts = simd_batch{0, 16, 0, 16}; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 16-bit bundles 4 to 7 - words = simd_batch{ - SafeLoadAs(in + 4 * 2), - SafeLoadAs(in + 4 * 2), - SafeLoadAs(in + 4 * 3), - SafeLoadAs(in + 4 * 3), - }; - shifts = simd_batch{0, 16, 0, 16}; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 16-bit bundles 8 to 11 - words = simd_batch{ - SafeLoadAs(in + 4 * 4), - SafeLoadAs(in + 4 * 4), - SafeLoadAs(in + 4 * 5), - SafeLoadAs(in + 4 * 5), - }; - shifts = simd_batch{0, 16, 0, 16}; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 16-bit bundles 12 to 15 - words = simd_batch{ - SafeLoadAs(in + 4 * 6), - SafeLoadAs(in + 4 * 6), - SafeLoadAs(in + 4 * 7), - SafeLoadAs(in + 4 * 7), - }; - shifts = simd_batch{0, 16, 0, 16}; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 16-bit bundles 16 to 19 - words = simd_batch{ - SafeLoadAs(in + 4 * 8), - SafeLoadAs(in + 4 * 8), - SafeLoadAs(in + 4 * 9), - SafeLoadAs(in + 4 * 9), - }; - shifts = simd_batch{0, 16, 0, 16}; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 16-bit bundles 20 to 23 - words = simd_batch{ - SafeLoadAs(in + 4 * 10), - SafeLoadAs(in + 4 * 10), - SafeLoadAs(in + 4 * 11), - SafeLoadAs(in + 4 * 11), - }; - shifts = simd_batch{0, 16, 0, 16}; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 16-bit bundles 24 to 27 - words = simd_batch{ - SafeLoadAs(in + 4 * 12), - SafeLoadAs(in + 4 * 12), - SafeLoadAs(in + 4 * 13), - SafeLoadAs(in + 4 * 13), - }; - shifts = simd_batch{0, 16, 0, 16}; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 16-bit bundles 28 to 31 - words = simd_batch{ - SafeLoadAs(in + 4 * 14), - SafeLoadAs(in + 4 * 14), - SafeLoadAs(in + 4 * 15), - SafeLoadAs(in + 4 * 15), - }; - shifts = simd_batch{0, 16, 0, 16}; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - in += 16 * 4; - return in; -} - -template <> -const uint8_t* Simd128Unpacker::unpack<17>(const uint8_t* in, uint32_t* out) { - constexpr uint32_t kMask = 0x1ffff; - - simd_batch masks(kMask); - simd_batch words, shifts; - simd_batch results; - - // extract 17-bit bundles 0 to 3 - words = simd_batch{ - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0) >> 17 | SafeLoadAs(in + 4 * 1) << 15, - SafeLoadAs(in + 4 * 1), - SafeLoadAs(in + 4 * 1) >> 19 | SafeLoadAs(in + 4 * 2) << 13, - }; - shifts = simd_batch{0, 0, 2, 0}; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 17-bit bundles 4 to 7 - words = simd_batch{ - SafeLoadAs(in + 4 * 2), - SafeLoadAs(in + 4 * 2) >> 21 | SafeLoadAs(in + 4 * 3) << 11, - SafeLoadAs(in + 4 * 3), - SafeLoadAs(in + 4 * 3) >> 23 | SafeLoadAs(in + 4 * 4) << 9, - }; - shifts = simd_batch{4, 0, 6, 0}; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 17-bit bundles 8 to 11 - words = simd_batch{ - SafeLoadAs(in + 4 * 4), - SafeLoadAs(in + 4 * 4) >> 25 | SafeLoadAs(in + 4 * 5) << 7, - SafeLoadAs(in + 4 * 5), - SafeLoadAs(in + 4 * 5) >> 27 | SafeLoadAs(in + 4 * 6) << 5, - }; - shifts = simd_batch{8, 0, 10, 0}; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 17-bit bundles 12 to 15 - words = simd_batch{ - SafeLoadAs(in + 4 * 6), - SafeLoadAs(in + 4 * 6) >> 29 | SafeLoadAs(in + 4 * 7) << 3, - SafeLoadAs(in + 4 * 7), - SafeLoadAs(in + 4 * 7) >> 31 | SafeLoadAs(in + 4 * 8) << 1, - }; - shifts = simd_batch{12, 0, 14, 0}; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 17-bit bundles 16 to 19 - words = simd_batch{ - SafeLoadAs(in + 4 * 8) >> 16 | SafeLoadAs(in + 4 * 9) << 16, - SafeLoadAs(in + 4 * 9), - SafeLoadAs(in + 4 * 9) >> 18 | SafeLoadAs(in + 4 * 10) << 14, - SafeLoadAs(in + 4 * 10), - }; - shifts = simd_batch{0, 1, 0, 3}; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 17-bit bundles 20 to 23 - words = simd_batch{ - SafeLoadAs(in + 4 * 10) >> 20 | SafeLoadAs(in + 4 * 11) << 12, - SafeLoadAs(in + 4 * 11), - SafeLoadAs(in + 4 * 11) >> 22 | SafeLoadAs(in + 4 * 12) << 10, - SafeLoadAs(in + 4 * 12), - }; - shifts = simd_batch{0, 5, 0, 7}; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 17-bit bundles 24 to 27 - words = simd_batch{ - SafeLoadAs(in + 4 * 12) >> 24 | SafeLoadAs(in + 4 * 13) << 8, - SafeLoadAs(in + 4 * 13), - SafeLoadAs(in + 4 * 13) >> 26 | SafeLoadAs(in + 4 * 14) << 6, - SafeLoadAs(in + 4 * 14), - }; - shifts = simd_batch{0, 9, 0, 11}; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 17-bit bundles 28 to 31 - words = simd_batch{ - SafeLoadAs(in + 4 * 14) >> 28 | SafeLoadAs(in + 4 * 15) << 4, - SafeLoadAs(in + 4 * 15), - SafeLoadAs(in + 4 * 15) >> 30 | SafeLoadAs(in + 4 * 16) << 2, - SafeLoadAs(in + 4 * 16), - }; - shifts = simd_batch{0, 13, 0, 15}; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - in += 17 * 4; - return in; -} - -template <> -const uint8_t* Simd128Unpacker::unpack<18>(const uint8_t* in, uint32_t* out) { - constexpr uint32_t kMask = 0x3ffff; - - simd_batch masks(kMask); - simd_batch words, shifts; - simd_batch results; - - // extract 18-bit bundles 0 to 3 - words = simd_batch{ - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0) >> 18 | SafeLoadAs(in + 4 * 1) << 14, - SafeLoadAs(in + 4 * 1), - SafeLoadAs(in + 4 * 1) >> 22 | SafeLoadAs(in + 4 * 2) << 10, - }; - shifts = simd_batch{0, 0, 4, 0}; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 18-bit bundles 4 to 7 - words = simd_batch{ - SafeLoadAs(in + 4 * 2), - SafeLoadAs(in + 4 * 2) >> 26 | SafeLoadAs(in + 4 * 3) << 6, - SafeLoadAs(in + 4 * 3), - SafeLoadAs(in + 4 * 3) >> 30 | SafeLoadAs(in + 4 * 4) << 2, - }; - shifts = simd_batch{8, 0, 12, 0}; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 18-bit bundles 8 to 11 - words = simd_batch{ - SafeLoadAs(in + 4 * 4) >> 16 | SafeLoadAs(in + 4 * 5) << 16, - SafeLoadAs(in + 4 * 5), - SafeLoadAs(in + 4 * 5) >> 20 | SafeLoadAs(in + 4 * 6) << 12, - SafeLoadAs(in + 4 * 6), - }; - shifts = simd_batch{0, 2, 0, 6}; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 18-bit bundles 12 to 15 - words = simd_batch{ - SafeLoadAs(in + 4 * 6) >> 24 | SafeLoadAs(in + 4 * 7) << 8, - SafeLoadAs(in + 4 * 7), - SafeLoadAs(in + 4 * 7) >> 28 | SafeLoadAs(in + 4 * 8) << 4, - SafeLoadAs(in + 4 * 8), - }; - shifts = simd_batch{0, 10, 0, 14}; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 18-bit bundles 16 to 19 - words = simd_batch{ - SafeLoadAs(in + 4 * 9), - SafeLoadAs(in + 4 * 9) >> 18 | SafeLoadAs(in + 4 * 10) << 14, - SafeLoadAs(in + 4 * 10), - SafeLoadAs(in + 4 * 10) >> 22 | SafeLoadAs(in + 4 * 11) << 10, - }; - shifts = simd_batch{0, 0, 4, 0}; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 18-bit bundles 20 to 23 - words = simd_batch{ - SafeLoadAs(in + 4 * 11), - SafeLoadAs(in + 4 * 11) >> 26 | SafeLoadAs(in + 4 * 12) << 6, - SafeLoadAs(in + 4 * 12), - SafeLoadAs(in + 4 * 12) >> 30 | SafeLoadAs(in + 4 * 13) << 2, - }; - shifts = simd_batch{8, 0, 12, 0}; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 18-bit bundles 24 to 27 - words = simd_batch{ - SafeLoadAs(in + 4 * 13) >> 16 | SafeLoadAs(in + 4 * 14) << 16, - SafeLoadAs(in + 4 * 14), - SafeLoadAs(in + 4 * 14) >> 20 | SafeLoadAs(in + 4 * 15) << 12, - SafeLoadAs(in + 4 * 15), - }; - shifts = simd_batch{0, 2, 0, 6}; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 18-bit bundles 28 to 31 - words = simd_batch{ - SafeLoadAs(in + 4 * 15) >> 24 | SafeLoadAs(in + 4 * 16) << 8, - SafeLoadAs(in + 4 * 16), - SafeLoadAs(in + 4 * 16) >> 28 | SafeLoadAs(in + 4 * 17) << 4, - SafeLoadAs(in + 4 * 17), - }; - shifts = simd_batch{0, 10, 0, 14}; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - in += 18 * 4; - return in; -} - -template <> -const uint8_t* Simd128Unpacker::unpack<19>(const uint8_t* in, uint32_t* out) { - constexpr uint32_t kMask = 0x7ffff; - - simd_batch masks(kMask); - simd_batch words, shifts; - simd_batch results; - - // extract 19-bit bundles 0 to 3 - words = simd_batch{ - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0) >> 19 | SafeLoadAs(in + 4 * 1) << 13, - SafeLoadAs(in + 4 * 1), - SafeLoadAs(in + 4 * 1) >> 25 | SafeLoadAs(in + 4 * 2) << 7, - }; - shifts = simd_batch{0, 0, 6, 0}; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 19-bit bundles 4 to 7 - words = simd_batch{ - SafeLoadAs(in + 4 * 2), - SafeLoadAs(in + 4 * 2) >> 31 | SafeLoadAs(in + 4 * 3) << 1, - SafeLoadAs(in + 4 * 3) >> 18 | SafeLoadAs(in + 4 * 4) << 14, - SafeLoadAs(in + 4 * 4), - }; - shifts = simd_batch{12, 0, 0, 5}; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 19-bit bundles 8 to 11 - words = simd_batch{ - SafeLoadAs(in + 4 * 4) >> 24 | SafeLoadAs(in + 4 * 5) << 8, - SafeLoadAs(in + 4 * 5), - SafeLoadAs(in + 4 * 5) >> 30 | SafeLoadAs(in + 4 * 6) << 2, - SafeLoadAs(in + 4 * 6) >> 17 | SafeLoadAs(in + 4 * 7) << 15, - }; - shifts = simd_batch{0, 11, 0, 0}; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 19-bit bundles 12 to 15 - words = simd_batch{ - SafeLoadAs(in + 4 * 7), - SafeLoadAs(in + 4 * 7) >> 23 | SafeLoadAs(in + 4 * 8) << 9, - SafeLoadAs(in + 4 * 8), - SafeLoadAs(in + 4 * 8) >> 29 | SafeLoadAs(in + 4 * 9) << 3, - }; - shifts = simd_batch{4, 0, 10, 0}; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 19-bit bundles 16 to 19 - words = simd_batch{ - SafeLoadAs(in + 4 * 9) >> 16 | SafeLoadAs(in + 4 * 10) << 16, - SafeLoadAs(in + 4 * 10), - SafeLoadAs(in + 4 * 10) >> 22 | SafeLoadAs(in + 4 * 11) << 10, - SafeLoadAs(in + 4 * 11), - }; - shifts = simd_batch{0, 3, 0, 9}; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 19-bit bundles 20 to 23 - words = simd_batch{ - SafeLoadAs(in + 4 * 11) >> 28 | SafeLoadAs(in + 4 * 12) << 4, - SafeLoadAs(in + 4 * 12) >> 15 | SafeLoadAs(in + 4 * 13) << 17, - SafeLoadAs(in + 4 * 13), - SafeLoadAs(in + 4 * 13) >> 21 | SafeLoadAs(in + 4 * 14) << 11, - }; - shifts = simd_batch{0, 0, 2, 0}; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 19-bit bundles 24 to 27 - words = simd_batch{ - SafeLoadAs(in + 4 * 14), - SafeLoadAs(in + 4 * 14) >> 27 | SafeLoadAs(in + 4 * 15) << 5, - SafeLoadAs(in + 4 * 15) >> 14 | SafeLoadAs(in + 4 * 16) << 18, - SafeLoadAs(in + 4 * 16), - }; - shifts = simd_batch{8, 0, 0, 1}; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 19-bit bundles 28 to 31 - words = simd_batch{ - SafeLoadAs(in + 4 * 16) >> 20 | SafeLoadAs(in + 4 * 17) << 12, - SafeLoadAs(in + 4 * 17), - SafeLoadAs(in + 4 * 17) >> 26 | SafeLoadAs(in + 4 * 18) << 6, - SafeLoadAs(in + 4 * 18), - }; - shifts = simd_batch{0, 7, 0, 13}; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - in += 19 * 4; - return in; -} - -template <> -const uint8_t* Simd128Unpacker::unpack<20>(const uint8_t* in, uint32_t* out) { - constexpr uint32_t kMask = 0xfffff; - - simd_batch masks(kMask); - simd_batch words, shifts; - simd_batch results; - - // extract 20-bit bundles 0 to 3 - words = simd_batch{ - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0) >> 20 | SafeLoadAs(in + 4 * 1) << 12, - SafeLoadAs(in + 4 * 1), - SafeLoadAs(in + 4 * 1) >> 28 | SafeLoadAs(in + 4 * 2) << 4, - }; - shifts = simd_batch{0, 0, 8, 0}; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 20-bit bundles 4 to 7 - words = simd_batch{ - SafeLoadAs(in + 4 * 2) >> 16 | SafeLoadAs(in + 4 * 3) << 16, - SafeLoadAs(in + 4 * 3), - SafeLoadAs(in + 4 * 3) >> 24 | SafeLoadAs(in + 4 * 4) << 8, - SafeLoadAs(in + 4 * 4), - }; - shifts = simd_batch{0, 4, 0, 12}; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 20-bit bundles 8 to 11 - words = simd_batch{ - SafeLoadAs(in + 4 * 5), - SafeLoadAs(in + 4 * 5) >> 20 | SafeLoadAs(in + 4 * 6) << 12, - SafeLoadAs(in + 4 * 6), - SafeLoadAs(in + 4 * 6) >> 28 | SafeLoadAs(in + 4 * 7) << 4, - }; - shifts = simd_batch{0, 0, 8, 0}; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 20-bit bundles 12 to 15 - words = simd_batch{ - SafeLoadAs(in + 4 * 7) >> 16 | SafeLoadAs(in + 4 * 8) << 16, - SafeLoadAs(in + 4 * 8), - SafeLoadAs(in + 4 * 8) >> 24 | SafeLoadAs(in + 4 * 9) << 8, - SafeLoadAs(in + 4 * 9), - }; - shifts = simd_batch{0, 4, 0, 12}; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 20-bit bundles 16 to 19 - words = simd_batch{ - SafeLoadAs(in + 4 * 10), - SafeLoadAs(in + 4 * 10) >> 20 | SafeLoadAs(in + 4 * 11) << 12, - SafeLoadAs(in + 4 * 11), - SafeLoadAs(in + 4 * 11) >> 28 | SafeLoadAs(in + 4 * 12) << 4, - }; - shifts = simd_batch{0, 0, 8, 0}; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 20-bit bundles 20 to 23 - words = simd_batch{ - SafeLoadAs(in + 4 * 12) >> 16 | SafeLoadAs(in + 4 * 13) << 16, - SafeLoadAs(in + 4 * 13), - SafeLoadAs(in + 4 * 13) >> 24 | SafeLoadAs(in + 4 * 14) << 8, - SafeLoadAs(in + 4 * 14), - }; - shifts = simd_batch{0, 4, 0, 12}; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 20-bit bundles 24 to 27 - words = simd_batch{ - SafeLoadAs(in + 4 * 15), - SafeLoadAs(in + 4 * 15) >> 20 | SafeLoadAs(in + 4 * 16) << 12, - SafeLoadAs(in + 4 * 16), - SafeLoadAs(in + 4 * 16) >> 28 | SafeLoadAs(in + 4 * 17) << 4, - }; - shifts = simd_batch{0, 0, 8, 0}; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 20-bit bundles 28 to 31 - words = simd_batch{ - SafeLoadAs(in + 4 * 17) >> 16 | SafeLoadAs(in + 4 * 18) << 16, - SafeLoadAs(in + 4 * 18), - SafeLoadAs(in + 4 * 18) >> 24 | SafeLoadAs(in + 4 * 19) << 8, - SafeLoadAs(in + 4 * 19), - }; - shifts = simd_batch{0, 4, 0, 12}; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - in += 20 * 4; - return in; -} - -template <> -const uint8_t* Simd128Unpacker::unpack<21>(const uint8_t* in, uint32_t* out) { - constexpr uint32_t kMask = 0x1fffff; - - simd_batch masks(kMask); - simd_batch words, shifts; - simd_batch results; - - // extract 21-bit bundles 0 to 3 - words = simd_batch{ - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0) >> 21 | SafeLoadAs(in + 4 * 1) << 11, - SafeLoadAs(in + 4 * 1), - SafeLoadAs(in + 4 * 1) >> 31 | SafeLoadAs(in + 4 * 2) << 1, - }; - shifts = simd_batch{0, 0, 10, 0}; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 21-bit bundles 4 to 7 - words = simd_batch{ - SafeLoadAs(in + 4 * 2) >> 20 | SafeLoadAs(in + 4 * 3) << 12, - SafeLoadAs(in + 4 * 3), - SafeLoadAs(in + 4 * 3) >> 30 | SafeLoadAs(in + 4 * 4) << 2, - SafeLoadAs(in + 4 * 4) >> 19 | SafeLoadAs(in + 4 * 5) << 13, - }; - shifts = simd_batch{0, 9, 0, 0}; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 21-bit bundles 8 to 11 - words = simd_batch{ - SafeLoadAs(in + 4 * 5), - SafeLoadAs(in + 4 * 5) >> 29 | SafeLoadAs(in + 4 * 6) << 3, - SafeLoadAs(in + 4 * 6) >> 18 | SafeLoadAs(in + 4 * 7) << 14, - SafeLoadAs(in + 4 * 7), - }; - shifts = simd_batch{8, 0, 0, 7}; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 21-bit bundles 12 to 15 - words = simd_batch{ - SafeLoadAs(in + 4 * 7) >> 28 | SafeLoadAs(in + 4 * 8) << 4, - SafeLoadAs(in + 4 * 8) >> 17 | SafeLoadAs(in + 4 * 9) << 15, - SafeLoadAs(in + 4 * 9), - SafeLoadAs(in + 4 * 9) >> 27 | SafeLoadAs(in + 4 * 10) << 5, - }; - shifts = simd_batch{0, 0, 6, 0}; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 21-bit bundles 16 to 19 - words = simd_batch{ - SafeLoadAs(in + 4 * 10) >> 16 | SafeLoadAs(in + 4 * 11) << 16, - SafeLoadAs(in + 4 * 11), - SafeLoadAs(in + 4 * 11) >> 26 | SafeLoadAs(in + 4 * 12) << 6, - SafeLoadAs(in + 4 * 12) >> 15 | SafeLoadAs(in + 4 * 13) << 17, - }; - shifts = simd_batch{0, 5, 0, 0}; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 21-bit bundles 20 to 23 - words = simd_batch{ - SafeLoadAs(in + 4 * 13), - SafeLoadAs(in + 4 * 13) >> 25 | SafeLoadAs(in + 4 * 14) << 7, - SafeLoadAs(in + 4 * 14) >> 14 | SafeLoadAs(in + 4 * 15) << 18, - SafeLoadAs(in + 4 * 15), - }; - shifts = simd_batch{4, 0, 0, 3}; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 21-bit bundles 24 to 27 - words = simd_batch{ - SafeLoadAs(in + 4 * 15) >> 24 | SafeLoadAs(in + 4 * 16) << 8, - SafeLoadAs(in + 4 * 16) >> 13 | SafeLoadAs(in + 4 * 17) << 19, - SafeLoadAs(in + 4 * 17), - SafeLoadAs(in + 4 * 17) >> 23 | SafeLoadAs(in + 4 * 18) << 9, - }; - shifts = simd_batch{0, 0, 2, 0}; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 21-bit bundles 28 to 31 - words = simd_batch{ - SafeLoadAs(in + 4 * 18) >> 12 | SafeLoadAs(in + 4 * 19) << 20, - SafeLoadAs(in + 4 * 19), - SafeLoadAs(in + 4 * 19) >> 22 | SafeLoadAs(in + 4 * 20) << 10, - SafeLoadAs(in + 4 * 20), - }; - shifts = simd_batch{0, 1, 0, 11}; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - in += 21 * 4; - return in; -} - -template <> -const uint8_t* Simd128Unpacker::unpack<22>(const uint8_t* in, uint32_t* out) { - constexpr uint32_t kMask = 0x3fffff; - - simd_batch masks(kMask); - simd_batch words, shifts; - simd_batch results; - - // extract 22-bit bundles 0 to 3 - words = simd_batch{ - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0) >> 22 | SafeLoadAs(in + 4 * 1) << 10, - SafeLoadAs(in + 4 * 1) >> 12 | SafeLoadAs(in + 4 * 2) << 20, - SafeLoadAs(in + 4 * 2), - }; - shifts = simd_batch{0, 0, 0, 2}; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 22-bit bundles 4 to 7 - words = simd_batch{ - SafeLoadAs(in + 4 * 2) >> 24 | SafeLoadAs(in + 4 * 3) << 8, - SafeLoadAs(in + 4 * 3) >> 14 | SafeLoadAs(in + 4 * 4) << 18, - SafeLoadAs(in + 4 * 4), - SafeLoadAs(in + 4 * 4) >> 26 | SafeLoadAs(in + 4 * 5) << 6, - }; - shifts = simd_batch{0, 0, 4, 0}; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 22-bit bundles 8 to 11 - words = simd_batch{ - SafeLoadAs(in + 4 * 5) >> 16 | SafeLoadAs(in + 4 * 6) << 16, - SafeLoadAs(in + 4 * 6), - SafeLoadAs(in + 4 * 6) >> 28 | SafeLoadAs(in + 4 * 7) << 4, - SafeLoadAs(in + 4 * 7) >> 18 | SafeLoadAs(in + 4 * 8) << 14, - }; - shifts = simd_batch{0, 6, 0, 0}; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 22-bit bundles 12 to 15 - words = simd_batch{ - SafeLoadAs(in + 4 * 8), - SafeLoadAs(in + 4 * 8) >> 30 | SafeLoadAs(in + 4 * 9) << 2, - SafeLoadAs(in + 4 * 9) >> 20 | SafeLoadAs(in + 4 * 10) << 12, - SafeLoadAs(in + 4 * 10), - }; - shifts = simd_batch{8, 0, 0, 10}; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 22-bit bundles 16 to 19 - words = simd_batch{ - SafeLoadAs(in + 4 * 11), - SafeLoadAs(in + 4 * 11) >> 22 | SafeLoadAs(in + 4 * 12) << 10, - SafeLoadAs(in + 4 * 12) >> 12 | SafeLoadAs(in + 4 * 13) << 20, - SafeLoadAs(in + 4 * 13), - }; - shifts = simd_batch{0, 0, 0, 2}; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 22-bit bundles 20 to 23 - words = simd_batch{ - SafeLoadAs(in + 4 * 13) >> 24 | SafeLoadAs(in + 4 * 14) << 8, - SafeLoadAs(in + 4 * 14) >> 14 | SafeLoadAs(in + 4 * 15) << 18, - SafeLoadAs(in + 4 * 15), - SafeLoadAs(in + 4 * 15) >> 26 | SafeLoadAs(in + 4 * 16) << 6, - }; - shifts = simd_batch{0, 0, 4, 0}; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 22-bit bundles 24 to 27 - words = simd_batch{ - SafeLoadAs(in + 4 * 16) >> 16 | SafeLoadAs(in + 4 * 17) << 16, - SafeLoadAs(in + 4 * 17), - SafeLoadAs(in + 4 * 17) >> 28 | SafeLoadAs(in + 4 * 18) << 4, - SafeLoadAs(in + 4 * 18) >> 18 | SafeLoadAs(in + 4 * 19) << 14, - }; - shifts = simd_batch{0, 6, 0, 0}; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 22-bit bundles 28 to 31 - words = simd_batch{ - SafeLoadAs(in + 4 * 19), - SafeLoadAs(in + 4 * 19) >> 30 | SafeLoadAs(in + 4 * 20) << 2, - SafeLoadAs(in + 4 * 20) >> 20 | SafeLoadAs(in + 4 * 21) << 12, - SafeLoadAs(in + 4 * 21), - }; - shifts = simd_batch{8, 0, 0, 10}; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - in += 22 * 4; - return in; -} - -template <> -const uint8_t* Simd128Unpacker::unpack<23>(const uint8_t* in, uint32_t* out) { - constexpr uint32_t kMask = 0x7fffff; - - simd_batch masks(kMask); - simd_batch words, shifts; - simd_batch results; - - // extract 23-bit bundles 0 to 3 - words = simd_batch{ - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0) >> 23 | SafeLoadAs(in + 4 * 1) << 9, - SafeLoadAs(in + 4 * 1) >> 14 | SafeLoadAs(in + 4 * 2) << 18, - SafeLoadAs(in + 4 * 2), - }; - shifts = simd_batch{0, 0, 0, 5}; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 23-bit bundles 4 to 7 - words = simd_batch{ - SafeLoadAs(in + 4 * 2) >> 28 | SafeLoadAs(in + 4 * 3) << 4, - SafeLoadAs(in + 4 * 3) >> 19 | SafeLoadAs(in + 4 * 4) << 13, - SafeLoadAs(in + 4 * 4) >> 10 | SafeLoadAs(in + 4 * 5) << 22, - SafeLoadAs(in + 4 * 5), - }; - shifts = simd_batch{0, 0, 0, 1}; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 23-bit bundles 8 to 11 - words = simd_batch{ - SafeLoadAs(in + 4 * 5) >> 24 | SafeLoadAs(in + 4 * 6) << 8, - SafeLoadAs(in + 4 * 6) >> 15 | SafeLoadAs(in + 4 * 7) << 17, - SafeLoadAs(in + 4 * 7), - SafeLoadAs(in + 4 * 7) >> 29 | SafeLoadAs(in + 4 * 8) << 3, - }; - shifts = simd_batch{0, 0, 6, 0}; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 23-bit bundles 12 to 15 - words = simd_batch{ - SafeLoadAs(in + 4 * 8) >> 20 | SafeLoadAs(in + 4 * 9) << 12, - SafeLoadAs(in + 4 * 9) >> 11 | SafeLoadAs(in + 4 * 10) << 21, - SafeLoadAs(in + 4 * 10), - SafeLoadAs(in + 4 * 10) >> 25 | SafeLoadAs(in + 4 * 11) << 7, - }; - shifts = simd_batch{0, 0, 2, 0}; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 23-bit bundles 16 to 19 - words = simd_batch{ - SafeLoadAs(in + 4 * 11) >> 16 | SafeLoadAs(in + 4 * 12) << 16, - SafeLoadAs(in + 4 * 12), - SafeLoadAs(in + 4 * 12) >> 30 | SafeLoadAs(in + 4 * 13) << 2, - SafeLoadAs(in + 4 * 13) >> 21 | SafeLoadAs(in + 4 * 14) << 11, - }; - shifts = simd_batch{0, 7, 0, 0}; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 23-bit bundles 20 to 23 - words = simd_batch{ - SafeLoadAs(in + 4 * 14) >> 12 | SafeLoadAs(in + 4 * 15) << 20, - SafeLoadAs(in + 4 * 15), - SafeLoadAs(in + 4 * 15) >> 26 | SafeLoadAs(in + 4 * 16) << 6, - SafeLoadAs(in + 4 * 16) >> 17 | SafeLoadAs(in + 4 * 17) << 15, - }; - shifts = simd_batch{0, 3, 0, 0}; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 23-bit bundles 24 to 27 - words = simd_batch{ - SafeLoadAs(in + 4 * 17), - SafeLoadAs(in + 4 * 17) >> 31 | SafeLoadAs(in + 4 * 18) << 1, - SafeLoadAs(in + 4 * 18) >> 22 | SafeLoadAs(in + 4 * 19) << 10, - SafeLoadAs(in + 4 * 19) >> 13 | SafeLoadAs(in + 4 * 20) << 19, - }; - shifts = simd_batch{8, 0, 0, 0}; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 23-bit bundles 28 to 31 - words = simd_batch{ - SafeLoadAs(in + 4 * 20), - SafeLoadAs(in + 4 * 20) >> 27 | SafeLoadAs(in + 4 * 21) << 5, - SafeLoadAs(in + 4 * 21) >> 18 | SafeLoadAs(in + 4 * 22) << 14, - SafeLoadAs(in + 4 * 22), - }; - shifts = simd_batch{4, 0, 0, 9}; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - in += 23 * 4; - return in; -} - -template <> -const uint8_t* Simd128Unpacker::unpack<24>(const uint8_t* in, uint32_t* out) { - constexpr uint32_t kMask = 0xffffff; - - simd_batch masks(kMask); - simd_batch words, shifts; - simd_batch results; - - // extract 24-bit bundles 0 to 3 - words = simd_batch{ - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0) >> 24 | SafeLoadAs(in + 4 * 1) << 8, - SafeLoadAs(in + 4 * 1) >> 16 | SafeLoadAs(in + 4 * 2) << 16, - SafeLoadAs(in + 4 * 2), - }; - shifts = simd_batch{0, 0, 0, 8}; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 24-bit bundles 4 to 7 - words = simd_batch{ - SafeLoadAs(in + 4 * 3), - SafeLoadAs(in + 4 * 3) >> 24 | SafeLoadAs(in + 4 * 4) << 8, - SafeLoadAs(in + 4 * 4) >> 16 | SafeLoadAs(in + 4 * 5) << 16, - SafeLoadAs(in + 4 * 5), - }; - shifts = simd_batch{0, 0, 0, 8}; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 24-bit bundles 8 to 11 - words = simd_batch{ - SafeLoadAs(in + 4 * 6), - SafeLoadAs(in + 4 * 6) >> 24 | SafeLoadAs(in + 4 * 7) << 8, - SafeLoadAs(in + 4 * 7) >> 16 | SafeLoadAs(in + 4 * 8) << 16, - SafeLoadAs(in + 4 * 8), - }; - shifts = simd_batch{0, 0, 0, 8}; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 24-bit bundles 12 to 15 - words = simd_batch{ - SafeLoadAs(in + 4 * 9), - SafeLoadAs(in + 4 * 9) >> 24 | SafeLoadAs(in + 4 * 10) << 8, - SafeLoadAs(in + 4 * 10) >> 16 | SafeLoadAs(in + 4 * 11) << 16, - SafeLoadAs(in + 4 * 11), - }; - shifts = simd_batch{0, 0, 0, 8}; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 24-bit bundles 16 to 19 - words = simd_batch{ - SafeLoadAs(in + 4 * 12), - SafeLoadAs(in + 4 * 12) >> 24 | SafeLoadAs(in + 4 * 13) << 8, - SafeLoadAs(in + 4 * 13) >> 16 | SafeLoadAs(in + 4 * 14) << 16, - SafeLoadAs(in + 4 * 14), - }; - shifts = simd_batch{0, 0, 0, 8}; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 24-bit bundles 20 to 23 - words = simd_batch{ - SafeLoadAs(in + 4 * 15), - SafeLoadAs(in + 4 * 15) >> 24 | SafeLoadAs(in + 4 * 16) << 8, - SafeLoadAs(in + 4 * 16) >> 16 | SafeLoadAs(in + 4 * 17) << 16, - SafeLoadAs(in + 4 * 17), - }; - shifts = simd_batch{0, 0, 0, 8}; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 24-bit bundles 24 to 27 - words = simd_batch{ - SafeLoadAs(in + 4 * 18), - SafeLoadAs(in + 4 * 18) >> 24 | SafeLoadAs(in + 4 * 19) << 8, - SafeLoadAs(in + 4 * 19) >> 16 | SafeLoadAs(in + 4 * 20) << 16, - SafeLoadAs(in + 4 * 20), - }; - shifts = simd_batch{0, 0, 0, 8}; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 24-bit bundles 28 to 31 - words = simd_batch{ - SafeLoadAs(in + 4 * 21), - SafeLoadAs(in + 4 * 21) >> 24 | SafeLoadAs(in + 4 * 22) << 8, - SafeLoadAs(in + 4 * 22) >> 16 | SafeLoadAs(in + 4 * 23) << 16, - SafeLoadAs(in + 4 * 23), - }; - shifts = simd_batch{0, 0, 0, 8}; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - in += 24 * 4; - return in; -} - -template <> -const uint8_t* Simd128Unpacker::unpack<25>(const uint8_t* in, uint32_t* out) { - constexpr uint32_t kMask = 0x1ffffff; - - simd_batch masks(kMask); - simd_batch words, shifts; - simd_batch results; - - // extract 25-bit bundles 0 to 3 - words = simd_batch{ - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0) >> 25 | SafeLoadAs(in + 4 * 1) << 7, - SafeLoadAs(in + 4 * 1) >> 18 | SafeLoadAs(in + 4 * 2) << 14, - SafeLoadAs(in + 4 * 2) >> 11 | SafeLoadAs(in + 4 * 3) << 21, - }; - shifts = simd_batch{0, 0, 0, 0}; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 25-bit bundles 4 to 7 - words = simd_batch{ - SafeLoadAs(in + 4 * 3), - SafeLoadAs(in + 4 * 3) >> 29 | SafeLoadAs(in + 4 * 4) << 3, - SafeLoadAs(in + 4 * 4) >> 22 | SafeLoadAs(in + 4 * 5) << 10, - SafeLoadAs(in + 4 * 5) >> 15 | SafeLoadAs(in + 4 * 6) << 17, - }; - shifts = simd_batch{4, 0, 0, 0}; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 25-bit bundles 8 to 11 - words = simd_batch{ - SafeLoadAs(in + 4 * 6) >> 8 | SafeLoadAs(in + 4 * 7) << 24, - SafeLoadAs(in + 4 * 7), - SafeLoadAs(in + 4 * 7) >> 26 | SafeLoadAs(in + 4 * 8) << 6, - SafeLoadAs(in + 4 * 8) >> 19 | SafeLoadAs(in + 4 * 9) << 13, - }; - shifts = simd_batch{0, 1, 0, 0}; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 25-bit bundles 12 to 15 - words = simd_batch{ - SafeLoadAs(in + 4 * 9) >> 12 | SafeLoadAs(in + 4 * 10) << 20, - SafeLoadAs(in + 4 * 10), - SafeLoadAs(in + 4 * 10) >> 30 | SafeLoadAs(in + 4 * 11) << 2, - SafeLoadAs(in + 4 * 11) >> 23 | SafeLoadAs(in + 4 * 12) << 9, - }; - shifts = simd_batch{0, 5, 0, 0}; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 25-bit bundles 16 to 19 - words = simd_batch{ - SafeLoadAs(in + 4 * 12) >> 16 | SafeLoadAs(in + 4 * 13) << 16, - SafeLoadAs(in + 4 * 13) >> 9 | SafeLoadAs(in + 4 * 14) << 23, - SafeLoadAs(in + 4 * 14), - SafeLoadAs(in + 4 * 14) >> 27 | SafeLoadAs(in + 4 * 15) << 5, - }; - shifts = simd_batch{0, 0, 2, 0}; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 25-bit bundles 20 to 23 - words = simd_batch{ - SafeLoadAs(in + 4 * 15) >> 20 | SafeLoadAs(in + 4 * 16) << 12, - SafeLoadAs(in + 4 * 16) >> 13 | SafeLoadAs(in + 4 * 17) << 19, - SafeLoadAs(in + 4 * 17), - SafeLoadAs(in + 4 * 17) >> 31 | SafeLoadAs(in + 4 * 18) << 1, - }; - shifts = simd_batch{0, 0, 6, 0}; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 25-bit bundles 24 to 27 - words = simd_batch{ - SafeLoadAs(in + 4 * 18) >> 24 | SafeLoadAs(in + 4 * 19) << 8, - SafeLoadAs(in + 4 * 19) >> 17 | SafeLoadAs(in + 4 * 20) << 15, - SafeLoadAs(in + 4 * 20) >> 10 | SafeLoadAs(in + 4 * 21) << 22, - SafeLoadAs(in + 4 * 21), - }; - shifts = simd_batch{0, 0, 0, 3}; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 25-bit bundles 28 to 31 - words = simd_batch{ - SafeLoadAs(in + 4 * 21) >> 28 | SafeLoadAs(in + 4 * 22) << 4, - SafeLoadAs(in + 4 * 22) >> 21 | SafeLoadAs(in + 4 * 23) << 11, - SafeLoadAs(in + 4 * 23) >> 14 | SafeLoadAs(in + 4 * 24) << 18, - SafeLoadAs(in + 4 * 24), - }; - shifts = simd_batch{0, 0, 0, 7}; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - in += 25 * 4; - return in; -} - -template <> -const uint8_t* Simd128Unpacker::unpack<26>(const uint8_t* in, uint32_t* out) { - constexpr uint32_t kMask = 0x3ffffff; - - simd_batch masks(kMask); - simd_batch words, shifts; - simd_batch results; - - // extract 26-bit bundles 0 to 3 - words = simd_batch{ - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0) >> 26 | SafeLoadAs(in + 4 * 1) << 6, - SafeLoadAs(in + 4 * 1) >> 20 | SafeLoadAs(in + 4 * 2) << 12, - SafeLoadAs(in + 4 * 2) >> 14 | SafeLoadAs(in + 4 * 3) << 18, - }; - shifts = simd_batch{0, 0, 0, 0}; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 26-bit bundles 4 to 7 - words = simd_batch{ - SafeLoadAs(in + 4 * 3) >> 8 | SafeLoadAs(in + 4 * 4) << 24, - SafeLoadAs(in + 4 * 4), - SafeLoadAs(in + 4 * 4) >> 28 | SafeLoadAs(in + 4 * 5) << 4, - SafeLoadAs(in + 4 * 5) >> 22 | SafeLoadAs(in + 4 * 6) << 10, - }; - shifts = simd_batch{0, 2, 0, 0}; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 26-bit bundles 8 to 11 - words = simd_batch{ - SafeLoadAs(in + 4 * 6) >> 16 | SafeLoadAs(in + 4 * 7) << 16, - SafeLoadAs(in + 4 * 7) >> 10 | SafeLoadAs(in + 4 * 8) << 22, - SafeLoadAs(in + 4 * 8), - SafeLoadAs(in + 4 * 8) >> 30 | SafeLoadAs(in + 4 * 9) << 2, - }; - shifts = simd_batch{0, 0, 4, 0}; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 26-bit bundles 12 to 15 - words = simd_batch{ - SafeLoadAs(in + 4 * 9) >> 24 | SafeLoadAs(in + 4 * 10) << 8, - SafeLoadAs(in + 4 * 10) >> 18 | SafeLoadAs(in + 4 * 11) << 14, - SafeLoadAs(in + 4 * 11) >> 12 | SafeLoadAs(in + 4 * 12) << 20, - SafeLoadAs(in + 4 * 12), - }; - shifts = simd_batch{0, 0, 0, 6}; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 26-bit bundles 16 to 19 - words = simd_batch{ - SafeLoadAs(in + 4 * 13), - SafeLoadAs(in + 4 * 13) >> 26 | SafeLoadAs(in + 4 * 14) << 6, - SafeLoadAs(in + 4 * 14) >> 20 | SafeLoadAs(in + 4 * 15) << 12, - SafeLoadAs(in + 4 * 15) >> 14 | SafeLoadAs(in + 4 * 16) << 18, - }; - shifts = simd_batch{0, 0, 0, 0}; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 26-bit bundles 20 to 23 - words = simd_batch{ - SafeLoadAs(in + 4 * 16) >> 8 | SafeLoadAs(in + 4 * 17) << 24, - SafeLoadAs(in + 4 * 17), - SafeLoadAs(in + 4 * 17) >> 28 | SafeLoadAs(in + 4 * 18) << 4, - SafeLoadAs(in + 4 * 18) >> 22 | SafeLoadAs(in + 4 * 19) << 10, - }; - shifts = simd_batch{0, 2, 0, 0}; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 26-bit bundles 24 to 27 - words = simd_batch{ - SafeLoadAs(in + 4 * 19) >> 16 | SafeLoadAs(in + 4 * 20) << 16, - SafeLoadAs(in + 4 * 20) >> 10 | SafeLoadAs(in + 4 * 21) << 22, - SafeLoadAs(in + 4 * 21), - SafeLoadAs(in + 4 * 21) >> 30 | SafeLoadAs(in + 4 * 22) << 2, - }; - shifts = simd_batch{0, 0, 4, 0}; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 26-bit bundles 28 to 31 - words = simd_batch{ - SafeLoadAs(in + 4 * 22) >> 24 | SafeLoadAs(in + 4 * 23) << 8, - SafeLoadAs(in + 4 * 23) >> 18 | SafeLoadAs(in + 4 * 24) << 14, - SafeLoadAs(in + 4 * 24) >> 12 | SafeLoadAs(in + 4 * 25) << 20, - SafeLoadAs(in + 4 * 25), - }; - shifts = simd_batch{0, 0, 0, 6}; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - in += 26 * 4; - return in; -} - -template <> -const uint8_t* Simd128Unpacker::unpack<27>(const uint8_t* in, uint32_t* out) { - constexpr uint32_t kMask = 0x7ffffff; - - simd_batch masks(kMask); - simd_batch words, shifts; - simd_batch results; - - // extract 27-bit bundles 0 to 3 - words = simd_batch{ - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0) >> 27 | SafeLoadAs(in + 4 * 1) << 5, - SafeLoadAs(in + 4 * 1) >> 22 | SafeLoadAs(in + 4 * 2) << 10, - SafeLoadAs(in + 4 * 2) >> 17 | SafeLoadAs(in + 4 * 3) << 15, - }; - shifts = simd_batch{0, 0, 0, 0}; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 27-bit bundles 4 to 7 - words = simd_batch{ - SafeLoadAs(in + 4 * 3) >> 12 | SafeLoadAs(in + 4 * 4) << 20, - SafeLoadAs(in + 4 * 4) >> 7 | SafeLoadAs(in + 4 * 5) << 25, - SafeLoadAs(in + 4 * 5), - SafeLoadAs(in + 4 * 5) >> 29 | SafeLoadAs(in + 4 * 6) << 3, - }; - shifts = simd_batch{0, 0, 2, 0}; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 27-bit bundles 8 to 11 - words = simd_batch{ - SafeLoadAs(in + 4 * 6) >> 24 | SafeLoadAs(in + 4 * 7) << 8, - SafeLoadAs(in + 4 * 7) >> 19 | SafeLoadAs(in + 4 * 8) << 13, - SafeLoadAs(in + 4 * 8) >> 14 | SafeLoadAs(in + 4 * 9) << 18, - SafeLoadAs(in + 4 * 9) >> 9 | SafeLoadAs(in + 4 * 10) << 23, - }; - shifts = simd_batch{0, 0, 0, 0}; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 27-bit bundles 12 to 15 - words = simd_batch{ - SafeLoadAs(in + 4 * 10), - SafeLoadAs(in + 4 * 10) >> 31 | SafeLoadAs(in + 4 * 11) << 1, - SafeLoadAs(in + 4 * 11) >> 26 | SafeLoadAs(in + 4 * 12) << 6, - SafeLoadAs(in + 4 * 12) >> 21 | SafeLoadAs(in + 4 * 13) << 11, - }; - shifts = simd_batch{4, 0, 0, 0}; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 27-bit bundles 16 to 19 - words = simd_batch{ - SafeLoadAs(in + 4 * 13) >> 16 | SafeLoadAs(in + 4 * 14) << 16, - SafeLoadAs(in + 4 * 14) >> 11 | SafeLoadAs(in + 4 * 15) << 21, - SafeLoadAs(in + 4 * 15) >> 6 | SafeLoadAs(in + 4 * 16) << 26, - SafeLoadAs(in + 4 * 16), - }; - shifts = simd_batch{0, 0, 0, 1}; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 27-bit bundles 20 to 23 - words = simd_batch{ - SafeLoadAs(in + 4 * 16) >> 28 | SafeLoadAs(in + 4 * 17) << 4, - SafeLoadAs(in + 4 * 17) >> 23 | SafeLoadAs(in + 4 * 18) << 9, - SafeLoadAs(in + 4 * 18) >> 18 | SafeLoadAs(in + 4 * 19) << 14, - SafeLoadAs(in + 4 * 19) >> 13 | SafeLoadAs(in + 4 * 20) << 19, - }; - shifts = simd_batch{0, 0, 0, 0}; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 27-bit bundles 24 to 27 - words = simd_batch{ - SafeLoadAs(in + 4 * 20) >> 8 | SafeLoadAs(in + 4 * 21) << 24, - SafeLoadAs(in + 4 * 21), - SafeLoadAs(in + 4 * 21) >> 30 | SafeLoadAs(in + 4 * 22) << 2, - SafeLoadAs(in + 4 * 22) >> 25 | SafeLoadAs(in + 4 * 23) << 7, - }; - shifts = simd_batch{0, 3, 0, 0}; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 27-bit bundles 28 to 31 - words = simd_batch{ - SafeLoadAs(in + 4 * 23) >> 20 | SafeLoadAs(in + 4 * 24) << 12, - SafeLoadAs(in + 4 * 24) >> 15 | SafeLoadAs(in + 4 * 25) << 17, - SafeLoadAs(in + 4 * 25) >> 10 | SafeLoadAs(in + 4 * 26) << 22, - SafeLoadAs(in + 4 * 26), - }; - shifts = simd_batch{0, 0, 0, 5}; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - in += 27 * 4; - return in; -} - -template <> -const uint8_t* Simd128Unpacker::unpack<28>(const uint8_t* in, uint32_t* out) { - constexpr uint32_t kMask = 0xfffffff; - - simd_batch masks(kMask); - simd_batch words, shifts; - simd_batch results; - - // extract 28-bit bundles 0 to 3 - words = simd_batch{ - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0) >> 28 | SafeLoadAs(in + 4 * 1) << 4, - SafeLoadAs(in + 4 * 1) >> 24 | SafeLoadAs(in + 4 * 2) << 8, - SafeLoadAs(in + 4 * 2) >> 20 | SafeLoadAs(in + 4 * 3) << 12, - }; - shifts = simd_batch{0, 0, 0, 0}; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 28-bit bundles 4 to 7 - words = simd_batch{ - SafeLoadAs(in + 4 * 3) >> 16 | SafeLoadAs(in + 4 * 4) << 16, - SafeLoadAs(in + 4 * 4) >> 12 | SafeLoadAs(in + 4 * 5) << 20, - SafeLoadAs(in + 4 * 5) >> 8 | SafeLoadAs(in + 4 * 6) << 24, - SafeLoadAs(in + 4 * 6), - }; - shifts = simd_batch{0, 0, 0, 4}; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 28-bit bundles 8 to 11 - words = simd_batch{ - SafeLoadAs(in + 4 * 7), - SafeLoadAs(in + 4 * 7) >> 28 | SafeLoadAs(in + 4 * 8) << 4, - SafeLoadAs(in + 4 * 8) >> 24 | SafeLoadAs(in + 4 * 9) << 8, - SafeLoadAs(in + 4 * 9) >> 20 | SafeLoadAs(in + 4 * 10) << 12, - }; - shifts = simd_batch{0, 0, 0, 0}; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 28-bit bundles 12 to 15 - words = simd_batch{ - SafeLoadAs(in + 4 * 10) >> 16 | SafeLoadAs(in + 4 * 11) << 16, - SafeLoadAs(in + 4 * 11) >> 12 | SafeLoadAs(in + 4 * 12) << 20, - SafeLoadAs(in + 4 * 12) >> 8 | SafeLoadAs(in + 4 * 13) << 24, - SafeLoadAs(in + 4 * 13), - }; - shifts = simd_batch{0, 0, 0, 4}; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 28-bit bundles 16 to 19 - words = simd_batch{ - SafeLoadAs(in + 4 * 14), - SafeLoadAs(in + 4 * 14) >> 28 | SafeLoadAs(in + 4 * 15) << 4, - SafeLoadAs(in + 4 * 15) >> 24 | SafeLoadAs(in + 4 * 16) << 8, - SafeLoadAs(in + 4 * 16) >> 20 | SafeLoadAs(in + 4 * 17) << 12, - }; - shifts = simd_batch{0, 0, 0, 0}; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 28-bit bundles 20 to 23 - words = simd_batch{ - SafeLoadAs(in + 4 * 17) >> 16 | SafeLoadAs(in + 4 * 18) << 16, - SafeLoadAs(in + 4 * 18) >> 12 | SafeLoadAs(in + 4 * 19) << 20, - SafeLoadAs(in + 4 * 19) >> 8 | SafeLoadAs(in + 4 * 20) << 24, - SafeLoadAs(in + 4 * 20), - }; - shifts = simd_batch{0, 0, 0, 4}; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 28-bit bundles 24 to 27 - words = simd_batch{ - SafeLoadAs(in + 4 * 21), - SafeLoadAs(in + 4 * 21) >> 28 | SafeLoadAs(in + 4 * 22) << 4, - SafeLoadAs(in + 4 * 22) >> 24 | SafeLoadAs(in + 4 * 23) << 8, - SafeLoadAs(in + 4 * 23) >> 20 | SafeLoadAs(in + 4 * 24) << 12, - }; - shifts = simd_batch{0, 0, 0, 0}; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 28-bit bundles 28 to 31 - words = simd_batch{ - SafeLoadAs(in + 4 * 24) >> 16 | SafeLoadAs(in + 4 * 25) << 16, - SafeLoadAs(in + 4 * 25) >> 12 | SafeLoadAs(in + 4 * 26) << 20, - SafeLoadAs(in + 4 * 26) >> 8 | SafeLoadAs(in + 4 * 27) << 24, - SafeLoadAs(in + 4 * 27), - }; - shifts = simd_batch{0, 0, 0, 4}; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - in += 28 * 4; - return in; -} - -template <> -const uint8_t* Simd128Unpacker::unpack<29>(const uint8_t* in, uint32_t* out) { - constexpr uint32_t kMask = 0x1fffffff; - - simd_batch masks(kMask); - simd_batch words, shifts; - simd_batch results; - - // extract 29-bit bundles 0 to 3 - words = simd_batch{ - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0) >> 29 | SafeLoadAs(in + 4 * 1) << 3, - SafeLoadAs(in + 4 * 1) >> 26 | SafeLoadAs(in + 4 * 2) << 6, - SafeLoadAs(in + 4 * 2) >> 23 | SafeLoadAs(in + 4 * 3) << 9, - }; - shifts = simd_batch{0, 0, 0, 0}; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 29-bit bundles 4 to 7 - words = simd_batch{ - SafeLoadAs(in + 4 * 3) >> 20 | SafeLoadAs(in + 4 * 4) << 12, - SafeLoadAs(in + 4 * 4) >> 17 | SafeLoadAs(in + 4 * 5) << 15, - SafeLoadAs(in + 4 * 5) >> 14 | SafeLoadAs(in + 4 * 6) << 18, - SafeLoadAs(in + 4 * 6) >> 11 | SafeLoadAs(in + 4 * 7) << 21, - }; - shifts = simd_batch{0, 0, 0, 0}; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 29-bit bundles 8 to 11 - words = simd_batch{ - SafeLoadAs(in + 4 * 7) >> 8 | SafeLoadAs(in + 4 * 8) << 24, - SafeLoadAs(in + 4 * 8) >> 5 | SafeLoadAs(in + 4 * 9) << 27, - SafeLoadAs(in + 4 * 9), - SafeLoadAs(in + 4 * 9) >> 31 | SafeLoadAs(in + 4 * 10) << 1, - }; - shifts = simd_batch{0, 0, 2, 0}; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 29-bit bundles 12 to 15 - words = simd_batch{ - SafeLoadAs(in + 4 * 10) >> 28 | SafeLoadAs(in + 4 * 11) << 4, - SafeLoadAs(in + 4 * 11) >> 25 | SafeLoadAs(in + 4 * 12) << 7, - SafeLoadAs(in + 4 * 12) >> 22 | SafeLoadAs(in + 4 * 13) << 10, - SafeLoadAs(in + 4 * 13) >> 19 | SafeLoadAs(in + 4 * 14) << 13, - }; - shifts = simd_batch{0, 0, 0, 0}; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 29-bit bundles 16 to 19 - words = simd_batch{ - SafeLoadAs(in + 4 * 14) >> 16 | SafeLoadAs(in + 4 * 15) << 16, - SafeLoadAs(in + 4 * 15) >> 13 | SafeLoadAs(in + 4 * 16) << 19, - SafeLoadAs(in + 4 * 16) >> 10 | SafeLoadAs(in + 4 * 17) << 22, - SafeLoadAs(in + 4 * 17) >> 7 | SafeLoadAs(in + 4 * 18) << 25, - }; - shifts = simd_batch{0, 0, 0, 0}; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 29-bit bundles 20 to 23 - words = simd_batch{ - SafeLoadAs(in + 4 * 18) >> 4 | SafeLoadAs(in + 4 * 19) << 28, - SafeLoadAs(in + 4 * 19), - SafeLoadAs(in + 4 * 19) >> 30 | SafeLoadAs(in + 4 * 20) << 2, - SafeLoadAs(in + 4 * 20) >> 27 | SafeLoadAs(in + 4 * 21) << 5, - }; - shifts = simd_batch{0, 1, 0, 0}; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 29-bit bundles 24 to 27 - words = simd_batch{ - SafeLoadAs(in + 4 * 21) >> 24 | SafeLoadAs(in + 4 * 22) << 8, - SafeLoadAs(in + 4 * 22) >> 21 | SafeLoadAs(in + 4 * 23) << 11, - SafeLoadAs(in + 4 * 23) >> 18 | SafeLoadAs(in + 4 * 24) << 14, - SafeLoadAs(in + 4 * 24) >> 15 | SafeLoadAs(in + 4 * 25) << 17, - }; - shifts = simd_batch{0, 0, 0, 0}; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 29-bit bundles 28 to 31 - words = simd_batch{ - SafeLoadAs(in + 4 * 25) >> 12 | SafeLoadAs(in + 4 * 26) << 20, - SafeLoadAs(in + 4 * 26) >> 9 | SafeLoadAs(in + 4 * 27) << 23, - SafeLoadAs(in + 4 * 27) >> 6 | SafeLoadAs(in + 4 * 28) << 26, - SafeLoadAs(in + 4 * 28), - }; - shifts = simd_batch{0, 0, 0, 3}; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - in += 29 * 4; - return in; -} - -template <> -const uint8_t* Simd128Unpacker::unpack<30>(const uint8_t* in, uint32_t* out) { - constexpr uint32_t kMask = 0x3fffffff; - - simd_batch masks(kMask); - simd_batch words, shifts; - simd_batch results; - - // extract 30-bit bundles 0 to 3 - words = simd_batch{ - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0) >> 30 | SafeLoadAs(in + 4 * 1) << 2, - SafeLoadAs(in + 4 * 1) >> 28 | SafeLoadAs(in + 4 * 2) << 4, - SafeLoadAs(in + 4 * 2) >> 26 | SafeLoadAs(in + 4 * 3) << 6, - }; - shifts = simd_batch{0, 0, 0, 0}; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 30-bit bundles 4 to 7 - words = simd_batch{ - SafeLoadAs(in + 4 * 3) >> 24 | SafeLoadAs(in + 4 * 4) << 8, - SafeLoadAs(in + 4 * 4) >> 22 | SafeLoadAs(in + 4 * 5) << 10, - SafeLoadAs(in + 4 * 5) >> 20 | SafeLoadAs(in + 4 * 6) << 12, - SafeLoadAs(in + 4 * 6) >> 18 | SafeLoadAs(in + 4 * 7) << 14, - }; - shifts = simd_batch{0, 0, 0, 0}; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 30-bit bundles 8 to 11 - words = simd_batch{ - SafeLoadAs(in + 4 * 7) >> 16 | SafeLoadAs(in + 4 * 8) << 16, - SafeLoadAs(in + 4 * 8) >> 14 | SafeLoadAs(in + 4 * 9) << 18, - SafeLoadAs(in + 4 * 9) >> 12 | SafeLoadAs(in + 4 * 10) << 20, - SafeLoadAs(in + 4 * 10) >> 10 | SafeLoadAs(in + 4 * 11) << 22, - }; - shifts = simd_batch{0, 0, 0, 0}; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 30-bit bundles 12 to 15 - words = simd_batch{ - SafeLoadAs(in + 4 * 11) >> 8 | SafeLoadAs(in + 4 * 12) << 24, - SafeLoadAs(in + 4 * 12) >> 6 | SafeLoadAs(in + 4 * 13) << 26, - SafeLoadAs(in + 4 * 13) >> 4 | SafeLoadAs(in + 4 * 14) << 28, - SafeLoadAs(in + 4 * 14), - }; - shifts = simd_batch{0, 0, 0, 2}; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 30-bit bundles 16 to 19 - words = simd_batch{ - SafeLoadAs(in + 4 * 15), - SafeLoadAs(in + 4 * 15) >> 30 | SafeLoadAs(in + 4 * 16) << 2, - SafeLoadAs(in + 4 * 16) >> 28 | SafeLoadAs(in + 4 * 17) << 4, - SafeLoadAs(in + 4 * 17) >> 26 | SafeLoadAs(in + 4 * 18) << 6, - }; - shifts = simd_batch{0, 0, 0, 0}; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 30-bit bundles 20 to 23 - words = simd_batch{ - SafeLoadAs(in + 4 * 18) >> 24 | SafeLoadAs(in + 4 * 19) << 8, - SafeLoadAs(in + 4 * 19) >> 22 | SafeLoadAs(in + 4 * 20) << 10, - SafeLoadAs(in + 4 * 20) >> 20 | SafeLoadAs(in + 4 * 21) << 12, - SafeLoadAs(in + 4 * 21) >> 18 | SafeLoadAs(in + 4 * 22) << 14, - }; - shifts = simd_batch{0, 0, 0, 0}; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 30-bit bundles 24 to 27 - words = simd_batch{ - SafeLoadAs(in + 4 * 22) >> 16 | SafeLoadAs(in + 4 * 23) << 16, - SafeLoadAs(in + 4 * 23) >> 14 | SafeLoadAs(in + 4 * 24) << 18, - SafeLoadAs(in + 4 * 24) >> 12 | SafeLoadAs(in + 4 * 25) << 20, - SafeLoadAs(in + 4 * 25) >> 10 | SafeLoadAs(in + 4 * 26) << 22, - }; - shifts = simd_batch{0, 0, 0, 0}; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 30-bit bundles 28 to 31 - words = simd_batch{ - SafeLoadAs(in + 4 * 26) >> 8 | SafeLoadAs(in + 4 * 27) << 24, - SafeLoadAs(in + 4 * 27) >> 6 | SafeLoadAs(in + 4 * 28) << 26, - SafeLoadAs(in + 4 * 28) >> 4 | SafeLoadAs(in + 4 * 29) << 28, - SafeLoadAs(in + 4 * 29), - }; - shifts = simd_batch{0, 0, 0, 2}; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - in += 30 * 4; - return in; -} - -template <> -const uint8_t* Simd128Unpacker::unpack<31>(const uint8_t* in, uint32_t* out) { - constexpr uint32_t kMask = 0x7fffffff; - - simd_batch masks(kMask); - simd_batch words, shifts; - simd_batch results; - - // extract 31-bit bundles 0 to 3 - words = simd_batch{ - SafeLoadAs(in + 4 * 0), - SafeLoadAs(in + 4 * 0) >> 31 | SafeLoadAs(in + 4 * 1) << 1, - SafeLoadAs(in + 4 * 1) >> 30 | SafeLoadAs(in + 4 * 2) << 2, - SafeLoadAs(in + 4 * 2) >> 29 | SafeLoadAs(in + 4 * 3) << 3, - }; - shifts = simd_batch{0, 0, 0, 0}; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 31-bit bundles 4 to 7 - words = simd_batch{ - SafeLoadAs(in + 4 * 3) >> 28 | SafeLoadAs(in + 4 * 4) << 4, - SafeLoadAs(in + 4 * 4) >> 27 | SafeLoadAs(in + 4 * 5) << 5, - SafeLoadAs(in + 4 * 5) >> 26 | SafeLoadAs(in + 4 * 6) << 6, - SafeLoadAs(in + 4 * 6) >> 25 | SafeLoadAs(in + 4 * 7) << 7, - }; - shifts = simd_batch{0, 0, 0, 0}; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 31-bit bundles 8 to 11 - words = simd_batch{ - SafeLoadAs(in + 4 * 7) >> 24 | SafeLoadAs(in + 4 * 8) << 8, - SafeLoadAs(in + 4 * 8) >> 23 | SafeLoadAs(in + 4 * 9) << 9, - SafeLoadAs(in + 4 * 9) >> 22 | SafeLoadAs(in + 4 * 10) << 10, - SafeLoadAs(in + 4 * 10) >> 21 | SafeLoadAs(in + 4 * 11) << 11, - }; - shifts = simd_batch{0, 0, 0, 0}; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 31-bit bundles 12 to 15 - words = simd_batch{ - SafeLoadAs(in + 4 * 11) >> 20 | SafeLoadAs(in + 4 * 12) << 12, - SafeLoadAs(in + 4 * 12) >> 19 | SafeLoadAs(in + 4 * 13) << 13, - SafeLoadAs(in + 4 * 13) >> 18 | SafeLoadAs(in + 4 * 14) << 14, - SafeLoadAs(in + 4 * 14) >> 17 | SafeLoadAs(in + 4 * 15) << 15, - }; - shifts = simd_batch{0, 0, 0, 0}; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 31-bit bundles 16 to 19 - words = simd_batch{ - SafeLoadAs(in + 4 * 15) >> 16 | SafeLoadAs(in + 4 * 16) << 16, - SafeLoadAs(in + 4 * 16) >> 15 | SafeLoadAs(in + 4 * 17) << 17, - SafeLoadAs(in + 4 * 17) >> 14 | SafeLoadAs(in + 4 * 18) << 18, - SafeLoadAs(in + 4 * 18) >> 13 | SafeLoadAs(in + 4 * 19) << 19, - }; - shifts = simd_batch{0, 0, 0, 0}; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 31-bit bundles 20 to 23 - words = simd_batch{ - SafeLoadAs(in + 4 * 19) >> 12 | SafeLoadAs(in + 4 * 20) << 20, - SafeLoadAs(in + 4 * 20) >> 11 | SafeLoadAs(in + 4 * 21) << 21, - SafeLoadAs(in + 4 * 21) >> 10 | SafeLoadAs(in + 4 * 22) << 22, - SafeLoadAs(in + 4 * 22) >> 9 | SafeLoadAs(in + 4 * 23) << 23, - }; - shifts = simd_batch{0, 0, 0, 0}; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 31-bit bundles 24 to 27 - words = simd_batch{ - SafeLoadAs(in + 4 * 23) >> 8 | SafeLoadAs(in + 4 * 24) << 24, - SafeLoadAs(in + 4 * 24) >> 7 | SafeLoadAs(in + 4 * 25) << 25, - SafeLoadAs(in + 4 * 25) >> 6 | SafeLoadAs(in + 4 * 26) << 26, - SafeLoadAs(in + 4 * 26) >> 5 | SafeLoadAs(in + 4 * 27) << 27, - }; - shifts = simd_batch{0, 0, 0, 0}; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - // extract 31-bit bundles 28 to 31 - words = simd_batch{ - SafeLoadAs(in + 4 * 27) >> 4 | SafeLoadAs(in + 4 * 28) << 28, - SafeLoadAs(in + 4 * 28) >> 3 | SafeLoadAs(in + 4 * 29) << 29, - SafeLoadAs(in + 4 * 29) >> 2 | SafeLoadAs(in + 4 * 30) << 30, - SafeLoadAs(in + 4 * 30), - }; - shifts = simd_batch{0, 0, 0, 1}; - results = (words >> shifts) & masks; - results.store_unaligned(out); - out += 4; - - in += 31 * 4; - return in; -} - -template <> -const uint8_t* Simd128Unpacker::unpack<32>(const uint8_t* in, uint32_t* out) { - std::memcpy(out, in, 32 * sizeof(*out)); - in += 4 * 32; - out += 32; - return in; -} + template + static const uint8_t* unpack(const uint8_t* in, out_type* out) { + return SimdUnpackerForWidth::unpack(in, out); + } +}; } // namespace arrow::internal From cf20fd3d7147a20020ca7915f6e7e7b4f6a0ade5 Mon Sep 17 00:00:00 2001 From: AntoinePrv Date: Tue, 23 Sep 2025 16:13:51 +0200 Subject: [PATCH 28/76] Struct unpacker for width and no generate null/full --- cpp/src/arrow/util/bpacking.cc | 4 +- .../arrow/util/bpacking_dispatch_internal.h | 237 ++++++++++-------- cpp/src/arrow/util/bpacking_neon.cc | 2 +- cpp/src/arrow/util/bpacking_scalar_codegen.py | 98 +++----- cpp/src/arrow/util/bpacking_simd_codegen.py | 102 +++----- 5 files changed, 195 insertions(+), 248 deletions(-) diff --git a/cpp/src/arrow/util/bpacking.cc b/cpp/src/arrow/util/bpacking.cc index 286614156a2..4dec6e83454 100644 --- a/cpp/src/arrow/util/bpacking.cc +++ b/cpp/src/arrow/util/bpacking.cc @@ -46,7 +46,7 @@ int unpack32_sse4_2(const uint8_t* in, uint32_t* out, int batch_size, int num_bi #endif int unpack32_scalar(const uint8_t* in, uint32_t* out, int batch_size, int num_bits) { - return unpack_jump32>(in, out, batch_size, num_bits); + return unpack_jump32(in, out, batch_size, num_bits); } namespace { @@ -80,7 +80,7 @@ int unpack32(const uint8_t* in, uint32_t* out, int batch_size, int num_bits) { } int unpack64_scalar(const uint8_t* in, uint64_t* out, int batch_size, int num_bits) { - return unpack_jump64>(in, out, batch_size, num_bits); + return unpack_jump64(in, out, batch_size, num_bits); } int unpack64(const uint8_t* in, uint64_t* out, int batch_size, int num_bits) { diff --git a/cpp/src/arrow/util/bpacking_dispatch_internal.h b/cpp/src/arrow/util/bpacking_dispatch_internal.h index 92b8bc867d7..91e57e664b8 100644 --- a/cpp/src/arrow/util/bpacking_dispatch_internal.h +++ b/cpp/src/arrow/util/bpacking_dispatch_internal.h @@ -15,93 +15,120 @@ // specific language governing permissions and limitations // under the License. +#include + +#include "arrow/util/endian.h" #include "arrow/util/logging.h" +#include "arrow/util/ubsan.h" namespace arrow::internal { -template -int unpack(const uint8_t* in, typename Unpacker::out_type* out, int batch_size, - int num_bits) { - constexpr auto kValuesUnpacked = Unpacker::kValuesUnpacked; +template + +int unpack_null(const uint8_t* in, Uint* out, int batch_size) { + std::memset(out, 0, batch_size * sizeof(Uint)); + return batch_size; +} + +template +int unpack_full(const uint8_t* in, Uint* out, int batch_size) { + if constexpr (ARROW_LITTLE_ENDIAN == 1) { + std::memcpy(out, in, batch_size * sizeof(Uint)); + } else { + using bit_util::FromLittleEndian; + using util::SafeLoadAs; + + for (int k = 0; k < batch_size; k += 1) { + out[k] = FromLittleEndian(SafeLoadAs(in + (k * sizeof(Uint)))); + } + } + return batch_size; +} + +template typename Unpacker, typename Uint> +int unpack(const uint8_t* in, Uint* out, int batch_size) { + using UnpackerForWidth = Unpacker; + + constexpr auto kValuesUnpacked = UnpackerForWidth::kValuesUnpacked; batch_size = batch_size / kValuesUnpacked * kValuesUnpacked; int num_loops = batch_size / kValuesUnpacked; for (int i = 0; i < num_loops; ++i) { - in = Unpacker::template unpack(in, out + i * kValuesUnpacked); + in = UnpackerForWidth::unpack(in, out + i * kValuesUnpacked); } return batch_size; } -template +template