From 17972791577022b163d3e7dd62ee2f994f017637 Mon Sep 17 00:00:00 2001 From: ZMZ Date: Mon, 12 Apr 2021 15:11:27 +0800 Subject: [PATCH 001/719] support null data type in gdv --- cpp/src/gandiva/CMakeLists.txt | 2 + cpp/src/gandiva/annotator.cc | 18 ++- cpp/src/gandiva/dex.h | 8 ++ cpp/src/gandiva/dex_visitor.h | 3 + cpp/src/gandiva/exported_funcs.h | 6 + cpp/src/gandiva/expr_decomposer.cc | 7 ++ cpp/src/gandiva/expr_decomposer.h | 1 + cpp/src/gandiva/expr_validator.cc | 22 +++- cpp/src/gandiva/expr_validator.h | 1 + cpp/src/gandiva/function_registry.cc | 4 + cpp/src/gandiva/function_registry_common.h | 1 + cpp/src/gandiva/function_registry_null.h | 40 +++++++ cpp/src/gandiva/llvm_generator.cc | 28 ++++- cpp/src/gandiva/llvm_generator.h | 1 + cpp/src/gandiva/node.h | 15 ++- cpp/src/gandiva/node_visitor.h | 2 + cpp/src/gandiva/null_ops.cc | 50 ++++++++ cpp/src/gandiva/null_ops.h | 30 +++++ cpp/src/gandiva/null_ops_test.cc | 30 +++++ cpp/src/gandiva/precompiled/types.h | 1 + cpp/src/gandiva/projector.cc | 13 ++- cpp/src/gandiva/tests/CMakeLists.txt | 1 + cpp/src/gandiva/tests/null_test.cc | 130 +++++++++++++++++++++ cpp/src/gandiva/tree_expr_builder.cc | 2 + 24 files changed, 399 insertions(+), 17 deletions(-) create mode 100644 cpp/src/gandiva/function_registry_null.h create mode 100644 cpp/src/gandiva/null_ops.cc create mode 100644 cpp/src/gandiva/null_ops.h create mode 100644 cpp/src/gandiva/null_ops_test.cc create mode 100644 cpp/src/gandiva/tests/null_test.cc diff --git a/cpp/src/gandiva/CMakeLists.txt b/cpp/src/gandiva/CMakeLists.txt index fcdaf97d526..e9bbc19c5b0 100644 --- a/cpp/src/gandiva/CMakeLists.txt +++ b/cpp/src/gandiva/CMakeLists.txt @@ -69,6 +69,7 @@ set(SRC_FILES expression_registry.cc exported_funcs_registry.cc filter.cc + null_ops.cc function_ir_builder.cc function_registry.cc function_registry_arithmetic.cc @@ -233,6 +234,7 @@ add_gandiva_test(internals-test random_generator_holder_test.cc hash_utils_test.cc gdv_function_stubs_test.cc + 
null_ops_test.cc EXTRA_DEPENDENCIES LLVM::LLVM_INTERFACE ${GANDIVA_OPENSSL_LIBS} diff --git a/cpp/src/gandiva/annotator.cc b/cpp/src/gandiva/annotator.cc index f6acaff1804..8d0eb145e17 100644 --- a/cpp/src/gandiva/annotator.cc +++ b/cpp/src/gandiva/annotator.cc @@ -77,13 +77,21 @@ void Annotator::PrepareBuffersForField(const FieldDescriptor& desc, ++buffer_idx; } - uint8_t* data_buf = const_cast(array_data.buffers[buffer_idx]->data()); - eval_batch->SetBuffer(desc.data_idx(), data_buf, array_data.offset); + if (array_data.type->id() == arrow::Type::NA) { + eval_batch->SetBuffer(desc.data_idx(), nullptr, array_data.offset); + } else { + uint8_t* data_buf = const_cast(array_data.buffers[buffer_idx]->data()); + eval_batch->SetBuffer(desc.data_idx(), data_buf, array_data.offset); + } if (is_output) { // pass in the Buffer object for output data buffers. Can be used for resizing. - uint8_t* data_buf_ptr = - reinterpret_cast(array_data.buffers[buffer_idx].get()); - eval_batch->SetBuffer(desc.data_buffer_ptr_idx(), data_buf_ptr, array_data.offset); + if (array_data.type->id() == arrow::Type::NA) { + eval_batch->SetBuffer(desc.data_buffer_ptr_idx(), nullptr, array_data.offset); + } else { + uint8_t* data_buf_ptr = + reinterpret_cast(array_data.buffers[buffer_idx].get()); + eval_batch->SetBuffer(desc.data_buffer_ptr_idx(), data_buf_ptr, array_data.offset); + } } } diff --git a/cpp/src/gandiva/dex.h b/cpp/src/gandiva/dex.h index 3920f82f1d7..c4b3a81ca2e 100644 --- a/cpp/src/gandiva/dex.h +++ b/cpp/src/gandiva/dex.h @@ -205,6 +205,14 @@ class GANDIVA_EXPORT LiteralDex : public Dex { LiteralHolder holder_; }; +/// decomposed expression for a null literal. +class GANDIVA_EXPORT NullLiteralDex : public Dex { + public: + NullLiteralDex() {} + + void Accept(DexVisitor& visitor) override { visitor.Visit(*this); } +}; + /// decomposed if-else expression. 
class GANDIVA_EXPORT IfDex : public Dex { public: diff --git a/cpp/src/gandiva/dex_visitor.h b/cpp/src/gandiva/dex_visitor.h index ba5de970dda..07277286b53 100644 --- a/cpp/src/gandiva/dex_visitor.h +++ b/cpp/src/gandiva/dex_visitor.h @@ -30,6 +30,7 @@ class VectorReadFixedLenValueDex; class VectorReadVarLenValueDex; class LocalBitMapValidityDex; class LiteralDex; +class NullLiteralDex; class TrueDex; class FalseDex; class NonNullableFuncDex; @@ -53,6 +54,7 @@ class GANDIVA_EXPORT DexVisitor { virtual void Visit(const TrueDex& dex) = 0; virtual void Visit(const FalseDex& dex) = 0; virtual void Visit(const LiteralDex& dex) = 0; + virtual void Visit(const NullLiteralDex& dex) = 0; virtual void Visit(const NonNullableFuncDex& dex) = 0; virtual void Visit(const NullableNeverFuncDex& dex) = 0; virtual void Visit(const NullableInternalFuncDex& dex) = 0; @@ -77,6 +79,7 @@ class GANDIVA_EXPORT DexDefaultVisitor : public DexVisitor { VISIT_DCHECK(TrueDex) VISIT_DCHECK(FalseDex) VISIT_DCHECK(LiteralDex) + VISIT_DCHECK(NullLiteralDex) VISIT_DCHECK(NonNullableFuncDex) VISIT_DCHECK(NullableNeverFuncDex) VISIT_DCHECK(NullableInternalFuncDex) diff --git a/cpp/src/gandiva/exported_funcs.h b/cpp/src/gandiva/exported_funcs.h index 58205266094..1dc1f57f770 100644 --- a/cpp/src/gandiva/exported_funcs.h +++ b/cpp/src/gandiva/exported_funcs.h @@ -32,6 +32,12 @@ class ExportedFuncsBase { virtual void AddMappings(Engine* engine) const = 0; }; +// Class for exporting Null functions +class ExportedNullFunctions : public ExportedFuncsBase { + void AddMappings(Engine* engine) const override; +}; +REGISTER_EXPORTED_FUNCS(ExportedNullFunctions); + // Class for exporting Stub functions class ExportedStubFunctions : public ExportedFuncsBase { void AddMappings(Engine* engine) const override; diff --git a/cpp/src/gandiva/expr_decomposer.cc b/cpp/src/gandiva/expr_decomposer.cc index 07252b42fd2..834a7211e89 100644 --- a/cpp/src/gandiva/expr_decomposer.cc +++ b/cpp/src/gandiva/expr_decomposer.cc @@ 
-223,6 +223,13 @@ Status ExprDecomposer::Visit(const LiteralNode& node) { return Status::OK(); } +Status ExprDecomposer::Visit(const NullLiteralNode& node) { + auto value_dex = std::make_shared(); + auto validity_dex = std::make_shared(); + result_ = std::make_shared(validity_dex, value_dex); + return Status::OK(); +} + // The bolow functions use a stack to detect : // a. nested if-else expressions. // In such cases, the local bitmap can be re-used. diff --git a/cpp/src/gandiva/expr_decomposer.h b/cpp/src/gandiva/expr_decomposer.h index 3e8e67de255..ebc64e227db 100644 --- a/cpp/src/gandiva/expr_decomposer.h +++ b/cpp/src/gandiva/expr_decomposer.h @@ -63,6 +63,7 @@ class GANDIVA_EXPORT ExprDecomposer : public NodeVisitor { Status Visit(const FunctionNode& node) override; Status Visit(const IfNode& node) override; Status Visit(const LiteralNode& node) override; + Status Visit(const NullLiteralNode& node) override; Status Visit(const BooleanNode& node) override; Status Visit(const InExpressionNode& node) override; Status Visit(const InExpressionNode& node) override; diff --git a/cpp/src/gandiva/expr_validator.cc b/cpp/src/gandiva/expr_validator.cc index fd46c2894b9..47e11f3a836 100644 --- a/cpp/src/gandiva/expr_validator.cc +++ b/cpp/src/gandiva/expr_validator.cc @@ -42,11 +42,14 @@ Status ExprValidator::Validate(const ExpressionPtr& expr) { } Status ExprValidator::Visit(const FieldNode& node) { - auto llvm_type = types_->IRType(node.return_type()->id()); - ARROW_RETURN_IF(llvm_type == nullptr, - Status::ExpressionValidationError("Field ", node.field()->name(), - " has unsupported data type ", - node.return_type()->name())); + auto return_type = node.return_type(); + if (return_type->id() != arrow::Type::NA) { + auto llvm_type = types_->DataVecType(node.return_type()); + ARROW_RETURN_IF(llvm_type == nullptr, + Status::ExpressionValidationError("Field ", node.field()->name(), + " has unsupported data type ", + node.return_type()->name())); + } // Ensure that field is 
found in schema auto field_in_schema_entry = field_map_.find(node.field()->name()); @@ -120,6 +123,15 @@ Status ExprValidator::Visit(const LiteralNode& node) { return Status::OK(); } +Status ExprValidator::Visit(const NullLiteralNode& node) { + auto llvm_type = types_->DataVecType(node.return_type()); + ARROW_RETURN_IF(llvm_type != nullptr, + Status::ExpressionValidationError("Should be data type ", + node.return_type()->name())); + + return Status::OK(); +} + Status ExprValidator::Visit(const BooleanNode& node) { ARROW_RETURN_IF( node.children().size() < 2, diff --git a/cpp/src/gandiva/expr_validator.h b/cpp/src/gandiva/expr_validator.h index e25afe5e7e8..b3399ff517c 100644 --- a/cpp/src/gandiva/expr_validator.h +++ b/cpp/src/gandiva/expr_validator.h @@ -57,6 +57,7 @@ class ExprValidator : public NodeVisitor { Status Visit(const FunctionNode& node) override; Status Visit(const IfNode& node) override; Status Visit(const LiteralNode& node) override; + Status Visit(const NullLiteralNode& node) override; Status Visit(const BooleanNode& node) override; Status Visit(const InExpressionNode& node) override; Status Visit(const InExpressionNode& node) override; diff --git a/cpp/src/gandiva/function_registry.cc b/cpp/src/gandiva/function_registry.cc index d5d015c10b4..2d622124102 100644 --- a/cpp/src/gandiva/function_registry.cc +++ b/cpp/src/gandiva/function_registry.cc @@ -20,6 +20,7 @@ #include "gandiva/function_registry_datetime.h" #include "gandiva/function_registry_hash.h" #include "gandiva/function_registry_math_ops.h" +#include "gandiva/function_registry_null.h" #include "gandiva/function_registry_string.h" #include "gandiva/function_registry_timestamp_arithmetic.h" @@ -65,6 +66,9 @@ SignatureMap FunctionRegistry::InitPCMap() { auto v6 = GetDateTimeArithmeticFunctionRegistry(); pc_registry_.insert(std::end(pc_registry_), v6.begin(), v6.end()); + auto v8 = GetNullFunctionRegistry(); + pc_registry_.insert(std::end(pc_registry_), v8.begin(), v8.end()); + for (auto& elem 
: pc_registry_) { for (auto& func_signature : elem.signatures()) { map.insert(std::make_pair(&(func_signature), &elem)); diff --git a/cpp/src/gandiva/function_registry_common.h b/cpp/src/gandiva/function_registry_common.h index d1555fba3ce..1ccba270c03 100644 --- a/cpp/src/gandiva/function_registry_common.h +++ b/cpp/src/gandiva/function_registry_common.h @@ -43,6 +43,7 @@ using arrow::int16; using arrow::int32; using arrow::int64; using arrow::int8; +using arrow::null; using arrow::uint16; using arrow::uint32; using arrow::uint64; diff --git a/cpp/src/gandiva/function_registry_null.h b/cpp/src/gandiva/function_registry_null.h new file mode 100644 index 00000000000..a01cbef6fc1 --- /dev/null +++ b/cpp/src/gandiva/function_registry_null.h @@ -0,0 +1,40 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#pragma once + +#include + +#include "gandiva/native_function.h" + +namespace gandiva { + +std::vector GetNullFunctionRegistry() { + static std::vector null_fn_registry_ = { + NativeFunction("equal", + {"not_equal", "less_than", "less_than_or_equal_to", "greater_than", + "greater_than_or_equal_to"}, + DataTypeVector{null(), null()}, boolean(), kResultNullNever, + "compare_null_null"), + NativeFunction("isnull", {}, DataTypeVector{null()}, boolean(), kResultNullNever, + "isnull_null"), + NativeFunction("isnotnull", {}, DataTypeVector{null()}, boolean(), kResultNullNever, + "isnotnull_null")}; + return null_fn_registry_; +} + +} // namespace gandiva diff --git a/cpp/src/gandiva/llvm_generator.cc b/cpp/src/gandiva/llvm_generator.cc index 1a80f1e7586..4ab96eb6999 100644 --- a/cpp/src/gandiva/llvm_generator.cc +++ b/cpp/src/gandiva/llvm_generator.cc @@ -170,6 +170,9 @@ llvm::Value* LLVMGenerator::GetDataReference(llvm::Value* arg_addrs, int idx, llvm::Value* load = LoadVectorAtIndex(arg_addrs, idx, name); llvm::Type* base_type = types()->DataVecType(field->type()); llvm::Value* ret; + if (base_type == nullptr) { + return nullptr; + } if (base_type->isPointerTy()) { ret = ir_builder()->CreateIntToPtr(load, base_type, name + "_darray"); } else { @@ -363,6 +366,8 @@ Status LLVMGenerator::CodeGenExprValue(DexPtr value_expr, int buffer_count, AddFunctionCall("gdv_fn_populate_varlen_vector", types()->i32_type(), {arg_context_ptr, output_buffer_ptr_ref, output_offset_ref, loop_var, output_value->data(), output_value->length()}); + } else if (output_type_id == arrow::Type::NA) { + // Do nothing when data type is null } else { return Status::NotImplemented("output type ", output->Type()->ToString(), " not supported"); @@ -452,6 +457,10 @@ void LLVMGenerator::ComputeBitMapsForExpr(const CompiledExpr& compiled_expr, // Extract the destination bitmap address. 
int out_idx = compiled_expr.output()->validity_idx(); uint8_t* dst_bitmap = eval_batch.GetBuffer(out_idx); + if (dst_bitmap == nullptr) { + // Return when dst_bitmap is null meaning data type is null + return; + } // Compute the destination bitmap. if (selection_vector == nullptr) { accumulator.ComputeResult(dst_bitmap); @@ -556,6 +565,9 @@ void LLVMGenerator::Visitor::Visit(const VectorReadFixedLenValueDex& dex) { break; } + case arrow::Type::NA: + break; + default: { auto slot_offset = builder->CreateGEP(slot_ref, slot_index); slot_value = builder->CreateLoad(slot_offset, dex.FieldName()); @@ -720,6 +732,13 @@ void LLVMGenerator::Visitor::Visit(const LiteralDex& dex) { result_.reset(new LValue(value, len)); } +void LLVMGenerator::Visitor::Visit(const NullLiteralDex& dex) { + llvm::Value* value = nullptr; + llvm::Value* len = nullptr; + ADD_VISITOR_TRACE("visit Literal null"); + result_.reset(new LValue(value, len)); +} + void LLVMGenerator::Visitor::Visit(const NonNullableFuncDex& dex) { const std::string& function_name = dex.func_descriptor()->name(); ADD_VISITOR_TRACE("visit NonNullableFunc base function " + function_name); @@ -1240,10 +1259,11 @@ std::vector LLVMGenerator::Visitor::BuildParams( // build value. DexPtr value_expr = pair->value_expr(); value_expr->Accept(*this); - LValue& result_ref = *result(); - - // append all the parameters corresponding to this LValue. - result_ref.AppendFunctionParams(¶ms); + if (auto result_ptr = result()) { + LValue& result_ref = *result_ptr; + // append all the parameters corresponding to this LValue. + result_ref.AppendFunctionParams(¶ms); + } // build validity. 
if (with_validity) { diff --git a/cpp/src/gandiva/llvm_generator.h b/cpp/src/gandiva/llvm_generator.h index 8ff9711c0f9..a6fa1bb0339 100644 --- a/cpp/src/gandiva/llvm_generator.h +++ b/cpp/src/gandiva/llvm_generator.h @@ -100,6 +100,7 @@ class GANDIVA_EXPORT LLVMGenerator { void Visit(const TrueDex& dex) override; void Visit(const FalseDex& dex) override; void Visit(const LiteralDex& dex) override; + void Visit(const NullLiteralDex& dex) override; void Visit(const NonNullableFuncDex& dex) override; void Visit(const NullableNeverFuncDex& dex) override; void Visit(const NullableInternalFuncDex& dex) override; diff --git a/cpp/src/gandiva/node.h b/cpp/src/gandiva/node.h index 20807d4a0cb..6e4c22e93b1 100644 --- a/cpp/src/gandiva/node.h +++ b/cpp/src/gandiva/node.h @@ -23,7 +23,6 @@ #include #include "arrow/status.h" - #include "gandiva/arrow.h" #include "gandiva/func_descriptor.h" #include "gandiva/gandiva_aliases.h" @@ -94,6 +93,20 @@ class GANDIVA_EXPORT LiteralNode : public Node { bool is_null_; }; +/// \brief Node in the expression tree, representing a NullLiteralNode. +class GANDIVA_EXPORT NullLiteralNode : public Node { + public: + NullLiteralNode() : Node(arrow::null()) {} + + Status Accept(NodeVisitor& visitor) const override { return visitor.Visit(*this); } + + std::string ToString() const override { + std::stringstream ss; + ss << "(const " << return_type()->ToString() << ") null"; + return ss.str(); + } +}; + /// \brief Node in the expression tree, representing an arrow field. 
class GANDIVA_EXPORT FieldNode : public Node { public: diff --git a/cpp/src/gandiva/node_visitor.h b/cpp/src/gandiva/node_visitor.h index b118e496383..c8516907788 100644 --- a/cpp/src/gandiva/node_visitor.h +++ b/cpp/src/gandiva/node_visitor.h @@ -30,6 +30,7 @@ class FieldNode; class FunctionNode; class IfNode; class LiteralNode; +class NullLiteralNode; class BooleanNode; template class InExpressionNode; @@ -43,6 +44,7 @@ class GANDIVA_EXPORT NodeVisitor { virtual Status Visit(const FunctionNode& node) = 0; virtual Status Visit(const IfNode& node) = 0; virtual Status Visit(const LiteralNode& node) = 0; + virtual Status Visit(const NullLiteralNode& node) = 0; virtual Status Visit(const BooleanNode& node) = 0; virtual Status Visit(const InExpressionNode& node) = 0; virtual Status Visit(const InExpressionNode& node) = 0; diff --git a/cpp/src/gandiva/null_ops.cc b/cpp/src/gandiva/null_ops.cc new file mode 100644 index 00000000000..79d21ae6c9a --- /dev/null +++ b/cpp/src/gandiva/null_ops.cc @@ -0,0 +1,50 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+#include "gandiva/null_ops.h" + +#include "gandiva/engine.h" +#include "gandiva/exported_funcs.h" +#include "gandiva/gdv_function_stubs.h" + +/// Stub functions that can be accessed from LLVM or the pre-compiled library. + +extern "C" { +bool compare_null_null() { return false; } + +bool isnull_null() { return true; } + +bool isnotnull_null() { return false; } +} + +namespace gandiva { +void ExportedNullFunctions::AddMappings(Engine* engine) const { + std::vector args; + auto types = engine->types(); + + args = {types->i1_type(), types->i1_type()}; + engine->AddGlobalMappingForFunc("compare_null_null", types->i1_type() /*return_type*/, + args, reinterpret_cast(compare_null_null)); + + args = {types->i1_type()}; + engine->AddGlobalMappingForFunc("isnull_null", types->i1_type() /*return_type*/, args, + reinterpret_cast(isnull_null)); + + args = {types->i1_type()}; + engine->AddGlobalMappingForFunc("isnotnull_null", types->i1_type() /*return_type*/, + args, reinterpret_cast(isnotnull_null)); +} +} // namespace gandiva diff --git a/cpp/src/gandiva/null_ops.h b/cpp/src/gandiva/null_ops.h new file mode 100644 index 00000000000..65bce6fe149 --- /dev/null +++ b/cpp/src/gandiva/null_ops.h @@ -0,0 +1,30 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. 
See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include + +/// Stub functions that can be accessed from LLVM. +extern "C" { + +bool compare_null_null(); + +bool isnull_null(); + +bool isnotnull_null(); +} \ No newline at end of file diff --git a/cpp/src/gandiva/null_ops_test.cc b/cpp/src/gandiva/null_ops_test.cc new file mode 100644 index 00000000000..3ef351cb773 --- /dev/null +++ b/cpp/src/gandiva/null_ops_test.cc @@ -0,0 +1,30 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include +#include + +#include "gandiva/precompiled/types.h" + +namespace gandiva { + +TEST(TestNullOps, Test) { + EXPECT_FALSE(compare_null_null()); + EXPECT_TRUE(isnull_null()); + EXPECT_FALSE(isnotnull_null()); +} +} // namespace gandiva diff --git a/cpp/src/gandiva/precompiled/types.h b/cpp/src/gandiva/precompiled/types.h index 1b0f96e0ab7..2c0bbd47f1e 100644 --- a/cpp/src/gandiva/precompiled/types.h +++ b/cpp/src/gandiva/precompiled/types.h @@ -19,6 +19,7 @@ #include #include "gandiva/gdv_function_stubs.h" +#include "gandiva/null_ops.h" // Use the same names as in arrow data types. Makes it easy to write pre-processor macros. 
using gdv_boolean = bool; diff --git a/cpp/src/gandiva/projector.cc b/cpp/src/gandiva/projector.cc index 734720c64c9..50440dd5e0a 100644 --- a/cpp/src/gandiva/projector.cc +++ b/cpp/src/gandiva/projector.cc @@ -24,7 +24,6 @@ #include "arrow/util/hash_util.h" #include "arrow/util/logging.h" - #include "gandiva/cache.h" #include "gandiva/expr_validator.h" #include "gandiva/llvm_generator.h" @@ -289,6 +288,8 @@ Status Projector::AllocArrayData(const DataTypePtr& type, int64_t num_records, } else if (arrow::is_binary_like(type_id)) { // we don't know the expected size for varlen output vectors. data_len = 0; + } else if (type_id == arrow::Type::NA) { + data_len = 0; } else { return Status::Invalid("Unsupported output data type " + type->ToString()); } @@ -301,7 +302,11 @@ Status Projector::AllocArrayData(const DataTypePtr& type, int64_t num_records, } buffers.push_back(std::move(data_buffer)); - *array_data = arrow::ArrayData::Make(type, num_records, std::move(buffers)); + if (type_id == arrow::Type::NA) { + *array_data = arrow::ArrayData::Make(type, num_records, {nullptr}); + } else { + *array_data = arrow::ArrayData::Make(type, num_records, std::move(buffers)); + } return Status::OK(); } @@ -350,6 +355,10 @@ Status Projector::ValidateArrayDataCapacity(const arrow::ArrayData& array_data, int64_t data_len = array_data.buffers[1]->capacity(); ARROW_RETURN_IF(data_len < min_data_len, Status::Invalid("Data buffer too small for ", field.name())); + } else if (type_id == arrow::Type::NA) { + ARROW_RETURN_IF(array_data.buffers.size() == 1 && array_data.buffers[0] == nullptr, + Status::Invalid("Data buffer should be nullptr for null typed field", + field.name())); } else { return Status::Invalid("Unsupported output data type " + field.type()->ToString()); } diff --git a/cpp/src/gandiva/tests/CMakeLists.txt b/cpp/src/gandiva/tests/CMakeLists.txt index 5fa2da16c63..a57085c589e 100644 --- a/cpp/src/gandiva/tests/CMakeLists.txt +++ b/cpp/src/gandiva/tests/CMakeLists.txt @@ -25,6 
+25,7 @@ add_gandiva_test(binary_test) add_gandiva_test(date_time_test) add_gandiva_test(to_string_test) add_gandiva_test(utf8_test) +add_gandiva_test(null_test) add_gandiva_test(hash_test) add_gandiva_test(in_expr_test) add_gandiva_test(null_validity_test) diff --git a/cpp/src/gandiva/tests/null_test.cc b/cpp/src/gandiva/tests/null_test.cc new file mode 100644 index 00000000000..a3ff18baa32 --- /dev/null +++ b/cpp/src/gandiva/tests/null_test.cc @@ -0,0 +1,130 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include + +#include "arrow/memory_pool.h" +#include "arrow/status.h" +#include "gandiva/projector.h" +#include "gandiva/tests/test_util.h" +#include "gandiva/tree_expr_builder.h" + +namespace gandiva { + +using arrow::boolean; +using arrow::null; + +class TestNull : public ::testing::Test { + public: + void SetUp() { pool_ = arrow::default_memory_pool(); } + + protected: + arrow::MemoryPool* pool_; +}; + +TEST_F(TestNull, TestSimple) { + // schema for input fields + auto field_null = field("field_null", null()); + auto schema = arrow::schema({field_null}); + + auto literal_null = TreeExprBuilder::MakeNull(arrow::null()); + auto node_field_null = TreeExprBuilder::MakeField(field_null); + + // output fields + auto res_1 = field("res1", null()); + auto res_2 = field("res2", null()); + auto expr_1 = TreeExprBuilder::MakeExpression(literal_null, res_1); + auto expr_2 = TreeExprBuilder::MakeExpression(node_field_null, res_2); + + // Build a projector for the expressions. + std::shared_ptr projector; + auto status = + Projector::Make(schema, {expr_1, expr_2}, TestConfiguration(), &projector); + EXPECT_TRUE(status.ok()) << status.message(); + + arrow::ArrayVector outputs; + auto nb = std::make_shared(); + auto _ = nb->AppendNulls(4); + std::shared_ptr null_array; + _ = nb->Finish(&null_array); + auto in_batch = arrow::RecordBatch::Make(schema, 4, {null_array}); + status = projector->Evaluate(*in_batch, pool_, &outputs); + EXPECT_TRUE(status.ok()); + + // Validate results + EXPECT_ARROW_ARRAY_EQUALS(null_array, outputs.at(0)); + EXPECT_ARROW_ARRAY_EQUALS(null_array, outputs.at(1)); +} + +TEST_F(TestNull, TestOps) { + // schema for input fields + auto field_null = field("field_null", null()); + auto schema = arrow::schema({field_null}); + + // output fields + auto res_1 = field("res1", boolean()); + auto res_2 = field("res2", boolean()); + auto res_3 = field("res3", boolean()); + auto res_4 = field("res4", boolean()); + auto res_5 = field("res5", boolean()); + auto res_6 
= field("res6", boolean()); + auto res_7 = field("res7", boolean()); + auto res_8 = field("res8", boolean()); + auto expr_1 = TreeExprBuilder::MakeExpression("equal", {field_null, field_null}, res_1); + auto expr_2 = + TreeExprBuilder::MakeExpression("not_equal", {field_null, field_null}, res_2); + auto expr_3 = + TreeExprBuilder::MakeExpression("less_than", {field_null, field_null}, res_3); + auto expr_4 = TreeExprBuilder::MakeExpression("less_than_or_equal_to", + {field_null, field_null}, res_4); + auto expr_5 = + TreeExprBuilder::MakeExpression("greater_than", {field_null, field_null}, res_5); + auto expr_6 = TreeExprBuilder::MakeExpression("greater_than_or_equal_to", + {field_null, field_null}, res_6); + auto expr_7 = TreeExprBuilder::MakeExpression("isnull", {field_null}, res_7); + auto expr_8 = TreeExprBuilder::MakeExpression("isnotnull", {field_null}, res_8); + + // Build a projector for the expressions. + std::shared_ptr projector; + auto status = Projector::Make( + schema, {expr_1, expr_2, expr_3, expr_4, expr_5, expr_6, expr_7, expr_8}, + TestConfiguration(), &projector); + EXPECT_TRUE(status.ok()) << status.message(); + + arrow::ArrayVector outputs; + auto nb = std::make_shared(); + auto _ = nb->AppendNulls(4); + std::shared_ptr null_array; + _ = nb->Finish(&null_array); + auto in_batch = arrow::RecordBatch::Make(schema, 4, {null_array}); + status = projector->Evaluate(*in_batch, pool_, &outputs); + EXPECT_TRUE(status.ok()); + + // Validate results + auto exp_true = MakeArrowArrayBool({true, true, true, true}, {true, true, true, true}); + auto exp_false = + MakeArrowArrayBool({false, false, false, false}, {true, true, true, true}); + for (int i = 0; i < 8; i++) { + if (i == 6) { + EXPECT_ARROW_ARRAY_EQUALS(exp_true, outputs.at(i)); + } else { + EXPECT_ARROW_ARRAY_EQUALS(exp_false, outputs.at(i)); + } + } +} + +} // namespace gandiva diff --git a/cpp/src/gandiva/tree_expr_builder.cc b/cpp/src/gandiva/tree_expr_builder.cc index b27b92010e8..7a66cb0a49c 
100644 --- a/cpp/src/gandiva/tree_expr_builder.cc +++ b/cpp/src/gandiva/tree_expr_builder.cc @@ -105,6 +105,8 @@ NodePtr TreeExprBuilder::MakeNull(DataTypePtr data_type) { DecimalScalar128 literal(decimal_type->precision(), decimal_type->scale()); return std::make_shared(data_type, LiteralHolder(literal), true); } + case arrow::Type::NA: + return std::make_shared(); default: return nullptr; } From e493e940ce56de7b268ad9b9f22798126616335c Mon Sep 17 00:00:00 2001 From: ZMZ Date: Tue, 13 Apr 2021 09:39:09 +0800 Subject: [PATCH 002/719] update compare function return type --- cpp/src/gandiva/function_registry_null.h | 2 +- cpp/src/gandiva/llvm_generator.cc | 2 +- cpp/src/gandiva/null_ops.cc | 8 ++++---- cpp/src/gandiva/null_ops.h | 6 +++--- cpp/src/gandiva/null_ops_test.cc | 6 +++--- cpp/src/gandiva/tests/null_test.cc | 22 ++++++++++------------ 6 files changed, 22 insertions(+), 24 deletions(-) diff --git a/cpp/src/gandiva/function_registry_null.h b/cpp/src/gandiva/function_registry_null.h index a01cbef6fc1..ab45e6f4e41 100644 --- a/cpp/src/gandiva/function_registry_null.h +++ b/cpp/src/gandiva/function_registry_null.h @@ -28,7 +28,7 @@ std::vector GetNullFunctionRegistry() { NativeFunction("equal", {"not_equal", "less_than", "less_than_or_equal_to", "greater_than", "greater_than_or_equal_to"}, - DataTypeVector{null(), null()}, boolean(), kResultNullNever, + DataTypeVector{null(), null()}, null(), kResultNullNever, "compare_null_null"), NativeFunction("isnull", {}, DataTypeVector{null()}, boolean(), kResultNullNever, "isnull_null"), diff --git a/cpp/src/gandiva/llvm_generator.cc b/cpp/src/gandiva/llvm_generator.cc index 4ab96eb6999..f7f1d464474 100644 --- a/cpp/src/gandiva/llvm_generator.cc +++ b/cpp/src/gandiva/llvm_generator.cc @@ -500,7 +500,7 @@ llvm::Value* LLVMGenerator::AddFunctionCall(const std::string& full_name, // build a call to the llvm function. 
llvm::Value* value; - if (ret_type->isVoidTy()) { + if (ret_type == nullptr || ret_type->isVoidTy()) { // void functions can't have a name for the call. value = ir_builder()->CreateCall(fn, args); } else { diff --git a/cpp/src/gandiva/null_ops.cc b/cpp/src/gandiva/null_ops.cc index 79d21ae6c9a..b7179a8e8be 100644 --- a/cpp/src/gandiva/null_ops.cc +++ b/cpp/src/gandiva/null_ops.cc @@ -23,11 +23,11 @@ /// Stub functions that can be accessed from LLVM or the pre-compiled library. extern "C" { -bool compare_null_null() { return false; } +void compare_null_null(bool in1_valid, bool in2_valid) {} -bool isnull_null() { return true; } +bool isnull_null(bool in_valid) { return true; } -bool isnotnull_null() { return false; } +bool isnotnull_null(bool in_valid) { return false; } } namespace gandiva { @@ -36,7 +36,7 @@ void ExportedNullFunctions::AddMappings(Engine* engine) const { auto types = engine->types(); args = {types->i1_type(), types->i1_type()}; - engine->AddGlobalMappingForFunc("compare_null_null", types->i1_type() /*return_type*/, + engine->AddGlobalMappingForFunc("compare_null_null", types->void_type() /*return_type*/, args, reinterpret_cast(compare_null_null)); args = {types->i1_type()}; diff --git a/cpp/src/gandiva/null_ops.h b/cpp/src/gandiva/null_ops.h index 65bce6fe149..492eb6033cd 100644 --- a/cpp/src/gandiva/null_ops.h +++ b/cpp/src/gandiva/null_ops.h @@ -22,9 +22,9 @@ /// Stub functions that can be accessed from LLVM. 
extern "C" { -bool compare_null_null(); +void compare_null_null(bool in1_valid, bool in2_valid); -bool isnull_null(); +bool isnull_null(bool in_valid); -bool isnotnull_null(); +bool isnotnull_null(bool in_valid); } \ No newline at end of file diff --git a/cpp/src/gandiva/null_ops_test.cc b/cpp/src/gandiva/null_ops_test.cc index 3ef351cb773..a979b82a771 100644 --- a/cpp/src/gandiva/null_ops_test.cc +++ b/cpp/src/gandiva/null_ops_test.cc @@ -23,8 +23,8 @@ namespace gandiva { TEST(TestNullOps, Test) { - EXPECT_FALSE(compare_null_null()); - EXPECT_TRUE(isnull_null()); - EXPECT_FALSE(isnotnull_null()); + compare_null_null(true, true); + EXPECT_TRUE(isnull_null(true)); + EXPECT_FALSE(isnotnull_null(true)); } } // namespace gandiva diff --git a/cpp/src/gandiva/tests/null_test.cc b/cpp/src/gandiva/tests/null_test.cc index a3ff18baa32..e018ab0dff4 100644 --- a/cpp/src/gandiva/tests/null_test.cc +++ b/cpp/src/gandiva/tests/null_test.cc @@ -76,12 +76,12 @@ TEST_F(TestNull, TestOps) { auto schema = arrow::schema({field_null}); // output fields - auto res_1 = field("res1", boolean()); - auto res_2 = field("res2", boolean()); - auto res_3 = field("res3", boolean()); - auto res_4 = field("res4", boolean()); - auto res_5 = field("res5", boolean()); - auto res_6 = field("res6", boolean()); + auto res_1 = field("res1", null()); + auto res_2 = field("res2", null()); + auto res_3 = field("res3", null()); + auto res_4 = field("res4", null()); + auto res_5 = field("res5", null()); + auto res_6 = field("res6", null()); auto res_7 = field("res7", boolean()); auto res_8 = field("res8", boolean()); auto expr_1 = TreeExprBuilder::MakeExpression("equal", {field_null, field_null}, res_1); @@ -118,13 +118,11 @@ TEST_F(TestNull, TestOps) { auto exp_true = MakeArrowArrayBool({true, true, true, true}, {true, true, true, true}); auto exp_false = MakeArrowArrayBool({false, false, false, false}, {true, true, true, true}); - for (int i = 0; i < 8; i++) { - if (i == 6) { - 
EXPECT_ARROW_ARRAY_EQUALS(exp_true, outputs.at(i)); - } else { - EXPECT_ARROW_ARRAY_EQUALS(exp_false, outputs.at(i)); - } + for (int i = 0; i < 6; i++) { + EXPECT_EQ(outputs.at(i)->null_count(), 4); } + EXPECT_ARROW_ARRAY_EQUALS(exp_true, outputs.at(6)); + EXPECT_ARROW_ARRAY_EQUALS(exp_false, outputs.at(7)); } } // namespace gandiva From c51c19e2ebfd4cce95c156c3cd63c14113dbde2f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kriszti=C3=A1n=20Sz=C5=B1cs?= Date: Tue, 13 Apr 2021 12:31:23 +0200 Subject: [PATCH 003/719] ARROW-12342: [Packaging] Fix tabulation in crossbow templates for submitting nightly builds MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The generated templates can be checked using `archery crossbow render --arrow-branch master` Closes #10000 from kszucs/crossbow-template-tabulation Authored-by: Krisztián Szűcs Signed-off-by: Krisztián Szűcs --- dev/archery/archery/crossbow/core.py | 3 ++- dev/tasks/linux-packages/travis.linux.arm64.yml | 3 +-- dev/tasks/macros.jinja | 8 ++++---- dev/tasks/tasks.yml | 2 +- 4 files changed, 8 insertions(+), 8 deletions(-) diff --git a/dev/archery/archery/crossbow/core.py b/dev/archery/archery/crossbow/core.py index d4d3d5183b5..9d3074a21d5 100644 --- a/dev/archery/archery/crossbow/core.py +++ b/dev/archery/archery/crossbow/core.py @@ -121,7 +121,8 @@ def format_all(items, pattern): loader = jinja2.FileSystemLoader(searchpath) env = jinja2.Environment(loader=loader, trim_blocks=True, - lstrip_blocks=True) + lstrip_blocks=True, + undefined=jinja2.StrictUndefined) env.filters['format_all'] = format_all template = env.get_template(template) return template.render(**params) diff --git a/dev/tasks/linux-packages/travis.linux.arm64.yml b/dev/tasks/linux-packages/travis.linux.arm64.yml index e9457d6a337..aba604161d8 100644 --- a/dev/tasks/linux-packages/travis.linux.arm64.yml +++ b/dev/tasks/linux-packages/travis.linux.arm64.yml @@ -144,5 +144,4 @@ script: - popd after_success: - {% set 
patterns = upload_extensions | format_all("arrow/python/repaired_wheels/*.whl") %} - {{ macros.github_upload_releases(patterns) }} + {{ macros.travis_upload_releases(upload_extensions) }} diff --git a/dev/tasks/macros.jinja b/dev/tasks/macros.jinja index e0552b11bcf..db1b64cd649 100644 --- a/dev/tasks/macros.jinja +++ b/dev/tasks/macros.jinja @@ -91,11 +91,11 @@ on: {% endmacro %} {%- macro github_upload_gemfury(pattern) -%} - {% if arrow.branch == 'master' %} + {%- if arrow.branch == 'master' -%} - name: Upload package to Gemfury shell: bash run: | - path=$(ls {{ patter }}) + path=$(ls {{ pattern }}) curl -F "package=@${path}" https://${CROSSBOW_GEMFURY_TOKEN}@push.fury.io/${CROSSBOW_GEMFURY_ORG}/ env: CROSSBOW_GEMFURY_TOKEN: {{ '${{ secrets.CROSSBOW_GEMFURY_TOKEN }}' }} @@ -138,7 +138,7 @@ on: {% endmacro %} {%- macro azure_upload_anaconda(pattern) -%} - {% if arrow.branch == 'master' %} + {%- if arrow.branch == 'master' -%} - task: CondaEnvironment@1 inputs: packageSpecs: 'anaconda-client' @@ -188,7 +188,7 @@ on: {% endmacro %} {%- macro travis_upload_gemfury(pattern) -%} - {% if arrow.branch == 'master' %} + {%- if arrow.branch == 'master' -%} - | WHEEL_PATH=$(echo arrow/python/repaired_wheels/*.whl) curl \ diff --git a/dev/tasks/tasks.yml b/dev/tasks/tasks.yml index 5a04c98f640..dcf49414fcc 100644 --- a/dev/tasks/tasks.yml +++ b/dev/tasks/tasks.yml @@ -372,7 +372,7 @@ tasks: # enable S3 support from macOS 10.13 so we don't need to bundle curl, crypt and ssl {% for macos_version, macos_codename, arrow_s3 in [("10.9", "mavericks", "OFF"), ("10.13", "high-sierra", "ON")] %} - {% set platform_tag = "macosx_{}_{}".format(macos_version.replace('.', '_'), arch_alias) %} + {% set platform_tag = "macosx_{}_x86_64".format(macos_version.replace('.', '_')) %} wheel-osx-{{ macos_codename }}-{{ python_tag }}: ci: github From 62f8c20306f366a848f1392eba97665ef155a2b2 Mon Sep 17 00:00:00 2001 From: Sutou Kouhei Date: Tue, 13 Apr 2021 12:49:54 +0200 Subject: [PATCH 004/719] 
ARROW-12326: [C++] Avoid needless c-ares detection MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If we use system gRPC, we don't need to detect c-ares. This change also simplifies gRPC detection. System gRPC detection requires CMake config or pkg-config. System gRPC detection by gRPC_ROOT is removed because we can't maintain Abseil dependencies. Closes #9977 from kou/cpp-avoid-needless-c-ares-detection Authored-by: Sutou Kouhei Signed-off-by: Krisztián Szűcs --- cpp/cmake_modules/Find-c-aresAlt.cmake | 71 ++++++ cpp/cmake_modules/FindgRPCAlt.cmake | 247 ++++---------------- cpp/cmake_modules/ThirdpartyToolchain.cmake | 77 +++--- cpp/src/arrow/flight/CMakeLists.txt | 15 +- 4 files changed, 151 insertions(+), 259 deletions(-) create mode 100644 cpp/cmake_modules/Find-c-aresAlt.cmake diff --git a/cpp/cmake_modules/Find-c-aresAlt.cmake b/cpp/cmake_modules/Find-c-aresAlt.cmake new file mode 100644 index 00000000000..dd16393cad2 --- /dev/null +++ b/cpp/cmake_modules/Find-c-aresAlt.cmake @@ -0,0 +1,71 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +set(find_package_args) +if(c-aresAlt_FIND_VERSION) + list(APPEND find_package_args ${c-aresAlt_FIND_VERSION}) +endif() +if(c-aresAlt_FIND_QUIETLY) + list(APPEND find_package_args QUIET) +endif() +find_package(c-ares ${find_package_args}) +if(c-ares_FOUND) + set(c-aresAlt_FOUND TRUE) + return() +endif() + +find_package(PkgConfig QUIET) +pkg_check_modules(c-ares_PC libcares) +if(c-ares_PC_FOUND) + set(c-ares_INCLUDE_DIR "${c-ares_PC_INCLUDEDIR}") + + list(APPEND c-ares_PC_LIBRARY_DIRS "${c-ares_PC_LIBDIR}") + find_library(c-ares_LIB cares + PATHS ${c-ares_PC_LIBRARY_DIRS} + PATH_SUFFIXES ${ARROW_LIBRARY_PATH_SUFFIXES} + NO_DEFAULT_PATH) +elseif(c-ares_ROOT) + find_library(c-ares_LIB + NAMES cares + "${CMAKE_SHARED_LIBRARY_PREFIX}cares${CMAKE_SHARED_LIBRARY_SUFFIX}" + PATHS ${c-ares_ROOT} + PATH_SUFFIXES ${ARROW_LIBRARY_PATH_SUFFIXES} + NO_DEFAULT_PATH) + find_path(c-ares_INCLUDE_DIR + NAMES ares.h + PATHS ${c-ares_ROOT} + NO_DEFAULT_PATH + PATH_SUFFIXES ${ARROW_INCLUDE_PATH_SUFFIXES}) +else() + find_library(c-ares_LIB + NAMES cares + "${CMAKE_SHARED_LIBRARY_PREFIX}cares${CMAKE_SHARED_LIBRARY_SUFFIX}" + PATH_SUFFIXES ${ARROW_LIBRARY_PATH_SUFFIXES}) + find_path(c-ares_INCLUDE_DIR NAMES ares.h PATH_SUFFIXES ${ARROW_INCLUDE_PATH_SUFFIXES}) +endif() + +find_package_handle_standard_args(c-aresAlt REQUIRED_VARS c-ares_LIB c-ares_INCLUDE_DIR) + +if(c-aresAlt_FOUND) + if(NOT TARGET c-ares::cares) + add_library(c-ares::cares UNKNOWN IMPORTED) + set_target_properties( + c-ares::cares + PROPERTIES IMPORTED_LOCATION "${c-ares_LIB}" INTERFACE_INCLUDE_DIRECTORIES + "${c-ares_INCLUDE_DIR}") + endif() +endif() diff --git a/cpp/cmake_modules/FindgRPCAlt.cmake b/cpp/cmake_modules/FindgRPCAlt.cmake index 79fe01744d3..841b3b61b83 100644 --- a/cpp/cmake_modules/FindgRPCAlt.cmake +++ b/cpp/cmake_modules/FindgRPCAlt.cmake @@ -24,222 +24,57 @@ if(gRPC_FOUND) return() endif() -unset(GRPC_ALT_VERSION) - -if(ARROW_GRPC_USE_SHARED) - set(GRPC_GPR_LIB_NAMES) - set(GRPC_GRPC_LIB_NAMES) - 
set(GRPC_GRPCPP_LIB_NAMES) - set(GRPC_ADDRESS_SORTING_LIB_NAMES) - set(GRPC_UPB_LIB_NAMES) - if(CMAKE_IMPORT_LIBRARY_SUFFIX) - list(APPEND GRPC_GPR_LIB_NAMES - "${CMAKE_IMPORT_LIBRARY_PREFIX}gpr${CMAKE_IMPORT_LIBRARY_SUFFIX}") - list(APPEND GRPC_GRPC_LIB_NAMES - "${CMAKE_IMPORT_LIBRARY_PREFIX}grpc${CMAKE_IMPORT_LIBRARY_SUFFIX}") - list(APPEND GRPC_GRPCPP_LIB_NAMES - "${CMAKE_IMPORT_LIBRARY_PREFIX}grpc++${CMAKE_IMPORT_LIBRARY_SUFFIX}") - list( - APPEND GRPC_ADDRESS_SORTING_LIB_NAMES - "${CMAKE_IMPORT_LIBRARY_PREFIX}address_sorting${CMAKE_IMPORT_LIBRARY_SUFFIX}" - ) - list(APPEND GRPC_UPB_LIB_NAMES - "${CMAKE_IMPORT_LIBRARY_PREFIX}upb${CMAKE_IMPORT_LIBRARY_SUFFIX}") +find_package(PkgConfig QUIET) +pkg_check_modules(GRPCPP_PC grpc++) +if(GRPCPP_PC_FOUND) + set(gRPCAlt_VERSION "${GRPCPP_PC_VERSION}") + set(GRPCPP_INCLUDE_DIRECTORIES ${GRPCPP_PC_INCLUDEDIR}) + if(ARROW_GRPC_USE_SHARED) + set(GRPCPP_LINK_LIBRARIES ${GRPCPP_PC_LINK_LIBRARIES}) + set(GRPCPP_LINK_OPTIONS ${GRPCPP_PC_LDFLAGS_OTHER}) + set(GRPCPP_COMPILE_OPTIONS ${GRPCPP_PC_CFLAGS_OTHER}) + else() + set(GRPCPP_LINK_LIBRARIES) + foreach(GRPCPP_LIBRARY_NAME ${GRPCPP_PC_STATIC_LIBRARIES}) + find_library( + GRPCPP_LIBRARY_${GRPCPP_LIBRARY_NAME} + NAMES + "${CMAKE_STATIC_LIBRARY_PREFIX}${GRPCPP_LIBRARY_NAME}${CMAKE_STATIC_LIBRARY_SUFFIX}" + HINTS ${GRPCPP_PC_STATIC_LIBRARY_DIRS}) + list(APPEND GRPCPP_LINK_LIBRARIES "${GRPCPP_LIBRARY_${GRPCPP_LIBRARY_NAME}}") + endforeach() + set(GRPCPP_LINK_OPTIONS ${GRPCPP_PC_STATIC_LDFLAGS_OTHER}) + set(GRPCPP_COMPILE_OPTIONS ${GRPCPP_PC_STATIC_CFLAGS_OTHER}) endif() - list(APPEND GRPC_GPR_LIB_NAMES - "${CMAKE_SHARED_LIBRARY_PREFIX}gpr${CMAKE_SHARED_LIBRARY_SUFFIX}") - list(APPEND GRPC_GRPC_LIB_NAMES - "${CMAKE_SHARED_LIBRARY_PREFIX}grpc${CMAKE_SHARED_LIBRARY_SUFFIX}") - list(APPEND GRPC_GRPCPP_LIB_NAMES - "${CMAKE_SHARED_LIBRARY_PREFIX}grpc++${CMAKE_SHARED_LIBRARY_SUFFIX}") - list( - APPEND GRPC_ADDRESS_SORTING_LIB_NAMES - 
"${CMAKE_SHARED_LIBRARY_PREFIX}address_sorting${CMAKE_SHARED_LIBRARY_SUFFIX}") - list(APPEND GRPC_UPB_LIB_NAMES - "${CMAKE_SHARED_LIBRARY_PREFIX}upb${CMAKE_SHARED_LIBRARY_SUFFIX}") -else() - set(GRPC_GPR_LIB_NAMES - "${CMAKE_STATIC_LIBRARY_PREFIX}gpr${CMAKE_STATIC_LIBRARY_SUFFIX}") - set(GRPC_GRPC_LIB_NAMES - "${CMAKE_STATIC_LIBRARY_PREFIX}grpc${CMAKE_STATIC_LIBRARY_SUFFIX}") - set(GRPC_GRPCPP_LIB_NAMES - "${CMAKE_STATIC_LIBRARY_PREFIX}grpc++${CMAKE_STATIC_LIBRARY_SUFFIX}") - set(GRPC_ADDRESS_SORTING_LIB_NAMES - "${CMAKE_STATIC_LIBRARY_PREFIX}address_sorting${CMAKE_STATIC_LIBRARY_SUFFIX}") - set(GRPC_UPB_LIB_NAMES - "${CMAKE_STATIC_LIBRARY_PREFIX}upb${CMAKE_STATIC_LIBRARY_SUFFIX}") -endif() - -if(gRPC_ROOT) - find_library(GRPC_GPR_LIB - NAMES ${GRPC_GPR_LIB_NAMES} - PATHS ${gRPC_ROOT} - PATH_SUFFIXES ${ARROW_LIBRARY_PATH_SUFFIXES} - NO_DEFAULT_PATH) - find_library(GRPC_GRPC_LIB - NAMES ${GRPC_GRPC_LIB_NAMES} - PATHS ${gRPC_ROOT} - PATH_SUFFIXES ${ARROW_LIBRARY_PATH_SUFFIXES} - NO_DEFAULT_PATH) - find_library(GRPC_GRPCPP_LIB - NAMES ${GRPC_GRPCPP_LIB_NAMES} - PATHS ${gRPC_ROOT} - PATH_SUFFIXES ${ARROW_LIBRARY_PATH_SUFFIXES} - NO_DEFAULT_PATH) - find_library(GRPC_ADDRESS_SORTING_LIB - NAMES ${GRPC_ADDRESS_SORTING_LIB_NAMES} - PATHS ${gRPC_ROOT} - PATH_SUFFIXES ${ARROW_LIBRARY_PATH_SUFFIXES} - NO_DEFAULT_PATH) - find_library(GRPC_UPB_LIB - NAMES ${GRPC_UPB_LIB_NAMES} - PATHS ${gRPC_ROOT} - PATH_SUFFIXES ${ARROW_LIBRARY_PATH_SUFFIXES} - NO_DEFAULT_PATH) - find_program(GRPC_CPP_PLUGIN grpc_cpp_plugin NO_DEFAULT_PATH - PATHS ${gRPC_ROOT} + list(GET GRPCPP_LINK_LIBRARIES 0 GRPCPP_IMPORTED_LOCATION) + list(REMOVE_AT GRPCPP_LINK_LIBRARIES 0) + find_program(GRPC_CPP_PLUGIN grpc_cpp_plugin + HINTS ${GRPCPP_PC_PREFIX} + NO_DEFAULT_PATH PATH_SUFFIXES "bin") - find_path(GRPC_INCLUDE_DIR - NAMES grpc/grpc.h - PATHS ${gRPC_ROOT} - NO_DEFAULT_PATH - PATH_SUFFIXES ${ARROW_INCLUDE_PATH_SUFFIXES}) -else() - find_package(PkgConfig QUIET) - pkg_check_modules(GRPC_PC grpc++) - 
if(GRPC_PC_FOUND) - set(GRPC_ALT_VERSION "${GRPC_PC_VERSION}") - set(GRPC_INCLUDE_DIR "${GRPC_PC_INCLUDEDIR}") - list(APPEND GRPC_PC_LIBRARY_DIRS "${GRPC_PC_LIBDIR}") - message(STATUS "${GRPC_PC_LIBRARY_DIRS}") - - find_library(GRPC_GPR_LIB - NAMES ${GRPC_GPR_LIB_NAMES} - PATHS ${GRPC_PC_LIBRARY_DIRS} - PATH_SUFFIXES ${ARROW_LIBRARY_PATH_SUFFIXES} - NO_DEFAULT_PATH) - find_library(GRPC_GRPC_LIB - NAMES ${GRPC_GRPC_LIB_NAMES} - PATHS ${GRPC_PC_LIBRARY_DIRS} - PATH_SUFFIXES ${ARROW_LIBRARY_PATH_SUFFIXES} - NO_DEFAULT_PATH) - find_library(GRPC_GRPCPP_LIB - NAMES ${GRPC_GRPCPP_LIB_NAMES} - PATHS ${GRPC_PC_LIBRARY_DIRS} - PATH_SUFFIXES ${ARROW_LIBRARY_PATH_SUFFIXES} - NO_DEFAULT_PATH) - find_library(GRPC_ADDRESS_SORTING_LIB - NAMES ${GRPC_ADDRESS_SORTING_LIB_NAMES} - PATHS ${GRPC_PC_LIBRARY_DIRS} - PATH_SUFFIXES ${ARROW_LIBRARY_PATH_SUFFIXES} - NO_DEFAULT_PATH) - find_library(GRPC_UPB_LIB - NAMES ${GRPC_UPB_LIB_NAMES} - PATHS ${GRPC_PC_LIBRARY_DIRS} - PATH_SUFFIXES ${ARROW_LIBRARY_PATH_SUFFIXES} - NO_DEFAULT_PATH) - find_program(GRPC_CPP_PLUGIN grpc_cpp_plugin - HINTS ${GRPC_PC_PREFIX} - NO_DEFAULT_PATH - PATH_SUFFIXES "bin") - else() - find_library(GRPC_GPR_LIB - NAMES ${GRPC_GPR_LIB_NAMES} - PATH_SUFFIXES ${ARROW_LIBRARY_PATH_SUFFIXES}) - find_library(GRPC_GRPC_LIB - NAMES ${GRPC_GRPC_LIB_NAMES} - PATH_SUFFIXES ${ARROW_LIBRARY_PATH_SUFFIXES}) - find_library(GRPC_GRPCPP_LIB - NAMES ${GRPC_GRPCPP_LIB_NAMES} - PATH_SUFFIXES ${ARROW_LIBRARY_PATH_SUFFIXES}) - find_library(GRPC_ADDRESS_SORTING_LIB - NAMES ${GRPC_ADDRESS_SORTING_LIB_NAMES} - PATH_SUFFIXES ${ARROW_LIBRARY_PATH_SUFFIXES}) - find_library(GRPC_UPB_LIB - NAMES ${GRPC_UPB_LIB_NAMES} - PATH_SUFFIXES ${ARROW_LIBRARY_PATH_SUFFIXES}) - find_program(GRPC_CPP_PLUGIN grpc_cpp_plugin PATH_SUFFIXES "bin") - find_path(GRPC_INCLUDE_DIR - NAMES grpc/grpc.h - PATH_SUFFIXES ${ARROW_INCLUDE_PATH_SUFFIXES}) + set(gRPCAlt_FIND_PACKAGE_ARGS gRPCAlt REQUIRED_VARS GRPCPP_IMPORTED_LOCATION + GRPC_CPP_PLUGIN) + if(gRPCAlt_VERSION) + 
list(APPEND gRPCAlt_FIND_PACKAGE_ARGS VERSION_VAR gRPCAlt_VERSION) endif() + find_package_handle_standard_args(${gRPCAlt_FIND_PACKAGE_ARGS}) +else() + set(gRPCAlt_FOUND FALSE) endif() -set(GRPC_ALT_FIND_PACKAGE_ARGS - gRPCAlt - REQUIRED_VARS - GRPC_INCLUDE_DIR - GRPC_GPR_LIB - GRPC_GRPC_LIB - GRPC_GRPCPP_LIB - GRPC_CPP_PLUGIN) -if(GRPC_ALT_VERSION) - list(APPEND GRPC_ALT_FIND_PACKAGE_ARGS VERSION_VAR GRPC_ALT_VERSION) -endif() -find_package_handle_standard_args(${GRPC_ALT_FIND_PACKAGE_ARGS}) - if(gRPCAlt_FOUND) - add_library(gRPC::gpr UNKNOWN IMPORTED) - set_target_properties(gRPC::gpr - PROPERTIES IMPORTED_LOCATION "${GRPC_GPR_LIB}" - INTERFACE_INCLUDE_DIRECTORIES "${GRPC_INCLUDE_DIR}") - - add_library(gRPC::grpc UNKNOWN IMPORTED) - set_target_properties( - gRPC::grpc - PROPERTIES IMPORTED_LOCATION - "${GRPC_GRPC_LIB}" - INTERFACE_INCLUDE_DIRECTORIES - "${GRPC_INCLUDE_DIR}" - INTERFACE_LINK_LIBRARIES - "OpenSSL::SSL;OpenSSL::Crypto;ZLIB::ZLIB;c-ares::cares") - - set(_GRPCPP_LINK_LIBRARIES "gRPC::grpc;gRPC::gpr") - - if(GRPC_ADDRESS_SORTING_LIB) - # Address sorting is optional and not always required. 
- add_library(gRPC::address_sorting UNKNOWN IMPORTED) - set_target_properties(gRPC::address_sorting - PROPERTIES IMPORTED_LOCATION "${GRPC_ADDRESS_SORTING_LIB}" - INTERFACE_INCLUDE_DIRECTORIES "${GRPC_INCLUDE_DIR}") - set(_GRPCPP_LINK_LIBRARIES "${_GRPCPP_LINK_LIBRARIES};gRPC::address_sorting") - endif() - - if(GRPC_UPB_LIB) - # upb is used by recent gRPC versions - add_library(gRPC::upb UNKNOWN IMPORTED) - set_target_properties(gRPC::upb - PROPERTIES IMPORTED_LOCATION "${GRPC_UPB_LIB}" - INTERFACE_INCLUDE_DIRECTORIES "${GRPC_INCLUDE_DIR}") - set(_GRPCPP_LINK_LIBRARIES "${_GRPCPP_LINK_LIBRARIES};gRPC::upb") - endif() - - find_package(absl CONFIG) - if(absl_FOUND) - # Abseil libraries that recent gRPC versions depend on - set(_ABSL_LIBS - bad_optional_access - int128 - raw_logging_internal - str_format_internal - strings - throw_delegate - time - time_zone) - - foreach(_ABSL_LIB ${_ABSL_LIBS}) - set(_GRPCPP_LINK_LIBRARIES "${_GRPCPP_LINK_LIBRARIES};absl::${_ABSL_LIB}") - endforeach() - endif() - add_library(gRPC::grpc++ UNKNOWN IMPORTED) set_target_properties(gRPC::grpc++ PROPERTIES IMPORTED_LOCATION - "${GRPC_GRPCPP_LIB}" - INTERFACE_LINK_LIBRARIES - "${_GRPCPP_LINK_LIBRARIES}" + "${GRPCPP_IMPORTED_LOCATION}" + INTERFACE_COMPILE_OPTIONS + "${GRPCPP_COMPILE_OPTIONS}" INTERFACE_INCLUDE_DIRECTORIES - "${GRPC_INCLUDE_DIR}") + "${GRPCPP_INCLUDE_DIRECTORIES}" + INTERFACE_LINK_LIBRARIES + "${GRPCPP_LINK_LIBRARIES}" + INTERFACE_LINK_OPTIONS + "${GRPCPP_LINK_OPTIONS}") add_executable(gRPC::grpc_cpp_plugin IMPORTED) set_target_properties(gRPC::grpc_cpp_plugin diff --git a/cpp/cmake_modules/ThirdpartyToolchain.cmake b/cpp/cmake_modules/ThirdpartyToolchain.cmake index 9f240e448f6..3f686346bb1 100644 --- a/cpp/cmake_modules/ThirdpartyToolchain.cmake +++ b/cpp/cmake_modules/ThirdpartyToolchain.cmake @@ -2338,29 +2338,15 @@ macro(build_cares) list(APPEND ARROW_BUNDLED_STATIC_LIBS c-ares::cares) endmacro() -if(ARROW_WITH_GRPC) - if(c-ares_SOURCE STREQUAL "AUTO") - 
find_package(c-ares QUIET CONFIG) - if(c-ares_FOUND) - set(CARES_INCLUDE_DIR ${c-ares_INCLUDE_DIR}) - else() - build_cares() - endif() - elseif(c-ares_SOURCE STREQUAL "BUNDLED") - build_cares() - elseif(c-ares_SOURCE STREQUAL "SYSTEM") - find_package(c-ares REQUIRED CONFIG) - set(CARES_INCLUDE_DIR ${c-ares_INCLUDE_DIR}) - endif() - - # TODO: Don't use global includes but rather target_include_directories - include_directories(SYSTEM ${CARES_INCLUDE_DIR}) -endif() - # ---------------------------------------------------------------------- # Dependencies for Arrow Flight RPC macro(build_grpc) + resolve_dependency(c-ares HAVE_ALT TRUE) + # TODO: Don't use global includes but rather target_include_directories + get_target_property(c-ares_INCLUDE_DIR c-ares::cares INTERFACE_INCLUDE_DIRECTORIES) + include_directories(SYSTEM ${c-ares_INCLUDE_DIR}) + message(STATUS "Building gRPC from source") # First need to build Abseil @@ -2548,27 +2534,38 @@ macro(build_grpc) PROPERTIES IMPORTED_LOCATION "${GRPC_STATIC_LIBRARY_GPR}" INTERFACE_INCLUDE_DIRECTORIES "${GRPC_INCLUDE_DIR}") - add_library(gRPC::grpc STATIC IMPORTED) - set_target_properties(gRPC::grpc - PROPERTIES IMPORTED_LOCATION "${GRPC_STATIC_LIBRARY_GRPC}" - INTERFACE_INCLUDE_DIRECTORIES "${GRPC_INCLUDE_DIR}") - add_library(gRPC::address_sorting STATIC IMPORTED) set_target_properties(gRPC::address_sorting PROPERTIES IMPORTED_LOCATION "${GRPC_STATIC_LIBRARY_ADDRESS_SORTING}" INTERFACE_INCLUDE_DIRECTORIES "${GRPC_INCLUDE_DIR}") + add_library(gRPC::grpc STATIC IMPORTED) + set(GRPC_LINK_LIBRARIES + gRPC::gpr + gRPC::upb + gRPC::address_sorting + ${ABSL_LIBRARIES} + c-ares::cares + ZLIB::ZLIB + Threads::Threads) + set_target_properties(gRPC::grpc + PROPERTIES IMPORTED_LOCATION + "${GRPC_STATIC_LIBRARY_GRPC}" + INTERFACE_INCLUDE_DIRECTORIES + "${GRPC_INCLUDE_DIR}" + INTERFACE_LINK_LIBRARIES + "${GRPC_LINK_LIBRARIES}") + add_library(gRPC::grpc++ STATIC IMPORTED) - set_target_properties( - gRPC::grpc++ - PROPERTIES - 
IMPORTED_LOCATION - "${GRPC_STATIC_LIBRARY_GRPCPP}" - INTERFACE_LINK_LIBRARIES - "gRPC::grpc;gRPC::gpr;gRPC::upb;gRPC::address_sorting;${ABSL_LIBRARIES};Threads::Threads" - INTERFACE_INCLUDE_DIRECTORIES - "${GRPC_INCLUDE_DIR}") + set(GRPCPP_LINK_LIBRARIES gRPC::grpc ${ARROW_PROTOBUF_LIBPROTOBUF}) + set_target_properties(gRPC::grpc++ + PROPERTIES IMPORTED_LOCATION + "${GRPC_STATIC_LIBRARY_GRPCPP}" + INTERFACE_INCLUDE_DIRECTORIES + "${GRPC_INCLUDE_DIR}" + INTERFACE_LINK_LIBRARIES + "${GRPCPP_LINK_LIBRARIES}") add_executable(gRPC::grpc_cpp_plugin IMPORTED) set_target_properties(gRPC::grpc_cpp_plugin @@ -2607,11 +2604,11 @@ macro(build_grpc) list(APPEND ARROW_BUNDLED_STATIC_LIBS ${ABSL_LIBRARIES} - gRPC::upb + gRPC::address_sorting gRPC::gpr gRPC::grpc - gRPC::address_sorting - gRPC::grpcpp_for_bundling) + gRPC::grpcpp_for_bundling + gRPC::upb) endmacro() if(ARROW_WITH_GRPC) @@ -2622,14 +2619,8 @@ if(ARROW_WITH_GRPC) REQUIRED_VERSION ${ARROW_GRPC_REQUIRED_VERSION}) - if(TARGET gRPC::address_sorting) - set(GRPC_HAS_ADDRESS_SORTING TRUE) - else() - set(GRPC_HAS_ADDRESS_SORTING FALSE) - endif() - # TODO: Don't use global includes but rather target_include_directories - get_target_property(GRPC_INCLUDE_DIR gRPC::grpc INTERFACE_INCLUDE_DIRECTORIES) + get_target_property(GRPC_INCLUDE_DIR gRPC::grpc++ INTERFACE_INCLUDE_DIRECTORIES) include_directories(SYSTEM ${GRPC_INCLUDE_DIR}) if(GRPC_VENDORED) diff --git a/cpp/src/arrow/flight/CMakeLists.txt b/cpp/src/arrow/flight/CMakeLists.txt index bc91d7e8c22..e1176ff0ac0 100644 --- a/cpp/src/arrow/flight/CMakeLists.txt +++ b/cpp/src/arrow/flight/CMakeLists.txt @@ -19,15 +19,10 @@ add_custom_target(arrow_flight) arrow_install_all_headers("arrow/flight") -set(ARROW_FLIGHT_STATIC_LINK_LIBS - gRPC::grpc++ - ${ABSL_LIBRARIES} - ${ARROW_PROTOBUF_LIBPROTOBUF} - c-ares::cares - ZLIB::ZLIB) +set(ARROW_FLIGHT_LINK_LIBS gRPC::grpc++) if(WIN32) - list(APPEND ARROW_FLIGHT_STATIC_LINK_LIBS ws2_32.lib) + list(APPEND ARROW_FLIGHT_LINK_LIBS 
ws2_32.lib) endif() if(ARROW_TEST_LINKAGE STREQUAL "static") @@ -83,7 +78,7 @@ function(test_grpc_version DST_VAR DETECT_VERSION TEST_FILE) try_compile(HAS_GRPC_VERSION ${CMAKE_CURRENT_BINARY_DIR}/try_compile SOURCES "${CMAKE_CURRENT_SOURCE_DIR}/try_compile/${TEST_FILE}" CMAKE_FLAGS "-DINCLUDE_DIRECTORIES=${CURRENT_INCLUDE_DIRECTORIES}" - LINK_LIBRARIES gRPC::grpc gRPC::grpc++ + LINK_LIBRARIES gRPC::grpc++ OUTPUT_VARIABLE TLS_CREDENTIALS_OPTIONS_CHECK_OUTPUT CXX_STANDARD 11) if(HAS_GRPC_VERSION) set(${DST_VAR} @@ -177,10 +172,10 @@ add_arrow_lib(arrow_flight ${ARROW_VERSION_SCRIPT_FLAGS} # Defined in cpp/arrow/CMakeLists.txt SHARED_LINK_LIBS arrow_shared - ${ARROW_FLIGHT_STATIC_LINK_LIBS} + ${ARROW_FLIGHT_LINK_LIBS} STATIC_LINK_LIBS arrow_static - ${ARROW_FLIGHT_STATIC_LINK_LIBS}) + ${ARROW_FLIGHT_LINK_LIBS}) foreach(LIB_TARGET ${ARROW_FLIGHT_LIBRARIES}) target_compile_definitions(${LIB_TARGET} PRIVATE ARROW_FLIGHT_EXPORTING) From 2c77f3b00a9c8d4fa089a33174a55bfa88b25e69 Mon Sep 17 00:00:00 2001 From: Sutou Kouhei Date: Tue, 13 Apr 2021 12:57:25 +0200 Subject: [PATCH 005/719] ARROW-12352: [CI][R][Windows] Remove needless workaround for MSYS2 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit repo.msys2.org is alive. sf.net may be fragile than repo.msys2.org. 
See also ARROW-10202: https://issues.apache.org/jira/browse/ARROW-10202 Closes #10004 from kou/ci-r-remove-needless-msys2-workaround Authored-by: Sutou Kouhei Signed-off-by: Krisztián Szűcs --- ci/scripts/r_windows_build.sh | 5 ----- 1 file changed, 5 deletions(-) diff --git a/ci/scripts/r_windows_build.sh b/ci/scripts/r_windows_build.sh index be03b75f5ad..9988dfb6494 100755 --- a/ci/scripts/r_windows_build.sh +++ b/ci/scripts/r_windows_build.sh @@ -28,13 +28,8 @@ if [ "$RTOOLS_VERSION" = "35" ]; then curl https://raw.githubusercontent.com/r-windows/rtools-backports/master/pacman.conf > /etc/pacman.conf # Update keys: https://www.msys2.org/news/#2020-06-29-new-packagers msys2_repo_base_url=https://repo.msys2.org/msys - # Mirror - msys2_repo_base_url=https://sourceforge.net/projects/msys2/files/REPOS/MSYS2 curl -OSsL "${msys2_repo_base_url}/x86_64/msys2-keyring-r21.b39fb11-1-any.pkg.tar.xz" pacman -U --noconfirm msys2-keyring-r21.b39fb11-1-any.pkg.tar.xz && rm msys2-keyring-r21.b39fb11-1-any.pkg.tar.xz - # Use sf.net instead of http://repo.msys2.org/ temporary. - sed -i -e "s,^Server = http://repo\.msys2\.org/msys,Server = ${msys2_repo_base_url},g" \ - /etc/pacman.conf pacman --noconfirm -Scc pacman --noconfirm -Syy # lib-4.9.3 is for libraries compiled with gcc 4.9 (Rtools 3.5) From 72249203be90b45a315cf8028536fd72a7f9427b Mon Sep 17 00:00:00 2001 From: Nic Crane Date: Tue, 13 Apr 2021 13:02:11 +0200 Subject: [PATCH 006/719] ARROW-11752: [R] Replace usage of testthat::expect_is() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit `testthat::expect_is` is now deprecated - this PR replaces uses of it with alternative functions. When updating `expect_dplyr_error`, I fixed an issue with its usage which led to one of the tests failing as it no longer gives the expected error, so have set this test to skip. 
Closes #9909 from thisisnic/arrow-11752 Lead-authored-by: Nic Crane Co-authored-by: Nic Signed-off-by: Krisztián Szűcs --- r/tests/testthat/helper-expectation.R | 32 +++++++++- r/tests/testthat/test-Array.R | 6 +- r/tests/testthat/test-RecordBatch.R | 4 +- r/tests/testthat/test-Table.R | 6 +- r/tests/testthat/test-arrow-info.R | 2 +- r/tests/testthat/test-buffer-reader.R | 8 +-- r/tests/testthat/test-buffer.R | 10 ++-- r/tests/testthat/test-chunked-array.R | 6 +- r/tests/testthat/test-compute-aggregate.R | 28 ++++----- r/tests/testthat/test-compute-vector.R | 2 +- r/tests/testthat/test-data-type.R | 6 +- r/tests/testthat/test-dataset.R | 66 ++++++++++----------- r/tests/testthat/test-dplyr-filter.R | 12 ++-- r/tests/testthat/test-dplyr-mutate.R | 2 +- r/tests/testthat/test-dplyr.R | 20 +++---- r/tests/testthat/test-expression.R | 44 +++++++------- r/tests/testthat/test-feather.R | 16 ++--- r/tests/testthat/test-filesystem.R | 16 ++--- r/tests/testthat/test-json.R | 6 +- r/tests/testthat/test-memory-pool.R | 4 +- r/tests/testthat/test-message-reader.R | 32 +++++----- r/tests/testthat/test-message.R | 12 ++-- r/tests/testthat/test-metadata.R | 2 +- r/tests/testthat/test-python.R | 16 ++--- r/tests/testthat/test-read-record-batch.R | 2 +- r/tests/testthat/test-read-write.R | 2 +- r/tests/testthat/test-record-batch-reader.R | 26 ++++---- r/tests/testthat/test-s3-minio.R | 2 +- r/tests/testthat/test-scalar.R | 2 +- r/tests/testthat/test-schema.R | 14 ++--- 30 files changed, 219 insertions(+), 187 deletions(-) diff --git a/r/tests/testthat/helper-expectation.R b/r/tests/testthat/helper-expectation.R index 39cc9e0597a..2ebd44f7bba 100644 --- a/r/tests/testthat/helper-expectation.R +++ b/r/tests/testthat/helper-expectation.R @@ -23,6 +23,11 @@ expect_data_frame <- function(x, y, ...) { expect_equal(as.data.frame(x), y, ...) 
} +expect_r6_class <- function(object, class){ + expect_s3_class(object, class) + expect_s3_class(object, "R6") +} + expect_equivalent <- function(object, expected, ...) { # HACK: dplyr includes an all.equal.tbl_df method that is causing failures. # They look spurious, like: @@ -98,12 +103,35 @@ expect_dplyr_equal <- function(expr, # A dplyr pipeline with `input` as its star expect_dplyr_error <- function(expr, # A dplyr pipeline with `input` as its start tbl, # A tbl/df as reference, will make RB/Table with ...) { + # ensure we have supplied tbl + force(tbl) + expr <- rlang::enquo(expr) msg <- tryCatch( rlang::eval_tidy(expr, rlang::new_data_mask(rlang::env(input = tbl))), - error = function (e) conditionMessage(e) + error = function (e) { + msg <- conditionMessage(e) + + # The error here is of the form: + # + # Problem with `filter()` input `..1`. + # x object 'b_var' not found + # ℹ Input `..1` is `chr == b_var`. + # + # but what we really care about is the `x` block + # so (temporarily) let's pull those blocks out when we find them + pattern <- i18ize_error_messages() + + if (grepl(pattern, msg)) { + msg <- sub(paste0("^.*(", pattern, ").*$"), "\\1", msg) + } + msg + } ) - expect_is(msg, "character", label = "dplyr on data.frame did not error") + # make sure msg is a character object (i.e. 
there has been an error) + # If it did not error, we would get a data.frame or whatever + # This expectation will tell us "dplyr on data.frame errored is not TRUE" + expect_true(identical(typeof(msg), "character"), label = "dplyr on data.frame errored") expect_error( rlang::eval_tidy( diff --git a/r/tests/testthat/test-Array.R b/r/tests/testthat/test-Array.R index 35ae357f703..b4fa8296d3a 100644 --- a/r/tests/testthat/test-Array.R +++ b/r/tests/testthat/test-Array.R @@ -496,7 +496,7 @@ test_that("Array$create() supports tibble with no columns (ARROW-8354)", { test_that("Array$create() handles vector -> list arrays (ARROW-7662)", { # Should be able to create an empty list with a type hint. - expect_is(Array$create(list(), list_of(bool())), "ListArray") + expect_r6_class(Array$create(list(), list_of(bool())), "ListArray") # logical expect_array_roundtrip(list(NA), list_of(bool())) @@ -542,7 +542,7 @@ test_that("Array$create() handles vector -> list arrays (ARROW-7662)", { test_that("Array$create() handles vector -> large list arrays", { # Should be able to create an empty list with a type hint. - expect_is(Array$create(list(), type = large_list_of(bool())), "LargeListArray") + expect_r6_class(Array$create(list(), type = large_list_of(bool())), "LargeListArray") # logical expect_array_roundtrip(list(NA), large_list_of(bool()), as = large_list_of(bool())) @@ -587,7 +587,7 @@ test_that("Array$create() handles vector -> large list arrays", { test_that("Array$create() handles vector -> fixed size list arrays", { # Should be able to create an empty list with a type hint. 
- expect_is(Array$create(list(), type = fixed_size_list_of(bool(), 20)), "FixedSizeListArray") + expect_r6_class(Array$create(list(), type = fixed_size_list_of(bool(), 20)), "FixedSizeListArray") # logical expect_array_roundtrip(list(NA), fixed_size_list_of(bool(), 1L), as = fixed_size_list_of(bool(), 1L)) diff --git a/r/tests/testthat/test-RecordBatch.R b/r/tests/testthat/test-RecordBatch.R index b71c07b78c2..ff7f17eca6e 100644 --- a/r/tests/testthat/test-RecordBatch.R +++ b/r/tests/testthat/test-RecordBatch.R @@ -438,8 +438,8 @@ test_that("RecordBatch$Equals(check_metadata)", { rb1 <- record_batch(df) rb2 <- record_batch(df, schema = rb1$schema$WithMetadata(list(some="metadata"))) - expect_is(rb1, "RecordBatch") - expect_is(rb2, "RecordBatch") + expect_r6_class(rb1, "RecordBatch") + expect_r6_class(rb2, "RecordBatch") expect_false(rb1$schema$HasMetadata) expect_true(rb2$schema$HasMetadata) expect_identical(rb2$schema$metadata, list(some = "metadata")) diff --git a/r/tests/testthat/test-Table.R b/r/tests/testthat/test-Table.R index 7a0b8bd6c02..86bda393e2d 100644 --- a/r/tests/testthat/test-Table.R +++ b/r/tests/testthat/test-Table.R @@ -299,7 +299,7 @@ test_that("table active bindings", { tab <- Table$create(tbl) expect_identical(dim(tbl), dim(tab)) - expect_is(tab$columns, "list") + expect_type(tab$columns, "list") expect_equal(tab$columns[[1]], tab[[1]]) }) @@ -400,8 +400,8 @@ test_that("Table$Equals(check_metadata)", { tab2 <- Table$create(x = 1:2, y = c("a", "b"), schema = tab1$schema$WithMetadata(list(some="metadata"))) - expect_is(tab1, "Table") - expect_is(tab2, "Table") + expect_r6_class(tab1, "Table") + expect_r6_class(tab2, "Table") expect_false(tab1$schema$HasMetadata) expect_true(tab2$schema$HasMetadata) expect_identical(tab2$schema$metadata, list(some = "metadata")) diff --git a/r/tests/testthat/test-arrow-info.R b/r/tests/testthat/test-arrow-info.R index 2a7af3aac67..3fac3f422e8 100644 --- a/r/tests/testthat/test-arrow-info.R +++ 
b/r/tests/testthat/test-arrow-info.R @@ -16,7 +16,7 @@ # under the License. test_that("arrow_info()", { - expect_is(arrow_info(), "arrow_info") + expect_s3_class(arrow_info(), "arrow_info") expect_output(print(arrow_info()), "Arrow package version") options(arrow.foo=FALSE) expect_output(print(arrow_info()), "arrow.foo") diff --git a/r/tests/testthat/test-buffer-reader.R b/r/tests/testthat/test-buffer-reader.R index 94be16ad569..3236a3a477d 100644 --- a/r/tests/testthat/test-buffer-reader.R +++ b/r/tests/testthat/test-buffer-reader.R @@ -22,9 +22,9 @@ test_that("BufferReader can be created from R objects", { int <- BufferReader$create(integer(13)) raw <- BufferReader$create(raw(16)) - expect_is(num, "BufferReader") - expect_is(int, "BufferReader") - expect_is(raw, "BufferReader") + expect_r6_class(num, "BufferReader") + expect_r6_class(int, "BufferReader") + expect_r6_class(raw, "BufferReader") expect_equal(num$GetSize(), 13*8) expect_equal(int$GetSize(), 13*4) @@ -35,6 +35,6 @@ test_that("BufferReader can be created from Buffer", { buf <- buffer(raw(76)) reader <- BufferReader$create(buf) - expect_is(reader, "BufferReader") + expect_r6_class(reader, "BufferReader") expect_equal(reader$GetSize(), 76) }) diff --git a/r/tests/testthat/test-buffer.R b/r/tests/testthat/test-buffer.R index c19f61196ee..1b3ea09cb92 100644 --- a/r/tests/testthat/test-buffer.R +++ b/r/tests/testthat/test-buffer.R @@ -20,33 +20,33 @@ context("Buffer") test_that("Buffer can be created from raw vector", { vec <- raw(123) buf <- buffer(vec) - expect_is(buf, "Buffer") + expect_r6_class(buf, "Buffer") expect_equal(buf$size, 123) }) test_that("Buffer can be created from integer vector", { vec <- integer(17) buf <- buffer(vec) - expect_is(buf, "Buffer") + expect_r6_class(buf, "Buffer") expect_equal(buf$size, 17 * 4) }) test_that("Buffer can be created from numeric vector", { vec <- numeric(17) buf <- buffer(vec) - expect_is(buf, "Buffer") + expect_r6_class(buf, "Buffer") expect_equal(buf$size, 17 
* 8) }) test_that("Buffer can be created from complex vector", { vec <- complex(3) buf <- buffer(vec) - expect_is(buf, "Buffer") + expect_r6_class(buf, "Buffer") expect_equal(buf$size, 3 * 16) }) test_that("buffer buffer buffers buffers", { - expect_is(buffer(buffer(42)), "Buffer") + expect_r6_class(buffer(buffer(42)), "Buffer") }) test_that("Other types can't be converted to Buffers", { diff --git a/r/tests/testthat/test-chunked-array.R b/r/tests/testthat/test-chunked-array.R index 17a82de810f..e72067a6d5f 100644 --- a/r/tests/testthat/test-chunked-array.R +++ b/r/tests/testthat/test-chunked-array.R @@ -177,7 +177,7 @@ test_that("ChunkedArray supports integer64 (ARROW-3716)", { expect_type_equal(zero, int64()) ca <- ChunkedArray$create(zero, x) expect_type_equal(ca, int64()) - expect_is(as.vector(ca), "integer64") + expect_s3_class(as.vector(ca), "integer64") expect_identical(as.vector(ca), c(bit64::as.integer64(0L), x)) }) @@ -199,12 +199,12 @@ test_that("integer types casts for ChunkedArray (ARROW-3741)", { a <- chunked_array(1:10, 1:10) for (type in c(int_types, uint_types)) { casted <- a$cast(type) - expect_is(casted, "ChunkedArray") + expect_r6_class(casted, "ChunkedArray") expect_type_equal(casted$type, type) } # Also test casting to double(), not actually a type, a base R function but should be alias for float64 dbl <- a$cast(double()) - expect_is(dbl, "ChunkedArray") + expect_r6_class(dbl, "ChunkedArray") expect_type_equal(dbl$type, float64()) }) diff --git a/r/tests/testthat/test-compute-aggregate.R b/r/tests/testthat/test-compute-aggregate.R index 2208f581de9..77010579d78 100644 --- a/r/tests/testthat/test-compute-aggregate.R +++ b/r/tests/testthat/test-compute-aggregate.R @@ -28,7 +28,7 @@ test_that("list_compute_functions", { test_that("sum.Array", { ints <- 1:5 a <- Array$create(ints) - expect_is(sum(a), "Scalar") + expect_r6_class(sum(a), "Scalar") expect_identical(as.integer(sum(a)), sum(ints)) floats <- c(1.3, 2.4, 3) @@ -38,7 +38,7 @@ 
test_that("sum.Array", { floats <- c(floats, NA) na <- Array$create(floats) expect_identical(as.numeric(sum(na)), sum(floats)) - expect_is(sum(na, na.rm = TRUE), "Scalar") + expect_r6_class(sum(na, na.rm = TRUE), "Scalar") expect_identical(as.numeric(sum(na, na.rm = TRUE)), sum(floats, na.rm = TRUE)) bools <- c(TRUE, NA, TRUE, FALSE) @@ -49,7 +49,7 @@ test_that("sum.Array", { test_that("sum.ChunkedArray", { a <- ChunkedArray$create(1:4, c(1:4, NA), 1:5) - expect_is(sum(a), "Scalar") + expect_r6_class(sum(a), "Scalar") expect_true(is.na(as.vector(sum(a)))) expect_identical(as.numeric(sum(a, na.rm = TRUE)), 35) }) @@ -69,7 +69,7 @@ test_that("sum.Scalar", { test_that("mean.Array", { ints <- 1:4 a <- Array$create(ints) - expect_is(mean(a), "Scalar") + expect_r6_class(mean(a), "Scalar") expect_identical(as.vector(mean(a)), mean(ints)) floats <- c(1.3, 2.4, 3) @@ -79,7 +79,7 @@ test_that("mean.Array", { floats <- c(floats, NA) na <- Array$create(floats) expect_identical(as.vector(mean(na)), mean(floats)) - expect_is(mean(na, na.rm = TRUE), "Scalar") + expect_r6_class(mean(na, na.rm = TRUE), "Scalar") expect_identical(as.vector(mean(na, na.rm = TRUE)), mean(floats, na.rm = TRUE)) bools <- c(TRUE, NA, TRUE, FALSE) @@ -90,7 +90,7 @@ test_that("mean.Array", { test_that("mean.ChunkedArray", { a <- ChunkedArray$create(1:4, c(1:4, NA), 1:5) - expect_is(mean(a), "Scalar") + expect_r6_class(mean(a), "Scalar") expect_true(is.na(as.vector(mean(a)))) expect_identical(as.vector(mean(a, na.rm = TRUE)), 35/13) }) @@ -111,7 +111,7 @@ test_that("Bad input handling of call_function", { test_that("min.Array", { ints <- 1:4 a <- Array$create(ints) - expect_is(min(a), "Scalar") + expect_r6_class(min(a), "Scalar") expect_identical(as.vector(min(a)), min(ints)) floats <- c(1.3, 3, 2.4) @@ -121,7 +121,7 @@ test_that("min.Array", { floats <- c(floats, NA) na <- Array$create(floats) expect_identical(as.vector(min(na)), min(floats)) - expect_is(min(na, na.rm = TRUE), "Scalar") + 
expect_r6_class(min(na, na.rm = TRUE), "Scalar") expect_identical(as.vector(min(na, na.rm = TRUE)), min(floats, na.rm = TRUE)) bools <- c(TRUE, TRUE, FALSE) @@ -133,7 +133,7 @@ test_that("min.Array", { test_that("max.Array", { ints <- 1:4 a <- Array$create(ints) - expect_is(max(a), "Scalar") + expect_r6_class(max(a), "Scalar") expect_identical(as.vector(max(a)), max(ints)) floats <- c(1.3, 3, 2.4) @@ -143,7 +143,7 @@ test_that("max.Array", { floats <- c(floats, NA) na <- Array$create(floats) expect_identical(as.vector(max(na)), max(floats)) - expect_is(max(na, na.rm = TRUE), "Scalar") + expect_r6_class(max(na, na.rm = TRUE), "Scalar") expect_identical(as.vector(max(na, na.rm = TRUE)), max(floats, na.rm = TRUE)) bools <- c(TRUE, TRUE, FALSE) @@ -155,7 +155,7 @@ test_that("max.Array", { test_that("min.ChunkedArray", { ints <- 1:4 a <- ChunkedArray$create(ints) - expect_is(min(a), "Scalar") + expect_r6_class(min(a), "Scalar") expect_identical(as.vector(min(a)), min(ints)) floats <- c(1.3, 3, 2.4) @@ -165,7 +165,7 @@ test_that("min.ChunkedArray", { floats <- c(floats, NA) na <- ChunkedArray$create(floats) expect_identical(as.vector(min(na)), min(floats)) - expect_is(min(na, na.rm = TRUE), "Scalar") + expect_r6_class(min(na, na.rm = TRUE), "Scalar") expect_identical(as.vector(min(na, na.rm = TRUE)), min(floats, na.rm = TRUE)) bools <- c(TRUE, TRUE, FALSE) @@ -177,7 +177,7 @@ test_that("min.ChunkedArray", { test_that("max.ChunkedArray", { ints <- 1:4 a <- ChunkedArray$create(ints) - expect_is(max(a), "Scalar") + expect_r6_class(max(a), "Scalar") expect_identical(as.vector(max(a)), max(ints)) floats <- c(1.3, 3, 2.4) @@ -187,7 +187,7 @@ test_that("max.ChunkedArray", { floats <- c(floats, NA) na <- ChunkedArray$create(floats) expect_identical(as.vector(max(na)), max(floats)) - expect_is(max(na, na.rm = TRUE), "Scalar") + expect_r6_class(max(na, na.rm = TRUE), "Scalar") expect_identical(as.vector(max(na, na.rm = TRUE)), max(floats, na.rm = TRUE)) bools <- c(TRUE, TRUE, 
FALSE) diff --git a/r/tests/testthat/test-compute-vector.R b/r/tests/testthat/test-compute-vector.R index 0b184889bee..95e93634934 100644 --- a/r/tests/testthat/test-compute-vector.R +++ b/r/tests/testthat/test-compute-vector.R @@ -18,7 +18,7 @@ expect_bool_function_equal <- function(array_exp, r_exp) { # Assert that the Array operation returns a boolean array # and that its contents are equal to expected - expect_is(array_exp, "ArrowDatum") + expect_r6_class(array_exp, "ArrowDatum") expect_type_equal(array_exp, bool()) expect_identical(as.vector(array_exp), r_exp) } diff --git a/r/tests/testthat/test-data-type.R b/r/tests/testthat/test-data-type.R index a5ecb41de64..5c0a31191a1 100644 --- a/r/tests/testthat/test-data-type.R +++ b/r/tests/testthat/test-data-type.R @@ -392,17 +392,17 @@ test_that("decimal type and validation", { expect_error(decimal(100, 2), "Invalid: Decimal precision out of range: 100") expect_error(decimal(4, NA), '"scale" must be an integer') - expect_is(decimal(4, 2), "Decimal128Type") + expect_r6_class(decimal(4, 2), "Decimal128Type") }) test_that("Binary", { - expect_is(binary(), "Binary") + expect_r6_class(binary(), "Binary") expect_equal(binary()$ToString(), "binary") }) test_that("FixedSizeBinary", { - expect_is(fixed_size_binary(4), "FixedSizeBinary") + expect_r6_class(fixed_size_binary(4), "FixedSizeBinary") expect_equal(fixed_size_binary(4)$ToString(), "fixed_size_binary[4]") # input validation diff --git a/r/tests/testthat/test-dataset.R b/r/tests/testthat/test-dataset.R index 932c568cd38..192b4b4220d 100644 --- a/r/tests/testthat/test-dataset.R +++ b/r/tests/testthat/test-dataset.R @@ -100,9 +100,9 @@ if(arrow_with_parquet()) { test_that("Simple interface for datasets", { skip_if_not_available("parquet") ds <- open_dataset(dataset_dir, partitioning = schema(part = uint8())) - expect_is(ds$format, "ParquetFileFormat") - expect_is(ds$filesystem, "LocalFileSystem") - expect_is(ds, "Dataset") + expect_r6_class(ds$format, 
"ParquetFileFormat") + expect_r6_class(ds$filesystem, "LocalFileSystem") + expect_r6_class(ds, "Dataset") expect_equivalent( ds %>% select(chr, dbl) %>% @@ -208,7 +208,7 @@ test_that("dataset from directory URI", { skip_if_not_available("parquet") uri <- paste0("file://", dataset_dir) ds <- open_dataset(uri, partitioning = schema(part = uint8())) - expect_is(ds, "Dataset") + expect_r6_class(ds, "Dataset") expect_equivalent( ds %>% select(chr, dbl) %>% @@ -276,7 +276,7 @@ test_that("Simple interface for datasets (custom ParquetFileFormat)", { test_that("Hive partitioning", { skip_if_not_available("parquet") ds <- open_dataset(hive_dir, partitioning = hive_partition(other = utf8(), group = uint8())) - expect_is(ds, "Dataset") + expect_r6_class(ds, "Dataset") expect_equivalent( ds %>% filter(group == 2) %>% @@ -327,8 +327,8 @@ test_that("Partitioning inference", { test_that("IPC/Feather format data", { ds <- open_dataset(ipc_dir, partitioning = "part", format = "feather") - expect_is(ds$format, "IpcFileFormat") - expect_is(ds$filesystem, "LocalFileSystem") + expect_r6_class(ds$format, "IpcFileFormat") + expect_r6_class(ds$filesystem, "LocalFileSystem") expect_identical(names(ds), c(names(df1), "part")) expect_warning( expect_identical(dim(ds), c(NA, 7L)) @@ -356,8 +356,8 @@ test_that("IPC/Feather format data", { test_that("CSV dataset", { skip_on_os("windows") # https://issues.apache.org/jira/browse/ARROW-12181 ds <- open_dataset(csv_dir, partitioning = "part", format = "csv") - expect_is(ds$format, "CsvFileFormat") - expect_is(ds$filesystem, "LocalFileSystem") + expect_r6_class(ds$format, "CsvFileFormat") + expect_r6_class(ds$filesystem, "LocalFileSystem") expect_identical(names(ds), c(names(df1), "part")) expect_warning( expect_identical(dim(ds), c(NA, 7L)) @@ -426,8 +426,8 @@ test_that("compressed CSV dataset", { write.csv(df1, gzfile(dst_file), row.names = FALSE, quote = FALSE) format <- FileFormat$create("csv") ds <- open_dataset(dst_dir, format = format) - 
expect_is(ds$format, "CsvFileFormat") - expect_is(ds$filesystem, "LocalFileSystem") + expect_r6_class(ds$format, "CsvFileFormat") + expect_r6_class(ds$filesystem, "LocalFileSystem") expect_equivalent( ds %>% @@ -590,7 +590,7 @@ test_that("Creating UnionDataset", { ds1 <- open_dataset(file.path(dataset_dir, 1)) ds2 <- open_dataset(file.path(dataset_dir, 2)) union1 <- open_dataset(list(ds1, ds2)) - expect_is(union1, "UnionDataset") + expect_r6_class(union1, "UnionDataset") expect_equivalent( union1 %>% select(chr, dbl) %>% @@ -605,7 +605,7 @@ test_that("Creating UnionDataset", { # Now with the c() method union2 <- c(ds1, ds2) - expect_is(union2, "UnionDataset") + expect_r6_class(union2, "UnionDataset") expect_equivalent( union2 %>% select(chr, dbl) %>% @@ -624,7 +624,7 @@ test_that("Creating UnionDataset", { test_that("InMemoryDataset", { ds <- InMemoryDataset$create(rbind(df1, df2)) - expect_is(ds, "InMemoryDataset") + expect_r6_class(ds, "InMemoryDataset") expect_equivalent( ds %>% select(chr, dbl) %>% @@ -861,9 +861,9 @@ test_that("filter() on date32 columns", { test_that("filter() with expressions", { skip_if_not_available("parquet") ds <- open_dataset(dataset_dir, partitioning = schema(part = uint8())) - expect_is(ds$format, "ParquetFileFormat") - expect_is(ds$filesystem, "LocalFileSystem") - expect_is(ds, "Dataset") + expect_r6_class(ds$format, "ParquetFileFormat") + expect_r6_class(ds$filesystem, "LocalFileSystem") + expect_r6_class(ds, "Dataset") expect_equivalent( ds %>% select(chr, dbl) %>% @@ -1314,7 +1314,7 @@ test_that("Dataset and query print methods", { ), fixed = TRUE ) - expect_is(ds$metadata, "list") + expect_type(ds$metadata, "list") q <- select(ds, string = chr, lgl, integer = int) expect_output( print(q), @@ -1348,16 +1348,16 @@ test_that("Dataset and query print methods", { expect_scan_result <- function(ds, schm) { sb <- ds$NewScan() - expect_is(sb, "ScannerBuilder") + expect_r6_class(sb, "ScannerBuilder") expect_equal(sb$schema, schm) 
sb$Project(c("chr", "lgl")) sb$Filter(Expression$field_ref("dbl") == 8) scn <- sb$Finish() - expect_is(scn, "Scanner") + expect_r6_class(scn, "Scanner") tab <- scn$ToTable() - expect_is(tab, "Table") + expect_r6_class(tab, "Table") expect_equivalent( as.data.frame(tab), @@ -1373,19 +1373,19 @@ test_that("Assembling a Dataset manually and getting a Table", { fmt <- FileFormat$create("parquet") factory <- FileSystemDatasetFactory$create(fs, selector, NULL, fmt, partitioning = partitioning) - expect_is(factory, "FileSystemDatasetFactory") - + expect_r6_class(factory, "FileSystemDatasetFactory") + schm <- factory$Inspect() - expect_is(schm, "Schema") + expect_r6_class(schm, "Schema") phys_schm <- ParquetFileReader$create(files[1])$GetSchema() expect_equal(names(phys_schm), names(df1)) expect_equal(names(schm), c(names(phys_schm), "part")) child <- factory$Finish(schm) - expect_is(child, "FileSystemDataset") - expect_is(child$schema, "Schema") - expect_is(child$format, "ParquetFileFormat") + expect_r6_class(child, "FileSystemDataset") + expect_r6_class(child$schema, "Schema") + expect_r6_class(child$format, "ParquetFileFormat") expect_equal(names(schm), names(child$schema)) expect_equivalent(child$files, files) @@ -1396,22 +1396,22 @@ test_that("Assembling a Dataset manually and getting a Table", { test_that("Assembling multiple DatasetFactories with DatasetFactory", { skip_if_not_available("parquet") factory1 <- dataset_factory(file.path(dataset_dir, 1), format = "parquet") - expect_is(factory1, "FileSystemDatasetFactory") + expect_r6_class(factory1, "FileSystemDatasetFactory") factory2 <- dataset_factory(file.path(dataset_dir, 2), format = "parquet") - expect_is(factory2, "FileSystemDatasetFactory") + expect_r6_class(factory2, "FileSystemDatasetFactory") factory <- DatasetFactory$create(list(factory1, factory2)) - expect_is(factory, "DatasetFactory") + expect_r6_class(factory, "DatasetFactory") schm <- factory$Inspect() - expect_is(schm, "Schema") + 
expect_r6_class(schm, "Schema") phys_schm <- ParquetFileReader$create(files[1])$GetSchema() expect_equal(names(phys_schm), names(df1)) ds <- factory$Finish(schm) - expect_is(ds, "UnionDataset") - expect_is(ds$schema, "Schema") + expect_r6_class(ds, "UnionDataset") + expect_r6_class(ds$schema, "Schema") expect_equal(names(schm), names(ds$schema)) expect_equivalent(map(ds$children, ~.$files), files) diff --git a/r/tests/testthat/test-dplyr-filter.R b/r/tests/testthat/test-dplyr-filter.R index bac64297c5a..c4ab042380f 100644 --- a/r/tests/testthat/test-dplyr-filter.R +++ b/r/tests/testthat/test-dplyr-filter.R @@ -272,7 +272,7 @@ test_that("filter() with string ops", { test_that("filter environment scope", { # "object 'b_var' not found" - expect_dplyr_error(input %>% filter(batch, chr == b_var)) + expect_dplyr_error(input %>% filter(chr == b_var), tbl) b_var <- "b" expect_dplyr_equal( @@ -283,7 +283,8 @@ test_that("filter environment scope", { ) # Also for functions # 'could not find function "isEqualTo"' because we haven't defined it yet - expect_dplyr_error(filter(batch, isEqualTo(int, 4))) + expect_dplyr_error(input %>% filter(isEqualTo(int, 4)), tbl) + skip("Need to substitute in user defined function too") # TODO: fix this: this isEqualTo function is eagerly evaluating; it should @@ -389,11 +390,14 @@ test_that("filter() with .data pronoun", { tbl ) + skip("test now faulty - code no longer gives error & outputs a empty tibble") # but there is an error if we don't override the masking with `.env` expect_dplyr_error( - tbl %>% + input %>% filter(.data$dbl > chr) %>% select(.data$chr, .data$int, .data$lgl) %>% - collect() + collect(), + tbl ) + }) diff --git a/r/tests/testthat/test-dplyr-mutate.R b/r/tests/testthat/test-dplyr-mutate.R index 662f6d7478a..4f202fa5958 100644 --- a/r/tests/testthat/test-dplyr-mutate.R +++ b/r/tests/testthat/test-dplyr-mutate.R @@ -26,7 +26,7 @@ tbl$verses <- verses[[1]] tbl$padded_strings <- stringr::str_pad(letters[1:10], width = 
2*(1:10)+1, side = "both") test_that("mutate() is lazy", { - expect_is( + expect_s3_class( tbl %>% record_batch() %>% mutate(int = int + 6L), "arrow_dplyr_query" ) diff --git a/r/tests/testthat/test-dplyr.R b/r/tests/testthat/test-dplyr.R index def7886a0bf..a02b00f3d95 100644 --- a/r/tests/testthat/test-dplyr.R +++ b/r/tests/testthat/test-dplyr.R @@ -32,7 +32,7 @@ test_that("basic select/filter/collect", { select(int, chr) %>% filter(int > 5) - expect_is(b2, "arrow_dplyr_query") + expect_s3_class(b2, "arrow_dplyr_query") t2 <- collect(b2) expect_equal(t2, tbl[tbl$int > 5 & !is.na(tbl$int), c("int", "chr")]) # Test that the original object is not affected @@ -187,7 +187,7 @@ test_that("collect(as_data_frame=FALSE)", { filter(int > 5) %>% collect(as_data_frame = FALSE) - expect_is(b2, "RecordBatch") + expect_r6_class(b2, "RecordBatch") expected <- tbl[tbl$int > 5 & !is.na(tbl$int), c("int", "chr")] expect_equal(as.data.frame(b2), expected) @@ -195,7 +195,7 @@ test_that("collect(as_data_frame=FALSE)", { select(int, strng = chr) %>% filter(int > 5) %>% collect(as_data_frame = FALSE) - expect_is(b3, "RecordBatch") + expect_r6_class(b3, "RecordBatch") expect_equal(as.data.frame(b3), set_names(expected, c("int", "strng"))) b4 <- batch %>% @@ -203,7 +203,7 @@ test_that("collect(as_data_frame=FALSE)", { filter(int > 5) %>% group_by(int) %>% collect(as_data_frame = FALSE) - expect_is(b4, "arrow_dplyr_query") + expect_s3_class(b4, "arrow_dplyr_query") expect_equal( as.data.frame(b4), expected %>% @@ -257,7 +257,7 @@ test_that("head", { filter(int > 5) %>% head(2) - expect_is(b2, "RecordBatch") + expect_r6_class(b2, "RecordBatch") expected <- tbl[tbl$int > 5 & !is.na(tbl$int), c("int", "chr")][1:2, ] expect_equal(as.data.frame(b2), expected) @@ -265,7 +265,7 @@ test_that("head", { select(int, strng = chr) %>% filter(int > 5) %>% head(2) - expect_is(b3, "RecordBatch") + expect_r6_class(b3, "RecordBatch") expect_equal(as.data.frame(b3), set_names(expected, c("int", "strng"))) b4 
<- batch %>% @@ -273,7 +273,7 @@ test_that("head", { filter(int > 5) %>% group_by(int) %>% head(2) - expect_is(b4, "arrow_dplyr_query") + expect_s3_class(b4, "arrow_dplyr_query") expect_equal( as.data.frame(b4), expected %>% @@ -290,7 +290,7 @@ test_that("tail", { filter(int > 5) %>% tail(2) - expect_is(b2, "RecordBatch") + expect_r6_class(b2, "RecordBatch") expected <- tail(tbl[tbl$int > 5 & !is.na(tbl$int), c("int", "chr")], 2) expect_equal(as.data.frame(b2), expected) @@ -298,7 +298,7 @@ test_that("tail", { select(int, strng = chr) %>% filter(int > 5) %>% tail(2) - expect_is(b3, "RecordBatch") + expect_r6_class(b3, "RecordBatch") expect_equal(as.data.frame(b3), set_names(expected, c("int", "strng"))) b4 <- batch %>% @@ -306,7 +306,7 @@ test_that("tail", { filter(int > 5) %>% group_by(int) %>% tail(2) - expect_is(b4, "arrow_dplyr_query") + expect_s3_class(b4, "arrow_dplyr_query") expect_equal( as.data.frame(b4), expected %>% diff --git a/r/tests/testthat/test-expression.R b/r/tests/testthat/test-expression.R index d7eb6df63e3..dd61b5e3ca2 100644 --- a/r/tests/testthat/test-expression.R +++ b/r/tests/testthat/test-expression.R @@ -18,7 +18,7 @@ context("Expressions") test_that("Can create an expression", { - expect_is(build_array_expression(">", Array$create(1:5), 4), "array_expression") + expect_s3_class(build_array_expression(">", Array$create(1:5), 4), "array_expression") }) test_that("as.vector(array_expression)", { @@ -37,11 +37,11 @@ test_that("array_expression print method", { test_that("array_refs", { tab <- Table$create(a = 1:5) ex <- build_array_expression(">", array_expression("array_ref", field_name = "a"), 4) - expect_is(ex, "array_expression") + expect_s3_class(ex, "array_expression") expect_identical(ex$args[[1]]$args$field_name, "a") expect_identical(find_array_refs(ex), "a") out <- eval_array_expression(ex, tab) - expect_is(out, "ChunkedArray") + expect_r6_class(out, "ChunkedArray") expect_equal(as.vector(out), c(FALSE, FALSE, FALSE, FALSE, TRUE)) 
}) @@ -55,45 +55,45 @@ test_that("C++ expressions", { i64 <- Expression$scalar(bit64::as.integer64(42)) time <- Expression$scalar(hms::hms(56, 34, 12)) - expect_is(f == g, "Expression") - expect_is(f == 4, "Expression") - expect_is(f == "", "Expression") - expect_is(f == NULL, "Expression") - expect_is(f == date, "Expression") - expect_is(f == i64, "Expression") - expect_is(f == time, "Expression") + expect_r6_class(f == g, "Expression") + expect_r6_class(f == 4, "Expression") + expect_r6_class(f == "", "Expression") + expect_r6_class(f == NULL, "Expression") + expect_r6_class(f == date, "Expression") + expect_r6_class(f == i64, "Expression") + expect_r6_class(f == time, "Expression") # can't seem to make this work right now because of R Ops.method dispatch - # expect_is(f == as.Date("2020-01-15"), "Expression") - expect_is(f == ts, "Expression") - expect_is(f <= 2L, "Expression") - expect_is(f != FALSE, "Expression") - expect_is(f > 4, "Expression") - expect_is(f < 4 & f > 2, "Expression") - expect_is(f < 4 | f > 2, "Expression") - expect_is(!(f < 4), "Expression") + # expect_r6_class(f == as.Date("2020-01-15"), "Expression") + expect_r6_class(f == ts, "Expression") + expect_r6_class(f <= 2L, "Expression") + expect_r6_class(f != FALSE, "Expression") + expect_r6_class(f > 4, "Expression") + expect_r6_class(f < 4 & f > 2, "Expression") + expect_r6_class(f < 4 | f > 2, "Expression") + expect_r6_class(!(f < 4), "Expression") expect_output( print(f > 4), 'Expression\n(f > 4)', fixed = TRUE ) # Interprets that as a list type - expect_is(f == c(1L, 2L), "Expression") + expect_r6_class(f == c(1L, 2L), "Expression") }) test_that("Can create an expression", { a <- Array$create(as.numeric(1:5)) expr <- array_expression("cast", a, options = list(to_type = int32())) - expect_is(expr, "array_expression") + expect_s3_class(expr, "array_expression") expect_equal(eval_array_expression(expr), Array$create(1:5)) b <- Array$create(0.5:4.5) bad_expr <- array_expression("cast", b, 
options = list(to_type = int32())) - expect_is(bad_expr, "array_expression") + expect_s3_class(bad_expr, "array_expression") expect_error( eval_array_expression(bad_expr), "Invalid: Float value .* was truncated converting" ) expr <- array_expression("cast", b, options = list(to_type = int32(), allow_float_truncate = TRUE)) - expect_is(expr, "array_expression") + expect_s3_class(expr, "array_expression") expect_equal(eval_array_expression(expr), Array$create(0:4)) }) diff --git a/r/tests/testthat/test-feather.R b/r/tests/testthat/test-feather.R index abaae2c7195..d5d82a73e12 100644 --- a/r/tests/testthat/test-feather.R +++ b/r/tests/testthat/test-feather.R @@ -44,18 +44,18 @@ expect_feather_roundtrip <- function(write_fun) { # Read both back tab2 <- read_feather(tf2) - expect_is(tab2, "data.frame") + expect_s3_class(tab2, "data.frame") tab3 <- read_feather(tf3) - expect_is(tab3, "data.frame") + expect_s3_class(tab3, "data.frame") # reading directly from arrow::io::MemoryMappedFile tab4 <- read_feather(mmap_open(tf3)) - expect_is(tab4, "data.frame") + expect_s3_class(tab4, "data.frame") # reading directly from arrow::io::ReadableFile tab5 <- read_feather(ReadableFile$create(tf3)) - expect_is(tab5, "data.frame") + expect_s3_class(tab5, "data.frame") expect_equal(tib, tab2) expect_equal(tib, tab3) @@ -105,7 +105,7 @@ test_that("write_feather option error handling", { test_that("read_feather supports col_select = ", { tab1 <- read_feather(feather_file, col_select = c("x", "y")) - expect_is(tab1, "data.frame") + expect_s3_class(tab1, "data.frame") expect_equal(tib$x, tab1$x) expect_equal(tib$y, tab1$y) @@ -113,7 +113,7 @@ test_that("read_feather supports col_select = ", { test_that("feather handles col_select = ", { tab1 <- read_feather(feather_file, col_select = 1:2) - expect_is(tab1, "data.frame") + expect_s3_class(tab1, "data.frame") expect_equal(tib$x, tab1$x) expect_equal(tib$y, tab1$y) @@ -135,7 +135,7 @@ test_that("feather handles col_select = ", { 
test_that("feather read/write round trip", { tab1 <- read_feather(feather_file, as_data_frame = FALSE) - expect_is(tab1, "Table") + expect_r6_class(tab1, "Table") expect_equal(tib, as.data.frame(tab1)) }) @@ -143,7 +143,7 @@ test_that("feather read/write round trip", { test_that("Read feather from raw vector", { test_raw <- readBin(feather_file, what = "raw", n = 5000) df <- read_feather(test_raw) - expect_is(df, "data.frame") + expect_s3_class(df, "data.frame") }) test_that("FeatherReader", { diff --git a/r/tests/testthat/test-filesystem.R b/r/tests/testthat/test-filesystem.R index 918c495ec04..344865c077a 100644 --- a/r/tests/testthat/test-filesystem.R +++ b/r/tests/testthat/test-filesystem.R @@ -81,9 +81,9 @@ test_that("SubTreeFilesystem", { file.copy(DESCRIPTION, file.path(td, "DESCRIPTION")) st_fs <- SubTreeFileSystem$create(td) - expect_is(st_fs, "SubTreeFileSystem") - expect_is(st_fs, "FileSystem") - expect_is(st_fs$base_fs, "LocalFileSystem") + expect_r6_class(st_fs, "SubTreeFileSystem") + expect_r6_class(st_fs, "FileSystem") + expect_r6_class(st_fs$base_fs, "LocalFileSystem") expect_identical( capture.output(print(st_fs)), paste0("SubTreeFileSystem: ", "file://", st_fs$base_path) @@ -137,7 +137,7 @@ test_that("FileSystem$from_uri", { skip_on_cran() skip_if_not_available("s3") fs_and_path <- FileSystem$from_uri("s3://ursa-labs-taxi-data") - expect_is(fs_and_path$fs, "S3FileSystem") + expect_r6_class(fs_and_path$fs, "S3FileSystem") expect_identical(fs_and_path$fs$region, "us-east-2") }) @@ -145,7 +145,7 @@ test_that("SubTreeFileSystem$create() with URI", { skip_on_cran() skip_if_not_available("s3") fs <- SubTreeFileSystem$create("s3://ursa-labs-taxi-data") - expect_is(fs, "SubTreeFileSystem") + expect_r6_class(fs, "SubTreeFileSystem") expect_identical( capture.output(print(fs)), "SubTreeFileSystem: s3://ursa-labs-taxi-data/" @@ -156,15 +156,15 @@ test_that("S3FileSystem", { skip_on_cran() skip_if_not_available("s3") s3fs <- S3FileSystem$create() - 
expect_is(s3fs, "S3FileSystem") + expect_r6_class(s3fs, "S3FileSystem") }) test_that("s3_bucket", { skip_on_cran() skip_if_not_available("s3") bucket <- s3_bucket("ursa-labs-r-test") - expect_is(bucket, "SubTreeFileSystem") - expect_is(bucket$base_fs, "S3FileSystem") + expect_r6_class(bucket, "SubTreeFileSystem") + expect_r6_class(bucket$base_fs, "S3FileSystem") expect_identical(bucket$region, "us-west-2") expect_identical( capture.output(print(bucket)), diff --git a/r/tests/testthat/test-json.R b/r/tests/testthat/test-json.R index a35a465bf0b..b0b508bbc4b 100644 --- a/r/tests/testthat/test-json.R +++ b/r/tests/testthat/test-json.R @@ -58,9 +58,9 @@ test_that("read_json_arrow() converts to tibble", { tab2 <- read_json_arrow(mmap_open(tf)) tab3 <- read_json_arrow(ReadableFile$create(tf)) - expect_is(tab1, "tbl_df") - expect_is(tab2, "tbl_df") - expect_is(tab3, "tbl_df") + expect_s3_class(tab1, "tbl_df") + expect_s3_class(tab2, "tbl_df") + expect_s3_class(tab3, "tbl_df") expect_equal(tab1, tab2) expect_equal(tab1, tab3) diff --git a/r/tests/testthat/test-memory-pool.R b/r/tests/testthat/test-memory-pool.R index ab38cc71ffd..0aa18aadc20 100644 --- a/r/tests/testthat/test-memory-pool.R +++ b/r/tests/testthat/test-memory-pool.R @@ -18,8 +18,8 @@ test_that("default_memory_pool and its attributes", { pool <- default_memory_pool() # Not integer bc can be >2gb, so we cast to double - expect_is(pool$bytes_allocated, "numeric") - expect_is(pool$max_memory, "numeric") + expect_type(pool$bytes_allocated, "double") + expect_type(pool$max_memory, "double") expect_true(pool$backend_name %in% c("system", "jemalloc", "mimalloc")) expect_true(all(supported_memory_backends() %in% c("system", "jemalloc", "mimalloc"))) diff --git a/r/tests/testthat/test-message-reader.R b/r/tests/testthat/test-message-reader.R index 0bd6d66c544..340a3e3ed1e 100644 --- a/r/tests/testthat/test-message-reader.R +++ b/r/tests/testthat/test-message-reader.R @@ -24,10 +24,10 @@ test_that("MessageReader can be 
created from raw vectors", { reader <- MessageReader$create(bytes) message <- reader$ReadNextMessage() - expect_is(message, "Message") + expect_r6_class(message, "Message") expect_equal(message$type, MessageType$RECORD_BATCH) - expect_is(message$body, "Buffer") - expect_is(message$metadata, "Buffer") + expect_r6_class(message$body, "Buffer") + expect_r6_class(message$metadata, "Buffer") message <- reader$ReadNextMessage() expect_null(message) @@ -38,10 +38,10 @@ test_that("MessageReader can be created from raw vectors", { reader <- MessageReader$create(bytes) message <- reader$ReadNextMessage() - expect_is(message, "Message") + expect_r6_class(message, "Message") expect_equal(message$type, MessageType$SCHEMA) - expect_is(message$body, "Buffer") - expect_is(message$metadata, "Buffer") + expect_r6_class(message$body, "Buffer") + expect_r6_class(message$metadata, "Buffer") message <- reader$ReadNextMessage() expect_null(message) @@ -52,16 +52,16 @@ test_that("MessageReader can be created from input stream", { bytes <- batch$serialize() stream <- BufferReader$create(bytes) - expect_is(stream, "BufferReader") + expect_r6_class(stream, "BufferReader") reader <- MessageReader$create(stream) - expect_is(reader, "MessageReader") + expect_r6_class(reader, "MessageReader") message <- reader$ReadNextMessage() - expect_is(message, "Message") + expect_r6_class(message, "Message") expect_equal(message$type, MessageType$RECORD_BATCH) - expect_is(message$body, "Buffer") - expect_is(message$metadata, "Buffer") + expect_r6_class(message$body, "Buffer") + expect_r6_class(message$metadata, "Buffer") message <- reader$ReadNextMessage() expect_null(message) @@ -70,16 +70,16 @@ test_that("MessageReader can be created from input stream", { bytes <- schema$serialize() stream <- BufferReader$create(bytes) - expect_is(stream, "BufferReader") + expect_r6_class(stream, "BufferReader") reader <- MessageReader$create(stream) - expect_is(reader, "MessageReader") + expect_r6_class(reader, 
"MessageReader") message <- reader$ReadNextMessage() - expect_is(message, "Message") + expect_r6_class(message, "Message") expect_equal(message$type, MessageType$SCHEMA) - expect_is(message$body, "Buffer") - expect_is(message$metadata, "Buffer") + expect_r6_class(message$body, "Buffer") + expect_r6_class(message$metadata, "Buffer") message <- reader$ReadNextMessage() expect_null(message) diff --git a/r/tests/testthat/test-message.R b/r/tests/testthat/test-message.R index c6cd9fe4b09..b9fb3a162a7 100644 --- a/r/tests/testthat/test-message.R +++ b/r/tests/testthat/test-message.R @@ -23,10 +23,10 @@ test_that("read_message can read from input stream", { stream <- BufferReader$create(bytes) message <- read_message(stream) - expect_is(message, "Message") + expect_r6_class(message, "Message") expect_equal(message$type, MessageType$RECORD_BATCH) - expect_is(message$body, "Buffer") - expect_is(message$metadata, "Buffer") + expect_r6_class(message$body, "Buffer") + expect_r6_class(message$metadata, "Buffer") message <- read_message(stream) expect_null(read_message(stream)) @@ -37,10 +37,10 @@ test_that("read_message() can read Schema messages", { stream <- BufferReader$create(bytes) message <- read_message(stream) - expect_is(message, "Message") + expect_r6_class(message, "Message") expect_equal(message$type, MessageType$SCHEMA) - expect_is(message$body, "Buffer") - expect_is(message$metadata, "Buffer") + expect_r6_class(message$body, "Buffer") + expect_r6_class(message$metadata, "Buffer") message <- read_message(stream) expect_null(read_message(stream)) diff --git a/r/tests/testthat/test-metadata.R b/r/tests/testthat/test-metadata.R index 4e1895e82ec..afce1c2244c 100644 --- a/r/tests/testthat/test-metadata.R +++ b/r/tests/testthat/test-metadata.R @@ -167,7 +167,7 @@ test_that("haven types roundtrip via feather", { test_that("Date/time type roundtrip", { rb <- record_batch(example_with_times) - expect_is(rb$schema$posixlt$type, "StructType") + 
expect_r6_class(rb$schema$posixlt$type, "StructType") expect_identical(as.data.frame(rb), example_with_times) }) diff --git a/r/tests/testthat/test-python.R b/r/tests/testthat/test-python.R index b564bfee950..885274846e1 100644 --- a/r/tests/testthat/test-python.R +++ b/r/tests/testthat/test-python.R @@ -43,7 +43,7 @@ test_that("Array to Python", { pa <- reticulate::import("pyarrow", convert = FALSE) r <- Array$create(c(1, 2, 3)) py <- pa$concat_arrays(list(r)) - expect_is(py, "pyarrow.lib.Array") + expect_s3_class(py, "pyarrow.lib.Array") expect_equal(reticulate::py_to_r(py), r) }) @@ -52,7 +52,7 @@ test_that("RecordBatch to/from Python", { pa <- reticulate::import("pyarrow", convert = FALSE) batch <- record_batch(col1 = c(1, 2, 3), col2 = letters[1:3]) py <- reticulate::r_to_py(batch) - expect_is(py, "pyarrow.lib.RecordBatch") + expect_s3_class(py, "pyarrow.lib.RecordBatch") expect_equal(reticulate::py_to_r(py), batch) }) @@ -63,8 +63,8 @@ test_that("Table and ChunkedArray from Python", { tab <- Table$create(batch, batch) pybatch <- reticulate::r_to_py(batch) pytab <- pa$Table$from_batches(list(pybatch, pybatch)) - expect_is(pytab, "pyarrow.lib.Table") - expect_is(pytab[0], "pyarrow.lib.ChunkedArray") + expect_s3_class(pytab, "pyarrow.lib.Table") + expect_s3_class(pytab[0], "pyarrow.lib.ChunkedArray") expect_equal(reticulate::py_to_r(pytab[0]), tab$col1) expect_equal(reticulate::py_to_r(pytab), tab) }) @@ -75,11 +75,11 @@ test_that("Table and ChunkedArray to Python", { tab <- Table$create(batch, batch) pychunked <- reticulate::r_to_py(tab$col1) - expect_is(pychunked, "pyarrow.lib.ChunkedArray") + expect_s3_class(pychunked, "pyarrow.lib.ChunkedArray") expect_equal(reticulate::py_to_r(pychunked), tab$col1) pytab <- reticulate::r_to_py(tab) - expect_is(pytab, "pyarrow.lib.Table") + expect_s3_class(pytab, "pyarrow.lib.Table") expect_equal(reticulate::py_to_r(pytab), tab) }) @@ -87,7 +87,7 @@ test_that("RecordBatch with metadata roundtrip", { skip_if_no_pyarrow() 
batch <- RecordBatch$create(example_with_times) pybatch <- reticulate::r_to_py(batch) - expect_is(pybatch, "pyarrow.lib.RecordBatch") + expect_s3_class(pybatch, "pyarrow.lib.RecordBatch") expect_equal(reticulate::py_to_r(pybatch), batch) expect_identical(as.data.frame(reticulate::py_to_r(pybatch)), example_with_times) }) @@ -96,7 +96,7 @@ test_that("Table with metadata roundtrip", { skip_if_no_pyarrow() tab <- Table$create(example_with_times) pytab <- reticulate::r_to_py(tab) - expect_is(pytab, "pyarrow.lib.Table") + expect_s3_class(pytab, "pyarrow.lib.Table") expect_equal(reticulate::py_to_r(pytab), tab) expect_identical(as.data.frame(reticulate::py_to_r(pytab)), example_with_times) }) diff --git a/r/tests/testthat/test-read-record-batch.R b/r/tests/testthat/test-read-record-batch.R index 9383c476588..56f4e8e6e00 100644 --- a/r/tests/testthat/test-read-record-batch.R +++ b/r/tests/testthat/test-read-record-batch.R @@ -34,7 +34,7 @@ test_that("RecordBatchFileWriter / RecordBatchFileReader roundtrips", { stream <- FileOutputStream$create(tf) writer <- RecordBatchFileWriter$create(stream, tab$schema) - expect_is(writer, "RecordBatchWriter") + expect_r6_class(writer, "RecordBatchWriter") writer$write_table(tab) writer$close() stream$close() diff --git a/r/tests/testthat/test-read-write.R b/r/tests/testthat/test-read-write.R index a9ce5f12809..ea3aa34a424 100644 --- a/r/tests/testthat/test-read-write.R +++ b/r/tests/testthat/test-read-write.R @@ -119,7 +119,7 @@ test_that("reading/writing a raw vector (sparklyr integration)", { as.data.frame(RecordBatchStreamReader$create(x)$read_next_batch()) } bytes <- write_to_raw(example_data) - expect_is(bytes, "raw") + expect_type(bytes, "raw") expect_identical(read_from_raw_test(bytes), example_data) # this could just be `read_ipc_stream(x)`; propose that expect_identical(read_ipc_stream(bytes), example_data) diff --git a/r/tests/testthat/test-record-batch-reader.R b/r/tests/testthat/test-record-batch-reader.R index 
9a5e4dd4cc0..483588ab4bb 100644 --- a/r/tests/testthat/test-record-batch-reader.R +++ b/r/tests/testthat/test-record-batch-reader.R @@ -28,7 +28,7 @@ test_that("RecordBatchStreamReader / Writer", { sink <- BufferOutputStream$create() expect_equal(sink$tell(), 0) writer <- RecordBatchStreamWriter$create(sink, batch$schema) - expect_is(writer, "RecordBatchWriter") + expect_r6_class(writer, "RecordBatchWriter") writer$write(batch) writer$write(tab) writer$write(tbl) @@ -36,19 +36,19 @@ test_that("RecordBatchStreamReader / Writer", { writer$close() buf <- sink$finish() - expect_is(buf, "Buffer") + expect_r6_class(buf, "Buffer") reader <- RecordBatchStreamReader$create(buf) - expect_is(reader, "RecordBatchStreamReader") + expect_r6_class(reader, "RecordBatchStreamReader") batch1 <- reader$read_next_batch() - expect_is(batch1, "RecordBatch") + expect_r6_class(batch1, "RecordBatch") expect_equal(batch, batch1) batch2 <- reader$read_next_batch() - expect_is(batch2, "RecordBatch") + expect_r6_class(batch2, "RecordBatch") expect_equal(batch, batch2) batch3 <- reader$read_next_batch() - expect_is(batch3, "RecordBatch") + expect_r6_class(batch3, "RecordBatch") expect_equal(batch, batch3) expect_null(reader$read_next_batch()) }) @@ -56,20 +56,20 @@ test_that("RecordBatchStreamReader / Writer", { test_that("RecordBatchFileReader / Writer", { sink <- BufferOutputStream$create() writer <- RecordBatchFileWriter$create(sink, batch$schema) - expect_is(writer, "RecordBatchWriter") + expect_r6_class(writer, "RecordBatchWriter") writer$write(batch) writer$write(tab) writer$write(tbl) writer$close() buf <- sink$finish() - expect_is(buf, "Buffer") + expect_r6_class(buf, "Buffer") reader <- RecordBatchFileReader$create(buf) - expect_is(reader, "RecordBatchFileReader") + expect_r6_class(reader, "RecordBatchFileReader") batch1 <- reader$get_batch(0) - expect_is(batch1, "RecordBatch") + expect_r6_class(batch1, "RecordBatch") expect_equal(batch, batch1) expect_equal(reader$num_record_batches, 
3) @@ -78,7 +78,7 @@ test_that("RecordBatchFileReader / Writer", { test_that("StreamReader read_table", { sink <- BufferOutputStream$create() writer <- RecordBatchStreamWriter$create(sink, batch$schema) - expect_is(writer, "RecordBatchWriter") + expect_r6_class(writer, "RecordBatchWriter") writer$write(batch) writer$write(tab) writer$write(tbl) @@ -93,7 +93,7 @@ test_that("StreamReader read_table", { test_that("FileReader read_table", { sink <- BufferOutputStream$create() writer <- RecordBatchFileWriter$create(sink, batch$schema) - expect_is(writer, "RecordBatchWriter") + expect_r6_class(writer, "RecordBatchWriter") writer$write(batch) writer$write(tab) writer$write(tbl) @@ -137,6 +137,6 @@ test_that("reader with 0 batches", { reader <- RecordBatchStreamReader$create(buf) tab <- reader$read_table() - expect_is(tab, "Table") + expect_r6_class(tab, "Table") expect_identical(dim(tab), c(0L, 1L)) }) diff --git a/r/tests/testthat/test-s3-minio.R b/r/tests/testthat/test-s3-minio.R index d3493f8110a..8cb0dafdfe4 100644 --- a/r/tests/testthat/test-s3-minio.R +++ b/r/tests/testthat/test-s3-minio.R @@ -38,7 +38,7 @@ if (arrow_with_s3() && process_is_running("minio server")) { scheme = "http", endpoint_override = paste0("localhost:", minio_port) ) - expect_is(fs, "S3FileSystem") + expect_r6_class(fs, "S3FileSystem") now <- as.character(as.numeric(Sys.time())) # If minio isn't running, this will hang for a few seconds and fail with a # curl timeout, causing `run_these` to be set to FALSE and skipping the tests diff --git a/r/tests/testthat/test-scalar.R b/r/tests/testthat/test-scalar.R index 501298a8021..21b2836496e 100644 --- a/r/tests/testthat/test-scalar.R +++ b/r/tests/testthat/test-scalar.R @@ -19,7 +19,7 @@ context("Scalar") expect_scalar_roundtrip <- function(x, type) { s <- Scalar$create(x) - expect_is(s, "Scalar") + expect_r6_class(s, "Scalar") expect_type_equal(s$type, type) expect_identical(length(s), 1L) if (inherits(type, "NestedType")) { diff --git 
a/r/tests/testthat/test-schema.R b/r/tests/testthat/test-schema.R index ac888d94101..87dad175e2b 100644 --- a/r/tests/testthat/test-schema.R +++ b/r/tests/testthat/test-schema.R @@ -120,27 +120,27 @@ test_that("reading schema from Buffer", { # TODO: this uses the streaming format, i.e. from RecordBatchStreamWriter # maybe there is an easier way to serialize a schema batch <- record_batch(x = 1:10) - expect_is(batch, "RecordBatch") + expect_r6_class(batch, "RecordBatch") stream <- BufferOutputStream$create() writer <- RecordBatchStreamWriter$create(stream, batch$schema) - expect_is(writer, "RecordBatchWriter") + expect_r6_class(writer, "RecordBatchWriter") writer$close() buffer <- stream$finish() - expect_is(buffer, "Buffer") + expect_r6_class(buffer, "Buffer") reader <- MessageReader$create(buffer) - expect_is(reader, "MessageReader") + expect_r6_class(reader, "MessageReader") message <- reader$ReadNextMessage() - expect_is(message, "Message") + expect_r6_class(message, "Message") expect_equal(message$type, MessageType$SCHEMA) stream <- BufferReader$create(buffer) - expect_is(stream, "BufferReader") + expect_r6_class(stream, "BufferReader") message <- read_message(stream) - expect_is(message, "Message") + expect_r6_class(message, "Message") expect_equal(message$type, MessageType$SCHEMA) }) From 1ed681912be7246695cdd938ea632e1751403f67 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Tue, 13 Apr 2021 07:14:45 -0400 Subject: [PATCH 007/719] ARROW-12277: [Rust][DataFusion] Implement Sum/Count/Min/Max aggregates for Timestamp(_,_) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit # Rationale: If you try and aggregate (via SUM, for example) a column of a timestamp type, DataFusion generates an error: ``` Coercion from [Timestamp(Nanosecond, None)] to the signature Uniform(1, [Int8, Int16, Int32, Int64, UInt8, UInt16, UInt32, UInt64, Float32, Float64]) failed. 
``` For example, from IOx ``` > show columns from t; +---------------+--------------+------------+-------------+-----------------------------+-------------+ | table_catalog | table_schema | table_name | column_name | data_type | is_nullable | +---------------+--------------+------------+-------------+-----------------------------+-------------+ | datafusion | public | t | a | Utf8 | NO | | datafusion | public | t | b | Timestamp(Nanosecond, None) | NO | +---------------+--------------+------------+-------------+-----------------------------+-------------+ 2 row in set. Query took 0 seconds. > select sum(b) from t; Plan("Coercion from [Timestamp(Nanosecond, None)] to the signature Uniform(1, [Int8, Int16, Int32, Int64, UInt8, UInt16, UInt32, UInt64, Float32, Float64]) failed.") ``` # Changes: Add support for aggregating timestamp types and tests for same # Notes Note this is a follow-on to / more fleshing out of the work done in #9773 by @velvia (👋 thanks for adding Timestamps to `ScalarValue`) Supporting AVG on timestamps is tracked by https://issues.apache.org/jira/browse/ARROW-12318. It is more involved (as currently Avg assumes the output type is always F64), and not important for my use case at the moment. 
Closes #9970 from alamb/alamb/ARROW-12277-aggregate-timestamps Authored-by: Andrew Lamb Signed-off-by: Andrew Lamb --- rust/datafusion/src/execution/context.rs | 115 +++++++++++++++ .../src/physical_plan/aggregates.rs | 19 ++- .../src/physical_plan/datetime_expressions.rs | 4 +- .../src/physical_plan/expressions/min_max.rs | 49 ++++++- .../src/physical_plan/group_scalar.rs | 24 +++- rust/datafusion/src/scalar.rs | 132 ++++++++++++++---- rust/datafusion/src/test/mod.rs | 92 +++++++++++- 7 files changed, 393 insertions(+), 42 deletions(-) diff --git a/rust/datafusion/src/execution/context.rs b/rust/datafusion/src/execution/context.rs index ce0ea6d0050..07d5b629e1b 100644 --- a/rust/datafusion/src/execution/context.rs +++ b/rust/datafusion/src/execution/context.rs @@ -1403,6 +1403,121 @@ mod tests { Ok(()) } + #[tokio::test] + async fn aggregate_timestamps_sum() -> Result<()> { + let tmp_dir = TempDir::new()?; + let mut ctx = create_ctx(&tmp_dir, 1)?; + ctx.register_table("t", test::table_with_timestamps()) + .unwrap(); + + let results = plan_and_collect( + &mut ctx, + "SELECT sum(nanos), sum(micros), sum(millis), sum(secs) FROM t", + ) + .await + .unwrap_err(); + + assert_eq!(results.to_string(), "Error during planning: Coercion from [Timestamp(Nanosecond, None)] to the signature Uniform(1, [Int8, Int16, Int32, Int64, UInt8, UInt16, UInt32, UInt64, Float32, Float64]) failed."); + + Ok(()) + } + + #[tokio::test] + async fn aggregate_timestamps_count() -> Result<()> { + let tmp_dir = TempDir::new()?; + let mut ctx = create_ctx(&tmp_dir, 1)?; + ctx.register_table("t", test::table_with_timestamps()) + .unwrap(); + + let results = plan_and_collect( + &mut ctx, + "SELECT count(nanos), count(micros), count(millis), count(secs) FROM t", + ) + .await + .unwrap(); + + let expected = vec![ + "+--------------+---------------+---------------+-------------+", + "| COUNT(nanos) | COUNT(micros) | COUNT(millis) | COUNT(secs) |", + 
"+--------------+---------------+---------------+-------------+", + "| 3 | 3 | 3 | 3 |", + "+--------------+---------------+---------------+-------------+", + ]; + assert_batches_sorted_eq!(expected, &results); + + Ok(()) + } + + #[tokio::test] + async fn aggregate_timestamps_min() -> Result<()> { + let tmp_dir = TempDir::new()?; + let mut ctx = create_ctx(&tmp_dir, 1)?; + ctx.register_table("t", test::table_with_timestamps()) + .unwrap(); + + let results = plan_and_collect( + &mut ctx, + "SELECT min(nanos), min(micros), min(millis), min(secs) FROM t", + ) + .await + .unwrap(); + + let expected = vec![ + "+----------------------------+----------------------------+-------------------------+---------------------+", + "| MIN(nanos) | MIN(micros) | MIN(millis) | MIN(secs) |", + "+----------------------------+----------------------------+-------------------------+---------------------+", + "| 2011-12-13 11:13:10.123450 | 2011-12-13 11:13:10.123450 | 2011-12-13 11:13:10.123 | 2011-12-13 11:13:10 |", + "+----------------------------+----------------------------+-------------------------+---------------------+", + ]; + assert_batches_sorted_eq!(expected, &results); + + Ok(()) + } + + #[tokio::test] + async fn aggregate_timestamps_max() -> Result<()> { + let tmp_dir = TempDir::new()?; + let mut ctx = create_ctx(&tmp_dir, 1)?; + ctx.register_table("t", test::table_with_timestamps()) + .unwrap(); + + let results = plan_and_collect( + &mut ctx, + "SELECT max(nanos), max(micros), max(millis), max(secs) FROM t", + ) + .await + .unwrap(); + + let expected = vec![ + "+-------------------------+-------------------------+-------------------------+---------------------+", + "| MAX(nanos) | MAX(micros) | MAX(millis) | MAX(secs) |", + "+-------------------------+-------------------------+-------------------------+---------------------+", + "| 2021-01-01 05:11:10.432 | 2021-01-01 05:11:10.432 | 2021-01-01 05:11:10.432 | 2021-01-01 05:11:10 |", + 
"+-------------------------+-------------------------+-------------------------+---------------------+", +]; + assert_batches_sorted_eq!(expected, &results); + + Ok(()) + } + + #[tokio::test] + async fn aggregate_timestamps_avg() -> Result<()> { + let tmp_dir = TempDir::new()?; + let mut ctx = create_ctx(&tmp_dir, 1)?; + ctx.register_table("t", test::table_with_timestamps()) + .unwrap(); + + let results = plan_and_collect( + &mut ctx, + "SELECT avg(nanos), avg(micros), avg(millis), avg(secs) FROM t", + ) + .await + .unwrap_err(); + + assert_eq!(results.to_string(), "Error during planning: Coercion from [Timestamp(Nanosecond, None)] to the signature Uniform(1, [Int8, Int16, Int32, Int64, UInt8, UInt16, UInt32, UInt64, Float32, Float64]) failed."); + Ok(()) + } + #[tokio::test] async fn join_partitioned() -> Result<()> { // self join on partition id (workaround for duplicate column name) diff --git a/rust/datafusion/src/physical_plan/aggregates.rs b/rust/datafusion/src/physical_plan/aggregates.rs index be90daa954d..9417c7c8f05 100644 --- a/rust/datafusion/src/physical_plan/aggregates.rs +++ b/rust/datafusion/src/physical_plan/aggregates.rs @@ -34,7 +34,7 @@ use super::{ use crate::error::{DataFusionError, Result}; use crate::physical_plan::distinct_expressions; use crate::physical_plan::expressions; -use arrow::datatypes::{DataType, Schema}; +use arrow::datatypes::{DataType, Schema, TimeUnit}; use expressions::{avg_return_type, sum_return_type}; use std::{fmt, str::FromStr, sync::Arc}; @@ -160,6 +160,8 @@ pub fn create_aggregate_expr( }) } +static STRINGS: &[DataType] = &[DataType::Utf8, DataType::LargeUtf8]; + static NUMERICS: &[DataType] = &[ DataType::Int8, DataType::Int16, @@ -173,14 +175,25 @@ static NUMERICS: &[DataType] = &[ DataType::Float64, ]; +static TIMESTAMPS: &[DataType] = &[ + DataType::Timestamp(TimeUnit::Second, None), + DataType::Timestamp(TimeUnit::Millisecond, None), + DataType::Timestamp(TimeUnit::Microsecond, None), + 
DataType::Timestamp(TimeUnit::Nanosecond, None), +]; + /// the signatures supported by the function `fun`. fn signature(fun: &AggregateFunction) -> Signature { // note: the physical expression must accept the type returned by this function or the execution panics. match fun { AggregateFunction::Count => Signature::Any(1), AggregateFunction::Min | AggregateFunction::Max => { - let mut valid = vec![DataType::Utf8, DataType::LargeUtf8]; - valid.extend_from_slice(NUMERICS); + let valid = STRINGS + .iter() + .chain(NUMERICS.iter()) + .chain(TIMESTAMPS.iter()) + .cloned() + .collect::>(); Signature::Uniform(1, valid) } AggregateFunction::Avg | AggregateFunction::Sum => { diff --git a/rust/datafusion/src/physical_plan/datetime_expressions.rs b/rust/datafusion/src/physical_plan/datetime_expressions.rs index 3d363ce97d2..7b5816186f2 100644 --- a/rust/datafusion/src/physical_plan/datetime_expressions.rs +++ b/rust/datafusion/src/physical_plan/datetime_expressions.rs @@ -324,8 +324,8 @@ pub fn date_trunc(args: &[ColumnarValue]) -> Result { Ok(match array { ColumnarValue::Scalar(scalar) => { - if let ScalarValue::TimeNanosecond(v) = scalar { - ColumnarValue::Scalar(ScalarValue::TimeNanosecond((f)(*v)?)) + if let ScalarValue::TimestampNanosecond(v) = scalar { + ColumnarValue::Scalar(ScalarValue::TimestampNanosecond((f)(*v)?)) } else { return Err(DataFusionError::Execution( "array of `date_trunc` must be non-null scalar Utf8".to_string(), diff --git a/rust/datafusion/src/physical_plan/expressions/min_max.rs b/rust/datafusion/src/physical_plan/expressions/min_max.rs index 2fd84a6cc70..5ed14610ada 100644 --- a/rust/datafusion/src/physical_plan/expressions/min_max.rs +++ b/rust/datafusion/src/physical_plan/expressions/min_max.rs @@ -25,12 +25,13 @@ use crate::error::{DataFusionError, Result}; use crate::physical_plan::{Accumulator, AggregateExpr, PhysicalExpr}; use crate::scalar::ScalarValue; use arrow::compute; -use arrow::datatypes::DataType; +use arrow::datatypes::{DataType, 
TimeUnit}; use arrow::{ array::{ ArrayRef, Float32Array, Float64Array, Int16Array, Int32Array, Int64Array, - Int8Array, LargeStringArray, StringArray, UInt16Array, UInt32Array, UInt64Array, - UInt8Array, + Int8Array, LargeStringArray, StringArray, TimestampMicrosecondArray, + TimestampMillisecondArray, TimestampNanosecondArray, TimestampSecondArray, + UInt16Array, UInt32Array, UInt64Array, UInt8Array, }, datatypes::Field, }; @@ -128,6 +129,27 @@ macro_rules! min_max_batch { DataType::UInt32 => typed_min_max_batch!($VALUES, UInt32Array, UInt32, $OP), DataType::UInt16 => typed_min_max_batch!($VALUES, UInt16Array, UInt16, $OP), DataType::UInt8 => typed_min_max_batch!($VALUES, UInt8Array, UInt8, $OP), + DataType::Timestamp(TimeUnit::Second, _) => { + typed_min_max_batch!($VALUES, TimestampSecondArray, TimestampSecond, $OP) + } + DataType::Timestamp(TimeUnit::Millisecond, _) => typed_min_max_batch!( + $VALUES, + TimestampMillisecondArray, + TimestampMillisecond, + $OP + ), + DataType::Timestamp(TimeUnit::Microsecond, _) => typed_min_max_batch!( + $VALUES, + TimestampMicrosecondArray, + TimestampMicrosecond, + $OP + ), + DataType::Timestamp(TimeUnit::Nanosecond, _) => typed_min_max_batch!( + $VALUES, + TimestampNanosecondArray, + TimestampNanosecond, + $OP + ), other => { // This should have been handled before return Err(DataFusionError::Internal(format!( @@ -229,6 +251,27 @@ macro_rules! 
min_max { (ScalarValue::LargeUtf8(lhs), ScalarValue::LargeUtf8(rhs)) => { typed_min_max_string!(lhs, rhs, LargeUtf8, $OP) } + (ScalarValue::TimestampSecond(lhs), ScalarValue::TimestampSecond(rhs)) => { + typed_min_max!(lhs, rhs, TimestampSecond, $OP) + } + ( + ScalarValue::TimestampMillisecond(lhs), + ScalarValue::TimestampMillisecond(rhs), + ) => { + typed_min_max!(lhs, rhs, TimestampMillisecond, $OP) + } + ( + ScalarValue::TimestampMicrosecond(lhs), + ScalarValue::TimestampMicrosecond(rhs), + ) => { + typed_min_max!(lhs, rhs, TimestampMicrosecond, $OP) + } + ( + ScalarValue::TimestampNanosecond(lhs), + ScalarValue::TimestampNanosecond(rhs), + ) => { + typed_min_max!(lhs, rhs, TimestampNanosecond, $OP) + } e => { return Err(DataFusionError::Internal(format!( "MIN/MAX is not expected to receive a scalar {:?}", diff --git a/rust/datafusion/src/physical_plan/group_scalar.rs b/rust/datafusion/src/physical_plan/group_scalar.rs index a55e1d7a9a3..f4987ae3a7d 100644 --- a/rust/datafusion/src/physical_plan/group_scalar.rs +++ b/rust/datafusion/src/physical_plan/group_scalar.rs @@ -64,9 +64,15 @@ impl TryFrom<&ScalarValue> for GroupByScalar { ScalarValue::UInt16(Some(v)) => GroupByScalar::UInt16(*v), ScalarValue::UInt32(Some(v)) => GroupByScalar::UInt32(*v), ScalarValue::UInt64(Some(v)) => GroupByScalar::UInt64(*v), - ScalarValue::TimeMillisecond(Some(v)) => GroupByScalar::TimeMillisecond(*v), - ScalarValue::TimeMicrosecond(Some(v)) => GroupByScalar::TimeMicrosecond(*v), - ScalarValue::TimeNanosecond(Some(v)) => GroupByScalar::TimeNanosecond(*v), + ScalarValue::TimestampMillisecond(Some(v)) => { + GroupByScalar::TimeMillisecond(*v) + } + ScalarValue::TimestampMicrosecond(Some(v)) => { + GroupByScalar::TimeMicrosecond(*v) + } + ScalarValue::TimestampNanosecond(Some(v)) => { + GroupByScalar::TimeNanosecond(*v) + } ScalarValue::Utf8(Some(v)) => GroupByScalar::Utf8(Box::new(v.clone())), ScalarValue::Float32(None) | ScalarValue::Float64(None) @@ -110,9 +116,15 @@ impl 
From<&GroupByScalar> for ScalarValue { GroupByScalar::UInt32(v) => ScalarValue::UInt32(Some(*v)), GroupByScalar::UInt64(v) => ScalarValue::UInt64(Some(*v)), GroupByScalar::Utf8(v) => ScalarValue::Utf8(Some(v.to_string())), - GroupByScalar::TimeMillisecond(v) => ScalarValue::TimeMillisecond(Some(*v)), - GroupByScalar::TimeMicrosecond(v) => ScalarValue::TimeMicrosecond(Some(*v)), - GroupByScalar::TimeNanosecond(v) => ScalarValue::TimeNanosecond(Some(*v)), + GroupByScalar::TimeMillisecond(v) => { + ScalarValue::TimestampMillisecond(Some(*v)) + } + GroupByScalar::TimeMicrosecond(v) => { + ScalarValue::TimestampMicrosecond(Some(*v)) + } + GroupByScalar::TimeNanosecond(v) => { + ScalarValue::TimestampNanosecond(Some(*v)) + } GroupByScalar::Date32(v) => ScalarValue::Date32(Some(*v)), } } diff --git a/rust/datafusion/src/scalar.rs b/rust/datafusion/src/scalar.rs index b2367758493..833f707e971 100644 --- a/rust/datafusion/src/scalar.rs +++ b/rust/datafusion/src/scalar.rs @@ -19,16 +19,21 @@ use std::{convert::TryFrom, fmt, iter::repeat, sync::Arc}; -use arrow::array::{ - ArrayRef, Int16Builder, Int32Builder, Int64Builder, Int8Builder, ListBuilder, - TimestampMicrosecondArray, TimestampMillisecondArray, TimestampNanosecondArray, - UInt16Builder, UInt32Builder, UInt64Builder, UInt8Builder, -}; use arrow::datatypes::{DataType, Field, IntervalUnit, TimeUnit}; use arrow::{ array::*, datatypes::{ArrowNativeType, Float32Type, TimestampNanosecondType}, }; +use arrow::{ + array::{ + ArrayRef, Int16Builder, Int32Builder, Int64Builder, Int8Builder, ListBuilder, + TimestampMicrosecondArray, TimestampMillisecondArray, TimestampNanosecondArray, + UInt16Builder, UInt32Builder, UInt64Builder, UInt8Builder, + }, + datatypes::{ + TimestampMicrosecondType, TimestampMillisecondType, TimestampSecondType, + }, +}; use crate::error::{DataFusionError, Result}; @@ -72,12 +77,14 @@ pub enum ScalarValue { Date32(Option), /// Date stored as a signed 64bit int Date64(Option), + /// Timestamp Second + 
TimestampSecond(Option), /// Timestamp Milliseconds - TimeMillisecond(Option), + TimestampMillisecond(Option), /// Timestamp Microseconds - TimeMicrosecond(Option), + TimestampMicrosecond(Option), /// Timestamp Nanoseconds - TimeNanosecond(Option), + TimestampNanosecond(Option), /// Interval with YearMonth unit IntervalYearMonth(Option), /// Interval with DayTime unit @@ -145,15 +152,18 @@ impl ScalarValue { ScalarValue::Int16(_) => DataType::Int16, ScalarValue::Int32(_) => DataType::Int32, ScalarValue::Int64(_) => DataType::Int64, - ScalarValue::TimeMicrosecond(_) => { + ScalarValue::TimestampSecond(_) => { + DataType::Timestamp(TimeUnit::Second, None) + } + ScalarValue::TimestampMillisecond(_) => { + DataType::Timestamp(TimeUnit::Millisecond, None) + } + ScalarValue::TimestampMicrosecond(_) => { DataType::Timestamp(TimeUnit::Microsecond, None) } - ScalarValue::TimeNanosecond(_) => { + ScalarValue::TimestampNanosecond(_) => { DataType::Timestamp(TimeUnit::Nanosecond, None) } - ScalarValue::TimeMillisecond(_) => { - DataType::Timestamp(TimeUnit::Millisecond, None) - } ScalarValue::Float32(_) => DataType::Float32, ScalarValue::Float64(_) => DataType::Float64, ScalarValue::Utf8(_) => DataType::Utf8, @@ -209,9 +219,9 @@ impl ScalarValue { | ScalarValue::Utf8(None) | ScalarValue::LargeUtf8(None) | ScalarValue::List(None, _) - | ScalarValue::TimeMillisecond(None) - | ScalarValue::TimeMicrosecond(None) - | ScalarValue::TimeNanosecond(None) + | ScalarValue::TimestampMillisecond(None) + | ScalarValue::TimestampMicrosecond(None) + | ScalarValue::TimestampNanosecond(None) ) } @@ -266,7 +276,15 @@ impl ScalarValue { Some(value) => Arc::new(UInt64Array::from_value(*value, size)), None => new_null_array(&DataType::UInt64, size), }, - ScalarValue::TimeMillisecond(e) => match e { + ScalarValue::TimestampSecond(e) => match e { + Some(value) => Arc::new(TimestampSecondArray::from_iter_values( + repeat(*value).take(size), + )), + None => { + 
new_null_array(&DataType::Timestamp(TimeUnit::Second, None), size) + } + }, + ScalarValue::TimestampMillisecond(e) => match e { Some(value) => Arc::new(TimestampMillisecondArray::from_iter_values( repeat(*value).take(size), )), @@ -275,7 +293,7 @@ impl ScalarValue { size, ), }, - ScalarValue::TimeMicrosecond(e) => match e { + ScalarValue::TimestampMicrosecond(e) => match e { Some(value) => { Arc::new(TimestampMicrosecondArray::from_value(*value, size)) } @@ -284,7 +302,7 @@ impl ScalarValue { size, ), }, - ScalarValue::TimeNanosecond(e) => match e { + ScalarValue::TimestampNanosecond(e) => match e { Some(value) => { Arc::new(TimestampNanosecondArray::from_value(*value, size)) } @@ -403,6 +421,28 @@ impl ScalarValue { DataType::Date64 => { typed_cast!(array, index, Date64Array, Date64) } + DataType::Timestamp(TimeUnit::Second, _) => { + typed_cast!(array, index, TimestampSecondArray, TimestampSecond) + } + DataType::Timestamp(TimeUnit::Millisecond, _) => { + typed_cast!( + array, + index, + TimestampMillisecondArray, + TimestampMillisecond + ) + } + DataType::Timestamp(TimeUnit::Microsecond, _) => { + typed_cast!( + array, + index, + TimestampMicrosecondArray, + TimestampMicrosecond + ) + } + DataType::Timestamp(TimeUnit::Nanosecond, _) => { + typed_cast!(array, index, TimestampNanosecondArray, TimestampNanosecond) + } other => { return Err(DataFusionError::NotImplemented(format!( "Can't create a scalar of array of type \"{:?}\"", @@ -525,7 +565,7 @@ impl TryFrom for i64 { fn try_from(value: ScalarValue) -> Result { match value { ScalarValue::Int64(Some(inner_value)) - | ScalarValue::TimeNanosecond(Some(inner_value)) => Ok(inner_value), + | ScalarValue::TimestampNanosecond(Some(inner_value)) => Ok(inner_value), _ => Err(DataFusionError::Internal(format!( "Cannot convert {:?} to {}", value, @@ -561,6 +601,18 @@ impl TryFrom<&DataType> for ScalarValue { DataType::UInt64 => ScalarValue::UInt64(None), DataType::Utf8 => ScalarValue::Utf8(None), DataType::LargeUtf8 => 
ScalarValue::LargeUtf8(None), + DataType::Timestamp(TimeUnit::Second, _) => { + ScalarValue::TimestampSecond(None) + } + DataType::Timestamp(TimeUnit::Millisecond, _) => { + ScalarValue::TimestampMillisecond(None) + } + DataType::Timestamp(TimeUnit::Microsecond, _) => { + ScalarValue::TimestampMicrosecond(None) + } + DataType::Timestamp(TimeUnit::Nanosecond, _) => { + ScalarValue::TimestampNanosecond(None) + } DataType::List(ref nested_type) => { ScalarValue::List(None, nested_type.data_type().clone()) } @@ -597,9 +649,10 @@ impl fmt::Display for ScalarValue { ScalarValue::UInt16(e) => format_option!(f, e)?, ScalarValue::UInt32(e) => format_option!(f, e)?, ScalarValue::UInt64(e) => format_option!(f, e)?, - ScalarValue::TimeMillisecond(e) => format_option!(f, e)?, - ScalarValue::TimeMicrosecond(e) => format_option!(f, e)?, - ScalarValue::TimeNanosecond(e) => format_option!(f, e)?, + ScalarValue::TimestampSecond(e) => format_option!(f, e)?, + ScalarValue::TimestampMillisecond(e) => format_option!(f, e)?, + ScalarValue::TimestampMicrosecond(e) => format_option!(f, e)?, + ScalarValue::TimestampNanosecond(e) => format_option!(f, e)?, ScalarValue::Utf8(e) => format_option!(f, e)?, ScalarValue::LargeUtf8(e) => format_option!(f, e)?, ScalarValue::Binary(e) => match e { @@ -658,9 +711,16 @@ impl fmt::Debug for ScalarValue { ScalarValue::UInt16(_) => write!(f, "UInt16({})", self), ScalarValue::UInt32(_) => write!(f, "UInt32({})", self), ScalarValue::UInt64(_) => write!(f, "UInt64({})", self), - ScalarValue::TimeMillisecond(_) => write!(f, "TimeMillisecond({})", self), - ScalarValue::TimeMicrosecond(_) => write!(f, "TimeMicrosecond({})", self), - ScalarValue::TimeNanosecond(_) => write!(f, "TimeNanosecond({})", self), + ScalarValue::TimestampSecond(_) => write!(f, "TimestampSecond({})", self), + ScalarValue::TimestampMillisecond(_) => { + write!(f, "TimestampMillisecond({})", self) + } + ScalarValue::TimestampMicrosecond(_) => { + write!(f, "TimestampMicrosecond({})", self) + 
} + ScalarValue::TimestampNanosecond(_) => { + write!(f, "TimestampNanosecond({})", self) + } ScalarValue::Utf8(None) => write!(f, "Utf8({})", self), ScalarValue::Utf8(Some(_)) => write!(f, "Utf8(\"{}\")", self), ScalarValue::LargeUtf8(None) => write!(f, "LargeUtf8({})", self), @@ -694,9 +754,27 @@ impl ScalarType for Float32Type { } } +impl ScalarType for TimestampSecondType { + fn scalar(r: Option) -> ScalarValue { + ScalarValue::TimestampSecond(r) + } +} + +impl ScalarType for TimestampMillisecondType { + fn scalar(r: Option) -> ScalarValue { + ScalarValue::TimestampMillisecond(r) + } +} + +impl ScalarType for TimestampMicrosecondType { + fn scalar(r: Option) -> ScalarValue { + ScalarValue::TimestampMicrosecond(r) + } +} + impl ScalarType for TimestampNanosecondType { fn scalar(r: Option) -> ScalarValue { - ScalarValue::TimeNanosecond(r) + ScalarValue::TimestampNanosecond(r) } } diff --git a/rust/datafusion/src/test/mod.rs b/rust/datafusion/src/test/mod.rs index 57736189481..926a6922616 100644 --- a/rust/datafusion/src/test/mod.rs +++ b/rust/datafusion/src/test/mod.rs @@ -20,7 +20,10 @@ use crate::datasource::{MemTable, TableProvider}; use crate::error::Result; use crate::logical_plan::{LogicalPlan, LogicalPlanBuilder}; -use array::ArrayRef; +use array::{ + Array, ArrayRef, StringArray, TimestampMicrosecondArray, TimestampMillisecondArray, + TimestampNanosecondArray, TimestampSecondArray, +}; use arrow::array::{self, Int32Array}; use arrow::datatypes::{DataType, Field, Schema, SchemaRef}; use arrow::record_batch::RecordBatch; @@ -182,6 +185,93 @@ pub fn make_partition(sz: i32) -> RecordBatch { RecordBatch::try_new(schema, vec![arr]).unwrap() } +/// Return a new table provider containing all of the supported timestamp types +pub fn table_with_timestamps() -> Arc { + let batch = make_timestamps(); + let schema = batch.schema(); + let partitions = vec![vec![batch]]; + Arc::new(MemTable::try_new(schema, partitions).unwrap()) +} + +/// Return record batch with all of 
the supported timestamp types +/// values +/// +/// Columns are named: +/// "nanos" --> TimestampNanosecondArray +/// "micros" --> TimestampMicrosecondArray +/// "millis" --> TimestampMillisecondArray +/// "secs" --> TimestampSecondArray +/// "name" --> StringArray +pub fn make_timestamps() -> RecordBatch { + let ts_strings = vec![ + Some("2018-11-13T17:11:10.011375885995"), + Some("2011-12-13T11:13:10.12345"), + None, + Some("2021-1-1T05:11:10.432"), + ]; + + let ts_nanos = ts_strings + .into_iter() + .map(|t| { + t.map(|t| { + t.parse::() + .unwrap() + .timestamp_nanos() + }) + }) + .collect::>(); + + let ts_micros = ts_nanos + .iter() + .map(|t| t.as_ref().map(|ts_nanos| ts_nanos / 1000)) + .collect::>(); + + let ts_millis = ts_nanos + .iter() + .map(|t| t.as_ref().map(|ts_nanos| ts_nanos / 1000000)) + .collect::>(); + + let ts_secs = ts_nanos + .iter() + .map(|t| t.as_ref().map(|ts_nanos| ts_nanos / 1000000000)) + .collect::>(); + + let names = ts_nanos + .iter() + .enumerate() + .map(|(i, _)| format!("Row {}", i)) + .collect::>(); + + let arr_nanos = TimestampNanosecondArray::from_opt_vec(ts_nanos, None); + let arr_micros = TimestampMicrosecondArray::from_opt_vec(ts_micros, None); + let arr_millis = TimestampMillisecondArray::from_opt_vec(ts_millis, None); + let arr_secs = TimestampSecondArray::from_opt_vec(ts_secs, None); + + let names = names.iter().map(|s| s.as_str()).collect::>(); + let arr_names = StringArray::from(names); + + let schema = Schema::new(vec![ + Field::new("nanos", arr_nanos.data_type().clone(), false), + Field::new("micros", arr_micros.data_type().clone(), false), + Field::new("millis", arr_millis.data_type().clone(), false), + Field::new("secs", arr_secs.data_type().clone(), false), + Field::new("name", arr_names.data_type().clone(), false), + ]); + let schema = Arc::new(schema); + + RecordBatch::try_new( + schema, + vec![ + Arc::new(arr_nanos), + Arc::new(arr_micros), + Arc::new(arr_millis), + Arc::new(arr_secs), + Arc::new(arr_names), + 
], + ) + .unwrap() +} + pub mod exec; pub mod user_defined; pub mod variable; From a102ba2f8b0054871eb441bbf6dc007a9b448ee7 Mon Sep 17 00:00:00 2001 From: Weston Pace Date: Tue, 13 Apr 2021 08:51:51 -0400 Subject: [PATCH 008/719] ARROW-12288: [C++] Create Scanner interface To prepare for the AsyncScanner this PR creates a Scanner interface and, along the way, simplifies the current Scanner API so that the new scanner won't need to match. ## What is removed: * `Scanner::GetFragments` was only used in `FileSystemDataset::Write`. The correct source of truth for fragments is the `Dataset`. Note: The python implementation exposed this method but it was not documented or used in any unit test. I think it can be safely removed and we need not worry about deprecation. * `Scanner::schema` is redundant and ambiguous. There are two schemas at the scan level. The dataset schema (the unified master schema that we expect all fragment schemas to be a subset of) and the projection schema (a combination of the dataset schema and the projection expression). Both of these are available on the scan options object and there is an accessor for these options so the caller might as well get them from there. This schema function was exposed via R and used internally there but I think any uses can be easily changed to using the options. * `FileFormat::splittable` and `Fragment::splittable`. These were intended to advertise that batch readahead was available on the given fragment/format. However, there is no need to advertise this. They are not used by the `SyncScanner` and the `AsyncScanner` will just assume that the format/fragment's will utilize readahead if they can (respecting the readahead options in `ScanOptions`) * Direct instantiation of `Scanner`. All `Scanner` creation should go through `ScannerBuilder` now. This allows the `ScannerBuilder` to determine what implementation to use. This was mostly the way things were implemented already. 
Only a few tests instantiated a `Scanner` directly. ## What is deprecated * `Scanner::Scan` is going to be deprecated (ARROW-11797). It will not be implemented by `AsyncScanner`. I do not actually deprecate it in this PR as I reserve that for ARROW-11797. Unfortunately, this method was exposed via Python & R and likely was used so deprecation is recommended over outright removal. ## What is new * `Scanner::ScanBatches` and `Scanner::ScanBatchesUnordered` have been added. These functions will be the new preferred "scan" method going forward. This allows the parallelization (batch readahead, file readahead, etc.) to be handled by C++ and simplifies the user's life. * `ScanOptions::batch_readahead` and `ScanOptions::fragment_readahead` options allow more fine-grained control over how to perform readahead. One technicality is that these options will not be respected well by the `SyncScanner` (although I think the current ARROW-11797 utilizes batch readahead) so they are more placeholders for when we implement `AsyncScanner`. * `ScanOptions::cpu_executor` and `ScanOptions::io_context` are added and should be fairly self-explanatory. * `ScanOptions::use_async` will toggle which scanner to use. 
Closes #9947 from westonpace/feature/arrow-12288 Authored-by: Weston Pace Signed-off-by: David Li --- cpp/src/arrow/dataset/dataset.h | 5 - cpp/src/arrow/dataset/dataset_test.cc | 4 +- cpp/src/arrow/dataset/file_base.cc | 27 +--- cpp/src/arrow/dataset/file_base.h | 4 - cpp/src/arrow/dataset/file_csv.cc | 2 +- cpp/src/arrow/dataset/file_ipc.h | 2 - cpp/src/arrow/dataset/file_ipc_test.cc | 9 +- cpp/src/arrow/dataset/file_parquet.h | 2 - cpp/src/arrow/dataset/scanner.cc | 128 +++++++++++++++-- cpp/src/arrow/dataset/scanner.h | 190 +++++++++++++++++++++---- cpp/src/arrow/dataset/scanner_test.cc | 44 +++++- cpp/src/arrow/dataset/test_util.h | 53 ++++++- cpp/src/jni/dataset/jni_wrapper.cc | 3 +- python/pyarrow/_dataset.pyx | 8 -- r/src/dataset.cpp | 2 +- 15 files changed, 396 insertions(+), 87 deletions(-) diff --git a/cpp/src/arrow/dataset/dataset.h b/cpp/src/arrow/dataset/dataset.h index 6be83059fc1..5d818b23938 100644 --- a/cpp/src/arrow/dataset/dataset.h +++ b/cpp/src/arrow/dataset/dataset.h @@ -64,9 +64,6 @@ class ARROW_DS_EXPORT Fragment : public std::enable_shared_from_this { /// To receive a record batch stream which is fully filtered and projected, use Scanner. virtual Result Scan(std::shared_ptr options) = 0; - /// \brief Return true if the fragment can benefit from parallel scanning. 
- virtual bool splittable() const = 0; - virtual std::string type_name() const = 0; virtual std::string ToString() const { return type_name(); } @@ -111,8 +108,6 @@ class ARROW_DS_EXPORT InMemoryFragment : public Fragment { Result Scan(std::shared_ptr options) override; - bool splittable() const override { return false; } - std::string type_name() const override { return "in-memory"; } protected: diff --git a/cpp/src/arrow/dataset/dataset_test.cc b/cpp/src/arrow/dataset/dataset_test.cc index 1db96b8b5c3..7aa0e1a2413 100644 --- a/cpp/src/arrow/dataset/dataset_test.cc +++ b/cpp/src/arrow/dataset/dataset_test.cc @@ -442,7 +442,7 @@ TEST_F(TestEndToEnd, EndToEndSingleDataset) { // In the simplest case, consumption is simply conversion to a Table. ASSERT_OK_AND_ASSIGN(auto table, scanner->ToTable()); - auto expected = TableFromJSON(scanner->schema(), {R"([ + auto expected = TableFromJSON(scanner_builder->projected_schema(), {R"([ {"sales": 152.25, "model": "3", "country": "CA"}, {"sales": 273.5, "model": "3", "country": "US"} ])"}); @@ -547,7 +547,7 @@ class TestSchemaUnification : public TestUnionDataset { void AssertScanEquals(std::shared_ptr scanner, const std::vector& expected_rows) { std::vector columns; - for (const auto& field : scanner->schema()->fields()) { + for (const auto& field : scanner->options()->projected_schema->fields()) { columns.push_back(field->name()); } diff --git a/cpp/src/arrow/dataset/file_base.cc b/cpp/src/arrow/dataset/file_base.cc index ad19bd2041e..7b2f42055b3 100644 --- a/cpp/src/arrow/dataset/file_base.cc +++ b/cpp/src/arrow/dataset/file_base.cc @@ -369,7 +369,7 @@ struct WriteState { std::unordered_map> queues; }; -Status WriteNextBatch(WriteState& state, const std::shared_ptr& scan_task, +Status WriteNextBatch(WriteState& state, const std::shared_ptr& fragment, std::shared_ptr batch) { ARROW_ASSIGN_OR_RAISE(auto groups, state.write_options.partitioning->Partition(batch)); batch.reset(); // drop to hopefully conserve memory @@ -382,8 
+382,8 @@ Status WriteNextBatch(WriteState& state, const std::shared_ptr& scan_t std::unordered_set need_flushed; for (size_t i = 0; i < groups.batches.size(); ++i) { - auto partition_expression = and_(std::move(groups.expressions[i]), - scan_task->fragment()->partition_expression()); + auto partition_expression = + and_(std::move(groups.expressions[i]), fragment->partition_expression()); auto batch = std::move(groups.batches[i]); ARROW_ASSIGN_OR_RAISE(auto part, @@ -432,7 +432,7 @@ Future<> WriteInternal(const ScanOptions& scan_options, WriteState& state, ARROW_ASSIGN_OR_RAISE(auto batches_gen, scan_task->ExecuteAsync(cpu_executor)); std::function batch)> batch_visitor = [&, scan_task](std::shared_ptr batch) { - return WriteNextBatch(state, scan_task, std::move(batch)); + return WriteNextBatch(state, scan_task->fragment(), std::move(batch)); }; scan_futs.push_back(VisitAsyncGenerator(batches_gen, batch_visitor)); } else { @@ -441,7 +441,7 @@ Future<> WriteInternal(const ScanOptions& scan_options, WriteState& state, for (auto maybe_batch : batches) { ARROW_ASSIGN_OR_RAISE(auto batch, maybe_batch); - RETURN_NOT_OK(WriteNextBatch(state, scan_task, std::move(batch))); + RETURN_NOT_OK(WriteNextBatch(state, scan_task->fragment(), std::move(batch))); } return Status::OK(); @@ -469,21 +469,8 @@ Status FileSystemDataset::Write(const FileSystemDatasetWriteOptions& write_optio // // NB: neither of these will have any impact whatsoever on the common case of writing // an in-memory table to disk. 
- ARROW_ASSIGN_OR_RAISE(auto fragment_it, scanner->GetFragments()); - ARROW_ASSIGN_OR_RAISE(FragmentVector fragments, fragment_it.ToVector()); - ScanTaskVector scan_tasks; - - for (const auto& fragment : fragments) { - auto options = std::make_shared(*scanner->options()); - // Avoid contention with multithreaded readers - options->use_threads = false; - ARROW_ASSIGN_OR_RAISE(auto scan_task_it, - Scanner(fragment, std::move(options)).Scan()); - for (auto maybe_scan_task : scan_task_it) { - ARROW_ASSIGN_OR_RAISE(auto scan_task, maybe_scan_task); - scan_tasks.push_back(std::move(scan_task)); - } - } + ARROW_ASSIGN_OR_RAISE(auto scan_task_it, scanner->Scan()); + ARROW_ASSIGN_OR_RAISE(ScanTaskVector scan_tasks, scan_task_it.ToVector()); WriteState state(write_options); auto res = internal::RunSynchronously( diff --git a/cpp/src/arrow/dataset/file_base.h b/cpp/src/arrow/dataset/file_base.h index e4e7167aa75..ccc3d54709b 100644 --- a/cpp/src/arrow/dataset/file_base.h +++ b/cpp/src/arrow/dataset/file_base.h @@ -134,9 +134,6 @@ class ARROW_DS_EXPORT FileFormat : public std::enable_shared_from_thistype_name(); } std::string ToString() const override { return source_.path(); }; - bool splittable() const override { return format_->splittable(); } const FileSource& source() const { return source_; } const std::shared_ptr& format() const { return format_; } diff --git a/cpp/src/arrow/dataset/file_csv.cc b/cpp/src/arrow/dataset/file_csv.cc index 677d1be05b7..9a7a9d2de4c 100644 --- a/cpp/src/arrow/dataset/file_csv.cc +++ b/cpp/src/arrow/dataset/file_csv.cc @@ -45,7 +45,7 @@ using internal::checked_cast; using internal::checked_pointer_cast; using internal::Executor; using internal::SerialExecutor; -using RecordBatchGenerator = AsyncGenerator>; +using RecordBatchGenerator = std::function>()>; Result> GetColumnNames( const csv::ParseOptions& parse_options, util::string_view first_block, diff --git a/cpp/src/arrow/dataset/file_ipc.h b/cpp/src/arrow/dataset/file_ipc.h index 
a7bcd04a9d2..621eef80635 100644 --- a/cpp/src/arrow/dataset/file_ipc.h +++ b/cpp/src/arrow/dataset/file_ipc.h @@ -42,8 +42,6 @@ class ARROW_DS_EXPORT IpcFileFormat : public FileFormat { return type_name() == other.type_name(); } - bool splittable() const override { return true; } - Result IsSupported(const FileSource& source) const override; /// \brief Return the schema of the file if possible. diff --git a/cpp/src/arrow/dataset/file_ipc_test.cc b/cpp/src/arrow/dataset/file_ipc_test.cc index 502b61ca645..ef0c0f62108 100644 --- a/cpp/src/arrow/dataset/file_ipc_test.cc +++ b/cpp/src/arrow/dataset/file_ipc_test.cc @@ -234,6 +234,13 @@ class TestIpcFileSystemDataset : public testing::Test, format_ = ipc_format; SetWriteOptions(ipc_format->DefaultWriteOptions()); } + + std::shared_ptr MakeScanner(const std::shared_ptr& dataset, + const std::shared_ptr& scan_options) { + ScannerBuilder builder(dataset, scan_options); + EXPECT_OK_AND_ASSIGN(auto scanner, builder.Finish()); + return scanner; + } }; TEST_F(TestIpcFileSystemDataset, WriteWithIdenticalPartitioningSchema) { @@ -259,7 +266,7 @@ TEST_F(TestIpcFileSystemDataset, WriteExceedsMaxPartitions) { // require that no batch be grouped into more than 2 written batches: write_options_.max_partitions = 2; - auto scanner = std::make_shared(dataset_, scan_options_); + auto scanner = MakeScanner(dataset_, scan_options_); EXPECT_RAISES_WITH_MESSAGE_THAT(Invalid, testing::HasSubstr("This exceeds the maximum"), FileSystemDataset::Write(write_options_, scanner)); } diff --git a/cpp/src/arrow/dataset/file_parquet.h b/cpp/src/arrow/dataset/file_parquet.h index fa0d7dea843..ac8a746481a 100644 --- a/cpp/src/arrow/dataset/file_parquet.h +++ b/cpp/src/arrow/dataset/file_parquet.h @@ -70,8 +70,6 @@ class ARROW_DS_EXPORT ParquetFileFormat : public FileFormat { std::string type_name() const override { return kParquetTypeName; } - bool splittable() const override { return true; } - bool Equals(const FileFormat& other) const override; struct 
ReaderOptions { diff --git a/cpp/src/arrow/dataset/scanner.cc b/cpp/src/arrow/dataset/scanner.cc index a8ac24b7799..738c9fc0f62 100644 --- a/cpp/src/arrow/dataset/scanner.cc +++ b/cpp/src/arrow/dataset/scanner.cc @@ -70,7 +70,108 @@ Result ScanTask::ExecuteAsync(internal::Executor*) { bool ScanTask::supports_async() const { return false; } -Result Scanner::GetFragments() { +Result Scanner::Scan() { + // TODO(ARROW-12289) This is overridden in SyncScanner and will never be implemented in + // AsyncScanner. It is deprecated and will eventually go away. + return Status::NotImplemented("This scanner does not support the legacy Scan() method"); +} + +Result Scanner::ScanBatchesUnordered() { + // If a scanner doesn't support unordered scanning (i.e. SyncScanner) then we just + // fall back to an ordered scan and assign the appropriate tagging + ARROW_ASSIGN_OR_RAISE(auto ordered_scan, ScanBatches()); + return AddPositioningToInOrderScan(std::move(ordered_scan)); +} + +Result Scanner::AddPositioningToInOrderScan( + TaggedRecordBatchIterator scan) { + ARROW_ASSIGN_OR_RAISE(auto first, scan.Next()); + if (IsIterationEnd(first)) { + return MakeEmptyIterator(); + } + struct State { + State(TaggedRecordBatchIterator source, TaggedRecordBatch first) + : source(std::move(source)), + batch_index(0), + fragment_index(0), + finished(false), + prev_batch(std::move(first)) {} + TaggedRecordBatchIterator source; + int batch_index; + int fragment_index; + bool finished; + TaggedRecordBatch prev_batch; + }; + struct EnumeratingIterator { + Result Next() { + if (state->finished) { + return IterationEnd(); + } + ARROW_ASSIGN_OR_RAISE(auto next, state->source.Next()); + if (IsIterationEnd(next)) { + state->finished = true; + return EnumeratedRecordBatch{ + {std::move(state->prev_batch.record_batch), state->batch_index, true}, + {std::move(state->prev_batch.fragment), state->fragment_index, true}}; + } + auto prev = std::move(state->prev_batch); + bool prev_is_last_batch = false; + auto 
prev_batch_index = state->batch_index; + auto prev_fragment_index = state->fragment_index; + // Reference equality here seems risky but a dataset should have a constant set of + // fragments which should be consistent for the lifetime of a scan + if (prev.fragment.get() != next.fragment.get()) { + state->batch_index = 0; + state->fragment_index++; + prev_is_last_batch = true; + } else { + state->batch_index++; + } + state->prev_batch = std::move(next); + return EnumeratedRecordBatch{ + {std::move(prev.record_batch), prev_batch_index, prev_is_last_batch}, + {std::move(prev.fragment), prev_fragment_index, false}}; + } + std::shared_ptr state; + }; + return EnumeratedRecordBatchIterator( + EnumeratingIterator{std::make_shared(std::move(scan), std::move(first))}); +} + +Result SyncScanner::ScanBatches() { + // TODO(ARROW-11797) Provide a better implementation that does readahead. Also, add + // unit testing + ARROW_ASSIGN_OR_RAISE(auto scan_task_it, Scan()); + struct BatchIter { + explicit BatchIter(ScanTaskIterator scan_task_it) + : scan_task_it(std::move(scan_task_it)) {} + + Result Next() { + while (true) { + if (current_task == nullptr) { + ARROW_ASSIGN_OR_RAISE(current_task, scan_task_it.Next()); + if (IsIterationEnd>(current_task)) { + return IterationEnd(); + } + ARROW_ASSIGN_OR_RAISE(batch_it, current_task->Execute()); + } + ARROW_ASSIGN_OR_RAISE(auto next, batch_it.Next()); + if (IsIterationEnd>(next)) { + current_task = nullptr; + } else { + return TaggedRecordBatch{next, current_task->fragment()}; + } + } + } + + ScanTaskIterator scan_task_it; + RecordBatchIterator batch_it; + std::shared_ptr current_task; + }; + return TaggedRecordBatchIterator(BatchIter(std::move(scan_task_it))); +} + +Result SyncScanner::GetFragments() { if (fragment_ != nullptr) { return MakeVectorIterator(FragmentVector{fragment_}); } @@ -81,7 +182,7 @@ Result Scanner::GetFragments() { return GetFragmentsFromDatasets({dataset_}, scan_options_->filter); } -Result Scanner::Scan() { 
+Result SyncScanner::Scan() { // Transforms Iterator into a unified // Iterator. The first Iterator::Next invocation is going to do // all the work of unwinding the chained iterators. @@ -110,7 +211,7 @@ ScannerBuilder::ScannerBuilder(std::shared_ptr dataset, fragment_(nullptr), scan_options_(std::move(scan_options)) { scan_options_->dataset_schema = dataset_->schema(); - DCHECK_OK(Filter(literal(true))); + DCHECK_OK(Filter(scan_options_->filter)); } ScannerBuilder::ScannerBuilder(std::shared_ptr schema, @@ -120,13 +221,17 @@ ScannerBuilder::ScannerBuilder(std::shared_ptr schema, fragment_(std::move(fragment)), scan_options_(std::move(scan_options)) { scan_options_->dataset_schema = std::move(schema); - DCHECK_OK(Filter(literal(true))); + DCHECK_OK(Filter(scan_options_->filter)); } const std::shared_ptr& ScannerBuilder::schema() const { return scan_options_->dataset_schema; } +const std::shared_ptr& ScannerBuilder::projected_schema() const { + return scan_options_->projected_schema; +} + Status ScannerBuilder::Project(std::vector columns) { return SetProjection(scan_options_.get(), std::move(columns)); } @@ -170,9 +275,15 @@ Result> ScannerBuilder::Finish() { } if (dataset_ == nullptr) { - return std::make_shared(fragment_, scan_options_); + // AsyncScanner does not support this method of running. 
It may in the future + return std::make_shared(fragment_, scan_options_); + } + if (scan_options_->use_async) { + // TODO(ARROW-12289) + return Status::NotImplemented("The asynchronous scanner is not yet available"); + } else { + return std::make_shared(dataset_, scan_options_); } - return std::make_shared(dataset_, scan_options_); } static inline RecordBatchVector FlattenRecordBatchVector( @@ -202,13 +313,13 @@ struct TableAssemblyState { } }; -Result> Scanner::ToTable() { +Result> SyncScanner::ToTable() { return internal::RunSynchronously>( [this](Executor* executor) { return ToTableInternal(executor); }, scan_options_->use_threads); } -Future> Scanner::ToTableInternal( +Future> SyncScanner::ToTableInternal( internal::Executor* cpu_executor) { ARROW_ASSIGN_OR_RAISE(auto scan_task_it, Scan()); auto task_group = scan_options_->TaskGroup(); @@ -218,6 +329,7 @@ Future> Scanner::ToTableInternal( /// and the mutex/batches fail out of scope. auto state = std::make_shared(); + // TODO (ARROW-11797) Migrate to using ScanBatches() size_t scan_task_id = 0; std::vector> scan_futures; for (auto maybe_scan_task : scan_task_it) { diff --git a/cpp/src/arrow/dataset/scanner.h b/cpp/src/arrow/dataset/scanner.h index 9bd4b10847b..ddd86674d39 100644 --- a/cpp/src/arrow/dataset/scanner.h +++ b/cpp/src/arrow/dataset/scanner.h @@ -30,8 +30,11 @@ #include "arrow/dataset/projector.h" #include "arrow/dataset/type_fwd.h" #include "arrow/dataset/visibility.h" +#include "arrow/io/interfaces.h" #include "arrow/memory_pool.h" #include "arrow/type_fwd.h" +#include "arrow/util/iterator.h" +#include "arrow/util/thread_pool.h" #include "arrow/util/type_fwd.h" namespace arrow { @@ -41,6 +44,8 @@ using RecordBatchGenerator = std::function>( namespace dataset { constexpr int64_t kDefaultBatchSize = 1 << 20; +constexpr int32_t kDefaultBatchReadahead = 32; +constexpr int32_t kDefaultFragmentReadahead = 8; struct ARROW_DS_EXPORT ScanOptions { // Filter and projection @@ -67,12 +72,48 @@ struct 
ARROW_DS_EXPORT ScanOptions { // Maximum row count for scanned batches. int64_t batch_size = kDefaultBatchSize; + /// How many batches to read ahead within a file + /// + /// Set to 0 to disable batch readahead + /// + /// Note: May not be supported by all formats + /// Note: May not be supported by all scanners + /// Note: Will be ignored if use_threads is set to false + int32_t batch_readahead = kDefaultBatchReadahead; + + /// How many files to read ahead + /// + /// Set to 0 to disable fragment readahead + /// + /// Note: May not be enforced by all scanners + /// Note: Will be ignored if use_threads is set to false + int32_t fragment_readahead = kDefaultFragmentReadahead; + /// A pool from which materialized and scanned arrays will be allocated. MemoryPool* pool = arrow::default_memory_pool(); - /// Indicate if the Scanner should make use of a ThreadPool. + /// Executor on which to run any CPU tasks + /// + /// Note: Will be ignored if use_threads is set to false + internal::Executor* cpu_executor = internal::GetCpuThreadPool(); + + /// IOContext for any IO tasks + /// + /// Note: The IOContext executor will be ignored if use_threads is set to false + io::IOContext io_context; + + /// If true the scanner will scan in parallel + /// + /// Note: If true, this will use threads from both the cpu_executor and the + /// io_context.executor + /// Note: This must be true in order for any readahead to happen bool use_threads = false; + /// If true then an asynchronous implementation of the scanner will be used. + /// This implementation is newer and generally performs better. However, it + /// makes extensive use of threading and is still considered experimental + bool use_async = false; + /// Fragment-specific scan options. 
std::shared_ptr fragment_scan_options; @@ -140,49 +181,148 @@ ARROW_DS_EXPORT Result ScanTaskIteratorFromRecordBatch( std::vector> batches, std::shared_ptr options); -/// \brief Scanner is a materialized scan operation with context and options -/// bound. A scanner is the class that glues ScanTask, Fragment, -/// and Dataset. In python pseudo code, it performs the following: +template +struct Enumerated { + T value; + int index; + bool last; +}; + +/// \brief Combines a record batch with the fragment that the record batch originated +/// from /// -/// def Scan(): -/// for fragment in self.dataset.GetFragments(this.options.filter): -/// for scan_task in fragment.Scan(this.options): -/// yield scan_task +/// Knowing the source fragment can be useful for debugging & understanding loaded data +struct TaggedRecordBatch { + std::shared_ptr record_batch; + std::shared_ptr fragment; +}; +using TaggedRecordBatchGenerator = std::function()>; +using TaggedRecordBatchIterator = Iterator; + +/// \brief Combines a tagged batch with positional information +/// +/// This is returned when scanning batches in an unordered fashion. 
This information is +/// needed if you ever want to reassemble the batches in order +struct EnumeratedRecordBatch { + Enumerated> record_batch; + Enumerated> fragment; +}; +using EnumeratedRecordBatchGenerator = std::function()>; +using EnumeratedRecordBatchIterator = Iterator; + +} // namespace dataset + +template <> +struct IterationTraits { + static dataset::TaggedRecordBatch End() { + return dataset::TaggedRecordBatch{NULL, NULL}; + } + static bool IsEnd(const dataset::TaggedRecordBatch& val) { + return val.record_batch == NULL; + } +}; + +template <> +struct IterationTraits { + static dataset::EnumeratedRecordBatch End() { + return dataset::EnumeratedRecordBatch{{NULL, -1, false}, {NULL, -1, false}}; + } + static bool IsEnd(const dataset::EnumeratedRecordBatch& val) { + return val.fragment.value == NULL; + } +}; + +namespace dataset { +/// \brief A scanner glues together several dataset classes to load in data. +/// The dataset contains a collection of fragments and partitioning rules. +/// +/// The fragments identify independently loadable units of data (i.e. each fragment has +/// a potentially unique schema and possibly even format. It should be possible to read +/// fragments in parallel if desired). +/// +/// The fragment's format contains the logic necessary to actually create a task to load +/// the fragment into memory. That task may or may not support parallel execution of +/// its own. +/// +/// The scanner is then responsible for creating scan tasks from every fragment in the +/// dataset and (potentially) sequencing the loaded record batches together. +/// +/// The scanner should not buffer the entire dataset in memory (unless asked) instead +/// yielding record batches as soon as they are ready to scan. Various readahead +/// properties control how much data is allowed to be scanned before pausing to let a +/// slow consumer catchup. +/// +/// Today the scanner also handles projection & filtering although that may change in +/// the future. 
class ARROW_DS_EXPORT Scanner { public: - Scanner(std::shared_ptr dataset, std::shared_ptr scan_options) - : dataset_(std::move(dataset)), scan_options_(std::move(scan_options)) {} - - Scanner(std::shared_ptr fragment, std::shared_ptr scan_options) - : fragment_(std::move(fragment)), scan_options_(std::move(scan_options)) {} + virtual ~Scanner() = default; /// \brief The Scan operator returns a stream of ScanTask. The caller is /// responsible to dispatch/schedule said tasks. Tasks should be safe to run /// in a concurrent fashion and outlive the iterator. - Result Scan(); - + /// + /// Note: Not supported by the async scanner + /// TODO(ARROW-11797) Deprecate Scan() + virtual Result Scan(); /// \brief Convert a Scanner into a Table. /// /// Use this convenience utility with care. This will serially materialize the /// Scan result in memory before creating the Table. - Result> ToTable(); + virtual Result> ToTable() = 0; + /// \brief Scan the dataset into a stream of record batches. Each batch is tagged + /// with the fragment it originated from. The batches will arrive in order. The + /// order of fragments is determined by the dataset. + /// + /// Note: The scanner will perform some readahead but will avoid materializing too + /// much in memory (this is governed by the readahead options and use_threads option). + /// If the readahead queue fills up then I/O will pause until the calling thread catches + /// up. + virtual Result ScanBatches() = 0; + /// \brief Scan the dataset into a stream of record batches. Unlike ScanBatches this + /// method may allow record batches to be returned out of order. This allows for more + /// efficient scanning: some fragments may be accessed more quickly than others (e.g. + /// may be cached in RAM or just happen to get scheduled earlier by the I/O) + /// + /// To make up for the out-of-order iteration each batch is further tagged with + /// positional information. 
+ virtual Result ScanBatchesUnordered(); - /// \brief GetFragments returns an iterator over all Fragments in this scan. - Result GetFragments(); + const std::shared_ptr& options() const { return scan_options_; } - const std::shared_ptr& schema() const { - return scan_options_->projected_schema; - } + protected: + explicit Scanner(std::shared_ptr scan_options) + : scan_options_(std::move(scan_options)) {} - const std::shared_ptr& options() const { return scan_options_; } + Result AddPositioningToInOrderScan( + TaggedRecordBatchIterator scan); + + const std::shared_ptr scan_options_; +}; + +class ARROW_DS_EXPORT SyncScanner : public Scanner { + public: + SyncScanner(std::shared_ptr dataset, std::shared_ptr scan_options) + : Scanner(std::move(scan_options)), dataset_(std::move(dataset)) {} + + SyncScanner(std::shared_ptr fragment, + std::shared_ptr scan_options) + : Scanner(std::move(scan_options)), fragment_(std::move(fragment)) {} + + Result ScanBatches() override; + + Result Scan() override; + + Result> ToTable() override; protected: + /// \brief GetFragments returns an iterator over all Fragments in this scan. + Result GetFragments(); Future> ToTableInternal(internal::Executor* cpu_executor); std::shared_ptr dataset_; // TODO(ARROW-8065) remove fragment_ after a Dataset is constuctible from fragments std::shared_ptr fragment_; - std::shared_ptr scan_options_; }; /// \brief ScannerBuilder is a factory class to construct a Scanner. It is used @@ -209,7 +349,8 @@ class ARROW_DS_EXPORT ScannerBuilder { /// Schema. Status Project(std::vector columns); - /// \brief Set expressions which will be evaluated to produce the materialized columns. + /// \brief Set expressions which will be evaluated to produce the materialized + /// columns. /// /// Columns which are not referenced may not be read from fragments. 
/// @@ -255,6 +396,7 @@ class ARROW_DS_EXPORT ScannerBuilder { Result> Finish(); const std::shared_ptr& schema() const; + const std::shared_ptr& projected_schema() const; private: std::shared_ptr dataset_; diff --git a/cpp/src/arrow/dataset/scanner_test.cc b/cpp/src/arrow/dataset/scanner_test.cc index eec8ed21668..ccae126da47 100644 --- a/cpp/src/arrow/dataset/scanner_test.cc +++ b/cpp/src/arrow/dataset/scanner_test.cc @@ -38,7 +38,7 @@ constexpr int64_t kBatchSize = 1024; class TestScanner : public DatasetFixtureMixin { protected: - Scanner MakeScanner(std::shared_ptr batch) { + std::shared_ptr MakeScanner(std::shared_ptr batch) { std::vector> batches{static_cast(kNumberBatches), batch}; @@ -47,17 +47,35 @@ class TestScanner : public DatasetFixtureMixin { EXPECT_OK_AND_ASSIGN(auto dataset, UnionDataset::Make(batch->schema(), children)); - return Scanner{dataset, options_}; + ScannerBuilder builder(dataset, options_); + EXPECT_OK_AND_ASSIGN(auto scanner, builder.Finish()); + return scanner; } void AssertScannerEqualsRepetitionsOf( - Scanner scanner, std::shared_ptr batch, + std::shared_ptr scanner, std::shared_ptr batch, const int64_t total_batches = kNumberChildDatasets * kNumberBatches) { auto expected = ConstantArrayGenerator::Repeat(total_batches, batch); // Verifies that the unified BatchReader is equivalent to flattening all the // structures of the scanner, i.e. 
Scanner[Dataset[ScanTask[RecordBatch]]] - AssertScannerEquals(expected.get(), &scanner); + AssertScannerEquals(expected.get(), scanner.get()); + } + + void AssertScanBatchesEqualRepetitionsOf( + std::shared_ptr scanner, std::shared_ptr batch, + const int64_t total_batches = kNumberChildDatasets * kNumberBatches) { + auto expected = ConstantArrayGenerator::Repeat(total_batches, batch); + + AssertScanBatchesEquals(expected.get(), scanner.get()); + } + + void AssertScanBatchesUnorderedEqualRepetitionsOf( + std::shared_ptr scanner, std::shared_ptr batch, + const int64_t total_batches = kNumberChildDatasets * kNumberBatches) { + auto expected = ConstantArrayGenerator::Repeat(total_batches, batch); + + AssertScanBatchesUnorderedEquals(expected.get(), scanner.get()); } }; @@ -67,6 +85,18 @@ TEST_F(TestScanner, Scan) { AssertScannerEqualsRepetitionsOf(MakeScanner(batch), batch); } +TEST_F(TestScanner, ScanBatches) { + SetSchema({field("i32", int32()), field("f64", float64())}); + auto batch = ConstantArrayGenerator::Zeroes(kBatchSize, schema_); + AssertScanBatchesEqualRepetitionsOf(MakeScanner(batch), batch); +} + +TEST_F(TestScanner, ScanBatchesUnordered) { + SetSchema({field("i32", int32()), field("f64", float64())}); + auto batch = ConstantArrayGenerator::Zeroes(kBatchSize, schema_); + AssertScanBatchesUnorderedEqualRepetitionsOf(MakeScanner(batch), batch); +} + TEST_F(TestScanner, ScanWithCappedBatchSize) { SetSchema({field("i32", int32()), field("f64", float64())}); auto batch = ConstantArrayGenerator::Zeroes(kBatchSize, schema_); @@ -126,7 +156,7 @@ TEST_F(TestScanner, MaterializeMissingColumn) { ScannerBuilder builder{schema_, fragment_missing_f64, options_}; ASSERT_OK_AND_ASSIGN(auto scanner, builder.Finish()); - AssertScannerEqualsRepetitionsOf(*scanner, batch_with_f64); + AssertScannerEqualsRepetitionsOf(scanner, batch_with_f64); } TEST_F(TestScanner, ToTable) { @@ -141,13 +171,13 @@ TEST_F(TestScanner, ToTable) { std::shared_ptr actual; options_->use_threads = 
false; - ASSERT_OK_AND_ASSIGN(actual, scanner.ToTable()); + ASSERT_OK_AND_ASSIGN(actual, scanner->ToTable()); AssertTablesEqual(*expected, *actual); // There is no guarantee on the ordering when using multiple threads, but // since the RecordBatch is always the same it will pass. options_->use_threads = true; - ASSERT_OK_AND_ASSIGN(actual, scanner.ToTable()); + ASSERT_OK_AND_ASSIGN(actual, scanner->ToTable()); AssertTablesEqual(*expected, *actual); } diff --git a/cpp/src/arrow/dataset/test_util.h b/cpp/src/arrow/dataset/test_util.h index 72cde368013..826e8b7901a 100644 --- a/cpp/src/arrow/dataset/test_util.h +++ b/cpp/src/arrow/dataset/test_util.h @@ -43,10 +43,12 @@ #include "arrow/testing/generator.h" #include "arrow/testing/gtest_util.h" #include "arrow/testing/random.h" +#include "arrow/util/async_generator.h" #include "arrow/util/io_util.h" #include "arrow/util/iterator.h" #include "arrow/util/logging.h" #include "arrow/util/make_unique.h" +#include "arrow/util/thread_pool.h" namespace arrow { namespace dataset { @@ -137,6 +139,14 @@ class DatasetFixtureMixin : public ::testing::Test { } } + /// \brief Assert the value of the next batch yielded by the reader + void AssertBatchEquals(RecordBatchReader* expected, const RecordBatch& batch) { + std::shared_ptr lhs; + ASSERT_OK(expected->ReadNext(&lhs)); + EXPECT_NE(lhs, nullptr); + AssertBatchesEqual(*lhs, batch); + } + /// \brief Ensure that record batches found in reader are equals to the /// record batches yielded by the data fragment. void AssertFragmentEquals(RecordBatchReader* expected, Fragment* fragment, @@ -186,6 +196,46 @@ class DatasetFixtureMixin : public ::testing::Test { } } + /// \brief Ensure that record batches found in reader are equals to the + /// record batches yielded by a scanner. 
+ void AssertScanBatchesEquals(RecordBatchReader* expected, Scanner* scanner, + bool ensure_drained = true) { + ASSERT_OK_AND_ASSIGN(auto it, scanner->ScanBatches()); + + ARROW_EXPECT_OK(it.Visit([&](TaggedRecordBatch batch) -> Status { + AssertBatchEquals(expected, *batch.record_batch); + return Status::OK(); + })); + + if (ensure_drained) { + EnsureRecordBatchReaderDrained(expected); + } + } + + /// \brief Ensure that record batches found in reader are equals to the + /// record batches yielded by a scanner. Each fragment in the scanner is + /// expected to have a single batch. + void AssertScanBatchesUnorderedEquals(RecordBatchReader* expected, Scanner* scanner, + bool ensure_drained = true) { + ASSERT_OK_AND_ASSIGN(auto it, scanner->ScanBatchesUnordered()); + + int fragment_counter = 0; + bool saw_last_fragment = false; + ARROW_EXPECT_OK(it.Visit([&](EnumeratedRecordBatch batch) -> Status { + EXPECT_EQ(0, batch.record_batch.index); + EXPECT_EQ(true, batch.record_batch.last); + EXPECT_EQ(fragment_counter++, batch.fragment.index); + EXPECT_FALSE(saw_last_fragment); + saw_last_fragment = batch.fragment.last; + AssertBatchEquals(expected, *batch.record_batch.value); + return Status::OK(); + })); + + if (ensure_drained) { + EnsureRecordBatchReaderDrained(expected); + } + } + /// \brief Ensure that record batches found in reader are equals to the /// record batches yielded by a dataset. 
void AssertDatasetEquals(RecordBatchReader* expected, Dataset* dataset, @@ -584,7 +634,8 @@ class WriteFileSystemDatasetMixin : public MakeFileSystemDatasetMixin { void DoWrite(std::shared_ptr desired_partitioning) { write_options_.partitioning = desired_partitioning; - auto scanner = std::make_shared(dataset_, scan_options_); + auto scanner_builder = ScannerBuilder(dataset_, scan_options_); + ASSERT_OK_AND_ASSIGN(auto scanner, scanner_builder.Finish()); ASSERT_OK(FileSystemDataset::Write(write_options_, scanner)); // re-discover the written dataset diff --git a/cpp/src/jni/dataset/jni_wrapper.cc b/cpp/src/jni/dataset/jni_wrapper.cc index fe09dc44eca..196bf2b5c05 100644 --- a/cpp/src/jni/dataset/jni_wrapper.cc +++ b/cpp/src/jni/dataset/jni_wrapper.cc @@ -475,7 +475,8 @@ Java_org_apache_arrow_dataset_jni_JniWrapper_getSchemaFromScanner(JNIEnv* env, j std::shared_ptr schema = RetrieveNativeInstance(scanner_id) ->GetScanner() - ->schema(); + ->options() + ->projected_schema; return JniGetOrThrow(ToSchemaByteArray(env, schema)); JNI_METHOD_END(nullptr) } diff --git a/python/pyarrow/_dataset.pyx b/python/pyarrow/_dataset.pyx index 3320b472e1b..a6cfd711558 100644 --- a/python/pyarrow/_dataset.pyx +++ b/python/pyarrow/_dataset.pyx @@ -2791,14 +2791,6 @@ cdef class Scanner(_Weakrefable): return pyarrow_wrap_table(GetResultValue(result)) - def get_fragments(self): - """Returns an iterator over the fragments in this scan. 
- """ - cdef CFragmentIterator c_fragments = move(GetResultValue( - self.scanner.GetFragments())) - for maybe_fragment in c_fragments: - yield Fragment.wrap(GetResultValue(move(maybe_fragment))) - def _get_partition_keys(Expression partition_expression): """ diff --git a/r/src/dataset.cpp b/r/src/dataset.cpp index c8fdb7ae311..dc7ccd693a2 100644 --- a/r/src/dataset.cpp +++ b/r/src/dataset.cpp @@ -457,7 +457,7 @@ cpp11::list dataset___Scanner__Scan(const std::shared_ptr& scanner) // [[dataset::export]] std::shared_ptr dataset___Scanner__schema( const std::shared_ptr& sc) { - return sc->schema(); + return sc->options()->projected_schema; } // [[dataset::export]] From 57d430e5f7bb0782bcfdbfda6d45068672982b8a Mon Sep 17 00:00:00 2001 From: Sathis Kumar Date: Tue, 13 Apr 2021 07:08:08 -0600 Subject: [PATCH 009/719] ARROW-12332: [Rust] [Ballista] Add simple api server in scheduler Implements GET /executors. We can additional endpoints going forward. Closes #9987 from msathis/master Authored-by: Sathis Kumar Signed-off-by: Andy Grove --- rust/ballista/rust/core/Cargo.toml | 1 + .../rust/core/src/serde/scheduler/mod.rs | 3 +- rust/ballista/rust/scheduler/Cargo.toml | 7 +- rust/ballista/rust/scheduler/README.md | 11 +++ .../rust/scheduler/src/api/handlers.rs | 40 +++++++++ rust/ballista/rust/scheduler/src/api/mod.rs | 85 +++++++++++++++++++ rust/ballista/rust/scheduler/src/lib.rs | 2 + rust/ballista/rust/scheduler/src/main.rs | 43 ++++++++-- 8 files changed, 183 insertions(+), 9 deletions(-) create mode 100644 rust/ballista/rust/scheduler/src/api/handlers.rs create mode 100644 rust/ballista/rust/scheduler/src/api/mod.rs diff --git a/rust/ballista/rust/core/Cargo.toml b/rust/ballista/rust/core/Cargo.toml index b6301918a1f..f5f6f8574b3 100644 --- a/rust/ballista/rust/core/Cargo.toml +++ b/rust/ballista/rust/core/Cargo.toml @@ -34,6 +34,7 @@ async-trait = "0.1.36" futures = "0.3" log = "0.4" prost = "0.7" +serde = {version = "1", features = ["derive"]} sqlparser = "0.8" tokio 
= "1.0" tonic = "0.4" diff --git a/rust/ballista/rust/core/src/serde/scheduler/mod.rs b/rust/ballista/rust/core/src/serde/scheduler/mod.rs index efee82dbdf3..81d8722d7f4 100644 --- a/rust/ballista/rust/core/src/serde/scheduler/mod.rs +++ b/rust/ballista/rust/core/src/serde/scheduler/mod.rs @@ -23,6 +23,7 @@ use arrow::array::{ use arrow::datatypes::{DataType, Field, Schema, SchemaRef}; use datafusion::logical_plan::LogicalPlan; use datafusion::physical_plan::ExecutionPlan; +use serde::Serialize; use uuid::Uuid; use super::protobuf; @@ -67,7 +68,7 @@ pub struct PartitionLocation { } /// Meta-data for an executor, used when fetching shuffle partitions from other executors -#[derive(Debug, Clone, PartialEq, Eq)] +#[derive(Debug, Clone, PartialEq, Eq, Serialize)] pub struct ExecutorMeta { pub id: String, pub host: String, diff --git a/rust/ballista/rust/scheduler/Cargo.toml b/rust/ballista/rust/scheduler/Cargo.toml index 525e28a63cc..b0213d37bda 100644 --- a/rust/ballista/rust/scheduler/Cargo.toml +++ b/rust/ballista/rust/scheduler/Cargo.toml @@ -38,14 +38,19 @@ configure_me = "0.4.0" env_logger = "0.8" etcd-client = { version = "0.6", optional = true } futures = "0.3" +http = "0.2" +http-body = "0.4" +hyper = "0.14.4" log = "0.4" parse_arg = "0.1.3" prost = "0.7" rand = "0.8" serde = {version = "1", features = ["derive"]} sled_package = { package = "sled", version = "0.34", optional = true } -tokio = { version = "1.0", features = ["macros", "rt", "rt-multi-thread"] } +tokio = { version = "1.0", features = ["full"] } tonic = "0.4" +tower = { version = "0.4" } +warp = "0.3" arrow = { git = "https://github.com/apache/arrow", rev="46161d2" } datafusion = { git = "https://github.com/apache/arrow", rev="46161d2" } diff --git a/rust/ballista/rust/scheduler/README.md b/rust/ballista/rust/scheduler/README.md index facc6d17698..c2cc090bd67 100644 --- a/rust/ballista/rust/scheduler/README.md +++ b/rust/ballista/rust/scheduler/README.md @@ -30,3 +30,14 @@ $ RUST_LOG=info cargo 
run --release ``` By default, the scheduler will bind to `localhost` and listen on port `50051`. + +## Connecting to Scheduler +The scheduler also supports a REST API, selected through content negotiation. +For example, to get the list of executors connected to the scheduler, +you can run the following (assuming you use the default config): + +```bash +curl --request GET \ + --url http://localhost:50050/executors \ + --header 'Accept: application/json' +``` diff --git a/rust/ballista/rust/scheduler/src/api/handlers.rs b/rust/ballista/rust/scheduler/src/api/handlers.rs new file mode 100644 index 00000000000..c3450215007 --- /dev/null +++ b/rust/ballista/rust/scheduler/src/api/handlers.rs @@ -0,0 +1,40 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License.
+ +use crate::SchedulerServer; +use ballista_core::serde::protobuf::{ + scheduler_grpc_server::SchedulerGrpc, ExecutorMetadata, GetExecutorMetadataParams, + GetExecutorMetadataResult, +}; +use ballista_core::serde::scheduler::ExecutorMeta; +use tonic::{Request, Response}; +use warp::Rejection; + +pub(crate) async fn list_executors_data( + data_server: SchedulerServer, +) -> Result { + let data: Result, tonic::Status> = data_server + .get_executors_metadata(Request::new(GetExecutorMetadataParams {})) + .await; + let result = data.unwrap(); + let res: &GetExecutorMetadataResult = result.get_ref(); + let vec: &Vec = &res.metadata; + let metadata: Vec = vec + .iter() + .map(|v: &ExecutorMetadata| ExecutorMeta { + host: v.host.clone(), + port: v.port as u16, + id: v.id.clone(), + }) + .collect(); + Ok(warp::reply::json(&metadata)) +} diff --git a/rust/ballista/rust/scheduler/src/api/mod.rs b/rust/ballista/rust/scheduler/src/api/mod.rs new file mode 100644 index 00000000000..29c5cb1af67 --- /dev/null +++ b/rust/ballista/rust/scheduler/src/api/mod.rs @@ -0,0 +1,85 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +mod handlers; + +use crate::SchedulerServer; +use anyhow::Result; +use std::{ + pin::Pin, + task::{Context as TaskContext, Poll}, +}; +use warp::filters::BoxedFilter; +use warp::{Buf, Filter, Reply}; + +pub enum EitherBody { + Left(A), + Right(B), +} + +pub type Error = Box; +pub type HttpBody = dyn http_body::Body + 'static; + +impl http_body::Body for EitherBody + where + A: http_body::Body + Send + Unpin, + B: http_body::Body + Send + Unpin, + A::Error: Into, + B::Error: Into, +{ + type Data = A::Data; + type Error = Error; + + fn poll_data( + self: Pin<&mut Self>, + cx: &mut TaskContext<'_>, + ) -> Poll>> { + match self.get_mut() { + EitherBody::Left(b) => Pin::new(b).poll_data(cx).map(map_option_err), + EitherBody::Right(b) => Pin::new(b).poll_data(cx).map(map_option_err), + } + } + + fn poll_trailers( + self: Pin<&mut Self>, + cx: &mut TaskContext<'_>, + ) -> Poll, Self::Error>> { + match self.get_mut() { + EitherBody::Left(b) => Pin::new(b).poll_trailers(cx).map_err(Into::into), + EitherBody::Right(b) => Pin::new(b).poll_trailers(cx).map_err(Into::into), + } + } + + fn is_end_stream(&self) -> bool { + match self { + EitherBody::Left(b) => b.is_end_stream(), + EitherBody::Right(b) => b.is_end_stream(), + } + } +} + +fn map_option_err>(err: Option>) -> Option> { + err.map(|e| e.map_err(Into::into)) +} + +fn with_data_server( + db: SchedulerServer, +) -> impl Filter + Clone { + warp::any().map(move || db.clone()) +} + +pub fn get_routes(scheduler_server: SchedulerServer) -> BoxedFilter<(impl Reply,)> { + let routes = warp::path("executors") + .and(with_data_server(scheduler_server)) + .and_then(handlers::list_executors_data); + routes.boxed() +} diff --git a/rust/ballista/rust/scheduler/src/lib.rs b/rust/ballista/rust/scheduler/src/lib.rs index 8ad2cc7a448..6df6c9ac57c 100644 --- a/rust/ballista/rust/scheduler/src/lib.rs +++ b/rust/ballista/rust/scheduler/src/lib.rs @@ -17,6 +17,7 @@ //! 
Support for distributed schedulers, such as Kubernetes +pub mod api; pub mod planner; pub mod state; @@ -68,6 +69,7 @@ use self::state::{ConfigBackendClient, SchedulerState}; use datafusion::physical_plan::parquet::ParquetExec; use std::time::Instant; +#[derive(Clone)] pub struct SchedulerServer { state: SchedulerState, namespace: String, diff --git a/rust/ballista/rust/scheduler/src/main.rs b/rust/ballista/rust/scheduler/src/main.rs index 785ffb47b17..c166fdc388d 100644 --- a/rust/ballista/rust/scheduler/src/main.rs +++ b/rust/ballista/rust/scheduler/src/main.rs @@ -17,9 +17,14 @@ //! Ballista Rust scheduler binary. +use anyhow::{Context, Result}; +use futures::future::{self, Either, TryFutureExt}; +use hyper::{service::make_service_fn, Server}; +use std::convert::Infallible; use std::{net::SocketAddr, sync::Arc}; +use tonic::transport::Server as TonicServer; +use tower::Service; -use anyhow::{Context, Result}; use ballista_core::BALLISTA_VERSION; use ballista_core::{ print_version, serde::protobuf::scheduler_grpc_server::SchedulerGrpcServer, @@ -29,9 +34,9 @@ use ballista_scheduler::state::EtcdClient; #[cfg(feature = "sled")] use ballista_scheduler::state::StandaloneClient; use ballista_scheduler::{state::ConfigBackendClient, ConfigBackend, SchedulerServer}; +use ballista_scheduler::api::{get_routes, EitherBody, Error}; use log::info; -use tonic::transport::Server; #[macro_use] extern crate configure_me; @@ -56,11 +61,35 @@ async fn start_server( "Ballista v{} Scheduler listening on {:?}", BALLISTA_VERSION, addr ); - let server = - SchedulerGrpcServer::new(SchedulerServer::new(config_backend, namespace)); - Ok(Server::builder() - .add_service(server) - .serve(addr) + Ok(Server::bind(&addr) + .serve(make_service_fn(move |_| { + let scheduler_server = SchedulerServer::new(config_backend.clone(), namespace.clone()); + let scheduler_grpc_server = SchedulerGrpcServer::new(scheduler_server.clone()); + + let mut tonic = TonicServer::builder() + 
.add_service(scheduler_grpc_server) + .into_service(); + let mut warp = warp::service(get_routes(scheduler_server)); + + future::ok::<_, Infallible>(tower::service_fn( + move |req: hyper::Request| { + let header = req.headers().get(hyper::header::ACCEPT); + if header.is_some() && header.unwrap().eq("application/json") { + return Either::Left( + warp.call(req) + .map_ok(|res| res.map(EitherBody::Left)) + .map_err(Error::from), + ); + } + Either::Right( + tonic + .call(req) + .map_ok(|res| res.map(EitherBody::Right)) + .map_err(Error::from), + ) + }, + )) + })) .await .context("Could not start grpc server")?) } From a49d6e235f7e5b7178a985330320980659033f36 Mon Sep 17 00:00:00 2001 From: Antoine Pitrou Date: Tue, 13 Apr 2021 11:50:39 -0400 Subject: [PATCH 010/719] ARROW-12248: [C++] Avoid looking up ARROW_DEFAULT_MEMORY_POOL environment variable too late In some situations (e.g. R bindings), default_memory_pool() may be called before the Arrow library's global variables were fully initialized. Closes #9930 from pitrou/ARROW-12248-memory-pool-env-lookup Authored-by: Antoine Pitrou Signed-off-by: David Li --- cpp/src/arrow/memory_pool.cc | 74 ++++++++++++++++++++---------------- 1 file changed, 41 insertions(+), 33 deletions(-) diff --git a/cpp/src/arrow/memory_pool.cc b/cpp/src/arrow/memory_pool.cc index 58a80232fdf..2d19b3d1962 100644 --- a/cpp/src/arrow/memory_pool.cc +++ b/cpp/src/arrow/memory_pool.cc @@ -105,8 +105,14 @@ struct SupportedBackend { MemoryPoolBackend backend; }; -std::vector SupportedBackends() { - std::vector backends = { +// See ARROW-12248 for why we use static in-function singletons rather than +// global constants below (in SupportedBackends() and UserSelectedBackend()). +// In some contexts (especially R bindings) `default_memory_pool()` may be +// called before all globals are initialized, and then the ARROW_DEFAULT_MEMORY_POOL +// environment variable would be ignored. 
+ +const std::vector& SupportedBackends() { + static std::vector backends = { #ifdef ARROW_JEMALLOC {"jemalloc", MemoryPoolBackend::Jemalloc}, #endif @@ -117,42 +123,44 @@ std::vector SupportedBackends() { return backends; } -const std::vector supported_backends = SupportedBackends(); - +// Return the MemoryPoolBackend selected by the user through the +// ARROW_DEFAULT_MEMORY_POOL environment variable, if any. util::optional UserSelectedBackend() { - auto unsupported_backend = [](const std::string& name) { - std::vector supported; - for (const auto backend : supported_backends) { - supported.push_back(std::string("'") + backend.name + "'"); + static auto user_selected_backend = []() -> util::optional { + auto unsupported_backend = [](const std::string& name) { + std::vector supported; + for (const auto backend : SupportedBackends()) { + supported.push_back(std::string("'") + backend.name + "'"); + } + ARROW_LOG(WARNING) << "Unsupported backend '" << name << "' specified in " + << kDefaultBackendEnvVar << " (supported backends are " + << internal::JoinStrings(supported, ", ") << ")"; + }; + + auto maybe_name = internal::GetEnvVar(kDefaultBackendEnvVar); + if (!maybe_name.ok()) { + return {}; } - ARROW_LOG(WARNING) << "Unsupported backend '" << name << "' specified in " - << kDefaultBackendEnvVar << " (supported backends are " - << internal::JoinStrings(supported, ", ") << ")"; - }; - - auto maybe_name = internal::GetEnvVar(kDefaultBackendEnvVar); - if (!maybe_name.ok()) { - return {}; - } - const auto name = *std::move(maybe_name); - if (name.empty()) { - // An empty environment variable is considered missing + const auto name = *std::move(maybe_name); + if (name.empty()) { + // An empty environment variable is considered missing + return {}; + } + const auto found = std::find_if( + SupportedBackends().begin(), SupportedBackends().end(), + [&](const SupportedBackend& backend) { return name == backend.name; }); + if (found != SupportedBackends().end()) { + return 
found->backend; + } + unsupported_backend(name); return {}; - } - const auto found = - std::find_if(supported_backends.begin(), supported_backends.end(), - [&](const SupportedBackend& backend) { return name == backend.name; }); - if (found != supported_backends.end()) { - return found->backend; - } - unsupported_backend(name); - return {}; -} + }(); -const util::optional user_selected_backend = UserSelectedBackend(); + return user_selected_backend; +} MemoryPoolBackend DefaultBackend() { - auto backend = user_selected_backend; + auto backend = UserSelectedBackend(); if (backend.has_value()) { return backend.value(); } @@ -634,7 +642,7 @@ std::string ProxyMemoryPool::backend_name() const { return impl_->backend_name() std::vector SupportedMemoryBackendNames() { std::vector supported; - for (const auto backend : supported_backends) { + for (const auto backend : SupportedBackends()) { supported.push_back(backend.name); } return supported; From d7558bff24993ec69a8c7810734425e821c601b7 Mon Sep 17 00:00:00 2001 From: Antoine Pitrou Date: Tue, 13 Apr 2021 18:14:32 +0200 Subject: [PATCH 011/719] ARROW-11839: [C++] Use xsimd for generation of accelerated bit-unpacking The custom per-ISA code generation scripts (AVX2, AVX512) are replaced with a single code generation script that outputs xsimd code for any SIMD bit-width, in an ISA-agnostic way. Also add a Neon optimized version of bit-unpacking that leverages the generated code for 128-bit SIMD. 
Closes #9614 from pitrou/ARROW-11839-xsimd-bpacking Authored-by: Antoine Pitrou Signed-off-by: Antoine Pitrou --- cpp/cmake_modules/SetupCxxFlags.cmake | 1 + cpp/src/arrow/CMakeLists.txt | 3 + cpp/src/arrow/util/bpacking.cc | 7 + cpp/src/arrow/util/bpacking_avx2.cc | 114 +- cpp/src/arrow/util/bpacking_avx2_codegen.py | 203 -- cpp/src/arrow/util/bpacking_avx512.cc | 114 +- cpp/src/arrow/util/bpacking_avx512_codegen.py | 186 -- cpp/src/arrow/util/bpacking_neon.cc | 31 + cpp/src/arrow/util/bpacking_neon.h | 28 + .../arrow/util/bpacking_simd128_generated.h | 2138 +++++++++++++++++ .../arrow/util/bpacking_simd256_generated.h | 1270 ++++++++++ .../arrow/util/bpacking_simd512_generated.h | 836 +++++++ cpp/src/arrow/util/bpacking_simd_codegen.py | 209 ++ cpp/src/arrow/util/bpacking_simd_internal.h | 138 ++ cpp/thirdparty/versions.txt | 3 +- 15 files changed, 4670 insertions(+), 611 deletions(-) delete mode 100644 cpp/src/arrow/util/bpacking_avx2_codegen.py delete mode 100644 cpp/src/arrow/util/bpacking_avx512_codegen.py create mode 100644 cpp/src/arrow/util/bpacking_neon.cc create mode 100644 cpp/src/arrow/util/bpacking_neon.h create mode 100644 cpp/src/arrow/util/bpacking_simd128_generated.h create mode 100644 cpp/src/arrow/util/bpacking_simd256_generated.h create mode 100644 cpp/src/arrow/util/bpacking_simd512_generated.h create mode 100644 cpp/src/arrow/util/bpacking_simd_codegen.py create mode 100644 cpp/src/arrow/util/bpacking_simd_internal.h diff --git a/cpp/cmake_modules/SetupCxxFlags.cmake b/cpp/cmake_modules/SetupCxxFlags.cmake index 9f68c560472..6e259559e42 100644 --- a/cpp/cmake_modules/SetupCxxFlags.cmake +++ b/cpp/cmake_modules/SetupCxxFlags.cmake @@ -452,6 +452,7 @@ if(ARROW_CPU_FLAG STREQUAL "armv8") set(CXX_COMMON_FLAGS "${CXX_COMMON_FLAGS} ${ARROW_ARMV8_ARCH_FLAG}") if(NOT ARROW_SIMD_LEVEL STREQUAL "NONE") + set(ARROW_HAVE_NEON ON) add_definitions(-DARROW_HAVE_NEON) endif() diff --git a/cpp/src/arrow/CMakeLists.txt b/cpp/src/arrow/CMakeLists.txt index 
df72dcc5b6b..3623283f355 100644 --- a/cpp/src/arrow/CMakeLists.txt +++ b/cpp/src/arrow/CMakeLists.txt @@ -228,6 +228,9 @@ if(ARROW_HAVE_RUNTIME_AVX512) set_source_files_properties(util/bpacking_avx512.cc PROPERTIES COMPILE_FLAGS ${ARROW_AVX512_FLAG}) endif() +if(ARROW_HAVE_NEON) + list(APPEND ARROW_SRCS util/bpacking_neon.cc) +endif() if(APPLE) list(APPEND ARROW_SRCS vendored/datetime/ios.mm) diff --git a/cpp/src/arrow/util/bpacking.cc b/cpp/src/arrow/util/bpacking.cc index 02634755bd0..2e658fd108e 100644 --- a/cpp/src/arrow/util/bpacking.cc +++ b/cpp/src/arrow/util/bpacking.cc @@ -27,6 +27,9 @@ #if defined(ARROW_HAVE_RUNTIME_AVX512) #include "arrow/util/bpacking_avx512.h" #endif +#if defined(ARROW_HAVE_NEON) +#include "arrow/util/bpacking_neon.h" +#endif namespace arrow { namespace internal { @@ -163,8 +166,12 @@ struct Unpack32DynamicFunction { } // namespace int unpack32(const uint32_t* in, uint32_t* out, int batch_size, int num_bits) { +#if defined(ARROW_HAVE_NEON) + return unpack32_neon(in, out, batch_size, num_bits); +#else static DynamicDispatch dispatch; return dispatch.func(in, out, batch_size, num_bits); +#endif } } // namespace internal diff --git a/cpp/src/arrow/util/bpacking_avx2.cc b/cpp/src/arrow/util/bpacking_avx2.cc index 63b914b578a..5a3a7bad3d3 100644 --- a/cpp/src/arrow/util/bpacking_avx2.cc +++ b/cpp/src/arrow/util/bpacking_avx2.cc @@ -16,121 +16,15 @@ // under the License. 
#include "arrow/util/bpacking_avx2.h" -#include "arrow/util/bpacking_avx2_generated.h" -#include "arrow/util/logging.h" +#include "arrow/util/bpacking_simd256_generated.h" +#include "arrow/util/bpacking_simd_internal.h" namespace arrow { namespace internal { int unpack32_avx2(const uint32_t* in, uint32_t* out, int batch_size, int num_bits) { - batch_size = batch_size / 32 * 32; - int num_loops = batch_size / 32; - - switch (num_bits) { - case 0: - for (int i = 0; i < num_loops; ++i) in = unpack0_32_avx2(in, out + i * 32); - break; - case 1: - for (int i = 0; i < num_loops; ++i) in = unpack1_32_avx2(in, out + i * 32); - break; - case 2: - for (int i = 0; i < num_loops; ++i) in = unpack2_32_avx2(in, out + i * 32); - break; - case 3: - for (int i = 0; i < num_loops; ++i) in = unpack3_32_avx2(in, out + i * 32); - break; - case 4: - for (int i = 0; i < num_loops; ++i) in = unpack4_32_avx2(in, out + i * 32); - break; - case 5: - for (int i = 0; i < num_loops; ++i) in = unpack5_32_avx2(in, out + i * 32); - break; - case 6: - for (int i = 0; i < num_loops; ++i) in = unpack6_32_avx2(in, out + i * 32); - break; - case 7: - for (int i = 0; i < num_loops; ++i) in = unpack7_32_avx2(in, out + i * 32); - break; - case 8: - for (int i = 0; i < num_loops; ++i) in = unpack8_32_avx2(in, out + i * 32); - break; - case 9: - for (int i = 0; i < num_loops; ++i) in = unpack9_32_avx2(in, out + i * 32); - break; - case 10: - for (int i = 0; i < num_loops; ++i) in = unpack10_32_avx2(in, out + i * 32); - break; - case 11: - for (int i = 0; i < num_loops; ++i) in = unpack11_32_avx2(in, out + i * 32); - break; - case 12: - for (int i = 0; i < num_loops; ++i) in = unpack12_32_avx2(in, out + i * 32); - break; - case 13: - for (int i = 0; i < num_loops; ++i) in = unpack13_32_avx2(in, out + i * 32); - break; - case 14: - for (int i = 0; i < num_loops; ++i) in = unpack14_32_avx2(in, out + i * 32); - break; - case 15: - for (int i = 0; i < num_loops; ++i) in = unpack15_32_avx2(in, out + i * 32); - 
break; - case 16: - for (int i = 0; i < num_loops; ++i) in = unpack16_32_avx2(in, out + i * 32); - break; - case 17: - for (int i = 0; i < num_loops; ++i) in = unpack17_32_avx2(in, out + i * 32); - break; - case 18: - for (int i = 0; i < num_loops; ++i) in = unpack18_32_avx2(in, out + i * 32); - break; - case 19: - for (int i = 0; i < num_loops; ++i) in = unpack19_32_avx2(in, out + i * 32); - break; - case 20: - for (int i = 0; i < num_loops; ++i) in = unpack20_32_avx2(in, out + i * 32); - break; - case 21: - for (int i = 0; i < num_loops; ++i) in = unpack21_32_avx2(in, out + i * 32); - break; - case 22: - for (int i = 0; i < num_loops; ++i) in = unpack22_32_avx2(in, out + i * 32); - break; - case 23: - for (int i = 0; i < num_loops; ++i) in = unpack23_32_avx2(in, out + i * 32); - break; - case 24: - for (int i = 0; i < num_loops; ++i) in = unpack24_32_avx2(in, out + i * 32); - break; - case 25: - for (int i = 0; i < num_loops; ++i) in = unpack25_32_avx2(in, out + i * 32); - break; - case 26: - for (int i = 0; i < num_loops; ++i) in = unpack26_32_avx2(in, out + i * 32); - break; - case 27: - for (int i = 0; i < num_loops; ++i) in = unpack27_32_avx2(in, out + i * 32); - break; - case 28: - for (int i = 0; i < num_loops; ++i) in = unpack28_32_avx2(in, out + i * 32); - break; - case 29: - for (int i = 0; i < num_loops; ++i) in = unpack29_32_avx2(in, out + i * 32); - break; - case 30: - for (int i = 0; i < num_loops; ++i) in = unpack30_32_avx2(in, out + i * 32); - break; - case 31: - for (int i = 0; i < num_loops; ++i) in = unpack31_32_avx2(in, out + i * 32); - break; - case 32: - for (int i = 0; i < num_loops; ++i) in = unpack32_32_avx2(in, out + i * 32); - break; - default: - DCHECK(false) << "Unsupported num_bits"; - } - - return batch_size; + return unpack32_specialized>(in, out, batch_size, + num_bits); } } // namespace internal diff --git a/cpp/src/arrow/util/bpacking_avx2_codegen.py b/cpp/src/arrow/util/bpacking_avx2_codegen.py deleted file mode 100644 index 
e60aed86a29..00000000000 --- a/cpp/src/arrow/util/bpacking_avx2_codegen.py +++ /dev/null @@ -1,203 +0,0 @@ -#!/bin/python - -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -# Usage: python bpacking_avx2_codegen.py > bpacking_avx2_generated.h - - -def print_unpack_bit_func(bit): - shift = 0 - shifts = [] - in_index = 0 - inls = [] - mask = (1 << bit) - 1 - bracket = "{" - - print( - f"inline static const uint32_t* unpack{bit}_32_avx2(const uint32_t* in, uint32_t* out) {bracket}") - print(" using ::arrow::util::SafeLoad;") - print(" uint32_t mask = 0x%x;" % mask) - print(" __m256i reg_shifts, reg_inls, reg_masks;") - print(" __m256i results;") - - print("") - for i in range(32): - if shift + bit == 32: - shifts.append(shift) - inls.append(f"SafeLoad(in + {in_index})") - in_index += 1 - shift = 0 - elif shift + bit > 32: # cross the boundary - inls.append( - f"SafeLoad(in + {in_index}) >> {shift} | SafeLoad(in + {in_index + 1}) << {32 - shift}") - in_index += 1 - shift = bit - (32 - shift) - shifts.append(0) # zero shift - else: - shifts.append(shift) - inls.append(f"SafeLoad(in + {in_index})") - shift += bit - - print(" reg_masks = _mm256_set1_epi32(mask);") - print("") - - print(" // shift the first 8 outs") - print( - f" 
reg_shifts = _mm256_set_epi32({shifts[7]}, {shifts[6]}, {shifts[5]}, {shifts[4]},") - print( - f" {shifts[3]}, {shifts[2]}, {shifts[1]}, {shifts[0]});") - print(f" reg_inls = _mm256_set_epi32({inls[7]}, {inls[6]},") - print(f" {inls[5]}, {inls[4]},") - print(f" {inls[3]}, {inls[2]},") - print(f" {inls[1]}, {inls[0]});") - print( - " results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);") - print(" _mm256_storeu_si256(reinterpret_cast<__m256i*>(out), results);") - print(" out += 8;") - print("") - - print(" // shift the second 8 outs") - print( - f" reg_shifts = _mm256_set_epi32({shifts[15]}, {shifts[14]}, {shifts[13]}, {shifts[12]},") - print( - f" {shifts[11]}, {shifts[10]}, {shifts[9]}, {shifts[8]});") - print(f" reg_inls = _mm256_set_epi32({inls[15]}, {inls[14]},") - print(f" {inls[13]}, {inls[12]},") - print(f" {inls[11]}, {inls[10]},") - print(f" {inls[9]}, {inls[8]});") - print( - " results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);") - print(" _mm256_storeu_si256(reinterpret_cast<__m256i*>(out), results);") - print(" out += 8;") - print("") - - print(" // shift the third 8 outs") - print( - f" reg_shifts = _mm256_set_epi32({shifts[23]}, {shifts[22]}, {shifts[21]}, {shifts[20]},") - print( - f" {shifts[19]}, {shifts[18]}, {shifts[17]}, {shifts[16]});") - print(f" reg_inls = _mm256_set_epi32({inls[23]}, {inls[22]},") - print(f" {inls[21]}, {inls[20]},") - print(f" {inls[19]}, {inls[18]},") - print(f" {inls[17]}, {inls[16]});") - print( - " results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);") - print(" _mm256_storeu_si256(reinterpret_cast<__m256i*>(out), results);") - print(" out += 8;") - print("") - - print(" // shift the last 8 outs") - print( - f" reg_shifts = _mm256_set_epi32({shifts[31]}, {shifts[30]}, {shifts[29]}, {shifts[28]},") - print( - f" {shifts[27]}, {shifts[26]}, {shifts[25]}, {shifts[24]});") - print(f" reg_inls = _mm256_set_epi32({inls[31]}, {inls[30]},") - 
print(f" {inls[29]}, {inls[28]},") - print(f" {inls[27]}, {inls[26]},") - print(f" {inls[25]}, {inls[24]});") - print( - " results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);") - print(" _mm256_storeu_si256(reinterpret_cast<__m256i*>(out), results);") - print(" out += 8;") - - print("") - print(f" in += {bit};") - print("") - print(" return in;") - print("}") - - -def print_unpack_bit0_func(): - print( - "inline static const uint32_t* unpack0_32_avx2(const uint32_t* in, uint32_t* out) {") - print(" memset(out, 0x0, 32 * sizeof(*out));") - print(" out += 32;") - print("") - print(" return in;") - print("}") - - -def print_unpack_bit32_func(): - print( - "inline static const uint32_t* unpack32_32_avx2(const uint32_t* in, uint32_t* out) {") - print(" memcpy(out, in, 32 * sizeof(*out));") - print(" in += 32;") - print(" out += 32;") - print("") - print(" return in;") - print("}") - - -def print_copyright(): - print( - """// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. 
See the License for the -// specific language governing permissions and limitations -// under the License.""") - - -def print_note(): - print("//") - print("// Automatically generated file; DO NOT EDIT.") - - -def main(): - print_copyright() - print_note() - print("") - print("#pragma once") - print("") - print("#include ") - print("#include ") - print("") - print("#ifdef _MSC_VER") - print("#include ") - print("#else") - print("#include ") - print("#endif") - print("") - print('#include "arrow/util/ubsan.h"') - print("") - print("namespace arrow {") - print("namespace internal {") - print("") - print_unpack_bit0_func() - print("") - for i in range(1, 32): - print_unpack_bit_func(i) - print("") - print_unpack_bit32_func() - print("") - print("} // namespace internal") - print("} // namespace arrow") - - -if __name__ == '__main__': - main() diff --git a/cpp/src/arrow/util/bpacking_avx512.cc b/cpp/src/arrow/util/bpacking_avx512.cc index 98eb4d325af..08ccd3fcd4d 100644 --- a/cpp/src/arrow/util/bpacking_avx512.cc +++ b/cpp/src/arrow/util/bpacking_avx512.cc @@ -16,121 +16,15 @@ // under the License. 
#include "arrow/util/bpacking_avx512.h" -#include "arrow/util/bpacking_avx512_generated.h" -#include "arrow/util/logging.h" +#include "arrow/util/bpacking_simd512_generated.h" +#include "arrow/util/bpacking_simd_internal.h" namespace arrow { namespace internal { int unpack32_avx512(const uint32_t* in, uint32_t* out, int batch_size, int num_bits) { - batch_size = batch_size / 32 * 32; - int num_loops = batch_size / 32; - - switch (num_bits) { - case 0: - for (int i = 0; i < num_loops; ++i) in = unpack0_32_avx512(in, out + i * 32); - break; - case 1: - for (int i = 0; i < num_loops; ++i) in = unpack1_32_avx512(in, out + i * 32); - break; - case 2: - for (int i = 0; i < num_loops; ++i) in = unpack2_32_avx512(in, out + i * 32); - break; - case 3: - for (int i = 0; i < num_loops; ++i) in = unpack3_32_avx512(in, out + i * 32); - break; - case 4: - for (int i = 0; i < num_loops; ++i) in = unpack4_32_avx512(in, out + i * 32); - break; - case 5: - for (int i = 0; i < num_loops; ++i) in = unpack5_32_avx512(in, out + i * 32); - break; - case 6: - for (int i = 0; i < num_loops; ++i) in = unpack6_32_avx512(in, out + i * 32); - break; - case 7: - for (int i = 0; i < num_loops; ++i) in = unpack7_32_avx512(in, out + i * 32); - break; - case 8: - for (int i = 0; i < num_loops; ++i) in = unpack8_32_avx512(in, out + i * 32); - break; - case 9: - for (int i = 0; i < num_loops; ++i) in = unpack9_32_avx512(in, out + i * 32); - break; - case 10: - for (int i = 0; i < num_loops; ++i) in = unpack10_32_avx512(in, out + i * 32); - break; - case 11: - for (int i = 0; i < num_loops; ++i) in = unpack11_32_avx512(in, out + i * 32); - break; - case 12: - for (int i = 0; i < num_loops; ++i) in = unpack12_32_avx512(in, out + i * 32); - break; - case 13: - for (int i = 0; i < num_loops; ++i) in = unpack13_32_avx512(in, out + i * 32); - break; - case 14: - for (int i = 0; i < num_loops; ++i) in = unpack14_32_avx512(in, out + i * 32); - break; - case 15: - for (int i = 0; i < num_loops; ++i) in = 
unpack15_32_avx512(in, out + i * 32); - break; - case 16: - for (int i = 0; i < num_loops; ++i) in = unpack16_32_avx512(in, out + i * 32); - break; - case 17: - for (int i = 0; i < num_loops; ++i) in = unpack17_32_avx512(in, out + i * 32); - break; - case 18: - for (int i = 0; i < num_loops; ++i) in = unpack18_32_avx512(in, out + i * 32); - break; - case 19: - for (int i = 0; i < num_loops; ++i) in = unpack19_32_avx512(in, out + i * 32); - break; - case 20: - for (int i = 0; i < num_loops; ++i) in = unpack20_32_avx512(in, out + i * 32); - break; - case 21: - for (int i = 0; i < num_loops; ++i) in = unpack21_32_avx512(in, out + i * 32); - break; - case 22: - for (int i = 0; i < num_loops; ++i) in = unpack22_32_avx512(in, out + i * 32); - break; - case 23: - for (int i = 0; i < num_loops; ++i) in = unpack23_32_avx512(in, out + i * 32); - break; - case 24: - for (int i = 0; i < num_loops; ++i) in = unpack24_32_avx512(in, out + i * 32); - break; - case 25: - for (int i = 0; i < num_loops; ++i) in = unpack25_32_avx512(in, out + i * 32); - break; - case 26: - for (int i = 0; i < num_loops; ++i) in = unpack26_32_avx512(in, out + i * 32); - break; - case 27: - for (int i = 0; i < num_loops; ++i) in = unpack27_32_avx512(in, out + i * 32); - break; - case 28: - for (int i = 0; i < num_loops; ++i) in = unpack28_32_avx512(in, out + i * 32); - break; - case 29: - for (int i = 0; i < num_loops; ++i) in = unpack29_32_avx512(in, out + i * 32); - break; - case 30: - for (int i = 0; i < num_loops; ++i) in = unpack30_32_avx512(in, out + i * 32); - break; - case 31: - for (int i = 0; i < num_loops; ++i) in = unpack31_32_avx512(in, out + i * 32); - break; - case 32: - for (int i = 0; i < num_loops; ++i) in = unpack32_32_avx512(in, out + i * 32); - break; - default: - DCHECK(false) << "Unsupported num_bits"; - } - - return batch_size; + return unpack32_specialized>(in, out, batch_size, + num_bits); } } // namespace internal diff --git a/cpp/src/arrow/util/bpacking_avx512_codegen.py 
b/cpp/src/arrow/util/bpacking_avx512_codegen.py deleted file mode 100644 index df4d7d750da..00000000000 --- a/cpp/src/arrow/util/bpacking_avx512_codegen.py +++ /dev/null @@ -1,186 +0,0 @@ -#!/bin/python - -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -# Usage: python bpacking_avx512_codegen.py > bpacking_avx512_generated.h - - -def print_unpack_bit_func(bit): - shift = 0 - shifts = [] - in_index = 0 - inls = [] - mask = (1 << bit) - 1 - bracket = "{" - - print( - f"inline static const uint32_t* unpack{bit}_32_avx512(const uint32_t* in, uint32_t* out) {bracket}") - print(" using ::arrow::util::SafeLoad;") - print(" uint32_t mask = 0x%x;" % mask) - print(" __m512i reg_shifts, reg_inls, reg_masks;") - print(" __m512i results;") - - print("") - for i in range(32): - if shift + bit == 32: - shifts.append(shift) - inls.append(f"SafeLoad(in + {in_index})") - in_index += 1 - shift = 0 - elif shift + bit > 32: # cross the boundary - inls.append( - f"SafeLoad(in + {in_index}) >> {shift} | SafeLoad(in + {in_index + 1}) << {32 - shift}") - in_index += 1 - shift = bit - (32 - shift) - shifts.append(0) # zero shift - else: - shifts.append(shift) - inls.append(f"SafeLoad(in + {in_index})") - shift += bit - - print(" reg_masks = 
_mm512_set1_epi32(mask);") - print("") - print(" // shift the first 16 outs") - print( - f" reg_shifts = _mm512_set_epi32({shifts[15]}, {shifts[14]}, {shifts[13]}, {shifts[12]},") - print( - f" {shifts[11]}, {shifts[10]}, {shifts[9]}, {shifts[8]},") - print( - f" {shifts[7]}, {shifts[6]}, {shifts[5]}, {shifts[4]},") - print( - f" {shifts[3]}, {shifts[2]}, {shifts[1]}, {shifts[0]});") - print(f" reg_inls = _mm512_set_epi32({inls[15]}, {inls[14]},") - print(f" {inls[13]}, {inls[12]},") - print(f" {inls[11]}, {inls[10]},") - print(f" {inls[9]}, {inls[8]},") - print(f" {inls[7]}, {inls[6]},") - print(f" {inls[5]}, {inls[4]},") - print(f" {inls[3]}, {inls[2]},") - print(f" {inls[1]}, {inls[0]});") - print( - " results = _mm512_and_epi32(_mm512_srlv_epi32(reg_inls, reg_shifts), reg_masks);") - print(" _mm512_storeu_si512(out, results);") - print(" out += 16;") - print("") - print(" // shift the second 16 outs") - print( - f" reg_shifts = _mm512_set_epi32({shifts[31]}, {shifts[30]}, {shifts[29]}, {shifts[28]},") - print( - f" {shifts[27]}, {shifts[26]}, {shifts[25]}, {shifts[24]},") - print( - f" {shifts[23]}, {shifts[22]}, {shifts[21]}, {shifts[20]},") - print( - f" {shifts[19]}, {shifts[18]}, {shifts[17]}, {shifts[16]});") - print(f" reg_inls = _mm512_set_epi32({inls[31]}, {inls[30]},") - print(f" {inls[29]}, {inls[28]},") - print(f" {inls[27]}, {inls[26]},") - print(f" {inls[25]}, {inls[24]},") - print(f" {inls[23]}, {inls[22]},") - print(f" {inls[21]}, {inls[20]},") - print(f" {inls[19]}, {inls[18]},") - print(f" {inls[17]}, {inls[16]});") - print( - " results = _mm512_and_epi32(_mm512_srlv_epi32(reg_inls, reg_shifts), reg_masks);") - print(" _mm512_storeu_si512(out, results);") - print(" out += 16;") - print("") - print(f" in += {bit};") - print("") - print(" return in;") - print("}") - - -def print_unpack_bit0_func(): - print( - "inline static const uint32_t* unpack0_32_avx512(const uint32_t* in, uint32_t* out) {") - print(" memset(out, 0x0, 32 * sizeof(*out));") - 
print(" out += 32;") - print("") - print(" return in;") - print("}") - - -def print_unpack_bit32_func(): - print( - "inline static const uint32_t* unpack32_32_avx512(const uint32_t* in, uint32_t* out) {") - print(" memcpy(out, in, 32 * sizeof(*out));") - print(" in += 32;") - print(" out += 32;") - print("") - print(" return in;") - print("}") - - -def print_copyright(): - print( - """// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. 
See the License for the -// specific language governing permissions and limitations -// under the License.""") - - -def print_note(): - print("//") - print("// Automatically generated file; DO NOT EDIT.") - - -def main(): - print_copyright() - print_note() - print("") - print("#pragma once") - print("") - print("#include ") - print("#include ") - print("") - print("#ifdef _MSC_VER") - print("#include ") - print("#else") - print("#include ") - print("#endif") - print("") - print('#include "arrow/util/ubsan.h"') - print("") - print("namespace arrow {") - print("namespace internal {") - print("") - print_unpack_bit0_func() - print("") - for i in range(1, 32): - print_unpack_bit_func(i) - print("") - print_unpack_bit32_func() - print("") - print("} // namespace internal") - print("} // namespace arrow") - - -if __name__ == '__main__': - main() diff --git a/cpp/src/arrow/util/bpacking_neon.cc b/cpp/src/arrow/util/bpacking_neon.cc new file mode 100644 index 00000000000..a0bb5dc7a9e --- /dev/null +++ b/cpp/src/arrow/util/bpacking_neon.cc @@ -0,0 +1,31 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include "arrow/util/bpacking_neon.h" +#include "arrow/util/bpacking_simd128_generated.h" +#include "arrow/util/bpacking_simd_internal.h" + +namespace arrow { +namespace internal { + +int unpack32_neon(const uint32_t* in, uint32_t* out, int batch_size, int num_bits) { + return unpack32_specialized>(in, out, batch_size, + num_bits); +} + +} // namespace internal +} // namespace arrow diff --git a/cpp/src/arrow/util/bpacking_neon.h b/cpp/src/arrow/util/bpacking_neon.h new file mode 100644 index 00000000000..9d02cd568ac --- /dev/null +++ b/cpp/src/arrow/util/bpacking_neon.h @@ -0,0 +1,28 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include + +namespace arrow { +namespace internal { + +int unpack32_neon(const uint32_t* in, uint32_t* out, int batch_size, int num_bits); + +} // namespace internal +} // namespace arrow diff --git a/cpp/src/arrow/util/bpacking_simd128_generated.h b/cpp/src/arrow/util/bpacking_simd128_generated.h new file mode 100644 index 00000000000..f7700fd0e76 --- /dev/null +++ b/cpp/src/arrow/util/bpacking_simd128_generated.h @@ -0,0 +1,2138 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. 
See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +// Automatically generated file; DO NOT EDIT. + +#pragma once + +#include +#include + +#include + +#include "arrow/util/dispatch.h" +#include "arrow/util/ubsan.h" + +namespace arrow { +namespace internal { +namespace { + +using ::arrow::util::SafeLoad; + +template +struct UnpackBits128 { + +using simd_batch = xsimd::batch; + +inline static const uint32_t* unpack0_32(const uint32_t* in, uint32_t* out) { + memset(out, 0x0, 32 * sizeof(*out)); + out += 32; + + return in; +} + +inline static const uint32_t* unpack1_32(const uint32_t* in, uint32_t* out) { + uint32_t mask = 0x1; + + simd_batch masks(mask); + simd_batch words, shifts; + simd_batch results; + + // extract 1-bit bundles 0 to 3 + words = simd_batch{ SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0) }; + shifts = simd_batch{ 0, 1, 2, 3 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 4; + + // extract 1-bit bundles 4 to 7 + words = simd_batch{ SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0) }; + shifts = simd_batch{ 4, 5, 6, 7 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 4; + + // extract 1-bit bundles 8 to 11 + words = simd_batch{ SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0) 
}; + shifts = simd_batch{ 8, 9, 10, 11 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 4; + + // extract 1-bit bundles 12 to 15 + words = simd_batch{ SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0) }; + shifts = simd_batch{ 12, 13, 14, 15 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 4; + + // extract 1-bit bundles 16 to 19 + words = simd_batch{ SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0) }; + shifts = simd_batch{ 16, 17, 18, 19 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 4; + + // extract 1-bit bundles 20 to 23 + words = simd_batch{ SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0) }; + shifts = simd_batch{ 20, 21, 22, 23 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 4; + + // extract 1-bit bundles 24 to 27 + words = simd_batch{ SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0) }; + shifts = simd_batch{ 24, 25, 26, 27 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 4; + + // extract 1-bit bundles 28 to 31 + words = simd_batch{ SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0) }; + shifts = simd_batch{ 28, 29, 30, 31 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 4; + + in += 1; + return in; +} + +inline static const uint32_t* unpack2_32(const uint32_t* in, uint32_t* out) { + uint32_t mask = 0x3; + + simd_batch masks(mask); + simd_batch words, shifts; + simd_batch results; + + // extract 2-bit bundles 0 to 3 + words = simd_batch{ SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0) }; + shifts = simd_batch{ 0, 2, 4, 6 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 4; + + // extract 2-bit bundles 4 to 7 + words = simd_batch{ SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0), 
SafeLoad(in + 0) }; + shifts = simd_batch{ 8, 10, 12, 14 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 4; + + // extract 2-bit bundles 8 to 11 + words = simd_batch{ SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0) }; + shifts = simd_batch{ 16, 18, 20, 22 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 4; + + // extract 2-bit bundles 12 to 15 + words = simd_batch{ SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0) }; + shifts = simd_batch{ 24, 26, 28, 30 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 4; + + // extract 2-bit bundles 16 to 19 + words = simd_batch{ SafeLoad(in + 1), SafeLoad(in + 1), SafeLoad(in + 1), SafeLoad(in + 1) }; + shifts = simd_batch{ 0, 2, 4, 6 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 4; + + // extract 2-bit bundles 20 to 23 + words = simd_batch{ SafeLoad(in + 1), SafeLoad(in + 1), SafeLoad(in + 1), SafeLoad(in + 1) }; + shifts = simd_batch{ 8, 10, 12, 14 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 4; + + // extract 2-bit bundles 24 to 27 + words = simd_batch{ SafeLoad(in + 1), SafeLoad(in + 1), SafeLoad(in + 1), SafeLoad(in + 1) }; + shifts = simd_batch{ 16, 18, 20, 22 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 4; + + // extract 2-bit bundles 28 to 31 + words = simd_batch{ SafeLoad(in + 1), SafeLoad(in + 1), SafeLoad(in + 1), SafeLoad(in + 1) }; + shifts = simd_batch{ 24, 26, 28, 30 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 4; + + in += 2; + return in; +} + +inline static const uint32_t* unpack3_32(const uint32_t* in, uint32_t* out) { + uint32_t mask = 0x7; + + simd_batch masks(mask); + simd_batch words, shifts; + simd_batch results; + + // extract 3-bit bundles 0 to 3 + words = simd_batch{ SafeLoad(in + 0), SafeLoad(in + 0), 
SafeLoad(in + 0), SafeLoad(in + 0) }; + shifts = simd_batch{ 0, 3, 6, 9 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 4; + + // extract 3-bit bundles 4 to 7 + words = simd_batch{ SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0) }; + shifts = simd_batch{ 12, 15, 18, 21 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 4; + + // extract 3-bit bundles 8 to 11 + words = simd_batch{ SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0) >> 30 | SafeLoad(in + 1) << 2, SafeLoad(in + 1) }; + shifts = simd_batch{ 24, 27, 0, 1 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 4; + + // extract 3-bit bundles 12 to 15 + words = simd_batch{ SafeLoad(in + 1), SafeLoad(in + 1), SafeLoad(in + 1), SafeLoad(in + 1) }; + shifts = simd_batch{ 4, 7, 10, 13 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 4; + + // extract 3-bit bundles 16 to 19 + words = simd_batch{ SafeLoad(in + 1), SafeLoad(in + 1), SafeLoad(in + 1), SafeLoad(in + 1) }; + shifts = simd_batch{ 16, 19, 22, 25 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 4; + + // extract 3-bit bundles 20 to 23 + words = simd_batch{ SafeLoad(in + 1), SafeLoad(in + 1) >> 31 | SafeLoad(in + 2) << 1, SafeLoad(in + 2), SafeLoad(in + 2) }; + shifts = simd_batch{ 28, 0, 2, 5 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 4; + + // extract 3-bit bundles 24 to 27 + words = simd_batch{ SafeLoad(in + 2), SafeLoad(in + 2), SafeLoad(in + 2), SafeLoad(in + 2) }; + shifts = simd_batch{ 8, 11, 14, 17 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 4; + + // extract 3-bit bundles 28 to 31 + words = simd_batch{ SafeLoad(in + 2), SafeLoad(in + 2), SafeLoad(in + 2), SafeLoad(in + 2) }; + shifts = simd_batch{ 20, 23, 26, 29 }; + results = (words >> shifts) & masks; + 
results.store_unaligned(out); + out += 4; + + in += 3; + return in; +} + +inline static const uint32_t* unpack4_32(const uint32_t* in, uint32_t* out) { + uint32_t mask = 0xf; + + simd_batch masks(mask); + simd_batch words, shifts; + simd_batch results; + + // extract 4-bit bundles 0 to 3 + words = simd_batch{ SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0) }; + shifts = simd_batch{ 0, 4, 8, 12 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 4; + + // extract 4-bit bundles 4 to 7 + words = simd_batch{ SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0) }; + shifts = simd_batch{ 16, 20, 24, 28 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 4; + + // extract 4-bit bundles 8 to 11 + words = simd_batch{ SafeLoad(in + 1), SafeLoad(in + 1), SafeLoad(in + 1), SafeLoad(in + 1) }; + shifts = simd_batch{ 0, 4, 8, 12 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 4; + + // extract 4-bit bundles 12 to 15 + words = simd_batch{ SafeLoad(in + 1), SafeLoad(in + 1), SafeLoad(in + 1), SafeLoad(in + 1) }; + shifts = simd_batch{ 16, 20, 24, 28 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 4; + + // extract 4-bit bundles 16 to 19 + words = simd_batch{ SafeLoad(in + 2), SafeLoad(in + 2), SafeLoad(in + 2), SafeLoad(in + 2) }; + shifts = simd_batch{ 0, 4, 8, 12 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 4; + + // extract 4-bit bundles 20 to 23 + words = simd_batch{ SafeLoad(in + 2), SafeLoad(in + 2), SafeLoad(in + 2), SafeLoad(in + 2) }; + shifts = simd_batch{ 16, 20, 24, 28 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 4; + + // extract 4-bit bundles 24 to 27 + words = simd_batch{ SafeLoad(in + 3), SafeLoad(in + 3), SafeLoad(in + 3), SafeLoad(in + 3) }; + shifts = simd_batch{ 0, 4, 8, 12 }; + results = (words >> shifts) & masks; + 
results.store_unaligned(out); + out += 4; + + // extract 4-bit bundles 28 to 31 + words = simd_batch{ SafeLoad(in + 3), SafeLoad(in + 3), SafeLoad(in + 3), SafeLoad(in + 3) }; + shifts = simd_batch{ 16, 20, 24, 28 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 4; + + in += 4; + return in; +} + +inline static const uint32_t* unpack5_32(const uint32_t* in, uint32_t* out) { + uint32_t mask = 0x1f; + + simd_batch masks(mask); + simd_batch words, shifts; + simd_batch results; + + // extract 5-bit bundles 0 to 3 + words = simd_batch{ SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0) }; + shifts = simd_batch{ 0, 5, 10, 15 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 4; + + // extract 5-bit bundles 4 to 7 + words = simd_batch{ SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0) >> 30 | SafeLoad(in + 1) << 2, SafeLoad(in + 1) }; + shifts = simd_batch{ 20, 25, 0, 3 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 4; + + // extract 5-bit bundles 8 to 11 + words = simd_batch{ SafeLoad(in + 1), SafeLoad(in + 1), SafeLoad(in + 1), SafeLoad(in + 1) }; + shifts = simd_batch{ 8, 13, 18, 23 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 4; + + // extract 5-bit bundles 12 to 15 + words = simd_batch{ SafeLoad(in + 1) >> 28 | SafeLoad(in + 2) << 4, SafeLoad(in + 2), SafeLoad(in + 2), SafeLoad(in + 2) }; + shifts = simd_batch{ 0, 1, 6, 11 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 4; + + // extract 5-bit bundles 16 to 19 + words = simd_batch{ SafeLoad(in + 2), SafeLoad(in + 2), SafeLoad(in + 2), SafeLoad(in + 2) >> 31 | SafeLoad(in + 3) << 1 }; + shifts = simd_batch{ 16, 21, 26, 0 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 4; + + // extract 5-bit bundles 20 to 23 + words = simd_batch{ SafeLoad(in + 3), SafeLoad(in + 3), SafeLoad(in + 3), 
SafeLoad(in + 3) }; + shifts = simd_batch{ 4, 9, 14, 19 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 4; + + // extract 5-bit bundles 24 to 27 + words = simd_batch{ SafeLoad(in + 3), SafeLoad(in + 3) >> 29 | SafeLoad(in + 4) << 3, SafeLoad(in + 4), SafeLoad(in + 4) }; + shifts = simd_batch{ 24, 0, 2, 7 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 4; + + // extract 5-bit bundles 28 to 31 + words = simd_batch{ SafeLoad(in + 4), SafeLoad(in + 4), SafeLoad(in + 4), SafeLoad(in + 4) }; + shifts = simd_batch{ 12, 17, 22, 27 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 4; + + in += 5; + return in; +} + +inline static const uint32_t* unpack6_32(const uint32_t* in, uint32_t* out) { + uint32_t mask = 0x3f; + + simd_batch masks(mask); + simd_batch words, shifts; + simd_batch results; + + // extract 6-bit bundles 0 to 3 + words = simd_batch{ SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0) }; + shifts = simd_batch{ 0, 6, 12, 18 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 4; + + // extract 6-bit bundles 4 to 7 + words = simd_batch{ SafeLoad(in + 0), SafeLoad(in + 0) >> 30 | SafeLoad(in + 1) << 2, SafeLoad(in + 1), SafeLoad(in + 1) }; + shifts = simd_batch{ 24, 0, 4, 10 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 4; + + // extract 6-bit bundles 8 to 11 + words = simd_batch{ SafeLoad(in + 1), SafeLoad(in + 1), SafeLoad(in + 1) >> 28 | SafeLoad(in + 2) << 4, SafeLoad(in + 2) }; + shifts = simd_batch{ 16, 22, 0, 2 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 4; + + // extract 6-bit bundles 12 to 15 + words = simd_batch{ SafeLoad(in + 2), SafeLoad(in + 2), SafeLoad(in + 2), SafeLoad(in + 2) }; + shifts = simd_batch{ 8, 14, 20, 26 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 4; + + // extract 6-bit 
bundles 16 to 19 + words = simd_batch{ SafeLoad(in + 3), SafeLoad(in + 3), SafeLoad(in + 3), SafeLoad(in + 3) }; + shifts = simd_batch{ 0, 6, 12, 18 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 4; + + // extract 6-bit bundles 20 to 23 + words = simd_batch{ SafeLoad(in + 3), SafeLoad(in + 3) >> 30 | SafeLoad(in + 4) << 2, SafeLoad(in + 4), SafeLoad(in + 4) }; + shifts = simd_batch{ 24, 0, 4, 10 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 4; + + // extract 6-bit bundles 24 to 27 + words = simd_batch{ SafeLoad(in + 4), SafeLoad(in + 4), SafeLoad(in + 4) >> 28 | SafeLoad(in + 5) << 4, SafeLoad(in + 5) }; + shifts = simd_batch{ 16, 22, 0, 2 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 4; + + // extract 6-bit bundles 28 to 31 + words = simd_batch{ SafeLoad(in + 5), SafeLoad(in + 5), SafeLoad(in + 5), SafeLoad(in + 5) }; + shifts = simd_batch{ 8, 14, 20, 26 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 4; + + in += 6; + return in; +} + +inline static const uint32_t* unpack7_32(const uint32_t* in, uint32_t* out) { + uint32_t mask = 0x7f; + + simd_batch masks(mask); + simd_batch words, shifts; + simd_batch results; + + // extract 7-bit bundles 0 to 3 + words = simd_batch{ SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0) }; + shifts = simd_batch{ 0, 7, 14, 21 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 4; + + // extract 7-bit bundles 4 to 7 + words = simd_batch{ SafeLoad(in + 0) >> 28 | SafeLoad(in + 1) << 4, SafeLoad(in + 1), SafeLoad(in + 1), SafeLoad(in + 1) }; + shifts = simd_batch{ 0, 3, 10, 17 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 4; + + // extract 7-bit bundles 8 to 11 + words = simd_batch{ SafeLoad(in + 1), SafeLoad(in + 1) >> 31 | SafeLoad(in + 2) << 1, SafeLoad(in + 2), SafeLoad(in + 2) }; + shifts = simd_batch{ 
24, 0, 6, 13 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 4; + + // extract 7-bit bundles 12 to 15 + words = simd_batch{ SafeLoad(in + 2), SafeLoad(in + 2) >> 27 | SafeLoad(in + 3) << 5, SafeLoad(in + 3), SafeLoad(in + 3) }; + shifts = simd_batch{ 20, 0, 2, 9 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 4; + + // extract 7-bit bundles 16 to 19 + words = simd_batch{ SafeLoad(in + 3), SafeLoad(in + 3), SafeLoad(in + 3) >> 30 | SafeLoad(in + 4) << 2, SafeLoad(in + 4) }; + shifts = simd_batch{ 16, 23, 0, 5 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 4; + + // extract 7-bit bundles 20 to 23 + words = simd_batch{ SafeLoad(in + 4), SafeLoad(in + 4), SafeLoad(in + 4) >> 26 | SafeLoad(in + 5) << 6, SafeLoad(in + 5) }; + shifts = simd_batch{ 12, 19, 0, 1 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 4; + + // extract 7-bit bundles 24 to 27 + words = simd_batch{ SafeLoad(in + 5), SafeLoad(in + 5), SafeLoad(in + 5), SafeLoad(in + 5) >> 29 | SafeLoad(in + 6) << 3 }; + shifts = simd_batch{ 8, 15, 22, 0 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 4; + + // extract 7-bit bundles 28 to 31 + words = simd_batch{ SafeLoad(in + 6), SafeLoad(in + 6), SafeLoad(in + 6), SafeLoad(in + 6) }; + shifts = simd_batch{ 4, 11, 18, 25 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 4; + + in += 7; + return in; +} + +inline static const uint32_t* unpack8_32(const uint32_t* in, uint32_t* out) { + uint32_t mask = 0xff; + + simd_batch masks(mask); + simd_batch words, shifts; + simd_batch results; + + // extract 8-bit bundles 0 to 3 + words = simd_batch{ SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0) }; + shifts = simd_batch{ 0, 8, 16, 24 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 4; + + // extract 8-bit bundles 4 to 
7 + words = simd_batch{ SafeLoad(in + 1), SafeLoad(in + 1), SafeLoad(in + 1), SafeLoad(in + 1) }; + shifts = simd_batch{ 0, 8, 16, 24 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 4; + + // extract 8-bit bundles 8 to 11 + words = simd_batch{ SafeLoad(in + 2), SafeLoad(in + 2), SafeLoad(in + 2), SafeLoad(in + 2) }; + shifts = simd_batch{ 0, 8, 16, 24 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 4; + + // extract 8-bit bundles 12 to 15 + words = simd_batch{ SafeLoad(in + 3), SafeLoad(in + 3), SafeLoad(in + 3), SafeLoad(in + 3) }; + shifts = simd_batch{ 0, 8, 16, 24 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 4; + + // extract 8-bit bundles 16 to 19 + words = simd_batch{ SafeLoad(in + 4), SafeLoad(in + 4), SafeLoad(in + 4), SafeLoad(in + 4) }; + shifts = simd_batch{ 0, 8, 16, 24 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 4; + + // extract 8-bit bundles 20 to 23 + words = simd_batch{ SafeLoad(in + 5), SafeLoad(in + 5), SafeLoad(in + 5), SafeLoad(in + 5) }; + shifts = simd_batch{ 0, 8, 16, 24 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 4; + + // extract 8-bit bundles 24 to 27 + words = simd_batch{ SafeLoad(in + 6), SafeLoad(in + 6), SafeLoad(in + 6), SafeLoad(in + 6) }; + shifts = simd_batch{ 0, 8, 16, 24 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 4; + + // extract 8-bit bundles 28 to 31 + words = simd_batch{ SafeLoad(in + 7), SafeLoad(in + 7), SafeLoad(in + 7), SafeLoad(in + 7) }; + shifts = simd_batch{ 0, 8, 16, 24 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 4; + + in += 8; + return in; +} + +inline static const uint32_t* unpack9_32(const uint32_t* in, uint32_t* out) { + uint32_t mask = 0x1ff; + + simd_batch masks(mask); + simd_batch words, shifts; + simd_batch results; + + // extract 9-bit bundles 0 
to 3 + words = simd_batch{ SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0) >> 27 | SafeLoad(in + 1) << 5 }; + shifts = simd_batch{ 0, 9, 18, 0 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 4; + + // extract 9-bit bundles 4 to 7 + words = simd_batch{ SafeLoad(in + 1), SafeLoad(in + 1), SafeLoad(in + 1), SafeLoad(in + 1) >> 31 | SafeLoad(in + 2) << 1 }; + shifts = simd_batch{ 4, 13, 22, 0 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 4; + + // extract 9-bit bundles 8 to 11 + words = simd_batch{ SafeLoad(in + 2), SafeLoad(in + 2), SafeLoad(in + 2) >> 26 | SafeLoad(in + 3) << 6, SafeLoad(in + 3) }; + shifts = simd_batch{ 8, 17, 0, 3 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 4; + + // extract 9-bit bundles 12 to 15 + words = simd_batch{ SafeLoad(in + 3), SafeLoad(in + 3), SafeLoad(in + 3) >> 30 | SafeLoad(in + 4) << 2, SafeLoad(in + 4) }; + shifts = simd_batch{ 12, 21, 0, 7 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 4; + + // extract 9-bit bundles 16 to 19 + words = simd_batch{ SafeLoad(in + 4), SafeLoad(in + 4) >> 25 | SafeLoad(in + 5) << 7, SafeLoad(in + 5), SafeLoad(in + 5) }; + shifts = simd_batch{ 16, 0, 2, 11 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 4; + + // extract 9-bit bundles 20 to 23 + words = simd_batch{ SafeLoad(in + 5), SafeLoad(in + 5) >> 29 | SafeLoad(in + 6) << 3, SafeLoad(in + 6), SafeLoad(in + 6) }; + shifts = simd_batch{ 20, 0, 6, 15 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 4; + + // extract 9-bit bundles 24 to 27 + words = simd_batch{ SafeLoad(in + 6) >> 24 | SafeLoad(in + 7) << 8, SafeLoad(in + 7), SafeLoad(in + 7), SafeLoad(in + 7) }; + shifts = simd_batch{ 0, 1, 10, 19 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 4; + + // extract 9-bit bundles 28 to 
31 + words = simd_batch{ SafeLoad(in + 7) >> 28 | SafeLoad(in + 8) << 4, SafeLoad(in + 8), SafeLoad(in + 8), SafeLoad(in + 8) }; + shifts = simd_batch{ 0, 5, 14, 23 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 4; + + in += 9; + return in; +} + +inline static const uint32_t* unpack10_32(const uint32_t* in, uint32_t* out) { + uint32_t mask = 0x3ff; + + simd_batch masks(mask); + simd_batch words, shifts; + simd_batch results; + + // extract 10-bit bundles 0 to 3 + words = simd_batch{ SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0) >> 30 | SafeLoad(in + 1) << 2 }; + shifts = simd_batch{ 0, 10, 20, 0 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 4; + + // extract 10-bit bundles 4 to 7 + words = simd_batch{ SafeLoad(in + 1), SafeLoad(in + 1), SafeLoad(in + 1) >> 28 | SafeLoad(in + 2) << 4, SafeLoad(in + 2) }; + shifts = simd_batch{ 8, 18, 0, 6 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 4; + + // extract 10-bit bundles 8 to 11 + words = simd_batch{ SafeLoad(in + 2), SafeLoad(in + 2) >> 26 | SafeLoad(in + 3) << 6, SafeLoad(in + 3), SafeLoad(in + 3) }; + shifts = simd_batch{ 16, 0, 4, 14 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 4; + + // extract 10-bit bundles 12 to 15 + words = simd_batch{ SafeLoad(in + 3) >> 24 | SafeLoad(in + 4) << 8, SafeLoad(in + 4), SafeLoad(in + 4), SafeLoad(in + 4) }; + shifts = simd_batch{ 0, 2, 12, 22 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 4; + + // extract 10-bit bundles 16 to 19 + words = simd_batch{ SafeLoad(in + 5), SafeLoad(in + 5), SafeLoad(in + 5), SafeLoad(in + 5) >> 30 | SafeLoad(in + 6) << 2 }; + shifts = simd_batch{ 0, 10, 20, 0 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 4; + + // extract 10-bit bundles 20 to 23 + words = simd_batch{ SafeLoad(in + 6), SafeLoad(in + 6), 
SafeLoad(in + 6) >> 28 | SafeLoad(in + 7) << 4, SafeLoad(in + 7) }; + shifts = simd_batch{ 8, 18, 0, 6 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 4; + + // extract 10-bit bundles 24 to 27 + words = simd_batch{ SafeLoad(in + 7), SafeLoad(in + 7) >> 26 | SafeLoad(in + 8) << 6, SafeLoad(in + 8), SafeLoad(in + 8) }; + shifts = simd_batch{ 16, 0, 4, 14 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 4; + + // extract 10-bit bundles 28 to 31 + words = simd_batch{ SafeLoad(in + 8) >> 24 | SafeLoad(in + 9) << 8, SafeLoad(in + 9), SafeLoad(in + 9), SafeLoad(in + 9) }; + shifts = simd_batch{ 0, 2, 12, 22 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 4; + + in += 10; + return in; +} + +inline static const uint32_t* unpack11_32(const uint32_t* in, uint32_t* out) { + uint32_t mask = 0x7ff; + + simd_batch masks(mask); + simd_batch words, shifts; + simd_batch results; + + // extract 11-bit bundles 0 to 3 + words = simd_batch{ SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0) >> 22 | SafeLoad(in + 1) << 10, SafeLoad(in + 1) }; + shifts = simd_batch{ 0, 11, 0, 1 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 4; + + // extract 11-bit bundles 4 to 7 + words = simd_batch{ SafeLoad(in + 1), SafeLoad(in + 1) >> 23 | SafeLoad(in + 2) << 9, SafeLoad(in + 2), SafeLoad(in + 2) }; + shifts = simd_batch{ 12, 0, 2, 13 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 4; + + // extract 11-bit bundles 8 to 11 + words = simd_batch{ SafeLoad(in + 2) >> 24 | SafeLoad(in + 3) << 8, SafeLoad(in + 3), SafeLoad(in + 3), SafeLoad(in + 3) >> 25 | SafeLoad(in + 4) << 7 }; + shifts = simd_batch{ 0, 3, 14, 0 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 4; + + // extract 11-bit bundles 12 to 15 + words = simd_batch{ SafeLoad(in + 4), SafeLoad(in + 4), SafeLoad(in + 4) >> 26 | SafeLoad(in + 
5) << 6, SafeLoad(in + 5) }; + shifts = simd_batch{ 4, 15, 0, 5 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 4; + + // extract 11-bit bundles 16 to 19 + words = simd_batch{ SafeLoad(in + 5), SafeLoad(in + 5) >> 27 | SafeLoad(in + 6) << 5, SafeLoad(in + 6), SafeLoad(in + 6) }; + shifts = simd_batch{ 16, 0, 6, 17 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 4; + + // extract 11-bit bundles 20 to 23 + words = simd_batch{ SafeLoad(in + 6) >> 28 | SafeLoad(in + 7) << 4, SafeLoad(in + 7), SafeLoad(in + 7), SafeLoad(in + 7) >> 29 | SafeLoad(in + 8) << 3 }; + shifts = simd_batch{ 0, 7, 18, 0 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 4; + + // extract 11-bit bundles 24 to 27 + words = simd_batch{ SafeLoad(in + 8), SafeLoad(in + 8), SafeLoad(in + 8) >> 30 | SafeLoad(in + 9) << 2, SafeLoad(in + 9) }; + shifts = simd_batch{ 8, 19, 0, 9 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 4; + + // extract 11-bit bundles 28 to 31 + words = simd_batch{ SafeLoad(in + 9), SafeLoad(in + 9) >> 31 | SafeLoad(in + 10) << 1, SafeLoad(in + 10), SafeLoad(in + 10) }; + shifts = simd_batch{ 20, 0, 10, 21 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 4; + + in += 11; + return in; +} + +inline static const uint32_t* unpack12_32(const uint32_t* in, uint32_t* out) { + uint32_t mask = 0xfff; + + simd_batch masks(mask); + simd_batch words, shifts; + simd_batch results; + + // extract 12-bit bundles 0 to 3 + words = simd_batch{ SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0) >> 24 | SafeLoad(in + 1) << 8, SafeLoad(in + 1) }; + shifts = simd_batch{ 0, 12, 0, 4 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 4; + + // extract 12-bit bundles 4 to 7 + words = simd_batch{ SafeLoad(in + 1), SafeLoad(in + 1) >> 28 | SafeLoad(in + 2) << 4, SafeLoad(in + 2), SafeLoad(in + 2) }; + 
shifts = simd_batch{ 16, 0, 8, 20 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 4; + + // extract 12-bit bundles 8 to 11 + words = simd_batch{ SafeLoad(in + 3), SafeLoad(in + 3), SafeLoad(in + 3) >> 24 | SafeLoad(in + 4) << 8, SafeLoad(in + 4) }; + shifts = simd_batch{ 0, 12, 0, 4 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 4; + + // extract 12-bit bundles 12 to 15 + words = simd_batch{ SafeLoad(in + 4), SafeLoad(in + 4) >> 28 | SafeLoad(in + 5) << 4, SafeLoad(in + 5), SafeLoad(in + 5) }; + shifts = simd_batch{ 16, 0, 8, 20 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 4; + + // extract 12-bit bundles 16 to 19 + words = simd_batch{ SafeLoad(in + 6), SafeLoad(in + 6), SafeLoad(in + 6) >> 24 | SafeLoad(in + 7) << 8, SafeLoad(in + 7) }; + shifts = simd_batch{ 0, 12, 0, 4 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 4; + + // extract 12-bit bundles 20 to 23 + words = simd_batch{ SafeLoad(in + 7), SafeLoad(in + 7) >> 28 | SafeLoad(in + 8) << 4, SafeLoad(in + 8), SafeLoad(in + 8) }; + shifts = simd_batch{ 16, 0, 8, 20 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 4; + + // extract 12-bit bundles 24 to 27 + words = simd_batch{ SafeLoad(in + 9), SafeLoad(in + 9), SafeLoad(in + 9) >> 24 | SafeLoad(in + 10) << 8, SafeLoad(in + 10) }; + shifts = simd_batch{ 0, 12, 0, 4 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 4; + + // extract 12-bit bundles 28 to 31 + words = simd_batch{ SafeLoad(in + 10), SafeLoad(in + 10) >> 28 | SafeLoad(in + 11) << 4, SafeLoad(in + 11), SafeLoad(in + 11) }; + shifts = simd_batch{ 16, 0, 8, 20 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 4; + + in += 12; + return in; +} + +inline static const uint32_t* unpack13_32(const uint32_t* in, uint32_t* out) { + uint32_t mask = 0x1fff; + + simd_batch 
masks(mask); + simd_batch words, shifts; + simd_batch results; + + // extract 13-bit bundles 0 to 3 + words = simd_batch{ SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0) >> 26 | SafeLoad(in + 1) << 6, SafeLoad(in + 1) }; + shifts = simd_batch{ 0, 13, 0, 7 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 4; + + // extract 13-bit bundles 4 to 7 + words = simd_batch{ SafeLoad(in + 1) >> 20 | SafeLoad(in + 2) << 12, SafeLoad(in + 2), SafeLoad(in + 2), SafeLoad(in + 2) >> 27 | SafeLoad(in + 3) << 5 }; + shifts = simd_batch{ 0, 1, 14, 0 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 4; + + // extract 13-bit bundles 8 to 11 + words = simd_batch{ SafeLoad(in + 3), SafeLoad(in + 3) >> 21 | SafeLoad(in + 4) << 11, SafeLoad(in + 4), SafeLoad(in + 4) }; + shifts = simd_batch{ 8, 0, 2, 15 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 4; + + // extract 13-bit bundles 12 to 15 + words = simd_batch{ SafeLoad(in + 4) >> 28 | SafeLoad(in + 5) << 4, SafeLoad(in + 5), SafeLoad(in + 5) >> 22 | SafeLoad(in + 6) << 10, SafeLoad(in + 6) }; + shifts = simd_batch{ 0, 9, 0, 3 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 4; + + // extract 13-bit bundles 16 to 19 + words = simd_batch{ SafeLoad(in + 6), SafeLoad(in + 6) >> 29 | SafeLoad(in + 7) << 3, SafeLoad(in + 7), SafeLoad(in + 7) >> 23 | SafeLoad(in + 8) << 9 }; + shifts = simd_batch{ 16, 0, 10, 0 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 4; + + // extract 13-bit bundles 20 to 23 + words = simd_batch{ SafeLoad(in + 8), SafeLoad(in + 8), SafeLoad(in + 8) >> 30 | SafeLoad(in + 9) << 2, SafeLoad(in + 9) }; + shifts = simd_batch{ 4, 17, 0, 11 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 4; + + // extract 13-bit bundles 24 to 27 + words = simd_batch{ SafeLoad(in + 9) >> 24 | SafeLoad(in + 10) << 8, SafeLoad(in + 10), 
SafeLoad(in + 10), SafeLoad(in + 10) >> 31 | SafeLoad(in + 11) << 1 }; + shifts = simd_batch{ 0, 5, 18, 0 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 4; + + // extract 13-bit bundles 28 to 31 + words = simd_batch{ SafeLoad(in + 11), SafeLoad(in + 11) >> 25 | SafeLoad(in + 12) << 7, SafeLoad(in + 12), SafeLoad(in + 12) }; + shifts = simd_batch{ 12, 0, 6, 19 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 4; + + in += 13; + return in; +} + +inline static const uint32_t* unpack14_32(const uint32_t* in, uint32_t* out) { + uint32_t mask = 0x3fff; + + simd_batch masks(mask); + simd_batch words, shifts; + simd_batch results; + + // extract 14-bit bundles 0 to 3 + words = simd_batch{ SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0) >> 28 | SafeLoad(in + 1) << 4, SafeLoad(in + 1) }; + shifts = simd_batch{ 0, 14, 0, 10 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 4; + + // extract 14-bit bundles 4 to 7 + words = simd_batch{ SafeLoad(in + 1) >> 24 | SafeLoad(in + 2) << 8, SafeLoad(in + 2), SafeLoad(in + 2) >> 20 | SafeLoad(in + 3) << 12, SafeLoad(in + 3) }; + shifts = simd_batch{ 0, 6, 0, 2 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 4; + + // extract 14-bit bundles 8 to 11 + words = simd_batch{ SafeLoad(in + 3), SafeLoad(in + 3) >> 30 | SafeLoad(in + 4) << 2, SafeLoad(in + 4), SafeLoad(in + 4) >> 26 | SafeLoad(in + 5) << 6 }; + shifts = simd_batch{ 16, 0, 12, 0 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 4; + + // extract 14-bit bundles 12 to 15 + words = simd_batch{ SafeLoad(in + 5), SafeLoad(in + 5) >> 22 | SafeLoad(in + 6) << 10, SafeLoad(in + 6), SafeLoad(in + 6) }; + shifts = simd_batch{ 8, 0, 4, 18 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 4; + + // extract 14-bit bundles 16 to 19 + words = simd_batch{ SafeLoad(in + 7), SafeLoad(in + 7), 
SafeLoad(in + 7) >> 28 | SafeLoad(in + 8) << 4, SafeLoad(in + 8) }; + shifts = simd_batch{ 0, 14, 0, 10 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 4; + + // extract 14-bit bundles 20 to 23 + words = simd_batch{ SafeLoad(in + 8) >> 24 | SafeLoad(in + 9) << 8, SafeLoad(in + 9), SafeLoad(in + 9) >> 20 | SafeLoad(in + 10) << 12, SafeLoad(in + 10) }; + shifts = simd_batch{ 0, 6, 0, 2 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 4; + + // extract 14-bit bundles 24 to 27 + words = simd_batch{ SafeLoad(in + 10), SafeLoad(in + 10) >> 30 | SafeLoad(in + 11) << 2, SafeLoad(in + 11), SafeLoad(in + 11) >> 26 | SafeLoad(in + 12) << 6 }; + shifts = simd_batch{ 16, 0, 12, 0 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 4; + + // extract 14-bit bundles 28 to 31 + words = simd_batch{ SafeLoad(in + 12), SafeLoad(in + 12) >> 22 | SafeLoad(in + 13) << 10, SafeLoad(in + 13), SafeLoad(in + 13) }; + shifts = simd_batch{ 8, 0, 4, 18 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 4; + + in += 14; + return in; +} + +inline static const uint32_t* unpack15_32(const uint32_t* in, uint32_t* out) { + uint32_t mask = 0x7fff; + + simd_batch masks(mask); + simd_batch words, shifts; + simd_batch results; + + // extract 15-bit bundles 0 to 3 + words = simd_batch{ SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0) >> 30 | SafeLoad(in + 1) << 2, SafeLoad(in + 1) }; + shifts = simd_batch{ 0, 15, 0, 13 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 4; + + // extract 15-bit bundles 4 to 7 + words = simd_batch{ SafeLoad(in + 1) >> 28 | SafeLoad(in + 2) << 4, SafeLoad(in + 2), SafeLoad(in + 2) >> 26 | SafeLoad(in + 3) << 6, SafeLoad(in + 3) }; + shifts = simd_batch{ 0, 11, 0, 9 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 4; + + // extract 15-bit bundles 8 to 11 + words = simd_batch{ 
SafeLoad(in + 3) >> 24 | SafeLoad(in + 4) << 8, SafeLoad(in + 4), SafeLoad(in + 4) >> 22 | SafeLoad(in + 5) << 10, SafeLoad(in + 5) }; + shifts = simd_batch{ 0, 7, 0, 5 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 4; + + // extract 15-bit bundles 12 to 15 + words = simd_batch{ SafeLoad(in + 5) >> 20 | SafeLoad(in + 6) << 12, SafeLoad(in + 6), SafeLoad(in + 6) >> 18 | SafeLoad(in + 7) << 14, SafeLoad(in + 7) }; + shifts = simd_batch{ 0, 3, 0, 1 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 4; + + // extract 15-bit bundles 16 to 19 + words = simd_batch{ SafeLoad(in + 7), SafeLoad(in + 7) >> 31 | SafeLoad(in + 8) << 1, SafeLoad(in + 8), SafeLoad(in + 8) >> 29 | SafeLoad(in + 9) << 3 }; + shifts = simd_batch{ 16, 0, 14, 0 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 4; + + // extract 15-bit bundles 20 to 23 + words = simd_batch{ SafeLoad(in + 9), SafeLoad(in + 9) >> 27 | SafeLoad(in + 10) << 5, SafeLoad(in + 10), SafeLoad(in + 10) >> 25 | SafeLoad(in + 11) << 7 }; + shifts = simd_batch{ 12, 0, 10, 0 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 4; + + // extract 15-bit bundles 24 to 27 + words = simd_batch{ SafeLoad(in + 11), SafeLoad(in + 11) >> 23 | SafeLoad(in + 12) << 9, SafeLoad(in + 12), SafeLoad(in + 12) >> 21 | SafeLoad(in + 13) << 11 }; + shifts = simd_batch{ 8, 0, 6, 0 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 4; + + // extract 15-bit bundles 28 to 31 + words = simd_batch{ SafeLoad(in + 13), SafeLoad(in + 13) >> 19 | SafeLoad(in + 14) << 13, SafeLoad(in + 14), SafeLoad(in + 14) }; + shifts = simd_batch{ 4, 0, 2, 17 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 4; + + in += 15; + return in; +} + +inline static const uint32_t* unpack16_32(const uint32_t* in, uint32_t* out) { + uint32_t mask = 0xffff; + + simd_batch masks(mask); + 
simd_batch words, shifts; + simd_batch results; + + // extract 16-bit bundles 0 to 3 + words = simd_batch{ SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 1), SafeLoad(in + 1) }; + shifts = simd_batch{ 0, 16, 0, 16 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 4; + + // extract 16-bit bundles 4 to 7 + words = simd_batch{ SafeLoad(in + 2), SafeLoad(in + 2), SafeLoad(in + 3), SafeLoad(in + 3) }; + shifts = simd_batch{ 0, 16, 0, 16 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 4; + + // extract 16-bit bundles 8 to 11 + words = simd_batch{ SafeLoad(in + 4), SafeLoad(in + 4), SafeLoad(in + 5), SafeLoad(in + 5) }; + shifts = simd_batch{ 0, 16, 0, 16 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 4; + + // extract 16-bit bundles 12 to 15 + words = simd_batch{ SafeLoad(in + 6), SafeLoad(in + 6), SafeLoad(in + 7), SafeLoad(in + 7) }; + shifts = simd_batch{ 0, 16, 0, 16 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 4; + + // extract 16-bit bundles 16 to 19 + words = simd_batch{ SafeLoad(in + 8), SafeLoad(in + 8), SafeLoad(in + 9), SafeLoad(in + 9) }; + shifts = simd_batch{ 0, 16, 0, 16 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 4; + + // extract 16-bit bundles 20 to 23 + words = simd_batch{ SafeLoad(in + 10), SafeLoad(in + 10), SafeLoad(in + 11), SafeLoad(in + 11) }; + shifts = simd_batch{ 0, 16, 0, 16 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 4; + + // extract 16-bit bundles 24 to 27 + words = simd_batch{ SafeLoad(in + 12), SafeLoad(in + 12), SafeLoad(in + 13), SafeLoad(in + 13) }; + shifts = simd_batch{ 0, 16, 0, 16 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 4; + + // extract 16-bit bundles 28 to 31 + words = simd_batch{ SafeLoad(in + 14), SafeLoad(in + 14), SafeLoad(in + 15), SafeLoad(in + 15) }; + shifts = 
simd_batch{ 0, 16, 0, 16 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 4; + + in += 16; + return in; +} + +inline static const uint32_t* unpack17_32(const uint32_t* in, uint32_t* out) { + uint32_t mask = 0x1ffff; + + simd_batch masks(mask); + simd_batch words, shifts; + simd_batch results; + + // extract 17-bit bundles 0 to 3 + words = simd_batch{ SafeLoad(in + 0), SafeLoad(in + 0) >> 17 | SafeLoad(in + 1) << 15, SafeLoad(in + 1), SafeLoad(in + 1) >> 19 | SafeLoad(in + 2) << 13 }; + shifts = simd_batch{ 0, 0, 2, 0 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 4; + + // extract 17-bit bundles 4 to 7 + words = simd_batch{ SafeLoad(in + 2), SafeLoad(in + 2) >> 21 | SafeLoad(in + 3) << 11, SafeLoad(in + 3), SafeLoad(in + 3) >> 23 | SafeLoad(in + 4) << 9 }; + shifts = simd_batch{ 4, 0, 6, 0 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 4; + + // extract 17-bit bundles 8 to 11 + words = simd_batch{ SafeLoad(in + 4), SafeLoad(in + 4) >> 25 | SafeLoad(in + 5) << 7, SafeLoad(in + 5), SafeLoad(in + 5) >> 27 | SafeLoad(in + 6) << 5 }; + shifts = simd_batch{ 8, 0, 10, 0 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 4; + + // extract 17-bit bundles 12 to 15 + words = simd_batch{ SafeLoad(in + 6), SafeLoad(in + 6) >> 29 | SafeLoad(in + 7) << 3, SafeLoad(in + 7), SafeLoad(in + 7) >> 31 | SafeLoad(in + 8) << 1 }; + shifts = simd_batch{ 12, 0, 14, 0 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 4; + + // extract 17-bit bundles 16 to 19 + words = simd_batch{ SafeLoad(in + 8) >> 16 | SafeLoad(in + 9) << 16, SafeLoad(in + 9), SafeLoad(in + 9) >> 18 | SafeLoad(in + 10) << 14, SafeLoad(in + 10) }; + shifts = simd_batch{ 0, 1, 0, 3 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 4; + + // extract 17-bit bundles 20 to 23 + words = simd_batch{ SafeLoad(in + 10) >> 20 | 
SafeLoad(in + 11) << 12, SafeLoad(in + 11), SafeLoad(in + 11) >> 22 | SafeLoad(in + 12) << 10, SafeLoad(in + 12) }; + shifts = simd_batch{ 0, 5, 0, 7 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 4; + + // extract 17-bit bundles 24 to 27 + words = simd_batch{ SafeLoad(in + 12) >> 24 | SafeLoad(in + 13) << 8, SafeLoad(in + 13), SafeLoad(in + 13) >> 26 | SafeLoad(in + 14) << 6, SafeLoad(in + 14) }; + shifts = simd_batch{ 0, 9, 0, 11 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 4; + + // extract 17-bit bundles 28 to 31 + words = simd_batch{ SafeLoad(in + 14) >> 28 | SafeLoad(in + 15) << 4, SafeLoad(in + 15), SafeLoad(in + 15) >> 30 | SafeLoad(in + 16) << 2, SafeLoad(in + 16) }; + shifts = simd_batch{ 0, 13, 0, 15 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 4; + + in += 17; + return in; +} + +inline static const uint32_t* unpack18_32(const uint32_t* in, uint32_t* out) { + uint32_t mask = 0x3ffff; + + simd_batch masks(mask); + simd_batch words, shifts; + simd_batch results; + + // extract 18-bit bundles 0 to 3 + words = simd_batch{ SafeLoad(in + 0), SafeLoad(in + 0) >> 18 | SafeLoad(in + 1) << 14, SafeLoad(in + 1), SafeLoad(in + 1) >> 22 | SafeLoad(in + 2) << 10 }; + shifts = simd_batch{ 0, 0, 4, 0 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 4; + + // extract 18-bit bundles 4 to 7 + words = simd_batch{ SafeLoad(in + 2), SafeLoad(in + 2) >> 26 | SafeLoad(in + 3) << 6, SafeLoad(in + 3), SafeLoad(in + 3) >> 30 | SafeLoad(in + 4) << 2 }; + shifts = simd_batch{ 8, 0, 12, 0 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 4; + + // extract 18-bit bundles 8 to 11 + words = simd_batch{ SafeLoad(in + 4) >> 16 | SafeLoad(in + 5) << 16, SafeLoad(in + 5), SafeLoad(in + 5) >> 20 | SafeLoad(in + 6) << 12, SafeLoad(in + 6) }; + shifts = simd_batch{ 0, 2, 0, 6 }; + results = (words >> shifts) & masks; 
+ results.store_unaligned(out); + out += 4; + + // extract 18-bit bundles 12 to 15 + words = simd_batch{ SafeLoad(in + 6) >> 24 | SafeLoad(in + 7) << 8, SafeLoad(in + 7), SafeLoad(in + 7) >> 28 | SafeLoad(in + 8) << 4, SafeLoad(in + 8) }; + shifts = simd_batch{ 0, 10, 0, 14 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 4; + + // extract 18-bit bundles 16 to 19 + words = simd_batch{ SafeLoad(in + 9), SafeLoad(in + 9) >> 18 | SafeLoad(in + 10) << 14, SafeLoad(in + 10), SafeLoad(in + 10) >> 22 | SafeLoad(in + 11) << 10 }; + shifts = simd_batch{ 0, 0, 4, 0 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 4; + + // extract 18-bit bundles 20 to 23 + words = simd_batch{ SafeLoad(in + 11), SafeLoad(in + 11) >> 26 | SafeLoad(in + 12) << 6, SafeLoad(in + 12), SafeLoad(in + 12) >> 30 | SafeLoad(in + 13) << 2 }; + shifts = simd_batch{ 8, 0, 12, 0 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 4; + + // extract 18-bit bundles 24 to 27 + words = simd_batch{ SafeLoad(in + 13) >> 16 | SafeLoad(in + 14) << 16, SafeLoad(in + 14), SafeLoad(in + 14) >> 20 | SafeLoad(in + 15) << 12, SafeLoad(in + 15) }; + shifts = simd_batch{ 0, 2, 0, 6 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 4; + + // extract 18-bit bundles 28 to 31 + words = simd_batch{ SafeLoad(in + 15) >> 24 | SafeLoad(in + 16) << 8, SafeLoad(in + 16), SafeLoad(in + 16) >> 28 | SafeLoad(in + 17) << 4, SafeLoad(in + 17) }; + shifts = simd_batch{ 0, 10, 0, 14 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 4; + + in += 18; + return in; +} + +inline static const uint32_t* unpack19_32(const uint32_t* in, uint32_t* out) { + uint32_t mask = 0x7ffff; + + simd_batch masks(mask); + simd_batch words, shifts; + simd_batch results; + + // extract 19-bit bundles 0 to 3 + words = simd_batch{ SafeLoad(in + 0), SafeLoad(in + 0) >> 19 | SafeLoad(in + 1) << 13, 
SafeLoad(in + 1), SafeLoad(in + 1) >> 25 | SafeLoad(in + 2) << 7 }; + shifts = simd_batch{ 0, 0, 6, 0 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 4; + + // extract 19-bit bundles 4 to 7 + words = simd_batch{ SafeLoad(in + 2), SafeLoad(in + 2) >> 31 | SafeLoad(in + 3) << 1, SafeLoad(in + 3) >> 18 | SafeLoad(in + 4) << 14, SafeLoad(in + 4) }; + shifts = simd_batch{ 12, 0, 0, 5 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 4; + + // extract 19-bit bundles 8 to 11 + words = simd_batch{ SafeLoad(in + 4) >> 24 | SafeLoad(in + 5) << 8, SafeLoad(in + 5), SafeLoad(in + 5) >> 30 | SafeLoad(in + 6) << 2, SafeLoad(in + 6) >> 17 | SafeLoad(in + 7) << 15 }; + shifts = simd_batch{ 0, 11, 0, 0 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 4; + + // extract 19-bit bundles 12 to 15 + words = simd_batch{ SafeLoad(in + 7), SafeLoad(in + 7) >> 23 | SafeLoad(in + 8) << 9, SafeLoad(in + 8), SafeLoad(in + 8) >> 29 | SafeLoad(in + 9) << 3 }; + shifts = simd_batch{ 4, 0, 10, 0 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 4; + + // extract 19-bit bundles 16 to 19 + words = simd_batch{ SafeLoad(in + 9) >> 16 | SafeLoad(in + 10) << 16, SafeLoad(in + 10), SafeLoad(in + 10) >> 22 | SafeLoad(in + 11) << 10, SafeLoad(in + 11) }; + shifts = simd_batch{ 0, 3, 0, 9 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 4; + + // extract 19-bit bundles 20 to 23 + words = simd_batch{ SafeLoad(in + 11) >> 28 | SafeLoad(in + 12) << 4, SafeLoad(in + 12) >> 15 | SafeLoad(in + 13) << 17, SafeLoad(in + 13), SafeLoad(in + 13) >> 21 | SafeLoad(in + 14) << 11 }; + shifts = simd_batch{ 0, 0, 2, 0 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 4; + + // extract 19-bit bundles 24 to 27 + words = simd_batch{ SafeLoad(in + 14), SafeLoad(in + 14) >> 27 | SafeLoad(in + 15) << 5, SafeLoad(in + 15) >> 14 | 
SafeLoad(in + 16) << 18, SafeLoad(in + 16) }; + shifts = simd_batch{ 8, 0, 0, 1 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 4; + + // extract 19-bit bundles 28 to 31 + words = simd_batch{ SafeLoad(in + 16) >> 20 | SafeLoad(in + 17) << 12, SafeLoad(in + 17), SafeLoad(in + 17) >> 26 | SafeLoad(in + 18) << 6, SafeLoad(in + 18) }; + shifts = simd_batch{ 0, 7, 0, 13 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 4; + + in += 19; + return in; +} + +inline static const uint32_t* unpack20_32(const uint32_t* in, uint32_t* out) { + uint32_t mask = 0xfffff; + + simd_batch masks(mask); + simd_batch words, shifts; + simd_batch results; + + // extract 20-bit bundles 0 to 3 + words = simd_batch{ SafeLoad(in + 0), SafeLoad(in + 0) >> 20 | SafeLoad(in + 1) << 12, SafeLoad(in + 1), SafeLoad(in + 1) >> 28 | SafeLoad(in + 2) << 4 }; + shifts = simd_batch{ 0, 0, 8, 0 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 4; + + // extract 20-bit bundles 4 to 7 + words = simd_batch{ SafeLoad(in + 2) >> 16 | SafeLoad(in + 3) << 16, SafeLoad(in + 3), SafeLoad(in + 3) >> 24 | SafeLoad(in + 4) << 8, SafeLoad(in + 4) }; + shifts = simd_batch{ 0, 4, 0, 12 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 4; + + // extract 20-bit bundles 8 to 11 + words = simd_batch{ SafeLoad(in + 5), SafeLoad(in + 5) >> 20 | SafeLoad(in + 6) << 12, SafeLoad(in + 6), SafeLoad(in + 6) >> 28 | SafeLoad(in + 7) << 4 }; + shifts = simd_batch{ 0, 0, 8, 0 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 4; + + // extract 20-bit bundles 12 to 15 + words = simd_batch{ SafeLoad(in + 7) >> 16 | SafeLoad(in + 8) << 16, SafeLoad(in + 8), SafeLoad(in + 8) >> 24 | SafeLoad(in + 9) << 8, SafeLoad(in + 9) }; + shifts = simd_batch{ 0, 4, 0, 12 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 4; + + // extract 20-bit bundles 16 
to 19 + words = simd_batch{ SafeLoad(in + 10), SafeLoad(in + 10) >> 20 | SafeLoad(in + 11) << 12, SafeLoad(in + 11), SafeLoad(in + 11) >> 28 | SafeLoad(in + 12) << 4 }; + shifts = simd_batch{ 0, 0, 8, 0 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 4; + + // extract 20-bit bundles 20 to 23 + words = simd_batch{ SafeLoad(in + 12) >> 16 | SafeLoad(in + 13) << 16, SafeLoad(in + 13), SafeLoad(in + 13) >> 24 | SafeLoad(in + 14) << 8, SafeLoad(in + 14) }; + shifts = simd_batch{ 0, 4, 0, 12 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 4; + + // extract 20-bit bundles 24 to 27 + words = simd_batch{ SafeLoad(in + 15), SafeLoad(in + 15) >> 20 | SafeLoad(in + 16) << 12, SafeLoad(in + 16), SafeLoad(in + 16) >> 28 | SafeLoad(in + 17) << 4 }; + shifts = simd_batch{ 0, 0, 8, 0 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 4; + + // extract 20-bit bundles 28 to 31 + words = simd_batch{ SafeLoad(in + 17) >> 16 | SafeLoad(in + 18) << 16, SafeLoad(in + 18), SafeLoad(in + 18) >> 24 | SafeLoad(in + 19) << 8, SafeLoad(in + 19) }; + shifts = simd_batch{ 0, 4, 0, 12 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 4; + + in += 20; + return in; +} + +inline static const uint32_t* unpack21_32(const uint32_t* in, uint32_t* out) { + uint32_t mask = 0x1fffff; + + simd_batch masks(mask); + simd_batch words, shifts; + simd_batch results; + + // extract 21-bit bundles 0 to 3 + words = simd_batch{ SafeLoad(in + 0), SafeLoad(in + 0) >> 21 | SafeLoad(in + 1) << 11, SafeLoad(in + 1), SafeLoad(in + 1) >> 31 | SafeLoad(in + 2) << 1 }; + shifts = simd_batch{ 0, 0, 10, 0 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 4; + + // extract 21-bit bundles 4 to 7 + words = simd_batch{ SafeLoad(in + 2) >> 20 | SafeLoad(in + 3) << 12, SafeLoad(in + 3), SafeLoad(in + 3) >> 30 | SafeLoad(in + 4) << 2, SafeLoad(in + 4) >> 19 | 
SafeLoad(in + 5) << 13 }; + shifts = simd_batch{ 0, 9, 0, 0 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 4; + + // extract 21-bit bundles 8 to 11 + words = simd_batch{ SafeLoad(in + 5), SafeLoad(in + 5) >> 29 | SafeLoad(in + 6) << 3, SafeLoad(in + 6) >> 18 | SafeLoad(in + 7) << 14, SafeLoad(in + 7) }; + shifts = simd_batch{ 8, 0, 0, 7 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 4; + + // extract 21-bit bundles 12 to 15 + words = simd_batch{ SafeLoad(in + 7) >> 28 | SafeLoad(in + 8) << 4, SafeLoad(in + 8) >> 17 | SafeLoad(in + 9) << 15, SafeLoad(in + 9), SafeLoad(in + 9) >> 27 | SafeLoad(in + 10) << 5 }; + shifts = simd_batch{ 0, 0, 6, 0 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 4; + + // extract 21-bit bundles 16 to 19 + words = simd_batch{ SafeLoad(in + 10) >> 16 | SafeLoad(in + 11) << 16, SafeLoad(in + 11), SafeLoad(in + 11) >> 26 | SafeLoad(in + 12) << 6, SafeLoad(in + 12) >> 15 | SafeLoad(in + 13) << 17 }; + shifts = simd_batch{ 0, 5, 0, 0 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 4; + + // extract 21-bit bundles 20 to 23 + words = simd_batch{ SafeLoad(in + 13), SafeLoad(in + 13) >> 25 | SafeLoad(in + 14) << 7, SafeLoad(in + 14) >> 14 | SafeLoad(in + 15) << 18, SafeLoad(in + 15) }; + shifts = simd_batch{ 4, 0, 0, 3 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 4; + + // extract 21-bit bundles 24 to 27 + words = simd_batch{ SafeLoad(in + 15) >> 24 | SafeLoad(in + 16) << 8, SafeLoad(in + 16) >> 13 | SafeLoad(in + 17) << 19, SafeLoad(in + 17), SafeLoad(in + 17) >> 23 | SafeLoad(in + 18) << 9 }; + shifts = simd_batch{ 0, 0, 2, 0 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 4; + + // extract 21-bit bundles 28 to 31 + words = simd_batch{ SafeLoad(in + 18) >> 12 | SafeLoad(in + 19) << 20, SafeLoad(in + 19), SafeLoad(in + 19) >> 22 | 
SafeLoad(in + 20) << 10, SafeLoad(in + 20) }; + shifts = simd_batch{ 0, 1, 0, 11 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 4; + + in += 21; + return in; +} + +inline static const uint32_t* unpack22_32(const uint32_t* in, uint32_t* out) { + uint32_t mask = 0x3fffff; + + simd_batch masks(mask); + simd_batch words, shifts; + simd_batch results; + + // extract 22-bit bundles 0 to 3 + words = simd_batch{ SafeLoad(in + 0), SafeLoad(in + 0) >> 22 | SafeLoad(in + 1) << 10, SafeLoad(in + 1) >> 12 | SafeLoad(in + 2) << 20, SafeLoad(in + 2) }; + shifts = simd_batch{ 0, 0, 0, 2 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 4; + + // extract 22-bit bundles 4 to 7 + words = simd_batch{ SafeLoad(in + 2) >> 24 | SafeLoad(in + 3) << 8, SafeLoad(in + 3) >> 14 | SafeLoad(in + 4) << 18, SafeLoad(in + 4), SafeLoad(in + 4) >> 26 | SafeLoad(in + 5) << 6 }; + shifts = simd_batch{ 0, 0, 4, 0 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 4; + + // extract 22-bit bundles 8 to 11 + words = simd_batch{ SafeLoad(in + 5) >> 16 | SafeLoad(in + 6) << 16, SafeLoad(in + 6), SafeLoad(in + 6) >> 28 | SafeLoad(in + 7) << 4, SafeLoad(in + 7) >> 18 | SafeLoad(in + 8) << 14 }; + shifts = simd_batch{ 0, 6, 0, 0 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 4; + + // extract 22-bit bundles 12 to 15 + words = simd_batch{ SafeLoad(in + 8), SafeLoad(in + 8) >> 30 | SafeLoad(in + 9) << 2, SafeLoad(in + 9) >> 20 | SafeLoad(in + 10) << 12, SafeLoad(in + 10) }; + shifts = simd_batch{ 8, 0, 0, 10 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 4; + + // extract 22-bit bundles 16 to 19 + words = simd_batch{ SafeLoad(in + 11), SafeLoad(in + 11) >> 22 | SafeLoad(in + 12) << 10, SafeLoad(in + 12) >> 12 | SafeLoad(in + 13) << 20, SafeLoad(in + 13) }; + shifts = simd_batch{ 0, 0, 0, 2 }; + results = (words >> shifts) & masks; + 
results.store_unaligned(out); + out += 4; + + // extract 22-bit bundles 20 to 23 + words = simd_batch{ SafeLoad(in + 13) >> 24 | SafeLoad(in + 14) << 8, SafeLoad(in + 14) >> 14 | SafeLoad(in + 15) << 18, SafeLoad(in + 15), SafeLoad(in + 15) >> 26 | SafeLoad(in + 16) << 6 }; + shifts = simd_batch{ 0, 0, 4, 0 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 4; + + // extract 22-bit bundles 24 to 27 + words = simd_batch{ SafeLoad(in + 16) >> 16 | SafeLoad(in + 17) << 16, SafeLoad(in + 17), SafeLoad(in + 17) >> 28 | SafeLoad(in + 18) << 4, SafeLoad(in + 18) >> 18 | SafeLoad(in + 19) << 14 }; + shifts = simd_batch{ 0, 6, 0, 0 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 4; + + // extract 22-bit bundles 28 to 31 + words = simd_batch{ SafeLoad(in + 19), SafeLoad(in + 19) >> 30 | SafeLoad(in + 20) << 2, SafeLoad(in + 20) >> 20 | SafeLoad(in + 21) << 12, SafeLoad(in + 21) }; + shifts = simd_batch{ 8, 0, 0, 10 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 4; + + in += 22; + return in; +} + +inline static const uint32_t* unpack23_32(const uint32_t* in, uint32_t* out) { + uint32_t mask = 0x7fffff; + + simd_batch masks(mask); + simd_batch words, shifts; + simd_batch results; + + // extract 23-bit bundles 0 to 3 + words = simd_batch{ SafeLoad(in + 0), SafeLoad(in + 0) >> 23 | SafeLoad(in + 1) << 9, SafeLoad(in + 1) >> 14 | SafeLoad(in + 2) << 18, SafeLoad(in + 2) }; + shifts = simd_batch{ 0, 0, 0, 5 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 4; + + // extract 23-bit bundles 4 to 7 + words = simd_batch{ SafeLoad(in + 2) >> 28 | SafeLoad(in + 3) << 4, SafeLoad(in + 3) >> 19 | SafeLoad(in + 4) << 13, SafeLoad(in + 4) >> 10 | SafeLoad(in + 5) << 22, SafeLoad(in + 5) }; + shifts = simd_batch{ 0, 0, 0, 1 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 4; + + // extract 23-bit bundles 8 to 11 + words = 
simd_batch{ SafeLoad(in + 5) >> 24 | SafeLoad(in + 6) << 8, SafeLoad(in + 6) >> 15 | SafeLoad(in + 7) << 17, SafeLoad(in + 7), SafeLoad(in + 7) >> 29 | SafeLoad(in + 8) << 3 }; + shifts = simd_batch{ 0, 0, 6, 0 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 4; + + // extract 23-bit bundles 12 to 15 + words = simd_batch{ SafeLoad(in + 8) >> 20 | SafeLoad(in + 9) << 12, SafeLoad(in + 9) >> 11 | SafeLoad(in + 10) << 21, SafeLoad(in + 10), SafeLoad(in + 10) >> 25 | SafeLoad(in + 11) << 7 }; + shifts = simd_batch{ 0, 0, 2, 0 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 4; + + // extract 23-bit bundles 16 to 19 + words = simd_batch{ SafeLoad(in + 11) >> 16 | SafeLoad(in + 12) << 16, SafeLoad(in + 12), SafeLoad(in + 12) >> 30 | SafeLoad(in + 13) << 2, SafeLoad(in + 13) >> 21 | SafeLoad(in + 14) << 11 }; + shifts = simd_batch{ 0, 7, 0, 0 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 4; + + // extract 23-bit bundles 20 to 23 + words = simd_batch{ SafeLoad(in + 14) >> 12 | SafeLoad(in + 15) << 20, SafeLoad(in + 15), SafeLoad(in + 15) >> 26 | SafeLoad(in + 16) << 6, SafeLoad(in + 16) >> 17 | SafeLoad(in + 17) << 15 }; + shifts = simd_batch{ 0, 3, 0, 0 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 4; + + // extract 23-bit bundles 24 to 27 + words = simd_batch{ SafeLoad(in + 17), SafeLoad(in + 17) >> 31 | SafeLoad(in + 18) << 1, SafeLoad(in + 18) >> 22 | SafeLoad(in + 19) << 10, SafeLoad(in + 19) >> 13 | SafeLoad(in + 20) << 19 }; + shifts = simd_batch{ 8, 0, 0, 0 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 4; + + // extract 23-bit bundles 28 to 31 + words = simd_batch{ SafeLoad(in + 20), SafeLoad(in + 20) >> 27 | SafeLoad(in + 21) << 5, SafeLoad(in + 21) >> 18 | SafeLoad(in + 22) << 14, SafeLoad(in + 22) }; + shifts = simd_batch{ 4, 0, 0, 9 }; + results = (words >> shifts) & masks; + 
results.store_unaligned(out); + out += 4; + + in += 23; + return in; +} + +inline static const uint32_t* unpack24_32(const uint32_t* in, uint32_t* out) { + uint32_t mask = 0xffffff; + + simd_batch masks(mask); + simd_batch words, shifts; + simd_batch results; + + // extract 24-bit bundles 0 to 3 + words = simd_batch{ SafeLoad(in + 0), SafeLoad(in + 0) >> 24 | SafeLoad(in + 1) << 8, SafeLoad(in + 1) >> 16 | SafeLoad(in + 2) << 16, SafeLoad(in + 2) }; + shifts = simd_batch{ 0, 0, 0, 8 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 4; + + // extract 24-bit bundles 4 to 7 + words = simd_batch{ SafeLoad(in + 3), SafeLoad(in + 3) >> 24 | SafeLoad(in + 4) << 8, SafeLoad(in + 4) >> 16 | SafeLoad(in + 5) << 16, SafeLoad(in + 5) }; + shifts = simd_batch{ 0, 0, 0, 8 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 4; + + // extract 24-bit bundles 8 to 11 + words = simd_batch{ SafeLoad(in + 6), SafeLoad(in + 6) >> 24 | SafeLoad(in + 7) << 8, SafeLoad(in + 7) >> 16 | SafeLoad(in + 8) << 16, SafeLoad(in + 8) }; + shifts = simd_batch{ 0, 0, 0, 8 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 4; + + // extract 24-bit bundles 12 to 15 + words = simd_batch{ SafeLoad(in + 9), SafeLoad(in + 9) >> 24 | SafeLoad(in + 10) << 8, SafeLoad(in + 10) >> 16 | SafeLoad(in + 11) << 16, SafeLoad(in + 11) }; + shifts = simd_batch{ 0, 0, 0, 8 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 4; + + // extract 24-bit bundles 16 to 19 + words = simd_batch{ SafeLoad(in + 12), SafeLoad(in + 12) >> 24 | SafeLoad(in + 13) << 8, SafeLoad(in + 13) >> 16 | SafeLoad(in + 14) << 16, SafeLoad(in + 14) }; + shifts = simd_batch{ 0, 0, 0, 8 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 4; + + // extract 24-bit bundles 20 to 23 + words = simd_batch{ SafeLoad(in + 15), SafeLoad(in + 15) >> 24 | SafeLoad(in + 16) << 8, SafeLoad(in + 16) >> 16 
| SafeLoad(in + 17) << 16, SafeLoad(in + 17) }; + shifts = simd_batch{ 0, 0, 0, 8 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 4; + + // extract 24-bit bundles 24 to 27 + words = simd_batch{ SafeLoad(in + 18), SafeLoad(in + 18) >> 24 | SafeLoad(in + 19) << 8, SafeLoad(in + 19) >> 16 | SafeLoad(in + 20) << 16, SafeLoad(in + 20) }; + shifts = simd_batch{ 0, 0, 0, 8 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 4; + + // extract 24-bit bundles 28 to 31 + words = simd_batch{ SafeLoad(in + 21), SafeLoad(in + 21) >> 24 | SafeLoad(in + 22) << 8, SafeLoad(in + 22) >> 16 | SafeLoad(in + 23) << 16, SafeLoad(in + 23) }; + shifts = simd_batch{ 0, 0, 0, 8 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 4; + + in += 24; + return in; +} + +inline static const uint32_t* unpack25_32(const uint32_t* in, uint32_t* out) { + uint32_t mask = 0x1ffffff; + + simd_batch masks(mask); + simd_batch words, shifts; + simd_batch results; + + // extract 25-bit bundles 0 to 3 + words = simd_batch{ SafeLoad(in + 0), SafeLoad(in + 0) >> 25 | SafeLoad(in + 1) << 7, SafeLoad(in + 1) >> 18 | SafeLoad(in + 2) << 14, SafeLoad(in + 2) >> 11 | SafeLoad(in + 3) << 21 }; + shifts = simd_batch{ 0, 0, 0, 0 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 4; + + // extract 25-bit bundles 4 to 7 + words = simd_batch{ SafeLoad(in + 3), SafeLoad(in + 3) >> 29 | SafeLoad(in + 4) << 3, SafeLoad(in + 4) >> 22 | SafeLoad(in + 5) << 10, SafeLoad(in + 5) >> 15 | SafeLoad(in + 6) << 17 }; + shifts = simd_batch{ 4, 0, 0, 0 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 4; + + // extract 25-bit bundles 8 to 11 + words = simd_batch{ SafeLoad(in + 6) >> 8 | SafeLoad(in + 7) << 24, SafeLoad(in + 7), SafeLoad(in + 7) >> 26 | SafeLoad(in + 8) << 6, SafeLoad(in + 8) >> 19 | SafeLoad(in + 9) << 13 }; + shifts = simd_batch{ 0, 1, 0, 0 }; + results = 
(words >> shifts) & masks; + results.store_unaligned(out); + out += 4; + + // extract 25-bit bundles 12 to 15 + words = simd_batch{ SafeLoad(in + 9) >> 12 | SafeLoad(in + 10) << 20, SafeLoad(in + 10), SafeLoad(in + 10) >> 30 | SafeLoad(in + 11) << 2, SafeLoad(in + 11) >> 23 | SafeLoad(in + 12) << 9 }; + shifts = simd_batch{ 0, 5, 0, 0 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 4; + + // extract 25-bit bundles 16 to 19 + words = simd_batch{ SafeLoad(in + 12) >> 16 | SafeLoad(in + 13) << 16, SafeLoad(in + 13) >> 9 | SafeLoad(in + 14) << 23, SafeLoad(in + 14), SafeLoad(in + 14) >> 27 | SafeLoad(in + 15) << 5 }; + shifts = simd_batch{ 0, 0, 2, 0 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 4; + + // extract 25-bit bundles 20 to 23 + words = simd_batch{ SafeLoad(in + 15) >> 20 | SafeLoad(in + 16) << 12, SafeLoad(in + 16) >> 13 | SafeLoad(in + 17) << 19, SafeLoad(in + 17), SafeLoad(in + 17) >> 31 | SafeLoad(in + 18) << 1 }; + shifts = simd_batch{ 0, 0, 6, 0 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 4; + + // extract 25-bit bundles 24 to 27 + words = simd_batch{ SafeLoad(in + 18) >> 24 | SafeLoad(in + 19) << 8, SafeLoad(in + 19) >> 17 | SafeLoad(in + 20) << 15, SafeLoad(in + 20) >> 10 | SafeLoad(in + 21) << 22, SafeLoad(in + 21) }; + shifts = simd_batch{ 0, 0, 0, 3 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 4; + + // extract 25-bit bundles 28 to 31 + words = simd_batch{ SafeLoad(in + 21) >> 28 | SafeLoad(in + 22) << 4, SafeLoad(in + 22) >> 21 | SafeLoad(in + 23) << 11, SafeLoad(in + 23) >> 14 | SafeLoad(in + 24) << 18, SafeLoad(in + 24) }; + shifts = simd_batch{ 0, 0, 0, 7 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 4; + + in += 25; + return in; +} + +inline static const uint32_t* unpack26_32(const uint32_t* in, uint32_t* out) { + uint32_t mask = 0x3ffffff; + + simd_batch 
masks(mask); + simd_batch words, shifts; + simd_batch results; + + // extract 26-bit bundles 0 to 3 + words = simd_batch{ SafeLoad(in + 0), SafeLoad(in + 0) >> 26 | SafeLoad(in + 1) << 6, SafeLoad(in + 1) >> 20 | SafeLoad(in + 2) << 12, SafeLoad(in + 2) >> 14 | SafeLoad(in + 3) << 18 }; + shifts = simd_batch{ 0, 0, 0, 0 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 4; + + // extract 26-bit bundles 4 to 7 + words = simd_batch{ SafeLoad(in + 3) >> 8 | SafeLoad(in + 4) << 24, SafeLoad(in + 4), SafeLoad(in + 4) >> 28 | SafeLoad(in + 5) << 4, SafeLoad(in + 5) >> 22 | SafeLoad(in + 6) << 10 }; + shifts = simd_batch{ 0, 2, 0, 0 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 4; + + // extract 26-bit bundles 8 to 11 + words = simd_batch{ SafeLoad(in + 6) >> 16 | SafeLoad(in + 7) << 16, SafeLoad(in + 7) >> 10 | SafeLoad(in + 8) << 22, SafeLoad(in + 8), SafeLoad(in + 8) >> 30 | SafeLoad(in + 9) << 2 }; + shifts = simd_batch{ 0, 0, 4, 0 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 4; + + // extract 26-bit bundles 12 to 15 + words = simd_batch{ SafeLoad(in + 9) >> 24 | SafeLoad(in + 10) << 8, SafeLoad(in + 10) >> 18 | SafeLoad(in + 11) << 14, SafeLoad(in + 11) >> 12 | SafeLoad(in + 12) << 20, SafeLoad(in + 12) }; + shifts = simd_batch{ 0, 0, 0, 6 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 4; + + // extract 26-bit bundles 16 to 19 + words = simd_batch{ SafeLoad(in + 13), SafeLoad(in + 13) >> 26 | SafeLoad(in + 14) << 6, SafeLoad(in + 14) >> 20 | SafeLoad(in + 15) << 12, SafeLoad(in + 15) >> 14 | SafeLoad(in + 16) << 18 }; + shifts = simd_batch{ 0, 0, 0, 0 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 4; + + // extract 26-bit bundles 20 to 23 + words = simd_batch{ SafeLoad(in + 16) >> 8 | SafeLoad(in + 17) << 24, SafeLoad(in + 17), SafeLoad(in + 17) >> 28 | SafeLoad(in + 18) << 4, SafeLoad(in + 
18) >> 22 | SafeLoad(in + 19) << 10 }; + shifts = simd_batch{ 0, 2, 0, 0 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 4; + + // extract 26-bit bundles 24 to 27 + words = simd_batch{ SafeLoad(in + 19) >> 16 | SafeLoad(in + 20) << 16, SafeLoad(in + 20) >> 10 | SafeLoad(in + 21) << 22, SafeLoad(in + 21), SafeLoad(in + 21) >> 30 | SafeLoad(in + 22) << 2 }; + shifts = simd_batch{ 0, 0, 4, 0 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 4; + + // extract 26-bit bundles 28 to 31 + words = simd_batch{ SafeLoad(in + 22) >> 24 | SafeLoad(in + 23) << 8, SafeLoad(in + 23) >> 18 | SafeLoad(in + 24) << 14, SafeLoad(in + 24) >> 12 | SafeLoad(in + 25) << 20, SafeLoad(in + 25) }; + shifts = simd_batch{ 0, 0, 0, 6 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 4; + + in += 26; + return in; +} + +inline static const uint32_t* unpack27_32(const uint32_t* in, uint32_t* out) { + uint32_t mask = 0x7ffffff; + + simd_batch masks(mask); + simd_batch words, shifts; + simd_batch results; + + // extract 27-bit bundles 0 to 3 + words = simd_batch{ SafeLoad(in + 0), SafeLoad(in + 0) >> 27 | SafeLoad(in + 1) << 5, SafeLoad(in + 1) >> 22 | SafeLoad(in + 2) << 10, SafeLoad(in + 2) >> 17 | SafeLoad(in + 3) << 15 }; + shifts = simd_batch{ 0, 0, 0, 0 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 4; + + // extract 27-bit bundles 4 to 7 + words = simd_batch{ SafeLoad(in + 3) >> 12 | SafeLoad(in + 4) << 20, SafeLoad(in + 4) >> 7 | SafeLoad(in + 5) << 25, SafeLoad(in + 5), SafeLoad(in + 5) >> 29 | SafeLoad(in + 6) << 3 }; + shifts = simd_batch{ 0, 0, 2, 0 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 4; + + // extract 27-bit bundles 8 to 11 + words = simd_batch{ SafeLoad(in + 6) >> 24 | SafeLoad(in + 7) << 8, SafeLoad(in + 7) >> 19 | SafeLoad(in + 8) << 13, SafeLoad(in + 8) >> 14 | SafeLoad(in + 9) << 18, SafeLoad(in + 9) 
>> 9 | SafeLoad(in + 10) << 23 }; + shifts = simd_batch{ 0, 0, 0, 0 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 4; + + // extract 27-bit bundles 12 to 15 + words = simd_batch{ SafeLoad(in + 10), SafeLoad(in + 10) >> 31 | SafeLoad(in + 11) << 1, SafeLoad(in + 11) >> 26 | SafeLoad(in + 12) << 6, SafeLoad(in + 12) >> 21 | SafeLoad(in + 13) << 11 }; + shifts = simd_batch{ 4, 0, 0, 0 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 4; + + // extract 27-bit bundles 16 to 19 + words = simd_batch{ SafeLoad(in + 13) >> 16 | SafeLoad(in + 14) << 16, SafeLoad(in + 14) >> 11 | SafeLoad(in + 15) << 21, SafeLoad(in + 15) >> 6 | SafeLoad(in + 16) << 26, SafeLoad(in + 16) }; + shifts = simd_batch{ 0, 0, 0, 1 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 4; + + // extract 27-bit bundles 20 to 23 + words = simd_batch{ SafeLoad(in + 16) >> 28 | SafeLoad(in + 17) << 4, SafeLoad(in + 17) >> 23 | SafeLoad(in + 18) << 9, SafeLoad(in + 18) >> 18 | SafeLoad(in + 19) << 14, SafeLoad(in + 19) >> 13 | SafeLoad(in + 20) << 19 }; + shifts = simd_batch{ 0, 0, 0, 0 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 4; + + // extract 27-bit bundles 24 to 27 + words = simd_batch{ SafeLoad(in + 20) >> 8 | SafeLoad(in + 21) << 24, SafeLoad(in + 21), SafeLoad(in + 21) >> 30 | SafeLoad(in + 22) << 2, SafeLoad(in + 22) >> 25 | SafeLoad(in + 23) << 7 }; + shifts = simd_batch{ 0, 3, 0, 0 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 4; + + // extract 27-bit bundles 28 to 31 + words = simd_batch{ SafeLoad(in + 23) >> 20 | SafeLoad(in + 24) << 12, SafeLoad(in + 24) >> 15 | SafeLoad(in + 25) << 17, SafeLoad(in + 25) >> 10 | SafeLoad(in + 26) << 22, SafeLoad(in + 26) }; + shifts = simd_batch{ 0, 0, 0, 5 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 4; + + in += 27; + return in; +} + +inline static 
const uint32_t* unpack28_32(const uint32_t* in, uint32_t* out) { + uint32_t mask = 0xfffffff; + + simd_batch masks(mask); + simd_batch words, shifts; + simd_batch results; + + // extract 28-bit bundles 0 to 3 + words = simd_batch{ SafeLoad(in + 0), SafeLoad(in + 0) >> 28 | SafeLoad(in + 1) << 4, SafeLoad(in + 1) >> 24 | SafeLoad(in + 2) << 8, SafeLoad(in + 2) >> 20 | SafeLoad(in + 3) << 12 }; + shifts = simd_batch{ 0, 0, 0, 0 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 4; + + // extract 28-bit bundles 4 to 7 + words = simd_batch{ SafeLoad(in + 3) >> 16 | SafeLoad(in + 4) << 16, SafeLoad(in + 4) >> 12 | SafeLoad(in + 5) << 20, SafeLoad(in + 5) >> 8 | SafeLoad(in + 6) << 24, SafeLoad(in + 6) }; + shifts = simd_batch{ 0, 0, 0, 4 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 4; + + // extract 28-bit bundles 8 to 11 + words = simd_batch{ SafeLoad(in + 7), SafeLoad(in + 7) >> 28 | SafeLoad(in + 8) << 4, SafeLoad(in + 8) >> 24 | SafeLoad(in + 9) << 8, SafeLoad(in + 9) >> 20 | SafeLoad(in + 10) << 12 }; + shifts = simd_batch{ 0, 0, 0, 0 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 4; + + // extract 28-bit bundles 12 to 15 + words = simd_batch{ SafeLoad(in + 10) >> 16 | SafeLoad(in + 11) << 16, SafeLoad(in + 11) >> 12 | SafeLoad(in + 12) << 20, SafeLoad(in + 12) >> 8 | SafeLoad(in + 13) << 24, SafeLoad(in + 13) }; + shifts = simd_batch{ 0, 0, 0, 4 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 4; + + // extract 28-bit bundles 16 to 19 + words = simd_batch{ SafeLoad(in + 14), SafeLoad(in + 14) >> 28 | SafeLoad(in + 15) << 4, SafeLoad(in + 15) >> 24 | SafeLoad(in + 16) << 8, SafeLoad(in + 16) >> 20 | SafeLoad(in + 17) << 12 }; + shifts = simd_batch{ 0, 0, 0, 0 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 4; + + // extract 28-bit bundles 20 to 23 + words = simd_batch{ SafeLoad(in + 17) >> 16 
| SafeLoad(in + 18) << 16, SafeLoad(in + 18) >> 12 | SafeLoad(in + 19) << 20, SafeLoad(in + 19) >> 8 | SafeLoad(in + 20) << 24, SafeLoad(in + 20) }; + shifts = simd_batch{ 0, 0, 0, 4 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 4; + + // extract 28-bit bundles 24 to 27 + words = simd_batch{ SafeLoad(in + 21), SafeLoad(in + 21) >> 28 | SafeLoad(in + 22) << 4, SafeLoad(in + 22) >> 24 | SafeLoad(in + 23) << 8, SafeLoad(in + 23) >> 20 | SafeLoad(in + 24) << 12 }; + shifts = simd_batch{ 0, 0, 0, 0 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 4; + + // extract 28-bit bundles 28 to 31 + words = simd_batch{ SafeLoad(in + 24) >> 16 | SafeLoad(in + 25) << 16, SafeLoad(in + 25) >> 12 | SafeLoad(in + 26) << 20, SafeLoad(in + 26) >> 8 | SafeLoad(in + 27) << 24, SafeLoad(in + 27) }; + shifts = simd_batch{ 0, 0, 0, 4 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 4; + + in += 28; + return in; +} + +inline static const uint32_t* unpack29_32(const uint32_t* in, uint32_t* out) { + uint32_t mask = 0x1fffffff; + + simd_batch masks(mask); + simd_batch words, shifts; + simd_batch results; + + // extract 29-bit bundles 0 to 3 + words = simd_batch{ SafeLoad(in + 0), SafeLoad(in + 0) >> 29 | SafeLoad(in + 1) << 3, SafeLoad(in + 1) >> 26 | SafeLoad(in + 2) << 6, SafeLoad(in + 2) >> 23 | SafeLoad(in + 3) << 9 }; + shifts = simd_batch{ 0, 0, 0, 0 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 4; + + // extract 29-bit bundles 4 to 7 + words = simd_batch{ SafeLoad(in + 3) >> 20 | SafeLoad(in + 4) << 12, SafeLoad(in + 4) >> 17 | SafeLoad(in + 5) << 15, SafeLoad(in + 5) >> 14 | SafeLoad(in + 6) << 18, SafeLoad(in + 6) >> 11 | SafeLoad(in + 7) << 21 }; + shifts = simd_batch{ 0, 0, 0, 0 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 4; + + // extract 29-bit bundles 8 to 11 + words = simd_batch{ SafeLoad(in + 7) >> 8 | 
SafeLoad(in + 8) << 24, SafeLoad(in + 8) >> 5 | SafeLoad(in + 9) << 27, SafeLoad(in + 9), SafeLoad(in + 9) >> 31 | SafeLoad(in + 10) << 1 }; + shifts = simd_batch{ 0, 0, 2, 0 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 4; + + // extract 29-bit bundles 12 to 15 + words = simd_batch{ SafeLoad(in + 10) >> 28 | SafeLoad(in + 11) << 4, SafeLoad(in + 11) >> 25 | SafeLoad(in + 12) << 7, SafeLoad(in + 12) >> 22 | SafeLoad(in + 13) << 10, SafeLoad(in + 13) >> 19 | SafeLoad(in + 14) << 13 }; + shifts = simd_batch{ 0, 0, 0, 0 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 4; + + // extract 29-bit bundles 16 to 19 + words = simd_batch{ SafeLoad(in + 14) >> 16 | SafeLoad(in + 15) << 16, SafeLoad(in + 15) >> 13 | SafeLoad(in + 16) << 19, SafeLoad(in + 16) >> 10 | SafeLoad(in + 17) << 22, SafeLoad(in + 17) >> 7 | SafeLoad(in + 18) << 25 }; + shifts = simd_batch{ 0, 0, 0, 0 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 4; + + // extract 29-bit bundles 20 to 23 + words = simd_batch{ SafeLoad(in + 18) >> 4 | SafeLoad(in + 19) << 28, SafeLoad(in + 19), SafeLoad(in + 19) >> 30 | SafeLoad(in + 20) << 2, SafeLoad(in + 20) >> 27 | SafeLoad(in + 21) << 5 }; + shifts = simd_batch{ 0, 1, 0, 0 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 4; + + // extract 29-bit bundles 24 to 27 + words = simd_batch{ SafeLoad(in + 21) >> 24 | SafeLoad(in + 22) << 8, SafeLoad(in + 22) >> 21 | SafeLoad(in + 23) << 11, SafeLoad(in + 23) >> 18 | SafeLoad(in + 24) << 14, SafeLoad(in + 24) >> 15 | SafeLoad(in + 25) << 17 }; + shifts = simd_batch{ 0, 0, 0, 0 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 4; + + // extract 29-bit bundles 28 to 31 + words = simd_batch{ SafeLoad(in + 25) >> 12 | SafeLoad(in + 26) << 20, SafeLoad(in + 26) >> 9 | SafeLoad(in + 27) << 23, SafeLoad(in + 27) >> 6 | SafeLoad(in + 28) << 26, SafeLoad(in + 28) 
}; + shifts = simd_batch{ 0, 0, 0, 3 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 4; + + in += 29; + return in; +} + +inline static const uint32_t* unpack30_32(const uint32_t* in, uint32_t* out) { + uint32_t mask = 0x3fffffff; + + simd_batch masks(mask); + simd_batch words, shifts; + simd_batch results; + + // extract 30-bit bundles 0 to 3 + words = simd_batch{ SafeLoad(in + 0), SafeLoad(in + 0) >> 30 | SafeLoad(in + 1) << 2, SafeLoad(in + 1) >> 28 | SafeLoad(in + 2) << 4, SafeLoad(in + 2) >> 26 | SafeLoad(in + 3) << 6 }; + shifts = simd_batch{ 0, 0, 0, 0 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 4; + + // extract 30-bit bundles 4 to 7 + words = simd_batch{ SafeLoad(in + 3) >> 24 | SafeLoad(in + 4) << 8, SafeLoad(in + 4) >> 22 | SafeLoad(in + 5) << 10, SafeLoad(in + 5) >> 20 | SafeLoad(in + 6) << 12, SafeLoad(in + 6) >> 18 | SafeLoad(in + 7) << 14 }; + shifts = simd_batch{ 0, 0, 0, 0 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 4; + + // extract 30-bit bundles 8 to 11 + words = simd_batch{ SafeLoad(in + 7) >> 16 | SafeLoad(in + 8) << 16, SafeLoad(in + 8) >> 14 | SafeLoad(in + 9) << 18, SafeLoad(in + 9) >> 12 | SafeLoad(in + 10) << 20, SafeLoad(in + 10) >> 10 | SafeLoad(in + 11) << 22 }; + shifts = simd_batch{ 0, 0, 0, 0 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 4; + + // extract 30-bit bundles 12 to 15 + words = simd_batch{ SafeLoad(in + 11) >> 8 | SafeLoad(in + 12) << 24, SafeLoad(in + 12) >> 6 | SafeLoad(in + 13) << 26, SafeLoad(in + 13) >> 4 | SafeLoad(in + 14) << 28, SafeLoad(in + 14) }; + shifts = simd_batch{ 0, 0, 0, 2 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 4; + + // extract 30-bit bundles 16 to 19 + words = simd_batch{ SafeLoad(in + 15), SafeLoad(in + 15) >> 30 | SafeLoad(in + 16) << 2, SafeLoad(in + 16) >> 28 | SafeLoad(in + 17) << 4, SafeLoad(in + 17) >> 26 
| SafeLoad(in + 18) << 6 }; + shifts = simd_batch{ 0, 0, 0, 0 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 4; + + // extract 30-bit bundles 20 to 23 + words = simd_batch{ SafeLoad(in + 18) >> 24 | SafeLoad(in + 19) << 8, SafeLoad(in + 19) >> 22 | SafeLoad(in + 20) << 10, SafeLoad(in + 20) >> 20 | SafeLoad(in + 21) << 12, SafeLoad(in + 21) >> 18 | SafeLoad(in + 22) << 14 }; + shifts = simd_batch{ 0, 0, 0, 0 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 4; + + // extract 30-bit bundles 24 to 27 + words = simd_batch{ SafeLoad(in + 22) >> 16 | SafeLoad(in + 23) << 16, SafeLoad(in + 23) >> 14 | SafeLoad(in + 24) << 18, SafeLoad(in + 24) >> 12 | SafeLoad(in + 25) << 20, SafeLoad(in + 25) >> 10 | SafeLoad(in + 26) << 22 }; + shifts = simd_batch{ 0, 0, 0, 0 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 4; + + // extract 30-bit bundles 28 to 31 + words = simd_batch{ SafeLoad(in + 26) >> 8 | SafeLoad(in + 27) << 24, SafeLoad(in + 27) >> 6 | SafeLoad(in + 28) << 26, SafeLoad(in + 28) >> 4 | SafeLoad(in + 29) << 28, SafeLoad(in + 29) }; + shifts = simd_batch{ 0, 0, 0, 2 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 4; + + in += 30; + return in; +} + +inline static const uint32_t* unpack31_32(const uint32_t* in, uint32_t* out) { + uint32_t mask = 0x7fffffff; + + simd_batch masks(mask); + simd_batch words, shifts; + simd_batch results; + + // extract 31-bit bundles 0 to 3 + words = simd_batch{ SafeLoad(in + 0), SafeLoad(in + 0) >> 31 | SafeLoad(in + 1) << 1, SafeLoad(in + 1) >> 30 | SafeLoad(in + 2) << 2, SafeLoad(in + 2) >> 29 | SafeLoad(in + 3) << 3 }; + shifts = simd_batch{ 0, 0, 0, 0 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 4; + + // extract 31-bit bundles 4 to 7 + words = simd_batch{ SafeLoad(in + 3) >> 28 | SafeLoad(in + 4) << 4, SafeLoad(in + 4) >> 27 | SafeLoad(in + 5) << 5, 
SafeLoad(in + 5) >> 26 | SafeLoad(in + 6) << 6, SafeLoad(in + 6) >> 25 | SafeLoad(in + 7) << 7 }; + shifts = simd_batch{ 0, 0, 0, 0 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 4; + + // extract 31-bit bundles 8 to 11 + words = simd_batch{ SafeLoad(in + 7) >> 24 | SafeLoad(in + 8) << 8, SafeLoad(in + 8) >> 23 | SafeLoad(in + 9) << 9, SafeLoad(in + 9) >> 22 | SafeLoad(in + 10) << 10, SafeLoad(in + 10) >> 21 | SafeLoad(in + 11) << 11 }; + shifts = simd_batch{ 0, 0, 0, 0 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 4; + + // extract 31-bit bundles 12 to 15 + words = simd_batch{ SafeLoad(in + 11) >> 20 | SafeLoad(in + 12) << 12, SafeLoad(in + 12) >> 19 | SafeLoad(in + 13) << 13, SafeLoad(in + 13) >> 18 | SafeLoad(in + 14) << 14, SafeLoad(in + 14) >> 17 | SafeLoad(in + 15) << 15 }; + shifts = simd_batch{ 0, 0, 0, 0 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 4; + + // extract 31-bit bundles 16 to 19 + words = simd_batch{ SafeLoad(in + 15) >> 16 | SafeLoad(in + 16) << 16, SafeLoad(in + 16) >> 15 | SafeLoad(in + 17) << 17, SafeLoad(in + 17) >> 14 | SafeLoad(in + 18) << 18, SafeLoad(in + 18) >> 13 | SafeLoad(in + 19) << 19 }; + shifts = simd_batch{ 0, 0, 0, 0 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 4; + + // extract 31-bit bundles 20 to 23 + words = simd_batch{ SafeLoad(in + 19) >> 12 | SafeLoad(in + 20) << 20, SafeLoad(in + 20) >> 11 | SafeLoad(in + 21) << 21, SafeLoad(in + 21) >> 10 | SafeLoad(in + 22) << 22, SafeLoad(in + 22) >> 9 | SafeLoad(in + 23) << 23 }; + shifts = simd_batch{ 0, 0, 0, 0 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 4; + + // extract 31-bit bundles 24 to 27 + words = simd_batch{ SafeLoad(in + 23) >> 8 | SafeLoad(in + 24) << 24, SafeLoad(in + 24) >> 7 | SafeLoad(in + 25) << 25, SafeLoad(in + 25) >> 6 | SafeLoad(in + 26) << 26, SafeLoad(in + 26) >> 5 | 
SafeLoad(in + 27) << 27 }; + shifts = simd_batch{ 0, 0, 0, 0 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 4; + + // extract 31-bit bundles 28 to 31 + words = simd_batch{ SafeLoad(in + 27) >> 4 | SafeLoad(in + 28) << 28, SafeLoad(in + 28) >> 3 | SafeLoad(in + 29) << 29, SafeLoad(in + 29) >> 2 | SafeLoad(in + 30) << 30, SafeLoad(in + 30) }; + shifts = simd_batch{ 0, 0, 0, 1 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 4; + + in += 31; + return in; +} + +inline static const uint32_t* unpack32_32(const uint32_t* in, uint32_t* out) { + memcpy(out, in, 32 * sizeof(*out)); + in += 32; + out += 32; + + return in; +} + +}; // struct UnpackBits128 + +} // namespace +} // namespace internal +} // namespace arrow + diff --git a/cpp/src/arrow/util/bpacking_simd256_generated.h b/cpp/src/arrow/util/bpacking_simd256_generated.h new file mode 100644 index 00000000000..a73bafe17e5 --- /dev/null +++ b/cpp/src/arrow/util/bpacking_simd256_generated.h @@ -0,0 +1,1270 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +// Automatically generated file; DO NOT EDIT. 
+ +#pragma once + +#include +#include + +#include + +#include "arrow/util/dispatch.h" +#include "arrow/util/ubsan.h" + +namespace arrow { +namespace internal { +namespace { + +using ::arrow::util::SafeLoad; + +template +struct UnpackBits256 { + +using simd_batch = xsimd::batch; + +inline static const uint32_t* unpack0_32(const uint32_t* in, uint32_t* out) { + memset(out, 0x0, 32 * sizeof(*out)); + out += 32; + + return in; +} + +inline static const uint32_t* unpack1_32(const uint32_t* in, uint32_t* out) { + uint32_t mask = 0x1; + + simd_batch masks(mask); + simd_batch words, shifts; + simd_batch results; + + // extract 1-bit bundles 0 to 7 + words = simd_batch{ SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0) }; + shifts = simd_batch{ 0, 1, 2, 3, 4, 5, 6, 7 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 8; + + // extract 1-bit bundles 8 to 15 + words = simd_batch{ SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0) }; + shifts = simd_batch{ 8, 9, 10, 11, 12, 13, 14, 15 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 8; + + // extract 1-bit bundles 16 to 23 + words = simd_batch{ SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0) }; + shifts = simd_batch{ 16, 17, 18, 19, 20, 21, 22, 23 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 8; + + // extract 1-bit bundles 24 to 31 + words = simd_batch{ SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0) }; + shifts = simd_batch{ 24, 25, 26, 27, 28, 29, 30, 31 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 8; + + in += 1; + 
return in; +} + +inline static const uint32_t* unpack2_32(const uint32_t* in, uint32_t* out) { + uint32_t mask = 0x3; + + simd_batch masks(mask); + simd_batch words, shifts; + simd_batch results; + + // extract 2-bit bundles 0 to 7 + words = simd_batch{ SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0) }; + shifts = simd_batch{ 0, 2, 4, 6, 8, 10, 12, 14 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 8; + + // extract 2-bit bundles 8 to 15 + words = simd_batch{ SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0) }; + shifts = simd_batch{ 16, 18, 20, 22, 24, 26, 28, 30 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 8; + + // extract 2-bit bundles 16 to 23 + words = simd_batch{ SafeLoad(in + 1), SafeLoad(in + 1), SafeLoad(in + 1), SafeLoad(in + 1), SafeLoad(in + 1), SafeLoad(in + 1), SafeLoad(in + 1), SafeLoad(in + 1) }; + shifts = simd_batch{ 0, 2, 4, 6, 8, 10, 12, 14 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 8; + + // extract 2-bit bundles 24 to 31 + words = simd_batch{ SafeLoad(in + 1), SafeLoad(in + 1), SafeLoad(in + 1), SafeLoad(in + 1), SafeLoad(in + 1), SafeLoad(in + 1), SafeLoad(in + 1), SafeLoad(in + 1) }; + shifts = simd_batch{ 16, 18, 20, 22, 24, 26, 28, 30 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 8; + + in += 2; + return in; +} + +inline static const uint32_t* unpack3_32(const uint32_t* in, uint32_t* out) { + uint32_t mask = 0x7; + + simd_batch masks(mask); + simd_batch words, shifts; + simd_batch results; + + // extract 3-bit bundles 0 to 7 + words = simd_batch{ SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0) }; + shifts = 
simd_batch{ 0, 3, 6, 9, 12, 15, 18, 21 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 8; + + // extract 3-bit bundles 8 to 15 + words = simd_batch{ SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0) >> 30 | SafeLoad(in + 1) << 2, SafeLoad(in + 1), SafeLoad(in + 1), SafeLoad(in + 1), SafeLoad(in + 1), SafeLoad(in + 1) }; + shifts = simd_batch{ 24, 27, 0, 1, 4, 7, 10, 13 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 8; + + // extract 3-bit bundles 16 to 23 + words = simd_batch{ SafeLoad(in + 1), SafeLoad(in + 1), SafeLoad(in + 1), SafeLoad(in + 1), SafeLoad(in + 1), SafeLoad(in + 1) >> 31 | SafeLoad(in + 2) << 1, SafeLoad(in + 2), SafeLoad(in + 2) }; + shifts = simd_batch{ 16, 19, 22, 25, 28, 0, 2, 5 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 8; + + // extract 3-bit bundles 24 to 31 + words = simd_batch{ SafeLoad(in + 2), SafeLoad(in + 2), SafeLoad(in + 2), SafeLoad(in + 2), SafeLoad(in + 2), SafeLoad(in + 2), SafeLoad(in + 2), SafeLoad(in + 2) }; + shifts = simd_batch{ 8, 11, 14, 17, 20, 23, 26, 29 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 8; + + in += 3; + return in; +} + +inline static const uint32_t* unpack4_32(const uint32_t* in, uint32_t* out) { + uint32_t mask = 0xf; + + simd_batch masks(mask); + simd_batch words, shifts; + simd_batch results; + + // extract 4-bit bundles 0 to 7 + words = simd_batch{ SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0) }; + shifts = simd_batch{ 0, 4, 8, 12, 16, 20, 24, 28 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 8; + + // extract 4-bit bundles 8 to 15 + words = simd_batch{ SafeLoad(in + 1), SafeLoad(in + 1), SafeLoad(in + 1), SafeLoad(in + 1), SafeLoad(in + 1), SafeLoad(in + 1), SafeLoad(in + 1), SafeLoad(in + 1) }; + shifts = simd_batch{ 0, 4, 
8, 12, 16, 20, 24, 28 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 8; + + // extract 4-bit bundles 16 to 23 + words = simd_batch{ SafeLoad(in + 2), SafeLoad(in + 2), SafeLoad(in + 2), SafeLoad(in + 2), SafeLoad(in + 2), SafeLoad(in + 2), SafeLoad(in + 2), SafeLoad(in + 2) }; + shifts = simd_batch{ 0, 4, 8, 12, 16, 20, 24, 28 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 8; + + // extract 4-bit bundles 24 to 31 + words = simd_batch{ SafeLoad(in + 3), SafeLoad(in + 3), SafeLoad(in + 3), SafeLoad(in + 3), SafeLoad(in + 3), SafeLoad(in + 3), SafeLoad(in + 3), SafeLoad(in + 3) }; + shifts = simd_batch{ 0, 4, 8, 12, 16, 20, 24, 28 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 8; + + in += 4; + return in; +} + +inline static const uint32_t* unpack5_32(const uint32_t* in, uint32_t* out) { + uint32_t mask = 0x1f; + + simd_batch masks(mask); + simd_batch words, shifts; + simd_batch results; + + // extract 5-bit bundles 0 to 7 + words = simd_batch{ SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0) >> 30 | SafeLoad(in + 1) << 2, SafeLoad(in + 1) }; + shifts = simd_batch{ 0, 5, 10, 15, 20, 25, 0, 3 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 8; + + // extract 5-bit bundles 8 to 15 + words = simd_batch{ SafeLoad(in + 1), SafeLoad(in + 1), SafeLoad(in + 1), SafeLoad(in + 1), SafeLoad(in + 1) >> 28 | SafeLoad(in + 2) << 4, SafeLoad(in + 2), SafeLoad(in + 2), SafeLoad(in + 2) }; + shifts = simd_batch{ 8, 13, 18, 23, 0, 1, 6, 11 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 8; + + // extract 5-bit bundles 16 to 23 + words = simd_batch{ SafeLoad(in + 2), SafeLoad(in + 2), SafeLoad(in + 2), SafeLoad(in + 2) >> 31 | SafeLoad(in + 3) << 1, SafeLoad(in + 3), SafeLoad(in + 3), SafeLoad(in + 3), SafeLoad(in + 3) }; + shifts = 
simd_batch{ 16, 21, 26, 0, 4, 9, 14, 19 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 8; + + // extract 5-bit bundles 24 to 31 + words = simd_batch{ SafeLoad(in + 3), SafeLoad(in + 3) >> 29 | SafeLoad(in + 4) << 3, SafeLoad(in + 4), SafeLoad(in + 4), SafeLoad(in + 4), SafeLoad(in + 4), SafeLoad(in + 4), SafeLoad(in + 4) }; + shifts = simd_batch{ 24, 0, 2, 7, 12, 17, 22, 27 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 8; + + in += 5; + return in; +} + +inline static const uint32_t* unpack6_32(const uint32_t* in, uint32_t* out) { + uint32_t mask = 0x3f; + + simd_batch masks(mask); + simd_batch words, shifts; + simd_batch results; + + // extract 6-bit bundles 0 to 7 + words = simd_batch{ SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0) >> 30 | SafeLoad(in + 1) << 2, SafeLoad(in + 1), SafeLoad(in + 1) }; + shifts = simd_batch{ 0, 6, 12, 18, 24, 0, 4, 10 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 8; + + // extract 6-bit bundles 8 to 15 + words = simd_batch{ SafeLoad(in + 1), SafeLoad(in + 1), SafeLoad(in + 1) >> 28 | SafeLoad(in + 2) << 4, SafeLoad(in + 2), SafeLoad(in + 2), SafeLoad(in + 2), SafeLoad(in + 2), SafeLoad(in + 2) }; + shifts = simd_batch{ 16, 22, 0, 2, 8, 14, 20, 26 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 8; + + // extract 6-bit bundles 16 to 23 + words = simd_batch{ SafeLoad(in + 3), SafeLoad(in + 3), SafeLoad(in + 3), SafeLoad(in + 3), SafeLoad(in + 3), SafeLoad(in + 3) >> 30 | SafeLoad(in + 4) << 2, SafeLoad(in + 4), SafeLoad(in + 4) }; + shifts = simd_batch{ 0, 6, 12, 18, 24, 0, 4, 10 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 8; + + // extract 6-bit bundles 24 to 31 + words = simd_batch{ SafeLoad(in + 4), SafeLoad(in + 4), SafeLoad(in + 4) >> 28 | SafeLoad(in + 5) << 4, SafeLoad(in + 5), SafeLoad(in 
+ 5), SafeLoad(in + 5), SafeLoad(in + 5), SafeLoad(in + 5) }; + shifts = simd_batch{ 16, 22, 0, 2, 8, 14, 20, 26 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 8; + + in += 6; + return in; +} + +inline static const uint32_t* unpack7_32(const uint32_t* in, uint32_t* out) { + uint32_t mask = 0x7f; + + simd_batch masks(mask); + simd_batch words, shifts; + simd_batch results; + + // extract 7-bit bundles 0 to 7 + words = simd_batch{ SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0) >> 28 | SafeLoad(in + 1) << 4, SafeLoad(in + 1), SafeLoad(in + 1), SafeLoad(in + 1) }; + shifts = simd_batch{ 0, 7, 14, 21, 0, 3, 10, 17 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 8; + + // extract 7-bit bundles 8 to 15 + words = simd_batch{ SafeLoad(in + 1), SafeLoad(in + 1) >> 31 | SafeLoad(in + 2) << 1, SafeLoad(in + 2), SafeLoad(in + 2), SafeLoad(in + 2), SafeLoad(in + 2) >> 27 | SafeLoad(in + 3) << 5, SafeLoad(in + 3), SafeLoad(in + 3) }; + shifts = simd_batch{ 24, 0, 6, 13, 20, 0, 2, 9 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 8; + + // extract 7-bit bundles 16 to 23 + words = simd_batch{ SafeLoad(in + 3), SafeLoad(in + 3), SafeLoad(in + 3) >> 30 | SafeLoad(in + 4) << 2, SafeLoad(in + 4), SafeLoad(in + 4), SafeLoad(in + 4), SafeLoad(in + 4) >> 26 | SafeLoad(in + 5) << 6, SafeLoad(in + 5) }; + shifts = simd_batch{ 16, 23, 0, 5, 12, 19, 0, 1 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 8; + + // extract 7-bit bundles 24 to 31 + words = simd_batch{ SafeLoad(in + 5), SafeLoad(in + 5), SafeLoad(in + 5), SafeLoad(in + 5) >> 29 | SafeLoad(in + 6) << 3, SafeLoad(in + 6), SafeLoad(in + 6), SafeLoad(in + 6), SafeLoad(in + 6) }; + shifts = simd_batch{ 8, 15, 22, 0, 4, 11, 18, 25 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 8; + + in += 7; + return in; +} + +inline 
static const uint32_t* unpack8_32(const uint32_t* in, uint32_t* out) { + uint32_t mask = 0xff; + + simd_batch masks(mask); + simd_batch words, shifts; + simd_batch results; + + // extract 8-bit bundles 0 to 7 + words = simd_batch{ SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 1), SafeLoad(in + 1), SafeLoad(in + 1), SafeLoad(in + 1) }; + shifts = simd_batch{ 0, 8, 16, 24, 0, 8, 16, 24 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 8; + + // extract 8-bit bundles 8 to 15 + words = simd_batch{ SafeLoad(in + 2), SafeLoad(in + 2), SafeLoad(in + 2), SafeLoad(in + 2), SafeLoad(in + 3), SafeLoad(in + 3), SafeLoad(in + 3), SafeLoad(in + 3) }; + shifts = simd_batch{ 0, 8, 16, 24, 0, 8, 16, 24 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 8; + + // extract 8-bit bundles 16 to 23 + words = simd_batch{ SafeLoad(in + 4), SafeLoad(in + 4), SafeLoad(in + 4), SafeLoad(in + 4), SafeLoad(in + 5), SafeLoad(in + 5), SafeLoad(in + 5), SafeLoad(in + 5) }; + shifts = simd_batch{ 0, 8, 16, 24, 0, 8, 16, 24 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 8; + + // extract 8-bit bundles 24 to 31 + words = simd_batch{ SafeLoad(in + 6), SafeLoad(in + 6), SafeLoad(in + 6), SafeLoad(in + 6), SafeLoad(in + 7), SafeLoad(in + 7), SafeLoad(in + 7), SafeLoad(in + 7) }; + shifts = simd_batch{ 0, 8, 16, 24, 0, 8, 16, 24 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 8; + + in += 8; + return in; +} + +inline static const uint32_t* unpack9_32(const uint32_t* in, uint32_t* out) { + uint32_t mask = 0x1ff; + + simd_batch masks(mask); + simd_batch words, shifts; + simd_batch results; + + // extract 9-bit bundles 0 to 7 + words = simd_batch{ SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0) >> 27 | SafeLoad(in + 1) << 5, SafeLoad(in + 1), SafeLoad(in + 1), SafeLoad(in + 1), SafeLoad(in + 1) >> 31 | 
SafeLoad(in + 2) << 1 }; + shifts = simd_batch{ 0, 9, 18, 0, 4, 13, 22, 0 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 8; + + // extract 9-bit bundles 8 to 15 + words = simd_batch{ SafeLoad(in + 2), SafeLoad(in + 2), SafeLoad(in + 2) >> 26 | SafeLoad(in + 3) << 6, SafeLoad(in + 3), SafeLoad(in + 3), SafeLoad(in + 3), SafeLoad(in + 3) >> 30 | SafeLoad(in + 4) << 2, SafeLoad(in + 4) }; + shifts = simd_batch{ 8, 17, 0, 3, 12, 21, 0, 7 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 8; + + // extract 9-bit bundles 16 to 23 + words = simd_batch{ SafeLoad(in + 4), SafeLoad(in + 4) >> 25 | SafeLoad(in + 5) << 7, SafeLoad(in + 5), SafeLoad(in + 5), SafeLoad(in + 5), SafeLoad(in + 5) >> 29 | SafeLoad(in + 6) << 3, SafeLoad(in + 6), SafeLoad(in + 6) }; + shifts = simd_batch{ 16, 0, 2, 11, 20, 0, 6, 15 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 8; + + // extract 9-bit bundles 24 to 31 + words = simd_batch{ SafeLoad(in + 6) >> 24 | SafeLoad(in + 7) << 8, SafeLoad(in + 7), SafeLoad(in + 7), SafeLoad(in + 7), SafeLoad(in + 7) >> 28 | SafeLoad(in + 8) << 4, SafeLoad(in + 8), SafeLoad(in + 8), SafeLoad(in + 8) }; + shifts = simd_batch{ 0, 1, 10, 19, 0, 5, 14, 23 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 8; + + in += 9; + return in; +} + +inline static const uint32_t* unpack10_32(const uint32_t* in, uint32_t* out) { + uint32_t mask = 0x3ff; + + simd_batch masks(mask); + simd_batch words, shifts; + simd_batch results; + + // extract 10-bit bundles 0 to 7 + words = simd_batch{ SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0) >> 30 | SafeLoad(in + 1) << 2, SafeLoad(in + 1), SafeLoad(in + 1), SafeLoad(in + 1) >> 28 | SafeLoad(in + 2) << 4, SafeLoad(in + 2) }; + shifts = simd_batch{ 0, 10, 20, 0, 8, 18, 0, 6 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 8; + + // extract 10-bit 
bundles 8 to 15 + words = simd_batch{ SafeLoad(in + 2), SafeLoad(in + 2) >> 26 | SafeLoad(in + 3) << 6, SafeLoad(in + 3), SafeLoad(in + 3), SafeLoad(in + 3) >> 24 | SafeLoad(in + 4) << 8, SafeLoad(in + 4), SafeLoad(in + 4), SafeLoad(in + 4) }; + shifts = simd_batch{ 16, 0, 4, 14, 0, 2, 12, 22 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 8; + + // extract 10-bit bundles 16 to 23 + words = simd_batch{ SafeLoad(in + 5), SafeLoad(in + 5), SafeLoad(in + 5), SafeLoad(in + 5) >> 30 | SafeLoad(in + 6) << 2, SafeLoad(in + 6), SafeLoad(in + 6), SafeLoad(in + 6) >> 28 | SafeLoad(in + 7) << 4, SafeLoad(in + 7) }; + shifts = simd_batch{ 0, 10, 20, 0, 8, 18, 0, 6 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 8; + + // extract 10-bit bundles 24 to 31 + words = simd_batch{ SafeLoad(in + 7), SafeLoad(in + 7) >> 26 | SafeLoad(in + 8) << 6, SafeLoad(in + 8), SafeLoad(in + 8), SafeLoad(in + 8) >> 24 | SafeLoad(in + 9) << 8, SafeLoad(in + 9), SafeLoad(in + 9), SafeLoad(in + 9) }; + shifts = simd_batch{ 16, 0, 4, 14, 0, 2, 12, 22 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 8; + + in += 10; + return in; +} + +inline static const uint32_t* unpack11_32(const uint32_t* in, uint32_t* out) { + uint32_t mask = 0x7ff; + + simd_batch masks(mask); + simd_batch words, shifts; + simd_batch results; + + // extract 11-bit bundles 0 to 7 + words = simd_batch{ SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0) >> 22 | SafeLoad(in + 1) << 10, SafeLoad(in + 1), SafeLoad(in + 1), SafeLoad(in + 1) >> 23 | SafeLoad(in + 2) << 9, SafeLoad(in + 2), SafeLoad(in + 2) }; + shifts = simd_batch{ 0, 11, 0, 1, 12, 0, 2, 13 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 8; + + // extract 11-bit bundles 8 to 15 + words = simd_batch{ SafeLoad(in + 2) >> 24 | SafeLoad(in + 3) << 8, SafeLoad(in + 3), SafeLoad(in + 3), SafeLoad(in + 3) >> 25 | SafeLoad(in + 4) << 7, 
SafeLoad(in + 4), SafeLoad(in + 4), SafeLoad(in + 4) >> 26 | SafeLoad(in + 5) << 6, SafeLoad(in + 5) }; + shifts = simd_batch{ 0, 3, 14, 0, 4, 15, 0, 5 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 8; + + // extract 11-bit bundles 16 to 23 + words = simd_batch{ SafeLoad(in + 5), SafeLoad(in + 5) >> 27 | SafeLoad(in + 6) << 5, SafeLoad(in + 6), SafeLoad(in + 6), SafeLoad(in + 6) >> 28 | SafeLoad(in + 7) << 4, SafeLoad(in + 7), SafeLoad(in + 7), SafeLoad(in + 7) >> 29 | SafeLoad(in + 8) << 3 }; + shifts = simd_batch{ 16, 0, 6, 17, 0, 7, 18, 0 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 8; + + // extract 11-bit bundles 24 to 31 + words = simd_batch{ SafeLoad(in + 8), SafeLoad(in + 8), SafeLoad(in + 8) >> 30 | SafeLoad(in + 9) << 2, SafeLoad(in + 9), SafeLoad(in + 9), SafeLoad(in + 9) >> 31 | SafeLoad(in + 10) << 1, SafeLoad(in + 10), SafeLoad(in + 10) }; + shifts = simd_batch{ 8, 19, 0, 9, 20, 0, 10, 21 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 8; + + in += 11; + return in; +} + +inline static const uint32_t* unpack12_32(const uint32_t* in, uint32_t* out) { + uint32_t mask = 0xfff; + + simd_batch masks(mask); + simd_batch words, shifts; + simd_batch results; + + // extract 12-bit bundles 0 to 7 + words = simd_batch{ SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0) >> 24 | SafeLoad(in + 1) << 8, SafeLoad(in + 1), SafeLoad(in + 1), SafeLoad(in + 1) >> 28 | SafeLoad(in + 2) << 4, SafeLoad(in + 2), SafeLoad(in + 2) }; + shifts = simd_batch{ 0, 12, 0, 4, 16, 0, 8, 20 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 8; + + // extract 12-bit bundles 8 to 15 + words = simd_batch{ SafeLoad(in + 3), SafeLoad(in + 3), SafeLoad(in + 3) >> 24 | SafeLoad(in + 4) << 8, SafeLoad(in + 4), SafeLoad(in + 4), SafeLoad(in + 4) >> 28 | SafeLoad(in + 5) << 4, SafeLoad(in + 5), SafeLoad(in + 5) }; + shifts = simd_batch{ 0, 12, 0, 4, 16, 0, 
8, 20 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 8; + + // extract 12-bit bundles 16 to 23 + words = simd_batch{ SafeLoad(in + 6), SafeLoad(in + 6), SafeLoad(in + 6) >> 24 | SafeLoad(in + 7) << 8, SafeLoad(in + 7), SafeLoad(in + 7), SafeLoad(in + 7) >> 28 | SafeLoad(in + 8) << 4, SafeLoad(in + 8), SafeLoad(in + 8) }; + shifts = simd_batch{ 0, 12, 0, 4, 16, 0, 8, 20 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 8; + + // extract 12-bit bundles 24 to 31 + words = simd_batch{ SafeLoad(in + 9), SafeLoad(in + 9), SafeLoad(in + 9) >> 24 | SafeLoad(in + 10) << 8, SafeLoad(in + 10), SafeLoad(in + 10), SafeLoad(in + 10) >> 28 | SafeLoad(in + 11) << 4, SafeLoad(in + 11), SafeLoad(in + 11) }; + shifts = simd_batch{ 0, 12, 0, 4, 16, 0, 8, 20 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 8; + + in += 12; + return in; +} + +inline static const uint32_t* unpack13_32(const uint32_t* in, uint32_t* out) { + uint32_t mask = 0x1fff; + + simd_batch masks(mask); + simd_batch words, shifts; + simd_batch results; + + // extract 13-bit bundles 0 to 7 + words = simd_batch{ SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0) >> 26 | SafeLoad(in + 1) << 6, SafeLoad(in + 1), SafeLoad(in + 1) >> 20 | SafeLoad(in + 2) << 12, SafeLoad(in + 2), SafeLoad(in + 2), SafeLoad(in + 2) >> 27 | SafeLoad(in + 3) << 5 }; + shifts = simd_batch{ 0, 13, 0, 7, 0, 1, 14, 0 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 8; + + // extract 13-bit bundles 8 to 15 + words = simd_batch{ SafeLoad(in + 3), SafeLoad(in + 3) >> 21 | SafeLoad(in + 4) << 11, SafeLoad(in + 4), SafeLoad(in + 4), SafeLoad(in + 4) >> 28 | SafeLoad(in + 5) << 4, SafeLoad(in + 5), SafeLoad(in + 5) >> 22 | SafeLoad(in + 6) << 10, SafeLoad(in + 6) }; + shifts = simd_batch{ 8, 0, 2, 15, 0, 9, 0, 3 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 8; + + // extract 
13-bit bundles 16 to 23 + words = simd_batch{ SafeLoad(in + 6), SafeLoad(in + 6) >> 29 | SafeLoad(in + 7) << 3, SafeLoad(in + 7), SafeLoad(in + 7) >> 23 | SafeLoad(in + 8) << 9, SafeLoad(in + 8), SafeLoad(in + 8), SafeLoad(in + 8) >> 30 | SafeLoad(in + 9) << 2, SafeLoad(in + 9) }; + shifts = simd_batch{ 16, 0, 10, 0, 4, 17, 0, 11 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 8; + + // extract 13-bit bundles 24 to 31 + words = simd_batch{ SafeLoad(in + 9) >> 24 | SafeLoad(in + 10) << 8, SafeLoad(in + 10), SafeLoad(in + 10), SafeLoad(in + 10) >> 31 | SafeLoad(in + 11) << 1, SafeLoad(in + 11), SafeLoad(in + 11) >> 25 | SafeLoad(in + 12) << 7, SafeLoad(in + 12), SafeLoad(in + 12) }; + shifts = simd_batch{ 0, 5, 18, 0, 12, 0, 6, 19 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 8; + + in += 13; + return in; +} + +inline static const uint32_t* unpack14_32(const uint32_t* in, uint32_t* out) { + uint32_t mask = 0x3fff; + + simd_batch masks(mask); + simd_batch words, shifts; + simd_batch results; + + // extract 14-bit bundles 0 to 7 + words = simd_batch{ SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0) >> 28 | SafeLoad(in + 1) << 4, SafeLoad(in + 1), SafeLoad(in + 1) >> 24 | SafeLoad(in + 2) << 8, SafeLoad(in + 2), SafeLoad(in + 2) >> 20 | SafeLoad(in + 3) << 12, SafeLoad(in + 3) }; + shifts = simd_batch{ 0, 14, 0, 10, 0, 6, 0, 2 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 8; + + // extract 14-bit bundles 8 to 15 + words = simd_batch{ SafeLoad(in + 3), SafeLoad(in + 3) >> 30 | SafeLoad(in + 4) << 2, SafeLoad(in + 4), SafeLoad(in + 4) >> 26 | SafeLoad(in + 5) << 6, SafeLoad(in + 5), SafeLoad(in + 5) >> 22 | SafeLoad(in + 6) << 10, SafeLoad(in + 6), SafeLoad(in + 6) }; + shifts = simd_batch{ 16, 0, 12, 0, 8, 0, 4, 18 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 8; + + // extract 14-bit bundles 16 to 23 + words = simd_batch{ 
SafeLoad(in + 7), SafeLoad(in + 7), SafeLoad(in + 7) >> 28 | SafeLoad(in + 8) << 4, SafeLoad(in + 8), SafeLoad(in + 8) >> 24 | SafeLoad(in + 9) << 8, SafeLoad(in + 9), SafeLoad(in + 9) >> 20 | SafeLoad(in + 10) << 12, SafeLoad(in + 10) }; + shifts = simd_batch{ 0, 14, 0, 10, 0, 6, 0, 2 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 8; + + // extract 14-bit bundles 24 to 31 + words = simd_batch{ SafeLoad(in + 10), SafeLoad(in + 10) >> 30 | SafeLoad(in + 11) << 2, SafeLoad(in + 11), SafeLoad(in + 11) >> 26 | SafeLoad(in + 12) << 6, SafeLoad(in + 12), SafeLoad(in + 12) >> 22 | SafeLoad(in + 13) << 10, SafeLoad(in + 13), SafeLoad(in + 13) }; + shifts = simd_batch{ 16, 0, 12, 0, 8, 0, 4, 18 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 8; + + in += 14; + return in; +} + +inline static const uint32_t* unpack15_32(const uint32_t* in, uint32_t* out) { + uint32_t mask = 0x7fff; + + simd_batch masks(mask); + simd_batch words, shifts; + simd_batch results; + + // extract 15-bit bundles 0 to 7 + words = simd_batch{ SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0) >> 30 | SafeLoad(in + 1) << 2, SafeLoad(in + 1), SafeLoad(in + 1) >> 28 | SafeLoad(in + 2) << 4, SafeLoad(in + 2), SafeLoad(in + 2) >> 26 | SafeLoad(in + 3) << 6, SafeLoad(in + 3) }; + shifts = simd_batch{ 0, 15, 0, 13, 0, 11, 0, 9 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 8; + + // extract 15-bit bundles 8 to 15 + words = simd_batch{ SafeLoad(in + 3) >> 24 | SafeLoad(in + 4) << 8, SafeLoad(in + 4), SafeLoad(in + 4) >> 22 | SafeLoad(in + 5) << 10, SafeLoad(in + 5), SafeLoad(in + 5) >> 20 | SafeLoad(in + 6) << 12, SafeLoad(in + 6), SafeLoad(in + 6) >> 18 | SafeLoad(in + 7) << 14, SafeLoad(in + 7) }; + shifts = simd_batch{ 0, 7, 0, 5, 0, 3, 0, 1 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 8; + + // extract 15-bit bundles 16 to 23 + words = simd_batch{ SafeLoad(in + 
7), SafeLoad(in + 7) >> 31 | SafeLoad(in + 8) << 1, SafeLoad(in + 8), SafeLoad(in + 8) >> 29 | SafeLoad(in + 9) << 3, SafeLoad(in + 9), SafeLoad(in + 9) >> 27 | SafeLoad(in + 10) << 5, SafeLoad(in + 10), SafeLoad(in + 10) >> 25 | SafeLoad(in + 11) << 7 }; + shifts = simd_batch{ 16, 0, 14, 0, 12, 0, 10, 0 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 8; + + // extract 15-bit bundles 24 to 31 + words = simd_batch{ SafeLoad(in + 11), SafeLoad(in + 11) >> 23 | SafeLoad(in + 12) << 9, SafeLoad(in + 12), SafeLoad(in + 12) >> 21 | SafeLoad(in + 13) << 11, SafeLoad(in + 13), SafeLoad(in + 13) >> 19 | SafeLoad(in + 14) << 13, SafeLoad(in + 14), SafeLoad(in + 14) }; + shifts = simd_batch{ 8, 0, 6, 0, 4, 0, 2, 17 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 8; + + in += 15; + return in; +} + +inline static const uint32_t* unpack16_32(const uint32_t* in, uint32_t* out) { + uint32_t mask = 0xffff; + + simd_batch masks(mask); + simd_batch words, shifts; + simd_batch results; + + // extract 16-bit bundles 0 to 7 + words = simd_batch{ SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 1), SafeLoad(in + 1), SafeLoad(in + 2), SafeLoad(in + 2), SafeLoad(in + 3), SafeLoad(in + 3) }; + shifts = simd_batch{ 0, 16, 0, 16, 0, 16, 0, 16 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 8; + + // extract 16-bit bundles 8 to 15 + words = simd_batch{ SafeLoad(in + 4), SafeLoad(in + 4), SafeLoad(in + 5), SafeLoad(in + 5), SafeLoad(in + 6), SafeLoad(in + 6), SafeLoad(in + 7), SafeLoad(in + 7) }; + shifts = simd_batch{ 0, 16, 0, 16, 0, 16, 0, 16 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 8; + + // extract 16-bit bundles 16 to 23 + words = simd_batch{ SafeLoad(in + 8), SafeLoad(in + 8), SafeLoad(in + 9), SafeLoad(in + 9), SafeLoad(in + 10), SafeLoad(in + 10), SafeLoad(in + 11), SafeLoad(in + 11) }; + shifts = simd_batch{ 0, 16, 0, 16, 0, 16, 0, 16 }; + 
results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 8; + + // extract 16-bit bundles 24 to 31 + words = simd_batch{ SafeLoad(in + 12), SafeLoad(in + 12), SafeLoad(in + 13), SafeLoad(in + 13), SafeLoad(in + 14), SafeLoad(in + 14), SafeLoad(in + 15), SafeLoad(in + 15) }; + shifts = simd_batch{ 0, 16, 0, 16, 0, 16, 0, 16 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 8; + + in += 16; + return in; +} + +inline static const uint32_t* unpack17_32(const uint32_t* in, uint32_t* out) { + uint32_t mask = 0x1ffff; + + simd_batch masks(mask); + simd_batch words, shifts; + simd_batch results; + + // extract 17-bit bundles 0 to 7 + words = simd_batch{ SafeLoad(in + 0), SafeLoad(in + 0) >> 17 | SafeLoad(in + 1) << 15, SafeLoad(in + 1), SafeLoad(in + 1) >> 19 | SafeLoad(in + 2) << 13, SafeLoad(in + 2), SafeLoad(in + 2) >> 21 | SafeLoad(in + 3) << 11, SafeLoad(in + 3), SafeLoad(in + 3) >> 23 | SafeLoad(in + 4) << 9 }; + shifts = simd_batch{ 0, 0, 2, 0, 4, 0, 6, 0 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 8; + + // extract 17-bit bundles 8 to 15 + words = simd_batch{ SafeLoad(in + 4), SafeLoad(in + 4) >> 25 | SafeLoad(in + 5) << 7, SafeLoad(in + 5), SafeLoad(in + 5) >> 27 | SafeLoad(in + 6) << 5, SafeLoad(in + 6), SafeLoad(in + 6) >> 29 | SafeLoad(in + 7) << 3, SafeLoad(in + 7), SafeLoad(in + 7) >> 31 | SafeLoad(in + 8) << 1 }; + shifts = simd_batch{ 8, 0, 10, 0, 12, 0, 14, 0 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 8; + + // extract 17-bit bundles 16 to 23 + words = simd_batch{ SafeLoad(in + 8) >> 16 | SafeLoad(in + 9) << 16, SafeLoad(in + 9), SafeLoad(in + 9) >> 18 | SafeLoad(in + 10) << 14, SafeLoad(in + 10), SafeLoad(in + 10) >> 20 | SafeLoad(in + 11) << 12, SafeLoad(in + 11), SafeLoad(in + 11) >> 22 | SafeLoad(in + 12) << 10, SafeLoad(in + 12) }; + shifts = simd_batch{ 0, 1, 0, 3, 0, 5, 0, 7 }; + results = (words >> shifts) & masks; + 
results.store_unaligned(out); + out += 8; + + // extract 17-bit bundles 24 to 31 + words = simd_batch{ SafeLoad(in + 12) >> 24 | SafeLoad(in + 13) << 8, SafeLoad(in + 13), SafeLoad(in + 13) >> 26 | SafeLoad(in + 14) << 6, SafeLoad(in + 14), SafeLoad(in + 14) >> 28 | SafeLoad(in + 15) << 4, SafeLoad(in + 15), SafeLoad(in + 15) >> 30 | SafeLoad(in + 16) << 2, SafeLoad(in + 16) }; + shifts = simd_batch{ 0, 9, 0, 11, 0, 13, 0, 15 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 8; + + in += 17; + return in; +} + +inline static const uint32_t* unpack18_32(const uint32_t* in, uint32_t* out) { + uint32_t mask = 0x3ffff; + + simd_batch masks(mask); + simd_batch words, shifts; + simd_batch results; + + // extract 18-bit bundles 0 to 7 + words = simd_batch{ SafeLoad(in + 0), SafeLoad(in + 0) >> 18 | SafeLoad(in + 1) << 14, SafeLoad(in + 1), SafeLoad(in + 1) >> 22 | SafeLoad(in + 2) << 10, SafeLoad(in + 2), SafeLoad(in + 2) >> 26 | SafeLoad(in + 3) << 6, SafeLoad(in + 3), SafeLoad(in + 3) >> 30 | SafeLoad(in + 4) << 2 }; + shifts = simd_batch{ 0, 0, 4, 0, 8, 0, 12, 0 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 8; + + // extract 18-bit bundles 8 to 15 + words = simd_batch{ SafeLoad(in + 4) >> 16 | SafeLoad(in + 5) << 16, SafeLoad(in + 5), SafeLoad(in + 5) >> 20 | SafeLoad(in + 6) << 12, SafeLoad(in + 6), SafeLoad(in + 6) >> 24 | SafeLoad(in + 7) << 8, SafeLoad(in + 7), SafeLoad(in + 7) >> 28 | SafeLoad(in + 8) << 4, SafeLoad(in + 8) }; + shifts = simd_batch{ 0, 2, 0, 6, 0, 10, 0, 14 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 8; + + // extract 18-bit bundles 16 to 23 + words = simd_batch{ SafeLoad(in + 9), SafeLoad(in + 9) >> 18 | SafeLoad(in + 10) << 14, SafeLoad(in + 10), SafeLoad(in + 10) >> 22 | SafeLoad(in + 11) << 10, SafeLoad(in + 11), SafeLoad(in + 11) >> 26 | SafeLoad(in + 12) << 6, SafeLoad(in + 12), SafeLoad(in + 12) >> 30 | SafeLoad(in + 13) << 2 }; + 
shifts = simd_batch{ 0, 0, 4, 0, 8, 0, 12, 0 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 8; + + // extract 18-bit bundles 24 to 31 + words = simd_batch{ SafeLoad(in + 13) >> 16 | SafeLoad(in + 14) << 16, SafeLoad(in + 14), SafeLoad(in + 14) >> 20 | SafeLoad(in + 15) << 12, SafeLoad(in + 15), SafeLoad(in + 15) >> 24 | SafeLoad(in + 16) << 8, SafeLoad(in + 16), SafeLoad(in + 16) >> 28 | SafeLoad(in + 17) << 4, SafeLoad(in + 17) }; + shifts = simd_batch{ 0, 2, 0, 6, 0, 10, 0, 14 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 8; + + in += 18; + return in; +} + +inline static const uint32_t* unpack19_32(const uint32_t* in, uint32_t* out) { + uint32_t mask = 0x7ffff; + + simd_batch masks(mask); + simd_batch words, shifts; + simd_batch results; + + // extract 19-bit bundles 0 to 7 + words = simd_batch{ SafeLoad(in + 0), SafeLoad(in + 0) >> 19 | SafeLoad(in + 1) << 13, SafeLoad(in + 1), SafeLoad(in + 1) >> 25 | SafeLoad(in + 2) << 7, SafeLoad(in + 2), SafeLoad(in + 2) >> 31 | SafeLoad(in + 3) << 1, SafeLoad(in + 3) >> 18 | SafeLoad(in + 4) << 14, SafeLoad(in + 4) }; + shifts = simd_batch{ 0, 0, 6, 0, 12, 0, 0, 5 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 8; + + // extract 19-bit bundles 8 to 15 + words = simd_batch{ SafeLoad(in + 4) >> 24 | SafeLoad(in + 5) << 8, SafeLoad(in + 5), SafeLoad(in + 5) >> 30 | SafeLoad(in + 6) << 2, SafeLoad(in + 6) >> 17 | SafeLoad(in + 7) << 15, SafeLoad(in + 7), SafeLoad(in + 7) >> 23 | SafeLoad(in + 8) << 9, SafeLoad(in + 8), SafeLoad(in + 8) >> 29 | SafeLoad(in + 9) << 3 }; + shifts = simd_batch{ 0, 11, 0, 0, 4, 0, 10, 0 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 8; + + // extract 19-bit bundles 16 to 23 + words = simd_batch{ SafeLoad(in + 9) >> 16 | SafeLoad(in + 10) << 16, SafeLoad(in + 10), SafeLoad(in + 10) >> 22 | SafeLoad(in + 11) << 10, SafeLoad(in + 11), SafeLoad(in + 11) >> 28 | 
SafeLoad(in + 12) << 4, SafeLoad(in + 12) >> 15 | SafeLoad(in + 13) << 17, SafeLoad(in + 13), SafeLoad(in + 13) >> 21 | SafeLoad(in + 14) << 11 }; + shifts = simd_batch{ 0, 3, 0, 9, 0, 0, 2, 0 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 8; + + // extract 19-bit bundles 24 to 31 + words = simd_batch{ SafeLoad(in + 14), SafeLoad(in + 14) >> 27 | SafeLoad(in + 15) << 5, SafeLoad(in + 15) >> 14 | SafeLoad(in + 16) << 18, SafeLoad(in + 16), SafeLoad(in + 16) >> 20 | SafeLoad(in + 17) << 12, SafeLoad(in + 17), SafeLoad(in + 17) >> 26 | SafeLoad(in + 18) << 6, SafeLoad(in + 18) }; + shifts = simd_batch{ 8, 0, 0, 1, 0, 7, 0, 13 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 8; + + in += 19; + return in; +} + +inline static const uint32_t* unpack20_32(const uint32_t* in, uint32_t* out) { + uint32_t mask = 0xfffff; + + simd_batch masks(mask); + simd_batch words, shifts; + simd_batch results; + + // extract 20-bit bundles 0 to 7 + words = simd_batch{ SafeLoad(in + 0), SafeLoad(in + 0) >> 20 | SafeLoad(in + 1) << 12, SafeLoad(in + 1), SafeLoad(in + 1) >> 28 | SafeLoad(in + 2) << 4, SafeLoad(in + 2) >> 16 | SafeLoad(in + 3) << 16, SafeLoad(in + 3), SafeLoad(in + 3) >> 24 | SafeLoad(in + 4) << 8, SafeLoad(in + 4) }; + shifts = simd_batch{ 0, 0, 8, 0, 0, 4, 0, 12 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 8; + + // extract 20-bit bundles 8 to 15 + words = simd_batch{ SafeLoad(in + 5), SafeLoad(in + 5) >> 20 | SafeLoad(in + 6) << 12, SafeLoad(in + 6), SafeLoad(in + 6) >> 28 | SafeLoad(in + 7) << 4, SafeLoad(in + 7) >> 16 | SafeLoad(in + 8) << 16, SafeLoad(in + 8), SafeLoad(in + 8) >> 24 | SafeLoad(in + 9) << 8, SafeLoad(in + 9) }; + shifts = simd_batch{ 0, 0, 8, 0, 0, 4, 0, 12 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 8; + + // extract 20-bit bundles 16 to 23 + words = simd_batch{ SafeLoad(in + 10), SafeLoad(in + 10) >> 20 | 
SafeLoad(in + 11) << 12, SafeLoad(in + 11), SafeLoad(in + 11) >> 28 | SafeLoad(in + 12) << 4, SafeLoad(in + 12) >> 16 | SafeLoad(in + 13) << 16, SafeLoad(in + 13), SafeLoad(in + 13) >> 24 | SafeLoad(in + 14) << 8, SafeLoad(in + 14) }; + shifts = simd_batch{ 0, 0, 8, 0, 0, 4, 0, 12 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 8; + + // extract 20-bit bundles 24 to 31 + words = simd_batch{ SafeLoad(in + 15), SafeLoad(in + 15) >> 20 | SafeLoad(in + 16) << 12, SafeLoad(in + 16), SafeLoad(in + 16) >> 28 | SafeLoad(in + 17) << 4, SafeLoad(in + 17) >> 16 | SafeLoad(in + 18) << 16, SafeLoad(in + 18), SafeLoad(in + 18) >> 24 | SafeLoad(in + 19) << 8, SafeLoad(in + 19) }; + shifts = simd_batch{ 0, 0, 8, 0, 0, 4, 0, 12 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 8; + + in += 20; + return in; +} + +inline static const uint32_t* unpack21_32(const uint32_t* in, uint32_t* out) { + uint32_t mask = 0x1fffff; + + simd_batch masks(mask); + simd_batch words, shifts; + simd_batch results; + + // extract 21-bit bundles 0 to 7 + words = simd_batch{ SafeLoad(in + 0), SafeLoad(in + 0) >> 21 | SafeLoad(in + 1) << 11, SafeLoad(in + 1), SafeLoad(in + 1) >> 31 | SafeLoad(in + 2) << 1, SafeLoad(in + 2) >> 20 | SafeLoad(in + 3) << 12, SafeLoad(in + 3), SafeLoad(in + 3) >> 30 | SafeLoad(in + 4) << 2, SafeLoad(in + 4) >> 19 | SafeLoad(in + 5) << 13 }; + shifts = simd_batch{ 0, 0, 10, 0, 0, 9, 0, 0 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 8; + + // extract 21-bit bundles 8 to 15 + words = simd_batch{ SafeLoad(in + 5), SafeLoad(in + 5) >> 29 | SafeLoad(in + 6) << 3, SafeLoad(in + 6) >> 18 | SafeLoad(in + 7) << 14, SafeLoad(in + 7), SafeLoad(in + 7) >> 28 | SafeLoad(in + 8) << 4, SafeLoad(in + 8) >> 17 | SafeLoad(in + 9) << 15, SafeLoad(in + 9), SafeLoad(in + 9) >> 27 | SafeLoad(in + 10) << 5 }; + shifts = simd_batch{ 8, 0, 0, 7, 0, 0, 6, 0 }; + results = (words >> shifts) & masks; + 
results.store_unaligned(out); + out += 8; + + // extract 21-bit bundles 16 to 23 + words = simd_batch{ SafeLoad(in + 10) >> 16 | SafeLoad(in + 11) << 16, SafeLoad(in + 11), SafeLoad(in + 11) >> 26 | SafeLoad(in + 12) << 6, SafeLoad(in + 12) >> 15 | SafeLoad(in + 13) << 17, SafeLoad(in + 13), SafeLoad(in + 13) >> 25 | SafeLoad(in + 14) << 7, SafeLoad(in + 14) >> 14 | SafeLoad(in + 15) << 18, SafeLoad(in + 15) }; + shifts = simd_batch{ 0, 5, 0, 0, 4, 0, 0, 3 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 8; + + // extract 21-bit bundles 24 to 31 + words = simd_batch{ SafeLoad(in + 15) >> 24 | SafeLoad(in + 16) << 8, SafeLoad(in + 16) >> 13 | SafeLoad(in + 17) << 19, SafeLoad(in + 17), SafeLoad(in + 17) >> 23 | SafeLoad(in + 18) << 9, SafeLoad(in + 18) >> 12 | SafeLoad(in + 19) << 20, SafeLoad(in + 19), SafeLoad(in + 19) >> 22 | SafeLoad(in + 20) << 10, SafeLoad(in + 20) }; + shifts = simd_batch{ 0, 0, 2, 0, 0, 1, 0, 11 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 8; + + in += 21; + return in; +} + +inline static const uint32_t* unpack22_32(const uint32_t* in, uint32_t* out) { + uint32_t mask = 0x3fffff; + + simd_batch masks(mask); + simd_batch words, shifts; + simd_batch results; + + // extract 22-bit bundles 0 to 7 + words = simd_batch{ SafeLoad(in + 0), SafeLoad(in + 0) >> 22 | SafeLoad(in + 1) << 10, SafeLoad(in + 1) >> 12 | SafeLoad(in + 2) << 20, SafeLoad(in + 2), SafeLoad(in + 2) >> 24 | SafeLoad(in + 3) << 8, SafeLoad(in + 3) >> 14 | SafeLoad(in + 4) << 18, SafeLoad(in + 4), SafeLoad(in + 4) >> 26 | SafeLoad(in + 5) << 6 }; + shifts = simd_batch{ 0, 0, 0, 2, 0, 0, 4, 0 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 8; + + // extract 22-bit bundles 8 to 15 + words = simd_batch{ SafeLoad(in + 5) >> 16 | SafeLoad(in + 6) << 16, SafeLoad(in + 6), SafeLoad(in + 6) >> 28 | SafeLoad(in + 7) << 4, SafeLoad(in + 7) >> 18 | SafeLoad(in + 8) << 14, SafeLoad(in + 
8), SafeLoad(in + 8) >> 30 | SafeLoad(in + 9) << 2, SafeLoad(in + 9) >> 20 | SafeLoad(in + 10) << 12, SafeLoad(in + 10) }; + shifts = simd_batch{ 0, 6, 0, 0, 8, 0, 0, 10 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 8; + + // extract 22-bit bundles 16 to 23 + words = simd_batch{ SafeLoad(in + 11), SafeLoad(in + 11) >> 22 | SafeLoad(in + 12) << 10, SafeLoad(in + 12) >> 12 | SafeLoad(in + 13) << 20, SafeLoad(in + 13), SafeLoad(in + 13) >> 24 | SafeLoad(in + 14) << 8, SafeLoad(in + 14) >> 14 | SafeLoad(in + 15) << 18, SafeLoad(in + 15), SafeLoad(in + 15) >> 26 | SafeLoad(in + 16) << 6 }; + shifts = simd_batch{ 0, 0, 0, 2, 0, 0, 4, 0 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 8; + + // extract 22-bit bundles 24 to 31 + words = simd_batch{ SafeLoad(in + 16) >> 16 | SafeLoad(in + 17) << 16, SafeLoad(in + 17), SafeLoad(in + 17) >> 28 | SafeLoad(in + 18) << 4, SafeLoad(in + 18) >> 18 | SafeLoad(in + 19) << 14, SafeLoad(in + 19), SafeLoad(in + 19) >> 30 | SafeLoad(in + 20) << 2, SafeLoad(in + 20) >> 20 | SafeLoad(in + 21) << 12, SafeLoad(in + 21) }; + shifts = simd_batch{ 0, 6, 0, 0, 8, 0, 0, 10 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 8; + + in += 22; + return in; +} + +inline static const uint32_t* unpack23_32(const uint32_t* in, uint32_t* out) { + uint32_t mask = 0x7fffff; + + simd_batch masks(mask); + simd_batch words, shifts; + simd_batch results; + + // extract 23-bit bundles 0 to 7 + words = simd_batch{ SafeLoad(in + 0), SafeLoad(in + 0) >> 23 | SafeLoad(in + 1) << 9, SafeLoad(in + 1) >> 14 | SafeLoad(in + 2) << 18, SafeLoad(in + 2), SafeLoad(in + 2) >> 28 | SafeLoad(in + 3) << 4, SafeLoad(in + 3) >> 19 | SafeLoad(in + 4) << 13, SafeLoad(in + 4) >> 10 | SafeLoad(in + 5) << 22, SafeLoad(in + 5) }; + shifts = simd_batch{ 0, 0, 0, 5, 0, 0, 0, 1 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 8; + + // extract 23-bit 
bundles 8 to 15 + words = simd_batch{ SafeLoad(in + 5) >> 24 | SafeLoad(in + 6) << 8, SafeLoad(in + 6) >> 15 | SafeLoad(in + 7) << 17, SafeLoad(in + 7), SafeLoad(in + 7) >> 29 | SafeLoad(in + 8) << 3, SafeLoad(in + 8) >> 20 | SafeLoad(in + 9) << 12, SafeLoad(in + 9) >> 11 | SafeLoad(in + 10) << 21, SafeLoad(in + 10), SafeLoad(in + 10) >> 25 | SafeLoad(in + 11) << 7 }; + shifts = simd_batch{ 0, 0, 6, 0, 0, 0, 2, 0 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 8; + + // extract 23-bit bundles 16 to 23 + words = simd_batch{ SafeLoad(in + 11) >> 16 | SafeLoad(in + 12) << 16, SafeLoad(in + 12), SafeLoad(in + 12) >> 30 | SafeLoad(in + 13) << 2, SafeLoad(in + 13) >> 21 | SafeLoad(in + 14) << 11, SafeLoad(in + 14) >> 12 | SafeLoad(in + 15) << 20, SafeLoad(in + 15), SafeLoad(in + 15) >> 26 | SafeLoad(in + 16) << 6, SafeLoad(in + 16) >> 17 | SafeLoad(in + 17) << 15 }; + shifts = simd_batch{ 0, 7, 0, 0, 0, 3, 0, 0 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 8; + + // extract 23-bit bundles 24 to 31 + words = simd_batch{ SafeLoad(in + 17), SafeLoad(in + 17) >> 31 | SafeLoad(in + 18) << 1, SafeLoad(in + 18) >> 22 | SafeLoad(in + 19) << 10, SafeLoad(in + 19) >> 13 | SafeLoad(in + 20) << 19, SafeLoad(in + 20), SafeLoad(in + 20) >> 27 | SafeLoad(in + 21) << 5, SafeLoad(in + 21) >> 18 | SafeLoad(in + 22) << 14, SafeLoad(in + 22) }; + shifts = simd_batch{ 8, 0, 0, 0, 4, 0, 0, 9 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 8; + + in += 23; + return in; +} + +inline static const uint32_t* unpack24_32(const uint32_t* in, uint32_t* out) { + uint32_t mask = 0xffffff; + + simd_batch masks(mask); + simd_batch words, shifts; + simd_batch results; + + // extract 24-bit bundles 0 to 7 + words = simd_batch{ SafeLoad(in + 0), SafeLoad(in + 0) >> 24 | SafeLoad(in + 1) << 8, SafeLoad(in + 1) >> 16 | SafeLoad(in + 2) << 16, SafeLoad(in + 2), SafeLoad(in + 3), SafeLoad(in + 3) >> 24 | 
SafeLoad(in + 4) << 8, SafeLoad(in + 4) >> 16 | SafeLoad(in + 5) << 16, SafeLoad(in + 5) }; + shifts = simd_batch{ 0, 0, 0, 8, 0, 0, 0, 8 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 8; + + // extract 24-bit bundles 8 to 15 + words = simd_batch{ SafeLoad(in + 6), SafeLoad(in + 6) >> 24 | SafeLoad(in + 7) << 8, SafeLoad(in + 7) >> 16 | SafeLoad(in + 8) << 16, SafeLoad(in + 8), SafeLoad(in + 9), SafeLoad(in + 9) >> 24 | SafeLoad(in + 10) << 8, SafeLoad(in + 10) >> 16 | SafeLoad(in + 11) << 16, SafeLoad(in + 11) }; + shifts = simd_batch{ 0, 0, 0, 8, 0, 0, 0, 8 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 8; + + // extract 24-bit bundles 16 to 23 + words = simd_batch{ SafeLoad(in + 12), SafeLoad(in + 12) >> 24 | SafeLoad(in + 13) << 8, SafeLoad(in + 13) >> 16 | SafeLoad(in + 14) << 16, SafeLoad(in + 14), SafeLoad(in + 15), SafeLoad(in + 15) >> 24 | SafeLoad(in + 16) << 8, SafeLoad(in + 16) >> 16 | SafeLoad(in + 17) << 16, SafeLoad(in + 17) }; + shifts = simd_batch{ 0, 0, 0, 8, 0, 0, 0, 8 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 8; + + // extract 24-bit bundles 24 to 31 + words = simd_batch{ SafeLoad(in + 18), SafeLoad(in + 18) >> 24 | SafeLoad(in + 19) << 8, SafeLoad(in + 19) >> 16 | SafeLoad(in + 20) << 16, SafeLoad(in + 20), SafeLoad(in + 21), SafeLoad(in + 21) >> 24 | SafeLoad(in + 22) << 8, SafeLoad(in + 22) >> 16 | SafeLoad(in + 23) << 16, SafeLoad(in + 23) }; + shifts = simd_batch{ 0, 0, 0, 8, 0, 0, 0, 8 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 8; + + in += 24; + return in; +} + +inline static const uint32_t* unpack25_32(const uint32_t* in, uint32_t* out) { + uint32_t mask = 0x1ffffff; + + simd_batch masks(mask); + simd_batch words, shifts; + simd_batch results; + + // extract 25-bit bundles 0 to 7 + words = simd_batch{ SafeLoad(in + 0), SafeLoad(in + 0) >> 25 | SafeLoad(in + 1) << 7, SafeLoad(in + 1) >> 
18 | SafeLoad(in + 2) << 14, SafeLoad(in + 2) >> 11 | SafeLoad(in + 3) << 21, SafeLoad(in + 3), SafeLoad(in + 3) >> 29 | SafeLoad(in + 4) << 3, SafeLoad(in + 4) >> 22 | SafeLoad(in + 5) << 10, SafeLoad(in + 5) >> 15 | SafeLoad(in + 6) << 17 }; + shifts = simd_batch{ 0, 0, 0, 0, 4, 0, 0, 0 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 8; + + // extract 25-bit bundles 8 to 15 + words = simd_batch{ SafeLoad(in + 6) >> 8 | SafeLoad(in + 7) << 24, SafeLoad(in + 7), SafeLoad(in + 7) >> 26 | SafeLoad(in + 8) << 6, SafeLoad(in + 8) >> 19 | SafeLoad(in + 9) << 13, SafeLoad(in + 9) >> 12 | SafeLoad(in + 10) << 20, SafeLoad(in + 10), SafeLoad(in + 10) >> 30 | SafeLoad(in + 11) << 2, SafeLoad(in + 11) >> 23 | SafeLoad(in + 12) << 9 }; + shifts = simd_batch{ 0, 1, 0, 0, 0, 5, 0, 0 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 8; + + // extract 25-bit bundles 16 to 23 + words = simd_batch{ SafeLoad(in + 12) >> 16 | SafeLoad(in + 13) << 16, SafeLoad(in + 13) >> 9 | SafeLoad(in + 14) << 23, SafeLoad(in + 14), SafeLoad(in + 14) >> 27 | SafeLoad(in + 15) << 5, SafeLoad(in + 15) >> 20 | SafeLoad(in + 16) << 12, SafeLoad(in + 16) >> 13 | SafeLoad(in + 17) << 19, SafeLoad(in + 17), SafeLoad(in + 17) >> 31 | SafeLoad(in + 18) << 1 }; + shifts = simd_batch{ 0, 0, 2, 0, 0, 0, 6, 0 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 8; + + // extract 25-bit bundles 24 to 31 + words = simd_batch{ SafeLoad(in + 18) >> 24 | SafeLoad(in + 19) << 8, SafeLoad(in + 19) >> 17 | SafeLoad(in + 20) << 15, SafeLoad(in + 20) >> 10 | SafeLoad(in + 21) << 22, SafeLoad(in + 21), SafeLoad(in + 21) >> 28 | SafeLoad(in + 22) << 4, SafeLoad(in + 22) >> 21 | SafeLoad(in + 23) << 11, SafeLoad(in + 23) >> 14 | SafeLoad(in + 24) << 18, SafeLoad(in + 24) }; + shifts = simd_batch{ 0, 0, 0, 3, 0, 0, 0, 7 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 8; + + in += 25; + return 
in; +} + +inline static const uint32_t* unpack26_32(const uint32_t* in, uint32_t* out) { + uint32_t mask = 0x3ffffff; + + simd_batch masks(mask); + simd_batch words, shifts; + simd_batch results; + + // extract 26-bit bundles 0 to 7 + words = simd_batch{ SafeLoad(in + 0), SafeLoad(in + 0) >> 26 | SafeLoad(in + 1) << 6, SafeLoad(in + 1) >> 20 | SafeLoad(in + 2) << 12, SafeLoad(in + 2) >> 14 | SafeLoad(in + 3) << 18, SafeLoad(in + 3) >> 8 | SafeLoad(in + 4) << 24, SafeLoad(in + 4), SafeLoad(in + 4) >> 28 | SafeLoad(in + 5) << 4, SafeLoad(in + 5) >> 22 | SafeLoad(in + 6) << 10 }; + shifts = simd_batch{ 0, 0, 0, 0, 0, 2, 0, 0 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 8; + + // extract 26-bit bundles 8 to 15 + words = simd_batch{ SafeLoad(in + 6) >> 16 | SafeLoad(in + 7) << 16, SafeLoad(in + 7) >> 10 | SafeLoad(in + 8) << 22, SafeLoad(in + 8), SafeLoad(in + 8) >> 30 | SafeLoad(in + 9) << 2, SafeLoad(in + 9) >> 24 | SafeLoad(in + 10) << 8, SafeLoad(in + 10) >> 18 | SafeLoad(in + 11) << 14, SafeLoad(in + 11) >> 12 | SafeLoad(in + 12) << 20, SafeLoad(in + 12) }; + shifts = simd_batch{ 0, 0, 4, 0, 0, 0, 0, 6 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 8; + + // extract 26-bit bundles 16 to 23 + words = simd_batch{ SafeLoad(in + 13), SafeLoad(in + 13) >> 26 | SafeLoad(in + 14) << 6, SafeLoad(in + 14) >> 20 | SafeLoad(in + 15) << 12, SafeLoad(in + 15) >> 14 | SafeLoad(in + 16) << 18, SafeLoad(in + 16) >> 8 | SafeLoad(in + 17) << 24, SafeLoad(in + 17), SafeLoad(in + 17) >> 28 | SafeLoad(in + 18) << 4, SafeLoad(in + 18) >> 22 | SafeLoad(in + 19) << 10 }; + shifts = simd_batch{ 0, 0, 0, 0, 0, 2, 0, 0 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 8; + + // extract 26-bit bundles 24 to 31 + words = simd_batch{ SafeLoad(in + 19) >> 16 | SafeLoad(in + 20) << 16, SafeLoad(in + 20) >> 10 | SafeLoad(in + 21) << 22, SafeLoad(in + 21), SafeLoad(in + 21) >> 30 | SafeLoad(in 
+ 22) << 2, SafeLoad(in + 22) >> 24 | SafeLoad(in + 23) << 8, SafeLoad(in + 23) >> 18 | SafeLoad(in + 24) << 14, SafeLoad(in + 24) >> 12 | SafeLoad(in + 25) << 20, SafeLoad(in + 25) }; + shifts = simd_batch{ 0, 0, 4, 0, 0, 0, 0, 6 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 8; + + in += 26; + return in; +} + +inline static const uint32_t* unpack27_32(const uint32_t* in, uint32_t* out) { + uint32_t mask = 0x7ffffff; + + simd_batch masks(mask); + simd_batch words, shifts; + simd_batch results; + + // extract 27-bit bundles 0 to 7 + words = simd_batch{ SafeLoad(in + 0), SafeLoad(in + 0) >> 27 | SafeLoad(in + 1) << 5, SafeLoad(in + 1) >> 22 | SafeLoad(in + 2) << 10, SafeLoad(in + 2) >> 17 | SafeLoad(in + 3) << 15, SafeLoad(in + 3) >> 12 | SafeLoad(in + 4) << 20, SafeLoad(in + 4) >> 7 | SafeLoad(in + 5) << 25, SafeLoad(in + 5), SafeLoad(in + 5) >> 29 | SafeLoad(in + 6) << 3 }; + shifts = simd_batch{ 0, 0, 0, 0, 0, 0, 2, 0 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 8; + + // extract 27-bit bundles 8 to 15 + words = simd_batch{ SafeLoad(in + 6) >> 24 | SafeLoad(in + 7) << 8, SafeLoad(in + 7) >> 19 | SafeLoad(in + 8) << 13, SafeLoad(in + 8) >> 14 | SafeLoad(in + 9) << 18, SafeLoad(in + 9) >> 9 | SafeLoad(in + 10) << 23, SafeLoad(in + 10), SafeLoad(in + 10) >> 31 | SafeLoad(in + 11) << 1, SafeLoad(in + 11) >> 26 | SafeLoad(in + 12) << 6, SafeLoad(in + 12) >> 21 | SafeLoad(in + 13) << 11 }; + shifts = simd_batch{ 0, 0, 0, 0, 4, 0, 0, 0 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 8; + + // extract 27-bit bundles 16 to 23 + words = simd_batch{ SafeLoad(in + 13) >> 16 | SafeLoad(in + 14) << 16, SafeLoad(in + 14) >> 11 | SafeLoad(in + 15) << 21, SafeLoad(in + 15) >> 6 | SafeLoad(in + 16) << 26, SafeLoad(in + 16), SafeLoad(in + 16) >> 28 | SafeLoad(in + 17) << 4, SafeLoad(in + 17) >> 23 | SafeLoad(in + 18) << 9, SafeLoad(in + 18) >> 18 | SafeLoad(in + 19) << 14, 
SafeLoad(in + 19) >> 13 | SafeLoad(in + 20) << 19 }; + shifts = simd_batch{ 0, 0, 0, 1, 0, 0, 0, 0 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 8; + + // extract 27-bit bundles 24 to 31 + words = simd_batch{ SafeLoad(in + 20) >> 8 | SafeLoad(in + 21) << 24, SafeLoad(in + 21), SafeLoad(in + 21) >> 30 | SafeLoad(in + 22) << 2, SafeLoad(in + 22) >> 25 | SafeLoad(in + 23) << 7, SafeLoad(in + 23) >> 20 | SafeLoad(in + 24) << 12, SafeLoad(in + 24) >> 15 | SafeLoad(in + 25) << 17, SafeLoad(in + 25) >> 10 | SafeLoad(in + 26) << 22, SafeLoad(in + 26) }; + shifts = simd_batch{ 0, 3, 0, 0, 0, 0, 0, 5 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 8; + + in += 27; + return in; +} + +inline static const uint32_t* unpack28_32(const uint32_t* in, uint32_t* out) { + uint32_t mask = 0xfffffff; + + simd_batch masks(mask); + simd_batch words, shifts; + simd_batch results; + + // extract 28-bit bundles 0 to 7 + words = simd_batch{ SafeLoad(in + 0), SafeLoad(in + 0) >> 28 | SafeLoad(in + 1) << 4, SafeLoad(in + 1) >> 24 | SafeLoad(in + 2) << 8, SafeLoad(in + 2) >> 20 | SafeLoad(in + 3) << 12, SafeLoad(in + 3) >> 16 | SafeLoad(in + 4) << 16, SafeLoad(in + 4) >> 12 | SafeLoad(in + 5) << 20, SafeLoad(in + 5) >> 8 | SafeLoad(in + 6) << 24, SafeLoad(in + 6) }; + shifts = simd_batch{ 0, 0, 0, 0, 0, 0, 0, 4 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 8; + + // extract 28-bit bundles 8 to 15 + words = simd_batch{ SafeLoad(in + 7), SafeLoad(in + 7) >> 28 | SafeLoad(in + 8) << 4, SafeLoad(in + 8) >> 24 | SafeLoad(in + 9) << 8, SafeLoad(in + 9) >> 20 | SafeLoad(in + 10) << 12, SafeLoad(in + 10) >> 16 | SafeLoad(in + 11) << 16, SafeLoad(in + 11) >> 12 | SafeLoad(in + 12) << 20, SafeLoad(in + 12) >> 8 | SafeLoad(in + 13) << 24, SafeLoad(in + 13) }; + shifts = simd_batch{ 0, 0, 0, 0, 0, 0, 0, 4 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 8; + + // 
extract 28-bit bundles 16 to 23 + words = simd_batch{ SafeLoad(in + 14), SafeLoad(in + 14) >> 28 | SafeLoad(in + 15) << 4, SafeLoad(in + 15) >> 24 | SafeLoad(in + 16) << 8, SafeLoad(in + 16) >> 20 | SafeLoad(in + 17) << 12, SafeLoad(in + 17) >> 16 | SafeLoad(in + 18) << 16, SafeLoad(in + 18) >> 12 | SafeLoad(in + 19) << 20, SafeLoad(in + 19) >> 8 | SafeLoad(in + 20) << 24, SafeLoad(in + 20) }; + shifts = simd_batch{ 0, 0, 0, 0, 0, 0, 0, 4 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 8; + + // extract 28-bit bundles 24 to 31 + words = simd_batch{ SafeLoad(in + 21), SafeLoad(in + 21) >> 28 | SafeLoad(in + 22) << 4, SafeLoad(in + 22) >> 24 | SafeLoad(in + 23) << 8, SafeLoad(in + 23) >> 20 | SafeLoad(in + 24) << 12, SafeLoad(in + 24) >> 16 | SafeLoad(in + 25) << 16, SafeLoad(in + 25) >> 12 | SafeLoad(in + 26) << 20, SafeLoad(in + 26) >> 8 | SafeLoad(in + 27) << 24, SafeLoad(in + 27) }; + shifts = simd_batch{ 0, 0, 0, 0, 0, 0, 0, 4 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 8; + + in += 28; + return in; +} + +inline static const uint32_t* unpack29_32(const uint32_t* in, uint32_t* out) { + uint32_t mask = 0x1fffffff; + + simd_batch masks(mask); + simd_batch words, shifts; + simd_batch results; + + // extract 29-bit bundles 0 to 7 + words = simd_batch{ SafeLoad(in + 0), SafeLoad(in + 0) >> 29 | SafeLoad(in + 1) << 3, SafeLoad(in + 1) >> 26 | SafeLoad(in + 2) << 6, SafeLoad(in + 2) >> 23 | SafeLoad(in + 3) << 9, SafeLoad(in + 3) >> 20 | SafeLoad(in + 4) << 12, SafeLoad(in + 4) >> 17 | SafeLoad(in + 5) << 15, SafeLoad(in + 5) >> 14 | SafeLoad(in + 6) << 18, SafeLoad(in + 6) >> 11 | SafeLoad(in + 7) << 21 }; + shifts = simd_batch{ 0, 0, 0, 0, 0, 0, 0, 0 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 8; + + // extract 29-bit bundles 8 to 15 + words = simd_batch{ SafeLoad(in + 7) >> 8 | SafeLoad(in + 8) << 24, SafeLoad(in + 8) >> 5 | SafeLoad(in + 9) << 27, 
SafeLoad(in + 9), SafeLoad(in + 9) >> 31 | SafeLoad(in + 10) << 1, SafeLoad(in + 10) >> 28 | SafeLoad(in + 11) << 4, SafeLoad(in + 11) >> 25 | SafeLoad(in + 12) << 7, SafeLoad(in + 12) >> 22 | SafeLoad(in + 13) << 10, SafeLoad(in + 13) >> 19 | SafeLoad(in + 14) << 13 }; + shifts = simd_batch{ 0, 0, 2, 0, 0, 0, 0, 0 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 8; + + // extract 29-bit bundles 16 to 23 + words = simd_batch{ SafeLoad(in + 14) >> 16 | SafeLoad(in + 15) << 16, SafeLoad(in + 15) >> 13 | SafeLoad(in + 16) << 19, SafeLoad(in + 16) >> 10 | SafeLoad(in + 17) << 22, SafeLoad(in + 17) >> 7 | SafeLoad(in + 18) << 25, SafeLoad(in + 18) >> 4 | SafeLoad(in + 19) << 28, SafeLoad(in + 19), SafeLoad(in + 19) >> 30 | SafeLoad(in + 20) << 2, SafeLoad(in + 20) >> 27 | SafeLoad(in + 21) << 5 }; + shifts = simd_batch{ 0, 0, 0, 0, 0, 1, 0, 0 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 8; + + // extract 29-bit bundles 24 to 31 + words = simd_batch{ SafeLoad(in + 21) >> 24 | SafeLoad(in + 22) << 8, SafeLoad(in + 22) >> 21 | SafeLoad(in + 23) << 11, SafeLoad(in + 23) >> 18 | SafeLoad(in + 24) << 14, SafeLoad(in + 24) >> 15 | SafeLoad(in + 25) << 17, SafeLoad(in + 25) >> 12 | SafeLoad(in + 26) << 20, SafeLoad(in + 26) >> 9 | SafeLoad(in + 27) << 23, SafeLoad(in + 27) >> 6 | SafeLoad(in + 28) << 26, SafeLoad(in + 28) }; + shifts = simd_batch{ 0, 0, 0, 0, 0, 0, 0, 3 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 8; + + in += 29; + return in; +} + +inline static const uint32_t* unpack30_32(const uint32_t* in, uint32_t* out) { + uint32_t mask = 0x3fffffff; + + simd_batch masks(mask); + simd_batch words, shifts; + simd_batch results; + + // extract 30-bit bundles 0 to 7 + words = simd_batch{ SafeLoad(in + 0), SafeLoad(in + 0) >> 30 | SafeLoad(in + 1) << 2, SafeLoad(in + 1) >> 28 | SafeLoad(in + 2) << 4, SafeLoad(in + 2) >> 26 | SafeLoad(in + 3) << 6, SafeLoad(in + 3) >> 
24 | SafeLoad(in + 4) << 8, SafeLoad(in + 4) >> 22 | SafeLoad(in + 5) << 10, SafeLoad(in + 5) >> 20 | SafeLoad(in + 6) << 12, SafeLoad(in + 6) >> 18 | SafeLoad(in + 7) << 14 }; + shifts = simd_batch{ 0, 0, 0, 0, 0, 0, 0, 0 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 8; + + // extract 30-bit bundles 8 to 15 + words = simd_batch{ SafeLoad(in + 7) >> 16 | SafeLoad(in + 8) << 16, SafeLoad(in + 8) >> 14 | SafeLoad(in + 9) << 18, SafeLoad(in + 9) >> 12 | SafeLoad(in + 10) << 20, SafeLoad(in + 10) >> 10 | SafeLoad(in + 11) << 22, SafeLoad(in + 11) >> 8 | SafeLoad(in + 12) << 24, SafeLoad(in + 12) >> 6 | SafeLoad(in + 13) << 26, SafeLoad(in + 13) >> 4 | SafeLoad(in + 14) << 28, SafeLoad(in + 14) }; + shifts = simd_batch{ 0, 0, 0, 0, 0, 0, 0, 2 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 8; + + // extract 30-bit bundles 16 to 23 + words = simd_batch{ SafeLoad(in + 15), SafeLoad(in + 15) >> 30 | SafeLoad(in + 16) << 2, SafeLoad(in + 16) >> 28 | SafeLoad(in + 17) << 4, SafeLoad(in + 17) >> 26 | SafeLoad(in + 18) << 6, SafeLoad(in + 18) >> 24 | SafeLoad(in + 19) << 8, SafeLoad(in + 19) >> 22 | SafeLoad(in + 20) << 10, SafeLoad(in + 20) >> 20 | SafeLoad(in + 21) << 12, SafeLoad(in + 21) >> 18 | SafeLoad(in + 22) << 14 }; + shifts = simd_batch{ 0, 0, 0, 0, 0, 0, 0, 0 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 8; + + // extract 30-bit bundles 24 to 31 + words = simd_batch{ SafeLoad(in + 22) >> 16 | SafeLoad(in + 23) << 16, SafeLoad(in + 23) >> 14 | SafeLoad(in + 24) << 18, SafeLoad(in + 24) >> 12 | SafeLoad(in + 25) << 20, SafeLoad(in + 25) >> 10 | SafeLoad(in + 26) << 22, SafeLoad(in + 26) >> 8 | SafeLoad(in + 27) << 24, SafeLoad(in + 27) >> 6 | SafeLoad(in + 28) << 26, SafeLoad(in + 28) >> 4 | SafeLoad(in + 29) << 28, SafeLoad(in + 29) }; + shifts = simd_batch{ 0, 0, 0, 0, 0, 0, 0, 2 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out 
+= 8; + + in += 30; + return in; +} + +inline static const uint32_t* unpack31_32(const uint32_t* in, uint32_t* out) { + uint32_t mask = 0x7fffffff; + + simd_batch masks(mask); + simd_batch words, shifts; + simd_batch results; + + // extract 31-bit bundles 0 to 7 + words = simd_batch{ SafeLoad(in + 0), SafeLoad(in + 0) >> 31 | SafeLoad(in + 1) << 1, SafeLoad(in + 1) >> 30 | SafeLoad(in + 2) << 2, SafeLoad(in + 2) >> 29 | SafeLoad(in + 3) << 3, SafeLoad(in + 3) >> 28 | SafeLoad(in + 4) << 4, SafeLoad(in + 4) >> 27 | SafeLoad(in + 5) << 5, SafeLoad(in + 5) >> 26 | SafeLoad(in + 6) << 6, SafeLoad(in + 6) >> 25 | SafeLoad(in + 7) << 7 }; + shifts = simd_batch{ 0, 0, 0, 0, 0, 0, 0, 0 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 8; + + // extract 31-bit bundles 8 to 15 + words = simd_batch{ SafeLoad(in + 7) >> 24 | SafeLoad(in + 8) << 8, SafeLoad(in + 8) >> 23 | SafeLoad(in + 9) << 9, SafeLoad(in + 9) >> 22 | SafeLoad(in + 10) << 10, SafeLoad(in + 10) >> 21 | SafeLoad(in + 11) << 11, SafeLoad(in + 11) >> 20 | SafeLoad(in + 12) << 12, SafeLoad(in + 12) >> 19 | SafeLoad(in + 13) << 13, SafeLoad(in + 13) >> 18 | SafeLoad(in + 14) << 14, SafeLoad(in + 14) >> 17 | SafeLoad(in + 15) << 15 }; + shifts = simd_batch{ 0, 0, 0, 0, 0, 0, 0, 0 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 8; + + // extract 31-bit bundles 16 to 23 + words = simd_batch{ SafeLoad(in + 15) >> 16 | SafeLoad(in + 16) << 16, SafeLoad(in + 16) >> 15 | SafeLoad(in + 17) << 17, SafeLoad(in + 17) >> 14 | SafeLoad(in + 18) << 18, SafeLoad(in + 18) >> 13 | SafeLoad(in + 19) << 19, SafeLoad(in + 19) >> 12 | SafeLoad(in + 20) << 20, SafeLoad(in + 20) >> 11 | SafeLoad(in + 21) << 21, SafeLoad(in + 21) >> 10 | SafeLoad(in + 22) << 22, SafeLoad(in + 22) >> 9 | SafeLoad(in + 23) << 23 }; + shifts = simd_batch{ 0, 0, 0, 0, 0, 0, 0, 0 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 8; + + // extract 31-bit bundles 
24 to 31 + words = simd_batch{ SafeLoad(in + 23) >> 8 | SafeLoad(in + 24) << 24, SafeLoad(in + 24) >> 7 | SafeLoad(in + 25) << 25, SafeLoad(in + 25) >> 6 | SafeLoad(in + 26) << 26, SafeLoad(in + 26) >> 5 | SafeLoad(in + 27) << 27, SafeLoad(in + 27) >> 4 | SafeLoad(in + 28) << 28, SafeLoad(in + 28) >> 3 | SafeLoad(in + 29) << 29, SafeLoad(in + 29) >> 2 | SafeLoad(in + 30) << 30, SafeLoad(in + 30) }; + shifts = simd_batch{ 0, 0, 0, 0, 0, 0, 0, 1 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 8; + + in += 31; + return in; +} + +inline static const uint32_t* unpack32_32(const uint32_t* in, uint32_t* out) { + memcpy(out, in, 32 * sizeof(*out)); + in += 32; + out += 32; + + return in; +} + +}; // struct UnpackBits256 + +} // namespace +} // namespace internal +} // namespace arrow + diff --git a/cpp/src/arrow/util/bpacking_simd512_generated.h b/cpp/src/arrow/util/bpacking_simd512_generated.h new file mode 100644 index 00000000000..2a62c962cd0 --- /dev/null +++ b/cpp/src/arrow/util/bpacking_simd512_generated.h @@ -0,0 +1,836 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +// Automatically generated file; DO NOT EDIT. 
+ +#pragma once + +#include +#include + +#include + +#include "arrow/util/dispatch.h" +#include "arrow/util/ubsan.h" + +namespace arrow { +namespace internal { +namespace { + +using ::arrow::util::SafeLoad; + +template +struct UnpackBits512 { + +using simd_batch = xsimd::batch; + +inline static const uint32_t* unpack0_32(const uint32_t* in, uint32_t* out) { + memset(out, 0x0, 32 * sizeof(*out)); + out += 32; + + return in; +} + +inline static const uint32_t* unpack1_32(const uint32_t* in, uint32_t* out) { + uint32_t mask = 0x1; + + simd_batch masks(mask); + simd_batch words, shifts; + simd_batch results; + + // extract 1-bit bundles 0 to 15 + words = simd_batch{ SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0) }; + shifts = simd_batch{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 16; + + // extract 1-bit bundles 16 to 31 + words = simd_batch{ SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0) }; + shifts = simd_batch{ 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 16; + + in += 1; + return in; +} + +inline static const uint32_t* unpack2_32(const uint32_t* in, uint32_t* out) { + uint32_t mask = 0x3; + + simd_batch masks(mask); + simd_batch words, shifts; + simd_batch results; + + // extract 2-bit bundles 0 to 15 + words = simd_batch{ SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0), 
SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0) }; + shifts = simd_batch{ 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 16; + + // extract 2-bit bundles 16 to 31 + words = simd_batch{ SafeLoad(in + 1), SafeLoad(in + 1), SafeLoad(in + 1), SafeLoad(in + 1), SafeLoad(in + 1), SafeLoad(in + 1), SafeLoad(in + 1), SafeLoad(in + 1), SafeLoad(in + 1), SafeLoad(in + 1), SafeLoad(in + 1), SafeLoad(in + 1), SafeLoad(in + 1), SafeLoad(in + 1), SafeLoad(in + 1), SafeLoad(in + 1) }; + shifts = simd_batch{ 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 16; + + in += 2; + return in; +} + +inline static const uint32_t* unpack3_32(const uint32_t* in, uint32_t* out) { + uint32_t mask = 0x7; + + simd_batch masks(mask); + simd_batch words, shifts; + simd_batch results; + + // extract 3-bit bundles 0 to 15 + words = simd_batch{ SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0) >> 30 | SafeLoad(in + 1) << 2, SafeLoad(in + 1), SafeLoad(in + 1), SafeLoad(in + 1), SafeLoad(in + 1), SafeLoad(in + 1) }; + shifts = simd_batch{ 0, 3, 6, 9, 12, 15, 18, 21, 24, 27, 0, 1, 4, 7, 10, 13 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 16; + + // extract 3-bit bundles 16 to 31 + words = simd_batch{ SafeLoad(in + 1), SafeLoad(in + 1), SafeLoad(in + 1), SafeLoad(in + 1), SafeLoad(in + 1), SafeLoad(in + 1) >> 31 | SafeLoad(in + 2) << 1, SafeLoad(in + 2), SafeLoad(in + 2), SafeLoad(in + 2), SafeLoad(in + 2), SafeLoad(in + 2), SafeLoad(in + 2), SafeLoad(in + 2), SafeLoad(in + 
2), SafeLoad(in + 2), SafeLoad(in + 2) }; + shifts = simd_batch{ 16, 19, 22, 25, 28, 0, 2, 5, 8, 11, 14, 17, 20, 23, 26, 29 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 16; + + in += 3; + return in; +} + +inline static const uint32_t* unpack4_32(const uint32_t* in, uint32_t* out) { + uint32_t mask = 0xf; + + simd_batch masks(mask); + simd_batch words, shifts; + simd_batch results; + + // extract 4-bit bundles 0 to 15 + words = simd_batch{ SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 1), SafeLoad(in + 1), SafeLoad(in + 1), SafeLoad(in + 1), SafeLoad(in + 1), SafeLoad(in + 1), SafeLoad(in + 1), SafeLoad(in + 1) }; + shifts = simd_batch{ 0, 4, 8, 12, 16, 20, 24, 28, 0, 4, 8, 12, 16, 20, 24, 28 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 16; + + // extract 4-bit bundles 16 to 31 + words = simd_batch{ SafeLoad(in + 2), SafeLoad(in + 2), SafeLoad(in + 2), SafeLoad(in + 2), SafeLoad(in + 2), SafeLoad(in + 2), SafeLoad(in + 2), SafeLoad(in + 2), SafeLoad(in + 3), SafeLoad(in + 3), SafeLoad(in + 3), SafeLoad(in + 3), SafeLoad(in + 3), SafeLoad(in + 3), SafeLoad(in + 3), SafeLoad(in + 3) }; + shifts = simd_batch{ 0, 4, 8, 12, 16, 20, 24, 28, 0, 4, 8, 12, 16, 20, 24, 28 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 16; + + in += 4; + return in; +} + +inline static const uint32_t* unpack5_32(const uint32_t* in, uint32_t* out) { + uint32_t mask = 0x1f; + + simd_batch masks(mask); + simd_batch words, shifts; + simd_batch results; + + // extract 5-bit bundles 0 to 15 + words = simd_batch{ SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0) >> 30 | SafeLoad(in + 1) << 2, SafeLoad(in + 1), SafeLoad(in + 1), SafeLoad(in + 1), SafeLoad(in + 1), SafeLoad(in + 1), SafeLoad(in + 1) >> 28 | 
SafeLoad(in + 2) << 4, SafeLoad(in + 2), SafeLoad(in + 2), SafeLoad(in + 2) }; + shifts = simd_batch{ 0, 5, 10, 15, 20, 25, 0, 3, 8, 13, 18, 23, 0, 1, 6, 11 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 16; + + // extract 5-bit bundles 16 to 31 + words = simd_batch{ SafeLoad(in + 2), SafeLoad(in + 2), SafeLoad(in + 2), SafeLoad(in + 2) >> 31 | SafeLoad(in + 3) << 1, SafeLoad(in + 3), SafeLoad(in + 3), SafeLoad(in + 3), SafeLoad(in + 3), SafeLoad(in + 3), SafeLoad(in + 3) >> 29 | SafeLoad(in + 4) << 3, SafeLoad(in + 4), SafeLoad(in + 4), SafeLoad(in + 4), SafeLoad(in + 4), SafeLoad(in + 4), SafeLoad(in + 4) }; + shifts = simd_batch{ 16, 21, 26, 0, 4, 9, 14, 19, 24, 0, 2, 7, 12, 17, 22, 27 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 16; + + in += 5; + return in; +} + +inline static const uint32_t* unpack6_32(const uint32_t* in, uint32_t* out) { + uint32_t mask = 0x3f; + + simd_batch masks(mask); + simd_batch words, shifts; + simd_batch results; + + // extract 6-bit bundles 0 to 15 + words = simd_batch{ SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0) >> 30 | SafeLoad(in + 1) << 2, SafeLoad(in + 1), SafeLoad(in + 1), SafeLoad(in + 1), SafeLoad(in + 1), SafeLoad(in + 1) >> 28 | SafeLoad(in + 2) << 4, SafeLoad(in + 2), SafeLoad(in + 2), SafeLoad(in + 2), SafeLoad(in + 2), SafeLoad(in + 2) }; + shifts = simd_batch{ 0, 6, 12, 18, 24, 0, 4, 10, 16, 22, 0, 2, 8, 14, 20, 26 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 16; + + // extract 6-bit bundles 16 to 31 + words = simd_batch{ SafeLoad(in + 3), SafeLoad(in + 3), SafeLoad(in + 3), SafeLoad(in + 3), SafeLoad(in + 3), SafeLoad(in + 3) >> 30 | SafeLoad(in + 4) << 2, SafeLoad(in + 4), SafeLoad(in + 4), SafeLoad(in + 4), SafeLoad(in + 4), SafeLoad(in + 4) >> 28 | SafeLoad(in + 5) << 4, SafeLoad(in + 5), SafeLoad(in + 5), SafeLoad(in + 5), SafeLoad(in + 5), 
SafeLoad(in + 5) }; + shifts = simd_batch{ 0, 6, 12, 18, 24, 0, 4, 10, 16, 22, 0, 2, 8, 14, 20, 26 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 16; + + in += 6; + return in; +} + +inline static const uint32_t* unpack7_32(const uint32_t* in, uint32_t* out) { + uint32_t mask = 0x7f; + + simd_batch masks(mask); + simd_batch words, shifts; + simd_batch results; + + // extract 7-bit bundles 0 to 15 + words = simd_batch{ SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0) >> 28 | SafeLoad(in + 1) << 4, SafeLoad(in + 1), SafeLoad(in + 1), SafeLoad(in + 1), SafeLoad(in + 1), SafeLoad(in + 1) >> 31 | SafeLoad(in + 2) << 1, SafeLoad(in + 2), SafeLoad(in + 2), SafeLoad(in + 2), SafeLoad(in + 2) >> 27 | SafeLoad(in + 3) << 5, SafeLoad(in + 3), SafeLoad(in + 3) }; + shifts = simd_batch{ 0, 7, 14, 21, 0, 3, 10, 17, 24, 0, 6, 13, 20, 0, 2, 9 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 16; + + // extract 7-bit bundles 16 to 31 + words = simd_batch{ SafeLoad(in + 3), SafeLoad(in + 3), SafeLoad(in + 3) >> 30 | SafeLoad(in + 4) << 2, SafeLoad(in + 4), SafeLoad(in + 4), SafeLoad(in + 4), SafeLoad(in + 4) >> 26 | SafeLoad(in + 5) << 6, SafeLoad(in + 5), SafeLoad(in + 5), SafeLoad(in + 5), SafeLoad(in + 5), SafeLoad(in + 5) >> 29 | SafeLoad(in + 6) << 3, SafeLoad(in + 6), SafeLoad(in + 6), SafeLoad(in + 6), SafeLoad(in + 6) }; + shifts = simd_batch{ 16, 23, 0, 5, 12, 19, 0, 1, 8, 15, 22, 0, 4, 11, 18, 25 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 16; + + in += 7; + return in; +} + +inline static const uint32_t* unpack8_32(const uint32_t* in, uint32_t* out) { + uint32_t mask = 0xff; + + simd_batch masks(mask); + simd_batch words, shifts; + simd_batch results; + + // extract 8-bit bundles 0 to 15 + words = simd_batch{ SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 1), SafeLoad(in + 1), SafeLoad(in + 
1), SafeLoad(in + 1), SafeLoad(in + 2), SafeLoad(in + 2), SafeLoad(in + 2), SafeLoad(in + 2), SafeLoad(in + 3), SafeLoad(in + 3), SafeLoad(in + 3), SafeLoad(in + 3) }; + shifts = simd_batch{ 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 16; + + // extract 8-bit bundles 16 to 31 + words = simd_batch{ SafeLoad(in + 4), SafeLoad(in + 4), SafeLoad(in + 4), SafeLoad(in + 4), SafeLoad(in + 5), SafeLoad(in + 5), SafeLoad(in + 5), SafeLoad(in + 5), SafeLoad(in + 6), SafeLoad(in + 6), SafeLoad(in + 6), SafeLoad(in + 6), SafeLoad(in + 7), SafeLoad(in + 7), SafeLoad(in + 7), SafeLoad(in + 7) }; + shifts = simd_batch{ 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 16; + + in += 8; + return in; +} + +inline static const uint32_t* unpack9_32(const uint32_t* in, uint32_t* out) { + uint32_t mask = 0x1ff; + + simd_batch masks(mask); + simd_batch words, shifts; + simd_batch results; + + // extract 9-bit bundles 0 to 15 + words = simd_batch{ SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0) >> 27 | SafeLoad(in + 1) << 5, SafeLoad(in + 1), SafeLoad(in + 1), SafeLoad(in + 1), SafeLoad(in + 1) >> 31 | SafeLoad(in + 2) << 1, SafeLoad(in + 2), SafeLoad(in + 2), SafeLoad(in + 2) >> 26 | SafeLoad(in + 3) << 6, SafeLoad(in + 3), SafeLoad(in + 3), SafeLoad(in + 3), SafeLoad(in + 3) >> 30 | SafeLoad(in + 4) << 2, SafeLoad(in + 4) }; + shifts = simd_batch{ 0, 9, 18, 0, 4, 13, 22, 0, 8, 17, 0, 3, 12, 21, 0, 7 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 16; + + // extract 9-bit bundles 16 to 31 + words = simd_batch{ SafeLoad(in + 4), SafeLoad(in + 4) >> 25 | SafeLoad(in + 5) << 7, SafeLoad(in + 5), SafeLoad(in + 5), SafeLoad(in + 5), SafeLoad(in + 5) >> 29 | SafeLoad(in + 6) << 3, SafeLoad(in + 6), SafeLoad(in + 6), SafeLoad(in + 6) >> 24 | SafeLoad(in + 7) 
<< 8, SafeLoad(in + 7), SafeLoad(in + 7), SafeLoad(in + 7), SafeLoad(in + 7) >> 28 | SafeLoad(in + 8) << 4, SafeLoad(in + 8), SafeLoad(in + 8), SafeLoad(in + 8) }; + shifts = simd_batch{ 16, 0, 2, 11, 20, 0, 6, 15, 0, 1, 10, 19, 0, 5, 14, 23 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 16; + + in += 9; + return in; +} + +inline static const uint32_t* unpack10_32(const uint32_t* in, uint32_t* out) { + uint32_t mask = 0x3ff; + + simd_batch masks(mask); + simd_batch words, shifts; + simd_batch results; + + // extract 10-bit bundles 0 to 15 + words = simd_batch{ SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0) >> 30 | SafeLoad(in + 1) << 2, SafeLoad(in + 1), SafeLoad(in + 1), SafeLoad(in + 1) >> 28 | SafeLoad(in + 2) << 4, SafeLoad(in + 2), SafeLoad(in + 2), SafeLoad(in + 2) >> 26 | SafeLoad(in + 3) << 6, SafeLoad(in + 3), SafeLoad(in + 3), SafeLoad(in + 3) >> 24 | SafeLoad(in + 4) << 8, SafeLoad(in + 4), SafeLoad(in + 4), SafeLoad(in + 4) }; + shifts = simd_batch{ 0, 10, 20, 0, 8, 18, 0, 6, 16, 0, 4, 14, 0, 2, 12, 22 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 16; + + // extract 10-bit bundles 16 to 31 + words = simd_batch{ SafeLoad(in + 5), SafeLoad(in + 5), SafeLoad(in + 5), SafeLoad(in + 5) >> 30 | SafeLoad(in + 6) << 2, SafeLoad(in + 6), SafeLoad(in + 6), SafeLoad(in + 6) >> 28 | SafeLoad(in + 7) << 4, SafeLoad(in + 7), SafeLoad(in + 7), SafeLoad(in + 7) >> 26 | SafeLoad(in + 8) << 6, SafeLoad(in + 8), SafeLoad(in + 8), SafeLoad(in + 8) >> 24 | SafeLoad(in + 9) << 8, SafeLoad(in + 9), SafeLoad(in + 9), SafeLoad(in + 9) }; + shifts = simd_batch{ 0, 10, 20, 0, 8, 18, 0, 6, 16, 0, 4, 14, 0, 2, 12, 22 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 16; + + in += 10; + return in; +} + +inline static const uint32_t* unpack11_32(const uint32_t* in, uint32_t* out) { + uint32_t mask = 0x7ff; + + simd_batch masks(mask); + simd_batch words, 
shifts; + simd_batch results; + + // extract 11-bit bundles 0 to 15 + words = simd_batch{ SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0) >> 22 | SafeLoad(in + 1) << 10, SafeLoad(in + 1), SafeLoad(in + 1), SafeLoad(in + 1) >> 23 | SafeLoad(in + 2) << 9, SafeLoad(in + 2), SafeLoad(in + 2), SafeLoad(in + 2) >> 24 | SafeLoad(in + 3) << 8, SafeLoad(in + 3), SafeLoad(in + 3), SafeLoad(in + 3) >> 25 | SafeLoad(in + 4) << 7, SafeLoad(in + 4), SafeLoad(in + 4), SafeLoad(in + 4) >> 26 | SafeLoad(in + 5) << 6, SafeLoad(in + 5) }; + shifts = simd_batch{ 0, 11, 0, 1, 12, 0, 2, 13, 0, 3, 14, 0, 4, 15, 0, 5 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 16; + + // extract 11-bit bundles 16 to 31 + words = simd_batch{ SafeLoad(in + 5), SafeLoad(in + 5) >> 27 | SafeLoad(in + 6) << 5, SafeLoad(in + 6), SafeLoad(in + 6), SafeLoad(in + 6) >> 28 | SafeLoad(in + 7) << 4, SafeLoad(in + 7), SafeLoad(in + 7), SafeLoad(in + 7) >> 29 | SafeLoad(in + 8) << 3, SafeLoad(in + 8), SafeLoad(in + 8), SafeLoad(in + 8) >> 30 | SafeLoad(in + 9) << 2, SafeLoad(in + 9), SafeLoad(in + 9), SafeLoad(in + 9) >> 31 | SafeLoad(in + 10) << 1, SafeLoad(in + 10), SafeLoad(in + 10) }; + shifts = simd_batch{ 16, 0, 6, 17, 0, 7, 18, 0, 8, 19, 0, 9, 20, 0, 10, 21 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 16; + + in += 11; + return in; +} + +inline static const uint32_t* unpack12_32(const uint32_t* in, uint32_t* out) { + uint32_t mask = 0xfff; + + simd_batch masks(mask); + simd_batch words, shifts; + simd_batch results; + + // extract 12-bit bundles 0 to 15 + words = simd_batch{ SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0) >> 24 | SafeLoad(in + 1) << 8, SafeLoad(in + 1), SafeLoad(in + 1), SafeLoad(in + 1) >> 28 | SafeLoad(in + 2) << 4, SafeLoad(in + 2), SafeLoad(in + 2), SafeLoad(in + 3), SafeLoad(in + 3), SafeLoad(in + 3) >> 24 | SafeLoad(in + 4) << 8, SafeLoad(in + 4), SafeLoad(in + 4), SafeLoad(in + 4) >> 28 | SafeLoad(in + 
5) << 4, SafeLoad(in + 5), SafeLoad(in + 5) }; + shifts = simd_batch{ 0, 12, 0, 4, 16, 0, 8, 20, 0, 12, 0, 4, 16, 0, 8, 20 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 16; + + // extract 12-bit bundles 16 to 31 + words = simd_batch{ SafeLoad(in + 6), SafeLoad(in + 6), SafeLoad(in + 6) >> 24 | SafeLoad(in + 7) << 8, SafeLoad(in + 7), SafeLoad(in + 7), SafeLoad(in + 7) >> 28 | SafeLoad(in + 8) << 4, SafeLoad(in + 8), SafeLoad(in + 8), SafeLoad(in + 9), SafeLoad(in + 9), SafeLoad(in + 9) >> 24 | SafeLoad(in + 10) << 8, SafeLoad(in + 10), SafeLoad(in + 10), SafeLoad(in + 10) >> 28 | SafeLoad(in + 11) << 4, SafeLoad(in + 11), SafeLoad(in + 11) }; + shifts = simd_batch{ 0, 12, 0, 4, 16, 0, 8, 20, 0, 12, 0, 4, 16, 0, 8, 20 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 16; + + in += 12; + return in; +} + +inline static const uint32_t* unpack13_32(const uint32_t* in, uint32_t* out) { + uint32_t mask = 0x1fff; + + simd_batch masks(mask); + simd_batch words, shifts; + simd_batch results; + + // extract 13-bit bundles 0 to 15 + words = simd_batch{ SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0) >> 26 | SafeLoad(in + 1) << 6, SafeLoad(in + 1), SafeLoad(in + 1) >> 20 | SafeLoad(in + 2) << 12, SafeLoad(in + 2), SafeLoad(in + 2), SafeLoad(in + 2) >> 27 | SafeLoad(in + 3) << 5, SafeLoad(in + 3), SafeLoad(in + 3) >> 21 | SafeLoad(in + 4) << 11, SafeLoad(in + 4), SafeLoad(in + 4), SafeLoad(in + 4) >> 28 | SafeLoad(in + 5) << 4, SafeLoad(in + 5), SafeLoad(in + 5) >> 22 | SafeLoad(in + 6) << 10, SafeLoad(in + 6) }; + shifts = simd_batch{ 0, 13, 0, 7, 0, 1, 14, 0, 8, 0, 2, 15, 0, 9, 0, 3 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 16; + + // extract 13-bit bundles 16 to 31 + words = simd_batch{ SafeLoad(in + 6), SafeLoad(in + 6) >> 29 | SafeLoad(in + 7) << 3, SafeLoad(in + 7), SafeLoad(in + 7) >> 23 | SafeLoad(in + 8) << 9, SafeLoad(in + 8), SafeLoad(in + 8), 
SafeLoad(in + 8) >> 30 | SafeLoad(in + 9) << 2, SafeLoad(in + 9), SafeLoad(in + 9) >> 24 | SafeLoad(in + 10) << 8, SafeLoad(in + 10), SafeLoad(in + 10), SafeLoad(in + 10) >> 31 | SafeLoad(in + 11) << 1, SafeLoad(in + 11), SafeLoad(in + 11) >> 25 | SafeLoad(in + 12) << 7, SafeLoad(in + 12), SafeLoad(in + 12) }; + shifts = simd_batch{ 16, 0, 10, 0, 4, 17, 0, 11, 0, 5, 18, 0, 12, 0, 6, 19 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 16; + + in += 13; + return in; +} + +inline static const uint32_t* unpack14_32(const uint32_t* in, uint32_t* out) { + uint32_t mask = 0x3fff; + + simd_batch masks(mask); + simd_batch words, shifts; + simd_batch results; + + // extract 14-bit bundles 0 to 15 + words = simd_batch{ SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0) >> 28 | SafeLoad(in + 1) << 4, SafeLoad(in + 1), SafeLoad(in + 1) >> 24 | SafeLoad(in + 2) << 8, SafeLoad(in + 2), SafeLoad(in + 2) >> 20 | SafeLoad(in + 3) << 12, SafeLoad(in + 3), SafeLoad(in + 3), SafeLoad(in + 3) >> 30 | SafeLoad(in + 4) << 2, SafeLoad(in + 4), SafeLoad(in + 4) >> 26 | SafeLoad(in + 5) << 6, SafeLoad(in + 5), SafeLoad(in + 5) >> 22 | SafeLoad(in + 6) << 10, SafeLoad(in + 6), SafeLoad(in + 6) }; + shifts = simd_batch{ 0, 14, 0, 10, 0, 6, 0, 2, 16, 0, 12, 0, 8, 0, 4, 18 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 16; + + // extract 14-bit bundles 16 to 31 + words = simd_batch{ SafeLoad(in + 7), SafeLoad(in + 7), SafeLoad(in + 7) >> 28 | SafeLoad(in + 8) << 4, SafeLoad(in + 8), SafeLoad(in + 8) >> 24 | SafeLoad(in + 9) << 8, SafeLoad(in + 9), SafeLoad(in + 9) >> 20 | SafeLoad(in + 10) << 12, SafeLoad(in + 10), SafeLoad(in + 10), SafeLoad(in + 10) >> 30 | SafeLoad(in + 11) << 2, SafeLoad(in + 11), SafeLoad(in + 11) >> 26 | SafeLoad(in + 12) << 6, SafeLoad(in + 12), SafeLoad(in + 12) >> 22 | SafeLoad(in + 13) << 10, SafeLoad(in + 13), SafeLoad(in + 13) }; + shifts = simd_batch{ 0, 14, 0, 10, 0, 6, 0, 2, 16, 0, 12, 0, 8, 
0, 4, 18 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 16; + + in += 14; + return in; +} + +inline static const uint32_t* unpack15_32(const uint32_t* in, uint32_t* out) { + uint32_t mask = 0x7fff; + + simd_batch masks(mask); + simd_batch words, shifts; + simd_batch results; + + // extract 15-bit bundles 0 to 15 + words = simd_batch{ SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 0) >> 30 | SafeLoad(in + 1) << 2, SafeLoad(in + 1), SafeLoad(in + 1) >> 28 | SafeLoad(in + 2) << 4, SafeLoad(in + 2), SafeLoad(in + 2) >> 26 | SafeLoad(in + 3) << 6, SafeLoad(in + 3), SafeLoad(in + 3) >> 24 | SafeLoad(in + 4) << 8, SafeLoad(in + 4), SafeLoad(in + 4) >> 22 | SafeLoad(in + 5) << 10, SafeLoad(in + 5), SafeLoad(in + 5) >> 20 | SafeLoad(in + 6) << 12, SafeLoad(in + 6), SafeLoad(in + 6) >> 18 | SafeLoad(in + 7) << 14, SafeLoad(in + 7) }; + shifts = simd_batch{ 0, 15, 0, 13, 0, 11, 0, 9, 0, 7, 0, 5, 0, 3, 0, 1 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 16; + + // extract 15-bit bundles 16 to 31 + words = simd_batch{ SafeLoad(in + 7), SafeLoad(in + 7) >> 31 | SafeLoad(in + 8) << 1, SafeLoad(in + 8), SafeLoad(in + 8) >> 29 | SafeLoad(in + 9) << 3, SafeLoad(in + 9), SafeLoad(in + 9) >> 27 | SafeLoad(in + 10) << 5, SafeLoad(in + 10), SafeLoad(in + 10) >> 25 | SafeLoad(in + 11) << 7, SafeLoad(in + 11), SafeLoad(in + 11) >> 23 | SafeLoad(in + 12) << 9, SafeLoad(in + 12), SafeLoad(in + 12) >> 21 | SafeLoad(in + 13) << 11, SafeLoad(in + 13), SafeLoad(in + 13) >> 19 | SafeLoad(in + 14) << 13, SafeLoad(in + 14), SafeLoad(in + 14) }; + shifts = simd_batch{ 16, 0, 14, 0, 12, 0, 10, 0, 8, 0, 6, 0, 4, 0, 2, 17 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 16; + + in += 15; + return in; +} + +inline static const uint32_t* unpack16_32(const uint32_t* in, uint32_t* out) { + uint32_t mask = 0xffff; + + simd_batch masks(mask); + simd_batch words, shifts; + simd_batch results; + + // 
extract 16-bit bundles 0 to 15 + words = simd_batch{ SafeLoad(in + 0), SafeLoad(in + 0), SafeLoad(in + 1), SafeLoad(in + 1), SafeLoad(in + 2), SafeLoad(in + 2), SafeLoad(in + 3), SafeLoad(in + 3), SafeLoad(in + 4), SafeLoad(in + 4), SafeLoad(in + 5), SafeLoad(in + 5), SafeLoad(in + 6), SafeLoad(in + 6), SafeLoad(in + 7), SafeLoad(in + 7) }; + shifts = simd_batch{ 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 16; + + // extract 16-bit bundles 16 to 31 + words = simd_batch{ SafeLoad(in + 8), SafeLoad(in + 8), SafeLoad(in + 9), SafeLoad(in + 9), SafeLoad(in + 10), SafeLoad(in + 10), SafeLoad(in + 11), SafeLoad(in + 11), SafeLoad(in + 12), SafeLoad(in + 12), SafeLoad(in + 13), SafeLoad(in + 13), SafeLoad(in + 14), SafeLoad(in + 14), SafeLoad(in + 15), SafeLoad(in + 15) }; + shifts = simd_batch{ 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 16; + + in += 16; + return in; +} + +inline static const uint32_t* unpack17_32(const uint32_t* in, uint32_t* out) { + uint32_t mask = 0x1ffff; + + simd_batch masks(mask); + simd_batch words, shifts; + simd_batch results; + + // extract 17-bit bundles 0 to 15 + words = simd_batch{ SafeLoad(in + 0), SafeLoad(in + 0) >> 17 | SafeLoad(in + 1) << 15, SafeLoad(in + 1), SafeLoad(in + 1) >> 19 | SafeLoad(in + 2) << 13, SafeLoad(in + 2), SafeLoad(in + 2) >> 21 | SafeLoad(in + 3) << 11, SafeLoad(in + 3), SafeLoad(in + 3) >> 23 | SafeLoad(in + 4) << 9, SafeLoad(in + 4), SafeLoad(in + 4) >> 25 | SafeLoad(in + 5) << 7, SafeLoad(in + 5), SafeLoad(in + 5) >> 27 | SafeLoad(in + 6) << 5, SafeLoad(in + 6), SafeLoad(in + 6) >> 29 | SafeLoad(in + 7) << 3, SafeLoad(in + 7), SafeLoad(in + 7) >> 31 | SafeLoad(in + 8) << 1 }; + shifts = simd_batch{ 0, 0, 2, 0, 4, 0, 6, 0, 8, 0, 10, 0, 12, 0, 14, 0 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out 
+= 16; + + // extract 17-bit bundles 16 to 31 + words = simd_batch{ SafeLoad(in + 8) >> 16 | SafeLoad(in + 9) << 16, SafeLoad(in + 9), SafeLoad(in + 9) >> 18 | SafeLoad(in + 10) << 14, SafeLoad(in + 10), SafeLoad(in + 10) >> 20 | SafeLoad(in + 11) << 12, SafeLoad(in + 11), SafeLoad(in + 11) >> 22 | SafeLoad(in + 12) << 10, SafeLoad(in + 12), SafeLoad(in + 12) >> 24 | SafeLoad(in + 13) << 8, SafeLoad(in + 13), SafeLoad(in + 13) >> 26 | SafeLoad(in + 14) << 6, SafeLoad(in + 14), SafeLoad(in + 14) >> 28 | SafeLoad(in + 15) << 4, SafeLoad(in + 15), SafeLoad(in + 15) >> 30 | SafeLoad(in + 16) << 2, SafeLoad(in + 16) }; + shifts = simd_batch{ 0, 1, 0, 3, 0, 5, 0, 7, 0, 9, 0, 11, 0, 13, 0, 15 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 16; + + in += 17; + return in; +} + +inline static const uint32_t* unpack18_32(const uint32_t* in, uint32_t* out) { + uint32_t mask = 0x3ffff; + + simd_batch masks(mask); + simd_batch words, shifts; + simd_batch results; + + // extract 18-bit bundles 0 to 15 + words = simd_batch{ SafeLoad(in + 0), SafeLoad(in + 0) >> 18 | SafeLoad(in + 1) << 14, SafeLoad(in + 1), SafeLoad(in + 1) >> 22 | SafeLoad(in + 2) << 10, SafeLoad(in + 2), SafeLoad(in + 2) >> 26 | SafeLoad(in + 3) << 6, SafeLoad(in + 3), SafeLoad(in + 3) >> 30 | SafeLoad(in + 4) << 2, SafeLoad(in + 4) >> 16 | SafeLoad(in + 5) << 16, SafeLoad(in + 5), SafeLoad(in + 5) >> 20 | SafeLoad(in + 6) << 12, SafeLoad(in + 6), SafeLoad(in + 6) >> 24 | SafeLoad(in + 7) << 8, SafeLoad(in + 7), SafeLoad(in + 7) >> 28 | SafeLoad(in + 8) << 4, SafeLoad(in + 8) }; + shifts = simd_batch{ 0, 0, 4, 0, 8, 0, 12, 0, 0, 2, 0, 6, 0, 10, 0, 14 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 16; + + // extract 18-bit bundles 16 to 31 + words = simd_batch{ SafeLoad(in + 9), SafeLoad(in + 9) >> 18 | SafeLoad(in + 10) << 14, SafeLoad(in + 10), SafeLoad(in + 10) >> 22 | SafeLoad(in + 11) << 10, SafeLoad(in + 11), SafeLoad(in + 11) >> 26 | 
SafeLoad(in + 12) << 6, SafeLoad(in + 12), SafeLoad(in + 12) >> 30 | SafeLoad(in + 13) << 2, SafeLoad(in + 13) >> 16 | SafeLoad(in + 14) << 16, SafeLoad(in + 14), SafeLoad(in + 14) >> 20 | SafeLoad(in + 15) << 12, SafeLoad(in + 15), SafeLoad(in + 15) >> 24 | SafeLoad(in + 16) << 8, SafeLoad(in + 16), SafeLoad(in + 16) >> 28 | SafeLoad(in + 17) << 4, SafeLoad(in + 17) }; + shifts = simd_batch{ 0, 0, 4, 0, 8, 0, 12, 0, 0, 2, 0, 6, 0, 10, 0, 14 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 16; + + in += 18; + return in; +} + +inline static const uint32_t* unpack19_32(const uint32_t* in, uint32_t* out) { + uint32_t mask = 0x7ffff; + + simd_batch masks(mask); + simd_batch words, shifts; + simd_batch results; + + // extract 19-bit bundles 0 to 15 + words = simd_batch{ SafeLoad(in + 0), SafeLoad(in + 0) >> 19 | SafeLoad(in + 1) << 13, SafeLoad(in + 1), SafeLoad(in + 1) >> 25 | SafeLoad(in + 2) << 7, SafeLoad(in + 2), SafeLoad(in + 2) >> 31 | SafeLoad(in + 3) << 1, SafeLoad(in + 3) >> 18 | SafeLoad(in + 4) << 14, SafeLoad(in + 4), SafeLoad(in + 4) >> 24 | SafeLoad(in + 5) << 8, SafeLoad(in + 5), SafeLoad(in + 5) >> 30 | SafeLoad(in + 6) << 2, SafeLoad(in + 6) >> 17 | SafeLoad(in + 7) << 15, SafeLoad(in + 7), SafeLoad(in + 7) >> 23 | SafeLoad(in + 8) << 9, SafeLoad(in + 8), SafeLoad(in + 8) >> 29 | SafeLoad(in + 9) << 3 }; + shifts = simd_batch{ 0, 0, 6, 0, 12, 0, 0, 5, 0, 11, 0, 0, 4, 0, 10, 0 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 16; + + // extract 19-bit bundles 16 to 31 + words = simd_batch{ SafeLoad(in + 9) >> 16 | SafeLoad(in + 10) << 16, SafeLoad(in + 10), SafeLoad(in + 10) >> 22 | SafeLoad(in + 11) << 10, SafeLoad(in + 11), SafeLoad(in + 11) >> 28 | SafeLoad(in + 12) << 4, SafeLoad(in + 12) >> 15 | SafeLoad(in + 13) << 17, SafeLoad(in + 13), SafeLoad(in + 13) >> 21 | SafeLoad(in + 14) << 11, SafeLoad(in + 14), SafeLoad(in + 14) >> 27 | SafeLoad(in + 15) << 5, SafeLoad(in + 15) >> 14 | 
SafeLoad(in + 16) << 18, SafeLoad(in + 16), SafeLoad(in + 16) >> 20 | SafeLoad(in + 17) << 12, SafeLoad(in + 17), SafeLoad(in + 17) >> 26 | SafeLoad(in + 18) << 6, SafeLoad(in + 18) }; + shifts = simd_batch{ 0, 3, 0, 9, 0, 0, 2, 0, 8, 0, 0, 1, 0, 7, 0, 13 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 16; + + in += 19; + return in; +} + +inline static const uint32_t* unpack20_32(const uint32_t* in, uint32_t* out) { + uint32_t mask = 0xfffff; + + simd_batch masks(mask); + simd_batch words, shifts; + simd_batch results; + + // extract 20-bit bundles 0 to 15 + words = simd_batch{ SafeLoad(in + 0), SafeLoad(in + 0) >> 20 | SafeLoad(in + 1) << 12, SafeLoad(in + 1), SafeLoad(in + 1) >> 28 | SafeLoad(in + 2) << 4, SafeLoad(in + 2) >> 16 | SafeLoad(in + 3) << 16, SafeLoad(in + 3), SafeLoad(in + 3) >> 24 | SafeLoad(in + 4) << 8, SafeLoad(in + 4), SafeLoad(in + 5), SafeLoad(in + 5) >> 20 | SafeLoad(in + 6) << 12, SafeLoad(in + 6), SafeLoad(in + 6) >> 28 | SafeLoad(in + 7) << 4, SafeLoad(in + 7) >> 16 | SafeLoad(in + 8) << 16, SafeLoad(in + 8), SafeLoad(in + 8) >> 24 | SafeLoad(in + 9) << 8, SafeLoad(in + 9) }; + shifts = simd_batch{ 0, 0, 8, 0, 0, 4, 0, 12, 0, 0, 8, 0, 0, 4, 0, 12 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 16; + + // extract 20-bit bundles 16 to 31 + words = simd_batch{ SafeLoad(in + 10), SafeLoad(in + 10) >> 20 | SafeLoad(in + 11) << 12, SafeLoad(in + 11), SafeLoad(in + 11) >> 28 | SafeLoad(in + 12) << 4, SafeLoad(in + 12) >> 16 | SafeLoad(in + 13) << 16, SafeLoad(in + 13), SafeLoad(in + 13) >> 24 | SafeLoad(in + 14) << 8, SafeLoad(in + 14), SafeLoad(in + 15), SafeLoad(in + 15) >> 20 | SafeLoad(in + 16) << 12, SafeLoad(in + 16), SafeLoad(in + 16) >> 28 | SafeLoad(in + 17) << 4, SafeLoad(in + 17) >> 16 | SafeLoad(in + 18) << 16, SafeLoad(in + 18), SafeLoad(in + 18) >> 24 | SafeLoad(in + 19) << 8, SafeLoad(in + 19) }; + shifts = simd_batch{ 0, 0, 8, 0, 0, 4, 0, 12, 0, 0, 8, 0, 0, 4, 
0, 12 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 16; + + in += 20; + return in; +} + +inline static const uint32_t* unpack21_32(const uint32_t* in, uint32_t* out) { + uint32_t mask = 0x1fffff; + + simd_batch masks(mask); + simd_batch words, shifts; + simd_batch results; + + // extract 21-bit bundles 0 to 15 + words = simd_batch{ SafeLoad(in + 0), SafeLoad(in + 0) >> 21 | SafeLoad(in + 1) << 11, SafeLoad(in + 1), SafeLoad(in + 1) >> 31 | SafeLoad(in + 2) << 1, SafeLoad(in + 2) >> 20 | SafeLoad(in + 3) << 12, SafeLoad(in + 3), SafeLoad(in + 3) >> 30 | SafeLoad(in + 4) << 2, SafeLoad(in + 4) >> 19 | SafeLoad(in + 5) << 13, SafeLoad(in + 5), SafeLoad(in + 5) >> 29 | SafeLoad(in + 6) << 3, SafeLoad(in + 6) >> 18 | SafeLoad(in + 7) << 14, SafeLoad(in + 7), SafeLoad(in + 7) >> 28 | SafeLoad(in + 8) << 4, SafeLoad(in + 8) >> 17 | SafeLoad(in + 9) << 15, SafeLoad(in + 9), SafeLoad(in + 9) >> 27 | SafeLoad(in + 10) << 5 }; + shifts = simd_batch{ 0, 0, 10, 0, 0, 9, 0, 0, 8, 0, 0, 7, 0, 0, 6, 0 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 16; + + // extract 21-bit bundles 16 to 31 + words = simd_batch{ SafeLoad(in + 10) >> 16 | SafeLoad(in + 11) << 16, SafeLoad(in + 11), SafeLoad(in + 11) >> 26 | SafeLoad(in + 12) << 6, SafeLoad(in + 12) >> 15 | SafeLoad(in + 13) << 17, SafeLoad(in + 13), SafeLoad(in + 13) >> 25 | SafeLoad(in + 14) << 7, SafeLoad(in + 14) >> 14 | SafeLoad(in + 15) << 18, SafeLoad(in + 15), SafeLoad(in + 15) >> 24 | SafeLoad(in + 16) << 8, SafeLoad(in + 16) >> 13 | SafeLoad(in + 17) << 19, SafeLoad(in + 17), SafeLoad(in + 17) >> 23 | SafeLoad(in + 18) << 9, SafeLoad(in + 18) >> 12 | SafeLoad(in + 19) << 20, SafeLoad(in + 19), SafeLoad(in + 19) >> 22 | SafeLoad(in + 20) << 10, SafeLoad(in + 20) }; + shifts = simd_batch{ 0, 5, 0, 0, 4, 0, 0, 3, 0, 0, 2, 0, 0, 1, 0, 11 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 16; + + in += 21; + return in; +} + 
+inline static const uint32_t* unpack22_32(const uint32_t* in, uint32_t* out) { + uint32_t mask = 0x3fffff; + + simd_batch masks(mask); + simd_batch words, shifts; + simd_batch results; + + // extract 22-bit bundles 0 to 15 + words = simd_batch{ SafeLoad(in + 0), SafeLoad(in + 0) >> 22 | SafeLoad(in + 1) << 10, SafeLoad(in + 1) >> 12 | SafeLoad(in + 2) << 20, SafeLoad(in + 2), SafeLoad(in + 2) >> 24 | SafeLoad(in + 3) << 8, SafeLoad(in + 3) >> 14 | SafeLoad(in + 4) << 18, SafeLoad(in + 4), SafeLoad(in + 4) >> 26 | SafeLoad(in + 5) << 6, SafeLoad(in + 5) >> 16 | SafeLoad(in + 6) << 16, SafeLoad(in + 6), SafeLoad(in + 6) >> 28 | SafeLoad(in + 7) << 4, SafeLoad(in + 7) >> 18 | SafeLoad(in + 8) << 14, SafeLoad(in + 8), SafeLoad(in + 8) >> 30 | SafeLoad(in + 9) << 2, SafeLoad(in + 9) >> 20 | SafeLoad(in + 10) << 12, SafeLoad(in + 10) }; + shifts = simd_batch{ 0, 0, 0, 2, 0, 0, 4, 0, 0, 6, 0, 0, 8, 0, 0, 10 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 16; + + // extract 22-bit bundles 16 to 31 + words = simd_batch{ SafeLoad(in + 11), SafeLoad(in + 11) >> 22 | SafeLoad(in + 12) << 10, SafeLoad(in + 12) >> 12 | SafeLoad(in + 13) << 20, SafeLoad(in + 13), SafeLoad(in + 13) >> 24 | SafeLoad(in + 14) << 8, SafeLoad(in + 14) >> 14 | SafeLoad(in + 15) << 18, SafeLoad(in + 15), SafeLoad(in + 15) >> 26 | SafeLoad(in + 16) << 6, SafeLoad(in + 16) >> 16 | SafeLoad(in + 17) << 16, SafeLoad(in + 17), SafeLoad(in + 17) >> 28 | SafeLoad(in + 18) << 4, SafeLoad(in + 18) >> 18 | SafeLoad(in + 19) << 14, SafeLoad(in + 19), SafeLoad(in + 19) >> 30 | SafeLoad(in + 20) << 2, SafeLoad(in + 20) >> 20 | SafeLoad(in + 21) << 12, SafeLoad(in + 21) }; + shifts = simd_batch{ 0, 0, 0, 2, 0, 0, 4, 0, 0, 6, 0, 0, 8, 0, 0, 10 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 16; + + in += 22; + return in; +} + +inline static const uint32_t* unpack23_32(const uint32_t* in, uint32_t* out) { + uint32_t mask = 0x7fffff; + + simd_batch 
masks(mask); + simd_batch words, shifts; + simd_batch results; + + // extract 23-bit bundles 0 to 15 + words = simd_batch{ SafeLoad(in + 0), SafeLoad(in + 0) >> 23 | SafeLoad(in + 1) << 9, SafeLoad(in + 1) >> 14 | SafeLoad(in + 2) << 18, SafeLoad(in + 2), SafeLoad(in + 2) >> 28 | SafeLoad(in + 3) << 4, SafeLoad(in + 3) >> 19 | SafeLoad(in + 4) << 13, SafeLoad(in + 4) >> 10 | SafeLoad(in + 5) << 22, SafeLoad(in + 5), SafeLoad(in + 5) >> 24 | SafeLoad(in + 6) << 8, SafeLoad(in + 6) >> 15 | SafeLoad(in + 7) << 17, SafeLoad(in + 7), SafeLoad(in + 7) >> 29 | SafeLoad(in + 8) << 3, SafeLoad(in + 8) >> 20 | SafeLoad(in + 9) << 12, SafeLoad(in + 9) >> 11 | SafeLoad(in + 10) << 21, SafeLoad(in + 10), SafeLoad(in + 10) >> 25 | SafeLoad(in + 11) << 7 }; + shifts = simd_batch{ 0, 0, 0, 5, 0, 0, 0, 1, 0, 0, 6, 0, 0, 0, 2, 0 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 16; + + // extract 23-bit bundles 16 to 31 + words = simd_batch{ SafeLoad(in + 11) >> 16 | SafeLoad(in + 12) << 16, SafeLoad(in + 12), SafeLoad(in + 12) >> 30 | SafeLoad(in + 13) << 2, SafeLoad(in + 13) >> 21 | SafeLoad(in + 14) << 11, SafeLoad(in + 14) >> 12 | SafeLoad(in + 15) << 20, SafeLoad(in + 15), SafeLoad(in + 15) >> 26 | SafeLoad(in + 16) << 6, SafeLoad(in + 16) >> 17 | SafeLoad(in + 17) << 15, SafeLoad(in + 17), SafeLoad(in + 17) >> 31 | SafeLoad(in + 18) << 1, SafeLoad(in + 18) >> 22 | SafeLoad(in + 19) << 10, SafeLoad(in + 19) >> 13 | SafeLoad(in + 20) << 19, SafeLoad(in + 20), SafeLoad(in + 20) >> 27 | SafeLoad(in + 21) << 5, SafeLoad(in + 21) >> 18 | SafeLoad(in + 22) << 14, SafeLoad(in + 22) }; + shifts = simd_batch{ 0, 7, 0, 0, 0, 3, 0, 0, 8, 0, 0, 0, 4, 0, 0, 9 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 16; + + in += 23; + return in; +} + +inline static const uint32_t* unpack24_32(const uint32_t* in, uint32_t* out) { + uint32_t mask = 0xffffff; + + simd_batch masks(mask); + simd_batch words, shifts; + simd_batch results; 
+ + // extract 24-bit bundles 0 to 15 + words = simd_batch{ SafeLoad(in + 0), SafeLoad(in + 0) >> 24 | SafeLoad(in + 1) << 8, SafeLoad(in + 1) >> 16 | SafeLoad(in + 2) << 16, SafeLoad(in + 2), SafeLoad(in + 3), SafeLoad(in + 3) >> 24 | SafeLoad(in + 4) << 8, SafeLoad(in + 4) >> 16 | SafeLoad(in + 5) << 16, SafeLoad(in + 5), SafeLoad(in + 6), SafeLoad(in + 6) >> 24 | SafeLoad(in + 7) << 8, SafeLoad(in + 7) >> 16 | SafeLoad(in + 8) << 16, SafeLoad(in + 8), SafeLoad(in + 9), SafeLoad(in + 9) >> 24 | SafeLoad(in + 10) << 8, SafeLoad(in + 10) >> 16 | SafeLoad(in + 11) << 16, SafeLoad(in + 11) }; + shifts = simd_batch{ 0, 0, 0, 8, 0, 0, 0, 8, 0, 0, 0, 8, 0, 0, 0, 8 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 16; + + // extract 24-bit bundles 16 to 31 + words = simd_batch{ SafeLoad(in + 12), SafeLoad(in + 12) >> 24 | SafeLoad(in + 13) << 8, SafeLoad(in + 13) >> 16 | SafeLoad(in + 14) << 16, SafeLoad(in + 14), SafeLoad(in + 15), SafeLoad(in + 15) >> 24 | SafeLoad(in + 16) << 8, SafeLoad(in + 16) >> 16 | SafeLoad(in + 17) << 16, SafeLoad(in + 17), SafeLoad(in + 18), SafeLoad(in + 18) >> 24 | SafeLoad(in + 19) << 8, SafeLoad(in + 19) >> 16 | SafeLoad(in + 20) << 16, SafeLoad(in + 20), SafeLoad(in + 21), SafeLoad(in + 21) >> 24 | SafeLoad(in + 22) << 8, SafeLoad(in + 22) >> 16 | SafeLoad(in + 23) << 16, SafeLoad(in + 23) }; + shifts = simd_batch{ 0, 0, 0, 8, 0, 0, 0, 8, 0, 0, 0, 8, 0, 0, 0, 8 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 16; + + in += 24; + return in; +} + +inline static const uint32_t* unpack25_32(const uint32_t* in, uint32_t* out) { + uint32_t mask = 0x1ffffff; + + simd_batch masks(mask); + simd_batch words, shifts; + simd_batch results; + + // extract 25-bit bundles 0 to 15 + words = simd_batch{ SafeLoad(in + 0), SafeLoad(in + 0) >> 25 | SafeLoad(in + 1) << 7, SafeLoad(in + 1) >> 18 | SafeLoad(in + 2) << 14, SafeLoad(in + 2) >> 11 | SafeLoad(in + 3) << 21, SafeLoad(in + 3), 
SafeLoad(in + 3) >> 29 | SafeLoad(in + 4) << 3, SafeLoad(in + 4) >> 22 | SafeLoad(in + 5) << 10, SafeLoad(in + 5) >> 15 | SafeLoad(in + 6) << 17, SafeLoad(in + 6) >> 8 | SafeLoad(in + 7) << 24, SafeLoad(in + 7), SafeLoad(in + 7) >> 26 | SafeLoad(in + 8) << 6, SafeLoad(in + 8) >> 19 | SafeLoad(in + 9) << 13, SafeLoad(in + 9) >> 12 | SafeLoad(in + 10) << 20, SafeLoad(in + 10), SafeLoad(in + 10) >> 30 | SafeLoad(in + 11) << 2, SafeLoad(in + 11) >> 23 | SafeLoad(in + 12) << 9 }; + shifts = simd_batch{ 0, 0, 0, 0, 4, 0, 0, 0, 0, 1, 0, 0, 0, 5, 0, 0 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 16; + + // extract 25-bit bundles 16 to 31 + words = simd_batch{ SafeLoad(in + 12) >> 16 | SafeLoad(in + 13) << 16, SafeLoad(in + 13) >> 9 | SafeLoad(in + 14) << 23, SafeLoad(in + 14), SafeLoad(in + 14) >> 27 | SafeLoad(in + 15) << 5, SafeLoad(in + 15) >> 20 | SafeLoad(in + 16) << 12, SafeLoad(in + 16) >> 13 | SafeLoad(in + 17) << 19, SafeLoad(in + 17), SafeLoad(in + 17) >> 31 | SafeLoad(in + 18) << 1, SafeLoad(in + 18) >> 24 | SafeLoad(in + 19) << 8, SafeLoad(in + 19) >> 17 | SafeLoad(in + 20) << 15, SafeLoad(in + 20) >> 10 | SafeLoad(in + 21) << 22, SafeLoad(in + 21), SafeLoad(in + 21) >> 28 | SafeLoad(in + 22) << 4, SafeLoad(in + 22) >> 21 | SafeLoad(in + 23) << 11, SafeLoad(in + 23) >> 14 | SafeLoad(in + 24) << 18, SafeLoad(in + 24) }; + shifts = simd_batch{ 0, 0, 2, 0, 0, 0, 6, 0, 0, 0, 0, 3, 0, 0, 0, 7 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 16; + + in += 25; + return in; +} + +inline static const uint32_t* unpack26_32(const uint32_t* in, uint32_t* out) { + uint32_t mask = 0x3ffffff; + + simd_batch masks(mask); + simd_batch words, shifts; + simd_batch results; + + // extract 26-bit bundles 0 to 15 + words = simd_batch{ SafeLoad(in + 0), SafeLoad(in + 0) >> 26 | SafeLoad(in + 1) << 6, SafeLoad(in + 1) >> 20 | SafeLoad(in + 2) << 12, SafeLoad(in + 2) >> 14 | SafeLoad(in + 3) << 18, SafeLoad(in + 3) 
>> 8 | SafeLoad(in + 4) << 24, SafeLoad(in + 4), SafeLoad(in + 4) >> 28 | SafeLoad(in + 5) << 4, SafeLoad(in + 5) >> 22 | SafeLoad(in + 6) << 10, SafeLoad(in + 6) >> 16 | SafeLoad(in + 7) << 16, SafeLoad(in + 7) >> 10 | SafeLoad(in + 8) << 22, SafeLoad(in + 8), SafeLoad(in + 8) >> 30 | SafeLoad(in + 9) << 2, SafeLoad(in + 9) >> 24 | SafeLoad(in + 10) << 8, SafeLoad(in + 10) >> 18 | SafeLoad(in + 11) << 14, SafeLoad(in + 11) >> 12 | SafeLoad(in + 12) << 20, SafeLoad(in + 12) }; + shifts = simd_batch{ 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 4, 0, 0, 0, 0, 6 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 16; + + // extract 26-bit bundles 16 to 31 + words = simd_batch{ SafeLoad(in + 13), SafeLoad(in + 13) >> 26 | SafeLoad(in + 14) << 6, SafeLoad(in + 14) >> 20 | SafeLoad(in + 15) << 12, SafeLoad(in + 15) >> 14 | SafeLoad(in + 16) << 18, SafeLoad(in + 16) >> 8 | SafeLoad(in + 17) << 24, SafeLoad(in + 17), SafeLoad(in + 17) >> 28 | SafeLoad(in + 18) << 4, SafeLoad(in + 18) >> 22 | SafeLoad(in + 19) << 10, SafeLoad(in + 19) >> 16 | SafeLoad(in + 20) << 16, SafeLoad(in + 20) >> 10 | SafeLoad(in + 21) << 22, SafeLoad(in + 21), SafeLoad(in + 21) >> 30 | SafeLoad(in + 22) << 2, SafeLoad(in + 22) >> 24 | SafeLoad(in + 23) << 8, SafeLoad(in + 23) >> 18 | SafeLoad(in + 24) << 14, SafeLoad(in + 24) >> 12 | SafeLoad(in + 25) << 20, SafeLoad(in + 25) }; + shifts = simd_batch{ 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 4, 0, 0, 0, 0, 6 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 16; + + in += 26; + return in; +} + +inline static const uint32_t* unpack27_32(const uint32_t* in, uint32_t* out) { + uint32_t mask = 0x7ffffff; + + simd_batch masks(mask); + simd_batch words, shifts; + simd_batch results; + + // extract 27-bit bundles 0 to 15 + words = simd_batch{ SafeLoad(in + 0), SafeLoad(in + 0) >> 27 | SafeLoad(in + 1) << 5, SafeLoad(in + 1) >> 22 | SafeLoad(in + 2) << 10, SafeLoad(in + 2) >> 17 | SafeLoad(in + 3) << 15, SafeLoad(in + 
3) >> 12 | SafeLoad(in + 4) << 20, SafeLoad(in + 4) >> 7 | SafeLoad(in + 5) << 25, SafeLoad(in + 5), SafeLoad(in + 5) >> 29 | SafeLoad(in + 6) << 3, SafeLoad(in + 6) >> 24 | SafeLoad(in + 7) << 8, SafeLoad(in + 7) >> 19 | SafeLoad(in + 8) << 13, SafeLoad(in + 8) >> 14 | SafeLoad(in + 9) << 18, SafeLoad(in + 9) >> 9 | SafeLoad(in + 10) << 23, SafeLoad(in + 10), SafeLoad(in + 10) >> 31 | SafeLoad(in + 11) << 1, SafeLoad(in + 11) >> 26 | SafeLoad(in + 12) << 6, SafeLoad(in + 12) >> 21 | SafeLoad(in + 13) << 11 }; + shifts = simd_batch{ 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 4, 0, 0, 0 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 16; + + // extract 27-bit bundles 16 to 31 + words = simd_batch{ SafeLoad(in + 13) >> 16 | SafeLoad(in + 14) << 16, SafeLoad(in + 14) >> 11 | SafeLoad(in + 15) << 21, SafeLoad(in + 15) >> 6 | SafeLoad(in + 16) << 26, SafeLoad(in + 16), SafeLoad(in + 16) >> 28 | SafeLoad(in + 17) << 4, SafeLoad(in + 17) >> 23 | SafeLoad(in + 18) << 9, SafeLoad(in + 18) >> 18 | SafeLoad(in + 19) << 14, SafeLoad(in + 19) >> 13 | SafeLoad(in + 20) << 19, SafeLoad(in + 20) >> 8 | SafeLoad(in + 21) << 24, SafeLoad(in + 21), SafeLoad(in + 21) >> 30 | SafeLoad(in + 22) << 2, SafeLoad(in + 22) >> 25 | SafeLoad(in + 23) << 7, SafeLoad(in + 23) >> 20 | SafeLoad(in + 24) << 12, SafeLoad(in + 24) >> 15 | SafeLoad(in + 25) << 17, SafeLoad(in + 25) >> 10 | SafeLoad(in + 26) << 22, SafeLoad(in + 26) }; + shifts = simd_batch{ 0, 0, 0, 1, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 5 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 16; + + in += 27; + return in; +} + +inline static const uint32_t* unpack28_32(const uint32_t* in, uint32_t* out) { + uint32_t mask = 0xfffffff; + + simd_batch masks(mask); + simd_batch words, shifts; + simd_batch results; + + // extract 28-bit bundles 0 to 15 + words = simd_batch{ SafeLoad(in + 0), SafeLoad(in + 0) >> 28 | SafeLoad(in + 1) << 4, SafeLoad(in + 1) >> 24 | SafeLoad(in + 2) << 8, 
SafeLoad(in + 2) >> 20 | SafeLoad(in + 3) << 12, SafeLoad(in + 3) >> 16 | SafeLoad(in + 4) << 16, SafeLoad(in + 4) >> 12 | SafeLoad(in + 5) << 20, SafeLoad(in + 5) >> 8 | SafeLoad(in + 6) << 24, SafeLoad(in + 6), SafeLoad(in + 7), SafeLoad(in + 7) >> 28 | SafeLoad(in + 8) << 4, SafeLoad(in + 8) >> 24 | SafeLoad(in + 9) << 8, SafeLoad(in + 9) >> 20 | SafeLoad(in + 10) << 12, SafeLoad(in + 10) >> 16 | SafeLoad(in + 11) << 16, SafeLoad(in + 11) >> 12 | SafeLoad(in + 12) << 20, SafeLoad(in + 12) >> 8 | SafeLoad(in + 13) << 24, SafeLoad(in + 13) }; + shifts = simd_batch{ 0, 0, 0, 0, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 4 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 16; + + // extract 28-bit bundles 16 to 31 + words = simd_batch{ SafeLoad(in + 14), SafeLoad(in + 14) >> 28 | SafeLoad(in + 15) << 4, SafeLoad(in + 15) >> 24 | SafeLoad(in + 16) << 8, SafeLoad(in + 16) >> 20 | SafeLoad(in + 17) << 12, SafeLoad(in + 17) >> 16 | SafeLoad(in + 18) << 16, SafeLoad(in + 18) >> 12 | SafeLoad(in + 19) << 20, SafeLoad(in + 19) >> 8 | SafeLoad(in + 20) << 24, SafeLoad(in + 20), SafeLoad(in + 21), SafeLoad(in + 21) >> 28 | SafeLoad(in + 22) << 4, SafeLoad(in + 22) >> 24 | SafeLoad(in + 23) << 8, SafeLoad(in + 23) >> 20 | SafeLoad(in + 24) << 12, SafeLoad(in + 24) >> 16 | SafeLoad(in + 25) << 16, SafeLoad(in + 25) >> 12 | SafeLoad(in + 26) << 20, SafeLoad(in + 26) >> 8 | SafeLoad(in + 27) << 24, SafeLoad(in + 27) }; + shifts = simd_batch{ 0, 0, 0, 0, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 4 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 16; + + in += 28; + return in; +} + +inline static const uint32_t* unpack29_32(const uint32_t* in, uint32_t* out) { + uint32_t mask = 0x1fffffff; + + simd_batch masks(mask); + simd_batch words, shifts; + simd_batch results; + + // extract 29-bit bundles 0 to 15 + words = simd_batch{ SafeLoad(in + 0), SafeLoad(in + 0) >> 29 | SafeLoad(in + 1) << 3, SafeLoad(in + 1) >> 26 | SafeLoad(in + 2) << 
6, SafeLoad(in + 2) >> 23 | SafeLoad(in + 3) << 9, SafeLoad(in + 3) >> 20 | SafeLoad(in + 4) << 12, SafeLoad(in + 4) >> 17 | SafeLoad(in + 5) << 15, SafeLoad(in + 5) >> 14 | SafeLoad(in + 6) << 18, SafeLoad(in + 6) >> 11 | SafeLoad(in + 7) << 21, SafeLoad(in + 7) >> 8 | SafeLoad(in + 8) << 24, SafeLoad(in + 8) >> 5 | SafeLoad(in + 9) << 27, SafeLoad(in + 9), SafeLoad(in + 9) >> 31 | SafeLoad(in + 10) << 1, SafeLoad(in + 10) >> 28 | SafeLoad(in + 11) << 4, SafeLoad(in + 11) >> 25 | SafeLoad(in + 12) << 7, SafeLoad(in + 12) >> 22 | SafeLoad(in + 13) << 10, SafeLoad(in + 13) >> 19 | SafeLoad(in + 14) << 13 }; + shifts = simd_batch{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 16; + + // extract 29-bit bundles 16 to 31 + words = simd_batch{ SafeLoad(in + 14) >> 16 | SafeLoad(in + 15) << 16, SafeLoad(in + 15) >> 13 | SafeLoad(in + 16) << 19, SafeLoad(in + 16) >> 10 | SafeLoad(in + 17) << 22, SafeLoad(in + 17) >> 7 | SafeLoad(in + 18) << 25, SafeLoad(in + 18) >> 4 | SafeLoad(in + 19) << 28, SafeLoad(in + 19), SafeLoad(in + 19) >> 30 | SafeLoad(in + 20) << 2, SafeLoad(in + 20) >> 27 | SafeLoad(in + 21) << 5, SafeLoad(in + 21) >> 24 | SafeLoad(in + 22) << 8, SafeLoad(in + 22) >> 21 | SafeLoad(in + 23) << 11, SafeLoad(in + 23) >> 18 | SafeLoad(in + 24) << 14, SafeLoad(in + 24) >> 15 | SafeLoad(in + 25) << 17, SafeLoad(in + 25) >> 12 | SafeLoad(in + 26) << 20, SafeLoad(in + 26) >> 9 | SafeLoad(in + 27) << 23, SafeLoad(in + 27) >> 6 | SafeLoad(in + 28) << 26, SafeLoad(in + 28) }; + shifts = simd_batch{ 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 16; + + in += 29; + return in; +} + +inline static const uint32_t* unpack30_32(const uint32_t* in, uint32_t* out) { + uint32_t mask = 0x3fffffff; + + simd_batch masks(mask); + simd_batch words, shifts; + simd_batch results; + + // extract 30-bit bundles 0 to 15 + 
words = simd_batch{ SafeLoad(in + 0), SafeLoad(in + 0) >> 30 | SafeLoad(in + 1) << 2, SafeLoad(in + 1) >> 28 | SafeLoad(in + 2) << 4, SafeLoad(in + 2) >> 26 | SafeLoad(in + 3) << 6, SafeLoad(in + 3) >> 24 | SafeLoad(in + 4) << 8, SafeLoad(in + 4) >> 22 | SafeLoad(in + 5) << 10, SafeLoad(in + 5) >> 20 | SafeLoad(in + 6) << 12, SafeLoad(in + 6) >> 18 | SafeLoad(in + 7) << 14, SafeLoad(in + 7) >> 16 | SafeLoad(in + 8) << 16, SafeLoad(in + 8) >> 14 | SafeLoad(in + 9) << 18, SafeLoad(in + 9) >> 12 | SafeLoad(in + 10) << 20, SafeLoad(in + 10) >> 10 | SafeLoad(in + 11) << 22, SafeLoad(in + 11) >> 8 | SafeLoad(in + 12) << 24, SafeLoad(in + 12) >> 6 | SafeLoad(in + 13) << 26, SafeLoad(in + 13) >> 4 | SafeLoad(in + 14) << 28, SafeLoad(in + 14) }; + shifts = simd_batch{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 16; + + // extract 30-bit bundles 16 to 31 + words = simd_batch{ SafeLoad(in + 15), SafeLoad(in + 15) >> 30 | SafeLoad(in + 16) << 2, SafeLoad(in + 16) >> 28 | SafeLoad(in + 17) << 4, SafeLoad(in + 17) >> 26 | SafeLoad(in + 18) << 6, SafeLoad(in + 18) >> 24 | SafeLoad(in + 19) << 8, SafeLoad(in + 19) >> 22 | SafeLoad(in + 20) << 10, SafeLoad(in + 20) >> 20 | SafeLoad(in + 21) << 12, SafeLoad(in + 21) >> 18 | SafeLoad(in + 22) << 14, SafeLoad(in + 22) >> 16 | SafeLoad(in + 23) << 16, SafeLoad(in + 23) >> 14 | SafeLoad(in + 24) << 18, SafeLoad(in + 24) >> 12 | SafeLoad(in + 25) << 20, SafeLoad(in + 25) >> 10 | SafeLoad(in + 26) << 22, SafeLoad(in + 26) >> 8 | SafeLoad(in + 27) << 24, SafeLoad(in + 27) >> 6 | SafeLoad(in + 28) << 26, SafeLoad(in + 28) >> 4 | SafeLoad(in + 29) << 28, SafeLoad(in + 29) }; + shifts = simd_batch{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 16; + + in += 30; + return in; +} + +inline static const uint32_t* unpack31_32(const uint32_t* in, uint32_t* out) { + uint32_t mask = 
0x7fffffff; + + simd_batch masks(mask); + simd_batch words, shifts; + simd_batch results; + + // extract 31-bit bundles 0 to 15 + words = simd_batch{ SafeLoad(in + 0), SafeLoad(in + 0) >> 31 | SafeLoad(in + 1) << 1, SafeLoad(in + 1) >> 30 | SafeLoad(in + 2) << 2, SafeLoad(in + 2) >> 29 | SafeLoad(in + 3) << 3, SafeLoad(in + 3) >> 28 | SafeLoad(in + 4) << 4, SafeLoad(in + 4) >> 27 | SafeLoad(in + 5) << 5, SafeLoad(in + 5) >> 26 | SafeLoad(in + 6) << 6, SafeLoad(in + 6) >> 25 | SafeLoad(in + 7) << 7, SafeLoad(in + 7) >> 24 | SafeLoad(in + 8) << 8, SafeLoad(in + 8) >> 23 | SafeLoad(in + 9) << 9, SafeLoad(in + 9) >> 22 | SafeLoad(in + 10) << 10, SafeLoad(in + 10) >> 21 | SafeLoad(in + 11) << 11, SafeLoad(in + 11) >> 20 | SafeLoad(in + 12) << 12, SafeLoad(in + 12) >> 19 | SafeLoad(in + 13) << 13, SafeLoad(in + 13) >> 18 | SafeLoad(in + 14) << 14, SafeLoad(in + 14) >> 17 | SafeLoad(in + 15) << 15 }; + shifts = simd_batch{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }; + results = (words >> shifts) & masks; + results.store_unaligned(out); + out += 16; + + // extract 31-bit bundles 16 to 31 + words = simd_batch{ SafeLoad(in + 15) >> 16 | SafeLoad(in + 16) << 16, SafeLoad(in + 16) >> 15 | SafeLoad(in + 17) << 17, SafeLoad(in + 17) >> 14 | SafeLoad(in + 18) << 18, SafeLoad(in + 18) >> 13 | SafeLoad(in + 19) << 19, SafeLoad(in + 19) >> 12 | SafeLoad(in + 20) << 20, SafeLoad(in + 20) >> 11 | SafeLoad(in + 21) << 21, SafeLoad(in + 21) >> 10 | SafeLoad(in + 22) << 22, SafeLoad(in + 22) >> 9 | SafeLoad(in + 23) << 23, SafeLoad(in + 23) >> 8 | SafeLoad(in + 24) << 24, SafeLoad(in + 24) >> 7 | SafeLoad(in + 25) << 25, SafeLoad(in + 25) >> 6 | SafeLoad(in + 26) << 26, SafeLoad(in + 26) >> 5 | SafeLoad(in + 27) << 27, SafeLoad(in + 27) >> 4 | SafeLoad(in + 28) << 28, SafeLoad(in + 28) >> 3 | SafeLoad(in + 29) << 29, SafeLoad(in + 29) >> 2 | SafeLoad(in + 30) << 30, SafeLoad(in + 30) }; + shifts = simd_batch{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1 }; + results = (words >> 
shifts) & masks; + results.store_unaligned(out); + out += 16; + + in += 31; + return in; +} + +inline static const uint32_t* unpack32_32(const uint32_t* in, uint32_t* out) { + memcpy(out, in, 32 * sizeof(*out)); + in += 32; + out += 32; + + return in; +} + +}; // struct UnpackBits512 + +} // namespace +} // namespace internal +} // namespace arrow + diff --git a/cpp/src/arrow/util/bpacking_simd_codegen.py b/cpp/src/arrow/util/bpacking_simd_codegen.py new file mode 100644 index 00000000000..d033394df97 --- /dev/null +++ b/cpp/src/arrow/util/bpacking_simd_codegen.py @@ -0,0 +1,209 @@ +#!/bin/python + +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
# Usage:
#   python bpacking_simd_codegen.py 128 > bpacking_simd128_generated.h
#   python bpacking_simd_codegen.py 256 > bpacking_simd256_generated.h
#   python bpacking_simd_codegen.py 512 > bpacking_simd512_generated.h

from functools import partial
import sys
from textwrap import dedent, indent


class UnpackGenerator:
    """Emit C++ source for the ``unpackN_32`` bit-unpacking functions.

    Each generated function decodes 32 values of N bits from a packed
    ``uint32_t`` input into 32 ``uint32_t`` outputs, one SIMD batch of
    ``simd_width // 32`` values at a time.  All output goes to stdout.
    """

    def __init__(self, simd_width):
        """Validate and store the SIMD register width, given in bits.

        Raises:
            ValueError: if ``simd_width`` is not a positive multiple of 32,
                or if its number of 32-bit lanes does not divide 32.
        """
        self.simd_width = simd_width
        if simd_width <= 0 or simd_width % 32 != 0:
            raise ValueError(
                "SIMD bit width should be a positive multiple of 32")
        # print_unpack_bit_func() slices the 32 values of a function into
        # batches of `simd_width // 32`; a lane count that does not divide 32
        # would leave a short final batch and advance `out` too far.
        if 32 % (simd_width // 32) != 0:
            raise ValueError(
                "SIMD bit width should hold a number of 32-bit lanes "
                "that divides 32")
        self.simd_byte_width = simd_width // 8

    def print_unpack_bit0_func(self):
        """Print ``unpack0_32``: zero-fills 32 outputs, consumes no input."""
        print(
            "inline static const uint32_t* unpack0_32(const uint32_t* in, uint32_t* out) {")
        print("  memset(out, 0x0, 32 * sizeof(*out));")
        print("  out += 32;")
        print("")
        print("  return in;")
        print("}")

    def print_unpack_bit32_func(self):
        """Print ``unpack32_32``: a plain copy of 32 input words."""
        print(
            "inline static const uint32_t* unpack32_32(const uint32_t* in, uint32_t* out) {")
        print("  memcpy(out, in, 32 * sizeof(*out));")
        print("  in += 32;")
        print("  out += 32;")
        print("")
        print("  return in;")
        print("}")

    def print_unpack_bit_func(self, bit):
        """Print ``unpack{bit}_32`` for a bit width strictly between 0 and 32.

        For each of the 32 output values this computes the C++ expression
        that yields a word containing the value (``inls``) and the right
        shift that aligns it to bit 0 (``shifts``); the generated code then
        applies ``(words >> shifts) & masks`` one batch at a time.
        """
        def p(code):
            # Emit `code` indented as a function-body statement.
            print(indent(code, prefix='  '))

        shift = 0      # bit offset of the next value inside the current word
        shifts = []    # per-value right shift
        in_index = 0   # index of the current input word
        inls = []      # per-value C++ expression producing the source word
        mask = (1 << bit) - 1

        print(f"inline static const uint32_t* unpack{bit}_32(const uint32_t* in, uint32_t* out) {{")
        p(dedent(f"""\
            uint32_t mask = 0x{mask:0x};

            simd_batch masks(mask);
            simd_batch words, shifts;
            simd_batch results;
            """))

        def safe_load(index):
            return f"SafeLoad(in + {index})"

        for _ in range(32):
            if shift + bit == 32:
                # The value ends exactly on the word boundary: consume the
                # word and restart at offset 0 in the next one.
                shifts.append(shift)
                inls.append(safe_load(in_index))
                in_index += 1
                shift = 0
            elif shift + bit > 32:  # cross the boundary
                # Stitch the high bits of this word to the low bits of the
                # next one; the combined word needs no further shift.
                inls.append(
                    f"{safe_load(in_index)} >> {shift} | "
                    f"{safe_load(in_index + 1)} << {32 - shift}")
                in_index += 1
                shift = bit - (32 - shift)
                shifts.append(0)  # zero shift
            else:
                shifts.append(shift)
                inls.append(safe_load(in_index))
                shift += bit

        bytes_per_batch = self.simd_byte_width
        words_per_batch = bytes_per_batch // 4

        one_word_template = dedent("""\
            words = simd_batch{{ {words} }};
            shifts = simd_batch{{ {shifts} }};
            results = (words >> shifts) & masks;
            results.store_unaligned(out);
            out += {words_per_batch};
            """)

        for start in range(0, 32, words_per_batch):
            stop = start + words_per_batch
            p(f"""// extract {bit}-bit bundles {start} to {stop - 1}""")
            p(one_word_template.format(
                words=", ".join(inls[start:stop]),
                shifts=", ".join(map(str, shifts[start:stop])),
                words_per_batch=words_per_batch))

        p(dedent(f"""\
            in += {bit};
            return in;"""))
        print("}")


def print_copyright():
    """Print the Apache license header as C++ line comments."""
    print(dedent("""\
        // Licensed to the Apache Software Foundation (ASF) under one
        // or more contributor license agreements.  See the NOTICE file
        // distributed with this work for additional information
        // regarding copyright ownership.  The ASF licenses this file
        // to you under the Apache License, Version 2.0 (the
        // "License"); you may not use this file except in compliance
        // with the License.  You may obtain a copy of the License at
        //
        //   http://www.apache.org/licenses/LICENSE-2.0
        //
        // Unless required by applicable law or agreed to in writing,
        // software distributed under the License is distributed on an
        // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
        // KIND, either express or implied.  See the License for the
        // specific language governing permissions and limitations
        // under the License.
        """))


def print_note():
    """Print the "generated file" banner."""
    print("// Automatically generated file; DO NOT EDIT.")
    print()


def main(simd_width):
    """Print the complete generated header for `simd_width` bits to stdout.

    Raises:
        ValueError: if `simd_width` is rejected by UnpackGenerator.
    """
    # Build the generator first so that an invalid width fails before any
    # partial header has been written to stdout.
    gen = UnpackGenerator(simd_width)

    print_copyright()
    print_note()

    struct_name = f"UnpackBits{simd_width}"

    # NOTE: templating the UnpackBits struct on the dispatch level avoids
    # potential name collisions if there are several UnpackBits generations
    # with the same SIMD width on a given architecture.

    print(dedent(f"""\
        #pragma once

        #include <cstdint>
        #include <cstring>

        #include <xsimd/xsimd.hpp>

        #include "arrow/util/dispatch.h"
        #include "arrow/util/ubsan.h"

        namespace arrow {{
        namespace internal {{
        namespace {{

        using ::arrow::util::SafeLoad;

        template <DispatchLevel level>
        struct {struct_name} {{

        using simd_batch = xsimd::batch<uint32_t, {simd_width // 32}>;
        """))

    gen.print_unpack_bit0_func()
    print()
    for i in range(1, 32):
        gen.print_unpack_bit_func(i)
        print()
    gen.print_unpack_bit32_func()
    print()

    print(dedent(f"""\
        }};  // struct {struct_name}

        }}  // namespace
        }}  // namespace internal
        }}  // namespace arrow
        """))


if __name__ == '__main__':
    usage = f"""Usage: {__file__} <SIMD bit-width>"""
    if len(sys.argv) != 2:
        raise ValueError(usage)
    try:
        simd_width = int(sys.argv[1])
    except ValueError:
        raise ValueError(usage)

    main(simd_width)
+ +#include "arrow/util/dispatch.h" +#include "arrow/util/logging.h" + +namespace arrow { +namespace internal { + +template +static int unpack32_specialized(const uint32_t* in, uint32_t* out, int batch_size, + int num_bits) { + batch_size = batch_size / 32 * 32; + int num_loops = batch_size / 32; + + switch (num_bits) { + case 0: + for (int i = 0; i < num_loops; ++i) in = UnpackBits::unpack0_32(in, out + i * 32); + break; + case 1: + for (int i = 0; i < num_loops; ++i) in = UnpackBits::unpack1_32(in, out + i * 32); + break; + case 2: + for (int i = 0; i < num_loops; ++i) in = UnpackBits::unpack2_32(in, out + i * 32); + break; + case 3: + for (int i = 0; i < num_loops; ++i) in = UnpackBits::unpack3_32(in, out + i * 32); + break; + case 4: + for (int i = 0; i < num_loops; ++i) in = UnpackBits::unpack4_32(in, out + i * 32); + break; + case 5: + for (int i = 0; i < num_loops; ++i) in = UnpackBits::unpack5_32(in, out + i * 32); + break; + case 6: + for (int i = 0; i < num_loops; ++i) in = UnpackBits::unpack6_32(in, out + i * 32); + break; + case 7: + for (int i = 0; i < num_loops; ++i) in = UnpackBits::unpack7_32(in, out + i * 32); + break; + case 8: + for (int i = 0; i < num_loops; ++i) in = UnpackBits::unpack8_32(in, out + i * 32); + break; + case 9: + for (int i = 0; i < num_loops; ++i) in = UnpackBits::unpack9_32(in, out + i * 32); + break; + case 10: + for (int i = 0; i < num_loops; ++i) in = UnpackBits::unpack10_32(in, out + i * 32); + break; + case 11: + for (int i = 0; i < num_loops; ++i) in = UnpackBits::unpack11_32(in, out + i * 32); + break; + case 12: + for (int i = 0; i < num_loops; ++i) in = UnpackBits::unpack12_32(in, out + i * 32); + break; + case 13: + for (int i = 0; i < num_loops; ++i) in = UnpackBits::unpack13_32(in, out + i * 32); + break; + case 14: + for (int i = 0; i < num_loops; ++i) in = UnpackBits::unpack14_32(in, out + i * 32); + break; + case 15: + for (int i = 0; i < num_loops; ++i) in = UnpackBits::unpack15_32(in, out + i * 32); + break; + 
case 16: + for (int i = 0; i < num_loops; ++i) in = UnpackBits::unpack16_32(in, out + i * 32); + break; + case 17: + for (int i = 0; i < num_loops; ++i) in = UnpackBits::unpack17_32(in, out + i * 32); + break; + case 18: + for (int i = 0; i < num_loops; ++i) in = UnpackBits::unpack18_32(in, out + i * 32); + break; + case 19: + for (int i = 0; i < num_loops; ++i) in = UnpackBits::unpack19_32(in, out + i * 32); + break; + case 20: + for (int i = 0; i < num_loops; ++i) in = UnpackBits::unpack20_32(in, out + i * 32); + break; + case 21: + for (int i = 0; i < num_loops; ++i) in = UnpackBits::unpack21_32(in, out + i * 32); + break; + case 22: + for (int i = 0; i < num_loops; ++i) in = UnpackBits::unpack22_32(in, out + i * 32); + break; + case 23: + for (int i = 0; i < num_loops; ++i) in = UnpackBits::unpack23_32(in, out + i * 32); + break; + case 24: + for (int i = 0; i < num_loops; ++i) in = UnpackBits::unpack24_32(in, out + i * 32); + break; + case 25: + for (int i = 0; i < num_loops; ++i) in = UnpackBits::unpack25_32(in, out + i * 32); + break; + case 26: + for (int i = 0; i < num_loops; ++i) in = UnpackBits::unpack26_32(in, out + i * 32); + break; + case 27: + for (int i = 0; i < num_loops; ++i) in = UnpackBits::unpack27_32(in, out + i * 32); + break; + case 28: + for (int i = 0; i < num_loops; ++i) in = UnpackBits::unpack28_32(in, out + i * 32); + break; + case 29: + for (int i = 0; i < num_loops; ++i) in = UnpackBits::unpack29_32(in, out + i * 32); + break; + case 30: + for (int i = 0; i < num_loops; ++i) in = UnpackBits::unpack30_32(in, out + i * 32); + break; + case 31: + for (int i = 0; i < num_loops; ++i) in = UnpackBits::unpack31_32(in, out + i * 32); + break; + case 32: + for (int i = 0; i < num_loops; ++i) in = UnpackBits::unpack32_32(in, out + i * 32); + break; + default: + DCHECK(false) << "Unsupported num_bits"; + } + + return batch_size; +} + +} // namespace internal +} // namespace arrow diff --git a/cpp/thirdparty/versions.txt 
b/cpp/thirdparty/versions.txt index 01b658d6d47..e5ab78c3822 100644 --- a/cpp/thirdparty/versions.txt +++ b/cpp/thirdparty/versions.txt @@ -52,8 +52,7 @@ ARROW_SNAPPY_BUILD_VERSION=1.1.8 ARROW_THRIFT_BUILD_VERSION=0.13.0 ARROW_THRIFT_BUILD_MD5_CHECKSUM=38a27d391a2b03214b444cb13d5664f1 ARROW_UTF8PROC_BUILD_VERSION=v2.6.1 -# For https://github.com/xtensor-stack/xsimd/pull/419 -ARROW_XSIMD_BUILD_VERSION=e916f3ab1bc513328b627df702226a1d1e2ae3a9 +ARROW_XSIMD_BUILD_VERSION=e9234cd6e6f4428fc260073b2c34ffe86fda1f34 ARROW_ZLIB_BUILD_VERSION=1.2.11 ARROW_ZSTD_BUILD_VERSION=v1.4.8 From 00a443629c00079ea03c0b9f415d74669d2759a7 Mon Sep 17 00:00:00 2001 From: Antoine Pitrou Date: Tue, 13 Apr 2021 18:46:24 +0200 Subject: [PATCH 012/719] ARROW-12357: [Archery] Bump Jinja2 version requirement MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Jinja2 < 2.11 doesn't support passing Path objects for filesystem paths. Closes #10011 from pitrou/ARROW-12357-archery-jinja-req Authored-by: Antoine Pitrou Signed-off-by: Krisztián Szűcs --- dev/archery/setup.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/dev/archery/setup.py b/dev/archery/setup.py index 892e6b2a8bd..0537e8b4d31 100755 --- a/dev/archery/setup.py +++ b/dev/archery/setup.py @@ -24,11 +24,14 @@ if sys.version_info < (3, 6): sys.exit('Python < 3.6 is not supported') +# For pathlib.Path compatibility +jinja_req = 'jinja2>=2.11' + extras = { 'benchmark': ['pandas'], 'docker': ['ruamel.yaml', 'python-dotenv'], - 'release': ['jinja2', 'jira', 'semver', 'gitpython'], - 'crossbow': ['github3.py', 'jinja2', 'pygit2', 'ruamel.yaml', + 'release': [jinja_req, 'jira', 'semver', 'gitpython'], + 'crossbow': ['github3.py', jinja_req, 'pygit2', 'ruamel.yaml', 'setuptools_scm'], } extras['bot'] = extras['crossbow'] + ['pygithub', 'jira'] From a5f3b35168980eb35d5daf77edb2a1611dd71f7d Mon Sep 17 00:00:00 2001 From: Rok Date: Tue, 13 Apr 2021 12:46:08 -0700 Subject: [PATCH 013/719] 
ARROW-11070: [C++][Compute] Implement power kernel This is to resolve [ARROW-11070](https://issues.apache.org/jira/projects/ARROW/issues/ARROW-11070). Closes #9841 from rok/ARROW-11070 Lead-authored-by: Rok Co-authored-by: Yibo Cai Signed-off-by: Neal Richardson --- cpp/src/arrow/compute/api_scalar.cc | 1 + cpp/src/arrow/compute/api_scalar.h | 14 +++ .../compute/kernels/scalar_arithmetic.cc | 87 ++++++++++++++ .../compute/kernels/scalar_arithmetic_test.cc | 110 +++++++++++++++++- docs/source/cpp/compute.rst | 4 + docs/source/python/api/compute.rst | 2 + 6 files changed, 217 insertions(+), 1 deletion(-) diff --git a/cpp/src/arrow/compute/api_scalar.cc b/cpp/src/arrow/compute/api_scalar.cc index f4696fbe02a..d169fd2ebde 100644 --- a/cpp/src/arrow/compute/api_scalar.cc +++ b/cpp/src/arrow/compute/api_scalar.cc @@ -52,6 +52,7 @@ SCALAR_ARITHMETIC_BINARY(Add, "add", "add_checked") SCALAR_ARITHMETIC_BINARY(Subtract, "subtract", "subtract_checked") SCALAR_ARITHMETIC_BINARY(Multiply, "multiply", "multiply_checked") SCALAR_ARITHMETIC_BINARY(Divide, "divide", "divide_checked") +SCALAR_ARITHMETIC_BINARY(Power, "power", "power_checked") // ---------------------------------------------------------------------- // Set-related operations diff --git a/cpp/src/arrow/compute/api_scalar.h b/cpp/src/arrow/compute/api_scalar.h index f59426d8f1b..6032f656c4a 100644 --- a/cpp/src/arrow/compute/api_scalar.h +++ b/cpp/src/arrow/compute/api_scalar.h @@ -204,6 +204,20 @@ Result Divide(const Datum& left, const Datum& right, ArithmeticOptions options = ArithmeticOptions(), ExecContext* ctx = NULLPTR); +/// \brief Raise the values of base array to the power of the exponent array values. +/// Array values must be the same length. If either base or exponent is null the result +/// will be null. 
+/// +/// \param[in] left the base +/// \param[in] right the exponent +/// \param[in] options arithmetic options (enable/disable overflow checking), optional +/// \param[in] ctx the function execution context, optional +/// \return the elementwise base value raised to the power of exponent +ARROW_EXPORT +Result Power(const Datum& left, const Datum& right, + ArithmeticOptions options = ArithmeticOptions(), + ExecContext* ctx = NULLPTR); + /// \brief Compare a numeric array with a scalar. /// /// \param[in] left datum to compare, must be an Array diff --git a/cpp/src/arrow/compute/kernels/scalar_arithmetic.cc b/cpp/src/arrow/compute/kernels/scalar_arithmetic.cc index 7abaa1c1a59..260721b08d9 100644 --- a/cpp/src/arrow/compute/kernels/scalar_arithmetic.cc +++ b/cpp/src/arrow/compute/kernels/scalar_arithmetic.cc @@ -15,6 +15,8 @@ // specific language governing permissions and limitations // under the License. +#include + #include "arrow/compute/kernels/common.h" #include "arrow/util/int_util_internal.h" #include "arrow/util/macros.h" @@ -233,6 +235,70 @@ struct DivideChecked { } }; +struct Power { + ARROW_NOINLINE + static uint64_t IntegerPower(uint64_t base, uint64_t exp) { + // right to left O(logn) power + uint64_t pow = 1; + while (exp) { + pow *= (exp & 1) ? 
base : 1; + base *= base; + exp >>= 1; + } + return pow; + } + + template + static enable_if_integer Call(KernelContext* ctx, T base, T exp) { + if (exp < 0) { + ctx->SetStatus( + Status::Invalid("integers to negative integer powers are not allowed")); + return 0; + } + return static_cast(IntegerPower(base, exp)); + } + + template + static enable_if_floating_point Call(KernelContext* ctx, T base, T exp) { + return std::pow(base, exp); + } +}; + +struct PowerChecked { + template + static enable_if_integer Call(KernelContext* ctx, Arg0 base, Arg1 exp) { + if (exp < 0) { + ctx->SetStatus( + Status::Invalid("integers to negative integer powers are not allowed")); + return 0; + } else if (exp == 0) { + return 1; + } + // left to right O(logn) power with overflow checks + bool overflow = false; + uint64_t bitmask = + 1ULL << (63 - BitUtil::CountLeadingZeros(static_cast(exp))); + T pow = 1; + while (bitmask) { + overflow |= MultiplyWithOverflow(pow, pow, &pow); + if (exp & bitmask) { + overflow |= MultiplyWithOverflow(pow, base, &pow); + } + bitmask >>= 1; + } + if (overflow) { + ctx->SetStatus(Status::Invalid("overflow")); + } + return pow; + } + + template + static enable_if_floating_point Call(KernelContext* ctx, Arg0 base, Arg1 exp) { + static_assert(std::is_same::value && std::is_same::value, ""); + return std::pow(base, exp); + } +}; + // Generate a kernel given an arithmetic functor template