From caddfce3dfdcf50629e5abffc6952e002920a5da Mon Sep 17 00:00:00 2001 From: Philipp Moritz Date: Sat, 14 Oct 2017 15:47:18 -0700 Subject: [PATCH 1/6] add support for byte length booleans --- cpp/src/arrow/ipc/ipc-read-write-test.cc | 11 +++++++++++ cpp/src/arrow/ipc/metadata-internal.cc | 18 ++++++++++++++++-- cpp/src/arrow/type.cc | 5 +++++ cpp/src/arrow/type.h | 16 ++++++++++++++++ cpp/src/arrow/type_fwd.h | 2 ++ cpp/src/arrow/visitor.cc | 1 + cpp/src/arrow/visitor.h | 1 + format/Metadata.md | 3 +++ format/Schema.fbs | 3 +++ 9 files changed, 58 insertions(+), 2 deletions(-) diff --git a/cpp/src/arrow/ipc/ipc-read-write-test.cc b/cpp/src/arrow/ipc/ipc-read-write-test.cc index adf34a9eb54..c3e8b54736d 100644 --- a/cpp/src/arrow/ipc/ipc-read-write-test.cc +++ b/cpp/src/arrow/ipc/ipc-read-write-test.cc @@ -728,14 +728,25 @@ TEST_F(TestTensorRoundTrip, BasicRoundtrip) { std::vector values; test::randint(size, 0, 100, &values); + std::vector bool_values; + test::randint(size, 0, 1, &bool_values); + std::vector bool8_values; + test::randint(size, 0, 1, &bool8_values); auto data = test::GetBufferFromVector(values); + std::shared_ptr bool_data; + ASSERT_OK(test::GetBitmapFromVector(bool_values, &bool_data)); + auto bool8_data = test::GetBufferFromVector(bool8_values); Tensor t0(int64(), data, shape, strides, dim_names); Tensor tzero(int64(), data, {}, {}, {}); + Tensor tbool(boolean(), bool_data, shape, strides, dim_names); + Tensor tbool8(boolean8(), bool8_data, shape, strides, dim_names); CheckTensorRoundTrip(t0); CheckTensorRoundTrip(tzero); + CheckTensorRoundTrip(tbool); + CheckTensorRoundTrip(tbool8); int64_t serialized_size; ASSERT_OK(GetTensorSize(t0, &serialized_size)); diff --git a/cpp/src/arrow/ipc/metadata-internal.cc b/cpp/src/arrow/ipc/metadata-internal.cc index 162afb94bfc..48e23061cc7 100644 --- a/cpp/src/arrow/ipc/metadata-internal.cc +++ b/cpp/src/arrow/ipc/metadata-internal.cc @@ -249,9 +249,15 @@ static Status TypeFromFlatbuffer(flatbuf::Type type, const void* type_data, case flatbuf::Type_Utf8: *out = utf8(); return Status::OK(); - case flatbuf::Type_Bool: - *out = boolean(); + case flatbuf::Type_Bool: { + auto bool_type = static_cast(type_data); + if (bool_type->is_byte()) { + *out = boolean8(); + } else { + *out = boolean(); + } return Status::OK(); + } case flatbuf::Type_Decimal: { auto dec_type = static_cast(type_data); *out = decimal(dec_type->precision(), dec_type->scale()); @@ -458,6 +464,14 @@ static Status TypeToFlatbuffer(FBB& fbb, const DataType& type, static Status TensorTypeToFlatbuffer(FBB& fbb, const DataType& type, flatbuf::Type* out_type, Offset* offset) { switch (type.id()) { + case Type::BOOL: + *out_type = flatbuf::Type_Bool; + *offset = flatbuf::CreateBool(fbb).Union(); + break; + case Type::BOOL8: + *out_type = flatbuf::Type_Bool; + *offset = flatbuf::CreateBool(fbb, true).Union(); + break; case Type::UINT8: INT_TO_FB_CASE(8, false); case Type::INT8: diff --git a/cpp/src/arrow/type.cc b/cpp/src/arrow/type.cc index a9bf5919185..70b80658425 100644 --- a/cpp/src/arrow/type.cc +++ b/cpp/src/arrow/type.cc @@ -93,6 +93,8 @@ bool DataType::Equals(const std::shared_ptr& other) const { std::string BooleanType::ToString() const { return name(); } +std::string Boolean8Type::ToString() const { return name(); } + FloatingPoint::Precision HalfFloatType::precision() const { return FloatingPoint::HALF; } FloatingPoint::Precision FloatType::precision() const { return FloatingPoint::SINGLE; } @@ -368,6 +370,7 @@ std::shared_ptr schema(std::vector>&& fields, ACCEPT_VISITOR(NullType); ACCEPT_VISITOR(BooleanType); +ACCEPT_VISITOR(Boolean8Type); ACCEPT_VISITOR(BinaryType); ACCEPT_VISITOR(FixedSizeBinaryType); ACCEPT_VISITOR(StringType); @@ -391,6 +394,7 @@ ACCEPT_VISITOR(DictionaryType); TYPE_FACTORY(null, NullType); TYPE_FACTORY(boolean, BooleanType); +TYPE_FACTORY(boolean8, Boolean8Type); TYPE_FACTORY(int8, Int8Type); TYPE_FACTORY(uint8, UInt8Type); TYPE_FACTORY(int16, Int16Type); @@ -464,6 +468,7 @@ static const BufferDescr kValidityBuffer(BufferType::VALIDITY, 1); static const BufferDescr kOffsetBuffer(BufferType::OFFSET, 32); static const BufferDescr kTypeBuffer(BufferType::TYPE, 32); static const BufferDescr kBooleanBuffer(BufferType::DATA, 1); +static const BufferDescr kBoolean8Buffer(BufferType::DATA, 8); static const BufferDescr kValues64(BufferType::DATA, 64); static const BufferDescr kValues32(BufferType::DATA, 32); static const BufferDescr kValues16(BufferType::DATA, 16); diff --git a/cpp/src/arrow/type.h b/cpp/src/arrow/type.h index 443828423e7..90b0d9db3a4 100644 --- a/cpp/src/arrow/type.h +++ b/cpp/src/arrow/type.h @@ -49,6 +49,9 @@ struct Type { /// Boolean as 1 bit, LSB bit-packed ordering BOOL, + /// Boolean as 1 byte + BOOL8, + /// Unsigned 8-bit little-endian integer UINT8, @@ -332,6 +335,19 @@ class ARROW_EXPORT BooleanType : public FixedWidthType, public NoExtraMeta { std::string name() const override { return "bool"; } }; +class ARROW_EXPORT Boolean8Type : public FixedWidthType, public NoExtraMeta { + public: + static constexpr Type::type type_id = Type::BOOL8; + + Boolean8Type() : FixedWidthType(Type::BOOL8) {} + + Status Accept(TypeVisitor* visitor) const override; + std::string ToString() const override; + + int bit_width() const override { return 8; } + std::string name() const override { return "bool8"; } +}; + class ARROW_EXPORT UInt8Type : public detail::IntegerTypeImpl { public: diff --git a/cpp/src/arrow/type_fwd.h b/cpp/src/arrow/type_fwd.h index 0d06b6f6cb8..8b5648a5037 100644 --- a/cpp/src/arrow/type_fwd.h +++ b/cpp/src/arrow/type_fwd.h @@ -45,6 +45,7 @@ class NullArray; class NullBuilder; class BooleanType; +class Boolean8Type; class BooleanArray; class BooleanBuilder; @@ -132,6 +133,7 @@ using IntervalArray = NumericArray; std::shared_ptr ARROW_EXPORT null(); std::shared_ptr ARROW_EXPORT boolean(); +std::shared_ptr ARROW_EXPORT boolean8(); std::shared_ptr ARROW_EXPORT int8(); std::shared_ptr ARROW_EXPORT int16(); std::shared_ptr ARROW_EXPORT int32(); diff --git a/cpp/src/arrow/visitor.cc b/cpp/src/arrow/visitor.cc index a7b01b0f631..4c262231ea5 100644 --- a/cpp/src/arrow/visitor.cc +++ b/cpp/src/arrow/visitor.cc @@ -70,6 +70,7 @@ ARRAY_VISITOR_DEFAULT(DecimalArray); TYPE_VISITOR_DEFAULT(NullType); TYPE_VISITOR_DEFAULT(BooleanType); +TYPE_VISITOR_DEFAULT(Boolean8Type); TYPE_VISITOR_DEFAULT(Int8Type); TYPE_VISITOR_DEFAULT(Int16Type); TYPE_VISITOR_DEFAULT(Int32Type); diff --git a/cpp/src/arrow/visitor.h b/cpp/src/arrow/visitor.h index 6c36e465ec4..510d5f012ae 100644 --- a/cpp/src/arrow/visitor.h +++ b/cpp/src/arrow/visitor.h @@ -63,6 +63,7 @@ class ARROW_EXPORT TypeVisitor { virtual Status Visit(const NullType& type); virtual Status Visit(const BooleanType& type); + virtual Status Visit(const Boolean8Type& type); virtual Status Visit(const Int8Type& type); virtual Status Visit(const Int16Type& type); virtual Status Visit(const Int32Type& type); diff --git a/format/Metadata.md b/format/Metadata.md index 80ca08ae13f..bd157406125 100644 --- a/format/Metadata.md +++ b/format/Metadata.md @@ -352,6 +352,9 @@ table FloatingPoint { The Boolean logical type is represented as a 1-bit wide primitive physical type. The bits are numbered using least-significant bit (LSB) ordering. +Inside of tensors, boolean data may also be represented as +a 1-byte wide primitive physical type; in this case the +flag `is_byte` is set. Like other fixed bit-width primitive types, boolean data appears as 2 buffers in the data header (one bitmap for the validity vector and one for the values). diff --git a/format/Schema.fbs b/format/Schema.fbs index 186f8e362bd..b9baed96139 100644 --- a/format/Schema.fbs +++ b/format/Schema.fbs @@ -108,6 +108,9 @@ table FixedSizeBinary { } table Bool { + /// If this flag is set, the bool is represented as a byte, + /// by default it is represented as a bit. + is_byte: bool; } table Decimal { From 246f99417fb5b29db9c40630995171f307668391 Mon Sep 17 00:00:00 2001 From: Philipp Moritz Date: Sat, 14 Oct 2017 15:52:20 -0700 Subject: [PATCH 2/6] update --- cpp/src/arrow/type.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/arrow/type.h b/cpp/src/arrow/type.h index 90b0d9db3a4..d2e9ecc7d9a 100644 --- a/cpp/src/arrow/type.h +++ b/cpp/src/arrow/type.h @@ -49,7 +49,7 @@ struct Type { /// Boolean as 1 bit, LSB bit-packed ordering BOOL, - /// Boolean as 1 byte + /// Boolean as 1 byte (may only be used in Tensors) BOOL8, /// Unsigned 8-bit little-endian integer From 92a5be122f88344fa05a1331d0cba6305afe0211 Mon Sep 17 00:00:00 2001 From: Philipp Moritz Date: Sat, 14 Oct 2017 16:27:41 -0700 Subject: [PATCH 3/6] add BOOL and BOOL8 to is_tensor_supported --- cpp/src/arrow/tensor.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/cpp/src/arrow/tensor.h b/cpp/src/arrow/tensor.h index 4e4c6b8d57c..cc622948f9b 100644 --- a/cpp/src/arrow/tensor.h +++ b/cpp/src/arrow/tensor.h @@ -32,6 +32,8 @@ namespace arrow { static inline bool is_tensor_supported(Type::type type_id) { switch (type_id) { + case Type::BOOL: + case Type::BOOL8: case Type::UINT8: case Type::INT8: case Type::UINT16: From fcde2f7baca56f3841193abd7093e3a2a401c225 Mon Sep 17 00:00:00 2001 From: Philipp Moritz Date: Sat, 14 Oct 2017 17:04:41 -0700 Subject: [PATCH 4/6] fix valgrind --- cpp/src/arrow/ipc/ipc-read-write-test.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/src/arrow/ipc/ipc-read-write-test.cc b/cpp/src/arrow/ipc/ipc-read-write-test.cc index c3e8b54736d..b60f3af39b5 100644 --- a/cpp/src/arrow/ipc/ipc-read-write-test.cc +++ b/cpp/src/arrow/ipc/ipc-read-write-test.cc @@ -740,8 +740,8 @@ TEST_F(TestTensorRoundTrip, BasicRoundtrip) { Tensor t0(int64(), data, shape, strides, dim_names); Tensor tzero(int64(), data, {}, {}, {}); - Tensor tbool(boolean(), bool_data, shape, strides, dim_names); - Tensor tbool8(boolean8(), bool8_data, shape, strides, dim_names); + Tensor tbool(boolean(), bool_data, {}, {}, {}); + Tensor tbool8(boolean8(), bool8_data, {}, {}, {}); CheckTensorRoundTrip(t0); CheckTensorRoundTrip(tzero); From 32085460a29dcb62380a5b49780682bda1245b09 Mon Sep 17 00:00:00 2001 From: Philipp Moritz Date: Sat, 14 Oct 2017 18:10:49 -0700 Subject: [PATCH 5/6] fix tensor comparison function --- cpp/src/arrow/compare.cc | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/cpp/src/arrow/compare.cc b/cpp/src/arrow/compare.cc index 2ec86c3695a..552e1d7ca4b 100644 --- a/cpp/src/arrow/compare.cc +++ b/cpp/src/arrow/compare.cc @@ -740,19 +740,27 @@ bool TensorEquals(const Tensor& left, const Tensor& right) { are_equal = false; } else { const auto& type = static_cast(*left.type()); + // Type::BOOL strided tensors are currently not supported + DCHECK_GT(type.bit_width() / CHAR_BIT, 0); are_equal = - StridedTensorContentEquals(0, 0, 0, type.bit_width() / 8, left, right); + StridedTensorContentEquals(0, 0, 0, type.bit_width() / CHAR_BIT, left, right); } } else { const auto& size_meta = dynamic_cast(*left.type()); - const int byte_width = size_meta.bit_width() / CHAR_BIT; - DCHECK_GT(byte_width, 0); const uint8_t* left_data = left.data()->data(); const uint8_t* right_data = right.data()->data(); - are_equal = memcmp(left_data, right_data, - static_cast(byte_width * left.size())) == 0; + if (size_meta.bit_width() == 1) { + int64_t bytes = (left.size() + CHAR_BIT - 1) / CHAR_BIT; + are_equal = memcmp(left_data, right_data, + static_cast(bytes)) == 0; + } else { + const int byte_width = size_meta.bit_width() / CHAR_BIT; + DCHECK_GT(byte_width, 0); + are_equal = memcmp(left_data, right_data, + static_cast(byte_width * left.size())) == 0; + } } } return are_equal; From 58ff6ebc5e402eaa976dcb37ac056e8c43527c33 Mon Sep 17 00:00:00 2001 From: Philipp Moritz Date: Sat, 14 Oct 2017 18:52:19 -0700 Subject: [PATCH 6/6] fix windows build --- cpp/src/arrow/ipc/ipc-read-write-test.cc | 2 +- cpp/src/arrow/test-util.h | 9 +++++++++ 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/cpp/src/arrow/ipc/ipc-read-write-test.cc b/cpp/src/arrow/ipc/ipc-read-write-test.cc index b60f3af39b5..fbbcf3dd48d 100644 --- a/cpp/src/arrow/ipc/ipc-read-write-test.cc +++ b/cpp/src/arrow/ipc/ipc-read-write-test.cc @@ -729,7 +729,7 @@ TEST_F(TestTensorRoundTrip, BasicRoundtrip) { std::vector values; test::randint(size, 0, 100, &values); std::vector bool_values; - test::randint(size, 0, 1, &bool_values); + test::randbool(size, &bool_values); std::vector bool8_values; test::randint(size, 0, 1, &bool8_values); diff --git a/cpp/src/arrow/test-util.h b/cpp/src/arrow/test-util.h index 80e4feb6c32..553a319d091 100644 --- a/cpp/src/arrow/test-util.h +++ b/cpp/src/arrow/test-util.h @@ -78,6 +78,15 @@ using ArrayVector = std::vector>; namespace test { +void randbool(int64_t N, std::vector* out) { + Random rng(random_seed()); + bool val; + for (int64_t i = 0; i < N; ++i) { + val = rng.OneIn(2); + out->push_back(val); + } +} + template void randint(int64_t N, T lower, T upper, std::vector* out) { Random rng(random_seed());