From caddfce3dfdcf50629e5abffc6952e002920a5da Mon Sep 17 00:00:00 2001
From: Philipp Moritz <pcmoritz@gmail.com>
Date: Sat, 14 Oct 2017 15:47:18 -0700
Subject: [PATCH 1/6] add support for byte length booleans

---
 cpp/src/arrow/ipc/ipc-read-write-test.cc | 11 +++++++++++
 cpp/src/arrow/ipc/metadata-internal.cc   | 18 ++++++++++++++++--
 cpp/src/arrow/type.cc                    |  5 +++++
 cpp/src/arrow/type.h                     | 16 ++++++++++++++++
 cpp/src/arrow/type_fwd.h                 |  2 ++
 cpp/src/arrow/visitor.cc                 |  1 +
 cpp/src/arrow/visitor.h                  |  1 +
 format/Metadata.md                       |  3 +++
 format/Schema.fbs                        |  3 +++
 9 files changed, 58 insertions(+), 2 deletions(-)
diff --git a/cpp/src/arrow/ipc/ipc-read-write-test.cc b/cpp/src/arrow/ipc/ipc-read-write-test.cc
index adf34a9eb54..c3e8b54736d 100644
--- a/cpp/src/arrow/ipc/ipc-read-write-test.cc
+++ b/cpp/src/arrow/ipc/ipc-read-write-test.cc
@@ -728,14 +728,25 @@ TEST_F(TestTensorRoundTrip, BasicRoundtrip) {
 
   std::vector<int64_t> values;
   test::randint<int64_t>(size, 0, 100, &values);
+  std::vector<bool> bool_values;
+  test::randint<bool>(size, 0, 1, &bool_values);
+  std::vector<uint8_t> bool8_values;
+  test::randint<uint8_t>(size, 0, 1, &bool8_values);
 
   auto data = test::GetBufferFromVector(values);
+  std::shared_ptr<Buffer> bool_data;
+  ASSERT_OK(test::GetBitmapFromVector(bool_values, &bool_data));
+  auto bool8_data = test::GetBufferFromVector(bool8_values);
 
   Tensor t0(int64(), data, shape, strides, dim_names);
   Tensor tzero(int64(), data, {}, {}, {});
+  Tensor tbool(boolean(), bool_data, shape, strides, dim_names);
+  Tensor tbool8(boolean8(), bool8_data, shape, strides, dim_names);
 
   CheckTensorRoundTrip(t0);
   CheckTensorRoundTrip(tzero);
+  CheckTensorRoundTrip(tbool);
+  CheckTensorRoundTrip(tbool8);
 
   int64_t serialized_size;
   ASSERT_OK(GetTensorSize(t0, &serialized_size));
diff --git a/cpp/src/arrow/ipc/metadata-internal.cc b/cpp/src/arrow/ipc/metadata-internal.cc
index 162afb94bfc..48e23061cc7 100644
--- a/cpp/src/arrow/ipc/metadata-internal.cc
+++ b/cpp/src/arrow/ipc/metadata-internal.cc
@@ -249,9 +249,15 @@ static Status TypeFromFlatbuffer(flatbuf::Type type, const void* type_data,
     case flatbuf::Type_Utf8:
       *out = utf8();
       return Status::OK();
-    case flatbuf::Type_Bool:
-      *out = boolean();
+    case flatbuf::Type_Bool: {
+      auto bool_type = static_cast<const flatbuf::Bool*>(type_data);
+      if (bool_type->is_byte()) {
+        *out = boolean8();
+      } else {
+        *out = boolean();
+      }
       return Status::OK();
+    }
     case flatbuf::Type_Decimal: {
       auto dec_type = static_cast<const flatbuf::Decimal*>(type_data);
       *out = decimal(dec_type->precision(), dec_type->scale());
@@ -458,6 +464,14 @@ static Status TypeToFlatbuffer(FBB& fbb, const DataType& type,
 static Status TensorTypeToFlatbuffer(FBB& fbb, const DataType& type,
                                      flatbuf::Type* out_type, Offset* offset) {
   switch (type.id()) {
+    case Type::BOOL:
+      *out_type = flatbuf::Type_Bool;
+      *offset = flatbuf::CreateBool(fbb).Union();
+      break;
+    case Type::BOOL8:
+      *out_type = flatbuf::Type_Bool;
+      *offset = flatbuf::CreateBool(fbb, true).Union();
+      break;
     case Type::UINT8:
       INT_TO_FB_CASE(8, false);
     case Type::INT8:
diff --git a/cpp/src/arrow/type.cc b/cpp/src/arrow/type.cc
index a9bf5919185..70b80658425 100644
--- a/cpp/src/arrow/type.cc
+++ b/cpp/src/arrow/type.cc
@@ -93,6 +93,8 @@ bool DataType::Equals(const std::shared_ptr<DataType>& other) const {
 
 std::string BooleanType::ToString() const { return name(); }
 
+std::string Boolean8Type::ToString() const { return name(); }
+
 FloatingPoint::Precision HalfFloatType::precision() const { return FloatingPoint::HALF; }
 
 FloatingPoint::Precision FloatType::precision() const { return FloatingPoint::SINGLE; }
@@ -368,6 +370,7 @@ std::shared_ptr<Schema> schema(std::vector<std::shared_ptr<Field>>&& fields,
 
 ACCEPT_VISITOR(NullType);
 ACCEPT_VISITOR(BooleanType);
+ACCEPT_VISITOR(Boolean8Type);
 ACCEPT_VISITOR(BinaryType);
 ACCEPT_VISITOR(FixedSizeBinaryType);
 ACCEPT_VISITOR(StringType);
@@ -391,6 +394,7 @@ ACCEPT_VISITOR(DictionaryType);
 
 TYPE_FACTORY(null, NullType);
 TYPE_FACTORY(boolean, BooleanType);
+TYPE_FACTORY(boolean8, Boolean8Type);
 TYPE_FACTORY(int8, Int8Type);
 TYPE_FACTORY(uint8, UInt8Type);
 TYPE_FACTORY(int16, Int16Type);
@@ -464,6 +468,7 @@ static const BufferDescr kValidityBuffer(BufferType::VALIDITY, 1);
 static const BufferDescr kOffsetBuffer(BufferType::OFFSET, 32);
 static const BufferDescr kTypeBuffer(BufferType::TYPE, 32);
 static const BufferDescr kBooleanBuffer(BufferType::DATA, 1);
+static const BufferDescr kBoolean8Buffer(BufferType::DATA, 8);
 static const BufferDescr kValues64(BufferType::DATA, 64);
 static const BufferDescr kValues32(BufferType::DATA, 32);
 static const BufferDescr kValues16(BufferType::DATA, 16);
diff --git a/cpp/src/arrow/type.h b/cpp/src/arrow/type.h
index 443828423e7..90b0d9db3a4 100644
--- a/cpp/src/arrow/type.h
+++ b/cpp/src/arrow/type.h
@@ -49,6 +49,9 @@ struct Type {
     /// Boolean as 1 bit, LSB bit-packed ordering
     BOOL,
 
+    /// Boolean as 1 byte
+    BOOL8,
+
     /// Unsigned 8-bit little-endian integer
     UINT8,
 
@@ -332,6 +335,19 @@ class ARROW_EXPORT BooleanType : public FixedWidthType, public NoExtraMeta {
   std::string name() const override { return "bool"; }
 };
 
+class ARROW_EXPORT Boolean8Type : public FixedWidthType, public NoExtraMeta {
+ public:
+  static constexpr Type::type type_id = Type::BOOL8;
+
+  Boolean8Type() : FixedWidthType(Type::BOOL8) {}
+
+  Status Accept(TypeVisitor* visitor) const override;
+  std::string ToString() const override;
+
+  int bit_width() const override { return 8; }
+  std::string name() const override { return "bool8"; }
+};
+
 class ARROW_EXPORT UInt8Type
     : public detail::IntegerTypeImpl<UInt8Type, Type::UINT8, uint8_t> {
  public:
diff --git a/cpp/src/arrow/type_fwd.h b/cpp/src/arrow/type_fwd.h
index 0d06b6f6cb8..8b5648a5037 100644
--- a/cpp/src/arrow/type_fwd.h
+++ b/cpp/src/arrow/type_fwd.h
@@ -45,6 +45,7 @@ class NullArray;
 class NullBuilder;
 
 class BooleanType;
+class Boolean8Type;
 class BooleanArray;
 class BooleanBuilder;
 
@@ -132,6 +133,7 @@ using IntervalArray = NumericArray<IntervalType>;
 
 std::shared_ptr<DataType> ARROW_EXPORT null();
 std::shared_ptr<DataType> ARROW_EXPORT boolean();
+std::shared_ptr<DataType> ARROW_EXPORT boolean8();
 std::shared_ptr<DataType> ARROW_EXPORT int8();
 std::shared_ptr<DataType> ARROW_EXPORT int16();
 std::shared_ptr<DataType> ARROW_EXPORT int32();
diff --git a/cpp/src/arrow/visitor.cc b/cpp/src/arrow/visitor.cc
index a7b01b0f631..4c262231ea5 100644
--- a/cpp/src/arrow/visitor.cc
+++ b/cpp/src/arrow/visitor.cc
@@ -70,6 +70,7 @@ ARRAY_VISITOR_DEFAULT(DecimalArray);
 
 TYPE_VISITOR_DEFAULT(NullType);
 TYPE_VISITOR_DEFAULT(BooleanType);
+TYPE_VISITOR_DEFAULT(Boolean8Type);
 TYPE_VISITOR_DEFAULT(Int8Type);
 TYPE_VISITOR_DEFAULT(Int16Type);
 TYPE_VISITOR_DEFAULT(Int32Type);
diff --git a/cpp/src/arrow/visitor.h b/cpp/src/arrow/visitor.h
index 6c36e465ec4..510d5f012ae 100644
--- a/cpp/src/arrow/visitor.h
+++ b/cpp/src/arrow/visitor.h
@@ -63,6 +63,7 @@ class ARROW_EXPORT TypeVisitor {
 
   virtual Status Visit(const NullType& type);
   virtual Status Visit(const BooleanType& type);
+  virtual Status Visit(const Boolean8Type& type);
   virtual Status Visit(const Int8Type& type);
   virtual Status Visit(const Int16Type& type);
   virtual Status Visit(const Int32Type& type);
diff --git a/format/Metadata.md b/format/Metadata.md
index 80ca08ae13f..bd157406125 100644
--- a/format/Metadata.md
+++ b/format/Metadata.md
@@ -352,6 +352,9 @@ table FloatingPoint {
 
 The Boolean logical type is represented as a 1-bit wide primitive physical
 type. The bits are numbered using least-significant bit (LSB) ordering.
+Inside of tensors, boolean data may also be represented as
+a 1-byte wide primitive physical type; in this case the
+flag `is_byte` is set.
 
 Like other fixed bit-width primitive types, boolean data appears as 2 buffers
 in the data header (one bitmap for the validity vector and one for the values).
diff --git a/format/Schema.fbs b/format/Schema.fbs
index 186f8e362bd..b9baed96139 100644
--- a/format/Schema.fbs
+++ b/format/Schema.fbs
@@ -108,6 +108,9 @@ table FixedSizeBinary {
 }
 
 table Bool {
+  /// If this flag is set, the bool is represented as a byte,
+  /// by default it is represented as a bit.
+  is_byte: bool;
 }
 
 table Decimal {

From 246f99417fb5b29db9c40630995171f307668391 Mon Sep 17 00:00:00 2001
From: Philipp Moritz <pcmoritz@gmail.com>
Date: Sat, 14 Oct 2017 15:52:20 -0700
Subject: [PATCH 2/6] update

---
 cpp/src/arrow/type.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cpp/src/arrow/type.h b/cpp/src/arrow/type.h
index 90b0d9db3a4..d2e9ecc7d9a 100644
--- a/cpp/src/arrow/type.h
+++ b/cpp/src/arrow/type.h
@@ -49,7 +49,7 @@ struct Type {
     /// Boolean as 1 bit, LSB bit-packed ordering
     BOOL,
 
-    /// Boolean as 1 byte
+    /// Boolean as 1 byte (may only be used in Tensors)
     BOOL8,
 
     /// Unsigned 8-bit little-endian integer

From 92a5be122f88344fa05a1331d0cba6305afe0211 Mon Sep 17 00:00:00 2001
From: Philipp Moritz <pcmoritz@gmail.com>
Date: Sat, 14 Oct 2017 16:27:41 -0700
Subject: [PATCH 3/6] add BOOL and BOOL8 to is_tensor_supported

---
 cpp/src/arrow/tensor.h | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/cpp/src/arrow/tensor.h b/cpp/src/arrow/tensor.h
index 4e4c6b8d57c..cc622948f9b 100644
--- a/cpp/src/arrow/tensor.h
+++ b/cpp/src/arrow/tensor.h
@@ -32,6 +32,8 @@ namespace arrow {
 
 static inline bool is_tensor_supported(Type::type type_id) {
   switch (type_id) {
+    case Type::BOOL:
+    case Type::BOOL8:
     case Type::UINT8:
     case Type::INT8:
     case Type::UINT16:

From fcde2f7baca56f3841193abd7093e3a2a401c225 Mon Sep 17 00:00:00 2001
From: Philipp Moritz <pcmoritz@gmail.com>
Date: Sat, 14 Oct 2017 17:04:41 -0700
Subject: [PATCH 4/6] fix valgrind

---
 cpp/src/arrow/ipc/ipc-read-write-test.cc | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/cpp/src/arrow/ipc/ipc-read-write-test.cc b/cpp/src/arrow/ipc/ipc-read-write-test.cc
index c3e8b54736d..b60f3af39b5 100644
--- a/cpp/src/arrow/ipc/ipc-read-write-test.cc
+++ b/cpp/src/arrow/ipc/ipc-read-write-test.cc
@@ -740,8 +740,8 @@ TEST_F(TestTensorRoundTrip, BasicRoundtrip) {
 
   Tensor t0(int64(), data, shape, strides, dim_names);
   Tensor tzero(int64(), data, {}, {}, {});
-  Tensor tbool(boolean(), bool_data, shape, strides, dim_names);
-  Tensor tbool8(boolean8(), bool8_data, shape, strides, dim_names);
+  Tensor tbool(boolean(), bool_data, {}, {}, {});
+  Tensor tbool8(boolean8(), bool8_data, {}, {}, {});
 
   CheckTensorRoundTrip(t0);
   CheckTensorRoundTrip(tzero);

From 32085460a29dcb62380a5b49780682bda1245b09 Mon Sep 17 00:00:00 2001
From: Philipp Moritz <pcmoritz@gmail.com>
Date: Sat, 14 Oct 2017 18:10:49 -0700
Subject: [PATCH 5/6] fix tensor comparison function

---
 cpp/src/arrow/compare.cc | 18 +++++++++++++-----
 1 file changed, 13 insertions(+), 5 deletions(-)

diff --git a/cpp/src/arrow/compare.cc b/cpp/src/arrow/compare.cc
index 2ec86c3695a..552e1d7ca4b 100644
--- a/cpp/src/arrow/compare.cc
+++ b/cpp/src/arrow/compare.cc
@@ -740,19 +740,27 @@ bool TensorEquals(const Tensor& left, const Tensor& right) {
         are_equal = false;
       } else {
         const auto& type = static_cast<const FixedWidthType&>(*left.type());
+        // Type::BOOL strided tensors are currently not supported
+        DCHECK_GT(type.bit_width() / CHAR_BIT, 0);
         are_equal =
-            StridedTensorContentEquals(0, 0, 0, type.bit_width() / 8, left, right);
+            StridedTensorContentEquals(0, 0, 0, type.bit_width() / CHAR_BIT, left, right);
       }
     } else {
       const auto& size_meta = dynamic_cast<const FixedWidthType&>(*left.type());
-      const int byte_width = size_meta.bit_width() / CHAR_BIT;
-      DCHECK_GT(byte_width, 0);
 
       const uint8_t* left_data = left.data()->data();
       const uint8_t* right_data = right.data()->data();
 
-      are_equal = memcmp(left_data, right_data,
-                         static_cast<size_t>(byte_width * left.size())) == 0;
+      if (size_meta.bit_width() == 1) {
+        int64_t bytes = (left.size() + CHAR_BIT - 1) / CHAR_BIT;
+        are_equal = memcmp(left_data, right_data,
+                           static_cast<size_t>(bytes)) == 0;
+      } else {
+        const int byte_width = size_meta.bit_width() / CHAR_BIT;
+        DCHECK_GT(byte_width, 0);
+        are_equal = memcmp(left_data, right_data,
+                           static_cast<size_t>(byte_width * left.size())) == 0;
+      }
     }
   }
   return are_equal;

From 58ff6ebc5e402eaa976dcb37ac056e8c43527c33 Mon Sep 17 00:00:00 2001
From: Philipp Moritz <pcmoritz@gmail.com>
Date: Sat, 14 Oct 2017 18:52:19 -0700
Subject: [PATCH 6/6] fix windows build

---
 cpp/src/arrow/ipc/ipc-read-write-test.cc | 2 +-
 cpp/src/arrow/test-util.h                | 9 +++++++++
 2 files changed, 10 insertions(+), 1 deletion(-)

diff --git a/cpp/src/arrow/ipc/ipc-read-write-test.cc b/cpp/src/arrow/ipc/ipc-read-write-test.cc
index b60f3af39b5..fbbcf3dd48d 100644
--- a/cpp/src/arrow/ipc/ipc-read-write-test.cc
+++ b/cpp/src/arrow/ipc/ipc-read-write-test.cc
@@ -729,7 +729,7 @@ TEST_F(TestTensorRoundTrip, BasicRoundtrip) {
   std::vector<int64_t> values;
   test::randint<int64_t>(size, 0, 100, &values);
   std::vector<bool> bool_values;
-  test::randint<bool>(size, 0, 1, &bool_values);
+  test::randbool(size, &bool_values);
   std::vector<uint8_t> bool8_values;
   test::randint<uint8_t>(size, 0, 1, &bool8_values);
 
diff --git a/cpp/src/arrow/test-util.h b/cpp/src/arrow/test-util.h
index 80e4feb6c32..553a319d091 100644
--- a/cpp/src/arrow/test-util.h
+++ b/cpp/src/arrow/test-util.h
@@ -78,6 +78,15 @@ using ArrayVector = std::vector<std::shared_ptr<Array>>;
 
 namespace test {
 
+void randbool(int64_t N, std::vector<bool>* out) {
+  Random rng(random_seed());
+  bool val;
+  for (int64_t i = 0; i < N; ++i) {
+    val = rng.OneIn(2);
+    out->push_back(val);
+  }
+}
+
 template <typename T>
 void randint(int64_t N, T lower, T upper, std::vector<T>* out) {
   Random rng(random_seed());