From 8e318d02ee89b2a91cc0a71d8ec5208c2f61fed0 Mon Sep 17 00:00:00 2001 From: Simon Perkins Date: Wed, 9 Jun 2021 16:29:26 +0200 Subject: [PATCH 01/61] Complex Number Extension Type --- cpp/src/arrow/CMakeLists.txt | 1 + cpp/src/arrow/extension_type_test.cc | 28 +++++++- cpp/src/arrow/extensions/complex_type.cc | 88 ++++++++++++++++++++++++ cpp/src/arrow/extensions/complex_type.h | 44 ++++++++++++ cpp/src/arrow/python/numpy_convert.cc | 27 ++++++++ cpp/src/arrow/python/numpy_internal.h | 2 + cpp/src/arrow/python/python_to_arrow.cc | 17 +++++ 7 files changed, 206 insertions(+), 1 deletion(-) create mode 100644 cpp/src/arrow/extensions/complex_type.cc create mode 100644 cpp/src/arrow/extensions/complex_type.h diff --git a/cpp/src/arrow/CMakeLists.txt b/cpp/src/arrow/CMakeLists.txt index f6d5a540c98..333ca6b6b59 100644 --- a/cpp/src/arrow/CMakeLists.txt +++ b/cpp/src/arrow/CMakeLists.txt @@ -155,6 +155,7 @@ set(ARROW_SRCS array/diff.cc array/util.cc array/validate.cc + extensions/complex_type.cc builder.cc buffer.cc chunked_array.cc diff --git a/cpp/src/arrow/extension_type_test.cc b/cpp/src/arrow/extension_type_test.cc index cd1c3b9790e..2aba8512053 100644 --- a/cpp/src/arrow/extension_type_test.cc +++ b/cpp/src/arrow/extension_type_test.cc @@ -27,6 +27,7 @@ #include "arrow/array/array_nested.h" #include "arrow/array/util.h" #include "arrow/extension_type.h" +#include "arrow/extensions/complex_type.h" #include "arrow/io/memory.h" #include "arrow/ipc/options.h" #include "arrow/ipc/reader.h" @@ -178,15 +179,40 @@ class ExtStructType : public ExtensionType { class TestExtensionType : public ::testing::Test { public: - void SetUp() { ASSERT_OK(RegisterExtensionType(std::make_shared())); } + void SetUp() { + ASSERT_OK(RegisterExtensionType(std::make_shared())); + ASSERT_OK(RegisterExtensionType(std::make_shared(float32()))); + } void TearDown() { if (GetExtensionType("uuid")) { ASSERT_OK(UnregisterExtensionType("uuid")); } + if (GetExtensionType("complex")) { + 
ASSERT_OK(UnregisterExtensionType("complex")); + } } }; +TEST_F(TestExtensionType, ComplexTypeTest) { + auto registered_type = GetExtensionType("complex"); + ASSERT_NE(registered_type, nullptr); + + auto type = complex64(); + ASSERT_EQ(type->id(), Type::EXTENSION); + + const auto & ext_type = static_cast(*type); + std::string serialized = ext_type.Serialize(); + + ASSERT_OK_AND_ASSIGN(auto deserialized, + ext_type.Deserialize(fixed_size_list(float32(), 2), serialized)); + + ASSERT_TRUE(deserialized->Equals(*type)); + ASSERT_FALSE(deserialized->Equals(*fixed_size_list(float32(), 2))); + + //auto type2 = complex(int16()); +} + TEST_F(TestExtensionType, ExtensionTypeTest) { auto type_not_exist = GetExtensionType("uuid-unknown"); ASSERT_EQ(type_not_exist, nullptr); diff --git a/cpp/src/arrow/extensions/complex_type.cc b/cpp/src/arrow/extensions/complex_type.cc new file mode 100644 index 00000000000..8a62e40879f --- /dev/null +++ b/cpp/src/arrow/extensions/complex_type.cc @@ -0,0 +1,88 @@ +#include + +#include + +namespace arrow { + +std::shared_ptr ComplexType::MakeType(std::shared_ptr subtype) +{ + return fixed_size_list(FloatCast(subtype), 2); +} + +std::shared_ptr ComplexType::FloatCast(std::shared_ptr subtype) +{ + auto float_type = std::dynamic_pointer_cast(subtype); + + if(!float_type) + { + throw std::runtime_error("ComplexType subtype not floating point"); + } + + if(float_type->precision() != FloatingPointType::SINGLE && + float_type->precision() != FloatingPointType::DOUBLE) + { + throw std::runtime_error("Complex subtype must be single or double precision"); + } + + return float_type; +} + + +std::string ComplexType::name() const { + std::stringstream ss("complex"); + + switch(subtype()->precision()) + { + case FloatingPointType::SINGLE: + ss << "64"; + break; + case FloatingPointType::DOUBLE: + ss << "128"; + break; + case FloatingPointType::HALF: + default: + throw std::runtime_error("Complex Type must be single or double precision"); + break; + } + + 
return ss.str(); +} + +std::string ComplexType::extension_name() const { + return "complex"; +} + + +bool ComplexType::ExtensionEquals(const ExtensionType& other) const { + const auto& other_ext = static_cast(other); + if (other_ext.extension_name() != this->extension_name()) { + return false; + } + return this->subtype() == static_cast(other).subtype(); +} + +Result> ComplexType::Deserialize( + std::shared_ptr storage_type, + const std::string& serialized) const { + + auto ltype = std::static_pointer_cast(storage_type); + return std::make_shared(ltype->value_type()); +} + +std::string ComplexType::Serialize() const { +return ""; +} + +std::shared_ptr complex(std::shared_ptr subtype) { + return std::make_shared(subtype); +} + +std::shared_ptr complex64() { + return std::make_shared(float32()); +} + +std::shared_ptr complex128() { + return std::make_shared(float64()); +} + +}; \ No newline at end of file diff --git a/cpp/src/arrow/extensions/complex_type.h b/cpp/src/arrow/extensions/complex_type.h new file mode 100644 index 00000000000..076ef744da1 --- /dev/null +++ b/cpp/src/arrow/extensions/complex_type.h @@ -0,0 +1,44 @@ +#include "arrow/extension_type.h" + +namespace arrow { + +class ComplexArray : public ExtensionArray { + public: + using ExtensionArray::ExtensionArray; +}; + +class ComplexType : public ExtensionType { + private: + std::shared_ptr subtype_; + + static std::shared_ptr MakeType(std::shared_ptr subtype); + static std::shared_ptr FloatCast(std::shared_ptr subtype); + + public: + explicit ComplexType(std::shared_ptr subtype) : + ExtensionType(MakeType(subtype)), + subtype_(FloatCast(subtype)) {} + + std::shared_ptr subtype() const { return subtype_; } + std::string name() const override; + std::string extension_name() const override; + + bool ExtensionEquals(const ExtensionType& other) const override; + + std::shared_ptr MakeArray(std::shared_ptr data) const override { + return std::make_shared(data); + } + + Result> Deserialize( + std::shared_ptr 
storage_type, + const std::string& serialized) const override; + + std::string Serialize() const override; +}; + +std::shared_ptr complex(std::shared_ptr subtype); +std::shared_ptr complex64(); +std::shared_ptr complex128(); + + +}; \ No newline at end of file diff --git a/cpp/src/arrow/python/numpy_convert.cc b/cpp/src/arrow/python/numpy_convert.cc index bf4afb2a0a1..8202331cddf 100644 --- a/cpp/src/arrow/python/numpy_convert.cc +++ b/cpp/src/arrow/python/numpy_convert.cc @@ -30,6 +30,8 @@ #include "arrow/type.h" #include "arrow/util/logging.h" +#include "arrow/extensions/complex_type.h" + #include "arrow/python/common.h" #include "arrow/python/pyarrow.h" #include "arrow/python/type_traits.h" @@ -84,6 +86,12 @@ Status GetTensorType(PyObject* dtype, std::shared_ptr* out) { TO_ARROW_TYPE_CASE(FLOAT16, float16); TO_ARROW_TYPE_CASE(FLOAT32, float32); TO_ARROW_TYPE_CASE(FLOAT64, float64); + case NPY_COMPLEX64: + *out = complex(float32()); + break; + case NPY_COMPLEX128: + *out = complex(float64()); + break; default: { return Status::NotImplemented("Unsupported numpy type ", descr->type_num); } @@ -109,6 +117,23 @@ Status GetNumPyType(const DataType& type, int* type_num) { NUMPY_TYPE_CASE(HALF_FLOAT, FLOAT16); NUMPY_TYPE_CASE(FLOAT, FLOAT32); NUMPY_TYPE_CASE(DOUBLE, FLOAT64); + case Type::EXTENSION: { + const auto * ptr = dynamic_cast(&type); + + if(ptr == nullptr) { + // continue into the default branch + } else if(ptr->subtype()->Equals(float32())) { + *type_num = NPY_COMPLEX64; + break; + } else if(ptr->subtype()->Equals(float64())) { + *type_num = NPY_COMPLEX128; + break; + } else { + return Status::NotImplemented("Unsupported complex tensor type: ", ptr->ToString()); + break; + } + } + default: { return Status::NotImplemented("Unsupported tensor type: ", type.ToString()); } @@ -144,6 +169,8 @@ Status NumPyDtypeToArrow(PyArray_Descr* descr, std::shared_ptr* out) { TO_ARROW_TYPE_CASE(FLOAT16, float16); TO_ARROW_TYPE_CASE(FLOAT32, float32); TO_ARROW_TYPE_CASE(FLOAT64, 
float64); + TO_ARROW_TYPE_CASE(COMPLEX64, complex64); + TO_ARROW_TYPE_CASE(COMPLEX128, complex128); TO_ARROW_TYPE_CASE(STRING, binary); TO_ARROW_TYPE_CASE(UNICODE, utf8); case NPY_DATETIME: { diff --git a/cpp/src/arrow/python/numpy_internal.h b/cpp/src/arrow/python/numpy_internal.h index f43599eb3eb..58f61d5899e 100644 --- a/cpp/src/arrow/python/numpy_internal.h +++ b/cpp/src/arrow/python/numpy_internal.h @@ -102,6 +102,8 @@ static inline std::string GetNumPyTypeName(int npy_type) { TYPE_CASE(FLOAT16, "float16") TYPE_CASE(FLOAT32, "float32") TYPE_CASE(FLOAT64, "float64") + TYPE_CASE(COMPLEX64, "complex64") + TYPE_CASE(COMPLEX128, "complex128") TYPE_CASE(DATETIME, "datetime64") TYPE_CASE(TIMEDELTA, "timedelta64") TYPE_CASE(OBJECT, "object") diff --git a/cpp/src/arrow/python/python_to_arrow.cc b/cpp/src/arrow/python/python_to_arrow.cc index b2d9f1cb5a3..bf45142f1f7 100644 --- a/cpp/src/arrow/python/python_to_arrow.cc +++ b/cpp/src/arrow/python/python_to_arrow.cc @@ -21,6 +21,7 @@ #include #include +#include #include #include #include @@ -33,6 +34,7 @@ #include "arrow/array/builder_dict.h" #include "arrow/array/builder_nested.h" #include "arrow/array/builder_primitive.h" +#include "arrow/extensions/complex_type.h" #include "arrow/chunked_array.h" #include "arrow/status.h" #include "arrow/type.h" @@ -164,6 +166,21 @@ class PyValue { return value; } + static Result> Convert(const ComplexType*, const O&, I obj) { + std::complex value; + + if (PyComplex_Check(obj)) { + value = std::complex( + PyComplex_RealAsDouble(obj), + PyComplex_ImagAsDouble(obj)); + RETURN_IF_PYERROR(); + } else { + return internal::InvalidValue(obj, "tried to convert to std::complex"); + } + + return value; + }; + static Result Convert(const Decimal128Type* type, const O&, I obj) { Decimal128 value; RETURN_NOT_OK(internal::DecimalFromPyObject(obj, *type, &value)); From bd2cc0b9f0c984c0a708a025955f380fc0894d7f Mon Sep 17 00:00:00 2001 From: Simon Perkins Date: Wed, 9 Jun 2021 17:05:02 +0200 Subject: 
[PATCH 02/61] lint --- cpp/src/arrow/extension_type_test.cc | 4 +- cpp/src/arrow/extensions/complex_type.cc | 98 ++++++++++-------------- cpp/src/arrow/extensions/complex_type.h | 28 ++++--- cpp/src/arrow/python/numpy_convert.cc | 11 +-- cpp/src/arrow/python/python_to_arrow.cc | 7 +- 5 files changed, 66 insertions(+), 82 deletions(-) diff --git a/cpp/src/arrow/extension_type_test.cc b/cpp/src/arrow/extension_type_test.cc index 2aba8512053..e2094e84cc0 100644 --- a/cpp/src/arrow/extension_type_test.cc +++ b/cpp/src/arrow/extension_type_test.cc @@ -201,7 +201,7 @@ TEST_F(TestExtensionType, ComplexTypeTest) { auto type = complex64(); ASSERT_EQ(type->id(), Type::EXTENSION); - const auto & ext_type = static_cast(*type); + const auto& ext_type = static_cast(*type); std::string serialized = ext_type.Serialize(); ASSERT_OK_AND_ASSIGN(auto deserialized, @@ -210,7 +210,7 @@ TEST_F(TestExtensionType, ComplexTypeTest) { ASSERT_TRUE(deserialized->Equals(*type)); ASSERT_FALSE(deserialized->Equals(*fixed_size_list(float32(), 2))); - //auto type2 = complex(int16()); + // auto type2 = complex(int16()); } TEST_F(TestExtensionType, ExtensionTypeTest) { diff --git a/cpp/src/arrow/extensions/complex_type.cc b/cpp/src/arrow/extensions/complex_type.cc index 8a62e40879f..6b1a1bd6b21 100644 --- a/cpp/src/arrow/extensions/complex_type.cc +++ b/cpp/src/arrow/extensions/complex_type.cc @@ -4,85 +4,71 @@ namespace arrow { -std::shared_ptr ComplexType::MakeType(std::shared_ptr subtype) -{ - return fixed_size_list(FloatCast(subtype), 2); +std::shared_ptr ComplexType::MakeType(std::shared_ptr subtype) { + return fixed_size_list(FloatCast(subtype), 2); } -std::shared_ptr ComplexType::FloatCast(std::shared_ptr subtype) -{ - auto float_type = std::dynamic_pointer_cast(subtype); +std::shared_ptr ComplexType::FloatCast( + std::shared_ptr subtype) { + auto float_type = std::dynamic_pointer_cast(subtype); - if(!float_type) - { - throw std::runtime_error("ComplexType subtype not floating point"); - } + 
if (!float_type) { + throw std::runtime_error("ComplexType subtype not floating point"); + } - if(float_type->precision() != FloatingPointType::SINGLE && - float_type->precision() != FloatingPointType::DOUBLE) - { - throw std::runtime_error("Complex subtype must be single or double precision"); - } + if (float_type->precision() != FloatingPointType::SINGLE && + float_type->precision() != FloatingPointType::DOUBLE) { + throw std::runtime_error("Complex subtype must be single or double precision"); + } - return float_type; + return float_type; } - std::string ComplexType::name() const { - std::stringstream ss("complex"); - - switch(subtype()->precision()) - { - case FloatingPointType::SINGLE: - ss << "64"; - break; - case FloatingPointType::DOUBLE: - ss << "128"; - break; - case FloatingPointType::HALF: - default: - throw std::runtime_error("Complex Type must be single or double precision"); - break; - } - - return ss.str(); -} - -std::string ComplexType::extension_name() const { - return "complex"; + std::stringstream ss("complex"); + + switch (subtype()->precision()) { + case FloatingPointType::SINGLE: + ss << "64"; + break; + case FloatingPointType::DOUBLE: + ss << "128"; + break; + case FloatingPointType::HALF: + default: + throw std::runtime_error("Complex Type must be single or double precision"); + break; + } + + return ss.str(); } +std::string ComplexType::extension_name() const { return "complex"; } bool ComplexType::ExtensionEquals(const ExtensionType& other) const { - const auto& other_ext = static_cast(other); - if (other_ext.extension_name() != this->extension_name()) { - return false; - } - return this->subtype() == static_cast(other).subtype(); + const auto& other_ext = static_cast(other); + if (other_ext.extension_name() != this->extension_name()) { + return false; + } + return this->subtype() == static_cast(other).subtype(); } Result> ComplexType::Deserialize( - std::shared_ptr storage_type, - const std::string& serialized) const { - - auto ltype = 
std::static_pointer_cast(storage_type); - return std::make_shared(ltype->value_type()); + std::shared_ptr storage_type, const std::string& serialized) const { + auto ltype = std::static_pointer_cast(storage_type); + return std::make_shared(ltype->value_type()); } -std::string ComplexType::Serialize() const { -return ""; -} +std::string ComplexType::Serialize() const { return ""; } std::shared_ptr complex(std::shared_ptr subtype) { return std::make_shared(subtype); } -std::shared_ptr complex64() { - return std::make_shared(float32()); -} +std::shared_ptr complex64() { return std::make_shared(float32()); } std::shared_ptr complex128() { - return std::make_shared(float64()); + return std::make_shared(float64()); } -}; \ No newline at end of file +}; // namespace arrow \ No newline at end of file diff --git a/cpp/src/arrow/extensions/complex_type.h b/cpp/src/arrow/extensions/complex_type.h index 076ef744da1..1b813c421c0 100644 --- a/cpp/src/arrow/extensions/complex_type.h +++ b/cpp/src/arrow/extensions/complex_type.h @@ -3,25 +3,24 @@ namespace arrow { class ComplexArray : public ExtensionArray { - public: - using ExtensionArray::ExtensionArray; + public: + using ExtensionArray::ExtensionArray; }; class ComplexType : public ExtensionType { - private: - std::shared_ptr subtype_; + private: + std::shared_ptr subtype_; - static std::shared_ptr MakeType(std::shared_ptr subtype); - static std::shared_ptr FloatCast(std::shared_ptr subtype); + static std::shared_ptr MakeType(std::shared_ptr subtype); + static std::shared_ptr FloatCast(std::shared_ptr subtype); - public: - explicit ComplexType(std::shared_ptr subtype) : - ExtensionType(MakeType(subtype)), - subtype_(FloatCast(subtype)) {} + public: + explicit ComplexType(std::shared_ptr subtype) + : ExtensionType(MakeType(subtype)), subtype_(FloatCast(subtype)) {} - std::shared_ptr subtype() const { return subtype_; } - std::string name() const override; - std::string extension_name() const override; + std::shared_ptr 
subtype() const { return subtype_; } + std::string name() const override; + std::string extension_name() const override; bool ExtensionEquals(const ExtensionType& other) const override; @@ -40,5 +39,4 @@ std::shared_ptr complex(std::shared_ptr subtype); std::shared_ptr complex64(); std::shared_ptr complex128(); - -}; \ No newline at end of file +}; // namespace arrow \ No newline at end of file diff --git a/cpp/src/arrow/python/numpy_convert.cc b/cpp/src/arrow/python/numpy_convert.cc index 8202331cddf..241b0323513 100644 --- a/cpp/src/arrow/python/numpy_convert.cc +++ b/cpp/src/arrow/python/numpy_convert.cc @@ -118,18 +118,19 @@ Status GetNumPyType(const DataType& type, int* type_num) { NUMPY_TYPE_CASE(FLOAT, FLOAT32); NUMPY_TYPE_CASE(DOUBLE, FLOAT64); case Type::EXTENSION: { - const auto * ptr = dynamic_cast(&type); + const auto* ptr = dynamic_cast(&type); - if(ptr == nullptr) { + if (ptr == nullptr) { // continue into the default branch - } else if(ptr->subtype()->Equals(float32())) { + } else if (ptr->subtype()->Equals(float32())) { *type_num = NPY_COMPLEX64; break; - } else if(ptr->subtype()->Equals(float64())) { + } else if (ptr->subtype()->Equals(float64())) { *type_num = NPY_COMPLEX128; break; } else { - return Status::NotImplemented("Unsupported complex tensor type: ", ptr->ToString()); + return Status::NotImplemented("Unsupported complex tensor type: ", + ptr->ToString()); break; } } diff --git a/cpp/src/arrow/python/python_to_arrow.cc b/cpp/src/arrow/python/python_to_arrow.cc index bf45142f1f7..3739508b45f 100644 --- a/cpp/src/arrow/python/python_to_arrow.cc +++ b/cpp/src/arrow/python/python_to_arrow.cc @@ -34,8 +34,8 @@ #include "arrow/array/builder_dict.h" #include "arrow/array/builder_nested.h" #include "arrow/array/builder_primitive.h" -#include "arrow/extensions/complex_type.h" #include "arrow/chunked_array.h" +#include "arrow/extensions/complex_type.h" #include "arrow/status.h" #include "arrow/type.h" #include "arrow/type_traits.h" @@ -170,9 +170,8 
@@ class PyValue { std::complex value; if (PyComplex_Check(obj)) { - value = std::complex( - PyComplex_RealAsDouble(obj), - PyComplex_ImagAsDouble(obj)); + value = + std::complex(PyComplex_RealAsDouble(obj), PyComplex_ImagAsDouble(obj)); RETURN_IF_PYERROR(); } else { return internal::InvalidValue(obj, "tried to convert to std::complex"); From 2b6f0d106bd7d7864005a6604c077bf50a1aedb1 Mon Sep 17 00:00:00 2001 From: Simon Perkins Date: Wed, 9 Jun 2021 17:35:22 +0200 Subject: [PATCH 03/61] more linting --- cpp/src/arrow/extensions/complex_type.cc | 2 +- cpp/src/arrow/extensions/complex_type.h | 2 +- cpp/src/arrow/python/python_to_arrow.cc | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/cpp/src/arrow/extensions/complex_type.cc b/cpp/src/arrow/extensions/complex_type.cc index 6b1a1bd6b21..a8a6be9f0cc 100644 --- a/cpp/src/arrow/extensions/complex_type.cc +++ b/cpp/src/arrow/extensions/complex_type.cc @@ -71,4 +71,4 @@ std::shared_ptr complex128() { return std::make_shared(float64()); } -}; // namespace arrow \ No newline at end of file +}; // namespace arrow diff --git a/cpp/src/arrow/extensions/complex_type.h b/cpp/src/arrow/extensions/complex_type.h index 1b813c421c0..de0305392b1 100644 --- a/cpp/src/arrow/extensions/complex_type.h +++ b/cpp/src/arrow/extensions/complex_type.h @@ -39,4 +39,4 @@ std::shared_ptr complex(std::shared_ptr subtype); std::shared_ptr complex64(); std::shared_ptr complex128(); -}; // namespace arrow \ No newline at end of file +}; // namespace arrow diff --git a/cpp/src/arrow/python/python_to_arrow.cc b/cpp/src/arrow/python/python_to_arrow.cc index 3739508b45f..cb40ca827aa 100644 --- a/cpp/src/arrow/python/python_to_arrow.cc +++ b/cpp/src/arrow/python/python_to_arrow.cc @@ -178,7 +178,7 @@ class PyValue { } return value; - }; + } static Result Convert(const Decimal128Type* type, const O&, I obj) { Decimal128 value; From 1db170d96424a4cfe1b7312a1ae814f4d3b50816 Mon Sep 17 00:00:00 2001 From: Simon Perkins Date: Wed, 9 Jun 
2021 17:54:52 +0200 Subject: [PATCH 04/61] Copyright headers --- cpp/src/arrow/extensions/complex_type.cc | 19 +++++++++++++++++++ cpp/src/arrow/extensions/complex_type.h | 19 +++++++++++++++++++ 2 files changed, 38 insertions(+) diff --git a/cpp/src/arrow/extensions/complex_type.cc b/cpp/src/arrow/extensions/complex_type.cc index a8a6be9f0cc..8e8d21f0e09 100644 --- a/cpp/src/arrow/extensions/complex_type.cc +++ b/cpp/src/arrow/extensions/complex_type.cc @@ -1,3 +1,22 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +// Complex Number Extension Type + #include #include diff --git a/cpp/src/arrow/extensions/complex_type.h b/cpp/src/arrow/extensions/complex_type.h index de0305392b1..97ce3f9afbe 100644 --- a/cpp/src/arrow/extensions/complex_type.h +++ b/cpp/src/arrow/extensions/complex_type.h @@ -1,3 +1,22 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +// Complex Number Extension Type + #include "arrow/extension_type.h" namespace arrow { From e4b930929faf4cb6a770e6194a240bc48576ad9a Mon Sep 17 00:00:00 2001 From: Simon Perkins Date: Mon, 21 Jun 2021 14:51:21 +0200 Subject: [PATCH 05/61] [skip ci] WIP --- cpp/src/arrow/extension_type_test.cc | 12 ++-- cpp/src/arrow/extensions/complex_type.cc | 66 ++++----------------- cpp/src/arrow/extensions/complex_type.h | 75 ++++++++++++++++++------ cpp/src/arrow/testing/gtest_util.cc | 2 + cpp/src/arrow/type.cc | 2 + cpp/src/arrow/type_fwd.h | 16 +++++ cpp/src/arrow/visitor_inline.h | 2 + 7 files changed, 97 insertions(+), 78 deletions(-) diff --git a/cpp/src/arrow/extension_type_test.cc b/cpp/src/arrow/extension_type_test.cc index e2094e84cc0..b20c30c6a54 100644 --- a/cpp/src/arrow/extension_type_test.cc +++ b/cpp/src/arrow/extension_type_test.cc @@ -181,21 +181,25 @@ class TestExtensionType : public ::testing::Test { public: void SetUp() { ASSERT_OK(RegisterExtensionType(std::make_shared())); - ASSERT_OK(RegisterExtensionType(std::make_shared(float32()))); + ASSERT_OK(RegisterExtensionType(std::make_shared())); + ASSERT_OK(RegisterExtensionType(std::make_shared())); } void TearDown() { if (GetExtensionType("uuid")) { ASSERT_OK(UnregisterExtensionType("uuid")); } - if (GetExtensionType("complex")) { - ASSERT_OK(UnregisterExtensionType("complex")); + if (GetExtensionType("arrow.complex64")) { + ASSERT_OK(UnregisterExtensionType("arrow.complex64")); + } + if (GetExtensionType("arrow.complex128")) { + ASSERT_OK(UnregisterExtensionType("arrow.complex128")); } 
} }; TEST_F(TestExtensionType, ComplexTypeTest) { - auto registered_type = GetExtensionType("complex"); + auto registered_type = GetExtensionType("arrow.complex64"); ASSERT_NE(registered_type, nullptr); auto type = complex64(); diff --git a/cpp/src/arrow/extensions/complex_type.cc b/cpp/src/arrow/extensions/complex_type.cc index 8e8d21f0e09..30ce7442062 100644 --- a/cpp/src/arrow/extensions/complex_type.cc +++ b/cpp/src/arrow/extensions/complex_type.cc @@ -19,75 +19,29 @@ #include -#include +#include "arrow/extensions/complex_type.h" namespace arrow { -std::shared_ptr ComplexType::MakeType(std::shared_ptr subtype) { - return fixed_size_list(FloatCast(subtype), 2); -} - -std::shared_ptr ComplexType::FloatCast( - std::shared_ptr subtype) { - auto float_type = std::dynamic_pointer_cast(subtype); - - if (!float_type) { - throw std::runtime_error("ComplexType subtype not floating point"); - } - - if (float_type->precision() != FloatingPointType::SINGLE && - float_type->precision() != FloatingPointType::DOUBLE) { - throw std::runtime_error("Complex subtype must be single or double precision"); - } - - return float_type; -} - -std::string ComplexType::name() const { - std::stringstream ss("complex"); - switch (subtype()->precision()) { - case FloatingPointType::SINGLE: - ss << "64"; - break; - case FloatingPointType::DOUBLE: - ss << "128"; - break; - case FloatingPointType::HALF: - default: - throw std::runtime_error("Complex Type must be single or double precision"); - break; - } - - return ss.str(); -} - -std::string ComplexType::extension_name() const { return "complex"; } - -bool ComplexType::ExtensionEquals(const ExtensionType& other) const { +bool ComplexFloatType::ExtensionEquals(const ExtensionType& other) const { const auto& other_ext = static_cast(other); - if (other_ext.extension_name() != this->extension_name()) { - return false; - } - return this->subtype() == static_cast(other).subtype(); + return other_ext.extension_name() == this->extension_name(); } 
-Result> ComplexType::Deserialize( - std::shared_ptr storage_type, const std::string& serialized) const { - auto ltype = std::static_pointer_cast(storage_type); - return std::make_shared(ltype->value_type()); +bool ComplexDoubleType::ExtensionEquals(const ExtensionType& other) const { + const auto& other_ext = static_cast(other); + return other_ext.extension_name() == this->extension_name(); } -std::string ComplexType::Serialize() const { return ""; } -std::shared_ptr complex(std::shared_ptr subtype) { - return std::make_shared(subtype); +std::shared_ptr complex64() { + return std::make_shared(); } -std::shared_ptr complex64() { return std::make_shared(float32()); } - std::shared_ptr complex128() { - return std::make_shared(float64()); + return std::make_shared(); } + }; // namespace arrow diff --git a/cpp/src/arrow/extensions/complex_type.h b/cpp/src/arrow/extensions/complex_type.h index 97ce3f9afbe..8b868fad90a 100644 --- a/cpp/src/arrow/extensions/complex_type.h +++ b/cpp/src/arrow/extensions/complex_type.h @@ -21,41 +21,80 @@ namespace arrow { -class ComplexArray : public ExtensionArray { + +std::shared_ptr complex64(); +std::shared_ptr complex128(); + + +class ComplexFloatArray : public ExtensionArray { public: using ExtensionArray::ExtensionArray; }; -class ComplexType : public ExtensionType { - private: - std::shared_ptr subtype_; +class ComplexFloatType : public ExtensionType { + public: + explicit ComplexFloatType() + : ExtensionType(fixed_size_list(float32(), 2)) {} + + std::string name() const override { + return "complex64"; + } + + std::string extension_name() const override { + return "arrow.complex64"; + } + + bool ExtensionEquals(const ExtensionType& other) const override; + + std::shared_ptr MakeArray(std::shared_ptr data) const override { + return std::make_shared(data); + } + + Result> Deserialize( + std::shared_ptr storage_type, + const std::string& serialized) const override { + return complex64(); + }; + + std::string Serialize() const 
override { + return ""; + } +}; + - static std::shared_ptr MakeType(std::shared_ptr subtype); - static std::shared_ptr FloatCast(std::shared_ptr subtype); +class ComplexDoubleArray : public ExtensionArray { + public: + using ExtensionArray::ExtensionArray; +}; +class ComplexDoubleType : public ExtensionType { public: - explicit ComplexType(std::shared_ptr subtype) - : ExtensionType(MakeType(subtype)), subtype_(FloatCast(subtype)) {} + explicit ComplexDoubleType() + : ExtensionType(fixed_size_list(float64(), 2)) {} + + std::string name() const override { + return "complex128"; + } - std::shared_ptr subtype() const { return subtype_; } - std::string name() const override; - std::string extension_name() const override; + std::string extension_name() const override { + return "arrow.complex128"; + } bool ExtensionEquals(const ExtensionType& other) const override; std::shared_ptr MakeArray(std::shared_ptr data) const override { - return std::make_shared(data); + return std::make_shared(data); } Result> Deserialize( std::shared_ptr storage_type, - const std::string& serialized) const override; + const std::string& serialized) const override { + return complex128(); + }; - std::string Serialize() const override; + std::string Serialize() const override { + return ""; + } }; -std::shared_ptr complex(std::shared_ptr subtype); -std::shared_ptr complex64(); -std::shared_ptr complex128(); - }; // namespace arrow diff --git a/cpp/src/arrow/testing/gtest_util.cc b/cpp/src/arrow/testing/gtest_util.cc index eb0edd56566..85c1129e3a8 100644 --- a/cpp/src/arrow/testing/gtest_util.cc +++ b/cpp/src/arrow/testing/gtest_util.cc @@ -73,6 +73,8 @@ std::vector AllTypeIds() { Type::HALF_FLOAT, Type::FLOAT, Type::DOUBLE, + Type::COMPLEX_FLOAT, + Type::COMPLEX_DOUBLE, Type::DECIMAL128, Type::DECIMAL256, Type::DATE32, diff --git a/cpp/src/arrow/type.cc b/cpp/src/arrow/type.cc index 344585446fc..4489c116e6d 100644 --- a/cpp/src/arrow/type.cc +++ b/cpp/src/arrow/type.cc @@ -130,6 +130,8 @@ 
std::string ToString(Type::type id) { TO_STRING_CASE(HALF_FLOAT) TO_STRING_CASE(FLOAT) TO_STRING_CASE(DOUBLE) + TO_STRING_CASE(COMPLEX_FLOAT) + TO_STRING_CASE(COMPLEX_DOUBLE) TO_STRING_CASE(DECIMAL128) TO_STRING_CASE(DECIMAL256) TO_STRING_CASE(DATE32) diff --git a/cpp/src/arrow/type_fwd.h b/cpp/src/arrow/type_fwd.h index 7e564106bbe..14d0b7480ad 100644 --- a/cpp/src/arrow/type_fwd.h +++ b/cpp/src/arrow/type_fwd.h @@ -207,6 +207,12 @@ _NUMERIC_TYPE_DECL(HalfFloat) _NUMERIC_TYPE_DECL(Float) _NUMERIC_TYPE_DECL(Double) +class ComplexFloatType; +class ComplexFloatArray; + +class ComplexDoubleType; +class ComplexDoubleArray; + #undef _NUMERIC_TYPE_DECL enum class DateUnit : char { DAY = 0, MILLI = 1 }; @@ -394,6 +400,12 @@ struct Type { /// Like LIST, but with 64-bit offsets LARGE_LIST, + // Single-precision 32-bit complex numbers + COMPLEX_FLOAT, + + // Double-precision 64-bit complex numbers + COMPLEX_DOUBLE, + // Leave this at the end MAX_ID }; @@ -430,6 +442,10 @@ std::shared_ptr ARROW_EXPORT float16(); std::shared_ptr ARROW_EXPORT float32(); /// \brief Return a DoubleType instance std::shared_ptr ARROW_EXPORT float64(); +/// \brief Return a ComplexFloatType instance +std::shared_ptr ARROW_EXPORT complex64(); +/// \brief Return a ComplexDoubleType instance +std::shared_ptr ARROW_EXPORT complex128(); /// \brief Return a StringType instance std::shared_ptr ARROW_EXPORT utf8(); /// \brief Return a LargeStringType instance diff --git a/cpp/src/arrow/visitor_inline.h b/cpp/src/arrow/visitor_inline.h index 132c35aeaa1..eba2522044f 100644 --- a/cpp/src/arrow/visitor_inline.h +++ b/cpp/src/arrow/visitor_inline.h @@ -56,6 +56,8 @@ namespace arrow { ARROW_GENERATE_FOR_ALL_NUMERIC_TYPES(ACTION); \ ACTION(String); \ ACTION(Binary); \ + ACTION(ComplexFloat); \ + ACTION(ComplexDouble); \ ACTION(LargeString); \ ACTION(LargeBinary); \ ACTION(FixedSizeBinary); \ From fe1e1456d026ee286412802fa8696fb8a7897ed5 Mon Sep 17 00:00:00 2001 From: Simon Perkins Date: Mon, 21 Jun 2021 15:33:59 
+0200 Subject: [PATCH 06/61] Add Type Traits --- cpp/src/arrow/type_traits.h | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/cpp/src/arrow/type_traits.h b/cpp/src/arrow/type_traits.h index b74aa3b0adb..f6b8135ebc5 100644 --- a/cpp/src/arrow/type_traits.h +++ b/cpp/src/arrow/type_traits.h @@ -362,6 +362,30 @@ struct CTypeTraits using ArrowType = DayTimeIntervalType; }; +template <> +struct TypeTraits { + using ArrayType = ComplexFloatArray; + using BuilderType = FixedSizeListBuilder; + using ScalarType = FixedSizeListScalar; + using OffsetType = Int32Type; + using OffsetArrayType = Int32Array; + using OffsetBuilderType = Int32Builder; + using OffsetScalarType = Int32Scalar; + constexpr static bool is_parameter_free = true; +}; + +template <> +struct TypeTraits { + using ArrayType = ComplexDoubleArray; + using BuilderType = FixedSizeListBuilder; + using ScalarType = FixedSizeListScalar; + using OffsetType = Int32Type; + using OffsetArrayType = Int32Array; + using OffsetBuilderType = Int32Builder; + using OffsetScalarType = Int32Scalar; + constexpr static bool is_parameter_free = true; +}; + template <> struct TypeTraits { using ArrayType = ListArray; From 76c8ef6dffce53f8de174b0a16efdcc79d2866f0 Mon Sep 17 00:00:00 2001 From: Simon Perkins Date: Mon, 21 Jun 2021 15:34:23 +0200 Subject: [PATCH 07/61] [skip ci] #include complex_type.h --- cpp/src/arrow/visitor_inline.h | 1 + 1 file changed, 1 insertion(+) diff --git a/cpp/src/arrow/visitor_inline.h b/cpp/src/arrow/visitor_inline.h index eba2522044f..665217221b1 100644 --- a/cpp/src/arrow/visitor_inline.h +++ b/cpp/src/arrow/visitor_inline.h @@ -23,6 +23,7 @@ #include "arrow/array.h" #include "arrow/extension_type.h" +#include "arrow/extensions/complex_type.h" #include "arrow/scalar.h" #include "arrow/status.h" #include "arrow/type.h" From 5b72101949941d9f2d3b2c8cbae94c347ab7d308 Mon Sep 17 00:00:00 2001 From: Simon Perkins Date: Mon, 21 Jun 2021 15:48:54 +0200 Subject: [PATCH 08/61] Fix 
header include --- cpp/src/arrow/extensions/complex_type.h | 1 + 1 file changed, 1 insertion(+) diff --git a/cpp/src/arrow/extensions/complex_type.h b/cpp/src/arrow/extensions/complex_type.h index 8b868fad90a..22f584b1066 100644 --- a/cpp/src/arrow/extensions/complex_type.h +++ b/cpp/src/arrow/extensions/complex_type.h @@ -16,6 +16,7 @@ // under the License. // Complex Number Extension Type +#pragma once #include "arrow/extension_type.h" From 0d7524eece221570e8c4f9cd32eee89622be9179 Mon Sep 17 00:00:00 2001 From: Simon Perkins Date: Mon, 21 Jun 2021 15:49:23 +0200 Subject: [PATCH 09/61] Remove Complex{Float,Double} Types in Visitor Actions --- cpp/src/arrow/visitor_inline.h | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/cpp/src/arrow/visitor_inline.h b/cpp/src/arrow/visitor_inline.h index 665217221b1..303c4361633 100644 --- a/cpp/src/arrow/visitor_inline.h +++ b/cpp/src/arrow/visitor_inline.h @@ -51,15 +51,17 @@ namespace arrow { ACTION(Float); \ ACTION(Double) + +// ACTION(ComplexFloat); +// ACTION(ComplexDouble); + #define ARROW_GENERATE_FOR_ALL_TYPES(ACTION) \ ACTION(Null); \ ACTION(Boolean); \ ARROW_GENERATE_FOR_ALL_NUMERIC_TYPES(ACTION); \ ACTION(String); \ ACTION(Binary); \ - ACTION(ComplexFloat); \ - ACTION(ComplexDouble); \ - ACTION(LargeString); \ + ACTION(LargeString); \ ACTION(LargeBinary); \ ACTION(FixedSizeBinary); \ ACTION(Duration); \ From ae58a3a8764537ea9912f3036028787ec8c181cf Mon Sep 17 00:00:00 2001 From: Simon Perkins Date: Mon, 21 Jun 2021 16:10:57 +0200 Subject: [PATCH 10/61] cleanup --- cpp/src/arrow/visitor_inline.h | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/cpp/src/arrow/visitor_inline.h b/cpp/src/arrow/visitor_inline.h index 303c4361633..dbd8c09112d 100644 --- a/cpp/src/arrow/visitor_inline.h +++ b/cpp/src/arrow/visitor_inline.h @@ -52,16 +52,13 @@ namespace arrow { ACTION(Double) -// ACTION(ComplexFloat); -// ACTION(ComplexDouble); - #define ARROW_GENERATE_FOR_ALL_TYPES(ACTION) \ 
ACTION(Null); \ ACTION(Boolean); \ ARROW_GENERATE_FOR_ALL_NUMERIC_TYPES(ACTION); \ ACTION(String); \ ACTION(Binary); \ - ACTION(LargeString); \ + ACTION(LargeString); \ ACTION(LargeBinary); \ ACTION(FixedSizeBinary); \ ACTION(Duration); \ From dc2125b8712ddab2e5638a6469636116a8e77c54 Mon Sep 17 00:00:00 2001 From: Simon Perkins Date: Mon, 21 Jun 2021 17:31:13 +0200 Subject: [PATCH 11/61] Remove COMPLEX_{FLOAT,DOUBLE} from Type --- cpp/src/arrow/testing/gtest_util.cc | 2 -- cpp/src/arrow/type.cc | 2 -- cpp/src/arrow/type_fwd.h | 6 ------ cpp/src/arrow/type_traits.h | 24 ------------------------ 4 files changed, 34 deletions(-) diff --git a/cpp/src/arrow/testing/gtest_util.cc b/cpp/src/arrow/testing/gtest_util.cc index 85c1129e3a8..eb0edd56566 100644 --- a/cpp/src/arrow/testing/gtest_util.cc +++ b/cpp/src/arrow/testing/gtest_util.cc @@ -73,8 +73,6 @@ std::vector AllTypeIds() { Type::HALF_FLOAT, Type::FLOAT, Type::DOUBLE, - Type::COMPLEX_FLOAT, - Type::COMPLEX_DOUBLE, Type::DECIMAL128, Type::DECIMAL256, Type::DATE32, diff --git a/cpp/src/arrow/type.cc b/cpp/src/arrow/type.cc index 4489c116e6d..344585446fc 100644 --- a/cpp/src/arrow/type.cc +++ b/cpp/src/arrow/type.cc @@ -130,8 +130,6 @@ std::string ToString(Type::type id) { TO_STRING_CASE(HALF_FLOAT) TO_STRING_CASE(FLOAT) TO_STRING_CASE(DOUBLE) - TO_STRING_CASE(COMPLEX_FLOAT) - TO_STRING_CASE(COMPLEX_DOUBLE) TO_STRING_CASE(DECIMAL128) TO_STRING_CASE(DECIMAL256) TO_STRING_CASE(DATE32) diff --git a/cpp/src/arrow/type_fwd.h b/cpp/src/arrow/type_fwd.h index 14d0b7480ad..3001efef7be 100644 --- a/cpp/src/arrow/type_fwd.h +++ b/cpp/src/arrow/type_fwd.h @@ -400,12 +400,6 @@ struct Type { /// Like LIST, but with 64-bit offsets LARGE_LIST, - // Single-precision 32-bit complex numbers - COMPLEX_FLOAT, - - // Double-precision 64-bit complex numbers - COMPLEX_DOUBLE, - // Leave this at the end MAX_ID }; diff --git a/cpp/src/arrow/type_traits.h b/cpp/src/arrow/type_traits.h index f6b8135ebc5..b74aa3b0adb 100644 --- 
a/cpp/src/arrow/type_traits.h +++ b/cpp/src/arrow/type_traits.h @@ -362,30 +362,6 @@ struct CTypeTraits using ArrowType = DayTimeIntervalType; }; -template <> -struct TypeTraits { - using ArrayType = ComplexFloatArray; - using BuilderType = FixedSizeListBuilder; - using ScalarType = FixedSizeListScalar; - using OffsetType = Int32Type; - using OffsetArrayType = Int32Array; - using OffsetBuilderType = Int32Builder; - using OffsetScalarType = Int32Scalar; - constexpr static bool is_parameter_free = true; -}; - -template <> -struct TypeTraits { - using ArrayType = ComplexDoubleArray; - using BuilderType = FixedSizeListBuilder; - using ScalarType = FixedSizeListScalar; - using OffsetType = Int32Type; - using OffsetArrayType = Int32Array; - using OffsetBuilderType = Int32Builder; - using OffsetScalarType = Int32Scalar; - constexpr static bool is_parameter_free = true; -}; - template <> struct TypeTraits { using ArrayType = ListArray; From fbe7508cdf7479cac2c0ce74683b074aba884b1a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kriszti=C3=A1n=20Sz=C5=B1cs?= Date: Tue, 15 Jun 2021 18:36:56 +0900 Subject: [PATCH 12/61] ARROW-13080: [Release] Generate the API docs in ubuntu 20.10 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Pass ubuntu version as a docker build variable instead of a container runtime environment variable. 
Closes #10532 from kszucs/post-docs-ubuntu-version Authored-by: Krisztián Szűcs Signed-off-by: Sutou Kouhei --- dev/release/post-09-docs.sh | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/dev/release/post-09-docs.sh b/dev/release/post-09-docs.sh index c9f75b48b2c..8751b22887f 100755 --- a/dev/release/post-09-docs.sh +++ b/dev/release/post-09-docs.sh @@ -43,10 +43,9 @@ popd pushd "${ARROW_DIR}" git checkout "${release_tag}" -archery docker run \ +UBUNTU=20.10 archery docker run \ -v "${ARROW_SITE_DIR}/docs:/build/docs" \ -e ARROW_DOCS_VERSION="${version}" \ - -e UBUNTU=20.10 \ ubuntu-docs : ${PUSH:=1} From 5521edf2d1d37ae3542c342842a680d7b53eac2d Mon Sep 17 00:00:00 2001 From: Alessandro Molina Date: Tue, 15 Jun 2021 11:56:52 +0200 Subject: [PATCH 13/61] ARROW-12431: [Python] Mask is inverted when creating FixedSizeBinaryArray Closes #10199 from amol-/ARROW-12431 Authored-by: Alessandro Molina Signed-off-by: Antoine Pitrou --- cpp/src/arrow/python/numpy_to_arrow.cc | 15 +++++++-- python/pyarrow/tests/test_array.py | 45 ++++++++++++++++++++++++++ python/pyarrow/tests/test_pandas.py | 2 +- 3 files changed, 59 insertions(+), 3 deletions(-) diff --git a/cpp/src/arrow/python/numpy_to_arrow.cc b/cpp/src/arrow/python/numpy_to_arrow.cc index c17e70823d5..a382f766333 100644 --- a/cpp/src/arrow/python/numpy_to_arrow.cc +++ b/cpp/src/arrow/python/numpy_to_arrow.cc @@ -594,9 +594,20 @@ Status NumPyConverter::Visit(const FixedSizeBinaryType& type) { if (mask_ != nullptr) { Ndarray1DIndexer mask_values(mask_); - RETURN_NOT_OK(builder.AppendValues(data, length_, mask_values.data())); + RETURN_NOT_OK(builder.Reserve(length_)); + for (int64_t i = 0; i < length_; ++i) { + if (mask_values[i]) { + RETURN_NOT_OK(builder.AppendNull()); + } else { + RETURN_NOT_OK(builder.Append(data)); + } + data += stride_; + } } else { - RETURN_NOT_OK(builder.AppendValues(data, length_)); + for (int64_t i = 0; i < length_; ++i) { + RETURN_NOT_OK(builder.Append(data)); + data += 
stride_; + } } std::shared_ptr result; diff --git a/python/pyarrow/tests/test_array.py b/python/pyarrow/tests/test_array.py index 086ed4cb160..30500bc3c5b 100644 --- a/python/pyarrow/tests/test_array.py +++ b/python/pyarrow/tests/test_array.py @@ -2714,6 +2714,51 @@ def test_array_masked(): assert arr.type == pa.int64() +def test_binary_array_masked(): + # ARROW-12431 + masked_basic = pa.array([b'\x05'], type=pa.binary(1), + mask=np.array([False])) + assert [b'\x05'] == masked_basic.to_pylist() + + # Fixed Length Binary + masked = pa.array(np.array([b'\x05']), type=pa.binary(1), + mask=np.array([False])) + assert [b'\x05'] == masked.to_pylist() + + masked_nulls = pa.array(np.array([b'\x05']), type=pa.binary(1), + mask=np.array([True])) + assert [None] == masked_nulls.to_pylist() + + # Variable Length Binary + masked = pa.array(np.array([b'\x05']), type=pa.binary(), + mask=np.array([False])) + assert [b'\x05'] == masked.to_pylist() + + masked_nulls = pa.array(np.array([b'\x05']), type=pa.binary(), + mask=np.array([True])) + assert [None] == masked_nulls.to_pylist() + + # Fixed Length Binary, copy + npa = np.array([b'aaa', b'bbb', b'ccc']*10) + arrow_array = pa.array(npa, type=pa.binary(3), + mask=np.array([False, False, False]*10)) + npa[npa == b"bbb"] = b"XXX" + assert ([b'aaa', b'bbb', b'ccc']*10) == arrow_array.to_pylist() + + +def test_binary_array_strided(): + # Masked + nparray = np.array([b"ab", b"cd", b"ef"]) + arrow_array = pa.array(nparray[::2], pa.binary(2), + mask=np.array([False, False])) + assert [b"ab", b"ef"] == arrow_array.to_pylist() + + # Unmasked + nparray = np.array([b"ab", b"cd", b"ef"]) + arrow_array = pa.array(nparray[::2], pa.binary(2)) + assert [b"ab", b"ef"] == arrow_array.to_pylist() + + def test_array_invalid_mask_raises(): # ARROW-10742 cases = [ diff --git a/python/pyarrow/tests/test_pandas.py b/python/pyarrow/tests/test_pandas.py index 77c18b839c6..7f904433fa2 100644 --- a/python/pyarrow/tests/test_pandas.py +++ 
b/python/pyarrow/tests/test_pandas.py @@ -1705,7 +1705,7 @@ def test_numpy_string_array_to_fixed_size_binary(self): expected = pa.array(list(arr), type=pa.binary(3)) assert converted.equals(expected) - mask = np.array([True, False, True]) + mask = np.array([False, True, False]) converted = pa.array(arr, type=pa.binary(3), mask=mask) expected = pa.array([b'foo', None, b'baz'], type=pa.binary(3)) assert converted.equals(expected) From 40585c1ff28aaf55d4ec74cc1b58c35d39ae5d81 Mon Sep 17 00:00:00 2001 From: Yibo Cai Date: Tue, 15 Jun 2021 16:44:37 +0200 Subject: [PATCH 14/61] ARROW-13003: [C++] Fix key map unaligned access Closes #10489 from cyb70289/13003-unaligned-access Authored-by: Yibo Cai Signed-off-by: Antoine Pitrou --- cpp/src/arrow/compute/exec/key_compare.cc | 21 +++---- cpp/src/arrow/compute/exec/key_map.cc | 71 +++++++++++++---------- cpp/src/arrow/compute/exec/util.cc | 16 ++--- 3 files changed, 59 insertions(+), 49 deletions(-) diff --git a/cpp/src/arrow/compute/exec/key_compare.cc b/cpp/src/arrow/compute/exec/key_compare.cc index f8d74859b01..7a5b0be9990 100644 --- a/cpp/src/arrow/compute/exec/key_compare.cc +++ b/cpp/src/arrow/compute/exec/key_compare.cc @@ -21,6 +21,7 @@ #include #include "arrow/compute/exec/util.h" +#include "arrow/util/ubsan.h" namespace arrow { namespace compute { @@ -170,19 +171,19 @@ void KeyCompare::CompareFixedLengthImp(uint32_t num_rows_already_processed, // if (num_64bit_words == 0) { for (; istripe < num_loops_less_one; ++istripe) { - uint64_t key_left = key_left_ptr[istripe]; - uint64_t key_right = key_right_ptr[istripe]; + uint64_t key_left = util::SafeLoad(&key_left_ptr[istripe]); + uint64_t key_right = util::SafeLoad(&key_right_ptr[istripe]); result_or |= (key_left ^ key_right); } } else if (num_64bit_words == 2) { - uint64_t key_left = key_left_ptr[istripe]; - uint64_t key_right = key_right_ptr[istripe]; + uint64_t key_left = util::SafeLoad(&key_left_ptr[istripe]); + uint64_t key_right = 
util::SafeLoad(&key_right_ptr[istripe]); result_or |= (key_left ^ key_right); ++istripe; } - uint64_t key_left = key_left_ptr[istripe]; - uint64_t key_right = key_right_ptr[istripe]; + uint64_t key_left = util::SafeLoad(&key_left_ptr[istripe]); + uint64_t key_right = util::SafeLoad(&key_right_ptr[istripe]); result_or |= (tail_mask & (key_left ^ key_right)); int result = (result_or == 0 ? 0xff : 0); @@ -246,16 +247,16 @@ void KeyCompare::CompareVaryingLengthImp( int32_t istripe; // length can be zero for (istripe = 0; istripe < (static_cast(length) + 7) / 8 - 1; ++istripe) { - uint64_t key_left = key_left_ptr[istripe]; - uint64_t key_right = key_right_ptr[istripe]; + uint64_t key_left = util::SafeLoad(&key_left_ptr[istripe]); + uint64_t key_right = util::SafeLoad(&key_right_ptr[istripe]); result_or |= (key_left ^ key_right); } uint32_t length_remaining = length - static_cast(istripe) * 8; uint64_t tail_mask = tail_masks[length_remaining]; - uint64_t key_left = key_left_ptr[istripe]; - uint64_t key_right = key_right_ptr[istripe]; + uint64_t key_left = util::SafeLoad(&key_left_ptr[istripe]); + uint64_t key_right = util::SafeLoad(&key_right_ptr[istripe]); result_or |= (tail_mask & (key_left ^ key_right)); int result = (result_or == 0 ? 
0xff : 0); diff --git a/cpp/src/arrow/compute/exec/key_map.cc b/cpp/src/arrow/compute/exec/key_map.cc index c48487793e0..ac47c04403c 100644 --- a/cpp/src/arrow/compute/exec/key_map.cc +++ b/cpp/src/arrow/compute/exec/key_map.cc @@ -24,6 +24,7 @@ #include "arrow/util/bit_util.h" #include "arrow/util/bitmap_ops.h" +#include "arrow/util/ubsan.h" namespace arrow { @@ -153,7 +154,7 @@ void SwissTable::lookup_1(const uint16_t* selection, const int num_keys, for (int i = 0; i < num_keys; ++i) { int id; if (use_selection) { - id = selection[i]; + id = util::SafeLoad(&selection[i]); } else { id = i; } @@ -168,7 +169,7 @@ void SwissTable::lookup_1(const uint16_t* selection, const int num_keys, uint32_t num_block_bytes = num_groupid_bits + 8; const uint8_t* blockbase = reinterpret_cast(blocks_) + static_cast(iblock) * num_block_bytes; - uint64_t block = *reinterpret_cast(blockbase); + uint64_t block = util::SafeLoadAs(blockbase); // Call helper functions to obtain the output triplet: // - match (of a stamp) found flag @@ -182,8 +183,8 @@ void SwissTable::lookup_1(const uint16_t* selection, const int num_keys, uint64_t islot = next_slot_to_visit(iblock, islot_in_block, match_found); out_match_bitvector[id / 8] |= match_found << (id & 7); - out_groupids[id] = static_cast(groupid); - out_slot_ids[id] = static_cast(islot); + util::SafeStore(&out_groupids[id], static_cast(groupid)); + util::SafeStore(&out_slot_ids[id], static_cast(islot)); } } @@ -239,7 +240,7 @@ Status SwissTable::lookup_2(const uint32_t* hashes, uint32_t* inout_num_selected uint16_t* ids[3]{inout_selection, ids_for_comparison_buf.mutable_data(), ids_inserted_buf.mutable_data()}; auto push_id = [&num_ids, &ids](int category, int id) { - ids[category][num_ids[category]++] = static_cast(id); + util::SafeStore(&ids[category][num_ids[category]++], static_cast(id)); }; uint64_t num_groupid_bits = num_groupid_bits_from_log_blocks(log_blocks_); @@ -256,9 +257,9 @@ Status SwissTable::lookup_2(const uint32_t* hashes, 
uint32_t* inout_num_selected num_inserted_ + num_ids[category_inserted] < num_groups_limit; ++num_processed) { // row id in original batch - int id = inout_selection[num_processed]; + int id = util::SafeLoad(&inout_selection[num_processed]); - uint64_t slot_id = wrap_global_slot_id(inout_next_slot_ids[id]); + uint64_t slot_id = wrap_global_slot_id(util::SafeLoad(&inout_next_slot_ids[id])); uint64_t block_id = slot_id >> 3; uint32_t hash = hashes[id]; uint8_t* blockbase = blocks_ + num_block_bytes * block_id; @@ -278,11 +279,13 @@ Status SwissTable::lookup_2(const uint32_t* hashes, uint32_t* inout_num_selected // In that case we can insert group id value using aligned 64-bit word access. ARROW_DCHECK(num_groupid_bits == 8 || num_groupid_bits == 16 || num_groupid_bits == 32 || num_groupid_bits == 64); - reinterpret_cast(blockbase + 8)[groupid_bit_offset >> 6] |= - (static_cast(group_id) << (groupid_bit_offset & 63)); + uint64_t* ptr = + &reinterpret_cast(blockbase + 8)[groupid_bit_offset >> 6]; + util::SafeStore(ptr, util::SafeLoad(ptr) | (static_cast(group_id) + << (groupid_bit_offset & 63))); hashes_[slot_id] = hash; - out_group_ids[id] = group_id; + util::SafeStore(&out_group_ids[id], group_id); push_id(category_inserted, id); } else { // We search for a slot with a matching stamp within a single block. 
@@ -298,8 +301,8 @@ Status SwissTable::lookup_2(const uint32_t* hashes, uint32_t* inout_num_selected ARROW_DCHECK(new_groupid < num_inserted_ + num_ids[category_inserted]); new_slot = static_cast(next_slot_to_visit(block_id, new_slot, new_match_found)); - inout_next_slot_ids[id] = new_slot; - out_group_ids[id] = new_groupid; + util::SafeStore(&inout_next_slot_ids[id], new_slot); + util::SafeStore(&out_group_ids[id], new_groupid); push_id(new_match_found, id); } } @@ -410,7 +413,8 @@ Status SwissTable::map(const int num_keys, const uint32_t* hashes, // for (uint32_t i = 0; i < num_ids; ++i) { // First slot in the new starting block - slot_ids[ids[i]] = (hashes[ids[i]] >> (bits_hash_ - log_blocks_)) * 8; + const int16_t id = util::SafeLoad(&ids[i]); + util::SafeStore(&slot_ids[id], (hashes[id] >> (bits_hash_ - log_blocks_)) * 8); } } } while (num_ids > 0); @@ -457,9 +461,8 @@ Status SwissTable::grow_double() { static_cast(CountLeadingZeros(block & kHighBitOfEachByte) >> 3); int full_slots_new[2]; full_slots_new[0] = full_slots_new[1] = 0; - *reinterpret_cast(double_block_base_new) = kHighBitOfEachByte; - *reinterpret_cast(double_block_base_new + block_size_after) = - kHighBitOfEachByte; + util::SafeStore(double_block_base_new, kHighBitOfEachByte); + util::SafeStore(double_block_base_new + block_size_after, kHighBitOfEachByte); for (int j = 0; j < full_slots; ++j) { uint64_t slot_id = i * 8 + j; @@ -474,18 +477,20 @@ Status SwissTable::grow_double() { uint8_t stamp_new = hash >> ((bits_hash_ - log_blocks_after - bits_stamp_)) & stamp_mask; uint64_t group_id_bit_offs = j * num_group_id_bits_before; - uint64_t group_id = (*reinterpret_cast(block_base + 8 + - (group_id_bit_offs >> 3)) >> - (group_id_bit_offs & 7)) & - group_id_mask_before; + uint64_t group_id = + (util::SafeLoadAs(block_base + 8 + (group_id_bit_offs >> 3)) >> + (group_id_bit_offs & 7)) & + group_id_mask_before; uint64_t slot_id_new = i * 16 + ihalf * 8 + full_slots_new[ihalf]; hashes_new[slot_id_new] = 
hash; uint8_t* block_base_new = double_block_base_new + ihalf * block_size_after; block_base_new[7 - full_slots_new[ihalf]] = stamp_new; int group_id_bit_offs_new = full_slots_new[ihalf] * num_group_id_bits_after; - *reinterpret_cast(block_base_new + 8 + (group_id_bit_offs_new >> 3)) |= - (group_id << (group_id_bit_offs_new & 7)); + uint64_t* ptr = + reinterpret_cast(block_base_new + 8 + (group_id_bit_offs_new >> 3)); + util::SafeStore(ptr, + util::SafeLoad(ptr) | (group_id << (group_id_bit_offs_new & 7))); full_slots_new[ihalf]++; } } @@ -495,7 +500,7 @@ Status SwissTable::grow_double() { for (int i = 0; i < (1 << log_blocks_); ++i) { // How many full slots in this block uint8_t* block_base = blocks_ + i * block_size_before; - uint64_t block = *reinterpret_cast(block_base); + uint64_t block = util::SafeLoadAs(block_base); int full_slots = static_cast(CountLeadingZeros(block & kHighBitOfEachByte) >> 3); for (int j = 0; j < full_slots; ++j) { @@ -508,21 +513,21 @@ Status SwissTable::grow_double() { } uint64_t group_id_bit_offs = j * num_group_id_bits_before; - uint64_t group_id = (*reinterpret_cast(block_base + 8 + - (group_id_bit_offs >> 3)) >> - (group_id_bit_offs & 7)) & - group_id_mask_before; + uint64_t group_id = + (util::SafeLoadAs(block_base + 8 + (group_id_bit_offs >> 3)) >> + (group_id_bit_offs & 7)) & + group_id_mask_before; uint8_t stamp_new = hash >> ((bits_hash_ - log_blocks_after - bits_stamp_)) & stamp_mask; uint8_t* block_base_new = blocks_new + block_id_new * block_size_after; - uint64_t block_new = *reinterpret_cast(block_base_new); + uint64_t block_new = util::SafeLoadAs(block_base_new); int full_slots_new = static_cast(CountLeadingZeros(block_new & kHighBitOfEachByte) >> 3); while (full_slots_new == 8) { block_id_new = (block_id_new + 1) & ((1 << log_blocks_after) - 1); block_base_new = blocks_new + block_id_new * block_size_after; - block_new = *reinterpret_cast(block_base_new); + block_new = util::SafeLoadAs(block_base_new); full_slots_new = 
static_cast(CountLeadingZeros(block_new & kHighBitOfEachByte) >> 3); } @@ -530,8 +535,10 @@ Status SwissTable::grow_double() { hashes_new[block_id_new * 8 + full_slots_new] = hash; block_base_new[7 - full_slots_new] = stamp_new; int group_id_bit_offs_new = full_slots_new * num_group_id_bits_after; - *reinterpret_cast(block_base_new + 8 + (group_id_bit_offs_new >> 3)) |= - (group_id << (group_id_bit_offs_new & 7)); + uint64_t* ptr = + reinterpret_cast(block_base_new + 8 + (group_id_bit_offs_new >> 3)); + util::SafeStore(ptr, + util::SafeLoad(ptr) | (group_id << (group_id_bit_offs_new & 7))); } } @@ -567,7 +574,7 @@ Status SwissTable::init(int64_t hardware_flags, MemoryPool* pool, // Initialize all status bytes to represent an empty slot. for (uint64_t i = 0; i < (static_cast(1) << log_blocks_); ++i) { - *reinterpret_cast(blocks_ + i * block_bytes) = kHighBitOfEachByte; + util::SafeStore(blocks_ + i * block_bytes, kHighBitOfEachByte); } uint64_t num_slots = 1ULL << (log_blocks_ + 3); diff --git a/cpp/src/arrow/compute/exec/util.cc b/cpp/src/arrow/compute/exec/util.cc index 5f1c0776c56..88303348645 100644 --- a/cpp/src/arrow/compute/exec/util.cc +++ b/cpp/src/arrow/compute/exec/util.cc @@ -19,6 +19,7 @@ #include "arrow/util/bit_util.h" #include "arrow/util/bitmap_ops.h" +#include "arrow/util/ubsan.h" namespace arrow { @@ -66,7 +67,7 @@ void BitUtil::bits_to_indexes_internal(int64_t hardware_flags, const int num_bit #endif *num_indexes = 0; for (int i = 0; i < num_bits / unroll; ++i) { - uint64_t word = reinterpret_cast(bits)[i]; + uint64_t word = util::SafeLoad(&reinterpret_cast(bits)[i]); if (bit_to_search == 0) { word = ~word; } @@ -81,7 +82,8 @@ void BitUtil::bits_to_indexes_internal(int64_t hardware_flags, const int num_bit #endif // Optionally process the last partial word with masking out bits outside range if (tail) { - uint64_t word = reinterpret_cast(bits)[num_bits / unroll]; + uint64_t word = + util::SafeLoad(&reinterpret_cast(bits)[num_bits / unroll]); if 
(bit_to_search == 0) { word = ~word; } @@ -144,7 +146,7 @@ void BitUtil::bits_to_bytes_internal(const int num_bits, const uint8_t* bits, unpacked |= (bits_next & 1); unpacked &= 0x0101010101010101ULL; unpacked *= 255; - reinterpret_cast(bytes)[i] = unpacked; + util::SafeStore(&reinterpret_cast(bytes)[i], unpacked); } } @@ -153,7 +155,7 @@ void BitUtil::bytes_to_bits_internal(const int num_bits, const uint8_t* bytes, constexpr int unroll = 8; // Process 8 bits at a time for (int i = 0; i < (num_bits + unroll - 1) / unroll; ++i) { - uint64_t bytes_next = reinterpret_cast(bytes)[i]; + uint64_t bytes_next = util::SafeLoad(&reinterpret_cast(bytes)[i]); bytes_next &= 0x0101010101010101ULL; bytes_next |= (bytes_next >> 7); // Pairs of adjacent output bits in individual bytes bytes_next |= (bytes_next >> 14); // 4 adjacent output bits in individual bytes @@ -184,7 +186,7 @@ void BitUtil::bits_to_bytes(int64_t hardware_flags, const int num_bits, unpacked |= (bits_next & 1); unpacked &= 0x0101010101010101ULL; unpacked *= 255; - reinterpret_cast(bytes)[i] = unpacked; + util::SafeStore(&reinterpret_cast(bytes)[i], unpacked); } } @@ -201,7 +203,7 @@ void BitUtil::bytes_to_bits(int64_t hardware_flags, const int num_bits, // Process 8 bits at a time constexpr int unroll = 8; for (int i = num_processed / unroll; i < (num_bits + unroll - 1) / unroll; ++i) { - uint64_t bytes_next = reinterpret_cast(bytes)[i]; + uint64_t bytes_next = util::SafeLoad(&reinterpret_cast(bytes)[i]); bytes_next &= 0x0101010101010101ULL; bytes_next |= (bytes_next >> 7); // Pairs of adjacent output bits in individual bytes bytes_next |= (bytes_next >> 14); // 4 adjacent output bits in individual bytes @@ -220,7 +222,7 @@ bool BitUtil::are_all_bytes_zero(int64_t hardware_flags, const uint8_t* bytes, uint64_t result_or = 0; uint32_t i; for (i = 0; i < num_bytes / 8; ++i) { - uint64_t x = reinterpret_cast(bytes)[i]; + uint64_t x = util::SafeLoad(&reinterpret_cast(bytes)[i]); result_or |= x; } if (num_bytes % 8 
> 0) { From 522696b164efee500d7046a65657ba61e34162a8 Mon Sep 17 00:00:00 2001 From: David Li Date: Tue, 15 Jun 2021 11:22:25 -0400 Subject: [PATCH 15/61] ARROW-12597: [C++] Enable per-row-group parallelism in async Parquet reader This adds an OptionalParallelForAsync which lets us have per-row-group parallelism without nested parallelism in the async Parquet reader. This also uses TransferAlways, taking care of ARROW-12916. `enable_parallel_column_conversion` is kept as it still affects the threaded scanner. Closes #10482 from lidavidm/arrow-12597 Authored-by: David Li Signed-off-by: David Li --- cpp/src/arrow/dataset/file_parquet.cc | 6 +- cpp/src/arrow/dataset/file_parquet.h | 3 +- cpp/src/arrow/dataset/test_util.h | 14 +++- cpp/src/arrow/ipc/reader.cc | 9 +-- cpp/src/arrow/util/parallel.h | 37 +++++++++ cpp/src/arrow/util/vector.h | 13 +++ cpp/src/parquet/arrow/reader.cc | 110 ++++++++++++++------------ 7 files changed, 128 insertions(+), 64 deletions(-) diff --git a/cpp/src/arrow/dataset/file_parquet.cc b/cpp/src/arrow/dataset/file_parquet.cc index 8c325d21da1..0ebbd0a5333 100644 --- a/cpp/src/arrow/dataset/file_parquet.cc +++ b/cpp/src/arrow/dataset/file_parquet.cc @@ -362,11 +362,7 @@ Future> ParquetFileFormat::GetReader parquet_scan_options->arrow_reader_properties->cache_options()); arrow_properties.set_io_context( parquet_scan_options->arrow_reader_properties->io_context()); - // TODO: ARROW-12597 will let us enable parallel conversion - if (!options->use_threads) { - arrow_properties.set_use_threads( - parquet_scan_options->enable_parallel_column_conversion); - } + arrow_properties.set_use_threads(options->use_threads); std::unique_ptr arrow_reader; RETURN_NOT_OK(parquet::arrow::FileReader::Make(options->pool, std::move(reader), std::move(arrow_properties), diff --git a/cpp/src/arrow/dataset/file_parquet.h b/cpp/src/arrow/dataset/file_parquet.h index 8286e2776cb..347f4032046 100644 --- a/cpp/src/arrow/dataset/file_parquet.h +++ 
b/cpp/src/arrow/dataset/file_parquet.h @@ -222,7 +222,8 @@ class ARROW_DS_EXPORT ParquetFragmentScanOptions : public FragmentScanOptions { /// EXPERIMENTAL: Parallelize conversion across columns. This option is ignored if a /// scan is already parallelized across input files to avoid thread contention. This /// option will be removed after support is added for simultaneous parallelization - /// across files and columns. + /// across files and columns. Only affects the threaded reader; the async reader + /// will parallelize across columns if use_threads is enabled. bool enable_parallel_column_conversion = false; }; diff --git a/cpp/src/arrow/dataset/test_util.h b/cpp/src/arrow/dataset/test_util.h index 1e4222eec8c..39223eba35b 100644 --- a/cpp/src/arrow/dataset/test_util.h +++ b/cpp/src/arrow/dataset/test_util.h @@ -310,6 +310,7 @@ class DatasetFixtureMixinWithParam : public DatasetFixtureMixin, struct TestFormatParams { bool use_async; + bool use_threads; int num_batches; int items_per_batch; @@ -318,7 +319,8 @@ struct TestFormatParams { std::string ToString() const { // GTest requires this to be alphanumeric std::stringstream ss; - ss << (use_async ? "Async" : "Sync") << num_batches << "b" << items_per_batch << "r"; + ss << (use_async ? "Async" : "Sync") << (use_threads ? 
"Threaded" : "Serial") + << num_batches << "b" << items_per_batch << "r"; return ss.str(); } @@ -328,8 +330,12 @@ struct TestFormatParams { } static std::vector Values() { - std::vector values{{/*async=*/false, 16, 1024}, - {/*async=*/true, 16, 1024}}; + std::vector values; + for (const bool async : std::vector{true, false}) { + for (const bool use_threads : std::vector{true, false}) { + values.push_back(TestFormatParams{async, use_threads, 16, 1024}); + } + } return values; } }; @@ -511,6 +517,7 @@ class FileFormatScanMixin : public FileFormatFixtureMixin, auto dataset = std::make_shared(schema, FragmentVector{fragment}); ScannerBuilder builder(dataset, opts_); ARROW_EXPECT_OK(builder.UseAsync(GetParam().use_async)); + ARROW_EXPECT_OK(builder.UseThreads(GetParam().use_threads)); EXPECT_OK_AND_ASSIGN(auto scanner, builder.Finish()); EXPECT_OK_AND_ASSIGN(auto batch_it, scanner->ScanBatches()); return MakeMapIterator([](TaggedRecordBatch tagged) { return tagged.record_batch; }, @@ -519,6 +526,7 @@ class FileFormatScanMixin : public FileFormatFixtureMixin, // Scan the fragment directly, without using the scanner. 
RecordBatchIterator PhysicalBatches(std::shared_ptr fragment) { + opts_->use_threads = GetParam().use_threads; if (GetParam().use_async) { EXPECT_OK_AND_ASSIGN(auto batch_gen, fragment->ScanBatchesAsync(opts_)); EXPECT_OK_AND_ASSIGN(auto batch_it, MakeGeneratorIterator(std::move(batch_gen))); diff --git a/cpp/src/arrow/ipc/reader.cc b/cpp/src/arrow/ipc/reader.cc index 7c26bce913d..a3c345cc440 100644 --- a/cpp/src/arrow/ipc/reader.cc +++ b/cpp/src/arrow/ipc/reader.cc @@ -54,6 +54,7 @@ #include "arrow/util/string.h" #include "arrow/util/thread_pool.h" #include "arrow/util/ubsan.h" +#include "arrow/util/vector.h" #include "arrow/visitor_inline.h" #include "generated/File_generated.h" // IWYU pragma: export @@ -1368,12 +1369,10 @@ Future IpcFileRecordBatchGenerator::operator( auto read_messages = All(std::move(messages)); if (executor_) read_messages = executor_->Transfer(read_messages); read_dictionaries_ = read_messages.Then( - [=](const std::vector>> maybe_messages) + [=](const std::vector>>& maybe_messages) -> Status { - std::vector> messages(state->num_dictionaries()); - for (size_t i = 0; i < messages.size(); i++) { - ARROW_ASSIGN_OR_RAISE(messages[i], maybe_messages[i]); - } + ARROW_ASSIGN_OR_RAISE(auto messages, + arrow::internal::UnwrapOrRaise(maybe_messages)); return ReadDictionaries(state.get(), std::move(messages)); }); } diff --git a/cpp/src/arrow/util/parallel.h b/cpp/src/arrow/util/parallel.h index e56a71b91af..80f60fbdb36 100644 --- a/cpp/src/arrow/util/parallel.h +++ b/cpp/src/arrow/util/parallel.h @@ -21,7 +21,9 @@ #include #include "arrow/status.h" +#include "arrow/util/functional.h" #include "arrow/util/thread_pool.h" +#include "arrow/util/vector.h" namespace arrow { namespace internal { @@ -44,6 +46,21 @@ Status ParallelFor(int num_tasks, FUNCTION&& func, return st; } +template ::ValueType> +Future> ParallelForAsync( + std::vector inputs, FUNCTION&& func, + Executor* executor = internal::GetCpuThreadPool()) { + std::vector> futures(inputs.size()); 
+ for (size_t i = 0; i < inputs.size(); ++i) { + ARROW_ASSIGN_OR_RAISE(futures[i], executor->Submit(func, i, std::move(inputs[i]))); + } + return All(std::move(futures)) + .Then([](const std::vector>& results) -> Result> { + return UnwrapOrRaise(results); + }); +} + // A parallelizer that takes a `Status(int)` function and calls it with // arguments between 0 and `num_tasks - 1`, in sequence or in parallel, // depending on the input boolean. @@ -61,5 +78,25 @@ Status OptionalParallelFor(bool use_threads, int num_tasks, FUNCTION&& func, } } +// A parallelizer that takes a `Result(int index, T item)` function and +// calls it with each item from the input array, in sequence or in parallel, +// depending on the input boolean. + +template ::ValueType> +Future> OptionalParallelForAsync( + bool use_threads, std::vector inputs, FUNCTION&& func, + Executor* executor = internal::GetCpuThreadPool()) { + if (use_threads) { + return ParallelForAsync(std::move(inputs), std::forward(func), executor); + } else { + std::vector result(inputs.size()); + for (size_t i = 0; i < inputs.size(); ++i) { + ARROW_ASSIGN_OR_RAISE(result[i], func(i, inputs[i])); + } + return result; + } +} + } // namespace internal } // namespace arrow diff --git a/cpp/src/arrow/util/vector.h b/cpp/src/arrow/util/vector.h index b9f2e2a45aa..3ef0074aa9d 100644 --- a/cpp/src/arrow/util/vector.h +++ b/cpp/src/arrow/util/vector.h @@ -133,5 +133,18 @@ Result> UnwrapOrRaise(std::vector>&& results) { return std::move(out); } +template +Result> UnwrapOrRaise(const std::vector>& results) { + std::vector out; + out.reserve(results.size()); + for (const auto& result : results) { + if (!result.ok()) { + return result.status(); + } + out.push_back(result.ValueUnsafe()); + } + return std::move(out); +} + } // namespace internal } // namespace arrow diff --git a/cpp/src/parquet/arrow/reader.cc b/cpp/src/parquet/arrow/reader.cc index 14eb7495805..4f5f79c964a 100644 --- a/cpp/src/parquet/arrow/reader.cc +++ 
b/cpp/src/parquet/arrow/reader.cc @@ -293,10 +293,12 @@ class FileReaderImpl : public FileReader { const std::vector& indices, std::shared_ptr* table) override; - // Helper method used by ReadRowGroups/Generator - read the given row groups/columns, - // skipping bounds checks and pre-buffering. - Status DecodeRowGroups(const std::vector& row_groups, - const std::vector& indices, std::shared_ptr
* table); + // Helper method used by ReadRowGroups - read the given row groups/columns, skipping + // bounds checks and pre-buffering. Takes a shared_ptr to self to keep the reader + // alive in async contexts. + Future> DecodeRowGroups( + std::shared_ptr self, const std::vector& row_groups, + const std::vector& column_indices, ::arrow::internal::Executor* cpu_executor); Status ReadRowGroups(const std::vector& row_groups, std::shared_ptr
* table) override { @@ -1007,10 +1009,9 @@ class RowGroupGenerator { return SubmitRead(cpu_executor_, reader, row_group, column_indices); } auto ready = reader->parquet_reader()->WhenBuffered({row_group}, column_indices); - // TODO(ARROW-12916): always transfer here - if (cpu_executor_) ready = cpu_executor_->Transfer(ready); - return ready.Then([=]() -> ::arrow::Result { - return ReadOneRowGroup(reader, row_group, column_indices); + if (cpu_executor_) ready = cpu_executor_->TransferAlways(ready); + return ready.Then([=]() -> ::arrow::Future { + return ReadOneRowGroup(cpu_executor_, reader, row_group, column_indices); }); } @@ -1024,31 +1025,25 @@ class RowGroupGenerator { ::arrow::internal::Executor* cpu_executor, std::shared_ptr self, const int row_group, const std::vector& column_indices) { if (!cpu_executor) { - return Future::MakeFinished( - ReadOneRowGroup(self, row_group, column_indices)); + return ReadOneRowGroup(cpu_executor, self, row_group, column_indices); } // If we have an executor, then force transfer (even if I/O was complete) - return ::arrow::DeferNotOk( - cpu_executor->Submit(ReadOneRowGroup, self, row_group, column_indices)); + return ::arrow::DeferNotOk(cpu_executor->Submit(ReadOneRowGroup, cpu_executor, self, + row_group, column_indices)); } - static ::arrow::Result ReadOneRowGroup( - std::shared_ptr self, const int row_group, - const std::vector& column_indices) { - std::shared_ptr<::arrow::Table> table; + static ::arrow::Future ReadOneRowGroup( + ::arrow::internal::Executor* cpu_executor, std::shared_ptr self, + const int row_group, const std::vector& column_indices) { // Skips bound checks/pre-buffering, since we've done that already - RETURN_NOT_OK(self->DecodeRowGroups({row_group}, column_indices, &table)); - auto table_reader = std::make_shared<::arrow::TableBatchReader>(*table); - ::arrow::RecordBatchVector batches; - while (true) { - std::shared_ptr<::arrow::RecordBatch> batch; - RETURN_NOT_OK(table_reader->ReadNext(&batch)); - if 
(!batch) { - break; - } - batches.push_back(batch); - } - return ::arrow::MakeVectorGenerator(std::move(batches)); + return self->DecodeRowGroups(self, {row_group}, column_indices, cpu_executor) + .Then([](const std::shared_ptr
& table) + -> ::arrow::Result { + ::arrow::TableBatchReader table_reader(*table); + ::arrow::RecordBatchVector batches; + RETURN_NOT_OK(table_reader.ReadAll(&batches)); + return ::arrow::MakeVectorGenerator(std::move(batches)); + }); } std::shared_ptr arrow_reader_; @@ -1104,34 +1099,49 @@ Status FileReaderImpl::ReadRowGroups(const std::vector& row_groups, END_PARQUET_CATCH_EXCEPTIONS } - return DecodeRowGroups(row_groups, column_indices, out); + auto fut = DecodeRowGroups(/*self=*/nullptr, row_groups, column_indices, + /*cpu_executor=*/nullptr); + ARROW_ASSIGN_OR_RAISE(*out, fut.MoveResult()); + return Status::OK(); } -// Also used by RowGroupGenerator - skip bounds check/pre-buffer to avoid doing that twice -Status FileReaderImpl::DecodeRowGroups(const std::vector& row_groups, - const std::vector& column_indices, - std::shared_ptr
* out) { +Future> FileReaderImpl::DecodeRowGroups( + std::shared_ptr self, const std::vector& row_groups, + const std::vector& column_indices, ::arrow::internal::Executor* cpu_executor) { + // `self` is used solely to keep `this` alive in an async context - but we use this + // in a sync context too so use `this` over `self` std::vector> readers; std::shared_ptr<::arrow::Schema> result_schema; RETURN_NOT_OK(GetFieldReaders(column_indices, row_groups, &readers, &result_schema)); - - ::arrow::ChunkedArrayVector columns(readers.size()); - RETURN_NOT_OK(::arrow::internal::OptionalParallelFor( - reader_properties_.use_threads(), static_cast(readers.size()), [&](int i) { - return ReadColumn(static_cast(i), row_groups, readers[i].get(), &columns[i]); - })); - - int64_t num_rows = 0; - if (!columns.empty()) { - num_rows = columns[0]->length(); - } else { - for (int i : row_groups) { - num_rows += parquet_reader()->metadata()->RowGroup(i)->num_rows(); + // OptionalParallelForAsync requires an executor + if (!cpu_executor) cpu_executor = ::arrow::internal::GetCpuThreadPool(); + + auto read_column = [row_groups, self, this](size_t i, + std::shared_ptr reader) + -> ::arrow::Result> { + std::shared_ptr<::arrow::ChunkedArray> column; + RETURN_NOT_OK(ReadColumn(static_cast(i), row_groups, reader.get(), &column)); + return column; + }; + auto make_table = [result_schema, row_groups, self, + this](const ::arrow::ChunkedArrayVector& columns) + -> ::arrow::Result> { + int64_t num_rows = 0; + if (!columns.empty()) { + num_rows = columns[0]->length(); + } else { + for (int i : row_groups) { + num_rows += parquet_reader()->metadata()->RowGroup(i)->num_rows(); + } } - } - - *out = Table::Make(std::move(result_schema), std::move(columns), num_rows); - return (*out)->Validate(); + auto table = Table::Make(std::move(result_schema), columns, num_rows); + RETURN_NOT_OK(table->Validate()); + return table; + }; + return 
::arrow::internal::OptionalParallelForAsync(reader_properties_.use_threads(), + std::move(readers), read_column, + cpu_executor) + .Then(std::move(make_table)); } std::shared_ptr FileReaderImpl::RowGroup(int row_group_index) { From f216d62ddd89f804250424e6358e58e69b7b5770 Mon Sep 17 00:00:00 2001 From: Karik Isichei Date: Tue, 15 Jun 2021 18:25:11 +0200 Subject: [PATCH 16/61] ARROW-12096: [C++] Allows users to define arrow timestamp unit for Parquet INT96 timestamp Have added functionality in C++ code to allow users to define the arrow timestamp unit when reading parquet INT96 types. This avoids the overflow bug when trying to convert INT96 values which have dates which are out of bounds for Arrow NS Timestamp. See added test: `TestArrowReadWrite.DownsampleDeprecatedInt96` which demonstrates use and expected results. Main discussion of changes in [JIRA Issue ARROW-12096](https://issues.apache.org/jira/browse/ARROW-12096). Closes #10461 from isichei/ARROW-12096 Lead-authored-by: Karik Isichei Co-authored-by: Antoine Pitrou Signed-off-by: Antoine Pitrou --- .../parquet/arrow/arrow_reader_writer_test.cc | 56 +++++++++++++++++++ cpp/src/parquet/arrow/reader_internal.cc | 43 +++++++++----- cpp/src/parquet/arrow/schema.cc | 4 +- cpp/src/parquet/arrow/schema_internal.cc | 19 +++---- cpp/src/parquet/arrow/schema_internal.h | 7 ++- cpp/src/parquet/properties.h | 14 ++++- cpp/src/parquet/types.h | 43 ++++++++++++-- 7 files changed, 150 insertions(+), 36 deletions(-) diff --git a/cpp/src/parquet/arrow/arrow_reader_writer_test.cc b/cpp/src/parquet/arrow/arrow_reader_writer_test.cc index 677458ce37e..6c82b8dee78 100644 --- a/cpp/src/parquet/arrow/arrow_reader_writer_test.cc +++ b/cpp/src/parquet/arrow/arrow_reader_writer_test.cc @@ -558,6 +558,35 @@ void ReadSingleColumnFileStatistics(std::unique_ptr file_reader, ASSERT_OK(StatisticsAsScalars(*statistics, min, max)); } +void DownsampleInt96RoundTrip(std::shared_ptr arrow_vector_in, + std::shared_ptr arrow_vector_out, + 
::arrow::TimeUnit::type unit) { + // Create single input table of NS to be written to parquet with INT96 + auto input_schema = + ::arrow::schema({::arrow::field("f", ::arrow::timestamp(TimeUnit::NANO))}); + auto input = Table::Make(input_schema, {arrow_vector_in}); + + // Create an expected schema for each resulting table (one for each "downsampled" ts) + auto ex_schema = ::arrow::schema({::arrow::field("f", ::arrow::timestamp(unit))}); + auto ex_result = Table::Make(ex_schema, {arrow_vector_out}); + + std::shared_ptr
result; + + ArrowReaderProperties arrow_reader_prop; + arrow_reader_prop.set_coerce_int96_timestamp_unit(unit); + + ASSERT_NO_FATAL_FAILURE(DoRoundtrip( + input, input->num_rows(), &result, default_writer_properties(), + ArrowWriterProperties::Builder().enable_deprecated_int96_timestamps()->build(), + arrow_reader_prop)); + + ASSERT_NO_FATAL_FAILURE(::arrow::AssertSchemaEqual(*ex_result->schema(), + *result->schema(), + /*check_metadata=*/false)); + + ASSERT_NO_FATAL_FAILURE(::arrow::AssertTablesEqual(*ex_result, *result)); +} + // Non-template base class for TestParquetIO, to avoid code duplication class ParquetIOTestBase : public ::testing::Test { public: @@ -1671,6 +1700,33 @@ TEST(TestArrowReadWrite, UseDeprecatedInt96) { ASSERT_NO_FATAL_FAILURE(::arrow::AssertTablesEqual(*ex_result, *result)); } +TEST(TestArrowReadWrite, DownsampleDeprecatedInt96) { + using ::arrow::ArrayFromJSON; + using ::arrow::field; + using ::arrow::schema; + + // Timestamp values at 2000-01-01 00:00:00, + // then with increment unit of 1ns, 1us, 1ms and 1s. 
+ auto a_nano = + ArrayFromJSON(timestamp(TimeUnit::NANO), + "[946684800000000000, 946684800000000001, 946684800000001000, " + "946684800001000000, 946684801000000000]"); + auto a_micro = ArrayFromJSON(timestamp(TimeUnit::MICRO), + "[946684800000000, 946684800000000, 946684800000001, " + "946684800001000, 946684801000000]"); + auto a_milli = ArrayFromJSON( + timestamp(TimeUnit::MILLI), + "[946684800000, 946684800000, 946684800000, 946684800001, 946684801000]"); + auto a_second = + ArrayFromJSON(timestamp(TimeUnit::SECOND), + "[946684800, 946684800, 946684800, 946684800, 946684801]"); + + ASSERT_NO_FATAL_FAILURE(DownsampleInt96RoundTrip(a_nano, a_nano, TimeUnit::NANO)); + ASSERT_NO_FATAL_FAILURE(DownsampleInt96RoundTrip(a_nano, a_micro, TimeUnit::MICRO)); + ASSERT_NO_FATAL_FAILURE(DownsampleInt96RoundTrip(a_nano, a_milli, TimeUnit::MILLI)); + ASSERT_NO_FATAL_FAILURE(DownsampleInt96RoundTrip(a_nano, a_second, TimeUnit::SECOND)); +} + TEST(TestArrowReadWrite, CoerceTimestamps) { using ::arrow::ArrayFromVector; using ::arrow::field; diff --git a/cpp/src/parquet/arrow/reader_internal.cc b/cpp/src/parquet/arrow/reader_internal.cc index 1410a5f89e2..0ffa3e89970 100644 --- a/cpp/src/parquet/arrow/reader_internal.cc +++ b/cpp/src/parquet/arrow/reader_internal.cc @@ -353,7 +353,8 @@ Status TransferBool(RecordReader* reader, MemoryPool* pool, Datum* out) { } Status TransferInt96(RecordReader* reader, MemoryPool* pool, - const std::shared_ptr& type, Datum* out) { + const std::shared_ptr& type, Datum* out, + const ::arrow::TimeUnit::type int96_arrow_time_unit) { int64_t length = reader->values_written(); auto values = reinterpret_cast(reader->values()); ARROW_ASSIGN_OR_RAISE(auto data, @@ -365,7 +366,20 @@ Status TransferInt96(RecordReader* reader, MemoryPool* pool, // isn't representable as a 64-bit Unix timestamp. 
*data_ptr++ = 0; } else { - *data_ptr++ = Int96GetNanoSeconds(values[i]); + switch (int96_arrow_time_unit) { + case ::arrow::TimeUnit::NANO: + *data_ptr++ = Int96GetNanoSeconds(values[i]); + break; + case ::arrow::TimeUnit::MICRO: + *data_ptr++ = Int96GetMicroSeconds(values[i]); + break; + case ::arrow::TimeUnit::MILLI: + *data_ptr++ = Int96GetMilliSeconds(values[i]); + break; + case ::arrow::TimeUnit::SECOND: + *data_ptr++ = Int96GetSeconds(values[i]); + break; + } } } *out = std::make_shared(type, length, std::move(data), @@ -742,20 +756,19 @@ Status TransferColumnData(RecordReader* reader, std::shared_ptr value_ case ::arrow::Type::TIMESTAMP: { const ::arrow::TimestampType& timestamp_type = checked_cast<::arrow::TimestampType&>(*value_type); - switch (timestamp_type.unit()) { - case ::arrow::TimeUnit::MILLI: - case ::arrow::TimeUnit::MICRO: { - result = TransferZeroCopy(reader, value_type); - } break; - case ::arrow::TimeUnit::NANO: { - if (descr->physical_type() == ::parquet::Type::INT96) { - RETURN_NOT_OK(TransferInt96(reader, pool, value_type, &result)); - } else { + if (descr->physical_type() == ::parquet::Type::INT96) { + RETURN_NOT_OK( + TransferInt96(reader, pool, value_type, &result, timestamp_type.unit())); + } else { + switch (timestamp_type.unit()) { + case ::arrow::TimeUnit::MILLI: + case ::arrow::TimeUnit::MICRO: + case ::arrow::TimeUnit::NANO: result = TransferZeroCopy(reader, value_type); - } - } break; - default: - return Status::NotImplemented("TimeUnit not supported"); + break; + default: + return Status::NotImplemented("TimeUnit not supported"); + } } } break; default: diff --git a/cpp/src/parquet/arrow/schema.cc b/cpp/src/parquet/arrow/schema.cc index 7610ce17605..eb7fd628dfc 100644 --- a/cpp/src/parquet/arrow/schema.cc +++ b/cpp/src/parquet/arrow/schema.cc @@ -454,7 +454,9 @@ bool IsDictionaryReadSupported(const ArrowType& type) { ::arrow::Result> GetTypeForNode( int column_index, const schema::PrimitiveNode& primitive_node, 
SchemaTreeContext* ctx) { - ASSIGN_OR_RAISE(std::shared_ptr storage_type, GetArrowType(primitive_node)); + ASSIGN_OR_RAISE( + std::shared_ptr storage_type, + GetArrowType(primitive_node, ctx->properties.coerce_int96_timestamp_unit())); if (ctx->properties.read_dictionary(column_index) && IsDictionaryReadSupported(*storage_type)) { return ::arrow::dictionary(::arrow::int32(), storage_type); diff --git a/cpp/src/parquet/arrow/schema_internal.cc b/cpp/src/parquet/arrow/schema_internal.cc index fbdfa09a040..064bf4f55cc 100644 --- a/cpp/src/parquet/arrow/schema_internal.cc +++ b/cpp/src/parquet/arrow/schema_internal.cc @@ -179,9 +179,9 @@ Result> FromInt64(const LogicalType& logical_type) { } } -Result> GetArrowType(Type::type physical_type, - const LogicalType& logical_type, - int type_length) { +Result> GetArrowType( + Type::type physical_type, const LogicalType& logical_type, int type_length, + const ::arrow::TimeUnit::type int96_arrow_time_unit) { if (logical_type.is_invalid() || logical_type.is_null()) { return ::arrow::null(); } @@ -194,7 +194,7 @@ Result> GetArrowType(Type::type physical_type, case ParquetType::INT64: return FromInt64(logical_type); case ParquetType::INT96: - return ::arrow::timestamp(::arrow::TimeUnit::NANO); + return ::arrow::timestamp(int96_arrow_time_unit); case ParquetType::FLOAT: return ::arrow::float32(); case ParquetType::DOUBLE: @@ -211,14 +211,11 @@ Result> GetArrowType(Type::type physical_type, } } -Result> GetArrowType(const schema::PrimitiveNode& primitive) { +Result> GetArrowType( + const schema::PrimitiveNode& primitive, + const ::arrow::TimeUnit::type int96_arrow_time_unit) { return GetArrowType(primitive.physical_type(), *primitive.logical_type(), - primitive.type_length()); -} - -Result> GetArrowType(const ColumnDescriptor& descriptor) { - return GetArrowType(descriptor.physical_type(), *descriptor.logical_type(), - descriptor.type_length()); + primitive.type_length(), int96_arrow_time_unit); } } // namespace arrow diff --git 
a/cpp/src/parquet/arrow/schema_internal.h b/cpp/src/parquet/arrow/schema_internal.h index ec0d9571304..fb837c3ee6c 100644 --- a/cpp/src/parquet/arrow/schema_internal.h +++ b/cpp/src/parquet/arrow/schema_internal.h @@ -40,9 +40,12 @@ Result> GetArrowType(Type::type physical_type int type_length); Result> GetArrowType( - const schema::PrimitiveNode& primitive); + Type::type physical_type, const LogicalType& logical_type, int type_length, + ::arrow::TimeUnit::type int96_arrow_time_unit = ::arrow::TimeUnit::NANO); + Result> GetArrowType( - const ColumnDescriptor& descriptor); + const schema::PrimitiveNode& primitive, + ::arrow::TimeUnit::type int96_arrow_time_unit = ::arrow::TimeUnit::NANO); } // namespace arrow } // namespace parquet diff --git a/cpp/src/parquet/properties.h b/cpp/src/parquet/properties.h index 5018fff9531..d217b8efa52 100644 --- a/cpp/src/parquet/properties.h +++ b/cpp/src/parquet/properties.h @@ -575,7 +575,8 @@ class PARQUET_EXPORT ArrowReaderProperties { read_dict_indices_(), batch_size_(kArrowDefaultBatchSize), pre_buffer_(false), - cache_options_(::arrow::io::CacheOptions::Defaults()) {} + cache_options_(::arrow::io::CacheOptions::Defaults()), + coerce_int96_timestamp_unit_(::arrow::TimeUnit::NANO) {} void set_use_threads(bool use_threads) { use_threads_ = use_threads; } @@ -620,6 +621,16 @@ class PARQUET_EXPORT ArrowReaderProperties { const ::arrow::io::IOContext& io_context() const { return io_context_; } + /// Set timestamp unit to use for deprecated INT96-encoded timestamps + /// (default is NANO). 
+ void set_coerce_int96_timestamp_unit(::arrow::TimeUnit::type unit) { + coerce_int96_timestamp_unit_ = unit; + } + + ::arrow::TimeUnit::type coerce_int96_timestamp_unit() const { + return coerce_int96_timestamp_unit_; + } + private: bool use_threads_; std::unordered_set read_dict_indices_; @@ -627,6 +638,7 @@ class PARQUET_EXPORT ArrowReaderProperties { bool pre_buffer_; ::arrow::io::IOContext io_context_; ::arrow::io::CacheOptions cache_options_; + ::arrow::TimeUnit::type coerce_int96_timestamp_unit_; }; /// EXPERIMENTAL: Constructs the default ArrowReaderProperties diff --git a/cpp/src/parquet/types.h b/cpp/src/parquet/types.h index 4529dbe6133..6bd67f1ee5f 100644 --- a/cpp/src/parquet/types.h +++ b/cpp/src/parquet/types.h @@ -591,15 +591,46 @@ static inline void Int96SetNanoSeconds(parquet::Int96& i96, int64_t nanoseconds) std::memcpy(&i96.value, &nanoseconds, sizeof(nanoseconds)); } -static inline int64_t Int96GetNanoSeconds(const parquet::Int96& i96) { +struct DecodedInt96 { + uint64_t days_since_epoch; + uint64_t nanoseconds; +}; + +static inline DecodedInt96 DecodeInt96Timestamp(const parquet::Int96& i96) { // We do the computations in the unsigned domain to avoid unsigned behaviour // on overflow. 
- uint64_t days_since_epoch = - i96.value[2] - static_cast(kJulianToUnixEpochDays); - uint64_t nanoseconds = 0; + DecodedInt96 result; + result.days_since_epoch = i96.value[2] - static_cast(kJulianToUnixEpochDays); + result.nanoseconds = 0; + + memcpy(&result.nanoseconds, &i96.value, sizeof(uint64_t)); + return result; +} + +static inline int64_t Int96GetNanoSeconds(const parquet::Int96& i96) { + const auto decoded = DecodeInt96Timestamp(i96); + return static_cast(decoded.days_since_epoch * kNanosecondsPerDay + + decoded.nanoseconds); +} + +static inline int64_t Int96GetMicroSeconds(const parquet::Int96& i96) { + const auto decoded = DecodeInt96Timestamp(i96); + uint64_t microseconds = decoded.nanoseconds / static_cast(1000); + return static_cast(decoded.days_since_epoch * kMicrosecondsPerDay + + microseconds); +} + +static inline int64_t Int96GetMilliSeconds(const parquet::Int96& i96) { + const auto decoded = DecodeInt96Timestamp(i96); + uint64_t milliseconds = decoded.nanoseconds / static_cast(1000000); + return static_cast(decoded.days_since_epoch * kMillisecondsPerDay + + milliseconds); +} - memcpy(&nanoseconds, &i96.value, sizeof(uint64_t)); - return static_cast(days_since_epoch * kNanosecondsPerDay + nanoseconds); +static inline int64_t Int96GetSeconds(const parquet::Int96& i96) { + const auto decoded = DecodeInt96Timestamp(i96); + uint64_t seconds = decoded.nanoseconds / static_cast(1000000000); + return static_cast(decoded.days_since_epoch * kSecondsPerDay + seconds); } static inline std::string Int96ToString(const Int96& a) { From 71bcfae48d6b951d2bf6e2f5d57cff4bd75ce33f Mon Sep 17 00:00:00 2001 From: Weston Pace Date: Tue, 15 Jun 2021 18:33:01 +0200 Subject: [PATCH 17/61] ARROW-13027: [C++] Fix ASAN stack traces in CI Before change: ``` Direct leak of 65536 byte(s) in 1 object(s) allocated from: #0 0x522f09 in #1 0x7f28ae5826f4 in #2 0x7f28ae57fa5d in #3 0x7f28ae58cb0f in #4 0x7f28ae58bda0 in ... 
``` After change: ``` Direct leak of 65536 byte(s) in 1 object(s) allocated from: #0 0x522f09 in posix_memalign (/build/cpp/debug/arrow-dataset-file-csv-test+0x522f09) #1 0x7f28ae5826f4 in arrow::(anonymous namespace)::SystemAllocator::AllocateAligned(long, unsigned char**) /arrow/cpp/src/arrow/memory_pool.cc:213:24 #2 0x7f28ae57fa5d in arrow::BaseMemoryPoolImpl::Allocate(long, unsigned char**) /arrow/cpp/src/arrow/memory_pool.cc:405:5 #3 0x7f28ae58cb0f in arrow::PoolBuffer::Reserve(long) /arrow/cpp/src/arrow/memory_pool.cc:717:9 #4 0x7f28ae58bda0 in arrow::PoolBuffer::Resize(long, bool) /arrow/cpp/src/arrow/memory_pool.cc:741:7 ... ``` Closes #10498 from westonpace/feature/ARROW-13027--c-fix-asan-stack-traces-in-ci Authored-by: Weston Pace Signed-off-by: Antoine Pitrou --- ci/docker/ubuntu-20.04-cpp.dockerfile | 1 + 1 file changed, 1 insertion(+) diff --git a/ci/docker/ubuntu-20.04-cpp.dockerfile b/ci/docker/ubuntu-20.04-cpp.dockerfile index c75c013799a..c2a468d9e35 100644 --- a/ci/docker/ubuntu-20.04-cpp.dockerfile +++ b/ci/docker/ubuntu-20.04-cpp.dockerfile @@ -127,6 +127,7 @@ ENV ARROW_BUILD_TESTS=ON \ ARROW_WITH_SNAPPY=ON \ ARROW_WITH_ZLIB=ON \ ARROW_WITH_ZSTD=ON \ + ASAN_SYMBOLIZER_PATH=/usr/lib/llvm-${llvm}/bin/llvm-symbolizer \ AWSSDK_SOURCE=BUNDLED \ GTest_SOURCE=BUNDLED \ gRPC_SOURCE=BUNDLED \ From 322d2c81acd12330a8309f612406195183d13cdf Mon Sep 17 00:00:00 2001 From: Giordon Stark Date: Tue, 15 Jun 2021 21:24:49 +0200 Subject: [PATCH 18/61] ARROW-13085: [Python] Document compatible toolchains for python bindings This is a documentation-only PR that adds an additional note for users compiling C++ extensions using the shared libraries bundled with the python package. Adding this note on the toolchain will help resolve (confusing?) segfaults that occur. Before (toolchain) change: - segfault when running the minimal cpp example After (toolchain) change: - no segfault when running the minimal cpp example Please see the linked JIRA for more details.
Closes #10535 from kratsg/docs/pythonBindingExtensions Lead-authored-by: Giordon Stark Co-authored-by: Antoine Pitrou Signed-off-by: Antoine Pitrou --- docs/source/python/extending.rst | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/docs/source/python/extending.rst b/docs/source/python/extending.rst index 738a7369f70..5f6ddb154e6 100644 --- a/docs/source/python/extending.rst +++ b/docs/source/python/extending.rst @@ -466,3 +466,14 @@ installed. This function will attempt to create symlinks like pip install pyarrow python -c "import pyarrow; pyarrow.create_library_symlinks()" + +Toolchain Compatibility (Linux) +""""""""""""""""""""""""""""""" + +The Python wheels for Linux are built using the +`PyPA manylinux images `_ which use +the CentOS `devtoolset-8` or `devtoolset-9` depending on which manylinux +wheel version (2010 or 2014) is being used. In addition to the other notes +above, if you are compiling C++ using these shared libraries, you will need +to make sure you use a compatible toolchain as well or you might see a +segfault during runtime. 
From b33a1a9c803fa4b44b684c280782629a46cdb7db Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kriszti=C3=A1n=20Sz=C5=B1cs?= Date: Wed, 16 Jun 2021 08:39:22 +0900 Subject: [PATCH 19/61] ARROW-13082: [CI] Forward R argument to ubuntu-docs build MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit `R=4.1 archery docker run ubuntu-docs` Closes #10534 from kszucs/forward-r-arg-to-docs-build Authored-by: Krisztián Szűcs Signed-off-by: Sutou Kouhei --- docker-compose.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/docker-compose.yml b/docker-compose.yml index 1133bfa3b29..fa0f0a28ad1 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -1294,6 +1294,7 @@ services: cache_from: - ${REPO}:${ARCH}-ubuntu-${UBUNTU}-docs args: + r: ${R} jdk: ${JDK} node: ${NODE} base: ${REPO}:${ARCH}-ubuntu-${UBUNTU}-python-3 From e5d450e6b4fe06022ea82e631eec18338c87d7b5 Mon Sep 17 00:00:00 2001 From: Diana Clarke Date: Wed, 16 Jun 2021 01:32:21 +0000 Subject: [PATCH 20/61] ARROW-13073: [Developer] archery benchmark list: unexpected keyword 'benchmark_filter' ``` $ archery benchmark list Traceback (most recent call last): File "/Users/diana/envs/arrow/bin/archery", line 33, in sys.exit(load_entry_point('archery', 'console_scripts', 'archery')()) File "/Users/diana/envs/arrow/lib/python3.9/site-packages/click/core.py", line 1137, in __call__ return self.main(*args, **kwargs) File "/Users/diana/envs/arrow/lib/python3.9/site-packages/click/core.py", line 1062, in main rv = self.invoke(ctx) File "/Users/diana/envs/arrow/lib/python3.9/site-packages/click/core.py", line 1668, in invoke return _process_result(sub_ctx.command.invoke(sub_ctx)) File "/Users/diana/envs/arrow/lib/python3.9/site-packages/click/core.py", line 1668, in invoke return _process_result(sub_ctx.command.invoke(sub_ctx)) File "/Users/diana/envs/arrow/lib/python3.9/site-packages/click/core.py", line 1404, in invoke return ctx.invoke(self.callback, **ctx.params) File 
"/Users/diana/envs/arrow/lib/python3.9/site-packages/click/core.py", line 763, in invoke return __callback(*args, **kwargs) File "/Users/diana/envs/arrow/lib/python3.9/site-packages/click/decorators.py", line 26, in new_func return f(get_current_context(), *args, **kwargs) File "/Users/diana/workspace/arrow/dev/archery/archery/cli.py", line 430, in benchmark_list conf = CppBenchmarkRunner.default_configuration( File "/Users/diana/workspace/arrow/dev/archery/archery/benchmark/runner.py", line 118, in default_configuration return CppConfiguration( TypeError: __init__() got an unexpected keyword argument 'benchmark_filter' ``` Closes #10528 from dianaclarke/ARROW-13073 Authored-by: Diana Clarke Signed-off-by: Yibo Cai --- dev/archery/archery/cli.py | 1 - 1 file changed, 1 deletion(-) diff --git a/dev/archery/archery/cli.py b/dev/archery/archery/cli.py index c35b0864900..9442b2917e0 100644 --- a/dev/archery/archery/cli.py +++ b/dev/archery/archery/cli.py @@ -416,7 +416,6 @@ def benchmark_filter_options(cmd): @click.argument("rev_or_path", metavar="[]", default="WORKSPACE", required=False) @benchmark_common_options -@benchmark_filter_options @click.pass_context def benchmark_list(ctx, rev_or_path, src, preserve, output, cmake_extras, java_home, java_options, build_extras, benchmark_extras, From 4fc96b54902d3737857dc567a160c6336b6bbf3e Mon Sep 17 00:00:00 2001 From: Sutou Kouhei Date: Wed, 16 Jun 2021 14:03:23 +0900 Subject: [PATCH 21/61] ARROW-11782: [GLib][Ruby][Dataset] Remove bindings for internal classes Closes #10533 from kou/glib-dataset-factory Authored-by: Sutou Kouhei Signed-off-by: Sutou Kouhei --- .../arrow-dataset-glib/arrow-dataset-glib.h | 2 + .../arrow-dataset-glib/arrow-dataset-glib.hpp | 2 + c_glib/arrow-dataset-glib/dataset-factory.cpp | 468 ++++++++++++++++ c_glib/arrow-dataset-glib/dataset-factory.h | 98 ++++ c_glib/arrow-dataset-glib/dataset-factory.hpp | 27 + c_glib/arrow-dataset-glib/dataset.cpp | 365 ++++++++++++ 
c_glib/arrow-dataset-glib/dataset.h | 65 +++ c_glib/arrow-dataset-glib/dataset.hpp | 48 ++ c_glib/arrow-dataset-glib/meson.build | 6 + c_glib/arrow-dataset-glib/scanner.cpp | 527 +++++------------- c_glib/arrow-dataset-glib/scanner.h | 77 +-- c_glib/arrow-dataset-glib/scanner.hpp | 20 +- c_glib/arrow-glib/basic-array.cpp | 6 +- .../arrow-dataset-glib-docs.xml | 20 +- .../test-file-system-dataset-factory.rb | 55 ++ .../test/dataset/test-file-system-dataset.rb | 23 +- .../test/dataset/test-in-memory-scan-task.rb | 59 -- c_glib/test/dataset/test-scan-options.rb | 47 -- c_glib/test/dataset/test-scanner.rb | 48 ++ c_glib/test/helper/buildable.rb | 19 +- .../test/helper/writable.rb | 27 +- c_glib/test/run-test.rb | 5 +- cpp/src/arrow/dataset/discovery.h | 15 +- .../{scan-options.rb => dataset.rb} | 20 +- .../lib/arrow-dataset/in-memory-scan-task.rb | 35 -- .../lib/arrow-dataset/loader.rb | 3 +- ruby/red-arrow-dataset/test/helper.rb | 2 + ...options.rb => test-file-system-dataset.rb} | 28 +- 28 files changed, 1462 insertions(+), 655 deletions(-) create mode 100644 c_glib/arrow-dataset-glib/dataset-factory.cpp create mode 100644 c_glib/arrow-dataset-glib/dataset-factory.h create mode 100644 c_glib/arrow-dataset-glib/dataset-factory.hpp create mode 100644 c_glib/arrow-dataset-glib/dataset.cpp create mode 100644 c_glib/arrow-dataset-glib/dataset.h create mode 100644 c_glib/arrow-dataset-glib/dataset.hpp create mode 100644 c_glib/test/dataset/test-file-system-dataset-factory.rb rename ruby/red-arrow-dataset/test/test-in-memory-scan-task.rb => c_glib/test/dataset/test-file-system-dataset.rb (64%) delete mode 100644 c_glib/test/dataset/test-in-memory-scan-task.rb delete mode 100644 c_glib/test/dataset/test-scan-options.rb create mode 100644 c_glib/test/dataset/test-scanner.rb rename ruby/red-arrow-dataset/lib/arrow-dataset/in-memory-fragment.rb => c_glib/test/helper/writable.rb (63%) rename ruby/red-arrow-dataset/lib/arrow-dataset/{scan-options.rb => dataset.rb} (69%) delete 
mode 100644 ruby/red-arrow-dataset/lib/arrow-dataset/in-memory-scan-task.rb rename ruby/red-arrow-dataset/test/{test-scan-options.rb => test-file-system-dataset.rb} (58%) diff --git a/c_glib/arrow-dataset-glib/arrow-dataset-glib.h b/c_glib/arrow-dataset-glib/arrow-dataset-glib.h index ff160452845..03e56516112 100644 --- a/c_glib/arrow-dataset-glib/arrow-dataset-glib.h +++ b/c_glib/arrow-dataset-glib/arrow-dataset-glib.h @@ -21,6 +21,8 @@ #include +#include +#include #include #include #include diff --git a/c_glib/arrow-dataset-glib/arrow-dataset-glib.hpp b/c_glib/arrow-dataset-glib/arrow-dataset-glib.hpp index c221825bc2a..65341b9b77e 100644 --- a/c_glib/arrow-dataset-glib/arrow-dataset-glib.hpp +++ b/c_glib/arrow-dataset-glib/arrow-dataset-glib.hpp @@ -21,6 +21,8 @@ #include +#include +#include #include #include #include diff --git a/c_glib/arrow-dataset-glib/dataset-factory.cpp b/c_glib/arrow-dataset-glib/dataset-factory.cpp new file mode 100644 index 00000000000..146db69adfc --- /dev/null +++ b/c_glib/arrow-dataset-glib/dataset-factory.cpp @@ -0,0 +1,468 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +#include +#include + +#include +#include +#include + +G_BEGIN_DECLS + +/** + * SECTION: dataset-factory + * @section_id: dataset-factory + * @title: Dataset factory related classes + * @include: arrow-dataset-glib/arrow-dataset-glib.h + * + * #GADatasetDatasetFactory is a base class for dataset factories. + * + * #GADatasetFileSystemDatasetFactory is a class for + * #GADatasetFileSystemDataset factory. + * + * Since: 5.0.0 + */ + +typedef struct GADatasetDatasetFactoryPrivate_ { + std::shared_ptr factory; +} GADatasetDatasetFactoryPrivate; + +enum { + PROP_DATASET_FACTORY = 1, +}; + +G_DEFINE_ABSTRACT_TYPE_WITH_PRIVATE(GADatasetDatasetFactory, + gadataset_dataset_factory, + G_TYPE_OBJECT) + +#define GADATASET_DATASET_FACTORY_GET_PRIVATE(obj) \ + static_cast( \ + gadataset_dataset_factory_get_instance_private( \ + GADATASET_DATASET_FACTORY(obj))) + +static void +gadataset_dataset_factory_finalize(GObject *object) +{ + auto priv = GADATASET_DATASET_FACTORY_GET_PRIVATE(object); + priv->factory.~shared_ptr(); + G_OBJECT_CLASS(gadataset_dataset_factory_parent_class)->finalize(object); +} + +static void +gadataset_dataset_factory_set_property(GObject *object, + guint prop_id, + const GValue *value, + GParamSpec *pspec) +{ + auto priv = GADATASET_DATASET_FACTORY_GET_PRIVATE(object); + + switch (prop_id) { + case PROP_DATASET_FACTORY: + { + auto arrow_factory_pointer = + static_cast *>( + g_value_get_pointer(value)); + if (arrow_factory_pointer) { + priv->factory = *arrow_factory_pointer; + } + } + break; + default: + G_OBJECT_WARN_INVALID_PROPERTY_ID(object, prop_id, pspec); + break; + } +} + +static void +gadataset_dataset_factory_init(GADatasetDatasetFactory *object) +{ + auto priv = GADATASET_DATASET_FACTORY_GET_PRIVATE(object); + new(&priv->factory) std::shared_ptr; +} + +static void +gadataset_dataset_factory_class_init(GADatasetDatasetFactoryClass *klass) +{ + auto gobject_class = G_OBJECT_CLASS(klass); + gobject_class->finalize = 
gadataset_dataset_factory_finalize; + gobject_class->set_property = gadataset_dataset_factory_set_property; + + GParamSpec *spec; + spec = g_param_spec_pointer("dataset-factory", + "Dataset factory", + "The raw " + "std::shared *", + static_cast(G_PARAM_WRITABLE | + G_PARAM_CONSTRUCT_ONLY)); + g_object_class_install_property(gobject_class, PROP_DATASET_FACTORY, spec); +} + +/** + * gadataset_dataset_factory_finish: + * @factory: A #GADatasetDatasetFactory. + * @error: (nullable): Return location for a #GError or %NULL. + * + * Returns: (transfer full) (nullable): + * A newly created #GADatasetDataset on success, %NULL on error. + * + * Since: 5.0.0 + */ +GADatasetDataset * +gadataset_dataset_factory_finish(GADatasetDatasetFactory *factory, + GError **error) +{ + auto arrow_factory = gadataset_dataset_factory_get_raw(factory); + auto arrow_dataset_result = arrow_factory->Finish(); + if (garrow::check(error, arrow_dataset_result, "[dataset-factory][finish]")) { + auto arrow_dataset = *arrow_dataset_result; + return gadataset_dataset_new_raw(&arrow_dataset); + } else { + return NULL; + } +} + + +typedef struct GADatasetFileSystemDatasetFactoryPrivate_ { + GADatasetFileFormat *format; + GArrowFileSystem *file_system; + GList *files; + arrow::dataset::FileSystemFactoryOptions options; +} GADatasetFileSystemDatasetFactoryPrivate; + +enum { + PROP_FORMAT = 1, + PROP_FILE_SYSTEM, +}; + +G_DEFINE_TYPE_WITH_PRIVATE(GADatasetFileSystemDatasetFactory, + gadataset_file_system_dataset_factory, + GADATASET_TYPE_DATASET_FACTORY) + +#define GADATASET_FILE_SYSTEM_DATASET_FACTORY_GET_PRIVATE(obj) \ + static_cast( \ + gadataset_file_system_dataset_factory_get_instance_private( \ + GADATASET_FILE_SYSTEM_DATASET_FACTORY(obj))) + +static void +gadataset_file_system_dataset_factory_dispose(GObject *object) +{ + auto priv = GADATASET_FILE_SYSTEM_DATASET_FACTORY_GET_PRIVATE(object); + + if (priv->format) { + g_object_unref(priv->format); + priv->format = NULL; + } + + if (priv->file_system) 
{ + g_object_unref(priv->file_system); + priv->file_system = NULL; + } + + if (priv->files) { + g_list_free_full(priv->files, g_object_unref); + priv->files = NULL; + } + + G_OBJECT_CLASS( + gadataset_file_system_dataset_factory_parent_class)->dispose(object); +} + +static void +gadataset_file_system_dataset_factory_finalize(GObject *object) +{ + auto priv = GADATASET_FILE_SYSTEM_DATASET_FACTORY_GET_PRIVATE(object); + priv->options.~FileSystemFactoryOptions(); + G_OBJECT_CLASS( + gadataset_file_system_dataset_factory_parent_class)->finalize(object); +} + +static void +gadataset_file_system_dataset_factory_set_property(GObject *object, + guint prop_id, + const GValue *value, + GParamSpec *pspec) +{ + auto priv = GADATASET_FILE_SYSTEM_DATASET_FACTORY_GET_PRIVATE(object); + + switch (prop_id) { + case PROP_FORMAT: + priv->format = GADATASET_FILE_FORMAT(g_value_dup_object(value)); + break; + default: + G_OBJECT_WARN_INVALID_PROPERTY_ID(object, prop_id, pspec); + break; + } +} + +static void +gadataset_file_system_dataset_factory_get_property(GObject *object, + guint prop_id, + GValue *value, + GParamSpec *pspec) +{ + auto priv = GADATASET_FILE_SYSTEM_DATASET_FACTORY_GET_PRIVATE(object); + + switch (prop_id) { + case PROP_FORMAT: + g_value_set_object(value, priv->format); + break; + case PROP_FILE_SYSTEM: + g_value_set_object(value, priv->file_system); + break; + default: + G_OBJECT_WARN_INVALID_PROPERTY_ID(object, prop_id, pspec); + break; + } +} + +static void +gadataset_file_system_dataset_factory_init( + GADatasetFileSystemDatasetFactory *object) +{ + auto priv = GADATASET_FILE_SYSTEM_DATASET_FACTORY_GET_PRIVATE(object); + new(&priv->options) arrow::dataset::FileSystemFactoryOptions; +} + +static void +gadataset_file_system_dataset_factory_class_init( + GADatasetFileSystemDatasetFactoryClass *klass) +{ + auto gobject_class = G_OBJECT_CLASS(klass); + gobject_class->dispose = gadataset_file_system_dataset_factory_dispose; + gobject_class->finalize = 
gadataset_file_system_dataset_factory_finalize; + gobject_class->set_property = gadataset_file_system_dataset_factory_set_property; + gobject_class->get_property = gadataset_file_system_dataset_factory_get_property; + + GParamSpec *spec; + /** + * GADatasetFileSystemDatasetFactory:format: + * + * Format passed to #GADatasetFileSystemDataset. + * + * Since: 5.0.0 + */ + spec = g_param_spec_object("format", + "Format", + "Format passed to GADatasetFileSystemDataset", + GADATASET_TYPE_FILE_FORMAT, + static_cast(G_PARAM_READWRITE | + G_PARAM_CONSTRUCT_ONLY)); + g_object_class_install_property(gobject_class, PROP_FORMAT, spec); + + /** + * GADatasetFileSystemDatasetFactory:file-system: + * + * File system passed to #GADatasetFileSystemDataset. + * + * Since: 5.0.0 + */ + spec = g_param_spec_object("file-system", + "File system", + "File system passed to GADatasetFileSystemDataset", + GARROW_TYPE_FILE_SYSTEM, + static_cast(G_PARAM_READABLE)); + g_object_class_install_property(gobject_class, PROP_FILE_SYSTEM, spec); +} + +/** + * gadataset_file_system_factory_new: + * @format: A #GADatasetFileFormat. + * @error: (nullable): Return location for a #GError or %NULL. + * + * Returns: A newly created #GADatasetDatasetFileSystemFactory on success, + * %NULL on error. + * + * Since: 5.0.0 + */ +GADatasetFileSystemDatasetFactory * +gadataset_file_system_dataset_factory_new(GADatasetFileFormat *format) +{ + return GADATASET_FILE_SYSTEM_DATASET_FACTORY( + g_object_new(GADATASET_TYPE_FILE_SYSTEM_DATASET_FACTORY, + "format", format, + NULL)); +} + +/** + * gadataset_file_system_dataset_factory_set_file_system: + * @factory: A #GADatasetFileSystemDatasetFactory. + * @file_system: A #GArrowFileSystem. + * @error: (nullable): Return location for a #GError or %NULL. + * + * Returns: %TRUE on success, %FALSE otherwise. 
+ * + * Since: 5.0.0 + */ +gboolean +gadataset_file_system_dataset_factory_set_file_system( + GADatasetFileSystemDatasetFactory *factory, + GArrowFileSystem *file_system, + GError **error) +{ + const gchar *context = "[file-system-dataset-factory][set-file-system]"; + auto priv = GADATASET_FILE_SYSTEM_DATASET_FACTORY_GET_PRIVATE(factory); + if (priv->file_system) { + garrow::check(error, + arrow::Status::Invalid("file system is already set"), + context); + return FALSE; + } + priv->file_system = file_system; + g_object_ref(priv->file_system); + return TRUE; +} + +/** + * gadataset_file_system_dataset_factory_set_file_system_uri: + * @factory: A #GADatasetFileSystemDatasetFactory. + * @uri: An URI for file system. + * @error: (nullable): Return location for a #GError or %NULL. + * + * Returns: %TRUE on success, %FALSE otherwise. + * + * Since: 5.0.0 + */ +gboolean +gadataset_file_system_dataset_factory_set_file_system_uri( + GADatasetFileSystemDatasetFactory *factory, + const gchar *uri, + GError **error) +{ + const gchar *context = "[file-system-dataset-factory][set-file-system-uri]"; + auto priv = GADATASET_FILE_SYSTEM_DATASET_FACTORY_GET_PRIVATE(factory); + if (priv->file_system) { + garrow::check(error, + arrow::Status::Invalid("file system is already set"), + context); + return FALSE; + } + std::string internal_path; + auto arrow_file_system_result = + arrow::fs::FileSystemFromUri(uri, &internal_path); + if (!garrow::check(error, arrow_file_system_result, context)) { + return FALSE; + } + auto arrow_file_system = *arrow_file_system_result; + auto arrow_file_info_result = arrow_file_system->GetFileInfo(internal_path); + if (!garrow::check(error, arrow_file_info_result, context)) { + return FALSE; + } + priv->file_system = garrow_file_system_new_raw(&arrow_file_system); + auto file_info = garrow_file_info_new_raw(*arrow_file_info_result); + priv->files = g_list_prepend(priv->files, file_info); + return TRUE; +} + +/** + * 
gadataset_file_system_dataset_factory_add_path: + * @factory: A #GADatasetFileSystemDatasetFactory. + * @path: A path to be added. + * @error: (nullable): Return location for a #GError or %NULL. + * + * Returns: %TRUE on success, %FALSE otherwise. + * + * Since: 5.0.0 + */ +gboolean +gadataset_file_system_dataset_factory_add_path( + GADatasetFileSystemDatasetFactory *factory, + const gchar *path, + GError **error) +{ + const gchar *context = "[file-system-dataset-factory][add-path]"; + auto priv = GADATASET_FILE_SYSTEM_DATASET_FACTORY_GET_PRIVATE(factory); + if (!priv->file_system) { + garrow::check(error, + arrow::Status::Invalid("file system isn't set"), + context); + return FALSE; + } + auto arrow_file_system = garrow_file_system_get_raw(priv->file_system); + auto arrow_file_info_result = arrow_file_system->GetFileInfo(path); + if (!garrow::check(error, arrow_file_info_result, context)) { + return FALSE; + } + auto file_info = garrow_file_info_new_raw(*arrow_file_info_result); + priv->files = g_list_prepend(priv->files, file_info); + return TRUE; +} + +/** + * gadataset_file_system_dataset_factory_finish: + * @factory: A #GADatasetFileSystemDatasetFactory. + * @error: (nullable): Return location for a #GError or %NULL. + * + * Returns: (transfer full) (nullable): + * A newly created #GADatasetFileSystemDataset on success, %NULL on error. 
+ * + * Since: 5.0.0 + */ +GADatasetFileSystemDataset * +gadataset_file_system_dataset_factory_finish( + GADatasetFileSystemDatasetFactory *factory, + GError **error) +{ + const gchar *context = "[file-system-dataset-factory][finish]"; + auto priv = GADATASET_FILE_SYSTEM_DATASET_FACTORY_GET_PRIVATE(factory); + if (!priv->file_system) { + garrow::check(error, + arrow::Status::Invalid("file system isn't set"), + context); + return NULL; + } + auto arrow_file_system = garrow_file_system_get_raw(priv->file_system); + auto arrow_format = gadataset_file_format_get_raw(priv->format); + std::vector arrow_files; + priv->files = g_list_reverse(priv->files); + for (auto node = priv->files; node; node = node->next) { + auto file = GARROW_FILE_INFO(node->data); + arrow_files.push_back(*garrow_file_info_get_raw(file)); + } + priv->files = g_list_reverse(priv->files); + auto arrow_factory_result = + arrow::dataset::FileSystemDatasetFactory::Make(arrow_file_system, + arrow_files, + arrow_format, + priv->options); + if (!garrow::check(error, arrow_factory_result, context)) { + return NULL; + } + auto arrow_dataset_result = (*arrow_factory_result)->Finish(); + if (!garrow::check(error, arrow_dataset_result, context)) { + return NULL; + } + auto arrow_dataset = *arrow_dataset_result; + return GADATASET_FILE_SYSTEM_DATASET( + gadataset_dataset_new_raw(&arrow_dataset, + "dataset", &arrow_dataset, + "file-system", priv->file_system, + "format", priv->format, + NULL)); +} + + +G_END_DECLS + +std::shared_ptr +gadataset_dataset_factory_get_raw(GADatasetDatasetFactory *factory) +{ + auto priv = GADATASET_DATASET_FACTORY_GET_PRIVATE(factory); + return priv->factory; +} diff --git a/c_glib/arrow-dataset-glib/dataset-factory.h b/c_glib/arrow-dataset-glib/dataset-factory.h new file mode 100644 index 00000000000..e2ee3ed9806 --- /dev/null +++ b/c_glib/arrow-dataset-glib/dataset-factory.h @@ -0,0 +1,98 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor 
license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#pragma once + +#include + +G_BEGIN_DECLS + +#define GADATASET_TYPE_DATASET_FACTORY (gadataset_dataset_factory_get_type()) +G_DECLARE_DERIVABLE_TYPE(GADatasetDatasetFactory, + gadataset_dataset_factory, + GADATASET, + DATASET_FACTORY, + GObject) +struct _GADatasetDatasetFactoryClass +{ + GObjectClass parent_class; +}; + +GARROW_AVAILABLE_IN_5_0 +GADatasetDataset * +gadataset_dataset_factory_finish(GADatasetDatasetFactory *factory, + GError **error); + + +#define GADATASET_TYPE_FILE_SYSTEM_DATASET_FACTORY \ + (gadataset_file_system_dataset_factory_get_type()) +G_DECLARE_DERIVABLE_TYPE(GADatasetFileSystemDatasetFactory, + gadataset_file_system_dataset_factory, + GADATASET, + FILE_SYSTEM_DATASET_FACTORY, + GADatasetDatasetFactory) +struct _GADatasetFileSystemDatasetFactoryClass +{ + GADatasetDatasetFactoryClass parent_class; +}; + +GARROW_AVAILABLE_IN_5_0 +GADatasetFileSystemDatasetFactory * +gadataset_file_system_dataset_factory_new(GADatasetFileFormat *file_format); +GARROW_AVAILABLE_IN_5_0 +gboolean +gadataset_file_system_dataset_factory_set_file_system( + GADatasetFileSystemDatasetFactory *factory, + GArrowFileSystem *file_system, + GError **error); +gboolean +gadataset_file_system_dataset_factory_set_file_system_uri( + 
GADatasetFileSystemDatasetFactory *factory, + const gchar *uri, + GError **error); + +GARROW_AVAILABLE_IN_5_0 +gboolean +gadataset_file_system_dataset_factory_add_path( + GADatasetFileSystemDatasetFactory *factory, + const gchar *path, + GError **error); +/* +GARROW_AVAILABLE_IN_5_0 +gboolean +gadataset_file_system_dataset_factory_add_file( + GADatasetFileSystemDatasetFactory *factory, + GArrowFileInfo *file, + GError **error); +GARROW_AVAILABLE_IN_5_0 +gboolean +gadataset_file_system_dataset_factory_add_selector( + GADatasetFileSystemDatasetFactory *factory, + GArrorFileSelector *selector, + GError **error); +*/ + +GARROW_AVAILABLE_IN_5_0 +GADatasetFileSystemDataset * +gadataset_file_system_dataset_factory_finish( + GADatasetFileSystemDatasetFactory *factory, + GError **error); + + +G_END_DECLS diff --git a/c_glib/arrow-dataset-glib/dataset-factory.hpp b/c_glib/arrow-dataset-glib/dataset-factory.hpp new file mode 100644 index 00000000000..114db35bc59 --- /dev/null +++ b/c_glib/arrow-dataset-glib/dataset-factory.hpp @@ -0,0 +1,27 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +#pragma once + +#include + +#include + +std::shared_ptr +gadataset_dataset_factory_get_raw(GADatasetDatasetFactory *factory); diff --git a/c_glib/arrow-dataset-glib/dataset.cpp b/c_glib/arrow-dataset-glib/dataset.cpp new file mode 100644 index 00000000000..3bd62f99ef3 --- /dev/null +++ b/c_glib/arrow-dataset-glib/dataset.cpp @@ -0,0 +1,365 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#include +#include + +#include +#include +#include + +G_BEGIN_DECLS + +/** + * SECTION: dataset + * @section_id: dataset + * @title: Dataset related classes + * @include: arrow-dataset-glib/arrow-dataset-glib.h + * + * #GADatasetDataset is a base class for datasets. + * + * #GADatasetFileSystemDataset is a class for file system dataset. + * + * #GADatasetFileFormat is a base class for file formats. + * + * #GADatasetCSVFileFormat is a class for CSV file format. + * + * #GADatasetIPCFileFormat is a class for IPC file format. + * + * #GADatasetParquetFileFormat is a class for Apache Parquet file format. 
+ * + * Since: 5.0.0 + */ + +typedef struct GADatasetDatasetPrivate_ { + std::shared_ptr dataset; +} GADatasetDatasetPrivate; + +enum { + PROP_DATASET = 1, +}; + +G_DEFINE_ABSTRACT_TYPE_WITH_PRIVATE(GADatasetDataset, + gadataset_dataset, + G_TYPE_OBJECT) + +#define GADATASET_DATASET_GET_PRIVATE(obj) \ + static_cast( \ + gadataset_dataset_get_instance_private( \ + GADATASET_DATASET(obj))) + +static void +gadataset_dataset_finalize(GObject *object) +{ + auto priv = GADATASET_DATASET_GET_PRIVATE(object); + priv->dataset.~shared_ptr(); + G_OBJECT_CLASS(gadataset_dataset_parent_class)->finalize(object); +} + +static void +gadataset_dataset_set_property(GObject *object, + guint prop_id, + const GValue *value, + GParamSpec *pspec) +{ + auto priv = GADATASET_DATASET_GET_PRIVATE(object); + + switch (prop_id) { + case PROP_DATASET: + priv->dataset = + *static_cast *>( + g_value_get_pointer(value)); + break; + default: + G_OBJECT_WARN_INVALID_PROPERTY_ID(object, prop_id, pspec); + break; + } +} + +static void +gadataset_dataset_init(GADatasetDataset *object) +{ + auto priv = GADATASET_DATASET_GET_PRIVATE(object); + new(&priv->dataset) std::shared_ptr; +} + +static void +gadataset_dataset_class_init(GADatasetDatasetClass *klass) +{ + auto gobject_class = G_OBJECT_CLASS(klass); + gobject_class->finalize = gadataset_dataset_finalize; + gobject_class->set_property = gadataset_dataset_set_property; + + GParamSpec *spec; + spec = g_param_spec_pointer("dataset", + "Dataset", + "The raw " + "std::shared *", + static_cast(G_PARAM_WRITABLE | + G_PARAM_CONSTRUCT_ONLY)); + g_object_class_install_property(gobject_class, PROP_DATASET, spec); +} + +/** + * gadataset_dataset_begin_scan: + * @dataset: A #GADatasetDataset. + * @error: (nullable): Return location for a #GError or %NULL. + * + * Returns: (transfer full) (nullable): + * A newly created #GADatasetScannerBuilder on success, %NULL on error. 
+ * + * Since: 5.0.0 + */ +GADatasetScannerBuilder * +gadataset_dataset_begin_scan(GADatasetDataset *dataset, + GError **error) +{ + return gadataset_scanner_builder_new(dataset, error); +} + +/** + * gadataset_dataset_to_table: + * @dataset: A #GADatasetDataset. + * @error: (nullable): Return location for a #GError or %NULL. + * + * Returns: (transfer full) (nullable): + * A loaded #GArrowTable on success, %NULL on error. + * + * Since: 5.0.0 + */ +GArrowTable * +gadataset_dataset_to_table(GADatasetDataset *dataset, + GError **error) +{ + auto arrow_dataset = gadataset_dataset_get_raw(dataset); + auto arrow_scanner_builder_result = arrow_dataset->NewScan(); + if (!garrow::check(error, + arrow_scanner_builder_result, + "[dataset][to-table]")) { + return NULL; + } + auto arrow_scanner_builder = *arrow_scanner_builder_result; + auto arrow_scanner_result = arrow_scanner_builder->Finish(); + if (!garrow::check(error, + arrow_scanner_result, + "[dataset][to-table]")) { + return NULL; + } + auto arrow_scanner = *arrow_scanner_result; + auto arrow_table_result = arrow_scanner->ToTable(); + if (!garrow::check(error, + arrow_scanner_result, + "[dataset][to-table]")) { + return NULL; + } + return garrow_table_new_raw(&(*arrow_table_result)); +} + +/** + * gadataset_dataset_get_type_name: + * @dataset: A #GADatasetDataset. + * + * Returns: The type name of @dataset. + * + * It should be freed with g_free() when no longer needed. 
+ * + * Since: 5.0.0 + */ +gchar * +gadataset_dataset_get_type_name(GADatasetDataset *dataset) +{ + const auto arrow_dataset = gadataset_dataset_get_raw(dataset); + const auto &type_name = arrow_dataset->type_name(); + return g_strndup(type_name.data(), type_name.size()); +} + + +typedef struct GADatasetFileSystemDatasetPrivate_ { + GADatasetFileFormat *format; + GArrowFileSystem *file_system; +} GADatasetFileSystemDatasetPrivate; + +enum { + PROP_FORMAT = 1, + PROP_FILE_SYSTEM, +}; + +G_DEFINE_TYPE_WITH_PRIVATE(GADatasetFileSystemDataset, + gadataset_file_system_dataset, + GADATASET_TYPE_DATASET) + +#define GADATASET_FILE_SYSTEM_DATASET_GET_PRIVATE(obj) \ + static_cast( \ + gadataset_file_system_dataset_get_instance_private( \ + GADATASET_FILE_SYSTEM_DATASET(obj))) + +static void +gadataset_file_system_dataset_dispose(GObject *object) +{ + auto priv = GADATASET_FILE_SYSTEM_DATASET_GET_PRIVATE(object); + + if (priv->format) { + g_object_unref(priv->format); + priv->format = NULL; + } + + if (priv->file_system) { + g_object_unref(priv->file_system); + priv->file_system = NULL; + } + + G_OBJECT_CLASS(gadataset_file_system_dataset_parent_class)->dispose(object); +} + +static void +gadataset_file_system_dataset_set_property(GObject *object, + guint prop_id, + const GValue *value, + GParamSpec *pspec) +{ + auto priv = GADATASET_FILE_SYSTEM_DATASET_GET_PRIVATE(object); + + switch (prop_id) { + case PROP_FORMAT: + priv->format = GADATASET_FILE_FORMAT(g_value_dup_object(value)); + break; + case PROP_FILE_SYSTEM: + priv->file_system = GARROW_FILE_SYSTEM(g_value_dup_object(value)); + break; + default: + G_OBJECT_WARN_INVALID_PROPERTY_ID(object, prop_id, pspec); + break; + } +} + +static void +gadataset_file_system_dataset_get_property(GObject *object, + guint prop_id, + GValue *value, + GParamSpec *pspec) +{ + auto priv = GADATASET_FILE_SYSTEM_DATASET_GET_PRIVATE(object); + + switch (prop_id) { + case PROP_FORMAT: + g_value_set_object(value, priv->format); + break; + case 
PROP_FILE_SYSTEM: + g_value_set_object(value, priv->file_system); + break; + default: + G_OBJECT_WARN_INVALID_PROPERTY_ID(object, prop_id, pspec); + break; + } +} + +static void +gadataset_file_system_dataset_init(GADatasetFileSystemDataset *object) +{ +} + +static void +gadataset_file_system_dataset_class_init(GADatasetFileSystemDatasetClass *klass) +{ + auto gobject_class = G_OBJECT_CLASS(klass); + gobject_class->dispose = gadataset_file_system_dataset_dispose; + gobject_class->set_property = gadataset_file_system_dataset_set_property; + gobject_class->get_property = gadataset_file_system_dataset_get_property; + + GParamSpec *spec; + /** + * GADatasetFileSystemDataset:format: + * + * Format of the dataset. + * + * Since: 5.0.0 + */ + spec = g_param_spec_object("format", + "Format", + "Format of the dataset", + GADATASET_TYPE_FILE_FORMAT, + static_cast(G_PARAM_READWRITE | + G_PARAM_CONSTRUCT_ONLY)); + g_object_class_install_property(gobject_class, PROP_FORMAT, spec); + + /** + * GADatasetFileSystemDataset:file-system: + * + * File system of the dataset. + * + * Since: 5.0.0 + */ + spec = g_param_spec_object("file-system", + "File system", + "File system of the dataset", + GARROW_TYPE_FILE_SYSTEM, + static_cast(G_PARAM_READWRITE | + G_PARAM_CONSTRUCT_ONLY)); + g_object_class_install_property(gobject_class, PROP_FILE_SYSTEM, spec); +} + + +G_END_DECLS + +GADatasetDataset * +gadataset_dataset_new_raw( + std::shared_ptr *arrow_dataset) +{ + return gadataset_dataset_new_raw(arrow_dataset, + "dataset", arrow_dataset, + NULL); +} + +GADatasetDataset * +gadataset_dataset_new_raw( + std::shared_ptr *arrow_dataset, + const gchar *first_property_name, + ...) 
+{ + va_list args; + va_start(args, first_property_name); + auto array = gadataset_dataset_new_raw_valist(arrow_dataset, + first_property_name, + args); + va_end(args); + return array; +} + +GADatasetDataset * +gadataset_dataset_new_raw_valist( + std::shared_ptr *arrow_dataset, + const gchar *first_property_name, + va_list args) +{ + GType type = GADATASET_TYPE_DATASET; + const auto type_name = (*arrow_dataset)->type_name(); + if (type_name == "filesystem") { + type = GADATASET_TYPE_FILE_SYSTEM_DATASET; + } + return GADATASET_DATASET(g_object_new_valist(type, + first_property_name, + args)); +} + +std::shared_ptr +gadataset_dataset_get_raw(GADatasetDataset *dataset) +{ + auto priv = GADATASET_DATASET_GET_PRIVATE(dataset); + return priv->dataset; +} diff --git a/c_glib/arrow-dataset-glib/dataset.h b/c_glib/arrow-dataset-glib/dataset.h new file mode 100644 index 00000000000..97cf35d74d7 --- /dev/null +++ b/c_glib/arrow-dataset-glib/dataset.h @@ -0,0 +1,65 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +#pragma once + +#include + +G_BEGIN_DECLS + +typedef struct _GADatasetScannerBuilder GADatasetScannerBuilder; + +#define GADATASET_TYPE_DATASET (gadataset_dataset_get_type()) +G_DECLARE_DERIVABLE_TYPE(GADatasetDataset, + gadataset_dataset, + GADATASET, + DATASET, + GObject) +struct _GADatasetDatasetClass +{ + GObjectClass parent_class; +}; + +GARROW_AVAILABLE_IN_5_0 +GADatasetScannerBuilder * +gadataset_dataset_begin_scan(GADatasetDataset *dataset, + GError **error); +GARROW_AVAILABLE_IN_5_0 +GArrowTable * +gadataset_dataset_to_table(GADatasetDataset *dataset, + GError **error); +GARROW_AVAILABLE_IN_5_0 +gchar * +gadataset_dataset_get_type_name(GADatasetDataset *dataset); + + +#define GADATASET_TYPE_FILE_SYSTEM_DATASET \ + (gadataset_file_system_dataset_get_type()) +G_DECLARE_DERIVABLE_TYPE(GADatasetFileSystemDataset, + gadataset_file_system_dataset, + GADATASET, + FILE_SYSTEM_DATASET, + GADatasetDataset) +struct _GADatasetFileSystemDatasetClass +{ + GADatasetDatasetClass parent_class; +}; + + +G_END_DECLS diff --git a/c_glib/arrow-dataset-glib/dataset.hpp b/c_glib/arrow-dataset-glib/dataset.hpp new file mode 100644 index 00000000000..94dddd2eb7a --- /dev/null +++ b/c_glib/arrow-dataset-glib/dataset.hpp @@ -0,0 +1,48 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. 
See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#pragma once + +#include + +#include + +GADatasetDataset * +gadataset_dataset_new_raw( + std::shared_ptr *arrow_dataset); +GADatasetDataset * +gadataset_dataset_new_raw( + std::shared_ptr *arrow_dataset, + const gchar *first_property_name, + ...); +GADatasetDataset * +gadataset_dataset_new_raw_valist( + std::shared_ptr *arrow_dataset, + const gchar *first_property_name, + va_list arg); +std::shared_ptr +gadataset_dataset_get_raw(GADatasetDataset *dataset); + +GADatasetFileFormat * +gadataset_file_format_new_raw( + std::shared_ptr *arrow_format); +std::shared_ptr +gadataset_dataset_get_raw(GADatasetDataset *dataset); + + diff --git a/c_glib/arrow-dataset-glib/meson.build b/c_glib/arrow-dataset-glib/meson.build index 04dc420b057..b3f617330cf 100644 --- a/c_glib/arrow-dataset-glib/meson.build +++ b/c_glib/arrow-dataset-glib/meson.build @@ -18,6 +18,8 @@ # under the License. sources = files( + 'dataset-factory.cpp', + 'dataset.cpp', 'file-format.cpp', 'fragment.cpp', 'scanner.cpp', @@ -25,6 +27,8 @@ sources = files( c_headers = files( 'arrow-dataset-glib.h', + 'dataset-factory.h', + 'dataset.h', 'file-format.h', 'fragment.h', 'scanner.h', @@ -32,6 +36,8 @@ c_headers = files( cpp_headers = files( 'arrow-dataset-glib.hpp', + 'dataset-factory.hpp', + 'dataset.hpp', 'file-format.hpp', 'fragment.hpp', 'scanner.hpp', diff --git a/c_glib/arrow-dataset-glib/scanner.cpp b/c_glib/arrow-dataset-glib/scanner.cpp index 04778c8ae99..7f8d8be5fdb 100644 --- a/c_glib/arrow-dataset-glib/scanner.cpp +++ b/c_glib/arrow-dataset-glib/scanner.cpp @@ -17,13 +17,10 @@ * under the License. 
*/ -#include - #include -#include -#include +#include -#include +#include #include G_BEGIN_DECLS @@ -31,72 +28,55 @@ G_BEGIN_DECLS /** * SECTION: scanner * @section_id: scanner - * @title: Scanner classes + * @title: Scanner related classes * @include: arrow-dataset-glib/arrow-dataset-glib.h * - * #GADatasetScanOptions is a class for a set of scan options. - * - * #GADatasetScanTask is an abstract class for a scan task. + * #GADatasetScanner is a class for scanning dataset. * - * #GADatasetInMemoryScanTask is a class for a scan task of record batches. + * #GADatasetScannerBuilder is a class for building a scanner. * - * Since: 1.0.0 + * Since: 5.0.0 */ -/* arrow::dataset::ScanOptions */ - -typedef struct GADatasetScanOptionsPrivate_ { - std::shared_ptr scan_options; -} GADatasetScanOptionsPrivate; +typedef struct GADatasetScannerPrivate_ { + std::shared_ptr scanner; +} GADatasetScannerPrivate; enum { - PROP_SCAN_OPTIONS = 1, - PROP_FILTER, - PROP_EVALUATOR, - PROP_PROJECTOR, - PROP_BATCH_SIZE, - PROP_USE_THREADS, + PROP_SCANNER = 1, }; -G_DEFINE_TYPE_WITH_PRIVATE(GADatasetScanOptions, - gadataset_scan_options, +G_DEFINE_TYPE_WITH_PRIVATE(GADatasetScanner, + gadataset_scanner, G_TYPE_OBJECT) -#define GADATASET_SCAN_OPTIONS_GET_PRIVATE(obj) \ - static_cast( \ - gadataset_scan_options_get_instance_private( \ - GADATASET_SCAN_OPTIONS(obj))) +#define GADATASET_SCANNER_GET_PRIVATE(obj) \ + static_cast( \ + gadataset_scanner_get_instance_private( \ + GADATASET_SCANNER(obj))) static void -gadataset_scan_options_finalize(GObject *object) +gadataset_scanner_finalize(GObject *object) { - auto priv = GADATASET_SCAN_OPTIONS_GET_PRIVATE(object); - - priv->scan_options.~shared_ptr(); - - G_OBJECT_CLASS(gadataset_scan_options_parent_class)->finalize(object); + auto priv = GADATASET_SCANNER_GET_PRIVATE(object); + priv->scanner.~shared_ptr(); + G_OBJECT_CLASS(gadataset_scanner_parent_class)->finalize(object); } static void -gadataset_scan_options_set_property(GObject *object, - 
guint prop_id, - const GValue *value, - GParamSpec *pspec) +gadataset_scanner_set_property(GObject *object, + guint prop_id, + const GValue *value, + GParamSpec *pspec) { - auto priv = GADATASET_SCAN_OPTIONS_GET_PRIVATE(object); + auto priv = GADATASET_SCANNER_GET_PRIVATE(object); switch (prop_id) { - case PROP_SCAN_OPTIONS: - priv->scan_options = - *static_cast *>( + case PROP_SCANNER: + priv->scanner = + *static_cast *>( g_value_get_pointer(value)); break; - case PROP_BATCH_SIZE: - priv->scan_options->batch_size = g_value_get_int64(value); - break; - case PROP_USE_THREADS: - priv->scan_options->use_threads = g_value_get_boolean(value); - break; default: G_OBJECT_WARN_INVALID_PROPERTY_ID(object, prop_id, pspec); break; @@ -104,193 +84,92 @@ gadataset_scan_options_set_property(GObject *object, } static void -gadataset_scan_options_get_property(GObject *object, - guint prop_id, - GValue *value, - GParamSpec *pspec) +gadataset_scanner_init(GADatasetScanner *object) { - auto priv = GADATASET_SCAN_OPTIONS_GET_PRIVATE(object); - - switch (prop_id) { - case PROP_BATCH_SIZE: - g_value_set_int64(value, priv->scan_options->batch_size); - break; - case PROP_USE_THREADS: - g_value_set_boolean(value, priv->scan_options->use_threads); - break; - default: - G_OBJECT_WARN_INVALID_PROPERTY_ID(object, prop_id, pspec); - break; - } + auto priv = GADATASET_SCANNER_GET_PRIVATE(object); + new(&priv->scanner) std::shared_ptr; } static void -gadataset_scan_options_init(GADatasetScanOptions *object) +gadataset_scanner_class_init(GADatasetScannerClass *klass) { - auto priv = GADATASET_SCAN_OPTIONS_GET_PRIVATE(object); - new(&priv->scan_options) std::shared_ptr; -} + auto gobject_class = G_OBJECT_CLASS(klass); + gobject_class->finalize = gadataset_scanner_finalize; + gobject_class->set_property = gadataset_scanner_set_property; -static void -gadataset_scan_options_class_init(GADatasetScanOptionsClass *klass) -{ - GObjectClass *gobject_class; GParamSpec *spec; - - gobject_class = 
G_OBJECT_CLASS(klass); - - gobject_class->finalize = gadataset_scan_options_finalize; - gobject_class->set_property = gadataset_scan_options_set_property; - gobject_class->get_property = gadataset_scan_options_get_property; - - auto scan_options = std::make_shared(); - - spec = g_param_spec_pointer("scan-options", - "ScanOptions", - "The raw std::shared *", + spec = g_param_spec_pointer("scanner", + "Scanner", + "The raw std::shared *", static_cast(G_PARAM_WRITABLE | G_PARAM_CONSTRUCT_ONLY)); - g_object_class_install_property(gobject_class, PROP_SCAN_OPTIONS, spec); - - // TODO: PROP_FILTER - // TODO: PROP_EVALUATOR - // TODO: PROP_PROJECTOR - - /** - * GADatasetScanOptions:batch-size: - * - * Maximum row count for scanned batches. - * - * Since: 1.0.0 - */ - spec = g_param_spec_int64("batch-size", - "Batch size", - "Maximum row count for scanned batches", - 0, - G_MAXINT64, - scan_options->batch_size, - static_cast(G_PARAM_READWRITE)); - g_object_class_install_property(gobject_class, PROP_BATCH_SIZE, spec); - - /** - * GADatasetScanOptions:use-threads: - * - * Indicate if the Scanner should make use of a ThreadPool. - * - * Since: 4.0.0 - */ - spec = g_param_spec_boolean("use-threads", - "Use threads", - "Indicate if the Scanner should make use of a ThreadPool", - scan_options->use_threads, - static_cast(G_PARAM_READWRITE)); - g_object_class_install_property(gobject_class, PROP_USE_THREADS, spec); + g_object_class_install_property(gobject_class, PROP_SCANNER, spec); } /** - * gadataset_scan_options_new: - * @schema: A #GArrowSchema. - * - * Returns: A newly created #GADatasetScanOptions. 
- * - * Since: 1.0.0 - */ -GADatasetScanOptions * -gadataset_scan_options_new(GArrowSchema *schema) -{ - auto arrow_schema = garrow_schema_get_raw(schema); - auto arrow_scan_options = std::make_shared(); - arrow_scan_options->dataset_schema = arrow_schema; - return gadataset_scan_options_new_raw(&arrow_scan_options); -} - -/** - * gadataset_scan_options_get_schema: - * @scan_options: A #GADatasetScanOptions. + * gadataset_scanner_to_table: + * @scanner: A #GADatasetScanner. + * @error: (nullable): Return location for a #GError or %NULL. * - * Returns: (transfer full): A #GArrowSchema. + * Returns: (transfer full) (nullable): + * A newly created #GArrowTable on success, %NULL on error. * - * Since: 1.0.0 + * Since: 5.0.0 */ -GArrowSchema * -gadataset_scan_options_get_schema(GADatasetScanOptions *scan_options) +GArrowTable * +gadataset_scanner_to_table(GADatasetScanner *scanner, + GError **error) { - auto priv = GADATASET_SCAN_OPTIONS_GET_PRIVATE(scan_options); - auto arrow_schema = priv->scan_options->dataset_schema; - return garrow_schema_new_raw(&arrow_schema); + auto arrow_scanner = gadataset_scanner_get_raw(scanner); + auto arrow_table_result = arrow_scanner->ToTable(); + if (garrow::check(error, arrow_table_result, "[scanner][to-table]")) { + auto arrow_table = *arrow_table_result; + return garrow_table_new_raw(&arrow_table); + } else { + return NULL; + } } -/* arrow::dataset::ScanTask */ -typedef struct GADatasetScanTaskPrivate_ { - std::shared_ptr scan_task; - GADatasetScanOptions *options; - GADatasetFragment *fragment; -} GADatasetScanTaskPrivate; +typedef struct GADatasetScannerBuilderPrivate_ { + std::shared_ptr scanner_builder; +} GADatasetScannerBuilderPrivate; enum { - PROP_SCAN_TASK = 1, - PROP_OPTIONS, - PROP_FRAGMENT, + PROP_SCANNER_BUILDER = 1, }; -G_DEFINE_ABSTRACT_TYPE_WITH_PRIVATE(GADatasetScanTask, - gadataset_scan_task, - G_TYPE_OBJECT) - -#define GADATASET_SCAN_TASK_GET_PRIVATE(obj) \ - static_cast( \ - 
gadataset_scan_task_get_instance_private( \ - GADATASET_SCAN_TASK(obj))) - -static void -gadataset_scan_task_dispose(GObject *object) -{ - auto priv = GADATASET_SCAN_TASK_GET_PRIVATE(object); - - if (priv->options) { - g_object_unref(priv->options); - priv->options = NULL; - } +G_DEFINE_TYPE_WITH_PRIVATE(GADatasetScannerBuilder, + gadataset_scanner_builder, + G_TYPE_OBJECT) - if (priv->fragment) { - g_object_unref(priv->fragment); - priv->fragment = NULL; - } - - G_OBJECT_CLASS(gadataset_scan_task_parent_class)->dispose(object); -} +#define GADATASET_SCANNER_BUILDER_GET_PRIVATE(obj) \ + static_cast( \ + gadataset_scanner_builder_get_instance_private( \ + GADATASET_SCANNER_BUILDER(obj))) static void -gadataset_scan_task_finalize(GObject *object) +gadataset_scanner_builder_finalize(GObject *object) { - auto priv = GADATASET_SCAN_TASK_GET_PRIVATE(object); - - priv->scan_task.~shared_ptr(); - - G_OBJECT_CLASS(gadataset_scan_task_parent_class)->finalize(object); + auto priv = GADATASET_SCANNER_BUILDER_GET_PRIVATE(object); + priv->scanner_builder.~shared_ptr(); + G_OBJECT_CLASS(gadataset_scanner_builder_parent_class)->finalize(object); } static void -gadataset_scan_task_set_property(GObject *object, - guint prop_id, - const GValue *value, - GParamSpec *pspec) +gadataset_scanner_builder_set_property(GObject *object, + guint prop_id, + const GValue *value, + GParamSpec *pspec) { - auto priv = GADATASET_SCAN_TASK_GET_PRIVATE(object); + auto priv = GADATASET_SCANNER_BUILDER_GET_PRIVATE(object); switch (prop_id) { - case PROP_SCAN_TASK: - priv->scan_task = - *static_cast *>( + case PROP_SCANNER_BUILDER: + priv->scanner_builder = + *static_cast *>( g_value_get_pointer(value)); break; - case PROP_OPTIONS: - priv->options = GADATASET_SCAN_OPTIONS(g_value_dup_object(value)); - break; - case PROP_FRAGMENT: - priv->fragment = GADATASET_FRAGMENT(g_value_dup_object(value)); - break; default: G_OBJECT_WARN_INVALID_PROPERTY_ID(object, prop_id, pspec); break; @@ -298,230 +177,112 @@ 
gadataset_scan_task_set_property(GObject *object, } static void -gadataset_scan_task_get_property(GObject *object, - guint prop_id, - GValue *value, - GParamSpec *pspec) +gadataset_scanner_builder_init(GADatasetScannerBuilder *object) { - auto priv = GADATASET_SCAN_TASK_GET_PRIVATE(object); - - switch (prop_id) { - case PROP_OPTIONS: - g_value_set_object(value, priv->options); - break; - case PROP_FRAGMENT: - g_value_set_object(value, priv->fragment); - break; - default: - G_OBJECT_WARN_INVALID_PROPERTY_ID(object, prop_id, pspec); - break; - } + auto priv = GADATASET_SCANNER_BUILDER_GET_PRIVATE(object); + new(&priv->scanner_builder) std::shared_ptr; } static void -gadataset_scan_task_init(GADatasetScanTask *object) -{ - auto priv = GADATASET_SCAN_TASK_GET_PRIVATE(object); - new(&priv->scan_task) std::shared_ptr; -} - -static void -gadataset_scan_task_class_init(GADatasetScanTaskClass *klass) +gadataset_scanner_builder_class_init(GADatasetScannerBuilderClass *klass) { auto gobject_class = G_OBJECT_CLASS(klass); - - gobject_class->dispose = gadataset_scan_task_dispose; - gobject_class->finalize = gadataset_scan_task_finalize; - gobject_class->set_property = gadataset_scan_task_set_property; - gobject_class->get_property = gadataset_scan_task_get_property; + gobject_class->finalize = gadataset_scanner_builder_finalize; + gobject_class->set_property = gadataset_scanner_builder_set_property; GParamSpec *spec; - spec = g_param_spec_pointer("scan-task", - "ScanTask", - "The raw std::shared *", + spec = g_param_spec_pointer("scanner-builder", + "Scanner builder", + "The raw " + "std::shared *", static_cast(G_PARAM_WRITABLE | G_PARAM_CONSTRUCT_ONLY)); - g_object_class_install_property(gobject_class, PROP_SCAN_TASK, spec); - - /** - * GADatasetScanTask:options: - * - * The options of the scan task. 
- * - * Since: 1.0.0 - */ - spec = g_param_spec_object("options", - "Options", - "The options of the scan task", - GADATASET_TYPE_SCAN_OPTIONS, - static_cast(G_PARAM_READWRITE | - G_PARAM_CONSTRUCT_ONLY)); - g_object_class_install_property(gobject_class, PROP_OPTIONS, spec); - - /** - * GADatasetScanTask:fragment: - * - * The fragment of the scan task. - * - * Since: 4.0.0 - */ - spec = g_param_spec_object("fragment", - "Fragment", - "The fragment of the scan task", - GADATASET_TYPE_FRAGMENT, - static_cast(G_PARAM_READWRITE | - G_PARAM_CONSTRUCT_ONLY)); - g_object_class_install_property(gobject_class, PROP_FRAGMENT, spec); + g_object_class_install_property(gobject_class, PROP_SCANNER_BUILDER, spec); } /** - * gadataset_scan_task_get_options: - * @scan_task: A #GADatasetScanTask. - * - * Returns: (transfer full): A #GADatasetScanOptions. - * - * Since: 1.0.0 - */ -GADatasetScanOptions * -gadataset_scan_task_get_options(GADatasetScanTask *scan_task) -{ - auto priv = GADATASET_SCAN_TASK_GET_PRIVATE(scan_task); - if (priv->options) { - g_object_ref(priv->options); - return priv->options; - } - - auto arrow_options = priv->scan_task->options(); - return gadataset_scan_options_new_raw(&arrow_options); -} - -/** - * gadataset_scan_task_get_fragment: - * @scan_task: A #GADatasetFragment. + * gadataset_scanner_builder_new: + * @dataset: A #GADatasetDatast to be scanned. + * @error: (nullable): Return location for a #GError or %NULL. * - * Returns: (transfer full): A #GADatasetFragment. + * Returns: (nullable): A newly created #GADatasetScannerBuilder on success, + * %NULL on error. 
* - * Since: 4.0.0 + * Since: 5.0.0 */ -GADatasetFragment * -gadataset_scan_task_get_fragment(GADatasetScanTask *scan_task) +GADatasetScannerBuilder * +gadataset_scanner_builder_new(GADatasetDataset *dataset, GError **error) { - auto priv = GADATASET_SCAN_TASK_GET_PRIVATE(scan_task); - if (priv->fragment) { - g_object_ref(priv->fragment); - return priv->fragment; + auto arrow_dataset = gadataset_dataset_get_raw(dataset); + auto arrow_scanner_builder_result = arrow_dataset->NewScan(); + if (garrow::check(error, + arrow_scanner_builder_result, + "[scanner-builder][new]")) { + auto arrow_scanner_builder = *arrow_scanner_builder_result; + return gadataset_scanner_builder_new_raw(&arrow_scanner_builder); + } else { + return NULL; } - - auto arrow_fragment = priv->scan_task->fragment(); - return gadataset_fragment_new_raw(&arrow_fragment); } /** - * gadataset_scan_task_execute: - * @scan_task: A #GADatasetScanTask. + * gadataset_scanner_builder_finish: + * @builder: A #GADatasetScannerBuilder. * @error: (nullable): Return location for a #GError or %NULL. * - * Returns: (nullable) (transfer full): A newly created #GArrowRecordBatchIterator, - * or %NULL on error. + * Returns: (transfer full) (nullable): + * A newly created #GADatasetScanner on success, %NULL on error. 
* - * Since: 1.0.0 + * Since: 5.0.0 */ -GArrowRecordBatchIterator * -gadataset_scan_task_execute(GADatasetScanTask *scan_task, - GError **error) +GADatasetScanner * +gadataset_scanner_builder_finish(GADatasetScannerBuilder *builder, + GError **error) { - auto priv = GADATASET_SCAN_TASK_GET_PRIVATE(scan_task); - auto arrow_result = priv->scan_task->Execute(); - if (garrow::check(error, arrow_result, "[datasets][scan-task][execute]")) { - auto arrow_record_batch_iteraor = std::move(*arrow_result); - return garrow_record_batch_iterator_new_raw(&arrow_record_batch_iteraor); + auto arrow_builder = gadataset_scanner_builder_get_raw(builder); + auto arrow_scanner_result = arrow_builder->Finish(); + if (garrow::check(error, arrow_scanner_result, "[scanner-builder][finish]")) { + auto arrow_scanner = *arrow_scanner_result; + return gadataset_scanner_new_raw(&arrow_scanner); } else { return NULL; } } -/* arrow::dataset::InMemoryScanTask */ - -G_DEFINE_TYPE(GADatasetInMemoryScanTask, - gadataset_in_memory_scan_task, - GADATASET_TYPE_SCAN_TASK) - -static void -gadataset_in_memory_scan_task_init(GADatasetInMemoryScanTask *object) -{ -} -static void -gadataset_in_memory_scan_task_class_init(GADatasetInMemoryScanTaskClass *klass) -{ -} +G_END_DECLS -/** - * gadataset_in_memory_scan_task_new: - * @record_batches: (array length=n_record_batches): - * (element-type GArrowRecordBatch): The record batches of the table. - * @n_record_batches: The number of record batches. - * @options: A #GADatasetScanOptions. - * @fragment: A #GADatasetInMemoryFragment. - * - * Returns: A newly created #GADatasetInMemoryScanTask. 
- * - * Since: 1.0.0 - */ -GADatasetInMemoryScanTask * -gadataset_in_memory_scan_task_new(GArrowRecordBatch **record_batches, - gsize n_record_batches, - GADatasetScanOptions *options, - GADatasetInMemoryFragment *fragment) +GADatasetScanner * +gadataset_scanner_new_raw( + std::shared_ptr *arrow_scanner) { - std::vector> arrow_record_batches; - arrow_record_batches.reserve(n_record_batches); - for (gsize i = 0; i < n_record_batches; ++i) { - auto arrow_record_batch = garrow_record_batch_get_raw(record_batches[i]); - arrow_record_batches.push_back(arrow_record_batch); - } - auto arrow_options = gadataset_scan_options_get_raw(options); - auto arrow_fragment = gadataset_fragment_get_raw(GADATASET_FRAGMENT(fragment)); - auto arrow_in_memory_scan_task = - std::make_shared(arrow_record_batches, - arrow_options, - arrow_fragment); - return gadataset_in_memory_scan_task_new_raw(&arrow_in_memory_scan_task, - options, - fragment); + auto scanner = + GADATASET_SCANNER(g_object_new(GADATASET_TYPE_SCANNER, + "scanner", arrow_scanner, + NULL)); + return scanner; } -G_END_DECLS - -GADatasetScanOptions * -gadataset_scan_options_new_raw( - std::shared_ptr *arrow_scan_options) +std::shared_ptr +gadataset_scanner_get_raw(GADatasetScanner *scanner) { - auto scan_options = - GADATASET_SCAN_OPTIONS(g_object_new(GADATASET_TYPE_SCAN_OPTIONS, - "scan-options", arrow_scan_options, - NULL)); - return scan_options; + auto priv = GADATASET_SCANNER_GET_PRIVATE(scanner); + return priv->scanner; } -std::shared_ptr -gadataset_scan_options_get_raw(GADatasetScanOptions *scan_options) +GADatasetScannerBuilder * +gadataset_scanner_builder_new_raw( + std::shared_ptr *arrow_scanner_builder) { - auto priv = GADATASET_SCAN_OPTIONS_GET_PRIVATE(scan_options); - return priv->scan_options; + return GADATASET_SCANNER_BUILDER( + g_object_new(GADATASET_TYPE_SCANNER_BUILDER, + "scanner-builder", arrow_scanner_builder, + NULL)); } -GADatasetInMemoryScanTask * -gadataset_in_memory_scan_task_new_raw( - 
std::shared_ptr *arrow_in_memory_scan_task, - GADatasetScanOptions *options, - GADatasetInMemoryFragment *fragment) +std::shared_ptr +gadataset_scanner_builder_get_raw(GADatasetScannerBuilder *scanner_builder) { - auto in_memory_scan_task = - GADATASET_IN_MEMORY_SCAN_TASK(g_object_new(GADATASET_TYPE_IN_MEMORY_SCAN_TASK, - "scan-task", arrow_in_memory_scan_task, - "options", options, - "fragment", fragment, - NULL)); - return in_memory_scan_task; + auto priv = GADATASET_SCANNER_BUILDER_GET_PRIVATE(scanner_builder); + return priv->scanner_builder; } diff --git a/c_glib/arrow-dataset-glib/scanner.h b/c_glib/arrow-dataset-glib/scanner.h index 90a60363e82..446815d6db1 100644 --- a/c_glib/arrow-dataset-glib/scanner.h +++ b/c_glib/arrow-dataset-glib/scanner.h @@ -19,76 +19,45 @@ #pragma once -#include - +#include #include G_BEGIN_DECLS -/* arrow::dataset::ScanOptions */ - -#define GADATASET_TYPE_SCAN_OPTIONS (gadataset_scan_options_get_type()) -G_DECLARE_DERIVABLE_TYPE(GADatasetScanOptions, - gadataset_scan_options, +#define GADATASET_TYPE_SCANNER (gadataset_scanner_get_type()) +G_DECLARE_DERIVABLE_TYPE(GADatasetScanner, + gadataset_scanner, GADATASET, - SCAN_OPTIONS, + SCANNER, GObject) -struct _GADatasetScanOptionsClass +struct _GADatasetScannerClass { GObjectClass parent_class; }; +GARROW_AVAILABLE_IN_5_0 +GArrowTable * +gadataset_scanner_to_table(GADatasetScanner *scanner, + GError **error); -GARROW_AVAILABLE_IN_1_0 -GADatasetScanOptions * -gadataset_scan_options_new(GArrowSchema *schema); -GARROW_AVAILABLE_IN_1_0 -GArrowSchema * -gadataset_scan_options_get_schema(GADatasetScanOptions *scan_options); - -/* arrow::dataset::ScanTask */ - -#define GADATASET_TYPE_SCAN_TASK (gadataset_scan_task_get_type()) -G_DECLARE_DERIVABLE_TYPE(GADatasetScanTask, - gadataset_scan_task, +#define GADATASET_TYPE_SCANNER_BUILDER (gadataset_scanner_builder_get_type()) +G_DECLARE_DERIVABLE_TYPE(GADatasetScannerBuilder, + gadataset_scanner_builder, GADATASET, - SCAN_TASK, + SCANNER_BUILDER, 
GObject) -struct _GADatasetScanTaskClass +struct _GADatasetScannerBuilderClass { GObjectClass parent_class; }; -GARROW_AVAILABLE_IN_1_0 -GADatasetScanOptions * -gadataset_scan_task_get_options(GADatasetScanTask *scan_task); -GARROW_AVAILABLE_IN_4_0 -GADatasetFragment * -gadataset_scan_task_get_fragment(GADatasetScanTask *scan_task); -GARROW_AVAILABLE_IN_1_0 -GArrowRecordBatchIterator * -gadataset_scan_task_execute(GADatasetScanTask *scan_task, - GError **error); - -/* arrow::dataset::InMemoryScanTask */ - -#define GADATASET_TYPE_IN_MEMORY_SCAN_TASK \ - (gadataset_in_memory_scan_task_get_type()) -G_DECLARE_DERIVABLE_TYPE(GADatasetInMemoryScanTask, - gadataset_in_memory_scan_task, - GADATASET, - IN_MEMORY_SCAN_TASK, - GADatasetScanTask) -struct _GADatasetInMemoryScanTaskClass -{ - GADatasetScanTaskClass parent_class; -}; - -GARROW_AVAILABLE_IN_1_0 -GADatasetInMemoryScanTask * -gadataset_in_memory_scan_task_new(GArrowRecordBatch **record_batches, - gsize n_record_batches, - GADatasetScanOptions *options, - GADatasetInMemoryFragment *fragment); +GARROW_AVAILABLE_IN_5_0 +GADatasetScannerBuilder * +gadataset_scanner_builder_new(GADatasetDataset *dataset, + GError **error); +GARROW_AVAILABLE_IN_5_0 +GADatasetScanner * +gadataset_scanner_builder_finish(GADatasetScannerBuilder *builder, + GError **error); G_END_DECLS diff --git a/c_glib/arrow-dataset-glib/scanner.hpp b/c_glib/arrow-dataset-glib/scanner.hpp index ad3ac6a03cd..663ab6fc44b 100644 --- a/c_glib/arrow-dataset-glib/scanner.hpp +++ b/c_glib/arrow-dataset-glib/scanner.hpp @@ -24,14 +24,14 @@ #include #include -GADatasetScanOptions * -gadataset_scan_options_new_raw( - std::shared_ptr *arrow_scan_options); -std::shared_ptr -gadataset_scan_options_get_raw(GADatasetScanOptions *scan_options); +GADatasetScanner * +gadataset_scanner_new_raw( + std::shared_ptr *arrow_scanner); +std::shared_ptr +gadataset_scanner_get_raw(GADatasetScanner *scanner); -GADatasetInMemoryScanTask * -gadataset_in_memory_scan_task_new_raw( - 
std::shared_ptr *arrow_in_memory_scan_task, - GADatasetScanOptions *scan_options, - GADatasetInMemoryFragment *fragment); +GADatasetScannerBuilder * +gadataset_scanner_builder_new_raw( + std::shared_ptr *arrow_scanner_builder); +std::shared_ptr +gadataset_scanner_builder_get_raw(GADatasetScannerBuilder *scanner_builder); diff --git a/c_glib/arrow-glib/basic-array.cpp b/c_glib/arrow-glib/basic-array.cpp index d5b221a36b0..1eb65b88964 100644 --- a/c_glib/arrow-glib/basic-array.cpp +++ b/c_glib/arrow-glib/basic-array.cpp @@ -221,9 +221,9 @@ garrow_equal_options_set_property(GObject *object, static void garrow_equal_options_get_property(GObject *object, - guint prop_id, - GValue *value, - GParamSpec *pspec) + guint prop_id, + GValue *value, + GParamSpec *pspec) { auto priv = GARROW_EQUAL_OPTIONS_GET_PRIVATE(object); diff --git a/c_glib/doc/arrow-dataset-glib/arrow-dataset-glib-docs.xml b/c_glib/doc/arrow-dataset-glib/arrow-dataset-glib-docs.xml index 9a1ae059378..3e8da5bd9d1 100644 --- a/c_glib/doc/arrow-dataset-glib/arrow-dataset-glib-docs.xml +++ b/c_glib/doc/arrow-dataset-glib/arrow-dataset-glib-docs.xml @@ -36,9 +36,15 @@ - - Read - + + Data + + Dataset + + Dataset factory + + + Scan Fragment @@ -60,6 +66,10 @@ Index of deprecated API + + Index of new symbols in 4.0.0 + + Index of new symbols in 4.0.0 @@ -68,9 +78,5 @@ Index of new symbols in 3.0.0 - - Index of new symbols in 1.0.0 - - diff --git a/c_glib/test/dataset/test-file-system-dataset-factory.rb b/c_glib/test/dataset/test-file-system-dataset-factory.rb new file mode 100644 index 00000000000..9ef629c222e --- /dev/null +++ b/c_glib/test/dataset/test-file-system-dataset-factory.rb @@ -0,0 +1,55 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. 
The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +class TestDatasetFileSystemDatasetFactory < Test::Unit::TestCase + include Helper::Buildable + include Helper::Writable + + def setup + omit("Arrow Dataset is required") unless defined?(ArrowDataset) + Dir.mktmpdir do |tmpdir| + @dir = tmpdir + @path = File.join(@dir, "table.arrow") + @table = build_table(visible: [ + build_boolean_array([true, false, true]), + build_boolean_array([false, true, false, true]), + ], + point: [ + build_int32_array([1, 2, 3]), + build_int32_array([-1, -2, -3, -4]), + ]) + @format = ArrowDataset::IPCFileFormat.new + write_table(@table, @path) + yield + end + end + + def test_file_system + factory = ArrowDataset::FileSystemDatasetFactory.new(@format) + factory.file_system = Arrow::LocalFileSystem.new + factory.add_path(File.expand_path(@path)) + dataset = factory.finish + assert_equal(@table, dataset.to_table) + end + + def test_file_system_uri + factory = ArrowDataset::FileSystemDatasetFactory.new(@format) + factory.file_system_uri = build_file_uri(@path) + dataset = factory.finish + assert_equal(@table, dataset.to_table) + end +end diff --git a/ruby/red-arrow-dataset/test/test-in-memory-scan-task.rb b/c_glib/test/dataset/test-file-system-dataset.rb similarity index 64% rename from ruby/red-arrow-dataset/test/test-in-memory-scan-task.rb rename to c_glib/test/dataset/test-file-system-dataset.rb index 37f041d3159..6d6ec3b18c6 100644 --- 
a/ruby/red-arrow-dataset/test/test-in-memory-scan-task.rb +++ b/c_glib/test/dataset/test-file-system-dataset.rb @@ -15,19 +15,20 @@ # specific language governing permissions and limitations # under the License. -class TestInMemoryScanTask < Test::Unit::TestCase +class TestDatasetFileSystemDataset < Test::Unit::TestCase def setup - @record_batches = [ - Arrow::RecordBatch.new(visible: [true, false, true], - point: [1, 2, 3]), - ] + omit("Arrow Dataset is required") unless defined?(ArrowDataset) + Dir.mktmpdir do |tmpdir| + @dir = tmpdir + format = ArrowDataset::IPCFileFormat.new + factory = ArrowDataset::FileSystemDatasetFactory.new(format) + factory.file_system = Arrow::LocalFileSystem.new + @dataset = factory.finish + yield + end end - sub_test_case(".new") do - test("[[Arrow::RecordBatch]]") do - scan_task = ArrowDataset::InMemoryScanTask.new(@record_batches) - assert_equal(@record_batches, - scan_task.execute.to_a) - end + def test_type_name + assert_equal("filesystem", @dataset.type_name) end end diff --git a/c_glib/test/dataset/test-in-memory-scan-task.rb b/c_glib/test/dataset/test-in-memory-scan-task.rb deleted file mode 100644 index 06e3d0d2424..00000000000 --- a/c_glib/test/dataset/test-in-memory-scan-task.rb +++ /dev/null @@ -1,59 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. 
See the License for the -# specific language governing permissions and limitations -# under the License. - -class TestDatasetInMemoryScanTask < Test::Unit::TestCase - include Helper::Buildable - - def setup - omit("Arrow Dataset is required") unless defined?(ArrowDataset) - fields = [ - Arrow::Field.new("visible", Arrow::BooleanDataType.new), - Arrow::Field.new("point", Arrow::Int32DataType.new), - ] - @schema = Arrow::Schema.new(fields) - @record_batches = [ - [ - build_boolean_array([true, false, true]), - build_int32_array([1, 2, 3]), - ], - [ - build_boolean_array([false, true, false, true]), - build_int32_array([-1, -2, -3, -4]), - ] - ].collect do |columns| - Arrow::RecordBatch.new(@schema, columns[0].length, columns) - end - - @scan_options = ArrowDataset::ScanOptions.new(@schema) - - @fragment = ArrowDataset::InMemoryFragment.new(@schema, - @record_batches) - - @scan_task = ArrowDataset::InMemoryScanTask.new(@record_batches, - @scan_options, - @fragment) - end - - def test_scan_options - assert_equal(@scan_options, @scan_task.options) - end - - def test_execute - assert_equal(@record_batches, - @scan_task.execute.to_list) - end -end diff --git a/c_glib/test/dataset/test-scan-options.rb b/c_glib/test/dataset/test-scan-options.rb deleted file mode 100644 index 0536b2a7cca..00000000000 --- a/c_glib/test/dataset/test-scan-options.rb +++ /dev/null @@ -1,47 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. 
You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -class TestDatasetScanOptions < Test::Unit::TestCase - def setup - omit("Arrow Dataset is required") unless defined?(ArrowDataset) - @schema = Arrow::Schema.new([]) - @scan_options = ArrowDataset::ScanOptions.new(@schema) - end - - def test_schema - assert_equal(@schema, - @scan_options.schema) - end - - def test_batch_size - assert_equal(1<<20, - @scan_options.batch_size) - @scan_options.batch_size = 42 - assert_equal(42, - @scan_options.batch_size) - end - - def test_use_threads - assert do - not @scan_options.use_threads? - end - @scan_options.use_threads = true - assert do - @scan_options.use_threads? - end - end -end diff --git a/c_glib/test/dataset/test-scanner.rb b/c_glib/test/dataset/test-scanner.rb new file mode 100644 index 00000000000..f7702d4905f --- /dev/null +++ b/c_glib/test/dataset/test-scanner.rb @@ -0,0 +1,48 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. 
See the License for the +# specific language governing permissions and limitations +# under the License. + +class TestDatasetScanner < Test::Unit::TestCase + include Helper::Buildable + include Helper::Writable + + def setup + omit("Arrow Dataset is required") unless defined?(ArrowDataset) + Dir.mktmpdir do |tmpdir| + path = File.join(tmpdir, "table.arrow") + @table = build_table(visible: [ + build_boolean_array([true, false, true]), + build_boolean_array([false, true, false, true]), + ], + point: [ + build_int32_array([1, 2, 3]), + build_int32_array([-1, -2, -3, -4]), + ]) + @format = ArrowDataset::IPCFileFormat.new + write_table(@table, path) + factory = ArrowDataset::FileSystemDatasetFactory.new(@format) + factory.file_system_uri = build_file_uri(path) + @dataset = factory.finish + builder = @dataset.begin_scan + @scanner = builder.finish + yield + end + end + + def test_to_table + assert_equal(@table, @scanner.to_table) + end +end diff --git a/c_glib/test/helper/buildable.rb b/c_glib/test/helper/buildable.rb index 04ae22f8715..356fa651c6a 100644 --- a/c_glib/test/helper/buildable.rb +++ b/c_glib/test/helper/buildable.rb @@ -205,7 +205,15 @@ def append_to_builder(builder, value) def build_table(columns) fields = [] chunked_arrays = [] - columns.each do |name, chunked_array| + columns.each do |name, data| + case data + when Arrow::Array + chunked_array = Arrow::ChunkedArray.new([data]) + when Array + chunked_array = Arrow::ChunkedArray.new(data) + else + chunked_array = data + end fields << Arrow::Field.new(name, chunked_array.value_data_type) chunked_arrays << chunked_array end @@ -222,6 +230,15 @@ def build_record_batch(columns) Arrow::RecordBatch.new(schema, n_rows, columns.values) end + def build_file_uri(path) + absolute_path = File.expand_path(path) + if absolute_path.start_with?("/") + "file://#{absolute_path}" + else + "file:///#{absolute_path}" + end + end + private def build_array(builder, values) values.each do |value| diff --git 
a/ruby/red-arrow-dataset/lib/arrow-dataset/in-memory-fragment.rb b/c_glib/test/helper/writable.rb similarity index 63% rename from ruby/red-arrow-dataset/lib/arrow-dataset/in-memory-fragment.rb rename to c_glib/test/helper/writable.rb index 917d6c79d0d..0053e972f91 100644 --- a/ruby/red-arrow-dataset/lib/arrow-dataset/in-memory-fragment.rb +++ b/c_glib/test/helper/writable.rb @@ -15,18 +15,25 @@ # specific language governing permissions and limitations # under the License. -module ArrowDataset - class InMemoryFragment - alias_method :initialize_raw, :initialize - private :initialize_raw - def initialize(schema, record_batches) - record_batches = record_batches.collect do |record_batch| - unless record_batch.is_a?(Arrow::RecordBatch) - record_batch = Arrow::RecordBatch.new(record_batch) +module Helper + module Writable + def write_table(table, path, type: :file) + output = Arrow::FileOutputStream.new(path, false) + begin + if type == :file + writer_class = Arrow::RecordBatchFileWriter + else + writer_class = Arrow::RecordBatchStreamWriter end - record_batch + writer = writer_class.new(output, table.schema) + begin + writer.write_table(table) + ensure + writer.close + end + ensure + output.close end - initialize_raw(schema, record_batches) end end end diff --git a/c_glib/test/run-test.rb b/c_glib/test/run-test.rb index 044cb33a019..9c6af05224e 100755 --- a/c_glib/test/run-test.rb +++ b/c_glib/test/run-test.rb @@ -83,10 +83,11 @@ class BooleanScalar require_relative "helper/buildable" require_relative "helper/data-type" require_relative "helper/fixture" -require_relative "helper/omittable" -require_relative "helper/plasma-store" if defined?(ArrowFlight) require_relative "helper/flight-server" end +require_relative "helper/omittable" +require_relative "helper/plasma-store" +require_relative "helper/writable" exit(Test::Unit::AutoRunner.run(true, test_dir.to_s)) diff --git a/cpp/src/arrow/dataset/discovery.h b/cpp/src/arrow/dataset/discovery.h index 
5559638448f..40c02051955 100644 --- a/cpp/src/arrow/dataset/discovery.h +++ b/cpp/src/arrow/dataset/discovery.h @@ -237,16 +237,23 @@ class ARROW_DS_EXPORT FileSystemDatasetFactory : public DatasetFactory { std::shared_ptr format, FileSystemFactoryOptions options); + /// \brief Build a FileSystemDatasetFactory from an explicit list of + /// file information. + /// + /// \param[in] filesystem passed to FileSystemDataset + /// \param[in] files passed to FileSystemDataset + /// \param[in] format passed to FileSystemDataset + /// \param[in] options see FileSystemFactoryOptions for more information. + static Result> Make( + std::shared_ptr filesystem, const std::vector& files, + std::shared_ptr format, FileSystemFactoryOptions options); + Result>> InspectSchemas( InspectOptions options) override; Result> Finish(FinishOptions options) override; protected: - static Result> Make( - std::shared_ptr filesystem, const std::vector& files, - std::shared_ptr format, FileSystemFactoryOptions options); - FileSystemDatasetFactory(std::vector files, std::shared_ptr filesystem, std::shared_ptr format, diff --git a/ruby/red-arrow-dataset/lib/arrow-dataset/scan-options.rb b/ruby/red-arrow-dataset/lib/arrow-dataset/dataset.rb similarity index 69% rename from ruby/red-arrow-dataset/lib/arrow-dataset/scan-options.rb rename to ruby/red-arrow-dataset/lib/arrow-dataset/dataset.rb index 1467743655b..a658fc3f2e0 100644 --- a/ruby/red-arrow-dataset/lib/arrow-dataset/scan-options.rb +++ b/ruby/red-arrow-dataset/lib/arrow-dataset/dataset.rb @@ -16,21 +16,13 @@ # under the License. 
module ArrowDataset - class ScanOptions + class Dataset class << self - def try_convert(value) - case value - when Hash - return nil unless value.key?(:schema) - options = new(value[:schema]) - value.each do |name, value| - next if name == :schema - options.__send__("#{name}=", value) - end - options - else - nil - end + def build(*args) + factory_class = ArrowDataset.const_get("#{name}Factory") + factory = factory_class.new(*args) + yield(factory) + factory.finish end end end diff --git a/ruby/red-arrow-dataset/lib/arrow-dataset/in-memory-scan-task.rb b/ruby/red-arrow-dataset/lib/arrow-dataset/in-memory-scan-task.rb deleted file mode 100644 index 5e127e179c6..00000000000 --- a/ruby/red-arrow-dataset/lib/arrow-dataset/in-memory-scan-task.rb +++ /dev/null @@ -1,35 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
- -module ArrowDataset - class InMemoryScanTask - alias_method :initialize_raw, :initialize - private :initialize_raw - def initialize(record_batches, **options) - record_batches = record_batches.collect do |record_batch| - unless record_batch.is_a?(Arrow::RecordBatch) - record_batch = Arrow::RecordBatch.new(record_batch) - end - record_batch - end - options[:schema] ||= record_batches.first.schema - fragment = options.delete(:fragment) - fragment ||= InMemoryFragment.new(options[:schema], record_batches) - initialize_raw(record_batches, options, fragment) - end - end -end diff --git a/ruby/red-arrow-dataset/lib/arrow-dataset/loader.rb b/ruby/red-arrow-dataset/lib/arrow-dataset/loader.rb index fcac52d268f..6a0dc5079d8 100644 --- a/ruby/red-arrow-dataset/lib/arrow-dataset/loader.rb +++ b/ruby/red-arrow-dataset/lib/arrow-dataset/loader.rb @@ -29,8 +29,7 @@ def post_load(repository, namespace) end def require_libraries - require "arrow-dataset/in-memory-scan-task" - require "arrow-dataset/scan-options" + require "arrow-dataset/dataset" end end end diff --git a/ruby/red-arrow-dataset/test/helper.rb b/ruby/red-arrow-dataset/test/helper.rb index 795df3beb01..7231eb1cb64 100644 --- a/ruby/red-arrow-dataset/test/helper.rb +++ b/ruby/red-arrow-dataset/test/helper.rb @@ -17,4 +17,6 @@ require "arrow-dataset" +require "tmpdir" + require "test-unit" diff --git a/ruby/red-arrow-dataset/test/test-scan-options.rb b/ruby/red-arrow-dataset/test/test-file-system-dataset.rb similarity index 58% rename from ruby/red-arrow-dataset/test/test-scan-options.rb rename to ruby/red-arrow-dataset/test/test-file-system-dataset.rb index a9a947ff88d..17cbcb88d74 100644 --- a/ruby/red-arrow-dataset/test/test-scan-options.rb +++ b/ruby/red-arrow-dataset/test/test-file-system-dataset.rb @@ -15,22 +15,24 @@ # specific language governing permissions and limitations # under the License. 
-class TestScanOptions < Test::Unit::TestCase +class TestFileSystemDataset < Test::Unit::TestCase def setup - @record_batches = [ - Arrow::RecordBatch.new(visible: [true, false, true], - point: [1, 2, 3]), - ] - @schema = @record_batches.first.schema + Dir.mktmpdir do |tmpdir| + @dir = tmpdir + @path = File.join(@dir, "table.arrow") + @table = Arrow::Table.new(visible: [true, false, true], + point: [1, 2, 3]) + @table.save(@path) + @format = ArrowDataset::IPCFileFormat.new + yield + end end - sub_test_case(".try_convert") do - def test_hash - batch_size = 1024 - context = ArrowDataset::ScanOptions.try_convert(schema: @schema, - batch_size: batch_size) - assert_equal([@schema, batch_size], - [context.schema, context.batch_size]) + test(".build") do + dataset = ArrowDataset::FileSystemDataset.build(@format) do |factory| + factory.file_system = Arrow::LocalFileSystem.new + factory.add_path(File.expand_path(@path)) end + assert_equal(@table, dataset.to_table) end end From cc0bd605e57ee39c522c262327359aa341de73bf Mon Sep 17 00:00:00 2001 From: Weston Pace Date: Wed, 16 Jun 2021 11:00:11 +0200 Subject: [PATCH 22/61] ARROW-13036: [Doc] Mention recommended file extension(s) for Arrow IPC See JIRA Closes #10512 from westonpace/feature/ARROW-13036--doc-mention-recommended-file-extension-s-for-ar Authored-by: Weston Pace Signed-off-by: Antoine Pitrou --- docs/source/format/Columnar.rst | 25 ++++++++++++++----------- 1 file changed, 14 insertions(+), 11 deletions(-) diff --git a/docs/source/format/Columnar.rst b/docs/source/format/Columnar.rst index 102c3a73317..52920a49b35 100644 --- a/docs/source/format/Columnar.rst +++ b/docs/source/format/Columnar.rst @@ -1006,19 +1006,21 @@ message flatbuffer is read, you can then read the message body. The stream writer can signal end-of-stream (EOS) either by writing 8 bytes containing the 4-byte continuation indicator (``0xFFFFFFFF``) followed by 0 -metadata length (``0x00000000``) or closing the stream interface. 
+metadata length (``0x00000000``) or closing the stream interface. We +recommend the ".arrows" file extension for the streaming format although +in many cases these streams will not ever be stored as files. IPC File Format --------------- -We define a "file format" supporting random access that is build with -the stream format. The file starts and ends with a magic string -``ARROW1`` (plus padding). What follows in the file is identical to -the stream format. At the end of the file, we write a *footer* -containing a redundant copy of the schema (which is a part of the -streaming format) plus memory offsets and sizes for each of the data -blocks in the file. This enables random access any record batch in the -file. See `File.fbs`_ for the precise details of the file footer. +We define a "file format" supporting random access that is an extension of +the stream format. The file starts and ends with a magic string ``ARROW1`` +(plus padding). What follows in the file is identical to the stream format. +At the end of the file, we write a *footer* containing a redundant copy of +the schema (which is a part of the streaming format) plus memory offsets and +sizes for each of the data blocks in the file. This enables random access to +any record batch in the file. See `File.fbs`_ for the precise details of the +file footer. Schematically we have: :: @@ -1034,8 +1036,9 @@ should be defined in a ``DictionaryBatch`` before they are used in a ``RecordBatch``, as long as the keys are defined somewhere in the file. Further more, it is invalid to have more than one **non-delta** dictionary batch per dictionary ID (i.e. dictionary replacement is not -supported). Delta dictionaries are applied in the order they appear in -the file footer. +supported). Delta dictionaries are applied in the order they appear in +the file footer. We recommend the ".arrow" extension for files created with +this format. 
Dictionary Messages ------------------- From 85a4052097a0d26e930d2a404ba86ecf3db1633d Mon Sep 17 00:00:00 2001 From: Nate Clark Date: Wed, 16 Jun 2021 12:58:39 +0200 Subject: [PATCH 23/61] ARROW-12995: [C++] Add validation to CSV options Closes #10505 from n3world/ARROW-12995-Validate_csv_opts Authored-by: Nate Clark Signed-off-by: Antoine Pitrou --- cpp/src/arrow/csv/options.cc | 43 ++++++++++++++++ cpp/src/arrow/csv/options.h | 14 ++++++ cpp/src/arrow/csv/reader.cc | 8 +++ cpp/src/arrow/csv/writer.cc | 2 + python/pyarrow/_csv.pyx | 13 +++++ python/pyarrow/includes/libarrow.pxd | 8 +++ python/pyarrow/tests/test_csv.py | 74 ++++++++++++++++++++++++++++ 7 files changed, 162 insertions(+) diff --git a/cpp/src/arrow/csv/options.cc b/cpp/src/arrow/csv/options.cc index a515abf2cf4..c71cfdaf295 100644 --- a/cpp/src/arrow/csv/options.cc +++ b/cpp/src/arrow/csv/options.cc @@ -22,6 +22,19 @@ namespace csv { ParseOptions ParseOptions::Defaults() { return ParseOptions(); } +Status ParseOptions::Validate() const { + if (ARROW_PREDICT_FALSE(delimiter == '\n' || delimiter == '\r')) { + return Status::Invalid("ParseOptions: delimiter cannot be \\r or \\n"); + } + if (ARROW_PREDICT_FALSE(quoting && (quote_char == '\n' || quote_char == '\r'))) { + return Status::Invalid("ParseOptions: quote_char cannot be \\r or \\n"); + } + if (ARROW_PREDICT_FALSE(escaping && (escape_char == '\n' || escape_char == '\r'))) { + return Status::Invalid("ParseOptions: escape_char cannot be \\r or \\n"); + } + return Status::OK(); +} + ConvertOptions ConvertOptions::Defaults() { auto options = ConvertOptions(); // Same default null / true / false spellings as in Pandas. 
@@ -33,8 +46,38 @@ ConvertOptions ConvertOptions::Defaults() { return options; } +Status ConvertOptions::Validate() const { return Status::OK(); } + ReadOptions ReadOptions::Defaults() { return ReadOptions(); } + +Status ReadOptions::Validate() const { + if (ARROW_PREDICT_FALSE(block_size < 1)) { + // Min is 1 because some tests use really small block sizes + return Status::Invalid("ReadOptions: block_size must be at least 1: ", block_size); + } + if (ARROW_PREDICT_FALSE(skip_rows < 0)) { + return Status::Invalid("ReadOptions: skip_rows cannot be negative: ", skip_rows); + } + if (ARROW_PREDICT_FALSE(skip_rows_after_names < 0)) { + return Status::Invalid("ReadOptions: skip_rows_after_names cannot be negative: ", + skip_rows_after_names); + } + if (ARROW_PREDICT_FALSE(autogenerate_column_names && !column_names.empty())) { + return Status::Invalid( + "ReadOptions: autogenerate_column_names cannot be true when column_names are " + "provided"); + } + return Status::OK(); +} + WriteOptions WriteOptions::Defaults() { return WriteOptions(); } +Status WriteOptions::Validate() const { + if (ARROW_PREDICT_FALSE(batch_size < 1)) { + return Status::Invalid("WriteOptions: batch_size must be at least 1: ", batch_size); + } + return Status::OK(); +} + } // namespace csv } // namespace arrow diff --git a/cpp/src/arrow/csv/options.h b/cpp/src/arrow/csv/options.h index d9c94a03f86..790c47fc3f4 100644 --- a/cpp/src/arrow/csv/options.h +++ b/cpp/src/arrow/csv/options.h @@ -24,6 +24,7 @@ #include #include "arrow/csv/type_fwd.h" +#include "arrow/status.h" #include "arrow/util/visibility.h" namespace arrow { @@ -59,6 +60,9 @@ struct ARROW_EXPORT ParseOptions { /// Create parsing options with default values static ParseOptions Defaults(); + + /// \brief Test that all set options are valid + Status Validate() const; }; struct ARROW_EXPORT ConvertOptions { @@ -112,6 +116,9 @@ struct ARROW_EXPORT ConvertOptions { /// Create conversion options with default values, including conventional /// 
values for `null_values`, `true_values` and `false_values` static ConvertOptions Defaults(); + + /// \brief Test that all set options are valid + Status Validate() const; }; struct ARROW_EXPORT ReadOptions { @@ -124,6 +131,7 @@ struct ARROW_EXPORT ReadOptions { /// /// This will determine multi-threading granularity as well as /// the size of individual record batches. + /// Minimum valid value for block size is 1 int32_t block_size = 1 << 20; // 1 MB /// Number of header rows to skip (not including the row of column names, if any) @@ -143,6 +151,9 @@ struct ARROW_EXPORT ReadOptions { /// Create read options with default values static ReadOptions Defaults(); + + /// \brief Test that all set options are valid + Status Validate() const; }; /// Experimental @@ -158,6 +169,9 @@ struct ARROW_EXPORT WriteOptions { /// Create write options with default values static WriteOptions Defaults(); + + /// \brief Test that all set options are valid + Status Validate() const; }; } // namespace csv diff --git a/cpp/src/arrow/csv/reader.cc b/cpp/src/arrow/csv/reader.cc index 068e06178c8..f221ffcadd9 100644 --- a/cpp/src/arrow/csv/reader.cc +++ b/cpp/src/arrow/csv/reader.cc @@ -1033,6 +1033,9 @@ Result> MakeTableReader( MemoryPool* pool, io::IOContext io_context, std::shared_ptr input, const ReadOptions& read_options, const ParseOptions& parse_options, const ConvertOptions& convert_options) { + RETURN_NOT_OK(parse_options.Validate()); + RETURN_NOT_OK(read_options.Validate()); + RETURN_NOT_OK(convert_options.Validate()); std::shared_ptr reader; if (read_options.use_threads) { auto cpu_executor = internal::GetCpuThreadPool(); @@ -1051,6 +1054,9 @@ Future> MakeStreamingReader( io::IOContext io_context, std::shared_ptr input, internal::Executor* cpu_executor, const ReadOptions& read_options, const ParseOptions& parse_options, const ConvertOptions& convert_options) { + RETURN_NOT_OK(parse_options.Validate()); + RETURN_NOT_OK(read_options.Validate()); + 
RETURN_NOT_OK(convert_options.Validate()); std::shared_ptr reader; reader = std::make_shared( io_context, cpu_executor, input, read_options, parse_options, convert_options, @@ -1182,6 +1188,8 @@ Future CountRowsAsync(io::IOContext io_context, internal::Executor* cpu_executor, const ReadOptions& read_options, const ParseOptions& parse_options) { + RETURN_NOT_OK(parse_options.Validate()); + RETURN_NOT_OK(read_options.Validate()); auto counter = std::make_shared( io_context, cpu_executor, std::move(input), read_options, parse_options); return counter->Count(); diff --git a/cpp/src/arrow/csv/writer.cc b/cpp/src/arrow/csv/writer.cc index ddd59b46fc1..e1c34a77ae9 100644 --- a/cpp/src/arrow/csv/writer.cc +++ b/cpp/src/arrow/csv/writer.cc @@ -414,6 +414,7 @@ class CSVConverter { Status WriteCSV(const Table& table, const WriteOptions& options, MemoryPool* pool, arrow::io::OutputStream* output) { + RETURN_NOT_OK(options.Validate()); if (pool == nullptr) { pool = default_memory_pool(); } @@ -424,6 +425,7 @@ Status WriteCSV(const Table& table, const WriteOptions& options, MemoryPool* poo Status WriteCSV(const RecordBatch& batch, const WriteOptions& options, MemoryPool* pool, arrow::io::OutputStream* output) { + RETURN_NOT_OK(options.Validate()); if (pool == nullptr) { pool = default_memory_pool(); } diff --git a/python/pyarrow/_csv.pyx b/python/pyarrow/_csv.pyx index e7dda3fb953..8ede8272c07 100644 --- a/python/pyarrow/_csv.pyx +++ b/python/pyarrow/_csv.pyx @@ -58,6 +58,7 @@ cdef class ReadOptions(_Weakrefable): How much bytes to process at a time from the input stream. This will determine multi-threading granularity as well as the size of individual record batches or table chunks. + Minimum valid value for block size is 1 skip_rows: int, optional (default 0) The number of rows to skip before the column names (if any) and the CSV data. 
@@ -189,6 +190,9 @@ cdef class ReadOptions(_Weakrefable): def skip_rows_after_names(self, value): deref(self.options).skip_rows_after_names = value + def validate(self): + check_status(deref(self.options).Validate()) + def equals(self, ReadOptions other): return ( self.use_threads == other.use_threads and @@ -359,6 +363,9 @@ cdef class ParseOptions(_Weakrefable): def ignore_empty_lines(self, value): deref(self.options).ignore_empty_lines = value + def validate(self): + check_status(deref(self.options).Validate()) + def equals(self, ParseOptions other): return ( self.delimiter == other.delimiter and @@ -680,6 +687,9 @@ cdef class ConvertOptions(_Weakrefable): out.options.reset(new CCSVConvertOptions(move(options))) return out + def validate(self): + check_status(deref(self.options).Validate()) + def equals(self, ConvertOptions other): return ( self.check_utf8 == other.check_utf8 and @@ -941,6 +951,9 @@ cdef class WriteOptions(_Weakrefable): def batch_size(self, value): self.options.batch_size = value + def validate(self): + check_status(self.options.Validate()) + cdef _get_write_options(WriteOptions write_options, CCSVWriteOptions* out): if write_options is None: diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd index 35a2034eba4..b1fb04a1f8e 100644 --- a/python/pyarrow/includes/libarrow.pxd +++ b/python/pyarrow/includes/libarrow.pxd @@ -1592,6 +1592,8 @@ cdef extern from "arrow/csv/api.h" namespace "arrow::csv" nogil: @staticmethod CCSVParseOptions Defaults() + CStatus Validate() + cdef cppclass CCSVConvertOptions" arrow::csv::ConvertOptions": c_bool check_utf8 unordered_map[c_string, shared_ptr[CDataType]] column_types @@ -1613,6 +1615,8 @@ cdef extern from "arrow/csv/api.h" namespace "arrow::csv" nogil: @staticmethod CCSVConvertOptions Defaults() + CStatus Validate() + cdef cppclass CCSVReadOptions" arrow::csv::ReadOptions": c_bool use_threads int32_t block_size @@ -1627,6 +1631,8 @@ cdef extern from "arrow/csv/api.h" 
namespace "arrow::csv" nogil: @staticmethod CCSVReadOptions Defaults() + CStatus Validate() + cdef cppclass CCSVWriteOptions" arrow::csv::WriteOptions": c_bool include_header int32_t batch_size @@ -1634,6 +1640,8 @@ cdef extern from "arrow/csv/api.h" namespace "arrow::csv" nogil: @staticmethod CCSVWriteOptions Defaults() + CStatus Validate() + cdef cppclass CCSVReader" arrow::csv::TableReader": @staticmethod CResult[shared_ptr[CCSVReader]] Make( diff --git a/python/pyarrow/tests/test_csv.py b/python/pyarrow/tests/test_csv.py index 32c0353fada..48cdff75f97 100644 --- a/python/pyarrow/tests/test_csv.py +++ b/python/pyarrow/tests/test_csv.py @@ -132,6 +132,34 @@ def test_read_options(): opts = cls(block_size=1234) assert opts.block_size == 1234 + opts.validate() + + match = "ReadOptions: block_size must be at least 1: 0" + with pytest.raises(pa.ArrowInvalid, match=match): + opts = cls() + opts.block_size = 0 + opts.validate() + + match = "ReadOptions: skip_rows cannot be negative: -1" + with pytest.raises(pa.ArrowInvalid, match=match): + opts = cls() + opts.skip_rows = -1 + opts.validate() + + match = "ReadOptions: skip_rows_after_names cannot be negative: -1" + with pytest.raises(pa.ArrowInvalid, match=match): + opts = cls() + opts.skip_rows_after_names = -1 + opts.validate() + + match = "ReadOptions: autogenerate_column_names cannot be true when" \ + " column_names are provided" + with pytest.raises(pa.ArrowInvalid, match=match): + opts = cls() + opts.autogenerate_column_names = True + opts.column_names = ('a', 'b') + opts.validate() + def test_parse_options(): cls = ParseOptions @@ -150,6 +178,44 @@ def test_parse_options(): newlines_in_values=True, ignore_empty_lines=False) + cls().validate() + opts = cls() + opts.delimiter = "\t" + opts.validate() + + match = "ParseOptions: delimiter cannot be \\\\r or \\\\n" + with pytest.raises(pa.ArrowInvalid, match=match): + opts = cls() + opts.delimiter = "\n" + opts.validate() + + with pytest.raises(pa.ArrowInvalid, 
match=match): + opts = cls() + opts.delimiter = "\r" + opts.validate() + + match = "ParseOptions: quote_char cannot be \\\\r or \\\\n" + with pytest.raises(pa.ArrowInvalid, match=match): + opts = cls() + opts.quote_char = "\n" + opts.validate() + + with pytest.raises(pa.ArrowInvalid, match=match): + opts = cls() + opts.quote_char = "\r" + opts.validate() + + match = "ParseOptions: escape_char cannot be \\\\r or \\\\n" + with pytest.raises(pa.ArrowInvalid, match=match): + opts = cls() + opts.escape_char = "\n" + opts.validate() + + with pytest.raises(pa.ArrowInvalid, match=match): + opts = cls() + opts.escape_char = "\r" + opts.validate() + def test_convert_options(): cls = ConvertOptions @@ -238,6 +304,14 @@ def test_write_options(): opts = cls(batch_size=9876) assert opts.batch_size == 9876 + opts.validate() + + match = "WriteOptions: batch_size must be at least 1: 0" + with pytest.raises(pa.ArrowInvalid, match=match): + opts = cls() + opts.batch_size = 0 + opts.validate() + class BaseTestCSVRead: From 44495dbca134574d89c75a408f9b8d24dd76819a Mon Sep 17 00:00:00 2001 From: Antoine Pitrou Date: Wed, 16 Jun 2021 14:29:03 +0200 Subject: [PATCH 24/61] ARROW-13090: [Python] Fix create_dir() implementation in FSSpecHandler Recent fsspec versions have started raising FileExistsError if the target directory already exists. Ignore the error, as create_dir() is supposed to succeed in that case.
Closes #10540 from pitrou/ARROW-13090-fsspec-create-dir Authored-by: Antoine Pitrou Signed-off-by: Joris Van den Bossche --- python/pyarrow/fs.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/python/pyarrow/fs.py b/python/pyarrow/fs.py index fe505530751..1b86e4b7e0f 100644 --- a/python/pyarrow/fs.py +++ b/python/pyarrow/fs.py @@ -263,7 +263,10 @@ def get_file_info_selector(self, selector): def create_dir(self, path, recursive): # mkdir also raises FileNotFoundError when base directory is not found - self.fs.mkdir(path, create_parents=recursive) + try: + self.fs.mkdir(path, create_parents=recursive) + except FileExistsError: + pass def delete_dir(self, path): self.fs.rm(path, recursive=True) From 43bafb875dd4578f72e449fd7c54c88c9df29dff Mon Sep 17 00:00:00 2001 From: Antoine Pitrou Date: Wed, 16 Jun 2021 20:05:33 +0200 Subject: [PATCH 25/61] ARROW-10115: [C++] Add CSV option to treat quoted strings as always non-null The option is only applicable to string and binary columns. 
Closes #10503 from pitrou/ARROW-10115-csv-quoted-nulls Authored-by: Antoine Pitrou Signed-off-by: Antoine Pitrou --- cpp/src/arrow/csv/converter.cc | 1 + cpp/src/arrow/csv/converter_test.cc | 132 +++++++++++++++++---------- cpp/src/arrow/csv/options.h | 7 ++ python/pyarrow/_csv.pyx | 40 ++++++-- python/pyarrow/includes/libarrow.pxd | 1 + python/pyarrow/tests/test_csv.py | 16 +++- 6 files changed, 138 insertions(+), 59 deletions(-) diff --git a/cpp/src/arrow/csv/converter.cc b/cpp/src/arrow/csv/converter.cc index feebf374e38..cb72b22b405 100644 --- a/cpp/src/arrow/csv/converter.cc +++ b/cpp/src/arrow/csv/converter.cc @@ -185,6 +185,7 @@ struct BinaryValueDecoder : public ValueDecoder { bool IsNull(const uint8_t* data, uint32_t size, bool quoted) { return options_.strings_can_be_null && + (!quoted || options_.quoted_strings_can_be_null) && ValueDecoder::IsNull(data, size, false /* quoted */); } }; diff --git a/cpp/src/arrow/csv/converter_test.cc b/cpp/src/arrow/csv/converter_test.cc index e12e3d17a83..4bed649d558 100644 --- a/cpp/src/arrow/csv/converter_test.cc +++ b/cpp/src/arrow/csv/converter_test.cc @@ -174,67 +174,105 @@ void AssertConversionError(const std::shared_ptr& type, // Converter tests template -static void TestBinaryConversionBasics() { - auto type = TypeTraits::type_singleton(); - AssertConversion(type, {"ab,cdé\n", ",\xffgh\n"}, - {{"ab", ""}, {"cdé", "\xffgh"}}); -} - -TEST(BinaryConversion, Basics) { TestBinaryConversionBasics(); } +class BinaryConversionTestBase : public testing::Test { + public: + std::shared_ptr type() { return TypeTraits::type_singleton(); } -TEST(LargeBinaryConversion, Basics) { TestBinaryConversionBasics(); } + void TestNulls() { + auto type = this->type(); + AssertConversion(type, {"ab,N/A\n", "NULL,\n"}, + {{"ab", "NULL"}, {"N/A", ""}}, + {{true, true}, {true, true}}); -TEST(BinaryConversion, Nulls) { - AssertConversion(binary(), {"ab,N/A\n", "NULL,\n"}, - {{"ab", "NULL"}, {"N/A", ""}}, - {{true, true}, {true, true}}); + 
auto options = ConvertOptions::Defaults(); + options.strings_can_be_null = true; + AssertConversion(type, {"ab,N/A\n", "NULL,\n"}, + {{"ab", ""}, {"", ""}}, + {{true, false}, {false, false}}, options); + AssertConversion(type, {"ab,\"N/A\"\n", "\"NULL\",\"\"\n"}, + {{"ab", ""}, {"", ""}}, + {{true, false}, {false, false}}, options); + options.quoted_strings_can_be_null = false; + AssertConversion(type, {"ab,N/A\n", "NULL,\n"}, + {{"ab", ""}, {"", ""}}, + {{true, false}, {false, false}}, options); + AssertConversion(type, {"ab,\"N/A\"\n", "\"NULL\",\"\"\n"}, + {{"ab", "NULL"}, {"N/A", ""}}, + {{true, true}, {true, true}}, options); + } - auto options = ConvertOptions::Defaults(); - options.strings_can_be_null = true; - AssertConversion(binary(), {"ab,N/A\n", "NULL,\n"}, - {{"ab", ""}, {"", ""}}, - {{true, false}, {false, false}}, options); -} + void TestCustomNulls() { + auto type = this->type(); + auto options = ConvertOptions::Defaults(); + options.null_values = {"xxx", "zzz"}; + AssertConversion(type, {"ab,N/A\n", "xxx,\"zzz\"\n"}, + {{"ab", "xxx"}, {"N/A", "zzz"}}, + {{true, true}, {true, true}}, options); + + options.strings_can_be_null = true; + AssertConversion(type, {"ab,N/A\n", "xxx,\"zzz\"\n"}, + {{"ab", ""}, {"N/A", ""}}, + {{true, false}, {true, false}}, options); + options.quoted_strings_can_be_null = false; + AssertConversion(type, {"ab,N/A\n", "xxx,\"zzz\"\n"}, + {{"ab", ""}, {"N/A", "zzz"}}, + {{true, false}, {true, true}}, options); + } +}; template -static void TestStringConversionBasics() { - auto type = TypeTraits::type_singleton(); - AssertConversion(type, {"ab,cdé\n", ",gh\n"}, - {{"ab", ""}, {"cdé", "gh"}}); +class BinaryConversionTest : public BinaryConversionTestBase { + public: + void TestBasics() { + auto type = this->type(); + AssertConversion(type, {"ab,cdé\n", ",\xffgh\n"}, + {{"ab", ""}, {"cdé", "\xffgh"}}); + } +}; - auto options = ConvertOptions::Defaults(); - options.check_utf8 = false; - AssertConversion(type, {"ab,cdé\n", 
",\xffgh\n"}, - {{"ab", ""}, {"cdé", "\xffgh"}}, options, - /*validate_full=*/false); -} +using BinaryTestTypes = ::testing::Types; -TEST(StringConversion, Basics) { TestStringConversionBasics(); } +TYPED_TEST_SUITE(BinaryConversionTest, BinaryTestTypes); -TEST(LargeStringConversion, Basics) { TestStringConversionBasics(); } +TYPED_TEST(BinaryConversionTest, Basics) { this->TestBasics(); } -TEST(StringConversion, Nulls) { - AssertConversion(utf8(), {"ab,N/A\n", "NULL,\n"}, - {{"ab", "NULL"}, {"N/A", ""}}, - {{true, true}, {true, true}}); +TYPED_TEST(BinaryConversionTest, Nulls) { this->TestNulls(); } - auto options = ConvertOptions::Defaults(); - options.strings_can_be_null = true; - AssertConversion(utf8(), {"ab,N/A\n", "NULL,\n"}, - {{"ab", ""}, {"", ""}}, - {{true, false}, {false, false}}, options); -} +TYPED_TEST(BinaryConversionTest, CustomNulls) { this->TestCustomNulls(); } template -static void TestStringConversionErrors() { - auto type = TypeTraits::type_singleton(); - // Invalid UTF8 in column 0 - AssertConversionError(type, {"ab,cdé\n", "\xff,gh\n"}, {0}); -} +class StringConversionTest : public BinaryConversionTestBase { + public: + void TestBasics() { + auto type = TypeTraits::type_singleton(); + AssertConversion(type, {"ab,cdé\n", ",gh\n"}, + {{"ab", ""}, {"cdé", "gh"}}); + } + + void TestInvalidUtf8() { + auto type = TypeTraits::type_singleton(); + // Invalid UTF8 in column 0 + AssertConversionError(type, {"ab,cdé\n", "\xff,gh\n"}, {0}); + + auto options = ConvertOptions::Defaults(); + options.check_utf8 = false; + AssertConversion(type, {"ab,cdé\n", ",\xffgh\n"}, + {{"ab", ""}, {"cdé", "\xffgh"}}, options, + /*validate_full=*/false); + } +}; + +using StringTestTypes = ::testing::Types; + +TYPED_TEST_SUITE(StringConversionTest, StringTestTypes); + +TYPED_TEST(StringConversionTest, Basics) { this->TestBasics(); } + +TYPED_TEST(StringConversionTest, Nulls) { this->TestNulls(); } -TEST(StringConversion, Errors) { TestStringConversionErrors(); } 
+TYPED_TEST(StringConversionTest, CustomNulls) { this->TestCustomNulls(); } -TEST(LargeStringConversion, Errors) { TestStringConversionErrors(); } +TYPED_TEST(StringConversionTest, InvalidUtf8) { this->TestInvalidUtf8(); } TEST(FixedSizeBinaryConversion, Basics) { AssertConversion( diff --git a/cpp/src/arrow/csv/options.h b/cpp/src/arrow/csv/options.h index 790c47fc3f4..1e423fd76db 100644 --- a/cpp/src/arrow/csv/options.h +++ b/cpp/src/arrow/csv/options.h @@ -78,11 +78,18 @@ struct ARROW_EXPORT ConvertOptions { std::vector true_values; /// Recognized spellings for boolean false values std::vector false_values; + /// Whether string / binary columns can have null values. /// /// If true, then strings in "null_values" are considered null for string columns. /// If false, then all strings are valid string values. bool strings_can_be_null = false; + /// Whether string / binary columns can have quoted null values. + /// + /// If true *and* `strings_can_be_null` is true, then quoted strings in + /// "null_values" are also considered null for string columns. Otherwise, + /// quoted strings are never considered null. + bool quoted_strings_can_be_null = true; /// Whether to try to automatically dict-encode string / binary data. /// If true, then when type inference detects a string or binary column, diff --git a/python/pyarrow/_csv.pyx b/python/pyarrow/_csv.pyx index 8ede8272c07..01cabc1d8b0 100644 --- a/python/pyarrow/_csv.pyx +++ b/python/pyarrow/_csv.pyx @@ -447,6 +447,12 @@ cdef class ConvertOptions(_Weakrefable): If true, then strings in null_values are considered null for string columns. If false, then all strings are valid string values. + quoted_strings_can_be_null: bool, optional (default True) + Whether string / binary columns can have quoted null values. + If true *and* strings_can_be_null is true, then strings in + null_values are considered null for string columns, even when + quoted. + Otherwise, then all quoted strings are valid string values. 
auto_dict_encode: bool, optional (default False) Whether to try to automatically dict-encode string / binary data. If true, then when type inference detects a string or binary column, @@ -478,9 +484,10 @@ cdef class ConvertOptions(_Weakrefable): def __init__(self, *, check_utf8=None, column_types=None, null_values=None, true_values=None, false_values=None, - strings_can_be_null=None, include_columns=None, - include_missing_columns=None, auto_dict_encode=None, - auto_dict_max_cardinality=None, timestamp_parsers=None): + strings_can_be_null=None, quoted_strings_can_be_null=None, + include_columns=None, include_missing_columns=None, + auto_dict_encode=None, auto_dict_max_cardinality=None, + timestamp_parsers=None): if check_utf8 is not None: self.check_utf8 = check_utf8 if column_types is not None: @@ -493,6 +500,8 @@ cdef class ConvertOptions(_Weakrefable): self.false_values = false_values if strings_can_be_null is not None: self.strings_can_be_null = strings_can_be_null + if quoted_strings_can_be_null is not None: + self.quoted_strings_can_be_null = quoted_strings_can_be_null if include_columns is not None: self.include_columns = include_columns if include_missing_columns is not None: @@ -526,6 +535,17 @@ cdef class ConvertOptions(_Weakrefable): def strings_can_be_null(self, value): deref(self.options).strings_can_be_null = value + @property + def quoted_strings_can_be_null(self): + """ + Whether string / binary columns can have quoted null values. 
+ """ + return deref(self.options).quoted_strings_can_be_null + + @quoted_strings_can_be_null.setter + def quoted_strings_can_be_null(self, value): + deref(self.options).quoted_strings_can_be_null = value + @property def column_types(self): """ @@ -699,6 +719,8 @@ cdef class ConvertOptions(_Weakrefable): self.false_values == other.false_values and self.timestamp_parsers == other.timestamp_parsers and self.strings_can_be_null == other.strings_can_be_null and + self.quoted_strings_can_be_null == + other.quoted_strings_can_be_null and self.auto_dict_encode == other.auto_dict_encode and self.auto_dict_max_cardinality == other.auto_dict_max_cardinality and @@ -709,16 +731,16 @@ cdef class ConvertOptions(_Weakrefable): def __getstate__(self): return (self.check_utf8, self.column_types, self.null_values, self.true_values, self.false_values, self.timestamp_parsers, - self.strings_can_be_null, self.auto_dict_encode, - self.auto_dict_max_cardinality, self.include_columns, - self.include_missing_columns) + self.strings_can_be_null, self.quoted_strings_can_be_null, + self.auto_dict_encode, self.auto_dict_max_cardinality, + self.include_columns, self.include_missing_columns) def __setstate__(self, state): (self.check_utf8, self.column_types, self.null_values, self.true_values, self.false_values, self.timestamp_parsers, - self.strings_can_be_null, self.auto_dict_encode, - self.auto_dict_max_cardinality, self.include_columns, - self.include_missing_columns) = state + self.strings_can_be_null, self.quoted_strings_can_be_null, + self.auto_dict_encode, self.auto_dict_max_cardinality, + self.include_columns, self.include_missing_columns) = state def __eq__(self, other): try: diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd index b1fb04a1f8e..072062385ca 100644 --- a/python/pyarrow/includes/libarrow.pxd +++ b/python/pyarrow/includes/libarrow.pxd @@ -1601,6 +1601,7 @@ cdef extern from "arrow/csv/api.h" namespace "arrow::csv" nogil: 
vector[c_string] true_values vector[c_string] false_values c_bool strings_can_be_null + c_bool quoted_strings_can_be_null vector[shared_ptr[CTimestampParser]] timestamp_parsers c_bool auto_dict_encode diff --git a/python/pyarrow/tests/test_csv.py b/python/pyarrow/tests/test_csv.py index 48cdff75f97..482973a7258 100644 --- a/python/pyarrow/tests/test_csv.py +++ b/python/pyarrow/tests/test_csv.py @@ -224,14 +224,16 @@ def test_convert_options(): check_options_class( cls, check_utf8=[True, False], strings_can_be_null=[False, True], + quoted_strings_can_be_null=[True, False], include_columns=[[], ['def', 'abc']], include_missing_columns=[False, True], auto_dict_encode=[False, True], timestamp_parsers=[[], [ISO8601, '%y-%m']]) check_options_class_pickling( - cls, check_utf8=True, - strings_can_be_null=False, + cls, check_utf8=False, + strings_can_be_null=True, + quoted_strings_can_be_null=False, include_columns=['def', 'abc'], include_missing_columns=False, auto_dict_encode=True, @@ -828,7 +830,7 @@ def test_auto_dict_encode(self): def test_custom_nulls(self): # Infer nulls with custom values opts = ConvertOptions(null_values=['Xxx', 'Zzz']) - rows = b"a,b,c,d\nZzz,Xxx,1,2\nXxx,#N/A,,Zzz\n" + rows = b"""a,b,c,d\nZzz,"Xxx",1,2\nXxx,#N/A,,Zzz\n""" table = self.read_bytes(rows, convert_options=opts) schema = pa.schema([('a', pa.null()), ('b', pa.string()), @@ -851,6 +853,14 @@ def test_custom_nulls(self): 'c': ["1", ""], 'd': [2, None], } + opts.quoted_strings_can_be_null = False + table = self.read_bytes(rows, convert_options=opts) + assert table.to_pydict() == { + 'a': [None, None], + 'b': ["Xxx", "#N/A"], + 'c': ["1", ""], + 'd': [2, None], + } opts = ConvertOptions(null_values=[]) rows = b"a,b\n#N/A,\n" From ca7f9ee34ec1573526c984b8a20932218b9fc3e9 Mon Sep 17 00:00:00 2001 From: David Li Date: Wed, 16 Jun 2021 20:19:27 +0200 Subject: [PATCH 26/61] ARROW-12709: [C++] Add binary_join_element_wise This adds a variadic scalar string join kernel, using the last argument 
(min 1 argument) as the separator. An options class allows emitting null (the default), skipping null non-separator arguments, or replacing null non-separator arguments with another string (mimicking libcudf). Closes #10520 from lidavidm/arrow-12709 Lead-authored-by: David Li Co-authored-by: Antoine Pitrou Signed-off-by: Antoine Pitrou --- cpp/src/arrow/compute/api_scalar.cc | 8 +- cpp/src/arrow/compute/api_scalar.h | 23 +- .../arrow/compute/kernels/scalar_compare.cc | 16 +- .../compute/kernels/scalar_compare_test.cc | 184 +++++++------- .../arrow/compute/kernels/scalar_string.cc | 239 +++++++++++++++++- .../kernels/scalar_string_benchmark.cc | 43 ++++ .../compute/kernels/scalar_string_test.cc | 119 +++++++++ docs/source/cpp/compute.rst | 22 +- docs/source/python/api/compute.rst | 21 +- python/pyarrow/_compute.pyx | 31 +++ python/pyarrow/compute.py | 1 + python/pyarrow/includes/libarrow.pxd | 16 ++ python/pyarrow/tests/test_compute.py | 52 +++- 13 files changed, 643 insertions(+), 132 deletions(-) diff --git a/cpp/src/arrow/compute/api_scalar.cc b/cpp/src/arrow/compute/api_scalar.cc index dba71456c29..db1cac290cf 100644 --- a/cpp/src/arrow/compute/api_scalar.cc +++ b/cpp/src/arrow/compute/api_scalar.cc @@ -63,14 +63,14 @@ SCALAR_ARITHMETIC_BINARY(Multiply, "multiply", "multiply_checked") SCALAR_ARITHMETIC_BINARY(Divide, "divide", "divide_checked") SCALAR_ARITHMETIC_BINARY(Power, "power", "power_checked") -Result ElementWiseMax(const std::vector& args, +Result MaxElementWise(const std::vector& args, ElementWiseAggregateOptions options, ExecContext* ctx) { - return CallFunction("element_wise_max", args, &options, ctx); + return CallFunction("max_element_wise", args, &options, ctx); } -Result ElementWiseMin(const std::vector& args, +Result MinElementWise(const std::vector& args, ElementWiseAggregateOptions options, ExecContext* ctx) { - return CallFunction("element_wise_min", args, &options, ctx); + return CallFunction("min_element_wise", args, &options, ctx); } // 
---------------------------------------------------------------------- diff --git a/cpp/src/arrow/compute/api_scalar.h b/cpp/src/arrow/compute/api_scalar.h index 6e9a9340f2c..082876b356b 100644 --- a/cpp/src/arrow/compute/api_scalar.h +++ b/cpp/src/arrow/compute/api_scalar.h @@ -48,6 +48,25 @@ struct ARROW_EXPORT ElementWiseAggregateOptions : public FunctionOptions { bool skip_nulls; }; +/// Options for var_args_join. +struct ARROW_EXPORT JoinOptions : public FunctionOptions { + /// How to handle null values. (A null separator always results in a null output.) + enum NullHandlingBehavior { + /// A null in any input results in a null in the output. + EMIT_NULL, + /// Nulls in inputs are skipped. + SKIP, + /// Nulls in inputs are replaced with the replacement string. + REPLACE, + }; + explicit JoinOptions(NullHandlingBehavior null_handling = EMIT_NULL, + std::string null_replacement = "") + : null_handling(null_handling), null_replacement(std::move(null_replacement)) {} + static JoinOptions Defaults() { return JoinOptions(); } + NullHandlingBehavior null_handling; + std::string null_replacement; +}; + struct ARROW_EXPORT MatchSubstringOptions : public FunctionOptions { explicit MatchSubstringOptions(std::string pattern, bool ignore_case = false) : pattern(std::move(pattern)), ignore_case(ignore_case) {} @@ -287,7 +306,7 @@ Result Power(const Datum& left, const Datum& right, /// \param[in] ctx the function execution context, optional /// \return the element-wise maximum ARROW_EXPORT -Result ElementWiseMax( +Result MaxElementWise( const std::vector& args, ElementWiseAggregateOptions options = ElementWiseAggregateOptions::Defaults(), ExecContext* ctx = NULLPTR); @@ -300,7 +319,7 @@ Result ElementWiseMax( /// \param[in] ctx the function execution context, optional /// \return the element-wise minimum ARROW_EXPORT -Result ElementWiseMin( +Result MinElementWise( const std::vector& args, ElementWiseAggregateOptions options = ElementWiseAggregateOptions::Defaults(), 
ExecContext* ctx = NULLPTR); diff --git a/cpp/src/arrow/compute/kernels/scalar_compare.cc b/cpp/src/arrow/compute/kernels/scalar_compare.cc index 6763b6793f3..041c6a282f9 100644 --- a/cpp/src/arrow/compute/kernels/scalar_compare.cc +++ b/cpp/src/arrow/compute/kernels/scalar_compare.cc @@ -467,14 +467,14 @@ const FunctionDoc less_equal_doc{ ("A null on either side emits a null comparison result."), {"x", "y"}}; -const FunctionDoc element_wise_min_doc{ +const FunctionDoc min_element_wise_doc{ "Find the element-wise minimum value", ("Nulls will be ignored (default) or propagated. " "NaN will be taken over null, but not over any valid float."), {"*args"}, "ElementWiseAggregateOptions"}; -const FunctionDoc element_wise_max_doc{ +const FunctionDoc max_element_wise_doc{ "Find the element-wise maximum value", ("Nulls will be ignored (default) or propagated. " "NaN will be taken over null, but not over any valid float."), @@ -501,13 +501,13 @@ void RegisterScalarComparison(FunctionRegistry* registry) { // ---------------------------------------------------------------------- // Variadic element-wise functions - auto element_wise_min = - MakeScalarMinMax("element_wise_min", &element_wise_min_doc); - DCHECK_OK(registry->AddFunction(std::move(element_wise_min))); + auto min_element_wise = + MakeScalarMinMax("min_element_wise", &min_element_wise_doc); + DCHECK_OK(registry->AddFunction(std::move(min_element_wise))); - auto element_wise_max = - MakeScalarMinMax("element_wise_max", &element_wise_max_doc); - DCHECK_OK(registry->AddFunction(std::move(element_wise_max))); + auto max_element_wise = + MakeScalarMinMax("max_element_wise", &max_element_wise_doc); + DCHECK_OK(registry->AddFunction(std::move(max_element_wise))); } } // namespace internal diff --git a/cpp/src/arrow/compute/kernels/scalar_compare_test.cc b/cpp/src/arrow/compute/kernels/scalar_compare_test.cc index 6318a891d3a..50327e82032 100644 --- a/cpp/src/arrow/compute/kernels/scalar_compare_test.cc +++ 
b/cpp/src/arrow/compute/kernels/scalar_compare_test.cc @@ -729,90 +729,90 @@ TYPED_TEST_SUITE(TestVarArgsCompareNumeric, NumericBasedTypes); TYPED_TEST_SUITE(TestVarArgsCompareFloating, RealArrowTypes); TYPED_TEST_SUITE(TestVarArgsCompareParametricTemporal, ParametricTemporalTypes); -TYPED_TEST(TestVarArgsCompareNumeric, ElementWiseMin) { - this->AssertNullScalar(ElementWiseMin, {}); - this->AssertNullScalar(ElementWiseMin, {this->scalar("null"), this->scalar("null")}); +TYPED_TEST(TestVarArgsCompareNumeric, MinElementWise) { + this->AssertNullScalar(MinElementWise, {}); + this->AssertNullScalar(MinElementWise, {this->scalar("null"), this->scalar("null")}); - this->Assert(ElementWiseMin, this->scalar("0"), {this->scalar("0")}); - this->Assert(ElementWiseMin, this->scalar("0"), + this->Assert(MinElementWise, this->scalar("0"), {this->scalar("0")}); + this->Assert(MinElementWise, this->scalar("0"), {this->scalar("2"), this->scalar("0"), this->scalar("1")}); this->Assert( - ElementWiseMin, this->scalar("0"), + MinElementWise, this->scalar("0"), {this->scalar("2"), this->scalar("0"), this->scalar("1"), this->scalar("null")}); - this->Assert(ElementWiseMin, this->scalar("1"), + this->Assert(MinElementWise, this->scalar("1"), {this->scalar("null"), this->scalar("null"), this->scalar("1"), this->scalar("null")}); - this->Assert(ElementWiseMin, (this->array("[]")), {this->array("[]")}); - this->Assert(ElementWiseMin, this->array("[1, 2, 3, null]"), + this->Assert(MinElementWise, (this->array("[]")), {this->array("[]")}); + this->Assert(MinElementWise, this->array("[1, 2, 3, null]"), {this->array("[1, 2, 3, null]")}); - this->Assert(ElementWiseMin, this->array("[1, 2, 2, 2]"), + this->Assert(MinElementWise, this->array("[1, 2, 2, 2]"), {this->array("[1, 2, 3, 4]"), this->scalar("2")}); - this->Assert(ElementWiseMin, this->array("[1, 2, 2, 2]"), + this->Assert(MinElementWise, this->array("[1, 2, 2, 2]"), {this->array("[1, null, 3, 4]"), this->scalar("2")}); - 
this->Assert(ElementWiseMin, this->array("[1, 2, 2, 2]"), + this->Assert(MinElementWise, this->array("[1, 2, 2, 2]"), {this->array("[1, null, 3, 4]"), this->scalar("2"), this->scalar("4")}); - this->Assert(ElementWiseMin, this->array("[1, 2, 2, 2]"), + this->Assert(MinElementWise, this->array("[1, 2, 2, 2]"), {this->array("[1, null, 3, 4]"), this->scalar("null"), this->scalar("2")}); - this->Assert(ElementWiseMin, this->array("[1, 2, 2, 2]"), + this->Assert(MinElementWise, this->array("[1, 2, 2, 2]"), {this->array("[1, 2, 3, 4]"), this->array("[2, 2, 2, 2]")}); - this->Assert(ElementWiseMin, this->array("[1, 2, 2, 2]"), + this->Assert(MinElementWise, this->array("[1, 2, 2, 2]"), {this->array("[1, 2, 3, 4]"), this->array("[2, null, 2, 2]")}); - this->Assert(ElementWiseMin, this->array("[1, 2, 2, 2]"), + this->Assert(MinElementWise, this->array("[1, 2, 2, 2]"), {this->array("[1, null, 3, 4]"), this->array("[2, 2, 2, 2]")}); - this->Assert(ElementWiseMin, this->array("[1, 2, null, 6]"), + this->Assert(MinElementWise, this->array("[1, 2, null, 6]"), {this->array("[1, 2, null, null]"), this->array("[4, null, null, 6]")}); - this->Assert(ElementWiseMin, this->array("[1, 2, null, 6]"), + this->Assert(MinElementWise, this->array("[1, 2, null, 6]"), {this->array("[4, null, null, 6]"), this->array("[1, 2, null, null]")}); - this->Assert(ElementWiseMin, this->array("[1, 2, 3, 4]"), + this->Assert(MinElementWise, this->array("[1, 2, 3, 4]"), {this->array("[1, 2, 3, 4]"), this->array("[null, null, null, null]")}); - this->Assert(ElementWiseMin, this->array("[1, 2, 3, 4]"), + this->Assert(MinElementWise, this->array("[1, 2, 3, 4]"), {this->array("[null, null, null, null]"), this->array("[1, 2, 3, 4]")}); - this->Assert(ElementWiseMin, this->array("[1, 1, 1, 1]"), + this->Assert(MinElementWise, this->array("[1, 1, 1, 1]"), {this->scalar("1"), this->array("[1, 2, 3, 4]")}); - this->Assert(ElementWiseMin, this->array("[1, 1, 1, 1]"), + this->Assert(MinElementWise, this->array("[1, 
1, 1, 1]"), {this->scalar("1"), this->array("[null, null, null, null]")}); - this->Assert(ElementWiseMin, this->array("[1, 1, 1, 1]"), + this->Assert(MinElementWise, this->array("[1, 1, 1, 1]"), {this->scalar("null"), this->array("[1, 1, 1, 1]")}); - this->Assert(ElementWiseMin, this->array("[null, null, null, null]"), + this->Assert(MinElementWise, this->array("[null, null, null, null]"), {this->scalar("null"), this->array("[null, null, null, null]")}); // Test null handling this->element_wise_aggregate_options_.skip_nulls = false; - this->AssertNullScalar(ElementWiseMin, {this->scalar("null"), this->scalar("null")}); - this->AssertNullScalar(ElementWiseMin, {this->scalar("0"), this->scalar("null")}); + this->AssertNullScalar(MinElementWise, {this->scalar("null"), this->scalar("null")}); + this->AssertNullScalar(MinElementWise, {this->scalar("0"), this->scalar("null")}); - this->Assert(ElementWiseMin, this->array("[1, null, 2, 2]"), + this->Assert(MinElementWise, this->array("[1, null, 2, 2]"), {this->array("[1, null, 3, 4]"), this->scalar("2"), this->scalar("4")}); - this->Assert(ElementWiseMin, this->array("[null, null, null, null]"), + this->Assert(MinElementWise, this->array("[null, null, null, null]"), {this->array("[1, null, 3, 4]"), this->scalar("null"), this->scalar("2")}); - this->Assert(ElementWiseMin, this->array("[1, null, 2, 2]"), + this->Assert(MinElementWise, this->array("[1, null, 2, 2]"), {this->array("[1, 2, 3, 4]"), this->array("[2, null, 2, 2]")}); - this->Assert(ElementWiseMin, this->array("[null, null, null, null]"), + this->Assert(MinElementWise, this->array("[null, null, null, null]"), {this->scalar("1"), this->array("[null, null, null, null]")}); - this->Assert(ElementWiseMin, this->array("[null, null, null, null]"), + this->Assert(MinElementWise, this->array("[null, null, null, null]"), {this->scalar("null"), this->array("[1, 1, 1, 1]")}); } -TYPED_TEST(TestVarArgsCompareFloating, ElementWiseMin) { +TYPED_TEST(TestVarArgsCompareFloating, 
MinElementWise) { auto Check = [this](const std::string& expected, const std::vector& inputs) { std::vector args; for (const auto& input : inputs) { args.emplace_back(this->scalar(input)); } - this->Assert(ElementWiseMin, this->scalar(expected), args); + this->Assert(MinElementWise, this->scalar(expected), args); args.clear(); for (const auto& input : inputs) { args.emplace_back(this->array("[" + input + "]")); } - this->Assert(ElementWiseMin, this->array("[" + expected + "]"), args); + this->Assert(MinElementWise, this->array("[" + expected + "]"), args); }; Check("-0.0", {"0.0", "-0.0"}); Check("-0.0", {"1.0", "-0.0", "0.0"}); @@ -828,111 +828,111 @@ TYPED_TEST(TestVarArgsCompareFloating, ElementWiseMin) { Check("-Inf", {"0", "-Inf"}); } -TYPED_TEST(TestVarArgsCompareParametricTemporal, ElementWiseMin) { +TYPED_TEST(TestVarArgsCompareParametricTemporal, MinElementWise) { // Temporal kernel is implemented with numeric kernel underneath - this->AssertNullScalar(ElementWiseMin, {}); - this->AssertNullScalar(ElementWiseMin, {this->scalar("null"), this->scalar("null")}); + this->AssertNullScalar(MinElementWise, {}); + this->AssertNullScalar(MinElementWise, {this->scalar("null"), this->scalar("null")}); - this->Assert(ElementWiseMin, this->scalar("0"), {this->scalar("0")}); - this->Assert(ElementWiseMin, this->scalar("0"), {this->scalar("2"), this->scalar("0")}); - this->Assert(ElementWiseMin, this->scalar("0"), + this->Assert(MinElementWise, this->scalar("0"), {this->scalar("0")}); + this->Assert(MinElementWise, this->scalar("0"), {this->scalar("2"), this->scalar("0")}); + this->Assert(MinElementWise, this->scalar("0"), {this->scalar("0"), this->scalar("null")}); - this->Assert(ElementWiseMin, (this->array("[]")), {this->array("[]")}); - this->Assert(ElementWiseMin, this->array("[1, 2, 3, null]"), + this->Assert(MinElementWise, (this->array("[]")), {this->array("[]")}); + this->Assert(MinElementWise, this->array("[1, 2, 3, null]"), {this->array("[1, 2, 3, null]")}); - 
this->Assert(ElementWiseMin, this->array("[1, 2, 2, 2]"), + this->Assert(MinElementWise, this->array("[1, 2, 2, 2]"), {this->array("[1, null, 3, 4]"), this->scalar("null"), this->scalar("2")}); - this->Assert(ElementWiseMin, this->array("[1, 2, 3, 2]"), + this->Assert(MinElementWise, this->array("[1, 2, 3, 2]"), {this->array("[1, null, 3, 4]"), this->array("[2, 2, null, 2]")}); } -TYPED_TEST(TestVarArgsCompareNumeric, ElementWiseMax) { - this->AssertNullScalar(ElementWiseMax, {}); - this->AssertNullScalar(ElementWiseMax, {this->scalar("null"), this->scalar("null")}); +TYPED_TEST(TestVarArgsCompareNumeric, MaxElementWise) { + this->AssertNullScalar(MaxElementWise, {}); + this->AssertNullScalar(MaxElementWise, {this->scalar("null"), this->scalar("null")}); - this->Assert(ElementWiseMax, this->scalar("0"), {this->scalar("0")}); - this->Assert(ElementWiseMax, this->scalar("2"), + this->Assert(MaxElementWise, this->scalar("0"), {this->scalar("0")}); + this->Assert(MaxElementWise, this->scalar("2"), {this->scalar("2"), this->scalar("0"), this->scalar("1")}); this->Assert( - ElementWiseMax, this->scalar("2"), + MaxElementWise, this->scalar("2"), {this->scalar("2"), this->scalar("0"), this->scalar("1"), this->scalar("null")}); - this->Assert(ElementWiseMax, this->scalar("1"), + this->Assert(MaxElementWise, this->scalar("1"), {this->scalar("null"), this->scalar("null"), this->scalar("1"), this->scalar("null")}); - this->Assert(ElementWiseMax, (this->array("[]")), {this->array("[]")}); - this->Assert(ElementWiseMax, this->array("[1, 2, 3, null]"), + this->Assert(MaxElementWise, (this->array("[]")), {this->array("[]")}); + this->Assert(MaxElementWise, this->array("[1, 2, 3, null]"), {this->array("[1, 2, 3, null]")}); - this->Assert(ElementWiseMax, this->array("[2, 2, 3, 4]"), + this->Assert(MaxElementWise, this->array("[2, 2, 3, 4]"), {this->array("[1, 2, 3, 4]"), this->scalar("2")}); - this->Assert(ElementWiseMax, this->array("[2, 2, 3, 4]"), + this->Assert(MaxElementWise, 
this->array("[2, 2, 3, 4]"), {this->array("[1, null, 3, 4]"), this->scalar("2")}); - this->Assert(ElementWiseMax, this->array("[4, 4, 4, 4]"), + this->Assert(MaxElementWise, this->array("[4, 4, 4, 4]"), {this->array("[1, null, 3, 4]"), this->scalar("2"), this->scalar("4")}); - this->Assert(ElementWiseMax, this->array("[2, 2, 3, 4]"), + this->Assert(MaxElementWise, this->array("[2, 2, 3, 4]"), {this->array("[1, null, 3, 4]"), this->scalar("null"), this->scalar("2")}); - this->Assert(ElementWiseMax, this->array("[2, 2, 3, 4]"), + this->Assert(MaxElementWise, this->array("[2, 2, 3, 4]"), {this->array("[1, 2, 3, 4]"), this->array("[2, 2, 2, 2]")}); - this->Assert(ElementWiseMax, this->array("[2, 2, 3, 4]"), + this->Assert(MaxElementWise, this->array("[2, 2, 3, 4]"), {this->array("[1, 2, 3, 4]"), this->array("[2, null, 2, 2]")}); - this->Assert(ElementWiseMax, this->array("[2, 2, 3, 4]"), + this->Assert(MaxElementWise, this->array("[2, 2, 3, 4]"), {this->array("[1, null, 3, 4]"), this->array("[2, 2, 2, 2]")}); - this->Assert(ElementWiseMax, this->array("[4, 2, null, 6]"), + this->Assert(MaxElementWise, this->array("[4, 2, null, 6]"), {this->array("[1, 2, null, null]"), this->array("[4, null, null, 6]")}); - this->Assert(ElementWiseMax, this->array("[4, 2, null, 6]"), + this->Assert(MaxElementWise, this->array("[4, 2, null, 6]"), {this->array("[4, null, null, 6]"), this->array("[1, 2, null, null]")}); - this->Assert(ElementWiseMax, this->array("[1, 2, 3, 4]"), + this->Assert(MaxElementWise, this->array("[1, 2, 3, 4]"), {this->array("[1, 2, 3, 4]"), this->array("[null, null, null, null]")}); - this->Assert(ElementWiseMax, this->array("[1, 2, 3, 4]"), + this->Assert(MaxElementWise, this->array("[1, 2, 3, 4]"), {this->array("[null, null, null, null]"), this->array("[1, 2, 3, 4]")}); - this->Assert(ElementWiseMax, this->array("[1, 2, 3, 4]"), + this->Assert(MaxElementWise, this->array("[1, 2, 3, 4]"), {this->scalar("1"), this->array("[1, 2, 3, 4]")}); - 
this->Assert(ElementWiseMax, this->array("[1, 1, 1, 1]"), + this->Assert(MaxElementWise, this->array("[1, 1, 1, 1]"), {this->scalar("1"), this->array("[null, null, null, null]")}); - this->Assert(ElementWiseMax, this->array("[1, 1, 1, 1]"), + this->Assert(MaxElementWise, this->array("[1, 1, 1, 1]"), {this->scalar("null"), this->array("[1, 1, 1, 1]")}); - this->Assert(ElementWiseMax, this->array("[null, null, null, null]"), + this->Assert(MaxElementWise, this->array("[null, null, null, null]"), {this->scalar("null"), this->array("[null, null, null, null]")}); // Test null handling this->element_wise_aggregate_options_.skip_nulls = false; - this->AssertNullScalar(ElementWiseMax, {this->scalar("null"), this->scalar("null")}); - this->AssertNullScalar(ElementWiseMax, {this->scalar("0"), this->scalar("null")}); + this->AssertNullScalar(MaxElementWise, {this->scalar("null"), this->scalar("null")}); + this->AssertNullScalar(MaxElementWise, {this->scalar("0"), this->scalar("null")}); - this->Assert(ElementWiseMax, this->array("[4, null, 4, 4]"), + this->Assert(MaxElementWise, this->array("[4, null, 4, 4]"), {this->array("[1, null, 3, 4]"), this->scalar("2"), this->scalar("4")}); - this->Assert(ElementWiseMax, this->array("[null, null, null, null]"), + this->Assert(MaxElementWise, this->array("[null, null, null, null]"), {this->array("[1, null, 3, 4]"), this->scalar("null"), this->scalar("2")}); - this->Assert(ElementWiseMax, this->array("[2, null, 3, 4]"), + this->Assert(MaxElementWise, this->array("[2, null, 3, 4]"), {this->array("[1, 2, 3, 4]"), this->array("[2, null, 2, 2]")}); - this->Assert(ElementWiseMax, this->array("[null, null, null, null]"), + this->Assert(MaxElementWise, this->array("[null, null, null, null]"), {this->scalar("1"), this->array("[null, null, null, null]")}); - this->Assert(ElementWiseMax, this->array("[null, null, null, null]"), + this->Assert(MaxElementWise, this->array("[null, null, null, null]"), {this->scalar("null"), this->array("[1, 1, 1, 
1]")}); } -TYPED_TEST(TestVarArgsCompareFloating, ElementWiseMax) { +TYPED_TEST(TestVarArgsCompareFloating, MaxElementWise) { auto Check = [this](const std::string& expected, const std::vector& inputs) { std::vector args; for (const auto& input : inputs) { args.emplace_back(this->scalar(input)); } - this->Assert(ElementWiseMax, this->scalar(expected), args); + this->Assert(MaxElementWise, this->scalar(expected), args); args.clear(); for (const auto& input : inputs) { args.emplace_back(this->array("[" + input + "]")); } - this->Assert(ElementWiseMax, this->array("[" + expected + "]"), args); + this->Assert(MaxElementWise, this->array("[" + expected + "]"), args); }; Check("0.0", {"0.0", "-0.0"}); Check("1.0", {"1.0", "-0.0", "0.0"}); @@ -948,34 +948,34 @@ TYPED_TEST(TestVarArgsCompareFloating, ElementWiseMax) { Check("0", {"0", "-Inf"}); } -TYPED_TEST(TestVarArgsCompareParametricTemporal, ElementWiseMax) { +TYPED_TEST(TestVarArgsCompareParametricTemporal, MaxElementWise) { // Temporal kernel is implemented with numeric kernel underneath - this->AssertNullScalar(ElementWiseMax, {}); - this->AssertNullScalar(ElementWiseMax, {this->scalar("null"), this->scalar("null")}); + this->AssertNullScalar(MaxElementWise, {}); + this->AssertNullScalar(MaxElementWise, {this->scalar("null"), this->scalar("null")}); - this->Assert(ElementWiseMax, this->scalar("0"), {this->scalar("0")}); - this->Assert(ElementWiseMax, this->scalar("2"), {this->scalar("2"), this->scalar("0")}); - this->Assert(ElementWiseMax, this->scalar("0"), + this->Assert(MaxElementWise, this->scalar("0"), {this->scalar("0")}); + this->Assert(MaxElementWise, this->scalar("2"), {this->scalar("2"), this->scalar("0")}); + this->Assert(MaxElementWise, this->scalar("0"), {this->scalar("0"), this->scalar("null")}); - this->Assert(ElementWiseMax, (this->array("[]")), {this->array("[]")}); - this->Assert(ElementWiseMax, this->array("[1, 2, 3, null]"), + this->Assert(MaxElementWise, (this->array("[]")), 
{this->array("[]")}); + this->Assert(MaxElementWise, this->array("[1, 2, 3, null]"), {this->array("[1, 2, 3, null]")}); - this->Assert(ElementWiseMax, this->array("[2, 2, 3, 4]"), + this->Assert(MaxElementWise, this->array("[2, 2, 3, 4]"), {this->array("[1, null, 3, 4]"), this->scalar("null"), this->scalar("2")}); - this->Assert(ElementWiseMax, this->array("[2, 2, 3, 4]"), + this->Assert(MaxElementWise, this->array("[2, 2, 3, 4]"), {this->array("[1, null, 3, 4]"), this->array("[2, 2, null, 2]")}); } -TEST(TestElementWiseMaxElementWiseMin, CommonTimestamp) { +TEST(TestMaxElementWiseMinElementWise, CommonTimestamp) { { auto t1 = std::make_shared(TimeUnit::SECOND); auto t2 = std::make_shared(TimeUnit::MILLI); auto expected = MakeScalar(t2, 1000).ValueOrDie(); ASSERT_OK_AND_ASSIGN(auto actual, - ElementWiseMin({Datum(MakeScalar(t1, 1).ValueOrDie()), + MinElementWise({Datum(MakeScalar(t1, 1).ValueOrDie()), Datum(MakeScalar(t2, 12000).ValueOrDie())})); AssertScalarsEqual(*expected, *actual.scalar(), /*verbose=*/true); } @@ -984,7 +984,7 @@ TEST(TestElementWiseMaxElementWiseMin, CommonTimestamp) { auto t2 = std::make_shared(TimeUnit::SECOND); auto expected = MakeScalar(t2, 86401).ValueOrDie(); ASSERT_OK_AND_ASSIGN(auto actual, - ElementWiseMax({Datum(MakeScalar(t1, 1).ValueOrDie()), + MaxElementWise({Datum(MakeScalar(t1, 1).ValueOrDie()), Datum(MakeScalar(t2, 86401).ValueOrDie())})); AssertScalarsEqual(*expected, *actual.scalar(), /*verbose=*/true); } @@ -994,7 +994,7 @@ TEST(TestElementWiseMaxElementWiseMin, CommonTimestamp) { auto t3 = std::make_shared(TimeUnit::SECOND); auto expected = MakeScalar(t3, 86400).ValueOrDie(); ASSERT_OK_AND_ASSIGN( - auto actual, ElementWiseMin({Datum(MakeScalar(t1, 1).ValueOrDie()), + auto actual, MinElementWise({Datum(MakeScalar(t1, 1).ValueOrDie()), Datum(MakeScalar(t2, 2 * 86400000).ValueOrDie())})); AssertScalarsEqual(*expected, *actual.scalar(), /*verbose=*/true); } diff --git a/cpp/src/arrow/compute/kernels/scalar_string.cc 
b/cpp/src/arrow/compute/kernels/scalar_string.cc index cd054fcea0e..3f63bf2c405 100644 --- a/cpp/src/arrow/compute/kernels/scalar_string.cc +++ b/cpp/src/arrow/compute/kernels/scalar_string.cc @@ -3344,12 +3344,227 @@ struct BinaryJoin { } }; +using BinaryJoinElementWiseState = OptionsWrapper; + +template +struct BinaryJoinElementWise { + using ArrayType = typename TypeTraits::ArrayType; + using BuilderType = typename TypeTraits::BuilderType; + using offset_type = typename Type::offset_type; + + static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) { + JoinOptions options = BinaryJoinElementWiseState::Get(ctx); + // Last argument is the separator (for consistency with binary_join) + if (std::all_of(batch.values.begin(), batch.values.end(), + [](const Datum& d) { return d.is_scalar(); })) { + return ExecOnlyScalar(ctx, options, batch, out); + } + return ExecContainingArrays(ctx, options, batch, out); + } + + static Status ExecOnlyScalar(KernelContext* ctx, const JoinOptions& options, + const ExecBatch& batch, Datum* out) { + BaseBinaryScalar* output = checked_cast(out->scalar().get()); + const size_t num_args = batch.values.size(); + if (num_args == 1) { + // Only separator, no values + ARROW_ASSIGN_OR_RAISE(output->value, ctx->Allocate(0)); + output->is_valid = batch.values[0].scalar()->is_valid; + return Status::OK(); + } + + int64_t final_size = CalculateRowSize(options, batch, 0); + if (final_size < 0) { + ARROW_ASSIGN_OR_RAISE(output->value, ctx->Allocate(0)); + output->is_valid = false; + return Status::OK(); + } + ARROW_ASSIGN_OR_RAISE(output->value, ctx->Allocate(final_size)); + const auto separator = UnboxScalar::Unbox(*batch.values.back().scalar()); + uint8_t* buf = output->value->mutable_data(); + bool first = true; + for (size_t i = 0; i < num_args - 1; i++) { + const Scalar& scalar = *batch[i].scalar(); + util::string_view s; + if (scalar.is_valid) { + s = UnboxScalar::Unbox(scalar); + } else { + switch (options.null_handling) { + 
case JoinOptions::EMIT_NULL: + // Handled by CalculateRowSize + DCHECK(false) << "unreachable"; + break; + case JoinOptions::SKIP: + continue; + case JoinOptions::REPLACE: + s = options.null_replacement; + break; + } + } + if (!first) { + buf = std::copy(separator.begin(), separator.end(), buf); + } + first = false; + buf = std::copy(s.begin(), s.end(), buf); + } + output->is_valid = true; + DCHECK_EQ(final_size, buf - output->value->mutable_data()); + return Status::OK(); + } + + static Status ExecContainingArrays(KernelContext* ctx, const JoinOptions& options, + const ExecBatch& batch, Datum* out) { + // Presize data to avoid reallocations + int64_t final_size = 0; + for (int64_t i = 0; i < batch.length; i++) { + auto size = CalculateRowSize(options, batch, i); + if (size > 0) final_size += size; + } + BuilderType builder(ctx->memory_pool()); + RETURN_NOT_OK(builder.Reserve(batch.length)); + RETURN_NOT_OK(builder.ReserveData(final_size)); + + std::vector valid_cols(batch.values.size()); + for (size_t row = 0; row < static_cast(batch.length); row++) { + size_t num_valid = 0; // Not counting separator + for (size_t col = 0; col < batch.values.size(); col++) { + if (batch[col].is_scalar()) { + const auto& scalar = *batch[col].scalar(); + if (scalar.is_valid) { + valid_cols[col] = UnboxScalar::Unbox(scalar); + if (col < batch.values.size() - 1) num_valid++; + } else { + valid_cols[col] = util::string_view(); + } + } else { + const ArrayData& array = *batch[col].array(); + if (!array.MayHaveNulls() || + BitUtil::GetBit(array.buffers[0]->data(), array.offset + row)) { + const offset_type* offsets = array.GetValues(1); + const uint8_t* data = array.GetValues(2, /*absolute_offset=*/0); + const int64_t length = offsets[row + 1] - offsets[row]; + valid_cols[col] = util::string_view( + reinterpret_cast(data + offsets[row]), length); + if (col < batch.values.size() - 1) num_valid++; + } else { + valid_cols[col] = util::string_view(); + } + } + } + + if 
(!valid_cols.back().data()) { + // Separator is null + builder.UnsafeAppendNull(); + continue; + } else if (batch.values.size() == 1) { + // Only given separator + builder.UnsafeAppendEmptyValue(); + continue; + } else if (num_valid < batch.values.size() - 1) { + // We had some nulls + if (options.null_handling == JoinOptions::EMIT_NULL) { + builder.UnsafeAppendNull(); + continue; + } + } + const auto separator = valid_cols.back(); + bool first = true; + for (size_t col = 0; col < batch.values.size() - 1; col++) { + util::string_view value = valid_cols[col]; + if (!value.data()) { + switch (options.null_handling) { + case JoinOptions::EMIT_NULL: + DCHECK(false) << "unreachable"; + break; + case JoinOptions::SKIP: + continue; + case JoinOptions::REPLACE: + value = options.null_replacement; + break; + } + } + if (first) { + builder.UnsafeAppend(value); + first = false; + continue; + } + builder.UnsafeExtendCurrent(separator); + builder.UnsafeExtendCurrent(value); + } + } + + std::shared_ptr string_array; + RETURN_NOT_OK(builder.Finish(&string_array)); + *out = *string_array->data(); + out->mutable_array()->type = batch[0].type(); + DCHECK_EQ(batch.length, out->array()->length); + DCHECK_EQ(final_size, + checked_cast(*string_array).total_values_length()); + return Status::OK(); + } + + // Compute the length of the output for the given position, or -1 if it would be null. 
+ static int64_t CalculateRowSize(const JoinOptions& options, const ExecBatch& batch, + const int64_t index) { + const auto num_args = batch.values.size(); + int64_t final_size = 0; + int64_t num_non_null_args = 0; + for (size_t i = 0; i < num_args; i++) { + int64_t element_size = 0; + bool valid = true; + if (batch[i].is_scalar()) { + const Scalar& scalar = *batch[i].scalar(); + valid = scalar.is_valid; + element_size = UnboxScalar::Unbox(scalar).size(); + } else { + const ArrayData& array = *batch[i].array(); + valid = !array.MayHaveNulls() || + BitUtil::GetBit(array.buffers[0]->data(), array.offset + index); + const offset_type* offsets = array.GetValues(1); + element_size = offsets[index + 1] - offsets[index]; + } + if (i == num_args - 1) { + if (!valid) return -1; + if (num_non_null_args > 1) { + // Add separator size (only if there were values to join) + final_size += (num_non_null_args - 1) * element_size; + } + break; + } + if (!valid) { + switch (options.null_handling) { + case JoinOptions::EMIT_NULL: + return -1; + case JoinOptions::SKIP: + continue; + case JoinOptions::REPLACE: + element_size = options.null_replacement.size(); + break; + } + } + num_non_null_args++; + final_size += element_size; + } + return final_size; + } +}; + const FunctionDoc binary_join_doc( "Join a list of strings together with a `separator` to form a single string", ("Insert `separator` between `list` elements, and concatenate them.\n" "Any null input and any null `list` element emits a null output.\n"), {"list", "separator"}); +const FunctionDoc binary_join_element_wise_doc( + "Join string arguments into one, using the last argument as the separator", + ("Insert the last argument of `strings` between the rest of the elements, " + "and concatenate them.\n" + "Any null separator element emits a null output. 
Null elements either " + "emit a null (the default), are skipped, or replaced with a given string.\n"), + {"*strings"}, "JoinOptions"); + +const auto kDefaultJoinOptions = JoinOptions::Defaults(); + template void AddBinaryJoinForListType(ScalarFunction* func) { for (const std::shared_ptr& ty : BaseBinaryTypes()) { @@ -3360,11 +3575,25 @@ void AddBinaryJoinForListType(ScalarFunction* func) { } void AddBinaryJoin(FunctionRegistry* registry) { - auto func = - std::make_shared("binary_join", Arity::Binary(), &binary_join_doc); - AddBinaryJoinForListType(func.get()); - AddBinaryJoinForListType(func.get()); - DCHECK_OK(registry->AddFunction(std::move(func))); + { + auto func = std::make_shared("binary_join", Arity::Binary(), + &binary_join_doc); + AddBinaryJoinForListType(func.get()); + AddBinaryJoinForListType(func.get()); + DCHECK_OK(registry->AddFunction(std::move(func))); + } + { + auto func = std::make_shared( + "binary_join_element_wise", Arity::VarArgs(/*min_args=*/1), + &binary_join_element_wise_doc, &kDefaultJoinOptions); + for (const auto& ty : BaseBinaryTypes()) { + DCHECK_OK( + func->AddKernel({InputType(ty)}, ty, + GenerateTypeAgnosticVarBinaryBase(ty), + BinaryJoinElementWiseState::Init)); + } + DCHECK_OK(registry->AddFunction(std::move(func))); + } } template