From a4202a931ca5c914e82ca2cc65bb1971a0730798 Mon Sep 17 00:00:00 2001 From: Weston Pace Date: Thu, 21 Jan 2021 11:45:32 -1000 Subject: [PATCH 01/33] Merge/rebase --- cpp/src/arrow/dataset/partition_test.cc | 73 ++++++++++++++++++++----- 1 file changed, 60 insertions(+), 13 deletions(-) diff --git a/cpp/src/arrow/dataset/partition_test.cc b/cpp/src/arrow/dataset/partition_test.cc index 286848d9ae9..a412e9f7b8f 100644 --- a/cpp/src/arrow/dataset/partition_test.cc +++ b/cpp/src/arrow/dataset/partition_test.cc @@ -27,6 +27,7 @@ #include #include "arrow/compute/api_scalar.h" +#include "arrow/compute/api_vector.h" #include "arrow/dataset/scanner_internal.h" #include "arrow/dataset/test_util.h" #include "arrow/filesystem/path_util.h" @@ -77,6 +78,31 @@ class TestPartitioning : public ::testing::Test { ASSERT_OK_AND_ASSIGN(partitioning_, factory_->Finish(actual)); } + void AssertPartition(const std::shared_ptr partitioning, + const std::shared_ptr full_batch, + const std::vector>& expected_partition_indices) { + ASSERT_OK_AND_ASSIGN(auto partition_results, partitioning->Partition(full_batch)); + ASSERT_EQ(partition_results.batches.size(), expected_partition_indices.size()); + auto max_index = + std::min(partition_results.batches.size(), expected_partition_indices.size()); + for (int partition_index = 0; partition_index < max_index; partition_index++) { + std::shared_ptr actual = partition_results.batches[partition_index]; + std::shared_ptr indices_arr; + ChunkedArrayFromVector({expected_partition_indices[partition_index]}, + &indices_arr); + auto expected = compute::Take(full_batch, indices_arr); + ASSERT_EQ(expected, actual); + } + } + + void AssertPartition(const std::shared_ptr partitioning, + const std::shared_ptr schema, + const std::string& record_batch_json, + const std::vector>& expected_partition_indices) { + auto record_batch = RecordBatchFromJSON(schema, record_batch_json); + AssertPartition(partitioning, record_batch, expected_partition_indices); + } + void AssertInspectError(const std::vector& paths) { ASSERT_RAISES(Invalid, factory_->Inspect(paths)); } @@ -103,6 +129,21 @@ class TestPartitioning : public ::testing::Test { std::shared_ptr written_schema_; }; +TEST_F(TestPartitioning, Basic) { + auto schema_ = schema({field("a", int32()), field("b", utf8())}); + auto partitioning = std::make_shared(schema_); + std::string json = R"([{"a": 3, "b": "x"}, + {"a": 3, "b": "x"}, + {"a": 1, "b": null}, + {"a": null, "b": null}, + {"a": null, "b": "z"}, + {"a": null, "b": null} + ])"; + AssertPartition(partitioning, schema_, json, {{0, 1}, {2}, {3, 5}, {4}}); +} + +TEST_F(TestPartitioning, StructDictionaryNull) {} + TEST_F(TestPartitioning, DirectoryPartitioning) { partitioning_ = std::make_shared( schema({field("alpha", int32()), field("beta", utf8())})); @@ -600,20 +641,26 @@ TEST(GroupTest, Basics) { } TEST(GroupTest, WithNulls) { - auto has_nulls = checked_pointer_cast( - ArrayFromJSON(struct_({field("a", utf8()), field("b", int32())}), R"([ - {"a": "ex", "b": 0}, - {"a": null, "b": 0}, - {"a": "why", "b": 0}, - {"a": "ex", "b": 1}, - {"a": "why", "b": 0}, - {"a": "ex", "b": 1}, - {"a": "ex", "b": 0}, - {"a": "why", "b": null} - ])")); - ASSERT_RAISES(NotImplemented, MakeGroupings(*has_nulls)); + AssertGrouping({field("a", utf8()), field("b", int32())}, + R"([ + {"a": "ex", "b": 0}, + {"a": null, "b": 0}, + {"a": null, "b": 0}, + {"a": "ex", "b": 1}, + {"a": null, "b": null}, + {"a": "ex", "b": 1}, + {"a": "ex", "b": 0}, + {"a": "why", "b": null} + ])", + R"([ + {"a": "ex", "b": 0, 
"ids": [0, 6]}, + {"a": null, "b": 0, "ids": [1, 2]}, + {"a": "ex", "b": 1, "ids": [3, 5]}, + {"a": null, "b": null, "ids": [4]}, + {"a": "why", "b": null, "ids": [7]} + ])"); - has_nulls = checked_pointer_cast( + auto has_nulls = checked_pointer_cast( ArrayFromJSON(struct_({field("a", utf8()), field("b", int32())}), R"([ {"a": "ex", "b": 0}, null, From 53853b621cae0c5acb7442d66f2e5891a249b493 Mon Sep 17 00:00:00 2001 From: Weston Pace Date: Mon, 25 Jan 2021 08:15:00 -1000 Subject: [PATCH 02/33] WIP commit --- cpp/src/arrow/compute/api_vector.cc | 5 ++- cpp/src/arrow/compute/api_vector.h | 20 ++++++++- cpp/src/arrow/compute/kernels/vector_hash.cc | 8 +++- cpp/src/arrow/pretty_print.cc | 45 ++++++++++++++++++++ cpp/src/arrow/pretty_print.h | 13 ++++++ 5 files changed, 86 insertions(+), 5 deletions(-) diff --git a/cpp/src/arrow/compute/api_vector.cc b/cpp/src/arrow/compute/api_vector.cc index f5ab46ac603..0082d48112d 100644 --- a/cpp/src/arrow/compute/api_vector.cc +++ b/cpp/src/arrow/compute/api_vector.cc @@ -74,8 +74,9 @@ Result> Unique(const Datum& value, ExecContext* ctx) { return result.make_array(); } -Result DictionaryEncode(const Datum& value, ExecContext* ctx) { - return CallFunction("dictionary_encode", {value}, ctx); +Result DictionaryEncode(const Datum& value, const DictionaryEncodeOptions& options, + ExecContext* ctx) { + return CallFunction("dictionary_encode", {value}, &options, ctx); } const char kValuesFieldName[] = "values"; diff --git a/cpp/src/arrow/compute/api_vector.h b/cpp/src/arrow/compute/api_vector.h index 9e9cad9e5d9..9dcf4df2894 100644 --- a/cpp/src/arrow/compute/api_vector.h +++ b/cpp/src/arrow/compute/api_vector.h @@ -63,6 +63,23 @@ enum class SortOrder { Descending, }; +struct DictionaryEncodeOptions : public FunctionOptions { + /// Configure how null values will be encoded + enum NullEncodingBehavior { + /// the null value will be added to the dictionary with a proper index + ENCODE, + /// the null value will be masked in the indices array + MASK, + }; + + explicit DictionaryEncodeOptions(NullEncodingBehavior null_encoding = MASK) + : null_encoding_behavior(null_encoding) {} + + static DictionaryEncodeOptions Defaults() { return DictionaryEncodeOptions(); } + + NullEncodingBehavior null_encoding_behavior = MASK; +}; + /// \brief One sort key for PartitionNthIndices (TODO) and SortIndices struct ARROW_EXPORT SortKey { explicit SortKey(std::string name, SortOrder order = SortOrder::Ascending) @@ -296,7 +313,8 @@ Result> ValueCounts(const Datum& value, /// \since 1.0.0 /// \note API not yet finalized ARROW_EXPORT -Result DictionaryEncode(const Datum& data, ExecContext* ctx = NULLPTR); +Result DictionaryEncode(const Datum& data, const DictionaryEncodeOptions& options, + ExecContext* ctx = NULLPTR); // ---------------------------------------------------------------------- // Deprecated functions diff --git a/cpp/src/arrow/compute/kernels/vector_hash.cc b/cpp/src/arrow/compute/kernels/vector_hash.cc index 34d18c24a0c..0a4a1aff5f9 100644 --- a/cpp/src/arrow/compute/kernels/vector_hash.cc +++ b/cpp/src/arrow/compute/kernels/vector_hash.cc @@ -641,7 +641,8 @@ const FunctionDoc value_counts_doc( const FunctionDoc dictionary_encode_doc( "Dictionary-encode array", - ("Return a dictionary-encoded version of the input array."), {"array"}); + ("Return a dictionary-encoded version of the input array."), {"array"}, + "DictionaryEncodeOptions"); } // namespace @@ -687,11 +688,14 @@ void RegisterVectorHash(FunctionRegistry* registry) { // 
---------------------------------------------------------------------- // dictionary_encode + const auto kDefaultDictionaryEncodeOptions = DictionaryEncodeOptions::Defaults(); + base.finalize = DictEncodeFinalize; // Unique and ValueCounts output unchunked arrays base.output_chunked = true; auto dict_encode = std::make_shared("dictionary_encode", Arity::Unary(), - &dictionary_encode_doc); + &dictionary_encode_doc, + &kDefaultDictionaryEncodeOptions); AddHashKernels(dict_encode.get(), base, OutputType(DictEncodeOutput)); // Calling dictionary_encode on dictionary input not supported, but if it diff --git a/cpp/src/arrow/pretty_print.cc b/cpp/src/arrow/pretty_print.cc index 8c2ac376d1e..d61e6cde2b6 100644 --- a/cpp/src/arrow/pretty_print.cc +++ b/cpp/src/arrow/pretty_print.cc @@ -670,4 +670,49 @@ Status PrettyPrint(const Schema& schema, const PrettyPrintOptions& options, return Status::OK(); } +void GdbPrintArray(const Array& arr, int indent) { + PrettyPrintOptions options; + options.indent = indent; + auto print_st = PrettyPrint(arr, options, &std::cout); + if (!print_st.ok()) { + std::cout << "Could not print: " << print_st.message(); + } +} + +void GdbPrintRecordBatch(const RecordBatch& rb, int indent) { + PrettyPrintOptions options; + options.indent = indent; + auto print_st = PrettyPrint(rb, options, &std::cout); + if (!print_st.ok()) { + std::cout << "Could not print: " << print_st.message(); + } +} + +void GdbPrintTable(const Table& table, int indent) { + PrettyPrintOptions options; + options.indent = indent; + auto print_st = PrettyPrint(table, options, &std::cout); + if (!print_st.ok()) { + std::cout << "Could not print: " << print_st.message(); + } +} + +void GdbPrintChunkedArray(const ChunkedArray& chunked_arr, int indent) { + PrettyPrintOptions options; + options.indent = indent; + auto print_st = PrettyPrint(chunked_arr, options, &std::cout); + if (!print_st.ok()) { + std::cout << "Could not print: " << print_st.message(); + } +} + +void GdbPrintSchema(const Schema& schema, int indent) { + PrettyPrintOptions options; + options.indent = indent; + auto print_st = PrettyPrint(schema, options, &std::cout); + if (!print_st.ok()) { + std::cout << "Could not print: " << print_st.message(); + } +} + } // namespace arrow diff --git a/cpp/src/arrow/pretty_print.h b/cpp/src/arrow/pretty_print.h index 9d2c72c7186..43948b8f149 100644 --- a/cpp/src/arrow/pretty_print.h +++ b/cpp/src/arrow/pretty_print.h @@ -120,4 +120,17 @@ Status PrettyPrint(const Schema& schema, const PrettyPrintOptions& options, ARROW_EXPORT Status DebugPrint(const Array& arr, int indent); +// These print routines are used in the gdb pretty printers which are +// not capable of passing "out" params and do a poor job of overload resolution + +ARROW_EXPORT void GdbPrintArray(const Array& arr, int indent); + +ARROW_EXPORT void GdbPrintRecordBatch(const RecordBatch& rb, int indent); + +ARROW_EXPORT void GdbPrintTable(const Table& table, int indent); + +ARROW_EXPORT void GdbPrintChunkedArray(const ChunkedArray& chunked_arr, int indent); + +ARROW_EXPORT void GdbPrintSchema(const Schema& schema, int indent); + } // namespace arrow From 570de2c963b26ff21494409921be884229028eb5 Mon Sep 17 00:00:00 2001 From: Weston Pace Date: Mon, 25 Jan 2021 17:24:30 -1000 Subject: [PATCH 03/33] Added tests of vector_hash for inputs with nulls. 
Added ability to specify encoded nulls when encoding a dictionary --- cpp/src/arrow/compute/api_vector.h | 4 +- .../arrow/compute/kernels/scalar_cast_test.cc | 3 +- cpp/src/arrow/compute/kernels/vector_hash.cc | 67 +++++++++++--- .../compute/kernels/vector_hash_benchmark.cc | 6 +- .../arrow/compute/kernels/vector_hash_test.cc | 88 +++++++++++++++++-- cpp/src/arrow/dataset/partition.cc | 14 +-- cpp/src/arrow/dataset/partition_test.cc | 66 ++++++++------ cpp/src/arrow/python/arrow_to_pandas.cc | 9 +- .../parquet/arrow/arrow_reader_writer_test.cc | 4 +- 9 files changed, 199 insertions(+), 62 deletions(-) diff --git a/cpp/src/arrow/compute/api_vector.h b/cpp/src/arrow/compute/api_vector.h index 9dcf4df2894..6a334dffda1 100644 --- a/cpp/src/arrow/compute/api_vector.h +++ b/cpp/src/arrow/compute/api_vector.h @@ -70,9 +70,11 @@ struct DictionaryEncodeOptions : public FunctionOptions { ENCODE, /// the null value will be masked in the indices array MASK, + /// the null value will not be included in the dictionary + SKIP }; - explicit DictionaryEncodeOptions(NullEncodingBehavior null_encoding = MASK) + explicit DictionaryEncodeOptions(NullEncodingBehavior null_encoding = SKIP) : null_encoding_behavior(null_encoding) {} static DictionaryEncodeOptions Defaults() { return DictionaryEncodeOptions(); } diff --git a/cpp/src/arrow/compute/kernels/scalar_cast_test.cc b/cpp/src/arrow/compute/kernels/scalar_cast_test.cc index 99a56346c1b..d84aefa5b19 100644 --- a/cpp/src/arrow/compute/kernels/scalar_cast_test.cc +++ b/cpp/src/arrow/compute/kernels/scalar_cast_test.cc @@ -1472,7 +1472,8 @@ TEST(Cast, FromDictionary) { ASSERT_OK_AND_ASSIGN(auto no_nulls, Take(*dict, *indices)); ASSERT_EQ(no_nulls->null_count(), 0); - ASSERT_OK_AND_ASSIGN(Datum encoded, DictionaryEncode(no_nulls)); + ASSERT_OK_AND_ASSIGN(Datum encoded, + DictionaryEncode(no_nulls, DictionaryEncodeOptions::Defaults())); // Make a new dict array with nullptr bitmap buffer auto data = encoded.array()->Copy(); diff --git a/cpp/src/arrow/compute/kernels/vector_hash.cc b/cpp/src/arrow/compute/kernels/vector_hash.cc index 0a4a1aff5f9..3ea8f905745 100644 --- a/cpp/src/arrow/compute/kernels/vector_hash.cc +++ b/cpp/src/arrow/compute/kernels/vector_hash.cc @@ -173,12 +173,16 @@ class DictEncodeAction final : public ActionBase { template void ObserveNullFound(Index index) { - indices_builder_.UnsafeAppendNull(); + if (index < 0) { + indices_builder_.UnsafeAppendNull(); + } else { + indices_builder_.UnsafeAppend(index); + } } template void ObserveNullNotFound(Index index) { - indices_builder_.UnsafeAppendNull(); + ObserveNullFound(index); } template @@ -206,6 +210,9 @@ class DictEncodeAction final : public ActionBase { class HashKernel : public KernelState { public: + HashKernel() : options_(DictionaryEncodeOptions::Defaults()) {} + explicit HashKernel(const DictionaryEncodeOptions& options) : options_(options) {} + // Reset for another run. 
virtual Status Reset() = 0; @@ -229,6 +236,7 @@ class HashKernel : public KernelState { virtual Status Append(const ArrayData& arr) = 0; protected: + DictionaryEncodeOptions options_; std::mutex lock_; }; @@ -241,8 +249,9 @@ template class RegularHashKernel : public HashKernel { public: - RegularHashKernel(const std::shared_ptr& type, MemoryPool* pool) - : pool_(pool), type_(type), action_(type, pool) {} + RegularHashKernel(const std::shared_ptr& type, + const DictionaryEncodeOptions& options, MemoryPool* pool) + : HashKernel(options), pool_(pool), type_(type), action_(type, pool) {} Status Reset() override { memo_table_.reset(new MemoTable(pool_, 0)); @@ -282,7 +291,9 @@ class RegularHashKernel : public HashKernel { &unused_memo_index); }, [this]() { - if (with_memo_visit_null) { + if (with_memo_visit_null || + options_.null_encoding_behavior == + DictionaryEncodeOptions::NullEncodingBehavior::ENCODE) { auto on_found = [this](int32_t memo_index) { action_.ObserveNullFound(memo_index); }; @@ -345,18 +356,23 @@ class RegularHashKernel : public HashKernel { // ---------------------------------------------------------------------- // Hash kernel implementation for nulls -template +template class NullHashKernel : public HashKernel { public: - NullHashKernel(const std::shared_ptr& type, MemoryPool* pool) + NullHashKernel(const std::shared_ptr& type, + const DictionaryEncodeOptions& options, MemoryPool* pool) : pool_(pool), type_(type), action_(type, pool) {} Status Reset() override { return action_.Reset(); } - Status Append(const ArrayData& arr) override { + Status Append(const ArrayData& arr) override { return DoAppend(arr); } + + template + enable_if_t DoAppend(const ArrayData& arr) { RETURN_NOT_OK(action_.Reserve(arr.length)); for (int64_t i = 0; i < arr.length; ++i) { if (i == 0) { + seen_null_ = true; action_.ObserveNullNotFound(0); } else { action_.ObserveNullFound(0); @@ -365,12 +381,31 @@ class NullHashKernel : public HashKernel { return Status::OK(); } + template + enable_if_t DoAppend(const ArrayData& arr) { + Status s = Status::OK(); + RETURN_NOT_OK(action_.Reserve(arr.length)); + for (int64_t i = 0; i < arr.length; ++i) { + if (seen_null_ == false && i == 0) { + seen_null_ = true; + action_.ObserveNullNotFound(0, &s); + } else { + action_.ObserveNullFound(0); + } + } + return s; + } + Status Flush(Datum* out) override { return action_.Flush(out); } Status FlushFinal(Datum* out) override { return action_.FlushFinal(out); } Status GetDictionary(std::shared_ptr* out) override { - // TODO(wesm): handle null being a valid dictionary value - auto null_array = std::make_shared(0); + std::shared_ptr null_array; + if (seen_null_) { + null_array = std::make_shared(1); + } else { + null_array = std::make_shared(0); + } *out = null_array->data(); return Status::OK(); } @@ -380,6 +415,7 @@ class NullHashKernel : public HashKernel { protected: MemoryPool* pool_; std::shared_ptr type_; + bool seen_null_ = false; Action action_; }; @@ -451,8 +487,12 @@ struct HashKernelTraits> { template std::unique_ptr HashInitImpl(KernelContext* ctx, const KernelInitArgs& args) { using HashKernelType = typename HashKernelTraits::HashKernel; - auto result = ::arrow::internal::make_unique(args.inputs[0].type, - ctx->memory_pool()); + DictionaryEncodeOptions options; + if (auto options_ptr = static_cast(args.options)) { + options = *options_ptr; + } + auto result = ::arrow::internal::make_unique( + args.inputs[0].type, options, ctx->memory_pool()); ctx->SetStatus(result->Reset()); return std::move(result); } @@ 
-507,6 +547,8 @@ KernelInit GetHashInit(Type::type type_id) { } } +using DictionaryEncodeState = OptionsWrapper; + template std::unique_ptr DictionaryHashInit(KernelContext* ctx, const KernelInitArgs& args) { @@ -529,6 +571,7 @@ std::unique_ptr DictionaryHashInit(KernelContext* ctx, DCHECK(false) << "Unsupported dictionary index type"; break; } + DictionaryEncodeOptions options = DictionaryEncodeOptions::Defaults(); return ::arrow::internal::make_unique(std::move(indices_hasher)); } diff --git a/cpp/src/arrow/compute/kernels/vector_hash_benchmark.cc b/cpp/src/arrow/compute/kernels/vector_hash_benchmark.cc index 3be549d05ce..d6b203181eb 100644 --- a/cpp/src/arrow/compute/kernels/vector_hash_benchmark.cc +++ b/cpp/src/arrow/compute/kernels/vector_hash_benchmark.cc @@ -46,7 +46,7 @@ static void BuildDictionary(benchmark::State& state) { // NOLINT non-const refe ArrayFromVector(is_valid, values, &arr); while (state.KeepRunning()) { - ABORT_NOT_OK(DictionaryEncode(arr).status()); + ABORT_NOT_OK(DictionaryEncode(arr, DictionaryEncodeOptions::Defaults()).status()); } state.counters["null_percent"] = static_cast(arr->null_count()) / arr->length() * 100; @@ -73,7 +73,7 @@ static void BuildStringDictionary( ArrayFromVector(data, &arr); while (state.KeepRunning()) { - ABORT_NOT_OK(DictionaryEncode(arr).status()); + ABORT_NOT_OK(DictionaryEncode(arr, DictionaryEncodeOptions::Defaults()).status()); } state.SetBytesProcessed(state.iterations() * total_bytes); state.SetItemsProcessed(state.iterations() * data.size()); @@ -169,7 +169,7 @@ void BenchDictionaryEncode(benchmark::State& state, const ParamType& params) { std::shared_ptr arr; params.GenerateTestData(&arr); while (state.KeepRunning()) { - ABORT_NOT_OK(DictionaryEncode(arr).status()); + ABORT_NOT_OK(DictionaryEncode(arr, DictionaryEncodeOptions::Defaults()).status()); } params.SetMetadata(state); } diff --git a/cpp/src/arrow/compute/kernels/vector_hash_test.cc b/cpp/src/arrow/compute/kernels/vector_hash_test.cc index e9ae4a64d97..4dc106138d7 100644 --- a/cpp/src/arrow/compute/kernels/vector_hash_test.cc +++ b/cpp/src/arrow/compute/kernels/vector_hash_test.cc @@ -126,7 +126,8 @@ void CheckDictEncode(const std::shared_ptr& input, auto type = dictionary(expected_indices->type(), expected_values->type()); DictionaryArray expected(type, expected_indices, expected_values); - ASSERT_OK_AND_ASSIGN(Datum datum_out, DictionaryEncode(input)); + ASSERT_OK_AND_ASSIGN(Datum datum_out, + DictionaryEncode(input, DictionaryEncodeOptions::Defaults())); std::shared_ptr result = MakeArray(datum_out.array()); ASSERT_OK(result->ValidateFull()); @@ -204,7 +205,8 @@ TYPED_TEST(TestHashKernelPrimitive, ZeroChunks) { auto type = TypeTraits::type_singleton(); auto zero_chunks = std::make_shared(ArrayVector{}, type); - ASSERT_OK_AND_ASSIGN(Datum result, DictionaryEncode(zero_chunks)); + ASSERT_OK_AND_ASSIGN( + Datum result, DictionaryEncode(zero_chunks, DictionaryEncodeOptions::Defaults())); ASSERT_EQ(result.kind(), Datum::CHUNKED_ARRAY); AssertChunkedEqual(*result.chunked_array(), @@ -305,6 +307,11 @@ TEST_F(TestHashKernel, ValueCountsBoolean) { ArrayFromJSON(boolean(), "[false]"), ArrayFromJSON(int64(), "[2]")); } +TEST_F(TestHashKernel, ValueCountsNull) { + CheckValueCounts( + null(), {nullptr, nullptr, nullptr}, {true, false, true}, {nullptr}, {false}, {3}); +} + TEST_F(TestHashKernel, DictEncodeBoolean) { CheckDictEncode(boolean(), {true, true, false, true, false}, {true, false, true, true, true}, {true, false}, {}, @@ -365,7 +372,8 @@ 
TYPED_TEST(TestHashKernelBinaryTypes, ZeroChunks) { auto type = this->type(); auto zero_chunks = std::make_shared(ArrayVector{}, type); - ASSERT_OK_AND_ASSIGN(Datum result, DictionaryEncode(zero_chunks)); + ASSERT_OK_AND_ASSIGN( + Datum result, DictionaryEncode(zero_chunks, DictionaryEncodeOptions::Defaults())); ASSERT_EQ(result.kind(), Datum::CHUNKED_ARRAY); AssertChunkedEqual(*result.chunked_array(), @@ -381,7 +389,8 @@ TYPED_TEST(TestHashKernelBinaryTypes, TwoChunks) { ArrayFromJSON(type, "[\"b\"]"), }, type); - ASSERT_OK_AND_ASSIGN(Datum result, DictionaryEncode(two_chunks)); + ASSERT_OK_AND_ASSIGN(Datum result, + DictionaryEncode(two_chunks, DictionaryEncodeOptions::Defaults())); auto dict_type = dictionary(int32(), type); auto dictionary = ArrayFromJSON(type, R"(["a", "b"])"); @@ -542,6 +551,12 @@ TEST_F(TestHashKernel, UniqueDecimal) { {true, false, true, true}, expected, {1, 0, 1}); } +TEST_F(TestHashKernel, UniqueNull) { + CheckUnique(null(), {nullptr, nullptr}, {false, true}, + {nullptr}, {false}); + CheckUnique(null(), {}, {}, {}, {}); +} + TEST_F(TestHashKernel, ValueCountsDecimal) { std::vector values{12, 12, 11, 12}; std::vector expected{12, 0, 11}; @@ -586,6 +601,33 @@ TEST_F(TestHashKernel, DictionaryUniqueAndValueCounts) { auto different_dictionaries = *ChunkedArray::Make({input, input2}); ASSERT_RAISES(Invalid, Unique(different_dictionaries)); ASSERT_RAISES(Invalid, ValueCounts(different_dictionaries)); + + // Dictionary with encoded nulls + auto dict_with_null = ArrayFromJSON(int64(), "[10, null, 30, 40]"); + input = std::make_shared(dict_ty, indices, dict_with_null); + ex_uniques = std::make_shared(dict_ty, ex_indices, dict_with_null); + CheckUnique(input, ex_uniques); + + CheckValueCounts(input, ex_uniques, ex_counts); + + // Dictionary with masked nulls + auto indices_with_null = + ArrayFromJSON(index_ty, "[3, 0, 0, 0, null, null, 3, 0, null, 3, 0, null]"); + auto ex_indices_with_null = ArrayFromJSON(index_ty, "[3, 0, null]"); + ex_uniques = std::make_shared(dict_ty, ex_indices_with_null, dict); + input = std::make_shared(dict_ty, indices_with_null, dict); + CheckUnique(input, ex_uniques); + + CheckValueCounts(input, ex_uniques, ex_counts); + + // Dictionary with encoded AND masked nulls + auto some_indices_with_null = + ArrayFromJSON(index_ty, "[3, 0, 0, 0, 1, 1, 3, 0, null, 3, 0, null]"); + ex_uniques = + std::make_shared(dict_ty, ex_indices_with_null, dict_with_null); + input = std::make_shared(dict_ty, indices_with_null, dict_with_null); + CheckUnique(input, ex_uniques); + CheckValueCounts(input, ex_uniques, ex_counts); } } @@ -640,7 +682,8 @@ TEST_F(TestHashKernel, ChunkedArrayInvoke) { ASSERT_ARRAYS_EQUAL(*ex_counts, *counts->field(1)); // Dictionary encode - ASSERT_OK_AND_ASSIGN(Datum encoded_out, DictionaryEncode(carr)); + ASSERT_OK_AND_ASSIGN(Datum encoded_out, + DictionaryEncode(carr, DictionaryEncodeOptions::Defaults())); ASSERT_EQ(Datum::CHUNKED_ARRAY, encoded_out.kind()); AssertChunkedEqual(*dict_carr, *encoded_out.chunked_array()); @@ -649,13 +692,42 @@ TEST_F(TestHashKernel, ChunkedArrayInvoke) { TEST_F(TestHashKernel, ZeroLengthDictionaryEncode) { // ARROW-7008 auto values = ArrayFromJSON(utf8(), "[]"); - ASSERT_OK_AND_ASSIGN(Datum datum_result, DictionaryEncode(values)); + ASSERT_OK_AND_ASSIGN(Datum datum_result, + DictionaryEncode(values, DictionaryEncodeOptions::Defaults())); std::shared_ptr result = datum_result.make_array(); const auto& dict_result = checked_cast(*result); ASSERT_OK(dict_result.ValidateFull()); } +TEST_F(TestHashKernel, 
NullEncodingSchemes) { + auto values = ArrayFromJSON(uint8(), "[1, 1, null, 2, null]"); + + // Masking should put null in the indices array + auto expected_mask_indices = ArrayFromJSON(int32(), "[0, 0, null, 1, null]"); + auto expected_mask_dictionary = ArrayFromJSON(uint8(), "[1, 2]"); + auto dictionary_type = dictionary(int32(), uint8()); + std::shared_ptr expected = std::make_shared( + dictionary_type, expected_mask_indices, expected_mask_dictionary); + + ASSERT_OK_AND_ASSIGN(Datum datum_result, + DictionaryEncode(values, DictionaryEncodeOptions::Defaults())); + std::shared_ptr result = datum_result.make_array(); + AssertArraysEqual(*expected, *result); + + // Encoding should put null in the dictionary + auto expected_encoded_indices = ArrayFromJSON(int32(), "[0, 0, 1, 2, 1]"); + auto expected_encoded_dict = ArrayFromJSON(uint8(), "[1, null, 2]"); + expected = std::make_shared(dictionary_type, expected_encoded_indices, + expected_encoded_dict); + + auto options = DictionaryEncodeOptions::Defaults(); + options.null_encoding_behavior = DictionaryEncodeOptions::ENCODE; + ASSERT_OK_AND_ASSIGN(datum_result, DictionaryEncode(values, options)); + result = datum_result.make_array(); + AssertArraysEqual(*expected, *result); +} + TEST_F(TestHashKernel, ChunkedArrayZeroChunk) { // ARROW-6857 auto chunked_array = std::make_shared(ArrayVector{}, utf8()); @@ -670,7 +742,9 @@ TEST_F(TestHashKernel, ChunkedArrayZeroChunk) { "[]"); AssertArraysEqual(*expected, *result_array); - ASSERT_OK_AND_ASSIGN(Datum result_datum, DictionaryEncode(chunked_array)); + ASSERT_OK_AND_ASSIGN( + Datum result_datum, + DictionaryEncode(chunked_array, DictionaryEncodeOptions::Defaults())); auto dict_type = dictionary(int32(), chunked_array->type()); ASSERT_EQ(result_datum.kind(), Datum::CHUNKED_ARRAY); diff --git a/cpp/src/arrow/dataset/partition.cc b/cpp/src/arrow/dataset/partition.cc index d6a3723d055..879e28597f5 100644 --- a/cpp/src/arrow/dataset/partition.cc +++ b/cpp/src/arrow/dataset/partition.cc @@ -578,10 +578,6 @@ class StructDictionary { Encoded out{nullptr, std::make_shared()}; for (const auto& column : columns) { - if (column->null_count() != 0) { - return Status::NotImplemented("Grouping on a field with nulls"); - } - RETURN_NOT_OK(out.dictionary->AddOne(column, &out.indices)); } @@ -626,7 +622,11 @@ class StructDictionary { private: Status AddOne(Datum column, std::shared_ptr* fused_indices) { if (column.type()->id() != Type::DICTIONARY) { - ARROW_ASSIGN_OR_RAISE(column, compute::DictionaryEncode(std::move(column))); + compute::DictionaryEncodeOptions options; + options.null_encoding_behavior = + compute::DictionaryEncodeOptions::NullEncodingBehavior::ENCODE; + ARROW_ASSIGN_OR_RAISE(column, + compute::DictionaryEncode(std::move(column), options)); } auto dict_column = column.array_as(); @@ -664,7 +664,9 @@ class StructDictionary { Status RestoreDictionaryEncoding(std::shared_ptr expected_type, Datum* column) { DCHECK_NE(column->type()->id(), Type::DICTIONARY); - ARROW_ASSIGN_OR_RAISE(*column, compute::DictionaryEncode(std::move(*column))); + ARROW_ASSIGN_OR_RAISE( + *column, compute::DictionaryEncode(std::move(*column), + compute::DictionaryEncodeOptions::Defaults())); if (expected_type->index_type()->id() == Type::INT32) { // dictionary_encode has already yielded the expected index_type diff --git a/cpp/src/arrow/dataset/partition_test.cc b/cpp/src/arrow/dataset/partition_test.cc index a412e9f7b8f..876bc77b0ba 100644 --- a/cpp/src/arrow/dataset/partition_test.cc +++ b/cpp/src/arrow/dataset/partition_test.cc 
@@ -80,27 +80,30 @@ class TestPartitioning : public ::testing::Test { void AssertPartition(const std::shared_ptr partitioning, const std::shared_ptr full_batch, - const std::vector>& expected_partition_indices) { + const RecordBatchVector& expected_batches) { ASSERT_OK_AND_ASSIGN(auto partition_results, partitioning->Partition(full_batch)); - ASSERT_EQ(partition_results.batches.size(), expected_partition_indices.size()); - auto max_index = - std::min(partition_results.batches.size(), expected_partition_indices.size()); - for (int partition_index = 0; partition_index < max_index; partition_index++) { + std::shared_ptr rest = full_batch; + ASSERT_EQ(partition_results.batches.size(), expected_batches.size()); + auto max_index = std::min(partition_results.batches.size(), expected_batches.size()); + for (std::size_t partition_index = 0; partition_index < max_index; + partition_index++) { std::shared_ptr actual = partition_results.batches[partition_index]; - std::shared_ptr indices_arr; - ChunkedArrayFromVector({expected_partition_indices[partition_index]}, - &indices_arr); - auto expected = compute::Take(full_batch, indices_arr); - ASSERT_EQ(expected, actual); + AssertBatchesEqual(*expected_batches[partition_index], *actual); } } void AssertPartition(const std::shared_ptr partitioning, const std::shared_ptr schema, const std::string& record_batch_json, - const std::vector>& expected_partition_indices) { + const std::shared_ptr partitioned_schema, + const std::vector& expected_record_batch_strs) { auto record_batch = RecordBatchFromJSON(schema, record_batch_json); - AssertPartition(partitioning, record_batch, expected_partition_indices); + RecordBatchVector expected_batches; + for (const auto& expected_record_batch_str : expected_record_batch_strs) { + expected_batches.push_back( + RecordBatchFromJSON(partitioned_schema, expected_record_batch_str)); + } + AssertPartition(partitioning, record_batch, expected_batches); } void AssertInspectError(const std::vector& paths) { @@ -130,16 +133,21 @@ class TestPartitioning : public ::testing::Test { }; TEST_F(TestPartitioning, Basic) { - auto schema_ = schema({field("a", int32()), field("b", utf8())}); - auto partitioning = std::make_shared(schema_); - std::string json = R"([{"a": 3, "b": "x"}, - {"a": 3, "b": "x"}, - {"a": 1, "b": null}, - {"a": null, "b": null}, - {"a": null, "b": "z"}, - {"a": null, "b": null} + auto partition_schema = schema({field("a", int32()), field("b", utf8())}); + auto schema_ = schema({field("a", int32()), field("b", utf8()), field("c", uint32())}); + auto remaining_schema = schema({field("c", uint32())}); + auto partitioning = std::make_shared(partition_schema); + std::string json = R"([{"a": 3, "b": "x", "c": 0}, + {"a": 3, "b": "x", "c": 1}, + {"a": 1, "b": null, "c": 2}, + {"a": null, "b": null, "c": 3}, + {"a": null, "b": "z", "c": 4}, + {"a": null, "b": null, "c": 5} ])"; - AssertPartition(partitioning, schema_, json, {{0, 1}, {2}, {3, 5}, {4}}); + std::vector expected_batches = {R"([{"c": 0}, {"c": 1}])", R"([{"c": 2}])", + R"([{"c": 3}, {"c": 5}])", + R"([{"c": 4}])"}; + AssertPartition(partitioning, schema_, json, remaining_schema, expected_batches); } TEST_F(TestPartitioning, StructDictionaryNull) {} @@ -643,14 +651,14 @@ TEST(GroupTest, Basics) { TEST(GroupTest, WithNulls) { AssertGrouping({field("a", utf8()), field("b", int32())}, R"([ - {"a": "ex", "b": 0}, - {"a": null, "b": 0}, - {"a": null, "b": 0}, - {"a": "ex", "b": 1}, - {"a": null, "b": null}, - {"a": "ex", "b": 1}, - {"a": "ex", "b": 0}, - {"a": "why", 
"b": null} + {"a": "ex", "b": 0, "id": 0}, + {"a": null, "b": 0, "id": 1}, + {"a": null, "b": 0, "id": 2}, + {"a": "ex", "b": 1, "id": 3}, + {"a": null, "b": null, "id": 4}, + {"a": "ex", "b": 1, "id": 5}, + {"a": "ex", "b": 0, "id": 6}, + {"a": "why", "b": null, "id": 7} ])", R"([ {"a": "ex", "b": 0, "ids": [0, 6]}, diff --git a/cpp/src/arrow/python/arrow_to_pandas.cc b/cpp/src/arrow/python/arrow_to_pandas.cc index 09245285030..cd861deda31 100644 --- a/cpp/src/arrow/python/arrow_to_pandas.cc +++ b/cpp/src/arrow/python/arrow_to_pandas.cc @@ -2183,7 +2183,10 @@ Status ConvertCategoricals(const PandasOptions& options, ChunkedArrayVector* arr "only zero-copy conversions allowed"); } compute::ExecContext ctx(options.pool); - ARROW_ASSIGN_OR_RAISE(Datum out, DictionaryEncode((*arrays)[i], &ctx)); + // TODO: Should we include nulls here? + ARROW_ASSIGN_OR_RAISE( + Datum out, DictionaryEncode((*arrays)[i], + compute::DictionaryEncodeOptions::Defaults(), &ctx)); (*arrays)[i] = out.chunked_array(); (*fields)[i] = (*fields)[i]->WithType((*arrays)[i]->type()); return Status::OK(); @@ -2232,7 +2235,9 @@ Status ConvertChunkedArrayToPandas(const PandasOptions& options, "only zero-copy conversions allowed"); } compute::ExecContext ctx(options.pool); - ARROW_ASSIGN_OR_RAISE(Datum out, DictionaryEncode(arr, &ctx)); + ARROW_ASSIGN_OR_RAISE( + Datum out, + DictionaryEncode(arr, compute::DictionaryEncodeOptions::Defaults(), &ctx)); arr = out.chunked_array(); } diff --git a/cpp/src/parquet/arrow/arrow_reader_writer_test.cc b/cpp/src/parquet/arrow/arrow_reader_writer_test.cc index ca702152d61..45c04ed5f81 100644 --- a/cpp/src/parquet/arrow/arrow_reader_writer_test.cc +++ b/cpp/src/parquet/arrow/arrow_reader_writer_test.cc @@ -79,6 +79,7 @@ using arrow::Status; using arrow::Table; using arrow::TimeUnit; using arrow::compute::DictionaryEncode; +using arrow::compute::DictionaryEncodeOptions; using arrow::internal::checked_cast; using arrow::internal::checked_pointer_cast; using arrow::internal::Iota; @@ -884,7 +885,8 @@ TYPED_TEST(TestParquetIO, SingleColumnOptionalDictionaryWrite) { ASSERT_OK(NullableArray(SMALL_SIZE, 10, kDefaultSeed, &values)); - ASSERT_OK_AND_ASSIGN(Datum out, DictionaryEncode(values)); + ASSERT_OK_AND_ASSIGN(Datum out, + DictionaryEncode(values, DictionaryEncodeOptions::Defaults())); std::shared_ptr dict_values = MakeArray(out.array()); std::shared_ptr schema = MakeSimpleSchema(*dict_values->type(), Repetition::OPTIONAL); From 807c600651eec7f5a4277eacea493b34564ea559 Mon Sep 17 00:00:00 2001 From: Weston Pace Date: Mon, 25 Jan 2021 17:49:10 -1000 Subject: [PATCH 04/33] Prevent using dictionary columns as partition columns. It wouldn't work. 
--- cpp/src/arrow/dataset/partition.cc | 16 ++++++++++------ cpp/src/arrow/python/arrow_to_pandas.cc | 1 - 2 files changed, 10 insertions(+), 7 deletions(-) diff --git a/cpp/src/arrow/dataset/partition.cc b/cpp/src/arrow/dataset/partition.cc index 879e28597f5..bb52e2d8fbd 100644 --- a/cpp/src/arrow/dataset/partition.cc +++ b/cpp/src/arrow/dataset/partition.cc @@ -621,13 +621,17 @@ class StructDictionary { private: Status AddOne(Datum column, std::shared_ptr* fused_indices) { - if (column.type()->id() != Type::DICTIONARY) { - compute::DictionaryEncodeOptions options; - options.null_encoding_behavior = - compute::DictionaryEncodeOptions::NullEncodingBehavior::ENCODE; - ARROW_ASSIGN_OR_RAISE(column, - compute::DictionaryEncode(std::move(column), options)); + if (column.type()->id() == Type::DICTIONARY) { + // compute::DictionaryEncode doesn't support dictionary and, even if it did, it + // would be a null op and return a flat dictionary. In order to group by dictionary + // we would need to be able to create a nested dictionary. + return Status::NotImplemented( + "Cannot use column of type dictionary as grouping criteria"); } + compute::DictionaryEncodeOptions options; + options.null_encoding_behavior = + compute::DictionaryEncodeOptions::NullEncodingBehavior::ENCODE; + ARROW_ASSIGN_OR_RAISE(column, compute::DictionaryEncode(std::move(column), options)); auto dict_column = column.array_as(); dictionaries_.push_back(dict_column->dictionary()); diff --git a/cpp/src/arrow/python/arrow_to_pandas.cc b/cpp/src/arrow/python/arrow_to_pandas.cc index cd861deda31..1c47f9742de 100644 --- a/cpp/src/arrow/python/arrow_to_pandas.cc +++ b/cpp/src/arrow/python/arrow_to_pandas.cc @@ -2183,7 +2183,6 @@ Status ConvertCategoricals(const PandasOptions& options, ChunkedArrayVector* arr "only zero-copy conversions allowed"); } compute::ExecContext ctx(options.pool); - // TODO: Should we include nulls here? 
ARROW_ASSIGN_OR_RAISE( Datum out, DictionaryEncode((*arrays)[i], compute::DictionaryEncodeOptions::Defaults(), &ctx)); From 353ea9d4e2c106de0cc903f3ba8ff3c34b494522 Mon Sep 17 00:00:00 2001 From: Weston Pace Date: Mon, 1 Feb 2021 15:41:54 -1000 Subject: [PATCH 05/33] Addressing PR comments --- cpp/src/arrow/compute/api_vector.h | 25 ++++-- .../arrow/compute/kernels/scalar_cast_test.cc | 3 +- cpp/src/arrow/compute/kernels/vector_hash.cc | 77 ++++++++++--------- .../compute/kernels/vector_hash_benchmark.cc | 6 +- .../arrow/compute/kernels/vector_hash_test.cc | 25 ++---- cpp/src/arrow/dataset/partition.cc | 33 +++++--- cpp/src/arrow/dataset/partition_test.cc | 19 +++++ cpp/src/arrow/pretty_print.cc | 45 ----------- cpp/src/arrow/pretty_print.h | 13 ---- .../parquet/arrow/arrow_reader_writer_test.cc | 3 +- 10 files changed, 114 insertions(+), 135 deletions(-) diff --git a/cpp/src/arrow/compute/api_vector.h b/cpp/src/arrow/compute/api_vector.h index 6a334dffda1..d67568e1567 100644 --- a/cpp/src/arrow/compute/api_vector.h +++ b/cpp/src/arrow/compute/api_vector.h @@ -63,18 +63,17 @@ enum class SortOrder { Descending, }; +/// \brief Options for the dictionary encode function struct DictionaryEncodeOptions : public FunctionOptions { /// Configure how null values will be encoded enum NullEncodingBehavior { /// the null value will be added to the dictionary with a proper index ENCODE, /// the null value will be masked in the indices array - MASK, - /// the null value will not be included in the dictionary - SKIP + MASK }; - explicit DictionaryEncodeOptions(NullEncodingBehavior null_encoding = SKIP) + explicit DictionaryEncodeOptions(NullEncodingBehavior null_encoding = MASK) : null_encoding_behavior(null_encoding) {} static DictionaryEncodeOptions Defaults() { return DictionaryEncodeOptions(); } @@ -308,15 +307,29 @@ Result> ValueCounts(const Datum& value, ExecContext* ctx = NULLPTR); /// \brief Dictionary-encode values in an array-like object +/// +/// Any nulls encountered in the dictionary will be handled according to the +/// specified null encoding behavior. 
+/// +/// For example, given values ["a", "b", null, "a", null] the output will be +/// (null_encoding == ENCODE) Indices: [0, 1, 2, 0, 2] / Dict: ["a", "b", null] +/// (null_encoding == MASK) Indices: [0, 1, null, 0, null] / Dict: ["a", "b"] +/// +/// If the input is already dictionary encoded this function is a no-op unless +/// it needs to modify the null_encoding (TODO) +/// /// \param[in] data array-like input /// \param[in] ctx the function execution context, optional +/// \param[in] options configures null encoding behavior /// \return result with same shape and type as input /// /// \since 1.0.0 /// \note API not yet finalized ARROW_EXPORT -Result DictionaryEncode(const Datum& data, const DictionaryEncodeOptions& options, - ExecContext* ctx = NULLPTR); +Result DictionaryEncode( + const Datum& data, + const DictionaryEncodeOptions& options = DictionaryEncodeOptions::Defaults(), + ExecContext* ctx = NULLPTR); // ---------------------------------------------------------------------- // Deprecated functions diff --git a/cpp/src/arrow/compute/kernels/scalar_cast_test.cc b/cpp/src/arrow/compute/kernels/scalar_cast_test.cc index d84aefa5b19..99a56346c1b 100644 --- a/cpp/src/arrow/compute/kernels/scalar_cast_test.cc +++ b/cpp/src/arrow/compute/kernels/scalar_cast_test.cc @@ -1472,8 +1472,7 @@ TEST(Cast, FromDictionary) { ASSERT_OK_AND_ASSIGN(auto no_nulls, Take(*dict, *indices)); ASSERT_EQ(no_nulls->null_count(), 0); - ASSERT_OK_AND_ASSIGN(Datum encoded, - DictionaryEncode(no_nulls, DictionaryEncodeOptions::Defaults())); + ASSERT_OK_AND_ASSIGN(Datum encoded, DictionaryEncode(no_nulls)); // Make a new dict array with nullptr bitmap buffer auto data = encoded.array()->Copy(); diff --git a/cpp/src/arrow/compute/kernels/vector_hash.cc b/cpp/src/arrow/compute/kernels/vector_hash.cc index 3ea8f905745..c7b25347624 100644 --- a/cpp/src/arrow/compute/kernels/vector_hash.cc +++ b/cpp/src/arrow/compute/kernels/vector_hash.cc @@ -58,7 +58,10 @@ class UniqueAction final : public ActionBase { using ActionBase::ActionBase; static constexpr bool with_error_status = false; - static constexpr bool with_memo_visit_null = true; + + UniqueAction(const std::shared_ptr& type, const FunctionOptions* options, + MemoryPool* pool) + : ActionBase(type, pool) {} Status Reset() { return Status::OK(); } @@ -76,6 +79,8 @@ class UniqueAction final : public ActionBase { template void ObserveNotFound(Index index) {} + bool ShouldEncodeNulls() { return true; } + Status Flush(Datum* out) { return Status::OK(); } Status FlushFinal(Datum* out) { return Status::OK(); } @@ -89,9 +94,9 @@ class ValueCountsAction final : ActionBase { using ActionBase::ActionBase; static constexpr bool with_error_status = true; - static constexpr bool with_memo_visit_null = true; - ValueCountsAction(const std::shared_ptr& type, MemoryPool* pool) + ValueCountsAction(const std::shared_ptr& type, const FunctionOptions* options, + MemoryPool* pool) : ActionBase(type, pool), count_builder_(pool) {} Status Reserve(const int64_t length) { @@ -147,6 +152,8 @@ class ValueCountsAction final : ActionBase { } } + bool ShouldEncodeNulls() { return true; } + private: Int64Builder count_builder_; }; @@ -159,10 +166,14 @@ class DictEncodeAction final : public ActionBase { using ActionBase::ActionBase; static constexpr bool with_error_status = false; - static constexpr bool with_memo_visit_null = false; - DictEncodeAction(const std::shared_ptr& type, MemoryPool* pool) - : ActionBase(type, pool), indices_builder_(pool) {} + DictEncodeAction(const std::shared_ptr& 
type, const FunctionOptions* options, + MemoryPool* pool) + : ActionBase(type, pool), indices_builder_(pool) { + if (auto options_ptr = static_cast(options)) { + encode_options_ = *options_ptr; + } + } Status Reset() { indices_builder_.Reset(); @@ -173,7 +184,7 @@ class DictEncodeAction final : public ActionBase { template void ObserveNullFound(Index index) { - if (index < 0) { + if (encode_options_.null_encoding_behavior == DictionaryEncodeOptions::MASK) { indices_builder_.UnsafeAppendNull(); } else { indices_builder_.UnsafeAppend(index); @@ -195,6 +206,10 @@ class DictEncodeAction final : public ActionBase { ObserveFound(index); } + bool ShouldEncodeNulls() { + return encode_options_.null_encoding_behavior == DictionaryEncodeOptions::ENCODE; + } + Status Flush(Datum* out) { std::shared_ptr result; RETURN_NOT_OK(indices_builder_.FinishInternal(&result)); @@ -206,12 +221,13 @@ class DictEncodeAction final : public ActionBase { private: Int32Builder indices_builder_; + DictionaryEncodeOptions encode_options_; }; class HashKernel : public KernelState { public: - HashKernel() : options_(DictionaryEncodeOptions::Defaults()) {} - explicit HashKernel(const DictionaryEncodeOptions& options) : options_(options) {} + HashKernel() : options_(nullptr) {} + explicit HashKernel(const FunctionOptions* options) : options_(options) {} // Reset for another run. virtual Status Reset() = 0; @@ -236,7 +252,7 @@ class HashKernel : public KernelState { virtual Status Append(const ArrayData& arr) = 0; protected: - DictionaryEncodeOptions options_; + const FunctionOptions* options_; std::mutex lock_; }; @@ -245,13 +261,12 @@ class HashKernel : public KernelState { // (NullType has a separate implementation) template + bool with_error_status = Action::with_error_status> class RegularHashKernel : public HashKernel { public: - RegularHashKernel(const std::shared_ptr& type, - const DictionaryEncodeOptions& options, MemoryPool* pool) - : HashKernel(options), pool_(pool), type_(type), action_(type, pool) {} + RegularHashKernel(const std::shared_ptr& type, const FunctionOptions* options, + MemoryPool* pool) + : HashKernel(options), pool_(pool), type_(type), action_(type, options, pool) {} Status Reset() override { memo_table_.reset(new MemoTable(pool_, 0)); @@ -291,9 +306,7 @@ class RegularHashKernel : public HashKernel { &unused_memo_index); }, [this]() { - if (with_memo_visit_null || - options_.null_encoding_behavior == - DictionaryEncodeOptions::NullEncodingBehavior::ENCODE) { + if (action_.ShouldEncodeNulls()) { auto on_found = [this](int32_t memo_index) { action_.ObserveNullFound(memo_index); }; @@ -329,16 +342,14 @@ class RegularHashKernel : public HashKernel { [this]() { // Null Status s = Status::OK(); - if (with_memo_visit_null) { - auto on_found = [this](int32_t memo_index) { - action_.ObserveNullFound(memo_index); - }; - auto on_not_found = [this, &s](int32_t memo_index) { - action_.ObserveNullNotFound(memo_index, &s); - }; + auto on_found = [this](int32_t memo_index) { + action_.ObserveNullFound(memo_index); + }; + auto on_not_found = [this, &s](int32_t memo_index) { + action_.ObserveNullNotFound(memo_index, &s); + }; + if (action_.ShouldEncodeNulls()) { memo_table_->GetOrInsertNull(std::move(on_found), std::move(on_not_found)); - } else { - action_.ObserveNullNotFound(-1); } return s; }); @@ -359,9 +370,9 @@ class RegularHashKernel : public HashKernel { template class NullHashKernel : public HashKernel { public: - NullHashKernel(const std::shared_ptr& type, - const DictionaryEncodeOptions& options, 
MemoryPool* pool) - : pool_(pool), type_(type), action_(type, pool) {} + NullHashKernel(const std::shared_ptr& type, const FunctionOptions* options, + MemoryPool* pool) + : pool_(pool), type_(type), action_(type, options, pool) {} Status Reset() override { return action_.Reset(); } @@ -487,12 +498,8 @@ struct HashKernelTraits> { template std::unique_ptr HashInitImpl(KernelContext* ctx, const KernelInitArgs& args) { using HashKernelType = typename HashKernelTraits::HashKernel; - DictionaryEncodeOptions options; - if (auto options_ptr = static_cast(args.options)) { - options = *options_ptr; - } auto result = ::arrow::internal::make_unique( - args.inputs[0].type, options, ctx->memory_pool()); + args.inputs[0].type, args.options, ctx->memory_pool()); ctx->SetStatus(result->Reset()); return std::move(result); } diff --git a/cpp/src/arrow/compute/kernels/vector_hash_benchmark.cc b/cpp/src/arrow/compute/kernels/vector_hash_benchmark.cc index d6b203181eb..3be549d05ce 100644 --- a/cpp/src/arrow/compute/kernels/vector_hash_benchmark.cc +++ b/cpp/src/arrow/compute/kernels/vector_hash_benchmark.cc @@ -46,7 +46,7 @@ static void BuildDictionary(benchmark::State& state) { // NOLINT non-const refe ArrayFromVector(is_valid, values, &arr); while (state.KeepRunning()) { - ABORT_NOT_OK(DictionaryEncode(arr, DictionaryEncodeOptions::Defaults()).status()); + ABORT_NOT_OK(DictionaryEncode(arr).status()); } state.counters["null_percent"] = static_cast(arr->null_count()) / arr->length() * 100; @@ -73,7 +73,7 @@ static void BuildStringDictionary( ArrayFromVector(data, &arr); while (state.KeepRunning()) { - ABORT_NOT_OK(DictionaryEncode(arr, DictionaryEncodeOptions::Defaults()).status()); + ABORT_NOT_OK(DictionaryEncode(arr).status()); } state.SetBytesProcessed(state.iterations() * total_bytes); state.SetItemsProcessed(state.iterations() * data.size()); @@ -169,7 +169,7 @@ void BenchDictionaryEncode(benchmark::State& state, const ParamType& params) { std::shared_ptr arr; params.GenerateTestData(&arr); while (state.KeepRunning()) { - ABORT_NOT_OK(DictionaryEncode(arr, DictionaryEncodeOptions::Defaults()).status()); + ABORT_NOT_OK(DictionaryEncode(arr).status()); } params.SetMetadata(state); } diff --git a/cpp/src/arrow/compute/kernels/vector_hash_test.cc b/cpp/src/arrow/compute/kernels/vector_hash_test.cc index 4dc106138d7..f4cd7dbf41f 100644 --- a/cpp/src/arrow/compute/kernels/vector_hash_test.cc +++ b/cpp/src/arrow/compute/kernels/vector_hash_test.cc @@ -126,8 +126,7 @@ void CheckDictEncode(const std::shared_ptr& input, auto type = dictionary(expected_indices->type(), expected_values->type()); DictionaryArray expected(type, expected_indices, expected_values); - ASSERT_OK_AND_ASSIGN(Datum datum_out, - DictionaryEncode(input, DictionaryEncodeOptions::Defaults())); + ASSERT_OK_AND_ASSIGN(Datum datum_out, DictionaryEncode(input)); std::shared_ptr result = MakeArray(datum_out.array()); ASSERT_OK(result->ValidateFull()); @@ -205,8 +204,7 @@ TYPED_TEST(TestHashKernelPrimitive, ZeroChunks) { auto type = TypeTraits::type_singleton(); auto zero_chunks = std::make_shared(ArrayVector{}, type); - ASSERT_OK_AND_ASSIGN( - Datum result, DictionaryEncode(zero_chunks, DictionaryEncodeOptions::Defaults())); + ASSERT_OK_AND_ASSIGN(Datum result, DictionaryEncode(zero_chunks)); ASSERT_EQ(result.kind(), Datum::CHUNKED_ARRAY); AssertChunkedEqual(*result.chunked_array(), @@ -372,8 +370,7 @@ TYPED_TEST(TestHashKernelBinaryTypes, ZeroChunks) { auto type = this->type(); auto zero_chunks = std::make_shared(ArrayVector{}, type); - 
ASSERT_OK_AND_ASSIGN( - Datum result, DictionaryEncode(zero_chunks, DictionaryEncodeOptions::Defaults())); + ASSERT_OK_AND_ASSIGN(Datum result, DictionaryEncode(zero_chunks)); ASSERT_EQ(result.kind(), Datum::CHUNKED_ARRAY); AssertChunkedEqual(*result.chunked_array(), @@ -389,8 +386,7 @@ TYPED_TEST(TestHashKernelBinaryTypes, TwoChunks) { ArrayFromJSON(type, "[\"b\"]"), }, type); - ASSERT_OK_AND_ASSIGN(Datum result, - DictionaryEncode(two_chunks, DictionaryEncodeOptions::Defaults())); + ASSERT_OK_AND_ASSIGN(Datum result, DictionaryEncode(two_chunks)); auto dict_type = dictionary(int32(), type); auto dictionary = ArrayFromJSON(type, R"(["a", "b"])"); @@ -682,8 +678,7 @@ TEST_F(TestHashKernel, ChunkedArrayInvoke) { ASSERT_ARRAYS_EQUAL(*ex_counts, *counts->field(1)); // Dictionary encode - ASSERT_OK_AND_ASSIGN(Datum encoded_out, - DictionaryEncode(carr, DictionaryEncodeOptions::Defaults())); + ASSERT_OK_AND_ASSIGN(Datum encoded_out, DictionaryEncode(carr)); ASSERT_EQ(Datum::CHUNKED_ARRAY, encoded_out.kind()); AssertChunkedEqual(*dict_carr, *encoded_out.chunked_array()); @@ -692,8 +687,7 @@ TEST_F(TestHashKernel, ChunkedArrayInvoke) { TEST_F(TestHashKernel, ZeroLengthDictionaryEncode) { // ARROW-7008 auto values = ArrayFromJSON(utf8(), "[]"); - ASSERT_OK_AND_ASSIGN(Datum datum_result, - DictionaryEncode(values, DictionaryEncodeOptions::Defaults())); + ASSERT_OK_AND_ASSIGN(Datum datum_result, DictionaryEncode(values)); std::shared_ptr result = datum_result.make_array(); const auto& dict_result = checked_cast(*result); @@ -710,8 +704,7 @@ TEST_F(TestHashKernel, NullEncodingSchemes) { std::shared_ptr expected = std::make_shared( dictionary_type, expected_mask_indices, expected_mask_dictionary); - ASSERT_OK_AND_ASSIGN(Datum datum_result, - DictionaryEncode(values, DictionaryEncodeOptions::Defaults())); + ASSERT_OK_AND_ASSIGN(Datum datum_result, DictionaryEncode(values)); std::shared_ptr result = datum_result.make_array(); AssertArraysEqual(*expected, *result); @@ -742,9 +735,7 @@ TEST_F(TestHashKernel, ChunkedArrayZeroChunk) { "[]"); AssertArraysEqual(*expected, *result_array); - ASSERT_OK_AND_ASSIGN( - Datum result_datum, - DictionaryEncode(chunked_array, DictionaryEncodeOptions::Defaults())); + ASSERT_OK_AND_ASSIGN(Datum result_datum, DictionaryEncode(chunked_array)); auto dict_type = dictionary(int32(), chunked_array->type()); ASSERT_EQ(result_datum.kind(), Datum::CHUNKED_ARRAY); diff --git a/cpp/src/arrow/dataset/partition.cc b/cpp/src/arrow/dataset/partition.cc index bb52e2d8fbd..2cd9fac1f3e 100644 --- a/cpp/src/arrow/dataset/partition.cc +++ b/cpp/src/arrow/dataset/partition.cc @@ -622,16 +622,27 @@ class StructDictionary { private: Status AddOne(Datum column, std::shared_ptr* fused_indices) { if (column.type()->id() == Type::DICTIONARY) { - // compute::DictionaryEncode doesn't support dictionary and, even if it did, it - // would be a null op and return a flat dictionary. In order to group by dictionary - // we would need to be able to create a nested dictionary. - return Status::NotImplemented( - "Cannot use column of type dictionary as grouping criteria"); + if (column.null_count() != 0) { + // TODO Optimize this by allowign DictionaryEncode to transfer a null-masked + // dictionary to a null-encoded dictionary. At the moment we decode and then + // encode causing one extra copy, and a potentially expansive decoding copy at + // that. 
+ ARROW_ASSIGN_OR_RAISE( + auto decoded_dictionary, + compute::Cast( + column, + std::static_pointer_cast(column.type())->value_type(), + compute::CastOptions())); + column = decoded_dictionary; + } + } + if (column.type()->id() != Type::DICTIONARY) { + compute::DictionaryEncodeOptions options; + options.null_encoding_behavior = + compute::DictionaryEncodeOptions::NullEncodingBehavior::ENCODE; + ARROW_ASSIGN_OR_RAISE(column, + compute::DictionaryEncode(std::move(column), options)); } - compute::DictionaryEncodeOptions options; - options.null_encoding_behavior = - compute::DictionaryEncodeOptions::NullEncodingBehavior::ENCODE; - ARROW_ASSIGN_OR_RAISE(column, compute::DictionaryEncode(std::move(column), options)); auto dict_column = column.array_as(); dictionaries_.push_back(dict_column->dictionary()); @@ -668,9 +679,7 @@ class StructDictionary { Status RestoreDictionaryEncoding(std::shared_ptr expected_type, Datum* column) { DCHECK_NE(column->type()->id(), Type::DICTIONARY); - ARROW_ASSIGN_OR_RAISE( - *column, compute::DictionaryEncode(std::move(*column), - compute::DictionaryEncodeOptions::Defaults())); + ARROW_ASSIGN_OR_RAISE(*column, compute::DictionaryEncode(std::move(*column))); if (expected_type->index_type()->id() == Type::INT32) { // dictionary_encode has already yielded the expected index_type diff --git a/cpp/src/arrow/dataset/partition_test.cc b/cpp/src/arrow/dataset/partition_test.cc index 876bc77b0ba..e7c4baf85ad 100644 --- a/cpp/src/arrow/dataset/partition_test.cc +++ b/cpp/src/arrow/dataset/partition_test.cc @@ -668,6 +668,25 @@ TEST(GroupTest, WithNulls) { {"a": "why", "b": null, "ids": [7]} ])"); + AssertGrouping({field("a", dictionary(int32(), utf8())), field("b", int32())}, + R"([ + {"a": "ex", "b": 0, "id": 0}, + {"a": null, "b": 0, "id": 1}, + {"a": null, "b": 0, "id": 2}, + {"a": "ex", "b": 1, "id": 3}, + {"a": null, "b": null, "id": 4}, + {"a": "ex", "b": 1, "id": 5}, + {"a": "ex", "b": 0, "id": 6}, + {"a": "why", "b": null, "id": 7} + ])", + R"([ + {"a": "ex", "b": 0, "ids": [0, 6]}, + {"a": null, "b": 0, "ids": [1, 2]}, + {"a": "ex", "b": 1, "ids": [3, 5]}, + {"a": null, "b": null, "ids": [4]}, + {"a": "why", "b": null, "ids": [7]} + ])"); + auto has_nulls = checked_pointer_cast( ArrayFromJSON(struct_({field("a", utf8()), field("b", int32())}), R"([ {"a": "ex", "b": 0}, diff --git a/cpp/src/arrow/pretty_print.cc b/cpp/src/arrow/pretty_print.cc index d61e6cde2b6..8c2ac376d1e 100644 --- a/cpp/src/arrow/pretty_print.cc +++ b/cpp/src/arrow/pretty_print.cc @@ -670,49 +670,4 @@ Status PrettyPrint(const Schema& schema, const PrettyPrintOptions& options, return Status::OK(); } -void GdbPrintArray(const Array& arr, int indent) { - PrettyPrintOptions options; - options.indent = indent; - auto print_st = PrettyPrint(arr, options, &std::cout); - if (!print_st.ok()) { - std::cout << "Could not print: " << print_st.message(); - } -} - -void GdbPrintRecordBatch(const RecordBatch& rb, int indent) { - PrettyPrintOptions options; - options.indent = indent; - auto print_st = PrettyPrint(rb, options, &std::cout); - if (!print_st.ok()) { - std::cout << "Could not print: " << print_st.message(); - } -} - -void GdbPrintTable(const Table& table, int indent) { - PrettyPrintOptions options; - options.indent = indent; - auto print_st = PrettyPrint(table, options, &std::cout); - if (!print_st.ok()) { - std::cout << "Could not print: " << print_st.message(); - } -} - -void GdbPrintChunkedArray(const ChunkedArray& chunked_arr, int indent) { - PrettyPrintOptions options; - options.indent = 
indent; - auto print_st = PrettyPrint(chunked_arr, options, &std::cout); - if (!print_st.ok()) { - std::cout << "Could not print: " << print_st.message(); - } -} - -void GdbPrintSchema(const Schema& schema, int indent) { - PrettyPrintOptions options; - options.indent = indent; - auto print_st = PrettyPrint(schema, options, &std::cout); - if (!print_st.ok()) { - std::cout << "Could not print: " << print_st.message(); - } -} - } // namespace arrow diff --git a/cpp/src/arrow/pretty_print.h b/cpp/src/arrow/pretty_print.h index 43948b8f149..9d2c72c7186 100644 --- a/cpp/src/arrow/pretty_print.h +++ b/cpp/src/arrow/pretty_print.h @@ -120,17 +120,4 @@ Status PrettyPrint(const Schema& schema, const PrettyPrintOptions& options, ARROW_EXPORT Status DebugPrint(const Array& arr, int indent); -// These print routines are used in the gdb pretty printers which are -// not capable of passing "out" params and do a poor job of overload resolution - -ARROW_EXPORT void GdbPrintArray(const Array& arr, int indent); - -ARROW_EXPORT void GdbPrintRecordBatch(const RecordBatch& rb, int indent); - -ARROW_EXPORT void GdbPrintTable(const Table& table, int indent); - -ARROW_EXPORT void GdbPrintChunkedArray(const ChunkedArray& chunked_arr, int indent); - -ARROW_EXPORT void GdbPrintSchema(const Schema& schema, int indent); - } // namespace arrow diff --git a/cpp/src/parquet/arrow/arrow_reader_writer_test.cc b/cpp/src/parquet/arrow/arrow_reader_writer_test.cc index 45c04ed5f81..c6304ec4213 100644 --- a/cpp/src/parquet/arrow/arrow_reader_writer_test.cc +++ b/cpp/src/parquet/arrow/arrow_reader_writer_test.cc @@ -885,8 +885,7 @@ TYPED_TEST(TestParquetIO, SingleColumnOptionalDictionaryWrite) { ASSERT_OK(NullableArray(SMALL_SIZE, 10, kDefaultSeed, &values)); - ASSERT_OK_AND_ASSIGN(Datum out, - DictionaryEncode(values, DictionaryEncodeOptions::Defaults())); + ASSERT_OK_AND_ASSIGN(Datum out, DictionaryEncode(values)); std::shared_ptr dict_values = MakeArray(out.array()); std::shared_ptr schema = MakeSimpleSchema(*dict_values->type(), Repetition::OPTIONAL); From 68cf487a7f4ea4ee1a156233a6509f583538f692 Mon Sep 17 00:00:00 2001 From: Weston Pace Date: Mon, 1 Feb 2021 15:44:51 -1000 Subject: [PATCH 06/33] Taking out an extraneous using that I missed in the last commit --- cpp/src/parquet/arrow/arrow_reader_writer_test.cc | 1 - 1 file changed, 1 deletion(-) diff --git a/cpp/src/parquet/arrow/arrow_reader_writer_test.cc b/cpp/src/parquet/arrow/arrow_reader_writer_test.cc index c6304ec4213..ca702152d61 100644 --- a/cpp/src/parquet/arrow/arrow_reader_writer_test.cc +++ b/cpp/src/parquet/arrow/arrow_reader_writer_test.cc @@ -79,7 +79,6 @@ using arrow::Status; using arrow::Table; using arrow::TimeUnit; using arrow::compute::DictionaryEncode; -using arrow::compute::DictionaryEncodeOptions; using arrow::internal::checked_cast; using arrow::internal::checked_pointer_cast; using arrow::internal::Iota; From ae0b8595ac7a4d22c8276488cfcf7c3405f2fc87 Mon Sep 17 00:00:00 2001 From: Weston Pace Date: Thu, 4 Feb 2021 10:46:49 -1000 Subject: [PATCH 07/33] WIP --- cpp/src/arrow/dataset/expression.cc | 2 ++ cpp/src/arrow/dataset/expression.h | 3 +++ cpp/src/arrow/dataset/partition_test.cc | 3 +++ 3 files changed, 8 insertions(+) diff --git a/cpp/src/arrow/dataset/expression.cc b/cpp/src/arrow/dataset/expression.cc index 56339430ee9..d5bcd3fb0eb 100644 --- a/cpp/src/arrow/dataset/expression.cc +++ b/cpp/src/arrow/dataset/expression.cc @@ -51,6 +51,8 @@ Expression::Expression(Parameter parameter) Expression literal(Datum lit) { return 
Expression(std::move(lit)); } +Expression null_literal() { return Expression(Datum()); } + Expression field_ref(FieldRef ref) { return Expression(Expression::Parameter{std::move(ref), {}}); } diff --git a/cpp/src/arrow/dataset/expression.h b/cpp/src/arrow/dataset/expression.h index 13c714b2d72..33ffdddb8a6 100644 --- a/cpp/src/arrow/dataset/expression.h +++ b/cpp/src/arrow/dataset/expression.h @@ -135,6 +135,9 @@ inline bool operator!=(const Expression& l, const Expression& r) { return !l.Equ ARROW_DS_EXPORT Expression literal(Datum lit); +ARROW_DS_EXPORT +Expression null_literal(); + template Expression literal(Arg&& arg) { return literal(Datum(std::forward(arg))); diff --git a/cpp/src/arrow/dataset/partition_test.cc b/cpp/src/arrow/dataset/partition_test.cc index e7c4baf85ad..91555f9d1fd 100644 --- a/cpp/src/arrow/dataset/partition_test.cc +++ b/cpp/src/arrow/dataset/partition_test.cc @@ -341,6 +341,9 @@ TEST_F(TestPartitioning, HivePartitioningFormat) { equal(field_ref("beta"), literal(3.25f))}), "alpha=0/beta=3.25"); + AssertFormat(equal(field_ref("alpha"), literal(MakeNullScalar(int32()))), + "alpha=_HIVE_DEFAULT_PARTITION_"); + // written_schema_ is incompatible with partitioning_'s schema written_schema_ = schema({field("alpha", utf8()), field("beta", utf8())}); AssertFormatError( From c941bae6ceb74608cb3087e69754daa00e3358bc Mon Sep 17 00:00:00 2001 From: Weston Pace Date: Thu, 4 Feb 2021 23:00:22 -1000 Subject: [PATCH 08/33] Adding the null fallback logic to the python half --- cpp/src/arrow/dataset/partition.cc | 24 +- cpp/src/arrow/dataset/partition.h | 19 +- cpp/src/arrow/dataset/partition_test.cc | 21 +- python/pyarrow/_dataset.pyx | 20 +- python/pyarrow/includes/libarrow_dataset.pxd | 7 +- python/pyarrow/tests/test_dataset.py | 1316 ++++++++++-------- 6 files changed, 781 insertions(+), 626 deletions(-) diff --git a/cpp/src/arrow/dataset/partition.cc b/cpp/src/arrow/dataset/partition.cc index 2cd9fac1f3e..595cce8021d 100644 --- a/cpp/src/arrow/dataset/partition.cc +++ b/cpp/src/arrow/dataset/partition.cc @@ -410,12 +410,17 @@ std::shared_ptr DirectoryPartitioning::MakeFactory( } util::optional HivePartitioning::ParseKey( - const std::string& segment) { + const std::string& segment, const std::string& null_fallback) { auto name_end = string_view(segment).find_first_of('='); + // Keep for backwards compatibility, this would be produced by arrow <= 3 if (name_end == string_view::npos) { return util::nullopt; } + auto value = segment.substr(name_end + 1); + if (value == null_fallback) { + return util::nullopt; + } return Key{segment.substr(0, name_end), segment.substr(name_end + 1)}; } @@ -424,7 +429,7 @@ std::vector HivePartitioning::ParseKeys( std::vector keys; for (const auto& segment : fs::internal::SplitAbstractPath(path)) { - if (auto key = ParseKey(segment)) { + if (auto key = ParseKey(segment, null_fallback_)) { keys.push_back(std::move(*key)); } } @@ -438,12 +443,10 @@ Result HivePartitioning::FormatValues(const ScalarVector& values) c for (int i = 0; i < schema_->num_fields(); ++i) { const std::string& name = schema_->field(i)->name(); - if (values[i] == nullptr) { - if (!NextValid(values, i)) break; - + if (values[i] == nullptr || !values[i]->is_valid) { // If no key is available just provide a placeholder segment to maintain the // field_index <-> path nesting relation - segments[i] = name; + segments[i] = name + "=" + null_fallback_; } else { segments[i] = name + "=" + values[i]->ToString(); } @@ -454,8 +457,8 @@ Result HivePartitioning::FormatValues(const 
ScalarVector& values) c class HivePartitioningFactory : public KeyValuePartitioningFactory { public: - explicit HivePartitioningFactory(PartitioningFactoryOptions options) - : KeyValuePartitioningFactory(options) {} + explicit HivePartitioningFactory(HivePartitioningFactoryOptions options) + : KeyValuePartitioningFactory(options), null_fallback_(options.null_fallback) {} std::string type_name() const override { return "hive"; } @@ -463,7 +466,7 @@ class HivePartitioningFactory : public KeyValuePartitioningFactory { const std::vector& paths) override { for (auto path : paths) { for (auto&& segment : fs::internal::SplitAbstractPath(path)) { - if (auto key = HivePartitioning::ParseKey(segment)) { + if (auto key = HivePartitioning::ParseKey(segment, null_fallback_)) { RETURN_NOT_OK(InsertRepr(key->name, key->value)); } } @@ -491,11 +494,12 @@ class HivePartitioningFactory : public KeyValuePartitioningFactory { } private: + const std::string null_fallback_; std::vector field_names_; }; std::shared_ptr HivePartitioning::MakeFactory( - PartitioningFactoryOptions options) { + HivePartitioningFactoryOptions options) { return std::shared_ptr(new HivePartitioningFactory(options)); } diff --git a/cpp/src/arrow/dataset/partition.h b/cpp/src/arrow/dataset/partition.h index 944434e64f7..5cdf7a1df66 100644 --- a/cpp/src/arrow/dataset/partition.h +++ b/cpp/src/arrow/dataset/partition.h @@ -92,6 +92,11 @@ struct PartitioningFactoryOptions { bool infer_dictionary = false; }; +struct HivePartitioningFactoryOptions : PartitioningFactoryOptions { + /// The hive partitioning scheme maps null to a hard coded fallback string. + std::string null_fallback; +}; + /// \brief PartitioningFactory provides creation of a partitioning when the /// specific schema must be inferred from available paths (no explicit schema is known). class ARROW_DS_EXPORT PartitioningFactory { @@ -175,6 +180,8 @@ class ARROW_DS_EXPORT DirectoryPartitioning : public KeyValuePartitioning { Result FormatValues(const ScalarVector& values) const override; }; +static constexpr char kDefaultHiveNullFallback[] = "__HIVE_DEFAULT_PARTITION__"; + /// \brief Multi-level, directory based partitioning /// originating from Apache Hive with all data files stored in the /// leaf directories. Data is partitioned by static values of a @@ -188,17 +195,21 @@ class ARROW_DS_EXPORT HivePartitioning : public KeyValuePartitioning { public: // If a field in schema is of dictionary type, the corresponding element of dictionaries // must be contain the dictionary of values for that field. 
- explicit HivePartitioning(std::shared_ptr schema, ArrayVector dictionaries = {}) - : KeyValuePartitioning(std::move(schema), std::move(dictionaries)) {} + explicit HivePartitioning(std::shared_ptr schema, ArrayVector dictionaries = {}, + std::string null_fallback = kDefaultHiveNullFallback) + : KeyValuePartitioning(std::move(schema), std::move(dictionaries)), + null_fallback_(null_fallback) {} std::string type_name() const override { return "hive"; } - static util::optional ParseKey(const std::string& segment); + static util::optional ParseKey(const std::string& segment, + const std::string& null_fallback); static std::shared_ptr MakeFactory( - PartitioningFactoryOptions = {}); + HivePartitioningFactoryOptions = {}); private: + const std::string null_fallback_; std::vector ParseKeys(const std::string& path) const override; Result FormatValues(const ScalarVector& values) const override; diff --git a/cpp/src/arrow/dataset/partition_test.cc b/cpp/src/arrow/dataset/partition_test.cc index 91555f9d1fd..2558af293da 100644 --- a/cpp/src/arrow/dataset/partition_test.cc +++ b/cpp/src/arrow/dataset/partition_test.cc @@ -258,6 +258,8 @@ TEST_F(TestPartitioning, DictionaryInference) { // successful dictionary inference AssertInspect({"/a/0"}, {DictStr("alpha"), DictInt("beta")}); AssertInspect({"/a/0", "/a/1"}, {DictStr("alpha"), DictInt("beta")}); + AssertInspect({"/a/0", "/a"}, {DictStr("alpha"), DictInt("beta")}); + AssertInspect({"/0/a", "/1"}, {DictInt("alpha"), DictStr("beta")}); AssertInspect({"/a/0", "/b/0", "/a/1", "/b/1"}, {DictStr("alpha"), DictInt("beta")}); AssertInspect({"/a/-", "/b/-", "/a/_", "/b/_"}, {DictStr("alpha"), DictStr("beta")}); } @@ -320,7 +322,7 @@ TEST_F(TestPartitioning, HivePartitioning) { TEST_F(TestPartitioning, HivePartitioningFormat) { partitioning_ = std::make_shared( - schema({field("alpha", int32()), field("beta", float32())})); + schema({field("alpha", int32()), field("beta", float32())}), ArrayVector(), "xyz"); written_schema_ = partitioning_->schema(); @@ -330,9 +332,9 @@ TEST_F(TestPartitioning, HivePartitioningFormat) { AssertFormat(and_(equal(field_ref("beta"), literal(3.25f)), equal(field_ref("alpha"), literal(0))), "alpha=0/beta=3.25"); - AssertFormat(equal(field_ref("alpha"), literal(0)), "alpha=0"); - AssertFormat(equal(field_ref("beta"), literal(3.25f)), "alpha/beta=3.25"); - AssertFormat(literal(true), ""); + AssertFormat(equal(field_ref("alpha"), literal(0)), "alpha=0/beta=xyz"); + AssertFormat(equal(field_ref("beta"), literal(3.25f)), "alpha=xyz/beta=3.25"); + AssertFormat(literal(true), "alpha=xyz/beta=xyz"); ASSERT_OK_AND_ASSIGN(written_schema_, written_schema_->AddField(0, field("gamma", utf8()))); @@ -342,7 +344,7 @@ TEST_F(TestPartitioning, HivePartitioningFormat) { "alpha=0/beta=3.25"); AssertFormat(equal(field_ref("alpha"), literal(MakeNullScalar(int32()))), - "alpha=_HIVE_DEFAULT_PARTITION_"); + "alpha=xyz/beta=xyz"); // written_schema_ is incompatible with partitioning_'s schema written_schema_ = schema({field("alpha", utf8()), field("beta", utf8())}); @@ -374,8 +376,9 @@ TEST_F(TestPartitioning, DiscoverHiveSchema) { } TEST_F(TestPartitioning, HiveDictionaryInference) { - PartitioningFactoryOptions options; + HivePartitioningFactoryOptions options; options.infer_dictionary = true; + options.null_fallback = "xyz"; factory_ = HivePartitioning::MakeFactory(options); // type is still int32 if possible @@ -387,6 +390,8 @@ TEST_F(TestPartitioning, HiveDictionaryInference) { // successful dictionary inference AssertInspect({"/alpha=a/beta=0"}, 
{DictStr("alpha"), DictInt("beta")}); AssertInspect({"/alpha=a/beta=0", "/alpha=a/1"}, {DictStr("alpha"), DictInt("beta")}); + AssertInspect({"/alpha=a/beta=0", "/alpha=xyz/beta=xyz"}, + {DictStr("alpha"), DictInt("beta")}); AssertInspect( {"/alpha=a/beta=0", "/alpha=b/beta=0", "/alpha=a/beta=1", "/alpha=b/beta=1"}, {DictStr("alpha"), DictInt("beta")}); @@ -396,7 +401,7 @@ TEST_F(TestPartitioning, HiveDictionaryInference) { } TEST_F(TestPartitioning, HiveDictionaryHasUniqueValues) { - PartitioningFactoryOptions options; + HivePartitioningFactoryOptions options; options.infer_dictionary = true; factory_ = HivePartitioning::MakeFactory(options); @@ -519,7 +524,7 @@ class RangePartitioning : public Partitioning { std::vector ranges; for (auto segment : fs::internal::SplitAbstractPath(path)) { - auto key = HivePartitioning::ParseKey(segment); + auto key = HivePartitioning::ParseKey(segment, ""); if (!key) { return Status::Invalid("can't parse '", segment, "' as a range"); } diff --git a/python/pyarrow/_dataset.pyx b/python/pyarrow/_dataset.pyx index c67dbc99d77..5fa2b118ed5 100644 --- a/python/pyarrow/_dataset.pyx +++ b/python/pyarrow/_dataset.pyx @@ -1546,7 +1546,7 @@ cdef class DirectoryPartitioning(Partitioning): Returns ------- - DirectoryPartitioningFactory + PartitioningFactory To be used in the FileSystemFactoryOptions. """ cdef: @@ -1590,6 +1590,8 @@ cdef class HivePartitioning(Partitioning): corresponding entry of `dictionaries` must be an array containing every value which may be taken by the corresponding column or an error will be raised in parsing. + null_fallback : str + If any field is None then this fallback will be used as a label Returns ------- @@ -1608,13 +1610,15 @@ cdef class HivePartitioning(Partitioning): cdef: CHivePartitioning* hive_partitioning - def __init__(self, Schema schema not None, dictionaries=None): + def __init__(self, Schema schema not None, dictionaries=None, null_fallback="__HIVE_DEFAULT_PARTITION__"): cdef: shared_ptr[CHivePartitioning] c_partitioning + c_string c_null_fallback = tobytes(null_fallback) c_partitioning = make_shared[CHivePartitioning]( pyarrow_unwrap_schema(schema), - _partitioning_dictionaries(schema, dictionaries) + _partitioning_dictionaries(schema, dictionaries), + c_null_fallback ) self.init( c_partitioning) @@ -1623,7 +1627,7 @@ cdef class HivePartitioning(Partitioning): self.hive_partitioning = sp.get() @staticmethod - def discover(infer_dictionary=False, max_partition_dictionary_size=0): + def discover(infer_dictionary=False, max_partition_dictionary_size=0, null_fallback="__HIVE_DEFAULT_PARTITION__"): """ Discover a HivePartitioning. @@ -1639,6 +1643,10 @@ cdef class HivePartitioning(Partitioning): Synonymous with infer_dictionary for backwards compatibility with 1.0: setting this to -1 or None is equivalent to passing infer_dictionary=True. + null_fallback : str, default "__HIVE_DEFAULT_PARTITION__" + When inferring a schema for partition fields this value will be + replaced by null. The default is set to __HIVE_DEFAULT_PARTITION__ + for compatibility with Spark Returns ------- @@ -1646,7 +1654,7 @@ cdef class HivePartitioning(Partitioning): To be used in the FileSystemFactoryOptions. 
""" cdef: - CPartitioningFactoryOptions c_options + CHivePartitioningFactoryOptions c_options if max_partition_dictionary_size in {-1, None}: infer_dictionary = True @@ -1657,6 +1665,8 @@ cdef class HivePartitioning(Partitioning): if infer_dictionary: c_options.infer_dictionary = True + c_options.null_fallback = tobytes(null_fallback) + return PartitioningFactory.wrap( CHivePartitioning.MakeFactory(c_options)) diff --git a/python/pyarrow/includes/libarrow_dataset.pxd b/python/pyarrow/includes/libarrow_dataset.pxd index 29f9738dedc..93bc0edddc1 100644 --- a/python/pyarrow/includes/libarrow_dataset.pxd +++ b/python/pyarrow/includes/libarrow_dataset.pxd @@ -274,6 +274,11 @@ cdef extern from "arrow/dataset/api.h" namespace "arrow::dataset" nogil: "arrow::dataset::PartitioningFactoryOptions": c_bool infer_dictionary + cdef cppclass CHivePartitioningFactoryOptions \ + "arrow::dataset::HivePartitioningFactoryOptions": + c_bool infer_dictionary, + c_string null_fallback + cdef cppclass CPartitioningFactory "arrow::dataset::PartitioningFactory": pass @@ -293,7 +298,7 @@ cdef extern from "arrow/dataset/api.h" namespace "arrow::dataset" nogil: @staticmethod shared_ptr[CPartitioningFactory] MakeFactory( - CPartitioningFactoryOptions) + CHivePartitioningFactoryOptions) cdef cppclass CPartitioningOrFactory \ "arrow::dataset::PartitioningOrFactory": diff --git a/python/pyarrow/tests/test_dataset.py b/python/pyarrow/tests/test_dataset.py index 796f6d998e8..48ef421694d 100644 --- a/python/pyarrow/tests/test_dataset.py +++ b/python/pyarrow/tests/test_dataset.py @@ -21,6 +21,7 @@ import textwrap import numpy as np +from numpy.core.fromnumeric import partition import pytest import pyarrow as pa @@ -49,23 +50,25 @@ def _generate_data(n): day = datetime.datetime(2000, 1, 1) interval = datetime.timedelta(days=5) - colors = itertools.cycle(['green', 'blue', 'yellow', 'red', 'orange']) + colors = itertools.cycle(["green", "blue", "yellow", "red", "orange"]) data = [] for i in range(n): data.append((day, i, float(i), next(colors))) day += interval - return pd.DataFrame(data, columns=['date', 'index', 'value', 'color']) + return pd.DataFrame(data, columns=["date", "index", "value", "color"]) def _table_from_pandas(df): - schema = pa.schema([ - pa.field('date', pa.date32()), - pa.field('index', pa.int64()), - pa.field('value', pa.float64()), - pa.field('color', pa.string()), - ]) + schema = pa.schema( + [ + pa.field("date", pa.date32()), + pa.field("index", pa.int64()), + pa.field("value", pa.float64()), + pa.field("color", pa.string()), + ] + ) table = pa.Table.from_pandas(df, schema=schema, preserve_index=False) return table.replace_schema_metadata() @@ -78,26 +81,28 @@ def mockfs(): mockfs = fs._MockFileSystem() directories = [ - 'subdir/1/xxx', - 'subdir/2/yyy', + "subdir/1/xxx", + "subdir/2/yyy", ] for i, directory in enumerate(directories): - path = '{}/file{}.parquet'.format(directory, i) + path = "{}/file{}.parquet".format(directory, i) mockfs.create_dir(directory) with mockfs.open_output_stream(path) as out: data = [ list(range(5)), list(map(float, range(5))), list(map(str, range(5))), - [i] * 5 + [i] * 5, ] - schema = pa.schema([ - pa.field('i64', pa.int64()), - pa.field('f64', pa.float64()), - pa.field('str', pa.string()), - pa.field('const', pa.int64()), - ]) + schema = pa.schema( + [ + pa.field("i64", pa.int64()), + pa.field("f64", pa.float64()), + pa.field("str", pa.string()), + pa.field("const", pa.int64()), + ] + ) batch = pa.record_batch(data, schema=schema) table = pa.Table.from_batches([batch]) @@ 
-138,10 +143,10 @@ def assert_opens(expected_opened): return fs, assert_opens -@pytest.fixture(scope='module') +@pytest.fixture(scope="module") def multisourcefs(request): - request.config.pyarrow.requires('pandas') - request.config.pyarrow.requires('parquet') + request.config.pyarrow.requires("pandas") + request.config.pyarrow.requires("parquet") import pyarrow.parquet as pq df = _generate_data(1000) @@ -153,35 +158,35 @@ def multisourcefs(request): # create a directory containing a flat sequence of parquet files without # any partitioning involved - mockfs.create_dir('plain') + mockfs.create_dir("plain") for i, chunk in enumerate(np.array_split(df_a, 10)): - path = 'plain/chunk-{}.parquet'.format(i) + path = "plain/chunk-{}.parquet".format(i) with mockfs.open_output_stream(path) as out: pq.write_table(_table_from_pandas(chunk), out) # create one with schema partitioning by weekday and color - mockfs.create_dir('schema') + mockfs.create_dir("schema") for part, chunk in df_b.groupby([df_b.date.dt.dayofweek, df_b.color]): - folder = 'schema/{}/{}'.format(*part) - path = '{}/chunk.parquet'.format(folder) + folder = "schema/{}/{}".format(*part) + path = "{}/chunk.parquet".format(folder) mockfs.create_dir(folder) with mockfs.open_output_stream(path) as out: pq.write_table(_table_from_pandas(chunk), out) # create one with hive partitioning by year and month - mockfs.create_dir('hive') + mockfs.create_dir("hive") for part, chunk in df_c.groupby([df_c.date.dt.year, df_c.date.dt.month]): - folder = 'hive/year={}/month={}'.format(*part) - path = '{}/chunk.parquet'.format(folder) + folder = "hive/year={}/month={}".format(*part) + path = "{}/chunk.parquet".format(folder) mockfs.create_dir(folder) with mockfs.open_output_stream(path) as out: pq.write_table(_table_from_pandas(chunk), out) # create one with hive partitioning by color - mockfs.create_dir('hive_color') + mockfs.create_dir("hive_color") for part, chunk in df_d.groupby(["color"]): - folder = 'hive_color/color={}'.format(*part) - path = '{}/chunk.parquet'.format(folder) + folder = "hive_color/color={}".format(*part) + path = "{}/chunk.parquet".format(folder) mockfs.create_dir(folder) with mockfs.open_output_stream(path) as out: pq.write_table(_table_from_pandas(chunk), out) @@ -192,36 +197,40 @@ def multisourcefs(request): @pytest.fixture def dataset(mockfs): format = ds.ParquetFileFormat() - selector = fs.FileSelector('subdir', recursive=True) - options = ds.FileSystemFactoryOptions('subdir') + selector = fs.FileSelector("subdir", recursive=True) + options = ds.FileSystemFactoryOptions("subdir") options.partitioning = ds.DirectoryPartitioning( - pa.schema([ - pa.field('group', pa.int32()), - pa.field('key', pa.string()) - ]) + pa.schema([pa.field("group", pa.int32()), pa.field("key", pa.string())]) ) factory = ds.FileSystemDatasetFactory(mockfs, selector, format, options) return factory.finish() def test_filesystem_dataset(mockfs): - schema = pa.schema([ - pa.field('const', pa.int64()) - ]) + schema = pa.schema([pa.field("const", pa.int64())]) file_format = ds.ParquetFileFormat() - paths = ['subdir/1/xxx/file0.parquet', 'subdir/2/yyy/file1.parquet'] - partitions = [ds.field('part') == x for x in range(1, 3)] - fragments = [file_format.make_fragment(path, mockfs, part) - for path, part in zip(paths, partitions)] - root_partition = ds.field('level') == ds.scalar(1337) + paths = ["subdir/1/xxx/file0.parquet", "subdir/2/yyy/file1.parquet"] + partitions = [ds.field("part") == x for x in range(1, 3)] + fragments = [ + 
file_format.make_fragment(path, mockfs, part) + for path, part in zip(paths, partitions) + ] + root_partition = ds.field("level") == ds.scalar(1337) dataset_from_fragments = ds.FileSystemDataset( - fragments, schema=schema, format=file_format, - filesystem=mockfs, root_partition=root_partition, + fragments, + schema=schema, + format=file_format, + filesystem=mockfs, + root_partition=root_partition, ) dataset_from_paths = ds.FileSystemDataset.from_paths( - paths, schema=schema, format=file_format, filesystem=mockfs, - partitions=partitions, root_partition=root_partition, + paths, + schema=schema, + format=file_format, + filesystem=mockfs, + partitions=partitions, + root_partition=root_partition, ) for dataset in [dataset_from_fragments, dataset_from_paths]: @@ -268,8 +277,9 @@ def test_filesystem_dataset(mockfs): ds.FileSystemDataset(fragments, file_format, schema) # validation of root_partition with pytest.raises(TypeError, match="incorrect type"): - ds.FileSystemDataset(fragments, schema=schema, - format=file_format, root_partition=1) + ds.FileSystemDataset( + fragments, schema=schema, format=file_format, root_partition=1 + ) # missing required argument in from_paths with pytest.raises(TypeError, match="incorrect type"): ds.FileSystemDataset.from_paths(fragments, format=file_format) @@ -277,15 +287,15 @@ def test_filesystem_dataset(mockfs): def test_filesystem_dataset_no_filesystem_interaction(): # ARROW-8283 - schema = pa.schema([ - pa.field('f1', pa.int64()) - ]) + schema = pa.schema([pa.field("f1", pa.int64())]) file_format = ds.IpcFileFormat() - paths = ['nonexistingfile.arrow'] + paths = ["nonexistingfile.arrow"] # creating the dataset itself doesn't raise dataset = ds.FileSystemDataset.from_paths( - paths, schema=schema, format=file_format, + paths, + schema=schema, + format=file_format, filesystem=fs.LocalFileSystem(), ) @@ -317,27 +327,27 @@ def test_dataset(dataset): assert isinstance(table, pa.Table) assert len(table) == 10 - condition = ds.field('i64') == 1 + condition = ds.field("i64") == 1 result = dataset.to_table(use_threads=True, filter=condition).to_pydict() # don't rely on the scanning order - assert result['i64'] == [1, 1] - assert result['f64'] == [1., 1.] 
- assert sorted(result['group']) == [1, 2] - assert sorted(result['key']) == ['xxx', 'yyy'] + assert result["i64"] == [1, 1] + assert result["f64"] == [1.0, 1.0] + assert sorted(result["group"]) == [1, 2] + assert sorted(result["key"]) == ["xxx", "yyy"] def test_scanner(dataset): - scanner = ds.Scanner.from_dataset(dataset, - memory_pool=pa.default_memory_pool()) + scanner = ds.Scanner.from_dataset(dataset, memory_pool=pa.default_memory_pool()) assert isinstance(scanner, ds.Scanner) assert len(list(scanner.scan())) == 2 with pytest.raises(pa.ArrowInvalid): - ds.Scanner.from_dataset(dataset, columns=['unknown']) + ds.Scanner.from_dataset(dataset, columns=["unknown"]) - scanner = ds.Scanner.from_dataset(dataset, columns=['i64'], - memory_pool=pa.default_memory_pool()) + scanner = ds.Scanner.from_dataset( + dataset, columns=["i64"], memory_pool=pa.default_memory_pool() + ) assert isinstance(scanner, ds.Scanner) assert len(list(scanner.scan())) == 2 @@ -358,46 +368,45 @@ def test_abstract_classes(): def test_partitioning(): - schema = pa.schema([ - pa.field('i64', pa.int64()), - pa.field('f64', pa.float64()) - ]) + schema = pa.schema([pa.field("i64", pa.int64()), pa.field("f64", pa.float64())]) for klass in [ds.DirectoryPartitioning, ds.HivePartitioning]: partitioning = klass(schema) assert isinstance(partitioning, ds.Partitioning) partitioning = ds.DirectoryPartitioning( - pa.schema([ - pa.field('group', pa.int64()), - pa.field('key', pa.float64()) - ]) + pa.schema([pa.field("group", pa.int64()), pa.field("key", pa.float64())]) ) - expr = partitioning.parse('/3/3.14') + expr = partitioning.parse("/3/3.14") assert isinstance(expr, ds.Expression) - expected = (ds.field('group') == 3) & (ds.field('key') == 3.14) + expected = (ds.field("group") == 3) & (ds.field("key") == 3.14) assert expr.equals(expected) with pytest.raises(pa.ArrowInvalid): - partitioning.parse('/prefix/3/aaa') + partitioning.parse("/prefix/3/aaa") partitioning = ds.HivePartitioning( - pa.schema([ - pa.field('alpha', pa.int64()), - pa.field('beta', pa.int64()) - ]) - ) - expr = partitioning.parse('/alpha=0/beta=3') - expected = ( - (ds.field('alpha') == ds.scalar(0)) & - (ds.field('beta') == ds.scalar(3)) + pa.schema([pa.field("alpha", pa.int64()), pa.field("beta", pa.int64())]) ) + expr = partitioning.parse("/alpha=0/beta=3") + expected = (ds.field("alpha") == ds.scalar(0)) & (ds.field("beta") == ds.scalar(3)) assert expr.equals(expected) - for shouldfail in ['/alpha=one/beta=2', '/alpha=one', '/beta=two']: + for shouldfail in ["/alpha=one/beta=2", "/alpha=one", "/beta=two"]: with pytest.raises(pa.ArrowInvalid): partitioning.parse(shouldfail) + partitioning = ds.HivePartitioning( + pa.schema([pa.field("alpha", pa.int64()), pa.field("beta", pa.int64())]), + None, + "xyz", + ) + expr = partitioning.parse("/alpha=xyz/beta=3") + expected = (ds.field("alpha") == ds.scalar(None)) & ( + ds.field("beta") == ds.scalar(3) + ) + assert expr.equals(expected) + def test_expression_serialization(): a = ds.scalar(1) @@ -405,14 +414,30 @@ def test_expression_serialization(): c = ds.scalar(True) d = ds.scalar("string") e = ds.scalar(None) - f = ds.scalar({'a': 1}) + f = ds.scalar({"a": 1}) g = ds.scalar(pa.scalar(1)) - all_exprs = [a, b, c, d, e, f, g, a == b, a > b, a & b, a | b, ~c, - d.is_valid(), a.cast(pa.int32(), safe=False), - a.cast(pa.int32(), safe=False), a.isin([1, 2, 3]), - ds.field('i64') > 5, ds.field('i64') == 5, - ds.field('i64') == 7] + all_exprs = [ + a, + b, + c, + d, + e, + f, + g, + a == b, + a > b, + a & b, + a | b, + ~c, 
+ d.is_valid(), + a.cast(pa.int32(), safe=False), + a.cast(pa.int32(), safe=False), + a.isin([1, 2, 3]), + ds.field("i64") > 5, + ds.field("i64") == 5, + ds.field("i64") == 7, + ] for expr in all_exprs: assert isinstance(expr, ds.Expression) restored = pickle.loads(pickle.dumps(expr)) @@ -460,13 +485,13 @@ def test_expression_boolean_operators(): def test_partition_keys(): - a, b, c = [ds.field(f) == f for f in 'abc'] - assert ds._get_partition_keys(a) == {'a': 'a'} - assert ds._get_partition_keys(a & b & c) == {f: f for f in 'abc'} + a, b, c = [ds.field(f) == f for f in "abc"] + assert ds._get_partition_keys(a) == {"a": "a"} + assert ds._get_partition_keys(a & b & c) == {f: f for f in "abc"} - nope = ds.field('d') >= 3 + nope = ds.field("d") >= 3 assert ds._get_partition_keys(nope) == {} - assert ds._get_partition_keys(a & nope) == {'a': 'a'} + assert ds._get_partition_keys(a & nope) == {"a": "a"} def test_parquet_read_options(): @@ -508,25 +533,24 @@ def test_file_format_pickling(): formats = [ ds.IpcFileFormat(), ds.CsvFileFormat(), - ds.CsvFileFormat(pa.csv.ParseOptions(delimiter='\t', - ignore_empty_lines=True)), + ds.CsvFileFormat(pa.csv.ParseOptions(delimiter="\t", ignore_empty_lines=True)), ds.ParquetFileFormat(), ds.ParquetFileFormat( read_options=ds.ParquetReadOptions(use_buffered_stream=True) ), ds.ParquetFileFormat( read_options={ - 'use_buffered_stream': True, - 'buffer_size': 4096, + "use_buffered_stream": True, + "buffer_size": 4096, } - ) + ), ] for file_format in formats: assert pickle.loads(pickle.dumps(file_format)) == file_format -@pytest.mark.parametrize('paths_or_selector', [ - fs.FileSelector('subdir', recursive=True), +@pytest.mark.parametrize( + "paths_or_selector", [ 'subdir/1/xxx/file0.parquet', 'subdir/2/yyy/file1.parquet', @@ -539,34 +563,33 @@ def test_filesystem_factory(mockfs, paths_or_selector, pre_buffer): pre_buffer=pre_buffer) ) - options = ds.FileSystemFactoryOptions('subdir') + options = ds.FileSystemFactoryOptions("subdir") options.partitioning = ds.DirectoryPartitioning( - pa.schema([ - pa.field('group', pa.int32()), - pa.field('key', pa.string()) - ]) + pa.schema([pa.field("group", pa.int32()), pa.field("key", pa.string())]) ) - assert options.partition_base_dir == 'subdir' - assert options.selector_ignore_prefixes == ['.', '_'] + assert options.partition_base_dir == "subdir" + assert options.selector_ignore_prefixes == [".", "_"] assert options.exclude_invalid_files is False - factory = ds.FileSystemDatasetFactory( - mockfs, paths_or_selector, format, options - ) + factory = ds.FileSystemDatasetFactory(mockfs, paths_or_selector, format, options) inspected_schema = factory.inspect() - assert factory.inspect().equals(pa.schema([ - pa.field('i64', pa.int64()), - pa.field('f64', pa.float64()), - pa.field('str', pa.dictionary(pa.int32(), pa.string())), - pa.field('const', pa.int64()), - pa.field('group', pa.int32()), - pa.field('key', pa.string()), - ]), check_metadata=False) + assert factory.inspect().equals( + pa.schema( + [ + pa.field("i64", pa.int64()), + pa.field("f64", pa.float64()), + pa.field("str", pa.dictionary(pa.int32(), pa.string())), + pa.field("const", pa.int64()), + pa.field("group", pa.int32()), + pa.field("key", pa.string()), + ] + ), + check_metadata=False, + ) assert isinstance(factory.inspect_schemas(), list) - assert isinstance(factory.finish(inspected_schema), - ds.FileSystemDataset) + assert isinstance(factory.finish(inspected_schema), ds.FileSystemDataset) assert factory.root_partition.equals(ds.scalar(True)) dataset = 
factory.finish() @@ -578,9 +601,9 @@ def test_filesystem_factory(mockfs, paths_or_selector, pre_buffer): expected_f64 = pa.array([0, 1, 2, 3, 4], type=pa.float64()) expected_str = pa.DictionaryArray.from_arrays( pa.array([0, 1, 2, 3, 4], type=pa.int32()), - pa.array("0 1 2 3 4".split(), type=pa.string()) + pa.array("0 1 2 3 4".split(), type=pa.string()), ) - for task, group, key in zip(scanner.scan(), [1, 2], ['xxx', 'yyy']): + for task, group, key in zip(scanner.scan(), [1, 2], ["xxx", "yyy"]): expected_group = pa.array([group] * 5, type=pa.int32()) expected_key = pa.array([key] * 5, type=pa.string()) expected_const = pa.array([group - 1] * 5, type=pa.int64()) @@ -601,15 +624,15 @@ def test_filesystem_factory(mockfs, paths_or_selector, pre_buffer): def test_make_fragment(multisourcefs): parquet_format = ds.ParquetFileFormat() - dataset = ds.dataset('/plain', filesystem=multisourcefs, - format=parquet_format) + dataset = ds.dataset("/plain", filesystem=multisourcefs, format=parquet_format) for path in dataset.files: fragment = parquet_format.make_fragment(path, multisourcefs) assert fragment.row_groups == [0] - row_group_fragment = parquet_format.make_fragment(path, multisourcefs, - row_groups=[0]) + row_group_fragment = parquet_format.make_fragment( + path, multisourcefs, row_groups=[0] + ) for f in [fragment, row_group_fragment]: assert isinstance(f, ds.ParquetFileFragment) assert f.path == path @@ -618,21 +641,23 @@ def test_make_fragment(multisourcefs): def test_make_csv_fragment_from_buffer(): - content = textwrap.dedent(""" + content = textwrap.dedent( + """ alpha,num,animal a,12,dog b,11,cat c,10,rabbit - """) - buffer = pa.py_buffer(content.encode('utf-8')) + """ + ) + buffer = pa.py_buffer(content.encode("utf-8")) csv_format = ds.CsvFileFormat() fragment = csv_format.make_fragment(buffer) - expected = pa.table([['a', 'b', 'c'], - [12, 11, 10], - ['dog', 'cat', 'rabbit']], - names=['alpha', 'num', 'animal']) + expected = pa.table( + [["a", "b", "c"], [12, 11, 10], ["dog", "cat", "rabbit"]], + names=["alpha", "num", "animal"], + ) assert fragment.to_table().equals(expected) pickled = pickle.loads(pickle.dumps(fragment)) @@ -644,29 +669,26 @@ def test_make_parquet_fragment_from_buffer(): import pyarrow.parquet as pq arrays = [ - pa.array(['a', 'b', 'c']), + pa.array(["a", "b", "c"]), pa.array([12, 11, 10]), - pa.array(['dog', 'cat', 'rabbit']) + pa.array(["dog", "cat", "rabbit"]), ] dictionary_arrays = [ arrays[0].dictionary_encode(), arrays[1], - arrays[2].dictionary_encode() + arrays[2].dictionary_encode(), ] dictionary_format = ds.ParquetFileFormat( read_options=ds.ParquetReadOptions( use_buffered_stream=True, buffer_size=4096, - dictionary_columns=['alpha', 'animal'] + dictionary_columns=["alpha", "animal"], ) ) - cases = [ - (arrays, ds.ParquetFileFormat()), - (dictionary_arrays, dictionary_format) - ] + cases = [(arrays, ds.ParquetFileFormat()), (dictionary_arrays, dictionary_format)] for arrays, format_ in cases: - table = pa.table(arrays, names=['alpha', 'num', 'animal']) + table = pa.table(arrays, names=["alpha", "num", "animal"]) out = pa.BufferOutputStream() pq.write_table(table, out) @@ -683,15 +705,13 @@ def _create_dataset_for_fragments(tempdir, chunk_size=None, filesystem=None): import pyarrow.parquet as pq table = pa.table( - [range(8), [1] * 8, ['a'] * 4 + ['b'] * 4], - names=['f1', 'f2', 'part'] + [range(8), [1] * 8, ["a"] * 4 + ["b"] * 4], names=["f1", "f2", "part"] ) path = str(tempdir / "test_parquet_dataset") # write_to_dataset currently requires pandas - 
pq.write_to_dataset(table, path, - partition_cols=["part"], chunk_size=chunk_size) + pq.write_to_dataset(table, path, partition_cols=["part"], chunk_size=chunk_size) dataset = ds.dataset( path, format="parquet", partitioning="hive", filesystem=filesystem ) @@ -709,11 +729,11 @@ def test_fragments(tempdir): assert len(fragments) == 2 f = fragments[0] - physical_names = ['f1', 'f2'] + physical_names = ["f1", "f2"] # file's schema does not include partition column assert f.physical_schema.names == physical_names assert f.format.inspect(f.path, f.filesystem) == f.physical_schema - assert f.partition_expression.equals(ds.field('part') == 'a') + assert f.partition_expression.equals(ds.field("part") == "a") # By default, the partition column is not part of the schema. result = f.to_table() @@ -723,13 +743,13 @@ def test_fragments(tempdir): # scanning fragment includes partition columns when given the proper # schema. result = f.to_table(schema=dataset.schema) - assert result.column_names == ['f1', 'f2', 'part'] + assert result.column_names == ["f1", "f2", "part"] assert result.equals(table.slice(0, 4)) assert f.physical_schema == result.schema.remove(2) # scanning fragments follow filter predicate - result = f.to_table(schema=dataset.schema, filter=ds.field('f1') < 2) - assert result.column_names == ['f1', 'f2', 'part'] + result = f.to_table(schema=dataset.schema, filter=ds.field("f1") < 2) + assert result.column_names == ["f1", "f2", "part"] @pytest.mark.pandas @@ -738,11 +758,11 @@ def test_fragments_implicit_cast(tempdir): # ARROW-8693 import pyarrow.parquet as pq - table = pa.table([range(8), [1] * 4 + [2] * 4], names=['col', 'part']) + table = pa.table([range(8), [1] * 4 + [2] * 4], names=["col", "part"]) path = str(tempdir / "test_parquet_dataset") pq.write_to_dataset(table, path, partition_cols=["part"]) - part = ds.partitioning(pa.schema([('part', 'int8')]), flavor="hive") + part = ds.partitioning(pa.schema([("part", "int8")]), flavor="hive") dataset = ds.dataset(path, format="parquet", partitioning=part) fragments = dataset.get_fragments(filter=ds.field("part") >= 2) assert len(list(fragments)) == 1 @@ -753,10 +773,8 @@ def test_fragments_implicit_cast(tempdir): def test_fragments_reconstruct(tempdir): table, dataset = _create_dataset_for_fragments(tempdir) - def assert_yields_projected(fragment, row_slice, - columns=None, filter=None): - actual = fragment.to_table( - schema=table.schema, columns=columns, filter=filter) + def assert_yields_projected(fragment, row_slice, columns=None, filter=None): + actual = fragment.to_table(schema=table.schema, columns=columns, filter=filter) column_names = columns if columns else table.column_names assert actual.column_names == column_names @@ -772,40 +790,52 @@ def assert_yields_projected(fragment, row_slice, # manually re-construct a fragment, with explicit schema new_fragment = parquet_format.make_fragment( - fragment.path, fragment.filesystem, - partition_expression=fragment.partition_expression) + fragment.path, + fragment.filesystem, + partition_expression=fragment.partition_expression, + ) assert new_fragment.to_table().equals(fragment.to_table()) assert_yields_projected(new_fragment, (0, 4)) # filter / column projection, inspected schema new_fragment = parquet_format.make_fragment( - fragment.path, fragment.filesystem, - partition_expression=fragment.partition_expression) - assert_yields_projected(new_fragment, (0, 2), filter=ds.field('f1') < 2) + fragment.path, + fragment.filesystem, + partition_expression=fragment.partition_expression, + ) + 
assert_yields_projected(new_fragment, (0, 2), filter=ds.field("f1") < 2) # filter requiring cast / column projection, inspected schema new_fragment = parquet_format.make_fragment( - fragment.path, fragment.filesystem, - partition_expression=fragment.partition_expression) - assert_yields_projected(new_fragment, (0, 2), - columns=['f1'], filter=ds.field('f1') < 2.0) + fragment.path, + fragment.filesystem, + partition_expression=fragment.partition_expression, + ) + assert_yields_projected( + new_fragment, (0, 2), columns=["f1"], filter=ds.field("f1") < 2.0 + ) # filter on the partition column new_fragment = parquet_format.make_fragment( - fragment.path, fragment.filesystem, - partition_expression=fragment.partition_expression) - assert_yields_projected(new_fragment, (0, 4), - filter=ds.field('part') == 'a') + fragment.path, + fragment.filesystem, + partition_expression=fragment.partition_expression, + ) + assert_yields_projected(new_fragment, (0, 4), filter=ds.field("part") == "a") # Fragments don't contain the partition's columns if not provided to the # `to_table(schema=...)` method. - pattern = (r'No match for FieldRef.Name\(part\) in ' + - fragment.physical_schema.to_string(False, False, False)) + pattern = ( + r"No match for FieldRef.Name\(part\) in " + + fragment.physical_schema.to_string(False, False, False) + ) with pytest.raises(ValueError, match=pattern): new_fragment = parquet_format.make_fragment( - fragment.path, fragment.filesystem, - partition_expression=fragment.partition_expression) - new_fragment.to_table(filter=ds.field('part') == 'a') + fragment.path, + fragment.filesystem, + partition_expression=fragment.partition_expression, + ) + new_fragment.to_table(filter=ds.field("part") == "a") @pytest.mark.pandas @@ -819,21 +849,21 @@ def test_fragments_parquet_row_groups(tempdir): row_group_fragments = list(fragment.split_by_row_group()) assert len(row_group_fragments) == fragment.num_row_groups == 2 result = row_group_fragments[0].to_table(schema=dataset.schema) - assert result.column_names == ['f1', 'f2', 'part'] + assert result.column_names == ["f1", "f2", "part"] assert len(result) == 2 assert result.equals(table.slice(0, 2)) assert row_group_fragments[0].row_groups is not None assert row_group_fragments[0].num_row_groups == 1 assert row_group_fragments[0].row_groups[0].statistics == { - 'f1': {'min': 0, 'max': 1}, - 'f2': {'min': 1, 'max': 1}, + "f1": {"min": 0, "max": 1}, + "f2": {"min": 1, "max": 1}, } - fragment = list(dataset.get_fragments(filter=ds.field('f1') < 1))[0] - row_group_fragments = list(fragment.split_by_row_group(ds.field('f1') < 1)) + fragment = list(dataset.get_fragments(filter=ds.field("f1") < 1))[0] + row_group_fragments = list(fragment.split_by_row_group(ds.field("f1") < 1)) assert len(row_group_fragments) == 1 - result = row_group_fragments[0].to_table(filter=ds.field('f1') < 1) + result = row_group_fragments[0].to_table(filter=ds.field("f1") < 1) assert len(result) == 1 @@ -841,15 +871,15 @@ def test_fragments_parquet_row_groups(tempdir): def test_fragments_parquet_num_row_groups(tempdir): import pyarrow.parquet as pq - table = pa.table({'a': range(8)}) + table = pa.table({"a": range(8)}) pq.write_table(table, tempdir / "test.parquet", row_group_size=2) dataset = ds.dataset(tempdir / "test.parquet", format="parquet") original_fragment = list(dataset.get_fragments())[0] # create fragment with subset of row groups fragment = original_fragment.format.make_fragment( - original_fragment.path, original_fragment.filesystem, - row_groups=[1, 3]) + 
original_fragment.path, original_fragment.filesystem, row_groups=[1, 3] + ) assert fragment.num_row_groups == 2 # ensure that parsing metadata preserves correct number of row groups fragment.ensure_complete_metadata() @@ -862,14 +892,16 @@ def test_fragments_parquet_num_row_groups(tempdir): def test_fragments_parquet_row_groups_dictionary(tempdir): import pandas as pd - df = pd.DataFrame(dict(col1=['a', 'b'], col2=[1, 2])) - df['col1'] = df['col1'].astype("category") + df = pd.DataFrame(dict(col1=["a", "b"], col2=[1, 2])) + df["col1"] = df["col1"].astype("category") import pyarrow.parquet as pq + pq.write_table(pa.table(df), tempdir / "test_filter_dictionary.parquet") import pyarrow.dataset as ds - dataset = ds.dataset(tempdir / 'test_filter_dictionary.parquet') + + dataset = ds.dataset(tempdir / "test_filter_dictionary.parquet") result = dataset.to_table(filter=ds.field("col1") == "a") assert (df.iloc[0] == result.to_pandas()).all().all() @@ -879,9 +911,7 @@ def test_fragments_parquet_row_groups_dictionary(tempdir): @pytest.mark.parquet def test_fragments_parquet_ensure_metadata(tempdir, open_logging_fs): fs, assert_opens = open_logging_fs - _, dataset = _create_dataset_for_fragments( - tempdir, chunk_size=2, filesystem=fs - ) + _, dataset = _create_dataset_for_fragments(tempdir, chunk_size=2, filesystem=fs) fragment = list(dataset.get_fragments())[0] # with default discovery, no metadata loaded @@ -931,38 +961,38 @@ def _create_dataset_all_types(tempdir, chunk_size=None): pa.array([1, 10, 42], pa.uint64()), pa.array([1.0, 10.0, 42.0], pa.float32()), pa.array([1.0, 10.0, 42.0], pa.float64()), - pa.array(['a', None, 'z'], pa.utf8()), - pa.array(['a', None, 'z'], pa.binary()), - pa.array([1, 10, 42], pa.timestamp('s')), - pa.array([1, 10, 42], pa.timestamp('ms')), - pa.array([1, 10, 42], pa.timestamp('us')), + pa.array(["a", None, "z"], pa.utf8()), + pa.array(["a", None, "z"], pa.binary()), + pa.array([1, 10, 42], pa.timestamp("s")), + pa.array([1, 10, 42], pa.timestamp("ms")), + pa.array([1, 10, 42], pa.timestamp("us")), pa.array([1, 10, 42], pa.date32()), pa.array([1, 10, 4200000000], pa.date64()), - pa.array([1, 10, 42], pa.time32('s')), - pa.array([1, 10, 42], pa.time64('us')), + pa.array([1, 10, 42], pa.time32("s")), + pa.array([1, 10, 42], pa.time64("us")), ], names=[ - 'boolean', - 'int8', - 'uint8', - 'int16', - 'uint16', - 'int32', - 'uint32', - 'int64', - 'uint64', - 'float', - 'double', - 'utf8', - 'binary', - 'ts[s]', - 'ts[ms]', - 'ts[us]', - 'date32', - 'date64', - 'time32', - 'time64', - ] + "boolean", + "int8", + "uint8", + "int16", + "uint16", + "int32", + "uint32", + "int64", + "uint64", + "float", + "double", + "utf8", + "binary", + "ts[s]", + "ts[ms]", + "ts[us]", + "date32", + "date64", + "time32", + "time64", + ], ) path = str(tempdir / "test_parquet_dataset_all_types") @@ -981,9 +1011,16 @@ def test_parquet_fragment_statistics(tempdir): fragment = list(dataset.get_fragments())[0] import datetime - def dt_s(x): return datetime.datetime(1970, 1, 1, 0, 0, x) - def dt_ms(x): return datetime.datetime(1970, 1, 1, 0, 0, 0, x*1000) - def dt_us(x): return datetime.datetime(1970, 1, 1, 0, 0, 0, x) + + def dt_s(x): + return datetime.datetime(1970, 1, 1, 0, 0, x) + + def dt_ms(x): + return datetime.datetime(1970, 1, 1, 0, 0, 0, x * 1000) + + def dt_us(x): + return datetime.datetime(1970, 1, 1, 0, 0, 0, x) + date = datetime.date time = datetime.time @@ -994,26 +1031,26 @@ def dt_us(x): return datetime.datetime(1970, 1, 1, 0, 0, 0, x) assert row_group.num_rows == 3 assert 
row_group.total_byte_size > 1000 assert row_group.statistics == { - 'boolean': {'min': False, 'max': True}, - 'int8': {'min': 1, 'max': 42}, - 'uint8': {'min': 1, 'max': 42}, - 'int16': {'min': 1, 'max': 42}, - 'uint16': {'min': 1, 'max': 42}, - 'int32': {'min': 1, 'max': 42}, - 'uint32': {'min': 1, 'max': 42}, - 'int64': {'min': 1, 'max': 42}, - 'uint64': {'min': 1, 'max': 42}, - 'float': {'min': 1.0, 'max': 42.0}, - 'double': {'min': 1.0, 'max': 42.0}, - 'utf8': {'min': 'a', 'max': 'z'}, - 'binary': {'min': b'a', 'max': b'z'}, - 'ts[s]': {'min': dt_s(1), 'max': dt_s(42)}, - 'ts[ms]': {'min': dt_ms(1), 'max': dt_ms(42)}, - 'ts[us]': {'min': dt_us(1), 'max': dt_us(42)}, - 'date32': {'min': date(1970, 1, 2), 'max': date(1970, 2, 12)}, - 'date64': {'min': date(1970, 1, 1), 'max': date(1970, 2, 18)}, - 'time32': {'min': time(0, 0, 1), 'max': time(0, 0, 42)}, - 'time64': {'min': time(0, 0, 0, 1), 'max': time(0, 0, 0, 42)}, + "boolean": {"min": False, "max": True}, + "int8": {"min": 1, "max": 42}, + "uint8": {"min": 1, "max": 42}, + "int16": {"min": 1, "max": 42}, + "uint16": {"min": 1, "max": 42}, + "int32": {"min": 1, "max": 42}, + "uint32": {"min": 1, "max": 42}, + "int64": {"min": 1, "max": 42}, + "uint64": {"min": 1, "max": 42}, + "float": {"min": 1.0, "max": 42.0}, + "double": {"min": 1.0, "max": 42.0}, + "utf8": {"min": "a", "max": "z"}, + "binary": {"min": b"a", "max": b"z"}, + "ts[s]": {"min": dt_s(1), "max": dt_s(42)}, + "ts[ms]": {"min": dt_ms(1), "max": dt_ms(42)}, + "ts[us]": {"min": dt_us(1), "max": dt_us(42)}, + "date32": {"min": date(1970, 1, 2), "max": date(1970, 2, 12)}, + "date64": {"min": date(1970, 1, 1), "max": date(1970, 2, 18)}, + "time32": {"min": time(0, 0, 1), "max": time(0, 0, 42)}, + "time64": {"min": time(0, 0, 0, 1), "max": time(0, 0, 0, 42)}, } @@ -1021,7 +1058,7 @@ def dt_us(x): return datetime.datetime(1970, 1, 1, 0, 0, 0, x) def test_parquet_fragment_statistics_nulls(tempdir): import pyarrow.parquet as pq - table = pa.table({'a': [0, 1, None, None], 'b': ['a', 'b', None, None]}) + table = pa.table({"a": [0, 1, None, None], "b": ["a", "b", None, None]}) pq.write_table(table, tempdir / "test.parquet", row_group_size=2) dataset = ds.dataset(tempdir / "test.parquet", format="parquet") @@ -1048,21 +1085,25 @@ def test_fragments_parquet_row_groups_predicate(tempdir): table, dataset = _create_dataset_for_fragments(tempdir, chunk_size=2) fragment = list(dataset.get_fragments())[0] - assert fragment.partition_expression.equals(ds.field('part') == 'a') + assert fragment.partition_expression.equals(ds.field("part") == "a") # predicate may reference a partition field not present in the # physical_schema if an explicit schema is provided to split_by_row_group # filter matches partition_expression: all row groups row_group_fragments = list( - fragment.split_by_row_group(filter=ds.field('part') == 'a', - schema=dataset.schema)) + fragment.split_by_row_group( + filter=ds.field("part") == "a", schema=dataset.schema + ) + ) assert len(row_group_fragments) == 2 # filter contradicts partition_expression: no row groups row_group_fragments = list( - fragment.split_by_row_group(filter=ds.field('part') == 'b', - schema=dataset.schema)) + fragment.split_by_row_group( + filter=ds.field("part") == "b", schema=dataset.schema + ) + ) assert len(row_group_fragments) == 0 @@ -1081,27 +1122,36 @@ def test_fragments_parquet_row_groups_reconstruct(tempdir): # manually re-construct row group fragments new_fragment = parquet_format.make_fragment( - fragment.path, fragment.filesystem, + 
fragment.path, + fragment.filesystem, partition_expression=fragment.partition_expression, - row_groups=[0]) + row_groups=[0], + ) result = new_fragment.to_table() assert result.equals(row_group_fragments[0].to_table()) # manually re-construct a row group fragment with filter/column projection new_fragment = parquet_format.make_fragment( - fragment.path, fragment.filesystem, + fragment.path, + fragment.filesystem, partition_expression=fragment.partition_expression, - row_groups={1}) - result = new_fragment.to_table(schema=table.schema, columns=['f1', 'part'], - filter=ds.field('f1') < 3, ) - assert result.column_names == ['f1', 'part'] + row_groups={1}, + ) + result = new_fragment.to_table( + schema=table.schema, + columns=["f1", "part"], + filter=ds.field("f1") < 3, + ) + assert result.column_names == ["f1", "part"] assert len(result) == 1 # out of bounds row group index new_fragment = parquet_format.make_fragment( - fragment.path, fragment.filesystem, + fragment.path, + fragment.filesystem, partition_expression=fragment.partition_expression, - row_groups={2}) + row_groups={2}, + ) with pytest.raises(IndexError, match="references row group 2"): new_fragment.to_table() @@ -1110,8 +1160,7 @@ def test_fragments_parquet_row_groups_reconstruct(tempdir): @pytest.mark.parquet def test_fragments_parquet_subset_ids(tempdir, open_logging_fs): fs, assert_opens = open_logging_fs - table, dataset = _create_dataset_for_fragments(tempdir, chunk_size=1, - filesystem=fs) + table, dataset = _create_dataset_for_fragments(tempdir, chunk_size=1, filesystem=fs) fragment = list(dataset.get_fragments())[0] # select with row group ids @@ -1138,8 +1187,7 @@ def test_fragments_parquet_subset_ids(tempdir, open_logging_fs): @pytest.mark.parquet def test_fragments_parquet_subset_filter(tempdir, open_logging_fs): fs, assert_opens = open_logging_fs - table, dataset = _create_dataset_for_fragments(tempdir, chunk_size=1, - filesystem=fs) + table, dataset = _create_dataset_for_fragments(tempdir, chunk_size=1, filesystem=fs) fragment = list(dataset.get_fragments())[0] # select with filter @@ -1181,62 +1229,62 @@ def test_fragments_parquet_subset_invalid(tempdir): def test_partitioning_factory(mockfs): - paths_or_selector = fs.FileSelector('subdir', recursive=True) + paths_or_selector = fs.FileSelector("subdir", recursive=True) format = ds.ParquetFileFormat() - options = ds.FileSystemFactoryOptions('subdir') - partitioning_factory = ds.DirectoryPartitioning.discover(['group', 'key']) + options = ds.FileSystemFactoryOptions("subdir") + partitioning_factory = ds.DirectoryPartitioning.discover(["group", "key"]) assert isinstance(partitioning_factory, ds.PartitioningFactory) options.partitioning_factory = partitioning_factory - factory = ds.FileSystemDatasetFactory( - mockfs, paths_or_selector, format, options - ) + factory = ds.FileSystemDatasetFactory(mockfs, paths_or_selector, format, options) inspected_schema = factory.inspect() # i64/f64 from data, group/key from "/1/xxx" and "/2/yyy" paths - expected_schema = pa.schema([ - ("i64", pa.int64()), - ("f64", pa.float64()), - ("str", pa.string()), - ("const", pa.int64()), - ("group", pa.int32()), - ("key", pa.string()), - ]) + expected_schema = pa.schema( + [ + ("i64", pa.int64()), + ("f64", pa.float64()), + ("str", pa.string()), + ("const", pa.int64()), + ("group", pa.int32()), + ("key", pa.string()), + ] + ) assert inspected_schema.equals(expected_schema) hive_partitioning_factory = ds.HivePartitioning.discover() assert isinstance(hive_partitioning_factory, 
ds.PartitioningFactory) -@pytest.mark.parametrize('infer_dictionary', [False, True]) +@pytest.mark.parametrize("infer_dictionary", [False, True]) def test_partitioning_factory_dictionary(mockfs, infer_dictionary): - paths_or_selector = fs.FileSelector('subdir', recursive=True) + paths_or_selector = fs.FileSelector("subdir", recursive=True) format = ds.ParquetFileFormat() - options = ds.FileSystemFactoryOptions('subdir') + options = ds.FileSystemFactoryOptions("subdir") options.partitioning_factory = ds.DirectoryPartitioning.discover( - ['group', 'key'], infer_dictionary=infer_dictionary) + ["group", "key"], infer_dictionary=infer_dictionary + ) - factory = ds.FileSystemDatasetFactory( - mockfs, paths_or_selector, format, options) + factory = ds.FileSystemDatasetFactory(mockfs, paths_or_selector, format, options) inferred_schema = factory.inspect() if infer_dictionary: expected_type = pa.dictionary(pa.int32(), pa.string()) - assert inferred_schema.field('key').type == expected_type + assert inferred_schema.field("key").type == expected_type table = factory.finish().to_table().combine_chunks() - actual = table.column('key').chunk(0) - expected = pa.array(['xxx'] * 5 + ['yyy'] * 5).dictionary_encode() + actual = table.column("key").chunk(0) + expected = pa.array(["xxx"] * 5 + ["yyy"] * 5).dictionary_encode() assert actual.equals(expected) # ARROW-9345 ensure filtering on the partition field works - table = factory.finish().to_table(filter=ds.field('key') == 'xxx') - actual = table.column('key').chunk(0) + table = factory.finish().to_table(filter=ds.field("key") == "xxx") + actual = table.column("key").chunk(0) expected = expected.slice(0, 5) assert actual.equals(expected) else: - assert inferred_schema.field('key').type == pa.string() + assert inferred_schema.field("key").type == pa.string() def test_partitioning_function(): @@ -1274,8 +1322,9 @@ def test_partitioning_function(): def _create_single_file(base_dir, table=None, row_group_size=None): import pyarrow.parquet as pq + if table is None: - table = pa.table({'a': range(9), 'b': [0.] * 4 + [1.] * 5}) + table = pa.table({"a": range(9), "b": [0.0] * 4 + [1.0] * 5}) path = base_dir / "test.parquet" pq.write_table(table, path, row_group_size=row_group_size) return table, path @@ -1283,10 +1332,11 @@ def _create_single_file(base_dir, table=None, row_group_size=None): def _create_directory_of_files(base_dir): import pyarrow.parquet as pq - table1 = pa.table({'a': range(9), 'b': [0.] * 4 + [1.] * 5}) + + table1 = pa.table({"a": range(9), "b": [0.0] * 4 + [1.0] * 5}) path1 = base_dir / "test1.parquet" pq.write_table(table1, path1) - table2 = pa.table({'a': range(9, 18), 'b': [0.] * 4 + [1.] 
* 5}) + table2 = pa.table({"a": range(9, 18), "b": [0.0] * 4 + [1.0] * 5}) path2 = base_dir / "test2.parquet" pq.write_table(table2, path2) return (table1, table2), (path1, path2) @@ -1343,13 +1393,8 @@ def test_open_dataset_list_of_files(tempdir): tables, (path1, path2) = _create_directory_of_files(tempdir) table = pa.concat_tables(tables) - datasets = [ - ds.dataset([path1, path2]), - ds.dataset([str(path1), str(path2)]) - ] - datasets += [ - pickle.loads(pickle.dumps(d)) for d in datasets - ] + datasets = [ds.dataset([path1, path2]), ds.dataset([str(path1), str(path2)])] + datasets += [pickle.loads(pickle.dumps(d)) for d in datasets] for dataset in datasets: assert dataset.schema.equals(table.schema) @@ -1358,7 +1403,7 @@ def test_open_dataset_list_of_files(tempdir): def test_construct_from_single_file(tempdir): - directory = tempdir / 'single-file' + directory = tempdir / "single-file" directory.mkdir() table, path = _create_single_file(directory) relative_path = path.relative_to(directory) @@ -1376,7 +1421,7 @@ def test_construct_from_single_file(tempdir): def test_construct_from_single_directory(tempdir): - directory = tempdir / 'single-directory' + directory = tempdir / "single-directory" directory.mkdir() tables, paths = _create_directory_of_files(directory) @@ -1396,7 +1441,7 @@ def test_construct_from_single_directory(tempdir): def test_construct_from_list_of_files(tempdir): # instantiate from a list of files - directory = tempdir / 'list-of-files' + directory = tempdir / "list-of-files" directory.mkdir() tables, paths = _create_directory_of_files(directory) @@ -1419,18 +1464,19 @@ def test_construct_from_list_of_files(tempdir): def test_construct_from_list_of_mixed_paths_fails(mockfs): # isntantiate from a list of mixed paths files = [ - 'subdir/1/xxx/file0.parquet', - 'subdir/1/xxx/doesnt-exist.parquet', + "subdir/1/xxx/file0.parquet", + "subdir/1/xxx/doesnt-exist.parquet", ] - with pytest.raises(FileNotFoundError, match='doesnt-exist'): + with pytest.raises(FileNotFoundError, match="doesnt-exist"): ds.dataset(files, filesystem=mockfs) def test_construct_from_mixed_child_datasets(mockfs): # isntantiate from a list of mixed paths - a = ds.dataset(['subdir/1/xxx/file0.parquet', - 'subdir/2/yyy/file1.parquet'], filesystem=mockfs) - b = ds.dataset('subdir', filesystem=mockfs) + a = ds.dataset( + ["subdir/1/xxx/file0.parquet", "subdir/2/yyy/file1.parquet"], filesystem=mockfs + ) + b = ds.dataset("subdir", filesystem=mockfs) dataset = ds.dataset([a, b]) @@ -1443,8 +1489,10 @@ def test_construct_from_mixed_child_datasets(mockfs): assert len(dataset.children) == 2 for child in dataset.children: - assert child.files == ['subdir/1/xxx/file0.parquet', - 'subdir/2/yyy/file1.parquet'] + assert child.files == [ + "subdir/1/xxx/file0.parquet", + "subdir/2/yyy/file1.parquet", + ] def test_construct_empty_dataset(): @@ -1453,10 +1501,7 @@ def test_construct_empty_dataset(): assert table.num_rows == 0 assert table.num_columns == 0 - empty = ds.dataset([], schema=pa.schema([ - ('a', pa.int64()), - ('a', pa.string()) - ])) + empty = ds.dataset([], schema=pa.schema([("a", pa.int64()), ("a", pa.string())])) table = empty.to_table() assert table.num_rows == 0 assert table.num_columns == 2 @@ -1464,17 +1509,13 @@ def test_construct_empty_dataset(): def test_construct_from_invalid_sources_raise(multisourcefs): child1 = ds.FileSystemDatasetFactory( - multisourcefs, - fs.FileSelector('/plain'), - format=ds.ParquetFileFormat() + multisourcefs, fs.FileSelector("/plain"), format=ds.ParquetFileFormat() ) 
child2 = ds.FileSystemDatasetFactory( - multisourcefs, - fs.FileSelector('/schema'), - format=ds.ParquetFileFormat() + multisourcefs, fs.FileSelector("/schema"), format=ds.ParquetFileFormat() ) - with pytest.raises(TypeError, match='Expected.*FileSystemDatasetFactory'): + with pytest.raises(TypeError, match="Expected.*FileSystemDatasetFactory"): ds.dataset([child1, child2]) expected = ( @@ -1495,7 +1536,8 @@ def test_construct_from_invalid_sources_raise(multisourcefs): @pytest.mark.parquet def test_open_dataset_partitioned_directory(tempdir): import pyarrow.parquet as pq - table = pa.table({'a': range(9), 'b': [0.] * 4 + [1.] * 5}) + + table = pa.table({"a": range(9), "b": [0.0] * 4 + [1.0] * 5}) path = tempdir / "dataset" path.mkdir() @@ -1510,15 +1552,13 @@ def test_open_dataset_partitioned_directory(tempdir): _check_dataset_from_path(path, full_table) # specify partition scheme with discovery - dataset = ds.dataset( - str(path), partitioning=ds.partitioning(flavor="hive")) + dataset = ds.dataset(str(path), partitioning=ds.partitioning(flavor="hive")) expected_schema = table.schema.append(pa.field("part", pa.int32())) assert dataset.schema.equals(expected_schema) # specify partition scheme with discovery and relative path with change_cwd(tempdir): - dataset = ds.dataset( - "dataset/", partitioning=ds.partitioning(flavor="hive")) + dataset = ds.dataset("dataset/", partitioning=ds.partitioning(flavor="hive")) expected_schema = table.schema.append(pa.field("part", pa.int32())) assert dataset.schema.equals(expected_schema) @@ -1529,14 +1569,15 @@ def test_open_dataset_partitioned_directory(tempdir): # specify partition scheme with explicit scheme dataset = ds.dataset( str(path), - partitioning=ds.partitioning( - pa.schema([("part", pa.int8())]), flavor="hive")) + partitioning=ds.partitioning(pa.schema([("part", pa.int8())]), flavor="hive"), + ) expected_schema = table.schema.append(pa.field("part", pa.int8())) assert dataset.schema.equals(expected_schema) result = dataset.to_table() expected = full_table.append_column( - "part", pa.array(np.repeat([0, 1, 2], 9), type=pa.int8())) + "part", pa.array(np.repeat([0, 1, 2], 9), type=pa.int8()) + ) assert result.equals(expected) @@ -1583,7 +1624,7 @@ def test_open_union_dataset(tempdir): def test_open_union_dataset_with_additional_kwargs(multisourcefs): - child = ds.dataset('/plain', filesystem=multisourcefs, format='parquet') + child = ds.dataset("/plain", filesystem=multisourcefs, format="parquet") with pytest.raises(ValueError, match="cannot pass any additional"): ds.dataset([child], format="parquet") @@ -1592,33 +1633,57 @@ def test_open_dataset_non_existing_file(): # ARROW-8213: Opening a dataset with a local incorrect path gives confusing # error message with pytest.raises(FileNotFoundError): - ds.dataset('i-am-not-existing.parquet', format='parquet') + ds.dataset("i-am-not-existing.parquet", format="parquet") - with pytest.raises(pa.ArrowInvalid, match='cannot be relative'): - ds.dataset('file:i-am-not-existing.parquet', format='parquet') + with pytest.raises(pa.ArrowInvalid, match="cannot be relative"): + ds.dataset("file:i-am-not-existing.parquet", format="parquet") @pytest.mark.parquet -@pytest.mark.parametrize('partitioning', ["directory", "hive"]) -@pytest.mark.parametrize('partition_keys', [ - (["A", "B", "C"], [1, 2, 3]), - ([1, 2, 3], ["A", "B", "C"]), - (["A", "B", "C"], ["D", "E", "F"]), - ([1, 2, 3], [4, 5, 6]), -]) -def test_open_dataset_partitioned_dictionary_type(tempdir, partitioning, - partition_keys): 
+@pytest.mark.parametrize("partitioning", ["directory", "hive"]) +@pytest.mark.parametrize("null_fallback", ["xyz", None]) +@pytest.mark.parametrize( + "partition_keys", + [ + (["A", "B", "C"], [1, 2, 3]), + ([1, 2, 3], ["A", "B", "C"]), + (["A", "B", "C"], ["D", "E", "F"]), + ([1, 2, 3], [4, 5, 6]), + ([1, None, 3], ["A", "B", "C"]), + ([1, 2, 3], ["A", None, "C"]), + ([None, 2, 3], [None, 2, 3]), + ], +) +def test_open_dataset_partitioned_dictionary_type( + tempdir, partitioning, null_fallback, partition_keys +): # ARROW-9288 / ARROW-9476 import pyarrow.parquet as pq - table = pa.table({'a': range(9), 'b': [0.] * 4 + [1.] * 5}) + + table = pa.table({"a": range(9), "b": [0.0] * 4 + [1.0] * 5}) + + if None in partition_keys[0] or None in partition_keys[1]: + # Directory partitioning can't handle the first part being null + return if partitioning == "directory": partitioning = ds.DirectoryPartitioning.discover( - ["part1", "part2"], infer_dictionary=True) + ["part1", "part2"], infer_dictionary=True + ) fmt = "{0}/{1}" + null_value = None else: - partitioning = ds.HivePartitioning.discover(infer_dictionary=True) + if null_fallback: + partitioning = ds.HivePartitioning.discover( + infer_dictionary=True, null_fallback=null_fallback + ) + else: + partitioning = ds.HivePartitioning.discover(infer_dictionary=True) fmt = "part1={0}/part2={1}" + if null_fallback: + null_value = null_fallback + else: + null_value = "__HIVE_DEFAULT_PARTITION__" basepath = tempdir / "dataset" basepath.mkdir() @@ -1626,7 +1691,7 @@ def test_open_dataset_partitioned_dictionary_type(tempdir, partitioning, part_keys1, part_keys2 = partition_keys for part1 in part_keys1: for part2 in part_keys2: - path = basepath / fmt.format(part1, part2) + path = basepath / fmt.format(part1 or null_value, part2 or null_value) path.mkdir(parents=True) pq.write_table(table, path / "test.parquet") @@ -1635,11 +1700,10 @@ def test_open_dataset_partitioned_dictionary_type(tempdir, partitioning, def dict_type(key): value_type = pa.string() if isinstance(key, str) else pa.int32() return pa.dictionary(pa.int32(), value_type) + expected_schema = table.schema.append( pa.field("part1", dict_type(part_keys1[0])) - ).append( - pa.field("part2", dict_type(part_keys2[0])) - ) + ).append(pa.field("part2", dict_type(part_keys2[0]))) assert dataset.schema.equals(expected_schema) @@ -1680,15 +1744,14 @@ def s3_example_simple(s3_connection, s3_server): import pyarrow.parquet as pq host, port, access_key, secret_key = s3_connection - uri = ( - "s3://{}:{}@mybucket/data.parquet?scheme=http&endpoint_override={}:{}" - .format(access_key, secret_key, host, port) + uri = "s3://{}:{}@mybucket/data.parquet?scheme=http&endpoint_override={}:{}".format( + access_key, secret_key, host, port ) fs, path = FileSystem.from_uri(uri) fs.create_dir("mybucket") - table = pa.table({'a': [1, 2, 3]}) + table = pa.table({"a": [1, 2, 3]}) with fs.open_output_stream("mybucket/data.parquet") as out: pq.write_table(table, out) @@ -1721,9 +1784,7 @@ def test_open_dataset_from_uri_s3_fsspec(s3_example_simple): fs = s3fs.S3FileSystem( key=access_key, secret=secret_key, - client_kwargs={ - 'endpoint_url': 'http://{}:{}'.format(host, port) - } + client_kwargs={"endpoint_url": "http://{}:{}".format(host, port)}, ) # passing as fsspec filesystem @@ -1743,18 +1804,18 @@ def test_open_dataset_from_s3_with_filesystem_uri(s3_connection, s3_server): import pyarrow.parquet as pq host, port, access_key, secret_key = s3_connection - bucket = 'theirbucket' - path = 'nested/folder/data.parquet' + 
bucket = "theirbucket" + path = "nested/folder/data.parquet" uri = "s3://{}:{}@{}/{}?scheme=http&endpoint_override={}:{}".format( access_key, secret_key, bucket, path, host, port ) fs, path = FileSystem.from_uri(uri) - assert path == 'theirbucket/nested/folder/data.parquet' + assert path == "theirbucket/nested/folder/data.parquet" fs.create_dir(bucket) - table = pa.table({'a': [1, 2, 3]}) + table = pa.table({"a": [1, 2, 3]}) with fs.open_output_stream(path) as out: pq.write_table(table, out) @@ -1763,27 +1824,25 @@ def test_open_dataset_from_s3_with_filesystem_uri(s3_connection, s3_server): assert dataset.to_table().equals(table) # passing filesystem as an uri - template = ( - "s3://{}:{}@{{}}?scheme=http&endpoint_override={}:{}".format( - access_key, secret_key, host, port - ) + template = "s3://{}:{}@{{}}?scheme=http&endpoint_override={}:{}".format( + access_key, secret_key, host, port ) cases = [ - ('theirbucket/nested/folder/', '/data.parquet'), - ('theirbucket/nested/folder', 'data.parquet'), - ('theirbucket/nested/', 'folder/data.parquet'), - ('theirbucket/nested', 'folder/data.parquet'), - ('theirbucket', '/nested/folder/data.parquet'), - ('theirbucket', 'nested/folder/data.parquet'), + ("theirbucket/nested/folder/", "/data.parquet"), + ("theirbucket/nested/folder", "data.parquet"), + ("theirbucket/nested/", "folder/data.parquet"), + ("theirbucket/nested", "folder/data.parquet"), + ("theirbucket", "/nested/folder/data.parquet"), + ("theirbucket", "nested/folder/data.parquet"), ] for prefix, path in cases: uri = template.format(prefix) dataset = ds.dataset(path, filesystem=uri, format="parquet") assert dataset.to_table().equals(table) - with pytest.raises(pa.ArrowInvalid, match='Missing bucket name'): - uri = template.format('/') - ds.dataset('/theirbucket/nested/folder/data.parquet', filesystem=uri) + with pytest.raises(pa.ArrowInvalid, match="Missing bucket name"): + uri = template.format("/") + ds.dataset("/theirbucket/nested/folder/data.parquet", filesystem=uri) error = ( "The path component of the filesystem URI must point to a directory " @@ -1791,17 +1850,17 @@ def test_open_dataset_from_s3_with_filesystem_uri(s3_connection, s3_server): "filesystem URI is `{}`" ) - path = 'theirbucket/doesnt/exist' + path = "theirbucket/doesnt/exist" uri = template.format(path) with pytest.raises(ValueError) as exc: - ds.dataset('data.parquet', filesystem=uri) - assert str(exc.value) == error.format('NotFound', path, uri) + ds.dataset("data.parquet", filesystem=uri) + assert str(exc.value) == error.format("NotFound", path, uri) - path = 'theirbucket/nested/folder/data.parquet' + path = "theirbucket/nested/folder/data.parquet" uri = template.format(path) with pytest.raises(ValueError) as exc: - ds.dataset('data.parquet', filesystem=uri) - assert str(exc.value) == error.format('File', path, uri) + ds.dataset("data.parquet", filesystem=uri) + assert str(exc.value) == error.format("File", path, uri) @pytest.mark.parquet @@ -1846,18 +1905,17 @@ def test_filter_timestamp(tempdir): @pytest.mark.parquet def test_filter_implicit_cast(tempdir): # ARROW-7652 - table = pa.table({'a': pa.array([0, 1, 2, 3, 4, 5], type=pa.int8())}) + table = pa.table({"a": pa.array([0, 1, 2, 3, 4, 5], type=pa.int8())}) _, path = _create_single_file(tempdir, table) dataset = ds.dataset(str(path)) - filter_ = ds.field('a') > 2 + filter_ = ds.field("a") > 2 assert len(dataset.to_table(filter=filter_)) == 3 def test_dataset_union(multisourcefs): child = ds.FileSystemDatasetFactory( - multisourcefs, fs.FileSelector('/plain'), - 
format=ds.ParquetFileFormat() + multisourcefs, fs.FileSelector("/plain"), format=ds.ParquetFileFormat() ) factory = ds.UnionDatasetFactory([child]) @@ -1870,106 +1928,128 @@ def test_dataset_union(multisourcefs): def test_union_dataset_from_other_datasets(tempdir, multisourcefs): - child1 = ds.dataset('/plain', filesystem=multisourcefs, format='parquet') - child2 = ds.dataset('/schema', filesystem=multisourcefs, format='parquet', - partitioning=['week', 'color']) - child3 = ds.dataset('/hive', filesystem=multisourcefs, format='parquet', - partitioning='hive') + child1 = ds.dataset("/plain", filesystem=multisourcefs, format="parquet") + child2 = ds.dataset( + "/schema", + filesystem=multisourcefs, + format="parquet", + partitioning=["week", "color"], + ) + child3 = ds.dataset( + "/hive", filesystem=multisourcefs, format="parquet", partitioning="hive" + ) assert child1.schema != child2.schema != child3.schema assembled = ds.dataset([child1, child2, child3]) assert isinstance(assembled, ds.UnionDataset) - msg = 'cannot pass any additional arguments' + msg = "cannot pass any additional arguments" with pytest.raises(ValueError, match=msg): ds.dataset([child1, child2], filesystem=multisourcefs) - expected_schema = pa.schema([ - ('date', pa.date32()), - ('index', pa.int64()), - ('value', pa.float64()), - ('color', pa.string()), - ('week', pa.int32()), - ('year', pa.int32()), - ('month', pa.int32()), - ]) + expected_schema = pa.schema( + [ + ("date", pa.date32()), + ("index", pa.int64()), + ("value", pa.float64()), + ("color", pa.string()), + ("week", pa.int32()), + ("year", pa.int32()), + ("month", pa.int32()), + ] + ) assert assembled.schema.equals(expected_schema) assert assembled.to_table().schema.equals(expected_schema) assembled = ds.dataset([child1, child3]) - expected_schema = pa.schema([ - ('date', pa.date32()), - ('index', pa.int64()), - ('value', pa.float64()), - ('color', pa.string()), - ('year', pa.int32()), - ('month', pa.int32()), - ]) + expected_schema = pa.schema( + [ + ("date", pa.date32()), + ("index", pa.int64()), + ("value", pa.float64()), + ("color", pa.string()), + ("year", pa.int32()), + ("month", pa.int32()), + ] + ) assert assembled.schema.equals(expected_schema) assert assembled.to_table().schema.equals(expected_schema) - expected_schema = pa.schema([ - ('month', pa.int32()), - ('color', pa.string()), - ('date', pa.date32()), - ]) + expected_schema = pa.schema( + [ + ("month", pa.int32()), + ("color", pa.string()), + ("date", pa.date32()), + ] + ) assembled = ds.dataset([child1, child3], schema=expected_schema) assert assembled.to_table().schema.equals(expected_schema) - expected_schema = pa.schema([ - ('month', pa.int32()), - ('color', pa.string()), - ('unknown', pa.string()) # fill with nulls - ]) + expected_schema = pa.schema( + [ + ("month", pa.int32()), + ("color", pa.string()), + ("unknown", pa.string()), # fill with nulls + ] + ) assembled = ds.dataset([child1, child3], schema=expected_schema) assert assembled.to_table().schema.equals(expected_schema) # incompatible schemas, date and index columns have conflicting types - table = pa.table([range(9), [0.] * 4 + [1.] 
* 5, 'abcdefghj'], - names=['date', 'value', 'index']) + table = pa.table( + [range(9), [0.0] * 4 + [1.0] * 5, "abcdefghj"], names=["date", "value", "index"] + ) _, path = _create_single_file(tempdir, table=table) child4 = ds.dataset(path) - with pytest.raises(pa.ArrowInvalid, match='Unable to merge'): + with pytest.raises(pa.ArrowInvalid, match="Unable to merge"): ds.dataset([child1, child4]) def test_dataset_from_a_list_of_local_directories_raises(multisourcefs): - msg = 'points to a directory, but only file paths are supported' + msg = "points to a directory, but only file paths are supported" with pytest.raises(IsADirectoryError, match=msg): - ds.dataset(['/plain', '/schema', '/hive'], filesystem=multisourcefs) + ds.dataset(["/plain", "/schema", "/hive"], filesystem=multisourcefs) def test_union_dataset_filesystem_datasets(multisourcefs): # without partitioning - dataset = ds.dataset([ - ds.dataset('/plain', filesystem=multisourcefs), - ds.dataset('/schema', filesystem=multisourcefs), - ds.dataset('/hive', filesystem=multisourcefs), - ]) - expected_schema = pa.schema([ - ('date', pa.date32()), - ('index', pa.int64()), - ('value', pa.float64()), - ('color', pa.string()), - ]) + dataset = ds.dataset( + [ + ds.dataset("/plain", filesystem=multisourcefs), + ds.dataset("/schema", filesystem=multisourcefs), + ds.dataset("/hive", filesystem=multisourcefs), + ] + ) + expected_schema = pa.schema( + [ + ("date", pa.date32()), + ("index", pa.int64()), + ("value", pa.float64()), + ("color", pa.string()), + ] + ) assert dataset.schema.equals(expected_schema) # with hive partitioning for two hive sources - dataset = ds.dataset([ - ds.dataset('/plain', filesystem=multisourcefs), - ds.dataset('/schema', filesystem=multisourcefs), - ds.dataset('/hive', filesystem=multisourcefs, partitioning='hive') - ]) - expected_schema = pa.schema([ - ('date', pa.date32()), - ('index', pa.int64()), - ('value', pa.float64()), - ('color', pa.string()), - ('year', pa.int32()), - ('month', pa.int32()), - ]) + dataset = ds.dataset( + [ + ds.dataset("/plain", filesystem=multisourcefs), + ds.dataset("/schema", filesystem=multisourcefs), + ds.dataset("/hive", filesystem=multisourcefs, partitioning="hive"), + ] + ) + expected_schema = pa.schema( + [ + ("date", pa.date32()), + ("index", pa.int64()), + ("value", pa.float64()), + ("color", pa.string()), + ("year", pa.int32()), + ("month", pa.int32()), + ] + ) assert dataset.schema.equals(expected_schema) @@ -1977,7 +2057,7 @@ def test_union_dataset_filesystem_datasets(multisourcefs): def test_specified_schema(tempdir): import pyarrow.parquet as pq - table = pa.table({'a': [1, 2, 3], 'b': [.1, .2, .3]}) + table = pa.table({"a": [1, 2, 3], "b": [0.1, 0.2, 0.3]}) pq.write_table(table, tempdir / "data.parquet") def _check_dataset(schema, expected, expected_schema=None): @@ -2000,24 +2080,24 @@ def _check_dataset(schema, expected, expected_schema=None): _check_dataset(schema, expected) # Specifying schema with change column order - schema = pa.schema([('b', 'float64'), ('a', 'int64')]) - expected = pa.table([[.1, .2, .3], [1, 2, 3]], names=['b', 'a']) + schema = pa.schema([("b", "float64"), ("a", "int64")]) + expected = pa.table([[0.1, 0.2, 0.3], [1, 2, 3]], names=["b", "a"]) _check_dataset(schema, expected) # Specifying schema with missing column - schema = pa.schema([('a', 'int64')]) - expected = pa.table([[1, 2, 3]], names=['a']) + schema = pa.schema([("a", "int64")]) + expected = pa.table([[1, 2, 3]], names=["a"]) _check_dataset(schema, expected) # Specifying schema with 
additional column - schema = pa.schema([('a', 'int64'), ('c', 'int32')]) - expected = pa.table([[1, 2, 3], - pa.array([None, None, None], type='int32')], - names=['a', 'c']) + schema = pa.schema([("a", "int64"), ("c", "int32")]) + expected = pa.table( + [[1, 2, 3], pa.array([None, None, None], type="int32")], names=["a", "c"] + ) _check_dataset(schema, expected) # Specifying with incompatible schema - schema = pa.schema([('a', 'int32'), ('b', 'float64')]) + schema = pa.schema([("a", "int32"), ("b", "float64")]) dataset = ds.dataset(str(tempdir / "data.parquet"), schema=schema) assert dataset.schema.equals(schema) with pytest.raises(TypeError): @@ -2025,10 +2105,14 @@ def _check_dataset(schema, expected, expected_schema=None): def test_ipc_format(tempdir): - table = pa.table({'a': pa.array([1, 2, 3], type="int8"), - 'b': pa.array([.1, .2, .3], type="float64")}) + table = pa.table( + { + "a": pa.array([1, 2, 3], type="int8"), + "b": pa.array([0.1, 0.2, 0.3], type="float64"), + } + ) - path = str(tempdir / 'test.arrow') + path = str(tempdir / "test.arrow") with pa.output_stream(path) as sink: writer = pa.RecordBatchFileWriter(sink, table.schema) writer.write_batch(table.to_batches()[0]) @@ -2046,17 +2130,21 @@ def test_ipc_format(tempdir): @pytest.mark.pandas def test_csv_format(tempdir): - table = pa.table({'a': pa.array([1, 2, 3], type="int64"), - 'b': pa.array([.1, .2, .3], type="float64")}) + table = pa.table( + { + "a": pa.array([1, 2, 3], type="int64"), + "b": pa.array([0.1, 0.2, 0.3], type="float64"), + } + ) - path = str(tempdir / 'test.csv') + path = str(tempdir / "test.csv") table.to_pandas().to_csv(path, index=False) dataset = ds.dataset(path, format=ds.CsvFileFormat()) result = dataset.to_table() assert result.equals(table) - dataset = ds.dataset(path, format='csv') + dataset = ds.dataset(path, format="csv") result = dataset.to_table() assert result.equals(table) @@ -2064,8 +2152,12 @@ def test_csv_format(tempdir): def test_feather_format(tempdir): from pyarrow.feather import write_feather - table = pa.table({'a': pa.array([1, 2, 3], type="int8"), - 'b': pa.array([.1, .2, .3], type="float64")}) + table = pa.table( + { + "a": pa.array([1, 2, 3], type="int8"), + "b": pa.array([0.1, 0.2, 0.3], type="float64"), + } + ) basedir = tempdir / "feather_dataset" basedir.mkdir() @@ -2097,16 +2189,15 @@ def _create_parquet_dataset_simple(root_path): metadata_collector = [] for i in range(4): - table = pa.table({'f1': [i] * 10, 'f2': np.random.randn(10)}) + table = pa.table({"f1": [i] * 10, "f2": np.random.randn(10)}) pq.write_to_dataset( table, str(root_path), metadata_collector=metadata_collector ) - metadata_path = str(root_path / '_metadata') + metadata_path = str(root_path / "_metadata") # write _metadata file pq.write_metadata( - table.schema, metadata_path, - metadata_collector=metadata_collector + table.schema, metadata_path, metadata_collector=metadata_collector ) return metadata_path, table @@ -2151,22 +2242,23 @@ def _create_metadata_file(root_path): metadata_collector.append(metadata) metadata_path = root_path / "_metadata" - pq.write_metadata( - schema, metadata_path, metadata_collector=metadata_collector - ) + pq.write_metadata(schema, metadata_path, metadata_collector=metadata_collector) return metadata_path def _create_parquet_dataset_partitioned(root_path): import pyarrow.parquet as pq - table = pa.table([ - pa.array(range(20)), pa.array(np.random.randn(20)), - pa.array(np.repeat(['a', 'b'], 10))], - names=["f1", "f2", "part"] + table = pa.table( + [ + pa.array(range(20)), + 
pa.array(np.random.randn(20)), + pa.array(np.repeat(["a", "b"], 10)), + ], + names=["f1", "f2", "part"], ) table = table.replace_schema_metadata({"key": "value"}) - pq.write_to_dataset(table, str(root_path), partition_cols=['part']) + pq.write_to_dataset(table, str(root_path), partition_cols=["part"]) return _create_metadata_file(root_path), table @@ -2219,9 +2311,8 @@ def test_parquet_dataset_lazy_filtering(tempdir, open_logging_fs): # creating the dataset should only open the metadata file with assert_opens([metadata_path]): dataset = ds.parquet_dataset( - metadata_path, - partitioning=ds.partitioning(flavor="hive"), - filesystem=fs) + metadata_path, partitioning=ds.partitioning(flavor="hive"), filesystem=fs + ) # materializing fragments should not open any file with assert_opens([]): @@ -2250,7 +2341,7 @@ def test_parquet_dataset_lazy_filtering(tempdir, open_logging_fs): @pytest.mark.pandas def test_dataset_schema_metadata(tempdir): # ARROW-8802 - df = pd.DataFrame({'a': [1, 2, 3]}) + df = pd.DataFrame({"a": [1, 2, 3]}) path = tempdir / "test.parquet" df.to_parquet(path) dataset = ds.dataset(path) @@ -2269,13 +2360,12 @@ def test_filter_mismatching_schema(tempdir): # ARROW-9146 import pyarrow.parquet as pq - table = pa.table({"col": pa.array([1, 2, 3, 4], type='int32')}) + table = pa.table({"col": pa.array([1, 2, 3, 4], type="int32")}) pq.write_table(table, str(tempdir / "data.parquet")) # specifying explicit schema, but that mismatches the schema of the data schema = pa.schema([("col", pa.int64())]) - dataset = ds.dataset( - tempdir / "data.parquet", format="parquet", schema=schema) + dataset = ds.dataset(tempdir / "data.parquet", format="parquet", schema=schema) # filtering on a column with such type mismatch should give a proper error with pytest.raises(TypeError): @@ -2292,65 +2382,70 @@ def test_dataset_project_only_partition_columns(tempdir): # ARROW-8729 import pyarrow.parquet as pq - table = pa.table({'part': 'a a b b'.split(), 'col': list(range(4))}) + table = pa.table({"part": "a a b b".split(), "col": list(range(4))}) - path = str(tempdir / 'test_dataset') - pq.write_to_dataset(table, path, partition_cols=['part']) - dataset = ds.dataset(path, partitioning='hive') + path = str(tempdir / "test_dataset") + pq.write_to_dataset(table, path, partition_cols=["part"]) + dataset = ds.dataset(path, partitioning="hive") all_cols = dataset.to_table(use_threads=False) - part_only = dataset.to_table(columns=['part'], use_threads=False) + part_only = dataset.to_table(columns=["part"], use_threads=False) - assert all_cols.column('part').equals(part_only.column('part')) + assert all_cols.column("part").equals(part_only.column("part")) @pytest.mark.parquet @pytest.mark.pandas def test_dataset_project_null_column(tempdir): import pandas as pd - df = pd.DataFrame({"col": np.array([None, None, None], dtype='object')}) + + df = pd.DataFrame({"col": np.array([None, None, None], dtype="object")}) f = tempdir / "test_dataset_project_null_column.parquet" df.to_parquet(f, engine="pyarrow") - dataset = ds.dataset(f, format="parquet", - schema=pa.schema([("col", pa.int64())])) - expected = pa.table({'col': pa.array([None, None, None], pa.int64())}) + dataset = ds.dataset(f, format="parquet", schema=pa.schema([("col", pa.int64())])) + expected = pa.table({"col": pa.array([None, None, None], pa.int64())}) assert dataset.to_table().equals(expected) -def _check_dataset_roundtrip(dataset, base_dir, expected_files, - base_dir_path=None, partitioning=None): +def _check_dataset_roundtrip( + dataset, base_dir, 
expected_files, base_dir_path=None, partitioning=None +): base_dir_path = base_dir_path or base_dir - ds.write_dataset(dataset, base_dir, format="feather", - partitioning=partitioning, use_threads=False) + ds.write_dataset( + dataset, + base_dir, + format="feather", + partitioning=partitioning, + use_threads=False, + ) # check that all files are present file_paths = list(base_dir_path.rglob("*")) assert set(file_paths) == set(expected_files) # check that reading back in as dataset gives the same result - dataset2 = ds.dataset( - base_dir_path, format="feather", partitioning=partitioning) + dataset2 = ds.dataset(base_dir_path, format="feather", partitioning=partitioning) assert dataset2.to_table().equals(dataset.to_table()) @pytest.mark.parquet def test_write_dataset(tempdir): # manually create a written dataset and read as dataset object - directory = tempdir / 'single-file' + directory = tempdir / "single-file" directory.mkdir() _ = _create_single_file(directory) dataset = ds.dataset(directory) # full string path - target = tempdir / 'single-file-target' + target = tempdir / "single-file-target" expected_files = [target / "part-0.feather"] _check_dataset_roundtrip(dataset, str(target), expected_files, target) # pathlib path object - target = tempdir / 'single-file-target2' + target = tempdir / "single-file-target2" expected_files = [target / "part-0.feather"] _check_dataset_roundtrip(dataset, target, expected_files, target) @@ -2362,12 +2457,12 @@ def test_write_dataset(tempdir): # dataset, './single-file-target3', expected_files, target) # Directory of files - directory = tempdir / 'single-directory' + directory = tempdir / "single-directory" directory.mkdir() _ = _create_directory_of_files(directory) dataset = ds.dataset(directory) - target = tempdir / 'single-directory-target' + target = tempdir / "single-directory-target" expected_files = [target / "part-0.feather"] _check_dataset_roundtrip(dataset, str(target), expected_files, target) @@ -2381,28 +2476,32 @@ def test_write_dataset_partitioned(tempdir): dataset = ds.dataset(directory, partitioning=partitioning) # hive partitioning - target = tempdir / 'partitioned-hive-target' + target = tempdir / "partitioned-hive-target" expected_paths = [ - target / "part=a", target / "part=a" / "part-0.feather", - target / "part=b", target / "part=b" / "part-1.feather" + target / "part=a", + target / "part=a" / "part-0.feather", + target / "part=b", + target / "part=b" / "part-1.feather", ] partitioning_schema = ds.partitioning( - pa.schema([("part", pa.string())]), flavor="hive") + pa.schema([("part", pa.string())]), flavor="hive" + ) _check_dataset_roundtrip( - dataset, str(target), expected_paths, target, - partitioning=partitioning_schema) + dataset, str(target), expected_paths, target, partitioning=partitioning_schema + ) # directory partitioning - target = tempdir / 'partitioned-dir-target' + target = tempdir / "partitioned-dir-target" expected_paths = [ - target / "a", target / "a" / "part-0.feather", - target / "b", target / "b" / "part-1.feather" + target / "a", + target / "a" / "part-0.feather", + target / "b", + target / "b" / "part-1.feather", ] - partitioning_schema = ds.partitioning( - pa.schema([("part", pa.string())])) + partitioning_schema = ds.partitioning(pa.schema([("part", pa.string())])) _check_dataset_roundtrip( - dataset, str(target), expected_paths, target, - partitioning=partitioning_schema) + dataset, str(target), expected_paths, target, partitioning=partitioning_schema + ) @pytest.mark.parquet @@ -2413,22 +2512,25 @@ 
def test_write_dataset_partitioned_dict(tempdir): # directory partitioning, dictionary partition columns dataset = ds.dataset( - directory, - partitioning=ds.HivePartitioning.discover(infer_dictionary=True)) - target = tempdir / 'partitioned-dir-target' + directory, partitioning=ds.HivePartitioning.discover(infer_dictionary=True) + ) + target = tempdir / "partitioned-dir-target" expected_paths = [ - target / "a", target / "a" / "part-0.feather", - target / "b", target / "b" / "part-1.feather" + target / "a", + target / "a" / "part-0.feather", + target / "b", + target / "b" / "part-1.feather", ] - partitioning = ds.partitioning(pa.schema([ - dataset.schema.field('part')]), - dictionaries={'part': pa.array(['a', 'b'])}) + partitioning = ds.partitioning( + pa.schema([dataset.schema.field("part")]), + dictionaries={"part": pa.array(["a", "b"])}, + ) # NB: dictionaries required here since we use partitioning to parse # directories in _check_dataset_roundtrip (not currently required for # the formatting step) _check_dataset_roundtrip( - dataset, str(target), expected_paths, target, - partitioning=partitioning) + dataset, str(target), expected_paths, target, partitioning=partitioning + ) @pytest.mark.parquet @@ -2438,18 +2540,15 @@ def test_write_dataset_use_threads(tempdir): _ = _create_parquet_dataset_partitioned(directory) dataset = ds.dataset(directory, partitioning="hive") - partitioning = ds.partitioning( - pa.schema([("part", pa.string())]), flavor="hive") + partitioning = ds.partitioning(pa.schema([("part", pa.string())]), flavor="hive") - target1 = tempdir / 'partitioned1' + target1 = tempdir / "partitioned1" ds.write_dataset( - dataset, target1, format="feather", partitioning=partitioning, - use_threads=True + dataset, target1, format="feather", partitioning=partitioning, use_threads=True ) - target2 = tempdir / 'partitioned2' + target2 = tempdir / "partitioned2" ds.write_dataset( - dataset, target2, format="feather", partitioning=partitioning, - use_threads=False + dataset, target2, format="feather", partitioning=partitioning, use_threads=False ) # check that reading in gives same result @@ -2459,14 +2558,19 @@ def test_write_dataset_use_threads(tempdir): def test_write_table(tempdir): - table = pa.table([ - pa.array(range(20)), pa.array(np.random.randn(20)), - pa.array(np.repeat(['a', 'b'], 10)) - ], names=["f1", "f2", "part"]) - - base_dir = tempdir / 'single' - ds.write_dataset(table, base_dir, - basename_template='dat_{i}.arrow', format="feather") + table = pa.table( + [ + pa.array(range(20)), + pa.array(np.random.randn(20)), + pa.array(np.repeat(["a", "b"], 10)), + ], + names=["f1", "f2", "part"], + ) + + base_dir = tempdir / "single" + ds.write_dataset( + table, base_dir, basename_template="dat_{i}.arrow", format="feather" + ) # check that all files are present file_paths = list(base_dir.rglob("*")) expected_paths = [base_dir / "dat_0.arrow"] @@ -2476,16 +2580,21 @@ def test_write_table(tempdir): assert result.equals(table) # with partitioning - base_dir = tempdir / 'partitioned' - partitioning = ds.partitioning( - pa.schema([("part", pa.string())]), flavor="hive") - ds.write_dataset(table, base_dir, format="feather", - basename_template='dat_{i}.arrow', - partitioning=partitioning) + base_dir = tempdir / "partitioned" + partitioning = ds.partitioning(pa.schema([("part", pa.string())]), flavor="hive") + ds.write_dataset( + table, + base_dir, + format="feather", + basename_template="dat_{i}.arrow", + partitioning=partitioning, + ) file_paths = list(base_dir.rglob("*")) 
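    # With hive-flavored partitioning, each distinct "part" value should land in
    # its own part=<value>/ directory; expected_paths below spells that out.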
expected_paths = [ - base_dir / "part=a", base_dir / "part=a" / "dat_0.arrow", - base_dir / "part=b", base_dir / "part=b" / "dat_1.arrow" + base_dir / "part=a", + base_dir / "part=a" / "dat_0.arrow", + base_dir / "part=b", + base_dir / "part=b" / "dat_1.arrow", ] assert set(file_paths) == set(expected_paths) result = ds.dataset(base_dir, format="ipc", partitioning=partitioning) @@ -2493,59 +2602,65 @@ def test_write_table(tempdir): def test_write_table_multiple_fragments(tempdir): - table = pa.table([ - pa.array(range(10)), pa.array(np.random.randn(10)), - pa.array(np.repeat(['a', 'b'], 5)) - ], names=["f1", "f2", "part"]) - table = pa.concat_tables([table]*2) + table = pa.table( + [ + pa.array(range(10)), + pa.array(np.random.randn(10)), + pa.array(np.repeat(["a", "b"], 5)), + ], + names=["f1", "f2", "part"], + ) + table = pa.concat_tables([table] * 2) # Table with multiple batches written as single Fragment by default - base_dir = tempdir / 'single' + base_dir = tempdir / "single" ds.write_dataset(table, base_dir, format="feather") assert set(base_dir.rglob("*")) == set([base_dir / "part-0.feather"]) assert ds.dataset(base_dir, format="ipc").to_table().equals(table) # Same for single-element list of Table - base_dir = tempdir / 'single-list' + base_dir = tempdir / "single-list" ds.write_dataset([table], base_dir, format="feather") assert set(base_dir.rglob("*")) == set([base_dir / "part-0.feather"]) assert ds.dataset(base_dir, format="ipc").to_table().equals(table) # Provide list of batches to write multiple fragments - base_dir = tempdir / 'multiple' + base_dir = tempdir / "multiple" ds.write_dataset(table.to_batches(), base_dir, format="feather") - assert set(base_dir.rglob("*")) == set( - [base_dir / "part-0.feather"]) + assert set(base_dir.rglob("*")) == set([base_dir / "part-0.feather"]) assert ds.dataset(base_dir, format="ipc").to_table().equals(table) # Provide list of tables to write multiple fragments - base_dir = tempdir / 'multiple-table' + base_dir = tempdir / "multiple-table" ds.write_dataset([table, table], base_dir, format="feather") - assert set(base_dir.rglob("*")) == set( - [base_dir / "part-0.feather"]) - assert ds.dataset(base_dir, format="ipc").to_table().equals( - pa.concat_tables([table]*2) + assert set(base_dir.rglob("*")) == set([base_dir / "part-0.feather"]) + assert ( + ds.dataset(base_dir, format="ipc") + .to_table() + .equals(pa.concat_tables([table] * 2)) ) def test_write_table_partitioned_dict(tempdir): # ensure writing table partitioned on a dictionary column works without # specifying the dictionary values explicitly - table = pa.table([ - pa.array(range(20)), - pa.array(np.repeat(['a', 'b'], 10)).dictionary_encode(), - ], names=['col', 'part']) + table = pa.table( + [ + pa.array(range(20)), + pa.array(np.repeat(["a", "b"], 10)).dictionary_encode(), + ], + names=["col", "part"], + ) partitioning = ds.partitioning(table.select(["part"]).schema) base_dir = tempdir / "dataset" - ds.write_dataset( - table, base_dir, format="feather", partitioning=partitioning - ) + ds.write_dataset(table, base_dir, format="feather", partitioning=partitioning) # check roundtrip partitioning_read = ds.DirectoryPartitioning.discover( - ["part"], infer_dictionary=True) + ["part"], infer_dictionary=True + ) result = ds.dataset( base_dir, format="ipc", partitioning=partitioning_read ).to_table() @@ -2556,14 +2671,18 @@ def test_write_table_partitioned_dict(tempdir): def test_write_dataset_parquet(tempdir): import pyarrow.parquet as pq - table = pa.table([ - pa.array(range(20)), 
pa.array(np.random.randn(20)), - pa.array(np.repeat(['a', 'b'], 10)) - ], names=["f1", "f2", "part"]) + table = pa.table( + [ + pa.array(range(20)), + pa.array(np.random.randn(20)), + pa.array(np.repeat(["a", "b"], 10)), + ], + names=["f1", "f2", "part"], + ) # using default "parquet" format string - base_dir = tempdir / 'parquet_dataset' + base_dir = tempdir / "parquet_dataset" ds.write_dataset(table, base_dir, format="parquet") # check that all files are present file_paths = list(base_dir.rglob("*")) @@ -2577,7 +2696,7 @@ def test_write_dataset_parquet(tempdir): for version in ["1.0", "2.0"]: format = ds.ParquetFileFormat() opts = format.make_write_options(version=version) - base_dir = tempdir / 'parquet_dataset_version{0}'.format(version) + base_dir = tempdir / "parquet_dataset_version{0}".format(version) ds.write_dataset(table, base_dir, format=format, file_options=opts) meta = pq.read_metadata(base_dir / "part-0.parquet") assert meta.format_version == version @@ -2602,12 +2721,12 @@ def test_write_dataset_schema_metadata(tempdir): # ensure that schema metadata gets written from pyarrow import feather - table = pa.table({'a': [1, 2, 3]}) - table = table.replace_schema_metadata({b'key': b'value'}) + table = pa.table({"a": [1, 2, 3]}) + table = table.replace_schema_metadata({b"key": b"value"}) ds.write_dataset(table, tempdir, format="feather") schema = feather.read_table(tempdir / "part-0.feather").schema - assert schema.metadata == {b'key': b'value'} + assert schema.metadata == {b"key": b"value"} @pytest.mark.parquet @@ -2615,12 +2734,12 @@ def test_write_dataset_schema_metadata_parquet(tempdir): # ensure that schema metadata gets written import pyarrow.parquet as pq - table = pa.table({'a': [1, 2, 3]}) - table = table.replace_schema_metadata({b'key': b'value'}) + table = pa.table({"a": [1, 2, 3]}) + table = table.replace_schema_metadata({b"key": b"value"}) ds.write_dataset(table, tempdir, format="parquet") schema = pq.read_table(tempdir / "part-0.parquet").schema - assert schema.metadata == {b'key': b'value'} + assert schema.metadata == {b"key": b"value"} @pytest.mark.parquet @@ -2628,22 +2747,23 @@ def test_write_dataset_schema_metadata_parquet(tempdir): def test_write_dataset_s3(s3_example_simple): # write dataset with s3 filesystem _, _, fs, _, host, port, access_key, secret_key = s3_example_simple - uri_template = ( - "s3://{}:{}@{{}}?scheme=http&endpoint_override={}:{}".format( - access_key, secret_key, host, port) + uri_template = "s3://{}:{}@{{}}?scheme=http&endpoint_override={}:{}".format( + access_key, secret_key, host, port ) - table = pa.table([ - pa.array(range(20)), pa.array(np.random.randn(20)), - pa.array(np.repeat(['a', 'b'], 10))], - names=["f1", "f2", "part"] + table = pa.table( + [ + pa.array(range(20)), + pa.array(np.random.randn(20)), + pa.array(np.repeat(["a", "b"], 10)), + ], + names=["f1", "f2", "part"], ) part = ds.partitioning(pa.schema([("part", pa.string())]), flavor="hive") # writing with filesystem object ds.write_dataset( - table, "mybucket/dataset", filesystem=fs, format="feather", - partitioning=part + table, "mybucket/dataset", filesystem=fs, format="feather", partitioning=part ) # check rountrip result = ds.dataset( From d502e05f7ab48dcf51001ffae1a9a5fc07bac792 Mon Sep 17 00:00:00 2001 From: Weston Pace Date: Mon, 8 Feb 2021 07:43:23 -1000 Subject: [PATCH 09/33] WIP --- python/pyarrow/_compute.pyx | 21 ++++++ python/pyarrow/array.pxi | 5 +- python/pyarrow/compute.py | 106 ++++++++++++++++----------- python/pyarrow/includes/libarrow.pxd | 13 ++++ 4 
files changed, 102 insertions(+), 43 deletions(-) diff --git a/python/pyarrow/_compute.pyx b/python/pyarrow/_compute.pyx index e5a19288b87..8dea882334a 100644 --- a/python/pyarrow/_compute.pyx +++ b/python/pyarrow/_compute.pyx @@ -648,6 +648,27 @@ class FilterOptions(_FilterOptions): def __init__(self, null_selection_behavior='drop'): self._set_options(null_selection_behavior) +cdef class _DictionaryEncodeOptions(FunctionOptions): + cdef: + unique_ptr[CDictionaryEncodeOptions] dictionary_encode_options + + cdef const CFunctionOptions* get_options(self) except NULL: + return self.dictionary_encode_options.get() + + def _set_options(self, null_encoding_behavior): + if null_encoding_behavior == 'encode': + self.dictionary_encode_options.reset( + new CDictionaryEncodeOptions(CDictionaryEncodeNullEncodingBehavior_ENCODE)) + elif null_encoding_behavior == 'mask': + self.dictionary_encode_options.reset( + new CDictionaryEncodeOptions(CDictionaryEncodeNullEncodingBehavior_MASK)) + else: + raise ValueError('"{}" is not a valid null_encoding_behavior'.format(null_encoding_behavior)) + +class DictionaryEncodeOptions(_DictionaryEncodeOptions): + def __init__(self, null_encoding_behavior='mask'): + self._set_options(null_encoding_behavior) + cdef class _TakeOptions(FunctionOptions): cdef: diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi index ae9e213b98d..a832b00b1eb 100644 --- a/python/pyarrow/array.pxi +++ b/python/pyarrow/array.pxi @@ -842,11 +842,12 @@ cdef class Array(_PandasConvertible): """ return _pc().call_function('unique', [self]) - def dictionary_encode(self): + def dictionary_encode(self, null_encoding='mask'): """ Compute dictionary-encoded representation of array. """ - return _pc().call_function('dictionary_encode', [self]) + options = _pc().DictionaryEncodeOptions(null_encoding) + return _pc().call_function('dictionary_encode', [self], options) def value_counts(self): """ diff --git a/python/pyarrow/compute.py b/python/pyarrow/compute.py index 616b2de89ec..1483d97a72d 100644 --- a/python/pyarrow/compute.py +++ b/python/pyarrow/compute.py @@ -30,6 +30,7 @@ ArraySortOptions, CastOptions, CountOptions, + DictionaryEncodeOptions, FilterOptions, MatchSubstringOptions, MinMaxOptions, @@ -68,14 +69,14 @@ def _get_arg_names(func): arg_names = ["left", "right"] else: raise NotImplementedError( - f"unsupported arity: {func.arity} (function: {func.name})") + f"unsupported arity: {func.arity} (function: {func.name})" + ) return arg_names def _decorate_compute_function(wrapper, exposed_name, func, option_class): - wrapper.__arrow_compute_function__ = dict(name=func.name, - arity=func.arity) + wrapper.__arrow_compute_function__ = dict(name=func.name, arity=func.arity) wrapper.__name__ = exposed_name wrapper.__qualname__ = exposed_name @@ -85,47 +86,64 @@ def _decorate_compute_function(wrapper, exposed_name, func, option_class): summary = cpp_doc.summary if not summary: arg_str = "arguments" if func.arity > 1 else "argument" - summary = ("Call compute function {!r} with the given {}" - .format(func.name, arg_str)) + summary = "Call compute function {!r} with the given {}".format( + func.name, arg_str + ) description = cpp_doc.description arg_names = _get_arg_names(func) - doc_pieces.append("""\ + doc_pieces.append( + """\ {}. 
- """.format(summary)) + """.format( + summary + ) + ) if description: doc_pieces.append("{}\n\n".format(description)) - doc_pieces.append("""\ + doc_pieces.append( + """\ Parameters ---------- - """) + """ + ) for arg_name in arg_names: - if func.kind in ('vector', 'scalar_aggregate'): - arg_type = 'Array-like' + if func.kind in ("vector", "scalar_aggregate"): + arg_type = "Array-like" else: - arg_type = 'Array-like or scalar-like' - doc_pieces.append("""\ + arg_type = "Array-like or scalar-like" + doc_pieces.append( + """\ {} : {} Argument to compute function - """.format(arg_name, arg_type)) + """.format( + arg_name, arg_type + ) + ) - doc_pieces.append("""\ + doc_pieces.append( + """\ memory_pool : pyarrow.MemoryPool, optional If not passed, will allocate memory from the default memory pool. - """) + """ + ) if option_class is not None: - doc_pieces.append("""\ + doc_pieces.append( + """\ options : pyarrow.compute.{0}, optional Parameters altering compute function semantics **kwargs : optional Parameters for {0} constructor. Either `options` or `**kwargs` can be passed, but not both at the same time. - """.format(option_class.__name__)) + """.format( + option_class.__name__ + ) + ) wrapper.__doc__ = "".join(dedent(s) for s in doc_pieces) return wrapper @@ -138,8 +156,9 @@ def _get_options_class(func): try: return globals()[class_name] except KeyError: - warnings.warn("Python binding for {} not exposed" - .format(class_name), RuntimeWarning) + warnings.warn( + "Python binding for {} not exposed".format(class_name), RuntimeWarning + ) return None @@ -149,8 +168,8 @@ def _handle_options(name, option_class, options, kwargs): return option_class(**kwargs) raise TypeError( "Function {!r} called with both an 'options' argument " - "and additional named arguments" - .format(name)) + "and additional named arguments".format(name) + ) if options is not None: if isinstance(options, dict): @@ -158,20 +177,25 @@ def _handle_options(name, option_class, options, kwargs): elif isinstance(options, option_class): return options raise TypeError( - "Function {!r} expected a {} parameter, got {}" - .format(name, option_class, type(options))) + "Function {!r} expected a {} parameter, got {}".format( + name, option_class, type(options) + ) + ) return options -_wrapper_template = dedent("""\ +_wrapper_template = dedent( + """\ def make_wrapper(func, option_class): def {func_name}({args_sig}{kwonly}, memory_pool=None): return func.call([{args_sig}], None, memory_pool) return {func_name} - """) + """ +) -_wrapper_options_template = dedent("""\ +_wrapper_options_template = dedent( + """\ def make_wrapper(func, option_class): def {func_name}({args_sig}{kwonly}, options=None, memory_pool=None, **kwargs): @@ -179,14 +203,15 @@ def {func_name}({args_sig}{kwonly}, options=None, memory_pool=None, kwargs) return func.call([{args_sig}], options, memory_pool) return {func_name} - """) + """ +) def _wrap_function(name, func): option_class = _get_options_class(func) arg_names = _get_arg_names(func) - args_sig = ', '.join(arg_names) - kwonly = '' if arg_names[-1].startswith('*') else ', *' + args_sig = ", ".join(arg_names) + kwonly = "" if arg_names[-1].startswith("*") else ", *" # Generate templated wrapper, so that the signature matches # the documented argument names. 
@@ -195,9 +220,10 @@ def _wrap_function(name, func): template = _wrapper_options_template else: template = _wrapper_template - exec(template.format(func_name=name, args_sig=args_sig, kwonly=kwonly), - globals(), ns) - wrapper = ns['make_wrapper'](func, option_class) + exec( + template.format(func_name=name, args_sig=args_sig, kwonly=kwonly), globals(), ns + ) + wrapper = ns["make_wrapper"](func, option_class) return _decorate_compute_function(wrapper, name, func, option_class) @@ -213,8 +239,7 @@ def _make_global_functions(): reg = function_registry() # Avoid clashes with Python keywords - rewrites = {'and': 'and_', - 'or': 'or_'} + rewrites = {"and": "and_", "or": "or_"} for cpp_name in reg.list_functions(): name = rewrites.get(cpp_name, cpp_name) @@ -298,8 +323,7 @@ def match_substring(array, pattern): ------- result : pyarrow.Array or pyarrow.ChunkedArray """ - return call_function("match_substring", [array], - MatchSubstringOptions(pattern)) + return call_function("match_substring", [array], MatchSubstringOptions(pattern)) def sum(array): @@ -314,7 +338,7 @@ def sum(array): ------- sum : pyarrow.Scalar """ - return call_function('sum', [array]) + return call_function("sum", [array]) def mode(array, n=1): @@ -346,7 +370,7 @@ def mode(array, n=1): return call_function("mode", [array], options) -def filter(data, mask, null_selection_behavior='drop'): +def filter(data, mask, null_selection_behavior="drop"): """ Select values (or records) from array- or table-like data given boolean filter, where true values are selected. @@ -387,7 +411,7 @@ def filter(data, mask, null_selection_behavior='drop'): ] """ options = FilterOptions(null_selection_behavior) - return call_function('filter', [data, mask], options) + return call_function("filter", [data, mask], options) def take(data, indices, *, boundscheck=True, memory_pool=None): @@ -428,7 +452,7 @@ def take(data, indices, *, boundscheck=True, memory_pool=None): ] """ options = TakeOptions(boundscheck=boundscheck) - return call_function('take', [data, indices], options, memory_pool) + return call_function("take", [data, indices], options, memory_pool) def fill_null(values, fill_value): diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd index e10ef1e3a5e..983ee0df0f1 100644 --- a/python/pyarrow/includes/libarrow.pxd +++ b/python/pyarrow/includes/libarrow.pxd @@ -1802,6 +1802,19 @@ cdef extern from "arrow/compute/api.h" namespace "arrow::compute" nogil: CFilterOptions(CFilterNullSelectionBehavior null_selection) CFilterNullSelectionBehavior null_selection_behavior + enum CDictionaryEncodeNullEncodingBehavior \ + "arrow::compute::DictionaryEncodeOptions::NullEncodingBehavior": + CDictionaryEncodeNullEncodingBehavior_ENCODE \ + "arrow::compute::DictionaryEncodeOptions::ENCODE" + CDictionaryEncodeNullEncodingBehavior_MASK \ + "arrow::compute::DictionaryEncodeOptions::MASK" + + cdef cppclass CDictionaryEncodeOptions \ + "arrow::compute::DictionaryEncodeOptions"(CFunctionOptions): + CDictionaryEncodeOptions() + CDictionaryEncodeOptions(CDictionaryEncodeNullEncodingBehavior null_encoding) + CDictionaryEncodeNullEncodingBehavior null_encoding + cdef cppclass CTakeOptions \ " arrow::compute::TakeOptions"(CFunctionOptions): CTakeOptions(c_bool boundscheck) From 5b18c961ad265d2600fd6fa9c14219cfdc8335b1 Mon Sep 17 00:00:00 2001 From: Weston Pace Date: Wed, 10 Feb 2021 22:51:05 -1000 Subject: [PATCH 10/33] WIP --- cpp/src/arrow/dataset/expression.cc | 4 +++- cpp/src/arrow/dataset/expression.h | 2 +- 
cpp/src/arrow/dataset/expression_test.cc | 9 ++++++++- cpp/src/arrow/dataset/partition.cc | 14 +++++++++----- cpp/src/arrow/dataset/partition.h | 1 + cpp/src/arrow/dataset/partition_test.cc | 23 ++++++++++++++++------- python/pyarrow/table.pxi | 5 +++-- 7 files changed, 41 insertions(+), 17 deletions(-) diff --git a/cpp/src/arrow/dataset/expression.cc b/cpp/src/arrow/dataset/expression.cc index d5bcd3fb0eb..6afe9309c54 100644 --- a/cpp/src/arrow/dataset/expression.cc +++ b/cpp/src/arrow/dataset/expression.cc @@ -51,7 +51,9 @@ Expression::Expression(Parameter parameter) Expression literal(Datum lit) { return Expression(std::move(lit)); } -Expression null_literal() { return Expression(Datum()); } +Expression null_literal(const std::shared_ptr& type) { + return Expression(MakeNullScalar(type)); +} Expression field_ref(FieldRef ref) { return Expression(Expression::Parameter{std::move(ref), {}}); diff --git a/cpp/src/arrow/dataset/expression.h b/cpp/src/arrow/dataset/expression.h index 33ffdddb8a6..79d7f077f23 100644 --- a/cpp/src/arrow/dataset/expression.h +++ b/cpp/src/arrow/dataset/expression.h @@ -136,7 +136,7 @@ ARROW_DS_EXPORT Expression literal(Datum lit); ARROW_DS_EXPORT -Expression null_literal(); +Expression null_literal(const std::shared_ptr& type); template Expression literal(Arg&& arg) { diff --git a/cpp/src/arrow/dataset/expression_test.cc b/cpp/src/arrow/dataset/expression_test.cc index 2f0110255ec..81a9c74fad1 100644 --- a/cpp/src/arrow/dataset/expression_test.cc +++ b/cpp/src/arrow/dataset/expression_test.cc @@ -250,6 +250,9 @@ TEST(Expression, Hash) { EXPECT_FALSE(set.emplace(literal(1)).second) << "already inserted"; EXPECT_TRUE(set.emplace(literal(3)).second); + EXPECT_TRUE(set.emplace(null_literal(int32())).second); + EXPECT_FALSE(set.emplace(null_literal(int32())).second) << "already inserted"; + EXPECT_TRUE(set.emplace(null_literal(float32())).second); // NB: no validation on construction; we couldn't execute // add with zero arguments EXPECT_TRUE(set.emplace(call("add", {})).second); @@ -258,7 +261,7 @@ TEST(Expression, Hash) { // NB: unbound expressions don't check for availability in any registry EXPECT_TRUE(set.emplace(call("widgetify", {})).second); - EXPECT_EQ(set.size(), 6); + EXPECT_EQ(set.size(), 8); } TEST(Expression, IsScalarExpression) { @@ -1013,6 +1016,10 @@ TEST(Expression, SimplifyWithGuarantee) { Simplify{greater(field_ref("dict_i32"), literal(int64_t(1)))} .WithGuarantee(equal(field_ref("dict_i32"), literal(0))) .Expect(false); + + Simplify{null_literal(int32())} + .WithGuarantee(null_literal(int32())) + .Expect(literal(true)); } TEST(Expression, SimplifyThenExecute) { diff --git a/cpp/src/arrow/dataset/partition.cc b/cpp/src/arrow/dataset/partition.cc index 595cce8021d..c5f55d73b69 100644 --- a/cpp/src/arrow/dataset/partition.cc +++ b/cpp/src/arrow/dataset/partition.cc @@ -147,7 +147,9 @@ Result KeyValuePartitioning::ConvertKey(const Key& key) const { std::shared_ptr converted; - if (field->type()->id() == Type::DICTIONARY) { + if (key.null) { + converted = MakeNullScalar(field->type()); + } else if (field->type()->id() == Type::DICTIONARY) { if (dictionaries_.empty() || dictionaries_[field_index] == nullptr) { return Status::Invalid("No dictionary provided for dictionary field ", field->ToString()); @@ -230,7 +232,7 @@ std::vector DirectoryPartitioning::ParseKeys( for (auto&& segment : fs::internal::SplitAbstractPath(path)) { if (i >= schema_->num_fields()) break; - keys.push_back({schema_->field(i++)->name(), std::move(segment)}); + 
keys.push_back({schema_->field(i++)->name(), std::move(segment), false}); } return keys; @@ -419,9 +421,9 @@ util::optional HivePartitioning::ParseKey( auto value = segment.substr(name_end + 1); if (value == null_fallback) { - return util::nullopt; + return Key{segment.substr(0, name_end), "", true}; } - return Key{segment.substr(0, name_end), segment.substr(name_end + 1)}; + return Key{segment.substr(0, name_end), segment.substr(name_end + 1), false}; } std::vector HivePartitioning::ParseKeys( @@ -443,7 +445,9 @@ Result HivePartitioning::FormatValues(const ScalarVector& values) c for (int i = 0; i < schema_->num_fields(); ++i) { const std::string& name = schema_->field(i)->name(); - if (values[i] == nullptr || !values[i]->is_valid) { + if (values[i] == nullptr) { + segments[i] = ""; + } else if (!values[i]->is_valid) { // If no key is available just provide a placeholder segment to maintain the // field_index <-> path nesting relation segments[i] = name + "=" + null_fallback_; diff --git a/cpp/src/arrow/dataset/partition.h b/cpp/src/arrow/dataset/partition.h index 5cdf7a1df66..e5afd00c76d 100644 --- a/cpp/src/arrow/dataset/partition.h +++ b/cpp/src/arrow/dataset/partition.h @@ -125,6 +125,7 @@ class ARROW_DS_EXPORT KeyValuePartitioning : public Partitioning { /// of a scalar value struct Key { std::string name, value; + bool null; }; static Status SetDefaultValuesFromKeys(const Expression& expr, diff --git a/cpp/src/arrow/dataset/partition_test.cc b/cpp/src/arrow/dataset/partition_test.cc index 2558af293da..7e19e4f382d 100644 --- a/cpp/src/arrow/dataset/partition_test.cc +++ b/cpp/src/arrow/dataset/partition_test.cc @@ -297,13 +297,16 @@ TEST_F(TestPartitioning, DiscoverSchemaSegfault) { TEST_F(TestPartitioning, HivePartitioning) { partitioning_ = std::make_shared( - schema({field("alpha", int32()), field("beta", float32())})); + schema({field("alpha", int32()), field("beta", float32())}), ArrayVector(), "xyz"); AssertParse("/alpha=0/beta=3.25", and_(equal(field_ref("alpha"), literal(0)), equal(field_ref("beta"), literal(3.25f)))); AssertParse("/beta=3.25/alpha=0", and_(equal(field_ref("beta"), literal(3.25f)), equal(field_ref("alpha"), literal(0)))); AssertParse("/alpha=0", equal(field_ref("alpha"), literal(0))); + AssertParse("/alpha=xyz/beta=3.25", + and_(equal(field_ref("alpha"), null_literal(int32())), + equal(field_ref("beta"), literal(3.25f)))); AssertParse("/beta=3.25", equal(field_ref("beta"), literal(3.25f))); AssertParse("", literal(true)); @@ -332,9 +335,18 @@ TEST_F(TestPartitioning, HivePartitioningFormat) { AssertFormat(and_(equal(field_ref("beta"), literal(3.25f)), equal(field_ref("alpha"), literal(0))), "alpha=0/beta=3.25"); - AssertFormat(equal(field_ref("alpha"), literal(0)), "alpha=0/beta=xyz"); - AssertFormat(equal(field_ref("beta"), literal(3.25f)), "alpha=xyz/beta=3.25"); - AssertFormat(literal(true), "alpha=xyz/beta=xyz"); + AssertFormat(equal(field_ref("alpha"), literal(0)), "alpha=0"); + AssertFormat(and_(equal(field_ref("alpha"), literal(0)), + equal(field_ref("beta"), null_literal(float32()))), + "alpha=0/beta=xyz"); + AssertFormat(and_(equal(field_ref("alpha"), null_literal(int32())), + equal(field_ref("beta"), literal(3.25f))), + "alpha=xyz/beta=3.25"); + AssertFormat(literal(true), ""); + + AssertFormat(and_(equal(field_ref("alpha"), null_literal(int32())), + equal(field_ref("beta"), null_literal(float32()))), + "alpha=xyz/beta=xyz"); ASSERT_OK_AND_ASSIGN(written_schema_, written_schema_->AddField(0, field("gamma", utf8()))); @@ -343,9 +355,6 @@ 
TEST_F(TestPartitioning, HivePartitioningFormat) { equal(field_ref("beta"), literal(3.25f))}), "alpha=0/beta=3.25"); - AssertFormat(equal(field_ref("alpha"), literal(MakeNullScalar(int32()))), - "alpha=xyz/beta=xyz"); - // written_schema_ is incompatible with partitioning_'s schema written_schema_ = schema({field("alpha", utf8()), field("beta", utf8())}); AssertFormatError( diff --git a/python/pyarrow/table.pxi b/python/pyarrow/table.pxi index c6b0b4180b6..3f1fc28ee60 100644 --- a/python/pyarrow/table.pxi +++ b/python/pyarrow/table.pxi @@ -276,7 +276,7 @@ cdef class ChunkedArray(_PandasConvertible): """ return _pc().cast(self, target_type, safe=safe) - def dictionary_encode(self): + def dictionary_encode(self, null_encoding='mask'): """ Compute dictionary-encoded representation of array @@ -285,7 +285,8 @@ cdef class ChunkedArray(_PandasConvertible): pyarrow.ChunkedArray Same chunking as the input, all chunks share a common dictionary. """ - return _pc().call_function('dictionary_encode', [self]) + options = _pc().DictionaryEncodeOptions(null_encoding) + return _pc().call_function('dictionary_encode', [self], options) def flatten(self, MemoryPool memory_pool=None): """ From 613b286c2d1ed1ad7998a4458d216a9d5d05a83e Mon Sep 17 00:00:00 2001 From: Weston Pace Date: Wed, 10 Feb 2021 23:38:07 -1000 Subject: [PATCH 11/33] WIP --- cpp/src/arrow/dataset/expression_test.cc | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/cpp/src/arrow/dataset/expression_test.cc b/cpp/src/arrow/dataset/expression_test.cc index 81a9c74fad1..6329c5f83e9 100644 --- a/cpp/src/arrow/dataset/expression_test.cc +++ b/cpp/src/arrow/dataset/expression_test.cc @@ -1017,8 +1017,12 @@ TEST(Expression, SimplifyWithGuarantee) { .WithGuarantee(equal(field_ref("dict_i32"), literal(0))) .Expect(false); - Simplify{null_literal(int32())} - .WithGuarantee(null_literal(int32())) + Simplify{equal(field_ref("i32"), literal(7))} + .WithGuarantee(equal(field_ref("i32"), literal(7))) + .Expect(literal(true)); + + Simplify{equal(field_ref("i32"), null_literal(int32()))} + .WithGuarantee(equal(field_ref("i32"), null_literal(int32()))) .Expect(literal(true)); } From 3f4ec252ae6251040fed2d035dcf159d5631f342 Mon Sep 17 00:00:00 2001 From: Weston Pace Date: Thu, 11 Feb 2021 13:43:21 -1000 Subject: [PATCH 12/33] Improved null handling in expression/partition a bit --- cpp/src/arrow/dataset/expression.cc | 90 +++++++++++++++++------- cpp/src/arrow/dataset/expression.h | 26 ++++++- cpp/src/arrow/dataset/expression_test.cc | 51 +++++++++----- cpp/src/arrow/dataset/partition.cc | 56 ++++++++++----- cpp/src/arrow/dataset/partition_test.cc | 17 ++--- 5 files changed, 166 insertions(+), 74 deletions(-) diff --git a/cpp/src/arrow/dataset/expression.cc b/cpp/src/arrow/dataset/expression.cc index 6afe9309c54..ef92ae09fe7 100644 --- a/cpp/src/arrow/dataset/expression.cc +++ b/cpp/src/arrow/dataset/expression.cc @@ -688,30 +688,42 @@ std::vector GuaranteeConjunctionMembers( // conjunction_members Status ExtractKnownFieldValuesImpl( std::vector* conjunction_members, - std::unordered_map* known_values) { - auto unconsumed_end = - std::partition(conjunction_members->begin(), conjunction_members->end(), - [](const Expression& expr) { - // search for an equality conditions between a field and a literal - auto call = expr.call(); - if (!call) return true; - - if (call->function_name == "equal") { - auto ref = call->arguments[0].field_ref(); - auto lit = call->arguments[1].literal(); - return !(ref && lit); - } - - return true; - }); + 
std::unordered_map* known_values) { + auto unconsumed_end = std::partition( + conjunction_members->begin(), conjunction_members->end(), + [](const Expression& expr) { + // search for an equality conditions between a field and a literal + auto call = expr.call(); + if (!call) return true; + + if (call->function_name == "equal") { + auto ref = call->arguments[0].field_ref(); + auto lit = call->arguments[1].literal(); + return !(ref && lit); + } + + if (call->function_name == "is_null" || call->function_name == "is_valid") { + auto ref = call->arguments[0].field_ref(); + return !ref; + } + + return true; + }); for (auto it = unconsumed_end; it != conjunction_members->end(); ++it) { auto call = CallNotNull(*it); - auto ref = call->arguments[0].field_ref(); - auto lit = call->arguments[1].literal(); - - known_values->emplace(*ref, *lit); + if (call->function_name == "equal") { + auto ref = call->arguments[0].field_ref(); + auto lit = call->arguments[1].literal(); + known_values->emplace(*ref, *lit); + } else if (call->function_name == "is_null") { + auto ref = call->arguments[0].field_ref(); + known_values->emplace(*ref, false); + } else if (call->function_name == "is_valid") { + auto ref = call->arguments[0].field_ref(); + known_values->emplace(*ref, true); + } } conjunction_members->erase(unconsumed_end, conjunction_members->end()); @@ -721,16 +733,16 @@ Status ExtractKnownFieldValuesImpl( } // namespace -Result> ExtractKnownFieldValues( - const Expression& guaranteed_true_predicate) { +Result> +ExtractKnownFieldValues(const Expression& guaranteed_true_predicate) { auto conjunction_members = GuaranteeConjunctionMembers(guaranteed_true_predicate); - std::unordered_map known_values; + std::unordered_map known_values; RETURN_NOT_OK(ExtractKnownFieldValuesImpl(&conjunction_members, &known_values)); return known_values; } Result ReplaceFieldsWithKnownValues( - const std::unordered_map& known_values, + const std::unordered_map& known_values, Expression expr) { if (!expr.IsBound()) { return Status::Invalid( @@ -743,7 +755,11 @@ Result ReplaceFieldsWithKnownValues( if (auto ref = expr.field_ref()) { auto it = known_values.find(*ref); if (it != known_values.end()) { - Datum lit = it->second; + const auto& known_value = it->second; + if (!known_value.concrete()) { + return expr; + } + auto lit = known_value.datum; if (expr.type()->id() == Type::DICTIONARY) { if (lit.is_scalar()) { // FIXME the "right" way to support this is adding support for scalars to @@ -760,9 +776,25 @@ Result ReplaceFieldsWithKnownValues( DictionaryScalar::Make(std::move(index), std::move(dictionary))); } } - ARROW_ASSIGN_OR_RAISE(lit, compute::Cast(it->second, expr.type())); + ARROW_ASSIGN_OR_RAISE(lit, compute::Cast(lit, expr.type())); return literal(std::move(lit)); } + } else if (auto call = expr.call()) { + if (call->function_name == "is_null") { + if (auto ref = call->arguments[0].field_ref()) { + auto it = known_values.find(*ref); + if (it != known_values.end()) { + return literal(!it->second.valid); + } + } + } else if (call->function_name == "is_valid") { + if (auto ref = call->arguments[0].field_ref()) { + auto it = known_values.find(*ref); + if (it != known_values.end()) { + return literal(it->second.valid); + } + } + } } return expr; }, @@ -939,7 +971,7 @@ Result SimplifyWithGuarantee(Expression expr, const Expression& guaranteed_true_predicate) { auto conjunction_members = GuaranteeConjunctionMembers(guaranteed_true_predicate); - std::unordered_map known_values; + std::unordered_map known_values; 
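  // known_values collects, from the guarantee's conjunction members, fields that
  // are pinned to a concrete Datum or merely known to be valid/null (see
  // KnownFieldValue); substituting these below lets equality, is_null and
  // is_valid tests on those fields fold down to literals.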
RETURN_NOT_OK(ExtractKnownFieldValuesImpl(&conjunction_members, &known_values)); ARROW_ASSIGN_OR_RAISE(expr, @@ -1226,6 +1258,10 @@ Expression greater_equal(Expression lhs, Expression rhs) { return call("greater_equal", {std::move(lhs), std::move(rhs)}); } +Expression is_null(Expression lhs) { return call("is_null", {std::move(lhs)}); } + +Expression is_valid(Expression lhs) { return call("is_valid", {std::move(lhs)}); } + Expression and_(Expression lhs, Expression rhs) { return call("and_kleene", {std::move(lhs), std::move(rhs)}); } diff --git a/cpp/src/arrow/dataset/expression.h b/cpp/src/arrow/dataset/expression.h index 79d7f077f23..785290e4bb2 100644 --- a/cpp/src/arrow/dataset/expression.h +++ b/cpp/src/arrow/dataset/expression.h @@ -162,10 +162,25 @@ Expression call(std::string function, std::vector arguments, ARROW_DS_EXPORT std::vector FieldsInExpression(const Expression&); +/// Represents either a concrete value or a hint that a field is valid/invalid +struct KnownFieldValue { + Datum datum; + bool valid; + + KnownFieldValue(const Datum& datum) + : datum(datum), valid(datum.length() == datum.null_count()) {} + KnownFieldValue(bool is_valid) : datum(), valid(is_valid) {} + + inline bool concrete() const { return datum.kind() != Datum::Kind::NONE; } + bool operator==(const KnownFieldValue& other) const { + return datum == other.datum && valid == other.valid; + } +}; + /// Assemble a mapping from field references to known values. ARROW_DS_EXPORT -Result> ExtractKnownFieldValues( - const Expression& guaranteed_true_predicate); +Result> +ExtractKnownFieldValues(const Expression& guaranteed_true_predicate); /// \defgroup expression-passes Functions for modification of Expressions /// @@ -194,7 +209,8 @@ Result FoldConstants(Expression); /// Simplify Expressions by replacing with known values of the fields which it references. ARROW_DS_EXPORT Result ReplaceFieldsWithKnownValues( - const std::unordered_map& known_values, Expression); + const std::unordered_map& known_values, + Expression); /// Simplify an expression by replacing subexpressions based on a guarantee: /// a boolean expression which is guaranteed to evaluate to `true`. 
For example, this is @@ -239,6 +255,10 @@ ARROW_DS_EXPORT Expression greater(Expression lhs, Expression rhs); ARROW_DS_EXPORT Expression greater_equal(Expression lhs, Expression rhs); +ARROW_DS_EXPORT Expression is_null(Expression lhs); + +ARROW_DS_EXPORT Expression is_valid(Expression lhs); + ARROW_DS_EXPORT Expression and_(Expression lhs, Expression rhs); ARROW_DS_EXPORT Expression and_(const std::vector&); ARROW_DS_EXPORT Expression or_(Expression lhs, Expression rhs); diff --git a/cpp/src/arrow/dataset/expression_test.cc b/cpp/src/arrow/dataset/expression_test.cc index 6329c5f83e9..3aa62319e85 100644 --- a/cpp/src/arrow/dataset/expression_test.cc +++ b/cpp/src/arrow/dataset/expression_test.cc @@ -606,6 +606,8 @@ TEST(Expression, FoldConstants) { // call against literals (3 + 2 == 5) ExpectFoldsTo(call("add", {literal(3), literal(2)}), literal(5)); + ExpectFoldsTo(call("equal", {literal(3), literal(3)}), literal(true)); + // call against literal and field_ref ExpectFoldsTo(call("add", {literal(3), field_ref("i32")}), call("add", {literal(3), field_ref("i32")})); @@ -674,8 +676,9 @@ TEST(Expression, FoldConstantsBoolean) { TEST(Expression, ExtractKnownFieldValues) { struct { - void operator()(Expression guarantee, - std::unordered_map expected) { + void operator()( + Expression guarantee, + std::unordered_map expected) { ASSERT_OK_AND_ASSIGN(auto actual, ExtractKnownFieldValues(guarantee)); EXPECT_THAT(actual, UnorderedElementsAreArray(expected)) << " guarantee: " << guarantee.ToString(); @@ -723,20 +726,20 @@ TEST(Expression, ExtractKnownFieldValues) { } TEST(Expression, ReplaceFieldsWithKnownValues) { - auto ExpectReplacesTo = - [](Expression expr, - std::unordered_map known_values, - Expression unbound_expected) { - ASSERT_OK_AND_ASSIGN(expr, expr.Bind(*kBoringSchema)); - ASSERT_OK_AND_ASSIGN(auto expected, unbound_expected.Bind(*kBoringSchema)); - ASSERT_OK_AND_ASSIGN(auto replaced, - ReplaceFieldsWithKnownValues(known_values, expr)); + auto ExpectReplacesTo = [](Expression expr, + const std::unordered_map& known_values, + Expression unbound_expected) { + ASSERT_OK_AND_ASSIGN(expr, expr.Bind(*kBoringSchema)); + ASSERT_OK_AND_ASSIGN(auto expected, unbound_expected.Bind(*kBoringSchema)); + ASSERT_OK_AND_ASSIGN(auto replaced, ReplaceFieldsWithKnownValues(known_values, expr)); - EXPECT_EQ(replaced, expected); - ExpectIdenticalIfUnchanged(replaced, expr); - }; + EXPECT_EQ(replaced, expected); + ExpectIdenticalIfUnchanged(replaced, expr); + }; - std::unordered_map i32_is_3{{"i32", Datum(3)}}; + std::unordered_map i32_is_3{ + {"i32", Datum(3)}}; ExpectReplacesTo(literal(1), i32_is_3, literal(1)); @@ -768,6 +771,14 @@ TEST(Expression, ReplaceFieldsWithKnownValues) { }), literal(2), })); + + std::unordered_map a_valid_b_invalid{ + {"a", true}, {"b", false}}; + + ExpectReplacesTo(is_null(field_ref("a")), a_valid_b_invalid, literal(false)); + ExpectReplacesTo(is_valid(field_ref("a")), a_valid_b_invalid, literal(true)); + ExpectReplacesTo(is_null(field_ref("b")), a_valid_b_invalid, literal(true)); + ExpectReplacesTo(is_valid(field_ref("b")), a_valid_b_invalid, literal(false)); } struct { @@ -1021,8 +1032,16 @@ TEST(Expression, SimplifyWithGuarantee) { .WithGuarantee(equal(field_ref("i32"), literal(7))) .Expect(literal(true)); - Simplify{equal(field_ref("i32"), null_literal(int32()))} - .WithGuarantee(equal(field_ref("i32"), null_literal(int32()))) + Simplify{equal(field_ref("i32"), literal(7))} + .WithGuarantee(not_(equal(field_ref("i32"), literal(7)))) + .Expect(equal(field_ref("i32"), 
literal(7))); + + Simplify{is_null(field_ref("i32"))} + .WithGuarantee(is_null(field_ref("i32"))) + .Expect(literal(true)); + + Simplify{is_valid(field_ref("i32"))} + .WithGuarantee(is_valid(field_ref("i32"))) .Expect(literal(true)); } diff --git a/cpp/src/arrow/dataset/partition.cc b/cpp/src/arrow/dataset/partition.cc index c5f55d73b69..2afaf414f9d 100644 --- a/cpp/src/arrow/dataset/partition.cc +++ b/cpp/src/arrow/dataset/partition.cc @@ -74,15 +74,26 @@ Status KeyValuePartitioning::SetDefaultValuesFromKeys(const Expression& expr, RecordBatchProjector* projector) { ARROW_ASSIGN_OR_RAISE(auto known_values, ExtractKnownFieldValues(expr)); for (const auto& ref_value : known_values) { - if (!ref_value.second.is_scalar()) { - return Status::Invalid("non-scalar partition key ", ref_value.second.ToString()); + const auto& known_value = ref_value.second; + if (known_value.concrete() && !known_value.datum.is_scalar()) { + return Status::Invalid("non-scalar partition key ", known_value.datum.ToString()); } ARROW_ASSIGN_OR_RAISE(auto match, ref_value.first.FindOneOrNone(*projector->schema())); if (match.empty()) continue; - RETURN_NOT_OK(projector->SetDefaultValue(match, ref_value.second.scalar())); + + const auto& field = projector->schema()->field(match[0]); + if (known_value.concrete()) { + RETURN_NOT_OK(projector->SetDefaultValue(match, known_value.datum.scalar())); + } else if (known_value.valid) { + return Status::Invalid( + "Partition expression not defined enough to set default value for ", + ref_value.first.name()); + } else { + RETURN_NOT_OK(projector->SetDefaultValue(match, MakeNullScalar(field->type()))); + } } return Status::OK(); } @@ -148,7 +159,7 @@ Result KeyValuePartitioning::ConvertKey(const Key& key) const { std::shared_ptr converted; if (key.null) { - converted = MakeNullScalar(field->type()); + return is_null(field_ref(field->name())); } else if (field->type()->id() == Type::DICTIONARY) { if (dictionaries_.empty() || dictionaries_[field_index] == nullptr) { return Status::Invalid("No dictionary provided for dictionary field ", @@ -198,27 +209,34 @@ Result KeyValuePartitioning::Format(const Expression& expr) const { ARROW_ASSIGN_OR_RAISE(auto known_values, ExtractKnownFieldValues(expr)); for (const auto& ref_value : known_values) { - if (!ref_value.second.is_scalar()) { - return Status::Invalid("non-scalar partition key ", ref_value.second.ToString()); + const auto& known_value = ref_value.second; + if (known_value.concrete() && !known_value.datum.is_scalar()) { + return Status::Invalid("non-scalar partition key ", known_value.datum.ToString()); } ARROW_ASSIGN_OR_RAISE(auto match, ref_value.first.FindOneOrNone(*schema_)); if (match.empty()) continue; - auto value = ref_value.second.scalar(); - const auto& field = schema_->field(match[0]); - if (!value->type->Equals(field->type())) { - return Status::TypeError("scalar ", value->ToString(), " (of type ", *value->type, - ") is invalid for ", field->ToString()); - } - if (value->type->id() == Type::DICTIONARY) { - ARROW_ASSIGN_OR_RAISE( - value, checked_cast(*value).GetEncodedValue()); - } + if (known_value.concrete()) { + auto value = known_value.datum.scalar(); + if (!value->type->Equals(field->type())) { + return Status::TypeError("scalar ", value->ToString(), " (of type ", *value->type, + ") is invalid for ", field->ToString()); + } - values[match[0]] = std::move(value); + if (value->type->id() == Type::DICTIONARY) { + ARROW_ASSIGN_OR_RAISE( + value, checked_cast(*value).GetEncodedValue()); + } + + values[match[0]] = 
std::move(value); + } else { + if (!known_value.valid) { + values[match[0]] = MakeNullScalar(field->type()); + } + } } return FormatValues(values); @@ -471,7 +489,9 @@ class HivePartitioningFactory : public KeyValuePartitioningFactory { for (auto path : paths) { for (auto&& segment : fs::internal::SplitAbstractPath(path)) { if (auto key = HivePartitioning::ParseKey(segment, null_fallback_)) { - RETURN_NOT_OK(InsertRepr(key->name, key->value)); + if (!key->null) { + RETURN_NOT_OK(InsertRepr(key->name, key->value)); + } } } } diff --git a/cpp/src/arrow/dataset/partition_test.cc b/cpp/src/arrow/dataset/partition_test.cc index 7e19e4f382d..b8dade238c0 100644 --- a/cpp/src/arrow/dataset/partition_test.cc +++ b/cpp/src/arrow/dataset/partition_test.cc @@ -304,9 +304,8 @@ TEST_F(TestPartitioning, HivePartitioning) { AssertParse("/beta=3.25/alpha=0", and_(equal(field_ref("beta"), literal(3.25f)), equal(field_ref("alpha"), literal(0)))); AssertParse("/alpha=0", equal(field_ref("alpha"), literal(0))); - AssertParse("/alpha=xyz/beta=3.25", - and_(equal(field_ref("alpha"), null_literal(int32())), - equal(field_ref("beta"), literal(3.25f)))); + AssertParse("/alpha=xyz/beta=3.25", and_(is_null(field_ref("alpha")), + equal(field_ref("beta"), literal(3.25f)))); AssertParse("/beta=3.25", equal(field_ref("beta"), literal(3.25f))); AssertParse("", literal(true)); @@ -336,16 +335,14 @@ TEST_F(TestPartitioning, HivePartitioningFormat) { equal(field_ref("alpha"), literal(0))), "alpha=0/beta=3.25"); AssertFormat(equal(field_ref("alpha"), literal(0)), "alpha=0"); - AssertFormat(and_(equal(field_ref("alpha"), literal(0)), - equal(field_ref("beta"), null_literal(float32()))), + AssertFormat(and_(equal(field_ref("alpha"), literal(0)), is_null(field_ref("beta"))), "alpha=0/beta=xyz"); - AssertFormat(and_(equal(field_ref("alpha"), null_literal(int32())), - equal(field_ref("beta"), literal(3.25f))), - "alpha=xyz/beta=3.25"); + AssertFormat( + and_(is_null(field_ref("alpha")), equal(field_ref("beta"), literal(3.25f))), + "alpha=xyz/beta=3.25"); AssertFormat(literal(true), ""); - AssertFormat(and_(equal(field_ref("alpha"), null_literal(int32())), - equal(field_ref("beta"), null_literal(float32()))), + AssertFormat(and_(is_null(field_ref("alpha")), is_null(field_ref("beta"))), "alpha=xyz/beta=xyz"); ASSERT_OK_AND_ASSIGN(written_schema_, From 79dda1afb264d35f690e5bc2df2e0f4037b1dcb5 Mon Sep 17 00:00:00 2001 From: Weston Pace Date: Thu, 11 Feb 2021 15:15:58 -1000 Subject: [PATCH 13/33] Added the python half of the new extract known values --- cpp/src/arrow/dataset/expression.h | 3 ++- python/pyarrow/_dataset.pyx | 15 +++++++++++---- python/pyarrow/includes/libarrow_dataset.pxd | 9 ++++++++- python/pyarrow/tests/test_dataset.py | 20 +++++++++++++------- 4 files changed, 34 insertions(+), 13 deletions(-) diff --git a/cpp/src/arrow/dataset/expression.h b/cpp/src/arrow/dataset/expression.h index 785290e4bb2..1e895febac6 100644 --- a/cpp/src/arrow/dataset/expression.h +++ b/cpp/src/arrow/dataset/expression.h @@ -167,8 +167,9 @@ struct KnownFieldValue { Datum datum; bool valid; + KnownFieldValue() : datum(), valid(false) {} KnownFieldValue(const Datum& datum) - : datum(datum), valid(datum.length() == datum.null_count()) {} + : datum(datum), valid(datum.length() != datum.null_count()) {} KnownFieldValue(bool is_valid) : datum(), valid(is_valid) {} inline bool concrete() const { return datum.kind() != Datum::Kind::NONE; } diff --git a/python/pyarrow/_dataset.pyx b/python/pyarrow/_dataset.pyx index 5fa2b118ed5..acd5d9602b5 100644 --- 
a/python/pyarrow/_dataset.pyx +++ b/python/pyarrow/_dataset.pyx @@ -206,6 +206,10 @@ cdef class Expression(_Weakrefable): """Checks whether the expression is not-null (valid)""" return Expression._call("is_valid", [self]) + def is_null(self): + """Checks whether the expression is null""" + return Expression._call("is_null", [self]) + def cast(self, type, bint safe=True): """Explicitly change the expression's data type""" cdef shared_ptr[CCastOptions] c_options @@ -2351,14 +2355,17 @@ def _get_partition_keys(Expression partition_expression): """ cdef: CExpression expr = partition_expression.unwrap() - pair[CFieldRef, CDatum] ref_val + pair[CFieldRef, CKnownFieldValue] ref_val out = {} for ref_val in GetResultValue(CExtractKnownFieldValues(expr)): assert ref_val.first.name() != nullptr - assert ref_val.second.kind() == DatumType_SCALAR - val = pyarrow_wrap_scalar(ref_val.second.scalar()) - out[frombytes(deref(ref_val.first.name()))] = val.as_py() + if ref_val.second.valid: + assert ref_val.second.datum.kind() == DatumType_SCALAR + val = pyarrow_wrap_scalar(ref_val.second.datum.scalar()) + out[frombytes(deref(ref_val.first.name()))] = val.as_py() + else: + out[frombytes(deref(ref_val.first.name()))] = None return out diff --git a/python/pyarrow/includes/libarrow_dataset.pxd b/python/pyarrow/includes/libarrow_dataset.pxd index 93bc0edddc1..2127b3dccff 100644 --- a/python/pyarrow/includes/libarrow_dataset.pxd +++ b/python/pyarrow/includes/libarrow_dataset.pxd @@ -315,7 +315,14 @@ cdef extern from "arrow/dataset/api.h" namespace "arrow::dataset" nogil: const CExpression& partition_expression, CRecordBatchProjector* projector) - cdef CResult[unordered_map[CFieldRef, CDatum, CFieldRefHash]] \ + cdef cppclass CKnownFieldValue "arrow::dataset::KnownFieldValue": + CDatum datum + c_bool valid + CKnownFieldValue(CDatum datum) + CKnownFieldValue(c_bool valid) + c_bool operator==(const CKnownFieldValue&) const + + cdef CResult[unordered_map[CFieldRef, CKnownFieldValue, CFieldRefHash]] \ CExtractKnownFieldValues "arrow::dataset::ExtractKnownFieldValues"( const CExpression& partition_expression) diff --git a/python/pyarrow/tests/test_dataset.py b/python/pyarrow/tests/test_dataset.py index 48ef421694d..a2d75f8e4a4 100644 --- a/python/pyarrow/tests/test_dataset.py +++ b/python/pyarrow/tests/test_dataset.py @@ -489,6 +489,9 @@ def test_partition_keys(): assert ds._get_partition_keys(a) == {"a": "a"} assert ds._get_partition_keys(a & b & c) == {f: f for f in "abc"} + null = ds.field("a").is_null() + assert ds._get_partition_keys(null) == {"a": None} + nope = ds.field("d") >= 3 assert ds._get_partition_keys(nope) == {} assert ds._get_partition_keys(a & nope) == {"a": "a"} @@ -1710,16 +1713,17 @@ def dict_type(key): @pytest.mark.pandas def test_dataset_partitioned_dictionary_type_reconstruct(tempdir): # https://issues.apache.org/jira/browse/ARROW-11400 - table = pa.table({'part': np.repeat(['A', 'B'], 5), 'col': range(10)}) - part = ds.partitioning(table.select(['part']).schema, flavor="hive") + table = pa.table({"part": np.repeat(["A", "B"], 5), "col": range(10)}) + part = ds.partitioning(table.select(["part"]).schema, flavor="hive") ds.write_dataset(table, tempdir, partitioning=part, format="feather") dataset = ds.dataset( - tempdir, format="feather", - partitioning=ds.HivePartitioning.discover(infer_dictionary=True) + tempdir, + format="feather", + partitioning=ds.HivePartitioning.discover(infer_dictionary=True), ) expected = pa.table( - {'col': table['col'], 'part': table['part'].dictionary_encode()} + 
{"col": table["col"], "part": table["part"].dictionary_encode()} ) assert dataset.to_table().equals(expected) fragment = list(dataset.get_fragments())[0] @@ -1732,8 +1736,10 @@ def test_dataset_partitioned_dictionary_type_reconstruct(tempdir): restored = pickle.loads(pickle.dumps(fragment)) assert restored.to_table(schema=dataset.schema).equals(expected[:5]) # to_pandas call triggers computation of the actual dictionary values - assert restored.to_table(schema=dataset.schema).to_pandas().equals( - expected[:5].to_pandas() + assert ( + restored.to_table(schema=dataset.schema) + .to_pandas() + .equals(expected[:5].to_pandas()) ) assert restored.partition_expression.equals(part_expr) From de7be7b89508ec355c7f97b84276893ccaa155ed Mon Sep 17 00:00:00 2001 From: Weston Pace Date: Thu, 11 Feb 2021 15:35:01 -1000 Subject: [PATCH 14/33] Lint --- cpp/src/arrow/dataset/expression.h | 5 +- python/pyarrow/_compute.pyx | 5 +- python/pyarrow/compute.py | 6 +- python/pyarrow/includes/libarrow.pxd | 2 +- python/pyarrow/tests/test_dataset.py | 106 ++++++++++++++++++--------- 5 files changed, 82 insertions(+), 42 deletions(-) diff --git a/cpp/src/arrow/dataset/expression.h b/cpp/src/arrow/dataset/expression.h index 1e895febac6..1bbcb471015 100644 --- a/cpp/src/arrow/dataset/expression.h +++ b/cpp/src/arrow/dataset/expression.h @@ -168,9 +168,10 @@ struct KnownFieldValue { bool valid; KnownFieldValue() : datum(), valid(false) {} - KnownFieldValue(const Datum& datum) + KnownFieldValue(const Datum& datum) // NOLINT implicit conversion : datum(datum), valid(datum.length() != datum.null_count()) {} - KnownFieldValue(bool is_valid) : datum(), valid(is_valid) {} + KnownFieldValue(bool is_valid) // NOLINT implicit conversion + : datum(), valid(is_valid) {} inline bool concrete() const { return datum.kind() != Datum::Kind::NONE; } bool operator==(const KnownFieldValue& other) const { diff --git a/python/pyarrow/_compute.pyx b/python/pyarrow/_compute.pyx index 8dea882334a..d3d5dc510a3 100644 --- a/python/pyarrow/_compute.pyx +++ b/python/pyarrow/_compute.pyx @@ -648,6 +648,7 @@ class FilterOptions(_FilterOptions): def __init__(self, null_selection_behavior='drop'): self._set_options(null_selection_behavior) + cdef class _DictionaryEncodeOptions(FunctionOptions): cdef: unique_ptr[CDictionaryEncodeOptions] dictionary_encode_options @@ -663,7 +664,9 @@ cdef class _DictionaryEncodeOptions(FunctionOptions): self.dictionary_encode_options.reset( new CDictionaryEncodeOptions(CDictionaryEncodeNullEncodingBehavior_MASK)) else: - raise ValueError('"{}" is not a valid null_encoding_behavior'.format(null_encoding_behavior)) + raise ValueError('"{}" is not a valid null_encoding_behavior'.format( + null_encoding_behavior)) + class DictionaryEncodeOptions(_DictionaryEncodeOptions): def __init__(self, null_encoding_behavior='mask'): diff --git a/python/pyarrow/compute.py b/python/pyarrow/compute.py index 1483d97a72d..1e437d43d4c 100644 --- a/python/pyarrow/compute.py +++ b/python/pyarrow/compute.py @@ -157,7 +157,8 @@ def _get_options_class(func): return globals()[class_name] except KeyError: warnings.warn( - "Python binding for {} not exposed".format(class_name), RuntimeWarning + "Python binding for {} not exposed".format( + class_name), RuntimeWarning ) return None @@ -221,7 +222,8 @@ def _wrap_function(name, func): else: template = _wrapper_template exec( - template.format(func_name=name, args_sig=args_sig, kwonly=kwonly), globals(), ns + template.format(func_name=name, args_sig=args_sig, + kwonly=kwonly), globals(), ns ) wrapper 
= ns["make_wrapper"](func, option_class) diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd index 983ee0df0f1..6423741ae50 100644 --- a/python/pyarrow/includes/libarrow.pxd +++ b/python/pyarrow/includes/libarrow.pxd @@ -1810,7 +1810,7 @@ cdef extern from "arrow/compute/api.h" namespace "arrow::compute" nogil: "arrow::compute::DictionaryEncodeOptions::MASK" cdef cppclass CDictionaryEncodeOptions \ - "arrow::compute::DictionaryEncodeOptions"(CFunctionOptions): + "arrow::compute::DictionaryEncodeOptions"(CFunctionOptions): CDictionaryEncodeOptions() CDictionaryEncodeOptions(CDictionaryEncodeNullEncodingBehavior null_encoding) CDictionaryEncodeNullEncodingBehavior null_encoding diff --git a/python/pyarrow/tests/test_dataset.py b/python/pyarrow/tests/test_dataset.py index a2d75f8e4a4..3b9eeafef28 100644 --- a/python/pyarrow/tests/test_dataset.py +++ b/python/pyarrow/tests/test_dataset.py @@ -200,7 +200,8 @@ def dataset(mockfs): selector = fs.FileSelector("subdir", recursive=True) options = ds.FileSystemFactoryOptions("subdir") options.partitioning = ds.DirectoryPartitioning( - pa.schema([pa.field("group", pa.int32()), pa.field("key", pa.string())]) + pa.schema([pa.field("group", pa.int32()), + pa.field("key", pa.string())]) ) factory = ds.FileSystemDatasetFactory(mockfs, selector, format, options) return factory.finish() @@ -338,7 +339,8 @@ def test_dataset(dataset): def test_scanner(dataset): - scanner = ds.Scanner.from_dataset(dataset, memory_pool=pa.default_memory_pool()) + scanner = ds.Scanner.from_dataset( + dataset, memory_pool=pa.default_memory_pool()) assert isinstance(scanner, ds.Scanner) assert len(list(scanner.scan())) == 2 @@ -368,13 +370,15 @@ def test_abstract_classes(): def test_partitioning(): - schema = pa.schema([pa.field("i64", pa.int64()), pa.field("f64", pa.float64())]) + schema = pa.schema([pa.field("i64", pa.int64()), + pa.field("f64", pa.float64())]) for klass in [ds.DirectoryPartitioning, ds.HivePartitioning]: partitioning = klass(schema) assert isinstance(partitioning, ds.Partitioning) partitioning = ds.DirectoryPartitioning( - pa.schema([pa.field("group", pa.int64()), pa.field("key", pa.float64())]) + pa.schema([pa.field("group", pa.int64()), + pa.field("key", pa.float64())]) ) expr = partitioning.parse("/3/3.14") assert isinstance(expr, ds.Expression) @@ -386,10 +390,12 @@ def test_partitioning(): partitioning.parse("/prefix/3/aaa") partitioning = ds.HivePartitioning( - pa.schema([pa.field("alpha", pa.int64()), pa.field("beta", pa.int64())]) + pa.schema([pa.field("alpha", pa.int64()), + pa.field("beta", pa.int64())]) ) expr = partitioning.parse("/alpha=0/beta=3") - expected = (ds.field("alpha") == ds.scalar(0)) & (ds.field("beta") == ds.scalar(3)) + expected = (ds.field("alpha") == ds.scalar(0)) & ( + ds.field("beta") == ds.scalar(3)) assert expr.equals(expected) for shouldfail in ["/alpha=one/beta=2", "/alpha=one", "/beta=two"]: @@ -397,7 +403,8 @@ def test_partitioning(): partitioning.parse(shouldfail) partitioning = ds.HivePartitioning( - pa.schema([pa.field("alpha", pa.int64()), pa.field("beta", pa.int64())]), + pa.schema([pa.field("alpha", pa.int64()), + pa.field("beta", pa.int64())]), None, "xyz", ) @@ -536,7 +543,8 @@ def test_file_format_pickling(): formats = [ ds.IpcFileFormat(), ds.CsvFileFormat(), - ds.CsvFileFormat(pa.csv.ParseOptions(delimiter="\t", ignore_empty_lines=True)), + ds.CsvFileFormat(pa.csv.ParseOptions( + delimiter="\t", ignore_empty_lines=True)), ds.ParquetFileFormat(), ds.ParquetFileFormat( 
read_options=ds.ParquetReadOptions(use_buffered_stream=True) @@ -568,13 +576,15 @@ def test_filesystem_factory(mockfs, paths_or_selector, pre_buffer): options = ds.FileSystemFactoryOptions("subdir") options.partitioning = ds.DirectoryPartitioning( - pa.schema([pa.field("group", pa.int32()), pa.field("key", pa.string())]) + pa.schema([pa.field("group", pa.int32()), + pa.field("key", pa.string())]) ) assert options.partition_base_dir == "subdir" assert options.selector_ignore_prefixes == [".", "_"] assert options.exclude_invalid_files is False - factory = ds.FileSystemDatasetFactory(mockfs, paths_or_selector, format, options) + factory = ds.FileSystemDatasetFactory( + mockfs, paths_or_selector, format, options) inspected_schema = factory.inspect() assert factory.inspect().equals( @@ -627,7 +637,8 @@ def test_filesystem_factory(mockfs, paths_or_selector, pre_buffer): def test_make_fragment(multisourcefs): parquet_format = ds.ParquetFileFormat() - dataset = ds.dataset("/plain", filesystem=multisourcefs, format=parquet_format) + dataset = ds.dataset( + "/plain", filesystem=multisourcefs, format=parquet_format) for path in dataset.files: fragment = parquet_format.make_fragment(path, multisourcefs) @@ -689,7 +700,8 @@ def test_make_parquet_fragment_from_buffer(): ) ) - cases = [(arrays, ds.ParquetFileFormat()), (dictionary_arrays, dictionary_format)] + cases = [(arrays, ds.ParquetFileFormat()), + (dictionary_arrays, dictionary_format)] for arrays, format_ in cases: table = pa.table(arrays, names=["alpha", "num", "animal"]) @@ -714,7 +726,8 @@ def _create_dataset_for_fragments(tempdir, chunk_size=None, filesystem=None): path = str(tempdir / "test_parquet_dataset") # write_to_dataset currently requires pandas - pq.write_to_dataset(table, path, partition_cols=["part"], chunk_size=chunk_size) + pq.write_to_dataset(table, path, partition_cols=[ + "part"], chunk_size=chunk_size) dataset = ds.dataset( path, format="parquet", partitioning="hive", filesystem=filesystem ) @@ -777,7 +790,8 @@ def test_fragments_reconstruct(tempdir): table, dataset = _create_dataset_for_fragments(tempdir) def assert_yields_projected(fragment, row_slice, columns=None, filter=None): - actual = fragment.to_table(schema=table.schema, columns=columns, filter=filter) + actual = fragment.to_table( + schema=table.schema, columns=columns, filter=filter) column_names = columns if columns else table.column_names assert actual.column_names == column_names @@ -824,13 +838,14 @@ def assert_yields_projected(fragment, row_slice, columns=None, filter=None): fragment.filesystem, partition_expression=fragment.partition_expression, ) - assert_yields_projected(new_fragment, (0, 4), filter=ds.field("part") == "a") + assert_yields_projected(new_fragment, (0, 4), + filter=ds.field("part") == "a") # Fragments don't contain the partition's columns if not provided to the # `to_table(schema=...)` method. 
pattern = ( - r"No match for FieldRef.Name\(part\) in " - + fragment.physical_schema.to_string(False, False, False) + r"No match for FieldRef.Name\(part\) in " + + fragment.physical_schema.to_string(False, False, False) ) with pytest.raises(ValueError, match=pattern): new_fragment = parquet_format.make_fragment( @@ -914,7 +929,8 @@ def test_fragments_parquet_row_groups_dictionary(tempdir): @pytest.mark.parquet def test_fragments_parquet_ensure_metadata(tempdir, open_logging_fs): fs, assert_opens = open_logging_fs - _, dataset = _create_dataset_for_fragments(tempdir, chunk_size=2, filesystem=fs) + _, dataset = _create_dataset_for_fragments( + tempdir, chunk_size=2, filesystem=fs) fragment = list(dataset.get_fragments())[0] # with default discovery, no metadata loaded @@ -1163,7 +1179,8 @@ def test_fragments_parquet_row_groups_reconstruct(tempdir): @pytest.mark.parquet def test_fragments_parquet_subset_ids(tempdir, open_logging_fs): fs, assert_opens = open_logging_fs - table, dataset = _create_dataset_for_fragments(tempdir, chunk_size=1, filesystem=fs) + table, dataset = _create_dataset_for_fragments( + tempdir, chunk_size=1, filesystem=fs) fragment = list(dataset.get_fragments())[0] # select with row group ids @@ -1190,7 +1207,8 @@ def test_fragments_parquet_subset_ids(tempdir, open_logging_fs): @pytest.mark.parquet def test_fragments_parquet_subset_filter(tempdir, open_logging_fs): fs, assert_opens = open_logging_fs - table, dataset = _create_dataset_for_fragments(tempdir, chunk_size=1, filesystem=fs) + table, dataset = _create_dataset_for_fragments( + tempdir, chunk_size=1, filesystem=fs) fragment = list(dataset.get_fragments())[0] # select with filter @@ -1240,7 +1258,8 @@ def test_partitioning_factory(mockfs): assert isinstance(partitioning_factory, ds.PartitioningFactory) options.partitioning_factory = partitioning_factory - factory = ds.FileSystemDatasetFactory(mockfs, paths_or_selector, format, options) + factory = ds.FileSystemDatasetFactory( + mockfs, paths_or_selector, format, options) inspected_schema = factory.inspect() # i64/f64 from data, group/key from "/1/xxx" and "/2/yyy" paths expected_schema = pa.schema( @@ -1269,7 +1288,8 @@ def test_partitioning_factory_dictionary(mockfs, infer_dictionary): ["group", "key"], infer_dictionary=infer_dictionary ) - factory = ds.FileSystemDatasetFactory(mockfs, paths_or_selector, format, options) + factory = ds.FileSystemDatasetFactory( + mockfs, paths_or_selector, format, options) inferred_schema = factory.inspect() if infer_dictionary: @@ -1396,7 +1416,8 @@ def test_open_dataset_list_of_files(tempdir): tables, (path1, path2) = _create_directory_of_files(tempdir) table = pa.concat_tables(tables) - datasets = [ds.dataset([path1, path2]), ds.dataset([str(path1), str(path2)])] + datasets = [ds.dataset([path1, path2]), + ds.dataset([str(path1), str(path2)])] datasets += [pickle.loads(pickle.dumps(d)) for d in datasets] for dataset in datasets: @@ -1504,7 +1525,8 @@ def test_construct_empty_dataset(): assert table.num_rows == 0 assert table.num_columns == 0 - empty = ds.dataset([], schema=pa.schema([("a", pa.int64()), ("a", pa.string())])) + empty = ds.dataset([], schema=pa.schema( + [("a", pa.int64()), ("a", pa.string())])) table = empty.to_table() assert table.num_rows == 0 assert table.num_columns == 2 @@ -1555,13 +1577,15 @@ def test_open_dataset_partitioned_directory(tempdir): _check_dataset_from_path(path, full_table) # specify partition scheme with discovery - dataset = ds.dataset(str(path), partitioning=ds.partitioning(flavor="hive")) + 
dataset = ds.dataset( + str(path), partitioning=ds.partitioning(flavor="hive")) expected_schema = table.schema.append(pa.field("part", pa.int32())) assert dataset.schema.equals(expected_schema) # specify partition scheme with discovery and relative path with change_cwd(tempdir): - dataset = ds.dataset("dataset/", partitioning=ds.partitioning(flavor="hive")) + dataset = ds.dataset( + "dataset/", partitioning=ds.partitioning(flavor="hive")) expected_schema = table.schema.append(pa.field("part", pa.int32())) assert dataset.schema.equals(expected_schema) @@ -1572,7 +1596,8 @@ def test_open_dataset_partitioned_directory(tempdir): # specify partition scheme with explicit scheme dataset = ds.dataset( str(path), - partitioning=ds.partitioning(pa.schema([("part", pa.int8())]), flavor="hive"), + partitioning=ds.partitioning( + pa.schema([("part", pa.int8())]), flavor="hive"), ) expected_schema = table.schema.append(pa.field("part", pa.int8())) assert dataset.schema.equals(expected_schema) @@ -1694,7 +1719,8 @@ def test_open_dataset_partitioned_dictionary_type( part_keys1, part_keys2 = partition_keys for part1 in part_keys1: for part2 in part_keys2: - path = basepath / fmt.format(part1 or null_value, part2 or null_value) + path = basepath / \ + fmt.format(part1 or null_value, part2 or null_value) path.mkdir(parents=True) pq.write_table(table, path / "test.parquet") @@ -2248,7 +2274,8 @@ def _create_metadata_file(root_path): metadata_collector.append(metadata) metadata_path = root_path / "_metadata" - pq.write_metadata(schema, metadata_path, metadata_collector=metadata_collector) + pq.write_metadata(schema, metadata_path, + metadata_collector=metadata_collector) return metadata_path @@ -2371,7 +2398,8 @@ def test_filter_mismatching_schema(tempdir): # specifying explicit schema, but that mismatches the schema of the data schema = pa.schema([("col", pa.int64())]) - dataset = ds.dataset(tempdir / "data.parquet", format="parquet", schema=schema) + dataset = ds.dataset(tempdir / "data.parquet", + format="parquet", schema=schema) # filtering on a column with such type mismatch should give a proper error with pytest.raises(TypeError): @@ -2410,7 +2438,8 @@ def test_dataset_project_null_column(tempdir): f = tempdir / "test_dataset_project_null_column.parquet" df.to_parquet(f, engine="pyarrow") - dataset = ds.dataset(f, format="parquet", schema=pa.schema([("col", pa.int64())])) + dataset = ds.dataset(f, format="parquet", + schema=pa.schema([("col", pa.int64())])) expected = pa.table({"col": pa.array([None, None, None], pa.int64())}) assert dataset.to_table().equals(expected) @@ -2433,7 +2462,8 @@ def _check_dataset_roundtrip( assert set(file_paths) == set(expected_files) # check that reading back in as dataset gives the same result - dataset2 = ds.dataset(base_dir_path, format="feather", partitioning=partitioning) + dataset2 = ds.dataset(base_dir_path, format="feather", + partitioning=partitioning) assert dataset2.to_table().equals(dataset.to_table()) @@ -2518,7 +2548,8 @@ def test_write_dataset_partitioned_dict(tempdir): # directory partitioning, dictionary partition columns dataset = ds.dataset( - directory, partitioning=ds.HivePartitioning.discover(infer_dictionary=True) + directory, partitioning=ds.HivePartitioning.discover( + infer_dictionary=True) ) target = tempdir / "partitioned-dir-target" expected_paths = [ @@ -2546,7 +2577,8 @@ def test_write_dataset_use_threads(tempdir): _ = _create_parquet_dataset_partitioned(directory) dataset = ds.dataset(directory, partitioning="hive") - partitioning = 
ds.partitioning(pa.schema([("part", pa.string())]), flavor="hive") + partitioning = ds.partitioning( + pa.schema([("part", pa.string())]), flavor="hive") target1 = tempdir / "partitioned1" ds.write_dataset( @@ -2587,7 +2619,8 @@ def test_write_table(tempdir): # with partitioning base_dir = tempdir / "partitioned" - partitioning = ds.partitioning(pa.schema([("part", pa.string())]), flavor="hive") + partitioning = ds.partitioning( + pa.schema([("part", pa.string())]), flavor="hive") ds.write_dataset( table, base_dir, @@ -2661,7 +2694,8 @@ def test_write_table_partitioned_dict(tempdir): partitioning = ds.partitioning(table.select(["part"]).schema) base_dir = tempdir / "dataset" - ds.write_dataset(table, base_dir, format="feather", partitioning=partitioning) + ds.write_dataset(table, base_dir, format="feather", + partitioning=partitioning) # check roundtrip partitioning_read = ds.DirectoryPartitioning.discover( From cd00e59b267aa1267c58e8026ae8e4b4be51d2a0 Mon Sep 17 00:00:00 2001 From: Weston Pace Date: Thu, 11 Feb 2021 15:37:17 -1000 Subject: [PATCH 15/33] Missed a test case --- python/pyarrow/tests/test_dataset.py | 110 +++++++++------------------ 1 file changed, 37 insertions(+), 73 deletions(-) diff --git a/python/pyarrow/tests/test_dataset.py b/python/pyarrow/tests/test_dataset.py index 3b9eeafef28..9bbffbc8d76 100644 --- a/python/pyarrow/tests/test_dataset.py +++ b/python/pyarrow/tests/test_dataset.py @@ -200,8 +200,7 @@ def dataset(mockfs): selector = fs.FileSelector("subdir", recursive=True) options = ds.FileSystemFactoryOptions("subdir") options.partitioning = ds.DirectoryPartitioning( - pa.schema([pa.field("group", pa.int32()), - pa.field("key", pa.string())]) + pa.schema([pa.field("group", pa.int32()), pa.field("key", pa.string())]) ) factory = ds.FileSystemDatasetFactory(mockfs, selector, format, options) return factory.finish() @@ -339,8 +338,7 @@ def test_dataset(dataset): def test_scanner(dataset): - scanner = ds.Scanner.from_dataset( - dataset, memory_pool=pa.default_memory_pool()) + scanner = ds.Scanner.from_dataset(dataset, memory_pool=pa.default_memory_pool()) assert isinstance(scanner, ds.Scanner) assert len(list(scanner.scan())) == 2 @@ -370,15 +368,13 @@ def test_abstract_classes(): def test_partitioning(): - schema = pa.schema([pa.field("i64", pa.int64()), - pa.field("f64", pa.float64())]) + schema = pa.schema([pa.field("i64", pa.int64()), pa.field("f64", pa.float64())]) for klass in [ds.DirectoryPartitioning, ds.HivePartitioning]: partitioning = klass(schema) assert isinstance(partitioning, ds.Partitioning) partitioning = ds.DirectoryPartitioning( - pa.schema([pa.field("group", pa.int64()), - pa.field("key", pa.float64())]) + pa.schema([pa.field("group", pa.int64()), pa.field("key", pa.float64())]) ) expr = partitioning.parse("/3/3.14") assert isinstance(expr, ds.Expression) @@ -390,12 +386,10 @@ def test_partitioning(): partitioning.parse("/prefix/3/aaa") partitioning = ds.HivePartitioning( - pa.schema([pa.field("alpha", pa.int64()), - pa.field("beta", pa.int64())]) + pa.schema([pa.field("alpha", pa.int64()), pa.field("beta", pa.int64())]) ) expr = partitioning.parse("/alpha=0/beta=3") - expected = (ds.field("alpha") == ds.scalar(0)) & ( - ds.field("beta") == ds.scalar(3)) + expected = (ds.field("alpha") == ds.scalar(0)) & (ds.field("beta") == ds.scalar(3)) assert expr.equals(expected) for shouldfail in ["/alpha=one/beta=2", "/alpha=one", "/beta=two"]: @@ -403,15 +397,12 @@ def test_partitioning(): partitioning.parse(shouldfail) partitioning = ds.HivePartitioning( - 
pa.schema([pa.field("alpha", pa.int64()), - pa.field("beta", pa.int64())]), + pa.schema([pa.field("alpha", pa.int64()), pa.field("beta", pa.int64())]), None, "xyz", ) expr = partitioning.parse("/alpha=xyz/beta=3") - expected = (ds.field("alpha") == ds.scalar(None)) & ( - ds.field("beta") == ds.scalar(3) - ) + expected = (ds.field("alpha").is_null()) & (ds.field("beta") == ds.scalar(3)) assert expr.equals(expected) @@ -543,8 +534,7 @@ def test_file_format_pickling(): formats = [ ds.IpcFileFormat(), ds.CsvFileFormat(), - ds.CsvFileFormat(pa.csv.ParseOptions( - delimiter="\t", ignore_empty_lines=True)), + ds.CsvFileFormat(pa.csv.ParseOptions(delimiter="\t", ignore_empty_lines=True)), ds.ParquetFileFormat(), ds.ParquetFileFormat( read_options=ds.ParquetReadOptions(use_buffered_stream=True) @@ -576,15 +566,13 @@ def test_filesystem_factory(mockfs, paths_or_selector, pre_buffer): options = ds.FileSystemFactoryOptions("subdir") options.partitioning = ds.DirectoryPartitioning( - pa.schema([pa.field("group", pa.int32()), - pa.field("key", pa.string())]) + pa.schema([pa.field("group", pa.int32()), pa.field("key", pa.string())]) ) assert options.partition_base_dir == "subdir" assert options.selector_ignore_prefixes == [".", "_"] assert options.exclude_invalid_files is False - factory = ds.FileSystemDatasetFactory( - mockfs, paths_or_selector, format, options) + factory = ds.FileSystemDatasetFactory(mockfs, paths_or_selector, format, options) inspected_schema = factory.inspect() assert factory.inspect().equals( @@ -637,8 +625,7 @@ def test_filesystem_factory(mockfs, paths_or_selector, pre_buffer): def test_make_fragment(multisourcefs): parquet_format = ds.ParquetFileFormat() - dataset = ds.dataset( - "/plain", filesystem=multisourcefs, format=parquet_format) + dataset = ds.dataset("/plain", filesystem=multisourcefs, format=parquet_format) for path in dataset.files: fragment = parquet_format.make_fragment(path, multisourcefs) @@ -700,8 +687,7 @@ def test_make_parquet_fragment_from_buffer(): ) ) - cases = [(arrays, ds.ParquetFileFormat()), - (dictionary_arrays, dictionary_format)] + cases = [(arrays, ds.ParquetFileFormat()), (dictionary_arrays, dictionary_format)] for arrays, format_ in cases: table = pa.table(arrays, names=["alpha", "num", "animal"]) @@ -726,8 +712,7 @@ def _create_dataset_for_fragments(tempdir, chunk_size=None, filesystem=None): path = str(tempdir / "test_parquet_dataset") # write_to_dataset currently requires pandas - pq.write_to_dataset(table, path, partition_cols=[ - "part"], chunk_size=chunk_size) + pq.write_to_dataset(table, path, partition_cols=["part"], chunk_size=chunk_size) dataset = ds.dataset( path, format="parquet", partitioning="hive", filesystem=filesystem ) @@ -790,8 +775,7 @@ def test_fragments_reconstruct(tempdir): table, dataset = _create_dataset_for_fragments(tempdir) def assert_yields_projected(fragment, row_slice, columns=None, filter=None): - actual = fragment.to_table( - schema=table.schema, columns=columns, filter=filter) + actual = fragment.to_table(schema=table.schema, columns=columns, filter=filter) column_names = columns if columns else table.column_names assert actual.column_names == column_names @@ -838,14 +822,13 @@ def assert_yields_projected(fragment, row_slice, columns=None, filter=None): fragment.filesystem, partition_expression=fragment.partition_expression, ) - assert_yields_projected(new_fragment, (0, 4), - filter=ds.field("part") == "a") + assert_yields_projected(new_fragment, (0, 4), filter=ds.field("part") == "a") # Fragments don't contain the 
partition's columns if not provided to the # `to_table(schema=...)` method. pattern = ( - r"No match for FieldRef.Name\(part\) in " + - fragment.physical_schema.to_string(False, False, False) + r"No match for FieldRef.Name\(part\) in " + + fragment.physical_schema.to_string(False, False, False) ) with pytest.raises(ValueError, match=pattern): new_fragment = parquet_format.make_fragment( @@ -929,8 +912,7 @@ def test_fragments_parquet_row_groups_dictionary(tempdir): @pytest.mark.parquet def test_fragments_parquet_ensure_metadata(tempdir, open_logging_fs): fs, assert_opens = open_logging_fs - _, dataset = _create_dataset_for_fragments( - tempdir, chunk_size=2, filesystem=fs) + _, dataset = _create_dataset_for_fragments(tempdir, chunk_size=2, filesystem=fs) fragment = list(dataset.get_fragments())[0] # with default discovery, no metadata loaded @@ -1179,8 +1161,7 @@ def test_fragments_parquet_row_groups_reconstruct(tempdir): @pytest.mark.parquet def test_fragments_parquet_subset_ids(tempdir, open_logging_fs): fs, assert_opens = open_logging_fs - table, dataset = _create_dataset_for_fragments( - tempdir, chunk_size=1, filesystem=fs) + table, dataset = _create_dataset_for_fragments(tempdir, chunk_size=1, filesystem=fs) fragment = list(dataset.get_fragments())[0] # select with row group ids @@ -1207,8 +1188,7 @@ def test_fragments_parquet_subset_ids(tempdir, open_logging_fs): @pytest.mark.parquet def test_fragments_parquet_subset_filter(tempdir, open_logging_fs): fs, assert_opens = open_logging_fs - table, dataset = _create_dataset_for_fragments( - tempdir, chunk_size=1, filesystem=fs) + table, dataset = _create_dataset_for_fragments(tempdir, chunk_size=1, filesystem=fs) fragment = list(dataset.get_fragments())[0] # select with filter @@ -1258,8 +1238,7 @@ def test_partitioning_factory(mockfs): assert isinstance(partitioning_factory, ds.PartitioningFactory) options.partitioning_factory = partitioning_factory - factory = ds.FileSystemDatasetFactory( - mockfs, paths_or_selector, format, options) + factory = ds.FileSystemDatasetFactory(mockfs, paths_or_selector, format, options) inspected_schema = factory.inspect() # i64/f64 from data, group/key from "/1/xxx" and "/2/yyy" paths expected_schema = pa.schema( @@ -1288,8 +1267,7 @@ def test_partitioning_factory_dictionary(mockfs, infer_dictionary): ["group", "key"], infer_dictionary=infer_dictionary ) - factory = ds.FileSystemDatasetFactory( - mockfs, paths_or_selector, format, options) + factory = ds.FileSystemDatasetFactory(mockfs, paths_or_selector, format, options) inferred_schema = factory.inspect() if infer_dictionary: @@ -1416,8 +1394,7 @@ def test_open_dataset_list_of_files(tempdir): tables, (path1, path2) = _create_directory_of_files(tempdir) table = pa.concat_tables(tables) - datasets = [ds.dataset([path1, path2]), - ds.dataset([str(path1), str(path2)])] + datasets = [ds.dataset([path1, path2]), ds.dataset([str(path1), str(path2)])] datasets += [pickle.loads(pickle.dumps(d)) for d in datasets] for dataset in datasets: @@ -1525,8 +1502,7 @@ def test_construct_empty_dataset(): assert table.num_rows == 0 assert table.num_columns == 0 - empty = ds.dataset([], schema=pa.schema( - [("a", pa.int64()), ("a", pa.string())])) + empty = ds.dataset([], schema=pa.schema([("a", pa.int64()), ("a", pa.string())])) table = empty.to_table() assert table.num_rows == 0 assert table.num_columns == 2 @@ -1577,15 +1553,13 @@ def test_open_dataset_partitioned_directory(tempdir): _check_dataset_from_path(path, full_table) # specify partition scheme with discovery - 
dataset = ds.dataset( - str(path), partitioning=ds.partitioning(flavor="hive")) + dataset = ds.dataset(str(path), partitioning=ds.partitioning(flavor="hive")) expected_schema = table.schema.append(pa.field("part", pa.int32())) assert dataset.schema.equals(expected_schema) # specify partition scheme with discovery and relative path with change_cwd(tempdir): - dataset = ds.dataset( - "dataset/", partitioning=ds.partitioning(flavor="hive")) + dataset = ds.dataset("dataset/", partitioning=ds.partitioning(flavor="hive")) expected_schema = table.schema.append(pa.field("part", pa.int32())) assert dataset.schema.equals(expected_schema) @@ -1596,8 +1570,7 @@ def test_open_dataset_partitioned_directory(tempdir): # specify partition scheme with explicit scheme dataset = ds.dataset( str(path), - partitioning=ds.partitioning( - pa.schema([("part", pa.int8())]), flavor="hive"), + partitioning=ds.partitioning(pa.schema([("part", pa.int8())]), flavor="hive"), ) expected_schema = table.schema.append(pa.field("part", pa.int8())) assert dataset.schema.equals(expected_schema) @@ -1719,8 +1692,7 @@ def test_open_dataset_partitioned_dictionary_type( part_keys1, part_keys2 = partition_keys for part1 in part_keys1: for part2 in part_keys2: - path = basepath / \ - fmt.format(part1 or null_value, part2 or null_value) + path = basepath / fmt.format(part1 or null_value, part2 or null_value) path.mkdir(parents=True) pq.write_table(table, path / "test.parquet") @@ -2274,8 +2246,7 @@ def _create_metadata_file(root_path): metadata_collector.append(metadata) metadata_path = root_path / "_metadata" - pq.write_metadata(schema, metadata_path, - metadata_collector=metadata_collector) + pq.write_metadata(schema, metadata_path, metadata_collector=metadata_collector) return metadata_path @@ -2398,8 +2369,7 @@ def test_filter_mismatching_schema(tempdir): # specifying explicit schema, but that mismatches the schema of the data schema = pa.schema([("col", pa.int64())]) - dataset = ds.dataset(tempdir / "data.parquet", - format="parquet", schema=schema) + dataset = ds.dataset(tempdir / "data.parquet", format="parquet", schema=schema) # filtering on a column with such type mismatch should give a proper error with pytest.raises(TypeError): @@ -2438,8 +2408,7 @@ def test_dataset_project_null_column(tempdir): f = tempdir / "test_dataset_project_null_column.parquet" df.to_parquet(f, engine="pyarrow") - dataset = ds.dataset(f, format="parquet", - schema=pa.schema([("col", pa.int64())])) + dataset = ds.dataset(f, format="parquet", schema=pa.schema([("col", pa.int64())])) expected = pa.table({"col": pa.array([None, None, None], pa.int64())}) assert dataset.to_table().equals(expected) @@ -2462,8 +2431,7 @@ def _check_dataset_roundtrip( assert set(file_paths) == set(expected_files) # check that reading back in as dataset gives the same result - dataset2 = ds.dataset(base_dir_path, format="feather", - partitioning=partitioning) + dataset2 = ds.dataset(base_dir_path, format="feather", partitioning=partitioning) assert dataset2.to_table().equals(dataset.to_table()) @@ -2548,8 +2516,7 @@ def test_write_dataset_partitioned_dict(tempdir): # directory partitioning, dictionary partition columns dataset = ds.dataset( - directory, partitioning=ds.HivePartitioning.discover( - infer_dictionary=True) + directory, partitioning=ds.HivePartitioning.discover(infer_dictionary=True) ) target = tempdir / "partitioned-dir-target" expected_paths = [ @@ -2577,8 +2544,7 @@ def test_write_dataset_use_threads(tempdir): _ = _create_parquet_dataset_partitioned(directory) 
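    # read the freshly written partitioned data back with hive partition
    # discovery so it can be re-written below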
dataset = ds.dataset(directory, partitioning="hive") - partitioning = ds.partitioning( - pa.schema([("part", pa.string())]), flavor="hive") + partitioning = ds.partitioning(pa.schema([("part", pa.string())]), flavor="hive") target1 = tempdir / "partitioned1" ds.write_dataset( @@ -2619,8 +2585,7 @@ def test_write_table(tempdir): # with partitioning base_dir = tempdir / "partitioned" - partitioning = ds.partitioning( - pa.schema([("part", pa.string())]), flavor="hive") + partitioning = ds.partitioning(pa.schema([("part", pa.string())]), flavor="hive") ds.write_dataset( table, base_dir, @@ -2694,8 +2659,7 @@ def test_write_table_partitioned_dict(tempdir): partitioning = ds.partitioning(table.select(["part"]).schema) base_dir = tempdir / "dataset" - ds.write_dataset(table, base_dir, format="feather", - partitioning=partitioning) + ds.write_dataset(table, base_dir, format="feather", partitioning=partitioning) # check roundtrip partitioning_read = ds.DirectoryPartitioning.discover( From 4506853455451759a6501e330ab906b962c4aff0 Mon Sep 17 00:00:00 2001 From: Weston Pace Date: Thu, 11 Feb 2021 17:22:43 -1000 Subject: [PATCH 16/33] Re-lint, it appears my IDE is using the wrong style file --- python/pyarrow/tests/test_dataset.py | 109 ++++++++++++++++++--------- 1 file changed, 72 insertions(+), 37 deletions(-) diff --git a/python/pyarrow/tests/test_dataset.py b/python/pyarrow/tests/test_dataset.py index 9bbffbc8d76..89bec21fd86 100644 --- a/python/pyarrow/tests/test_dataset.py +++ b/python/pyarrow/tests/test_dataset.py @@ -200,7 +200,8 @@ def dataset(mockfs): selector = fs.FileSelector("subdir", recursive=True) options = ds.FileSystemFactoryOptions("subdir") options.partitioning = ds.DirectoryPartitioning( - pa.schema([pa.field("group", pa.int32()), pa.field("key", pa.string())]) + pa.schema([pa.field("group", pa.int32()), + pa.field("key", pa.string())]) ) factory = ds.FileSystemDatasetFactory(mockfs, selector, format, options) return factory.finish() @@ -338,7 +339,8 @@ def test_dataset(dataset): def test_scanner(dataset): - scanner = ds.Scanner.from_dataset(dataset, memory_pool=pa.default_memory_pool()) + scanner = ds.Scanner.from_dataset( + dataset, memory_pool=pa.default_memory_pool()) assert isinstance(scanner, ds.Scanner) assert len(list(scanner.scan())) == 2 @@ -368,13 +370,15 @@ def test_abstract_classes(): def test_partitioning(): - schema = pa.schema([pa.field("i64", pa.int64()), pa.field("f64", pa.float64())]) + schema = pa.schema([pa.field("i64", pa.int64()), + pa.field("f64", pa.float64())]) for klass in [ds.DirectoryPartitioning, ds.HivePartitioning]: partitioning = klass(schema) assert isinstance(partitioning, ds.Partitioning) partitioning = ds.DirectoryPartitioning( - pa.schema([pa.field("group", pa.int64()), pa.field("key", pa.float64())]) + pa.schema([pa.field("group", pa.int64()), + pa.field("key", pa.float64())]) ) expr = partitioning.parse("/3/3.14") assert isinstance(expr, ds.Expression) @@ -386,10 +390,12 @@ def test_partitioning(): partitioning.parse("/prefix/3/aaa") partitioning = ds.HivePartitioning( - pa.schema([pa.field("alpha", pa.int64()), pa.field("beta", pa.int64())]) + pa.schema([pa.field("alpha", pa.int64()), + pa.field("beta", pa.int64())]) ) expr = partitioning.parse("/alpha=0/beta=3") - expected = (ds.field("alpha") == ds.scalar(0)) & (ds.field("beta") == ds.scalar(3)) + expected = (ds.field("alpha") == ds.scalar(0)) & ( + ds.field("beta") == ds.scalar(3)) assert expr.equals(expected) for shouldfail in ["/alpha=one/beta=2", "/alpha=one", "/beta=two"]: @@ 
-397,12 +403,14 @@ def test_partitioning(): partitioning.parse(shouldfail) partitioning = ds.HivePartitioning( - pa.schema([pa.field("alpha", pa.int64()), pa.field("beta", pa.int64())]), + pa.schema([pa.field("alpha", pa.int64()), + pa.field("beta", pa.int64())]), None, "xyz", ) expr = partitioning.parse("/alpha=xyz/beta=3") - expected = (ds.field("alpha").is_null()) & (ds.field("beta") == ds.scalar(3)) + expected = (ds.field("alpha").is_null()) & ( + ds.field("beta") == ds.scalar(3)) assert expr.equals(expected) @@ -534,7 +542,8 @@ def test_file_format_pickling(): formats = [ ds.IpcFileFormat(), ds.CsvFileFormat(), - ds.CsvFileFormat(pa.csv.ParseOptions(delimiter="\t", ignore_empty_lines=True)), + ds.CsvFileFormat(pa.csv.ParseOptions( + delimiter="\t", ignore_empty_lines=True)), ds.ParquetFileFormat(), ds.ParquetFileFormat( read_options=ds.ParquetReadOptions(use_buffered_stream=True) @@ -566,13 +575,15 @@ def test_filesystem_factory(mockfs, paths_or_selector, pre_buffer): options = ds.FileSystemFactoryOptions("subdir") options.partitioning = ds.DirectoryPartitioning( - pa.schema([pa.field("group", pa.int32()), pa.field("key", pa.string())]) + pa.schema([pa.field("group", pa.int32()), + pa.field("key", pa.string())]) ) assert options.partition_base_dir == "subdir" assert options.selector_ignore_prefixes == [".", "_"] assert options.exclude_invalid_files is False - factory = ds.FileSystemDatasetFactory(mockfs, paths_or_selector, format, options) + factory = ds.FileSystemDatasetFactory( + mockfs, paths_or_selector, format, options) inspected_schema = factory.inspect() assert factory.inspect().equals( @@ -625,7 +636,8 @@ def test_filesystem_factory(mockfs, paths_or_selector, pre_buffer): def test_make_fragment(multisourcefs): parquet_format = ds.ParquetFileFormat() - dataset = ds.dataset("/plain", filesystem=multisourcefs, format=parquet_format) + dataset = ds.dataset( + "/plain", filesystem=multisourcefs, format=parquet_format) for path in dataset.files: fragment = parquet_format.make_fragment(path, multisourcefs) @@ -687,7 +699,8 @@ def test_make_parquet_fragment_from_buffer(): ) ) - cases = [(arrays, ds.ParquetFileFormat()), (dictionary_arrays, dictionary_format)] + cases = [(arrays, ds.ParquetFileFormat()), + (dictionary_arrays, dictionary_format)] for arrays, format_ in cases: table = pa.table(arrays, names=["alpha", "num", "animal"]) @@ -712,7 +725,8 @@ def _create_dataset_for_fragments(tempdir, chunk_size=None, filesystem=None): path = str(tempdir / "test_parquet_dataset") # write_to_dataset currently requires pandas - pq.write_to_dataset(table, path, partition_cols=["part"], chunk_size=chunk_size) + pq.write_to_dataset(table, path, partition_cols=[ + "part"], chunk_size=chunk_size) dataset = ds.dataset( path, format="parquet", partitioning="hive", filesystem=filesystem ) @@ -775,7 +789,8 @@ def test_fragments_reconstruct(tempdir): table, dataset = _create_dataset_for_fragments(tempdir) def assert_yields_projected(fragment, row_slice, columns=None, filter=None): - actual = fragment.to_table(schema=table.schema, columns=columns, filter=filter) + actual = fragment.to_table( + schema=table.schema, columns=columns, filter=filter) column_names = columns if columns else table.column_names assert actual.column_names == column_names @@ -822,13 +837,14 @@ def assert_yields_projected(fragment, row_slice, columns=None, filter=None): fragment.filesystem, partition_expression=fragment.partition_expression, ) - assert_yields_projected(new_fragment, (0, 4), filter=ds.field("part") == "a") + 
assert_yields_projected(new_fragment, (0, 4), + filter=ds.field("part") == "a") # Fragments don't contain the partition's columns if not provided to the # `to_table(schema=...)` method. pattern = ( - r"No match for FieldRef.Name\(part\) in " - + fragment.physical_schema.to_string(False, False, False) + r"No match for FieldRef.Name\(part\) in " + + fragment.physical_schema.to_string(False, False, False) ) with pytest.raises(ValueError, match=pattern): new_fragment = parquet_format.make_fragment( @@ -912,7 +928,8 @@ def test_fragments_parquet_row_groups_dictionary(tempdir): @pytest.mark.parquet def test_fragments_parquet_ensure_metadata(tempdir, open_logging_fs): fs, assert_opens = open_logging_fs - _, dataset = _create_dataset_for_fragments(tempdir, chunk_size=2, filesystem=fs) + _, dataset = _create_dataset_for_fragments( + tempdir, chunk_size=2, filesystem=fs) fragment = list(dataset.get_fragments())[0] # with default discovery, no metadata loaded @@ -1161,7 +1178,8 @@ def test_fragments_parquet_row_groups_reconstruct(tempdir): @pytest.mark.parquet def test_fragments_parquet_subset_ids(tempdir, open_logging_fs): fs, assert_opens = open_logging_fs - table, dataset = _create_dataset_for_fragments(tempdir, chunk_size=1, filesystem=fs) + table, dataset = _create_dataset_for_fragments( + tempdir, chunk_size=1, filesystem=fs) fragment = list(dataset.get_fragments())[0] # select with row group ids @@ -1188,7 +1206,8 @@ def test_fragments_parquet_subset_ids(tempdir, open_logging_fs): @pytest.mark.parquet def test_fragments_parquet_subset_filter(tempdir, open_logging_fs): fs, assert_opens = open_logging_fs - table, dataset = _create_dataset_for_fragments(tempdir, chunk_size=1, filesystem=fs) + table, dataset = _create_dataset_for_fragments( + tempdir, chunk_size=1, filesystem=fs) fragment = list(dataset.get_fragments())[0] # select with filter @@ -1238,7 +1257,8 @@ def test_partitioning_factory(mockfs): assert isinstance(partitioning_factory, ds.PartitioningFactory) options.partitioning_factory = partitioning_factory - factory = ds.FileSystemDatasetFactory(mockfs, paths_or_selector, format, options) + factory = ds.FileSystemDatasetFactory( + mockfs, paths_or_selector, format, options) inspected_schema = factory.inspect() # i64/f64 from data, group/key from "/1/xxx" and "/2/yyy" paths expected_schema = pa.schema( @@ -1267,7 +1287,8 @@ def test_partitioning_factory_dictionary(mockfs, infer_dictionary): ["group", "key"], infer_dictionary=infer_dictionary ) - factory = ds.FileSystemDatasetFactory(mockfs, paths_or_selector, format, options) + factory = ds.FileSystemDatasetFactory( + mockfs, paths_or_selector, format, options) inferred_schema = factory.inspect() if infer_dictionary: @@ -1394,7 +1415,8 @@ def test_open_dataset_list_of_files(tempdir): tables, (path1, path2) = _create_directory_of_files(tempdir) table = pa.concat_tables(tables) - datasets = [ds.dataset([path1, path2]), ds.dataset([str(path1), str(path2)])] + datasets = [ds.dataset([path1, path2]), + ds.dataset([str(path1), str(path2)])] datasets += [pickle.loads(pickle.dumps(d)) for d in datasets] for dataset in datasets: @@ -1502,7 +1524,8 @@ def test_construct_empty_dataset(): assert table.num_rows == 0 assert table.num_columns == 0 - empty = ds.dataset([], schema=pa.schema([("a", pa.int64()), ("a", pa.string())])) + empty = ds.dataset([], schema=pa.schema( + [("a", pa.int64()), ("a", pa.string())])) table = empty.to_table() assert table.num_rows == 0 assert table.num_columns == 2 @@ -1553,13 +1576,15 @@ def 
test_open_dataset_partitioned_directory(tempdir): _check_dataset_from_path(path, full_table) # specify partition scheme with discovery - dataset = ds.dataset(str(path), partitioning=ds.partitioning(flavor="hive")) + dataset = ds.dataset( + str(path), partitioning=ds.partitioning(flavor="hive")) expected_schema = table.schema.append(pa.field("part", pa.int32())) assert dataset.schema.equals(expected_schema) # specify partition scheme with discovery and relative path with change_cwd(tempdir): - dataset = ds.dataset("dataset/", partitioning=ds.partitioning(flavor="hive")) + dataset = ds.dataset( + "dataset/", partitioning=ds.partitioning(flavor="hive")) expected_schema = table.schema.append(pa.field("part", pa.int32())) assert dataset.schema.equals(expected_schema) @@ -1570,7 +1595,8 @@ def test_open_dataset_partitioned_directory(tempdir): # specify partition scheme with explicit scheme dataset = ds.dataset( str(path), - partitioning=ds.partitioning(pa.schema([("part", pa.int8())]), flavor="hive"), + partitioning=ds.partitioning( + pa.schema([("part", pa.int8())]), flavor="hive"), ) expected_schema = table.schema.append(pa.field("part", pa.int8())) assert dataset.schema.equals(expected_schema) @@ -1692,7 +1718,8 @@ def test_open_dataset_partitioned_dictionary_type( part_keys1, part_keys2 = partition_keys for part1 in part_keys1: for part2 in part_keys2: - path = basepath / fmt.format(part1 or null_value, part2 or null_value) + path = basepath / \ + fmt.format(part1 or null_value, part2 or null_value) path.mkdir(parents=True) pq.write_table(table, path / "test.parquet") @@ -2246,7 +2273,8 @@ def _create_metadata_file(root_path): metadata_collector.append(metadata) metadata_path = root_path / "_metadata" - pq.write_metadata(schema, metadata_path, metadata_collector=metadata_collector) + pq.write_metadata(schema, metadata_path, + metadata_collector=metadata_collector) return metadata_path @@ -2369,7 +2397,8 @@ def test_filter_mismatching_schema(tempdir): # specifying explicit schema, but that mismatches the schema of the data schema = pa.schema([("col", pa.int64())]) - dataset = ds.dataset(tempdir / "data.parquet", format="parquet", schema=schema) + dataset = ds.dataset(tempdir / "data.parquet", + format="parquet", schema=schema) # filtering on a column with such type mismatch should give a proper error with pytest.raises(TypeError): @@ -2408,7 +2437,8 @@ def test_dataset_project_null_column(tempdir): f = tempdir / "test_dataset_project_null_column.parquet" df.to_parquet(f, engine="pyarrow") - dataset = ds.dataset(f, format="parquet", schema=pa.schema([("col", pa.int64())])) + dataset = ds.dataset(f, format="parquet", + schema=pa.schema([("col", pa.int64())])) expected = pa.table({"col": pa.array([None, None, None], pa.int64())}) assert dataset.to_table().equals(expected) @@ -2431,7 +2461,8 @@ def _check_dataset_roundtrip( assert set(file_paths) == set(expected_files) # check that reading back in as dataset gives the same result - dataset2 = ds.dataset(base_dir_path, format="feather", partitioning=partitioning) + dataset2 = ds.dataset(base_dir_path, format="feather", + partitioning=partitioning) assert dataset2.to_table().equals(dataset.to_table()) @@ -2516,7 +2547,8 @@ def test_write_dataset_partitioned_dict(tempdir): # directory partitioning, dictionary partition columns dataset = ds.dataset( - directory, partitioning=ds.HivePartitioning.discover(infer_dictionary=True) + directory, partitioning=ds.HivePartitioning.discover( + infer_dictionary=True) ) target = tempdir / "partitioned-dir-target" 
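    # Editor's note (descriptive aside, not part of the patch): the source
    # dataset above was discovered with infer_dictionary=True, so its "part"
    # column is dictionary-encoded; the expected_paths listed below describe
    # the directory layout that re-writing by that column is expected to
    # produce.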
expected_paths = [ @@ -2544,7 +2576,8 @@ def test_write_dataset_use_threads(tempdir): _ = _create_parquet_dataset_partitioned(directory) dataset = ds.dataset(directory, partitioning="hive") - partitioning = ds.partitioning(pa.schema([("part", pa.string())]), flavor="hive") + partitioning = ds.partitioning( + pa.schema([("part", pa.string())]), flavor="hive") target1 = tempdir / "partitioned1" ds.write_dataset( @@ -2585,7 +2618,8 @@ def test_write_table(tempdir): # with partitioning base_dir = tempdir / "partitioned" - partitioning = ds.partitioning(pa.schema([("part", pa.string())]), flavor="hive") + partitioning = ds.partitioning( + pa.schema([("part", pa.string())]), flavor="hive") ds.write_dataset( table, base_dir, @@ -2659,7 +2693,8 @@ def test_write_table_partitioned_dict(tempdir): partitioning = ds.partitioning(table.select(["part"]).schema) base_dir = tempdir / "dataset" - ds.write_dataset(table, base_dir, format="feather", partitioning=partitioning) + ds.write_dataset(table, base_dir, format="feather", + partitioning=partitioning) # check roundtrip partitioning_read = ds.DirectoryPartitioning.discover( From 3ca5f348636214452bbfac535312d928420b8b12 Mon Sep 17 00:00:00 2001 From: Weston Pace Date: Thu, 11 Feb 2021 21:50:09 -1000 Subject: [PATCH 17/33] Final lint pass. Turns out I was relying on black which was messing up everything --- python/pyarrow/_compute.pyx | 10 +- python/pyarrow/_dataset.pyx | 10 +- python/pyarrow/compute.py | 107 +-- python/pyarrow/includes/libarrow.pxd | 3 +- python/pyarrow/tests/test_dataset.py | 1314 ++++++++++++-------------- 5 files changed, 647 insertions(+), 797 deletions(-) diff --git a/python/pyarrow/_compute.pyx b/python/pyarrow/_compute.pyx index d3d5dc510a3..3cb152aa381 100644 --- a/python/pyarrow/_compute.pyx +++ b/python/pyarrow/_compute.pyx @@ -659,13 +659,15 @@ cdef class _DictionaryEncodeOptions(FunctionOptions): def _set_options(self, null_encoding_behavior): if null_encoding_behavior == 'encode': self.dictionary_encode_options.reset( - new CDictionaryEncodeOptions(CDictionaryEncodeNullEncodingBehavior_ENCODE)) + new CDictionaryEncodeOptions( + CDictionaryEncodeNullEncodingBehavior_ENCODE)) elif null_encoding_behavior == 'mask': self.dictionary_encode_options.reset( - new CDictionaryEncodeOptions(CDictionaryEncodeNullEncodingBehavior_MASK)) + new CDictionaryEncodeOptions( + CDictionaryEncodeNullEncodingBehavior_MASK)) else: - raise ValueError('"{}" is not a valid null_encoding_behavior'.format( - null_encoding_behavior)) + raise ValueError('"{}" is not a valid null_encoding_behavior' + .format(null_encoding_behavior)) class DictionaryEncodeOptions(_DictionaryEncodeOptions): diff --git a/python/pyarrow/_dataset.pyx b/python/pyarrow/_dataset.pyx index acd5d9602b5..e38ea626d79 100644 --- a/python/pyarrow/_dataset.pyx +++ b/python/pyarrow/_dataset.pyx @@ -1614,7 +1614,11 @@ cdef class HivePartitioning(Partitioning): cdef: CHivePartitioning* hive_partitioning - def __init__(self, Schema schema not None, dictionaries=None, null_fallback="__HIVE_DEFAULT_PARTITION__"): + def __init__(self, + Schema schema not None, + dictionaries=None, + null_fallback="__HIVE_DEFAULT_PARTITION__"): + cdef: shared_ptr[CHivePartitioning] c_partitioning c_string c_null_fallback = tobytes(null_fallback) @@ -1631,7 +1635,9 @@ cdef class HivePartitioning(Partitioning): self.hive_partitioning = sp.get() @staticmethod - def discover(infer_dictionary=False, max_partition_dictionary_size=0, null_fallback="__HIVE_DEFAULT_PARTITION__"): + def discover(infer_dictionary=False, + 
max_partition_dictionary_size=0, + null_fallback="__HIVE_DEFAULT_PARTITION__"): """ Discover a HivePartitioning. diff --git a/python/pyarrow/compute.py b/python/pyarrow/compute.py index 1e437d43d4c..3d7f5ecb4c3 100644 --- a/python/pyarrow/compute.py +++ b/python/pyarrow/compute.py @@ -69,14 +69,14 @@ def _get_arg_names(func): arg_names = ["left", "right"] else: raise NotImplementedError( - f"unsupported arity: {func.arity} (function: {func.name})" - ) + f"unsupported arity: {func.arity} (function: {func.name})") return arg_names def _decorate_compute_function(wrapper, exposed_name, func, option_class): - wrapper.__arrow_compute_function__ = dict(name=func.name, arity=func.arity) + wrapper.__arrow_compute_function__ = dict(name=func.name, + arity=func.arity) wrapper.__name__ = exposed_name wrapper.__qualname__ = exposed_name @@ -86,64 +86,47 @@ def _decorate_compute_function(wrapper, exposed_name, func, option_class): summary = cpp_doc.summary if not summary: arg_str = "arguments" if func.arity > 1 else "argument" - summary = "Call compute function {!r} with the given {}".format( - func.name, arg_str - ) + summary = ("Call compute function {!r} with the given {}" + .format(func.name, arg_str)) description = cpp_doc.description arg_names = _get_arg_names(func) - doc_pieces.append( - """\ + doc_pieces.append("""\ {}. - """.format( - summary - ) - ) + """.format(summary)) if description: doc_pieces.append("{}\n\n".format(description)) - doc_pieces.append( - """\ + doc_pieces.append("""\ Parameters ---------- - """ - ) + """) for arg_name in arg_names: - if func.kind in ("vector", "scalar_aggregate"): - arg_type = "Array-like" + if func.kind in ('vector', 'scalar_aggregate'): + arg_type = 'Array-like' else: - arg_type = "Array-like or scalar-like" - doc_pieces.append( - """\ + arg_type = 'Array-like or scalar-like' + doc_pieces.append("""\ {} : {} Argument to compute function - """.format( - arg_name, arg_type - ) - ) + """.format(arg_name, arg_type)) - doc_pieces.append( - """\ + doc_pieces.append("""\ memory_pool : pyarrow.MemoryPool, optional If not passed, will allocate memory from the default memory pool. - """ - ) + """) if option_class is not None: - doc_pieces.append( - """\ + doc_pieces.append("""\ options : pyarrow.compute.{0}, optional Parameters altering compute function semantics **kwargs : optional Parameters for {0} constructor. Either `options` or `**kwargs` can be passed, but not both at the same time. 
- """.format( - option_class.__name__ - ) - ) + """.format(option_class.__name__)) wrapper.__doc__ = "".join(dedent(s) for s in doc_pieces) return wrapper @@ -156,10 +139,8 @@ def _get_options_class(func): try: return globals()[class_name] except KeyError: - warnings.warn( - "Python binding for {} not exposed".format( - class_name), RuntimeWarning - ) + warnings.warn("Python binding for {} not exposed" + .format(class_name), RuntimeWarning) return None @@ -169,8 +150,8 @@ def _handle_options(name, option_class, options, kwargs): return option_class(**kwargs) raise TypeError( "Function {!r} called with both an 'options' argument " - "and additional named arguments".format(name) - ) + "and additional named arguments" + .format(name)) if options is not None: if isinstance(options, dict): @@ -178,25 +159,20 @@ def _handle_options(name, option_class, options, kwargs): elif isinstance(options, option_class): return options raise TypeError( - "Function {!r} expected a {} parameter, got {}".format( - name, option_class, type(options) - ) - ) + "Function {!r} expected a {} parameter, got {}" + .format(name, option_class, type(options))) return options -_wrapper_template = dedent( - """\ +_wrapper_template = dedent("""\ def make_wrapper(func, option_class): def {func_name}({args_sig}{kwonly}, memory_pool=None): return func.call([{args_sig}], None, memory_pool) return {func_name} - """ -) + """) -_wrapper_options_template = dedent( - """\ +_wrapper_options_template = dedent("""\ def make_wrapper(func, option_class): def {func_name}({args_sig}{kwonly}, options=None, memory_pool=None, **kwargs): @@ -204,15 +180,14 @@ def {func_name}({args_sig}{kwonly}, options=None, memory_pool=None, kwargs) return func.call([{args_sig}], options, memory_pool) return {func_name} - """ -) + """) def _wrap_function(name, func): option_class = _get_options_class(func) arg_names = _get_arg_names(func) - args_sig = ", ".join(arg_names) - kwonly = "" if arg_names[-1].startswith("*") else ", *" + args_sig = ', '.join(arg_names) + kwonly = '' if arg_names[-1].startswith('*') else ', *' # Generate templated wrapper, so that the signature matches # the documented argument names. 
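# Editor's note: the hunk above reformats pyarrow's template-and-exec wrapper
# generation. As background, here is a minimal, self-contained sketch of that
# technique (hypothetical names, not code from this patch): a source template
# is filled in with the real argument names and exec'd, so the resulting
# wrapper carries the documented signature instead of a generic *args one.
from textwrap import dedent

_template = dedent("""\
    def make_wrapper(func):
        def {func_name}({args_sig}, *, memory_pool=None):
            # Forward to the wrapped callable; memory_pool is accepted but
            # unused in this simplified sketch.
            return func({args_sig})
        return {func_name}
    """)


def _wrap(name, func, arg_names):
    ns = {}
    exec(_template.format(func_name=name, args_sig=", ".join(arg_names)),
         globals(), ns)
    return ns["make_wrapper"](func)


# The generated wrapper exposes the documented parameter names:
add = _wrap("add", lambda left, right: left + right, ["left", "right"])
assert add(1, 2) == 3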
@@ -221,11 +196,9 @@ def _wrap_function(name, func): template = _wrapper_options_template else: template = _wrapper_template - exec( - template.format(func_name=name, args_sig=args_sig, - kwonly=kwonly), globals(), ns - ) - wrapper = ns["make_wrapper"](func, option_class) + exec(template.format(func_name=name, args_sig=args_sig, kwonly=kwonly), + globals(), ns) + wrapper = ns['make_wrapper'](func, option_class) return _decorate_compute_function(wrapper, name, func, option_class) @@ -241,7 +214,8 @@ def _make_global_functions(): reg = function_registry() # Avoid clashes with Python keywords - rewrites = {"and": "and_", "or": "or_"} + rewrites = {'and': 'and_', + 'or': 'or_'} for cpp_name in reg.list_functions(): name = rewrites.get(cpp_name, cpp_name) @@ -325,7 +299,8 @@ def match_substring(array, pattern): ------- result : pyarrow.Array or pyarrow.ChunkedArray """ - return call_function("match_substring", [array], MatchSubstringOptions(pattern)) + return call_function("match_substring", [array], + MatchSubstringOptions(pattern)) def sum(array): @@ -340,7 +315,7 @@ def sum(array): ------- sum : pyarrow.Scalar """ - return call_function("sum", [array]) + return call_function('sum', [array]) def mode(array, n=1): @@ -372,7 +347,7 @@ def mode(array, n=1): return call_function("mode", [array], options) -def filter(data, mask, null_selection_behavior="drop"): +def filter(data, mask, null_selection_behavior='drop'): """ Select values (or records) from array- or table-like data given boolean filter, where true values are selected. @@ -413,7 +388,7 @@ def filter(data, mask, null_selection_behavior="drop"): ] """ options = FilterOptions(null_selection_behavior) - return call_function("filter", [data, mask], options) + return call_function('filter', [data, mask], options) def take(data, indices, *, boundscheck=True, memory_pool=None): @@ -454,7 +429,7 @@ def take(data, indices, *, boundscheck=True, memory_pool=None): ] """ options = TakeOptions(boundscheck=boundscheck) - return call_function("take", [data, indices], options, memory_pool) + return call_function('take', [data, indices], options, memory_pool) def fill_null(values, fill_value): diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd index 6423741ae50..ba3c3ad7d2b 100644 --- a/python/pyarrow/includes/libarrow.pxd +++ b/python/pyarrow/includes/libarrow.pxd @@ -1812,7 +1812,8 @@ cdef extern from "arrow/compute/api.h" namespace "arrow::compute" nogil: cdef cppclass CDictionaryEncodeOptions \ "arrow::compute::DictionaryEncodeOptions"(CFunctionOptions): CDictionaryEncodeOptions() - CDictionaryEncodeOptions(CDictionaryEncodeNullEncodingBehavior null_encoding) + CDictionaryEncodeOptions( + CDictionaryEncodeNullEncodingBehavior null_encoding) CDictionaryEncodeNullEncodingBehavior null_encoding cdef cppclass CTakeOptions \ diff --git a/python/pyarrow/tests/test_dataset.py b/python/pyarrow/tests/test_dataset.py index 89bec21fd86..b2c1fc9f030 100644 --- a/python/pyarrow/tests/test_dataset.py +++ b/python/pyarrow/tests/test_dataset.py @@ -21,7 +21,6 @@ import textwrap import numpy as np -from numpy.core.fromnumeric import partition import pytest import pyarrow as pa @@ -50,25 +49,23 @@ def _generate_data(n): day = datetime.datetime(2000, 1, 1) interval = datetime.timedelta(days=5) - colors = itertools.cycle(["green", "blue", "yellow", "red", "orange"]) + colors = itertools.cycle(['green', 'blue', 'yellow', 'red', 'orange']) data = [] for i in range(n): data.append((day, i, float(i), next(colors))) day += interval - 
return pd.DataFrame(data, columns=["date", "index", "value", "color"]) + return pd.DataFrame(data, columns=['date', 'index', 'value', 'color']) def _table_from_pandas(df): - schema = pa.schema( - [ - pa.field("date", pa.date32()), - pa.field("index", pa.int64()), - pa.field("value", pa.float64()), - pa.field("color", pa.string()), - ] - ) + schema = pa.schema([ + pa.field('date', pa.date32()), + pa.field('index', pa.int64()), + pa.field('value', pa.float64()), + pa.field('color', pa.string()), + ]) table = pa.Table.from_pandas(df, schema=schema, preserve_index=False) return table.replace_schema_metadata() @@ -81,28 +78,26 @@ def mockfs(): mockfs = fs._MockFileSystem() directories = [ - "subdir/1/xxx", - "subdir/2/yyy", + 'subdir/1/xxx', + 'subdir/2/yyy', ] for i, directory in enumerate(directories): - path = "{}/file{}.parquet".format(directory, i) + path = '{}/file{}.parquet'.format(directory, i) mockfs.create_dir(directory) with mockfs.open_output_stream(path) as out: data = [ list(range(5)), list(map(float, range(5))), list(map(str, range(5))), - [i] * 5, + [i] * 5 ] - schema = pa.schema( - [ - pa.field("i64", pa.int64()), - pa.field("f64", pa.float64()), - pa.field("str", pa.string()), - pa.field("const", pa.int64()), - ] - ) + schema = pa.schema([ + pa.field('i64', pa.int64()), + pa.field('f64', pa.float64()), + pa.field('str', pa.string()), + pa.field('const', pa.int64()), + ]) batch = pa.record_batch(data, schema=schema) table = pa.Table.from_batches([batch]) @@ -143,10 +138,10 @@ def assert_opens(expected_opened): return fs, assert_opens -@pytest.fixture(scope="module") +@pytest.fixture(scope='module') def multisourcefs(request): - request.config.pyarrow.requires("pandas") - request.config.pyarrow.requires("parquet") + request.config.pyarrow.requires('pandas') + request.config.pyarrow.requires('parquet') import pyarrow.parquet as pq df = _generate_data(1000) @@ -158,35 +153,35 @@ def multisourcefs(request): # create a directory containing a flat sequence of parquet files without # any partitioning involved - mockfs.create_dir("plain") + mockfs.create_dir('plain') for i, chunk in enumerate(np.array_split(df_a, 10)): - path = "plain/chunk-{}.parquet".format(i) + path = 'plain/chunk-{}.parquet'.format(i) with mockfs.open_output_stream(path) as out: pq.write_table(_table_from_pandas(chunk), out) # create one with schema partitioning by weekday and color - mockfs.create_dir("schema") + mockfs.create_dir('schema') for part, chunk in df_b.groupby([df_b.date.dt.dayofweek, df_b.color]): - folder = "schema/{}/{}".format(*part) - path = "{}/chunk.parquet".format(folder) + folder = 'schema/{}/{}'.format(*part) + path = '{}/chunk.parquet'.format(folder) mockfs.create_dir(folder) with mockfs.open_output_stream(path) as out: pq.write_table(_table_from_pandas(chunk), out) # create one with hive partitioning by year and month - mockfs.create_dir("hive") + mockfs.create_dir('hive') for part, chunk in df_c.groupby([df_c.date.dt.year, df_c.date.dt.month]): - folder = "hive/year={}/month={}".format(*part) - path = "{}/chunk.parquet".format(folder) + folder = 'hive/year={}/month={}'.format(*part) + path = '{}/chunk.parquet'.format(folder) mockfs.create_dir(folder) with mockfs.open_output_stream(path) as out: pq.write_table(_table_from_pandas(chunk), out) # create one with hive partitioning by color - mockfs.create_dir("hive_color") + mockfs.create_dir('hive_color') for part, chunk in df_d.groupby(["color"]): - folder = "hive_color/color={}".format(*part) - path = "{}/chunk.parquet".format(folder) + 
folder = 'hive_color/color={}'.format(*part) + path = '{}/chunk.parquet'.format(folder) mockfs.create_dir(folder) with mockfs.open_output_stream(path) as out: pq.write_table(_table_from_pandas(chunk), out) @@ -197,41 +192,36 @@ def multisourcefs(request): @pytest.fixture def dataset(mockfs): format = ds.ParquetFileFormat() - selector = fs.FileSelector("subdir", recursive=True) - options = ds.FileSystemFactoryOptions("subdir") + selector = fs.FileSelector('subdir', recursive=True) + options = ds.FileSystemFactoryOptions('subdir') options.partitioning = ds.DirectoryPartitioning( - pa.schema([pa.field("group", pa.int32()), - pa.field("key", pa.string())]) + pa.schema([ + pa.field('group', pa.int32()), + pa.field('key', pa.string()) + ]) ) factory = ds.FileSystemDatasetFactory(mockfs, selector, format, options) return factory.finish() def test_filesystem_dataset(mockfs): - schema = pa.schema([pa.field("const", pa.int64())]) + schema = pa.schema([ + pa.field('const', pa.int64()) + ]) file_format = ds.ParquetFileFormat() - paths = ["subdir/1/xxx/file0.parquet", "subdir/2/yyy/file1.parquet"] - partitions = [ds.field("part") == x for x in range(1, 3)] - fragments = [ - file_format.make_fragment(path, mockfs, part) - for path, part in zip(paths, partitions) - ] - root_partition = ds.field("level") == ds.scalar(1337) + paths = ['subdir/1/xxx/file0.parquet', 'subdir/2/yyy/file1.parquet'] + partitions = [ds.field('part') == x for x in range(1, 3)] + fragments = [file_format.make_fragment(path, mockfs, part) + for path, part in zip(paths, partitions)] + root_partition = ds.field('level') == ds.scalar(1337) dataset_from_fragments = ds.FileSystemDataset( - fragments, - schema=schema, - format=file_format, - filesystem=mockfs, - root_partition=root_partition, + fragments, schema=schema, format=file_format, + filesystem=mockfs, root_partition=root_partition, ) dataset_from_paths = ds.FileSystemDataset.from_paths( - paths, - schema=schema, - format=file_format, - filesystem=mockfs, - partitions=partitions, - root_partition=root_partition, + paths, schema=schema, format=file_format, filesystem=mockfs, + partitions=partitions, root_partition=root_partition, ) for dataset in [dataset_from_fragments, dataset_from_paths]: @@ -278,9 +268,8 @@ def test_filesystem_dataset(mockfs): ds.FileSystemDataset(fragments, file_format, schema) # validation of root_partition with pytest.raises(TypeError, match="incorrect type"): - ds.FileSystemDataset( - fragments, schema=schema, format=file_format, root_partition=1 - ) + ds.FileSystemDataset(fragments, schema=schema, + format=file_format, root_partition=1) # missing required argument in from_paths with pytest.raises(TypeError, match="incorrect type"): ds.FileSystemDataset.from_paths(fragments, format=file_format) @@ -288,15 +277,15 @@ def test_filesystem_dataset(mockfs): def test_filesystem_dataset_no_filesystem_interaction(): # ARROW-8283 - schema = pa.schema([pa.field("f1", pa.int64())]) + schema = pa.schema([ + pa.field('f1', pa.int64()) + ]) file_format = ds.IpcFileFormat() - paths = ["nonexistingfile.arrow"] + paths = ['nonexistingfile.arrow'] # creating the dataset itself doesn't raise dataset = ds.FileSystemDataset.from_paths( - paths, - schema=schema, - format=file_format, + paths, schema=schema, format=file_format, filesystem=fs.LocalFileSystem(), ) @@ -328,28 +317,27 @@ def test_dataset(dataset): assert isinstance(table, pa.Table) assert len(table) == 10 - condition = ds.field("i64") == 1 + condition = ds.field('i64') == 1 result = dataset.to_table(use_threads=True, 
filter=condition).to_pydict() # don't rely on the scanning order - assert result["i64"] == [1, 1] - assert result["f64"] == [1.0, 1.0] - assert sorted(result["group"]) == [1, 2] - assert sorted(result["key"]) == ["xxx", "yyy"] + assert result['i64'] == [1, 1] + assert result['f64'] == [1., 1.] + assert sorted(result['group']) == [1, 2] + assert sorted(result['key']) == ['xxx', 'yyy'] def test_scanner(dataset): - scanner = ds.Scanner.from_dataset( - dataset, memory_pool=pa.default_memory_pool()) + scanner = ds.Scanner.from_dataset(dataset, + memory_pool=pa.default_memory_pool()) assert isinstance(scanner, ds.Scanner) assert len(list(scanner.scan())) == 2 with pytest.raises(pa.ArrowInvalid): - ds.Scanner.from_dataset(dataset, columns=["unknown"]) + ds.Scanner.from_dataset(dataset, columns=['unknown']) - scanner = ds.Scanner.from_dataset( - dataset, columns=["i64"], memory_pool=pa.default_memory_pool() - ) + scanner = ds.Scanner.from_dataset(dataset, columns=['i64'], + memory_pool=pa.default_memory_pool()) assert isinstance(scanner, ds.Scanner) assert len(list(scanner.scan())) == 2 @@ -370,49 +358,46 @@ def test_abstract_classes(): def test_partitioning(): - schema = pa.schema([pa.field("i64", pa.int64()), - pa.field("f64", pa.float64())]) + schema = pa.schema([ + pa.field('i64', pa.int64()), + pa.field('f64', pa.float64()) + ]) for klass in [ds.DirectoryPartitioning, ds.HivePartitioning]: partitioning = klass(schema) assert isinstance(partitioning, ds.Partitioning) partitioning = ds.DirectoryPartitioning( - pa.schema([pa.field("group", pa.int64()), - pa.field("key", pa.float64())]) + pa.schema([ + pa.field('group', pa.int64()), + pa.field('key', pa.float64()) + ]) ) - expr = partitioning.parse("/3/3.14") + expr = partitioning.parse('/3/3.14') assert isinstance(expr, ds.Expression) - expected = (ds.field("group") == 3) & (ds.field("key") == 3.14) + expected = (ds.field('group') == 3) & (ds.field('key') == 3.14) assert expr.equals(expected) with pytest.raises(pa.ArrowInvalid): - partitioning.parse("/prefix/3/aaa") + partitioning.parse('/prefix/3/aaa') partitioning = ds.HivePartitioning( - pa.schema([pa.field("alpha", pa.int64()), - pa.field("beta", pa.int64())]) + pa.schema([ + pa.field('alpha', pa.int64()), + pa.field('beta', pa.int64()) + ]) + ) + expr = partitioning.parse('/alpha=0/beta=3') + expected = ( + (ds.field('alpha') == ds.scalar(0)) & + (ds.field('beta') == ds.scalar(3)) ) - expr = partitioning.parse("/alpha=0/beta=3") - expected = (ds.field("alpha") == ds.scalar(0)) & ( - ds.field("beta") == ds.scalar(3)) assert expr.equals(expected) - for shouldfail in ["/alpha=one/beta=2", "/alpha=one", "/beta=two"]: + for shouldfail in ['/alpha=one/beta=2', '/alpha=one', '/beta=two']: with pytest.raises(pa.ArrowInvalid): partitioning.parse(shouldfail) - partitioning = ds.HivePartitioning( - pa.schema([pa.field("alpha", pa.int64()), - pa.field("beta", pa.int64())]), - None, - "xyz", - ) - expr = partitioning.parse("/alpha=xyz/beta=3") - expected = (ds.field("alpha").is_null()) & ( - ds.field("beta") == ds.scalar(3)) - assert expr.equals(expected) - def test_expression_serialization(): a = ds.scalar(1) @@ -420,30 +405,14 @@ def test_expression_serialization(): c = ds.scalar(True) d = ds.scalar("string") e = ds.scalar(None) - f = ds.scalar({"a": 1}) + f = ds.scalar({'a': 1}) g = ds.scalar(pa.scalar(1)) - all_exprs = [ - a, - b, - c, - d, - e, - f, - g, - a == b, - a > b, - a & b, - a | b, - ~c, - d.is_valid(), - a.cast(pa.int32(), safe=False), - a.cast(pa.int32(), safe=False), - a.isin([1, 2, 
3]),
-        ds.field("i64") > 5,
-        ds.field("i64") == 5,
-        ds.field("i64") == 7,
-    ]
+    all_exprs = [a, b, c, d, e, f, g, a == b, a > b, a & b, a | b, ~c,
+                 d.is_valid(), a.cast(pa.int32(), safe=False),
+                 a.cast(pa.int32(), safe=False), a.isin([1, 2, 3]),
+                 ds.field('i64') > 5, ds.field('i64') == 5,
+                 ds.field('i64') == 7]
     for expr in all_exprs:
         assert isinstance(expr, ds.Expression)
         restored = pickle.loads(pickle.dumps(expr))
@@ -491,16 +460,13 @@ def test_expression_boolean_operators():


 def test_partition_keys():
-    a, b, c = [ds.field(f) == f for f in "abc"]
-    assert ds._get_partition_keys(a) == {"a": "a"}
-    assert ds._get_partition_keys(a & b & c) == {f: f for f in "abc"}
+    a, b, c = [ds.field(f) == f for f in 'abc']
+    assert ds._get_partition_keys(a) == {'a': 'a'}
+    assert ds._get_partition_keys(a & b & c) == {f: f for f in 'abc'}

-    null = ds.field("a").is_null()
-    assert ds._get_partition_keys(null) == {"a": None}
-
-    nope = ds.field("d") >= 3
+    nope = ds.field('d') >= 3
     assert ds._get_partition_keys(nope) == {}
-    assert ds._get_partition_keys(a & nope) == {"a": "a"}
+    assert ds._get_partition_keys(a & nope) == {'a': 'a'}


 def test_parquet_read_options():
@@ -542,66 +508,69 @@ def test_file_format_pickling():
     formats = [
         ds.IpcFileFormat(),
         ds.CsvFileFormat(),
-        ds.CsvFileFormat(pa.csv.ParseOptions(
-            delimiter="\t", ignore_empty_lines=True)),
+        ds.CsvFileFormat(pa.csv.ParseOptions(delimiter='\t',
+                                             ignore_empty_lines=True)),
         ds.ParquetFileFormat(),
         ds.ParquetFileFormat(
             read_options=ds.ParquetReadOptions(use_buffered_stream=True)
         ),
         ds.ParquetFileFormat(
             read_options={
-                "use_buffered_stream": True,
-                "buffer_size": 4096,
+                'use_buffered_stream': True,
+                'buffer_size': 4096,
             }
-        ),
+        )
     ]
     for file_format in formats:
         assert pickle.loads(pickle.dumps(file_format)) == file_format


-@pytest.mark.parametrize(
-    "paths_or_selector",
+@pytest.mark.parametrize('paths_or_selector', [
+    fs.FileSelector('subdir', recursive=True),
     [
         'subdir/1/xxx/file0.parquet',
         'subdir/2/yyy/file1.parquet',
     ]
 ])
 @pytest.mark.parametrize('pre_buffer', [False, True])
 def test_filesystem_factory(mockfs, paths_or_selector, pre_buffer):
     format = ds.ParquetFileFormat(
         read_options=ds.ParquetReadOptions(dictionary_columns={"str"},
                                            pre_buffer=pre_buffer)
     )
-    options = ds.FileSystemFactoryOptions("subdir")
+    options = ds.FileSystemFactoryOptions('subdir')
     options.partitioning = ds.DirectoryPartitioning(
-        pa.schema([pa.field("group", pa.int32()),
-                   pa.field("key", pa.string())])
+        pa.schema([
+            pa.field('group', pa.int32()),
+            pa.field('key', pa.string())
+        ])
     )
-    assert options.partition_base_dir == "subdir"
-    assert options.selector_ignore_prefixes == [".", "_"]
+    assert options.partition_base_dir == 'subdir'
+    assert options.selector_ignore_prefixes == ['.', '_']
     assert options.exclude_invalid_files is False

     factory = ds.FileSystemDatasetFactory(
-        mockfs, paths_or_selector, format, options)
+        mockfs, paths_or_selector, format, options
+    )
     inspected_schema = factory.inspect()

-    assert factory.inspect().equals(
-        pa.schema(
-            [
-                pa.field("i64", pa.int64()),
-                pa.field("f64", pa.float64()),
-                pa.field("str", pa.dictionary(pa.int32(), pa.string())),
-                pa.field("const", pa.int64()),
-                pa.field("group", pa.int32()),
-                pa.field("key", pa.string()),
-            ]
-        ),
-        check_metadata=False,
-    )
+    assert factory.inspect().equals(pa.schema([
+        pa.field('i64', pa.int64()),
+        pa.field('f64', pa.float64()),
+        pa.field('str', pa.dictionary(pa.int32(), pa.string())),
+        pa.field('const', pa.int64()),
+        pa.field('group', pa.int32()),
+        pa.field('key', pa.string()),
+    ]), check_metadata=False)

     assert isinstance(factory.inspect_schemas(), list)
-    assert isinstance(factory.finish(inspected_schema), ds.FileSystemDataset)
+    assert isinstance(factory.finish(inspected_schema),
+                      ds.FileSystemDataset)
     assert factory.root_partition.equals(ds.scalar(True))

     dataset = factory.finish()
@@ -613,9 +582,9 @@ def test_filesystem_factory(mockfs, paths_or_selector, pre_buffer):
     expected_f64 = pa.array([0, 1, 2, 3, 4], type=pa.float64())
     expected_str = pa.DictionaryArray.from_arrays(
         pa.array([0, 1, 2, 3, 4], type=pa.int32()),
-        pa.array("0 1 2 3 4".split(), type=pa.string()),
+        pa.array("0 1 2 3 4".split(), type=pa.string())
     )
-    for task, group, key in zip(scanner.scan(), [1, 2], ["xxx", "yyy"]):
+    for task, group, key in zip(scanner.scan(), [1, 2], ['xxx', 'yyy']):
         expected_group = pa.array([group] * 5, type=pa.int32())
         expected_key = pa.array([key] * 5, type=pa.string())
         expected_const = pa.array([group - 1] * 5, type=pa.int64())
@@ -636,16 +605,15 @@ def test_filesystem_factory(mockfs, paths_or_selector, pre_buffer):

 def test_make_fragment(multisourcefs):
     parquet_format = ds.ParquetFileFormat()
-    dataset = ds.dataset(
-        "/plain", filesystem=multisourcefs, format=parquet_format)
+    dataset = ds.dataset('/plain', filesystem=multisourcefs,
+                         format=parquet_format)

     for path in dataset.files:
         fragment = parquet_format.make_fragment(path, multisourcefs)
         assert fragment.row_groups == [0]

-        row_group_fragment = parquet_format.make_fragment(
-            path, multisourcefs, row_groups=[0]
-        )
+        row_group_fragment = parquet_format.make_fragment(path, multisourcefs,
+                                                          row_groups=[0])
         for f in [fragment, row_group_fragment]:
             assert isinstance(f, ds.ParquetFileFragment)
             assert f.path == path
@@ -654,23 +622,21 @@ def test_make_fragment(multisourcefs):

 def test_make_csv_fragment_from_buffer():
-    content = textwrap.dedent(
-        """
+    content = textwrap.dedent("""
         alpha,num,animal
         a,12,dog
         b,11,cat
         c,10,rabbit
-    """
-    )
-    buffer = pa.py_buffer(content.encode("utf-8"))
+    """)
+    buffer = pa.py_buffer(content.encode('utf-8'))
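    # Editor's note (descriptive aside, not part of the patch): the buffer
    # above holds the CSV bytes in memory, and make_fragment below is given
    # that buffer rather than a file path, so the fragment is read without
    # touching a filesystem; test_make_parquet_fragment_from_buffer exercises
    # the same pattern for Parquet.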
csv_format = ds.CsvFileFormat() fragment = csv_format.make_fragment(buffer) - expected = pa.table( - [["a", "b", "c"], [12, 11, 10], ["dog", "cat", "rabbit"]], - names=["alpha", "num", "animal"], - ) + expected = pa.table([['a', 'b', 'c'], + [12, 11, 10], + ['dog', 'cat', 'rabbit']], + names=['alpha', 'num', 'animal']) assert fragment.to_table().equals(expected) pickled = pickle.loads(pickle.dumps(fragment)) @@ -682,27 +648,29 @@ def test_make_parquet_fragment_from_buffer(): import pyarrow.parquet as pq arrays = [ - pa.array(["a", "b", "c"]), + pa.array(['a', 'b', 'c']), pa.array([12, 11, 10]), - pa.array(["dog", "cat", "rabbit"]), + pa.array(['dog', 'cat', 'rabbit']) ] dictionary_arrays = [ arrays[0].dictionary_encode(), arrays[1], - arrays[2].dictionary_encode(), + arrays[2].dictionary_encode() ] dictionary_format = ds.ParquetFileFormat( read_options=ds.ParquetReadOptions( use_buffered_stream=True, buffer_size=4096, - dictionary_columns=["alpha", "animal"], + dictionary_columns=['alpha', 'animal'] ) ) - cases = [(arrays, ds.ParquetFileFormat()), - (dictionary_arrays, dictionary_format)] + cases = [ + (arrays, ds.ParquetFileFormat()), + (dictionary_arrays, dictionary_format) + ] for arrays, format_ in cases: - table = pa.table(arrays, names=["alpha", "num", "animal"]) + table = pa.table(arrays, names=['alpha', 'num', 'animal']) out = pa.BufferOutputStream() pq.write_table(table, out) @@ -719,14 +687,15 @@ def _create_dataset_for_fragments(tempdir, chunk_size=None, filesystem=None): import pyarrow.parquet as pq table = pa.table( - [range(8), [1] * 8, ["a"] * 4 + ["b"] * 4], names=["f1", "f2", "part"] + [range(8), [1] * 8, ['a'] * 4 + ['b'] * 4], + names=['f1', 'f2', 'part'] ) path = str(tempdir / "test_parquet_dataset") # write_to_dataset currently requires pandas - pq.write_to_dataset(table, path, partition_cols=[ - "part"], chunk_size=chunk_size) + pq.write_to_dataset(table, path, + partition_cols=["part"], chunk_size=chunk_size) dataset = ds.dataset( path, format="parquet", partitioning="hive", filesystem=filesystem ) @@ -744,11 +713,11 @@ def test_fragments(tempdir): assert len(fragments) == 2 f = fragments[0] - physical_names = ["f1", "f2"] + physical_names = ['f1', 'f2'] # file's schema does not include partition column assert f.physical_schema.names == physical_names assert f.format.inspect(f.path, f.filesystem) == f.physical_schema - assert f.partition_expression.equals(ds.field("part") == "a") + assert f.partition_expression.equals(ds.field('part') == 'a') # By default, the partition column is not part of the schema. result = f.to_table() @@ -758,13 +727,13 @@ def test_fragments(tempdir): # scanning fragment includes partition columns when given the proper # schema. 
result = f.to_table(schema=dataset.schema) - assert result.column_names == ["f1", "f2", "part"] + assert result.column_names == ['f1', 'f2', 'part'] assert result.equals(table.slice(0, 4)) assert f.physical_schema == result.schema.remove(2) # scanning fragments follow filter predicate - result = f.to_table(schema=dataset.schema, filter=ds.field("f1") < 2) - assert result.column_names == ["f1", "f2", "part"] + result = f.to_table(schema=dataset.schema, filter=ds.field('f1') < 2) + assert result.column_names == ['f1', 'f2', 'part'] @pytest.mark.pandas @@ -773,11 +742,11 @@ def test_fragments_implicit_cast(tempdir): # ARROW-8693 import pyarrow.parquet as pq - table = pa.table([range(8), [1] * 4 + [2] * 4], names=["col", "part"]) + table = pa.table([range(8), [1] * 4 + [2] * 4], names=['col', 'part']) path = str(tempdir / "test_parquet_dataset") pq.write_to_dataset(table, path, partition_cols=["part"]) - part = ds.partitioning(pa.schema([("part", "int8")]), flavor="hive") + part = ds.partitioning(pa.schema([('part', 'int8')]), flavor="hive") dataset = ds.dataset(path, format="parquet", partitioning=part) fragments = dataset.get_fragments(filter=ds.field("part") >= 2) assert len(list(fragments)) == 1 @@ -788,7 +757,8 @@ def test_fragments_implicit_cast(tempdir): def test_fragments_reconstruct(tempdir): table, dataset = _create_dataset_for_fragments(tempdir) - def assert_yields_projected(fragment, row_slice, columns=None, filter=None): + def assert_yields_projected(fragment, row_slice, + columns=None, filter=None): actual = fragment.to_table( schema=table.schema, columns=columns, filter=filter) column_names = columns if columns else table.column_names @@ -806,53 +776,40 @@ def assert_yields_projected(fragment, row_slice, columns=None, filter=None): # manually re-construct a fragment, with explicit schema new_fragment = parquet_format.make_fragment( - fragment.path, - fragment.filesystem, - partition_expression=fragment.partition_expression, - ) + fragment.path, fragment.filesystem, + partition_expression=fragment.partition_expression) assert new_fragment.to_table().equals(fragment.to_table()) assert_yields_projected(new_fragment, (0, 4)) # filter / column projection, inspected schema new_fragment = parquet_format.make_fragment( - fragment.path, - fragment.filesystem, - partition_expression=fragment.partition_expression, - ) - assert_yields_projected(new_fragment, (0, 2), filter=ds.field("f1") < 2) + fragment.path, fragment.filesystem, + partition_expression=fragment.partition_expression) + assert_yields_projected(new_fragment, (0, 2), filter=ds.field('f1') < 2) # filter requiring cast / column projection, inspected schema new_fragment = parquet_format.make_fragment( - fragment.path, - fragment.filesystem, - partition_expression=fragment.partition_expression, - ) - assert_yields_projected( - new_fragment, (0, 2), columns=["f1"], filter=ds.field("f1") < 2.0 - ) + fragment.path, fragment.filesystem, + partition_expression=fragment.partition_expression) + assert_yields_projected(new_fragment, (0, 2), + columns=['f1'], filter=ds.field('f1') < 2.0) # filter on the partition column new_fragment = parquet_format.make_fragment( - fragment.path, - fragment.filesystem, - partition_expression=fragment.partition_expression, - ) + fragment.path, fragment.filesystem, + partition_expression=fragment.partition_expression) assert_yields_projected(new_fragment, (0, 4), - filter=ds.field("part") == "a") + filter=ds.field('part') == 'a') # Fragments don't contain the partition's columns if not provided to the # 
`to_table(schema=...)` method. - pattern = ( - r"No match for FieldRef.Name\(part\) in " + - fragment.physical_schema.to_string(False, False, False) - ) + pattern = (r'No match for FieldRef.Name\(part\) in ' + + fragment.physical_schema.to_string(False, False, False)) with pytest.raises(ValueError, match=pattern): new_fragment = parquet_format.make_fragment( - fragment.path, - fragment.filesystem, - partition_expression=fragment.partition_expression, - ) - new_fragment.to_table(filter=ds.field("part") == "a") + fragment.path, fragment.filesystem, + partition_expression=fragment.partition_expression) + new_fragment.to_table(filter=ds.field('part') == 'a') @pytest.mark.pandas @@ -866,21 +823,21 @@ def test_fragments_parquet_row_groups(tempdir): row_group_fragments = list(fragment.split_by_row_group()) assert len(row_group_fragments) == fragment.num_row_groups == 2 result = row_group_fragments[0].to_table(schema=dataset.schema) - assert result.column_names == ["f1", "f2", "part"] + assert result.column_names == ['f1', 'f2', 'part'] assert len(result) == 2 assert result.equals(table.slice(0, 2)) assert row_group_fragments[0].row_groups is not None assert row_group_fragments[0].num_row_groups == 1 assert row_group_fragments[0].row_groups[0].statistics == { - "f1": {"min": 0, "max": 1}, - "f2": {"min": 1, "max": 1}, + 'f1': {'min': 0, 'max': 1}, + 'f2': {'min': 1, 'max': 1}, } - fragment = list(dataset.get_fragments(filter=ds.field("f1") < 1))[0] - row_group_fragments = list(fragment.split_by_row_group(ds.field("f1") < 1)) + fragment = list(dataset.get_fragments(filter=ds.field('f1') < 1))[0] + row_group_fragments = list(fragment.split_by_row_group(ds.field('f1') < 1)) assert len(row_group_fragments) == 1 - result = row_group_fragments[0].to_table(filter=ds.field("f1") < 1) + result = row_group_fragments[0].to_table(filter=ds.field('f1') < 1) assert len(result) == 1 @@ -888,15 +845,15 @@ def test_fragments_parquet_row_groups(tempdir): def test_fragments_parquet_num_row_groups(tempdir): import pyarrow.parquet as pq - table = pa.table({"a": range(8)}) + table = pa.table({'a': range(8)}) pq.write_table(table, tempdir / "test.parquet", row_group_size=2) dataset = ds.dataset(tempdir / "test.parquet", format="parquet") original_fragment = list(dataset.get_fragments())[0] # create fragment with subset of row groups fragment = original_fragment.format.make_fragment( - original_fragment.path, original_fragment.filesystem, row_groups=[1, 3] - ) + original_fragment.path, original_fragment.filesystem, + row_groups=[1, 3]) assert fragment.num_row_groups == 2 # ensure that parsing metadata preserves correct number of row groups fragment.ensure_complete_metadata() @@ -909,16 +866,14 @@ def test_fragments_parquet_num_row_groups(tempdir): def test_fragments_parquet_row_groups_dictionary(tempdir): import pandas as pd - df = pd.DataFrame(dict(col1=["a", "b"], col2=[1, 2])) - df["col1"] = df["col1"].astype("category") + df = pd.DataFrame(dict(col1=['a', 'b'], col2=[1, 2])) + df['col1'] = df['col1'].astype("category") import pyarrow.parquet as pq - pq.write_table(pa.table(df), tempdir / "test_filter_dictionary.parquet") import pyarrow.dataset as ds - - dataset = ds.dataset(tempdir / "test_filter_dictionary.parquet") + dataset = ds.dataset(tempdir / 'test_filter_dictionary.parquet') result = dataset.to_table(filter=ds.field("col1") == "a") assert (df.iloc[0] == result.to_pandas()).all().all() @@ -929,7 +884,8 @@ def test_fragments_parquet_row_groups_dictionary(tempdir): def 
test_fragments_parquet_ensure_metadata(tempdir, open_logging_fs): fs, assert_opens = open_logging_fs _, dataset = _create_dataset_for_fragments( - tempdir, chunk_size=2, filesystem=fs) + tempdir, chunk_size=2, filesystem=fs + ) fragment = list(dataset.get_fragments())[0] # with default discovery, no metadata loaded @@ -979,38 +935,38 @@ def _create_dataset_all_types(tempdir, chunk_size=None): pa.array([1, 10, 42], pa.uint64()), pa.array([1.0, 10.0, 42.0], pa.float32()), pa.array([1.0, 10.0, 42.0], pa.float64()), - pa.array(["a", None, "z"], pa.utf8()), - pa.array(["a", None, "z"], pa.binary()), - pa.array([1, 10, 42], pa.timestamp("s")), - pa.array([1, 10, 42], pa.timestamp("ms")), - pa.array([1, 10, 42], pa.timestamp("us")), + pa.array(['a', None, 'z'], pa.utf8()), + pa.array(['a', None, 'z'], pa.binary()), + pa.array([1, 10, 42], pa.timestamp('s')), + pa.array([1, 10, 42], pa.timestamp('ms')), + pa.array([1, 10, 42], pa.timestamp('us')), pa.array([1, 10, 42], pa.date32()), pa.array([1, 10, 4200000000], pa.date64()), - pa.array([1, 10, 42], pa.time32("s")), - pa.array([1, 10, 42], pa.time64("us")), + pa.array([1, 10, 42], pa.time32('s')), + pa.array([1, 10, 42], pa.time64('us')), ], names=[ - "boolean", - "int8", - "uint8", - "int16", - "uint16", - "int32", - "uint32", - "int64", - "uint64", - "float", - "double", - "utf8", - "binary", - "ts[s]", - "ts[ms]", - "ts[us]", - "date32", - "date64", - "time32", - "time64", - ], + 'boolean', + 'int8', + 'uint8', + 'int16', + 'uint16', + 'int32', + 'uint32', + 'int64', + 'uint64', + 'float', + 'double', + 'utf8', + 'binary', + 'ts[s]', + 'ts[ms]', + 'ts[us]', + 'date32', + 'date64', + 'time32', + 'time64', + ] ) path = str(tempdir / "test_parquet_dataset_all_types") @@ -1029,16 +985,9 @@ def test_parquet_fragment_statistics(tempdir): fragment = list(dataset.get_fragments())[0] import datetime - - def dt_s(x): - return datetime.datetime(1970, 1, 1, 0, 0, x) - - def dt_ms(x): - return datetime.datetime(1970, 1, 1, 0, 0, 0, x * 1000) - - def dt_us(x): - return datetime.datetime(1970, 1, 1, 0, 0, 0, x) - + def dt_s(x): return datetime.datetime(1970, 1, 1, 0, 0, x) + def dt_ms(x): return datetime.datetime(1970, 1, 1, 0, 0, 0, x*1000) + def dt_us(x): return datetime.datetime(1970, 1, 1, 0, 0, 0, x) date = datetime.date time = datetime.time @@ -1049,26 +998,26 @@ def dt_us(x): assert row_group.num_rows == 3 assert row_group.total_byte_size > 1000 assert row_group.statistics == { - "boolean": {"min": False, "max": True}, - "int8": {"min": 1, "max": 42}, - "uint8": {"min": 1, "max": 42}, - "int16": {"min": 1, "max": 42}, - "uint16": {"min": 1, "max": 42}, - "int32": {"min": 1, "max": 42}, - "uint32": {"min": 1, "max": 42}, - "int64": {"min": 1, "max": 42}, - "uint64": {"min": 1, "max": 42}, - "float": {"min": 1.0, "max": 42.0}, - "double": {"min": 1.0, "max": 42.0}, - "utf8": {"min": "a", "max": "z"}, - "binary": {"min": b"a", "max": b"z"}, - "ts[s]": {"min": dt_s(1), "max": dt_s(42)}, - "ts[ms]": {"min": dt_ms(1), "max": dt_ms(42)}, - "ts[us]": {"min": dt_us(1), "max": dt_us(42)}, - "date32": {"min": date(1970, 1, 2), "max": date(1970, 2, 12)}, - "date64": {"min": date(1970, 1, 1), "max": date(1970, 2, 18)}, - "time32": {"min": time(0, 0, 1), "max": time(0, 0, 42)}, - "time64": {"min": time(0, 0, 0, 1), "max": time(0, 0, 0, 42)}, + 'boolean': {'min': False, 'max': True}, + 'int8': {'min': 1, 'max': 42}, + 'uint8': {'min': 1, 'max': 42}, + 'int16': {'min': 1, 'max': 42}, + 'uint16': {'min': 1, 'max': 42}, + 'int32': {'min': 1, 'max': 42}, + 'uint32': 
{'min': 1, 'max': 42}, + 'int64': {'min': 1, 'max': 42}, + 'uint64': {'min': 1, 'max': 42}, + 'float': {'min': 1.0, 'max': 42.0}, + 'double': {'min': 1.0, 'max': 42.0}, + 'utf8': {'min': 'a', 'max': 'z'}, + 'binary': {'min': b'a', 'max': b'z'}, + 'ts[s]': {'min': dt_s(1), 'max': dt_s(42)}, + 'ts[ms]': {'min': dt_ms(1), 'max': dt_ms(42)}, + 'ts[us]': {'min': dt_us(1), 'max': dt_us(42)}, + 'date32': {'min': date(1970, 1, 2), 'max': date(1970, 2, 12)}, + 'date64': {'min': date(1970, 1, 1), 'max': date(1970, 2, 18)}, + 'time32': {'min': time(0, 0, 1), 'max': time(0, 0, 42)}, + 'time64': {'min': time(0, 0, 0, 1), 'max': time(0, 0, 0, 42)}, } @@ -1076,7 +1025,7 @@ def dt_us(x): def test_parquet_fragment_statistics_nulls(tempdir): import pyarrow.parquet as pq - table = pa.table({"a": [0, 1, None, None], "b": ["a", "b", None, None]}) + table = pa.table({'a': [0, 1, None, None], 'b': ['a', 'b', None, None]}) pq.write_table(table, tempdir / "test.parquet", row_group_size=2) dataset = ds.dataset(tempdir / "test.parquet", format="parquet") @@ -1103,25 +1052,21 @@ def test_fragments_parquet_row_groups_predicate(tempdir): table, dataset = _create_dataset_for_fragments(tempdir, chunk_size=2) fragment = list(dataset.get_fragments())[0] - assert fragment.partition_expression.equals(ds.field("part") == "a") + assert fragment.partition_expression.equals(ds.field('part') == 'a') # predicate may reference a partition field not present in the # physical_schema if an explicit schema is provided to split_by_row_group # filter matches partition_expression: all row groups row_group_fragments = list( - fragment.split_by_row_group( - filter=ds.field("part") == "a", schema=dataset.schema - ) - ) + fragment.split_by_row_group(filter=ds.field('part') == 'a', + schema=dataset.schema)) assert len(row_group_fragments) == 2 # filter contradicts partition_expression: no row groups row_group_fragments = list( - fragment.split_by_row_group( - filter=ds.field("part") == "b", schema=dataset.schema - ) - ) + fragment.split_by_row_group(filter=ds.field('part') == 'b', + schema=dataset.schema)) assert len(row_group_fragments) == 0 @@ -1140,36 +1085,27 @@ def test_fragments_parquet_row_groups_reconstruct(tempdir): # manually re-construct row group fragments new_fragment = parquet_format.make_fragment( - fragment.path, - fragment.filesystem, + fragment.path, fragment.filesystem, partition_expression=fragment.partition_expression, - row_groups=[0], - ) + row_groups=[0]) result = new_fragment.to_table() assert result.equals(row_group_fragments[0].to_table()) # manually re-construct a row group fragment with filter/column projection new_fragment = parquet_format.make_fragment( - fragment.path, - fragment.filesystem, + fragment.path, fragment.filesystem, partition_expression=fragment.partition_expression, - row_groups={1}, - ) - result = new_fragment.to_table( - schema=table.schema, - columns=["f1", "part"], - filter=ds.field("f1") < 3, - ) - assert result.column_names == ["f1", "part"] + row_groups={1}) + result = new_fragment.to_table(schema=table.schema, columns=['f1', 'part'], + filter=ds.field('f1') < 3, ) + assert result.column_names == ['f1', 'part'] assert len(result) == 1 # out of bounds row group index new_fragment = parquet_format.make_fragment( - fragment.path, - fragment.filesystem, + fragment.path, fragment.filesystem, partition_expression=fragment.partition_expression, - row_groups={2}, - ) + row_groups={2}) with pytest.raises(IndexError, match="references row group 2"): new_fragment.to_table() @@ -1178,8 +1114,8 @@ def 
test_fragments_parquet_row_groups_reconstruct(tempdir): @pytest.mark.parquet def test_fragments_parquet_subset_ids(tempdir, open_logging_fs): fs, assert_opens = open_logging_fs - table, dataset = _create_dataset_for_fragments( - tempdir, chunk_size=1, filesystem=fs) + table, dataset = _create_dataset_for_fragments(tempdir, chunk_size=1, + filesystem=fs) fragment = list(dataset.get_fragments())[0] # select with row group ids @@ -1206,8 +1142,8 @@ def test_fragments_parquet_subset_ids(tempdir, open_logging_fs): @pytest.mark.parquet def test_fragments_parquet_subset_filter(tempdir, open_logging_fs): fs, assert_opens = open_logging_fs - table, dataset = _create_dataset_for_fragments( - tempdir, chunk_size=1, filesystem=fs) + table, dataset = _create_dataset_for_fragments(tempdir, chunk_size=1, + filesystem=fs) fragment = list(dataset.get_fragments())[0] # select with filter @@ -1249,43 +1185,41 @@ def test_fragments_parquet_subset_invalid(tempdir): def test_partitioning_factory(mockfs): - paths_or_selector = fs.FileSelector("subdir", recursive=True) + paths_or_selector = fs.FileSelector('subdir', recursive=True) format = ds.ParquetFileFormat() - options = ds.FileSystemFactoryOptions("subdir") - partitioning_factory = ds.DirectoryPartitioning.discover(["group", "key"]) + options = ds.FileSystemFactoryOptions('subdir') + partitioning_factory = ds.DirectoryPartitioning.discover(['group', 'key']) assert isinstance(partitioning_factory, ds.PartitioningFactory) options.partitioning_factory = partitioning_factory factory = ds.FileSystemDatasetFactory( - mockfs, paths_or_selector, format, options) + mockfs, paths_or_selector, format, options + ) inspected_schema = factory.inspect() # i64/f64 from data, group/key from "/1/xxx" and "/2/yyy" paths - expected_schema = pa.schema( - [ - ("i64", pa.int64()), - ("f64", pa.float64()), - ("str", pa.string()), - ("const", pa.int64()), - ("group", pa.int32()), - ("key", pa.string()), - ] - ) + expected_schema = pa.schema([ + ("i64", pa.int64()), + ("f64", pa.float64()), + ("str", pa.string()), + ("const", pa.int64()), + ("group", pa.int32()), + ("key", pa.string()), + ]) assert inspected_schema.equals(expected_schema) hive_partitioning_factory = ds.HivePartitioning.discover() assert isinstance(hive_partitioning_factory, ds.PartitioningFactory) -@pytest.mark.parametrize("infer_dictionary", [False, True]) +@pytest.mark.parametrize('infer_dictionary', [False, True]) def test_partitioning_factory_dictionary(mockfs, infer_dictionary): - paths_or_selector = fs.FileSelector("subdir", recursive=True) + paths_or_selector = fs.FileSelector('subdir', recursive=True) format = ds.ParquetFileFormat() - options = ds.FileSystemFactoryOptions("subdir") + options = ds.FileSystemFactoryOptions('subdir') options.partitioning_factory = ds.DirectoryPartitioning.discover( - ["group", "key"], infer_dictionary=infer_dictionary - ) + ['group', 'key'], infer_dictionary=infer_dictionary) factory = ds.FileSystemDatasetFactory( mockfs, paths_or_selector, format, options) @@ -1293,20 +1227,20 @@ def test_partitioning_factory_dictionary(mockfs, infer_dictionary): inferred_schema = factory.inspect() if infer_dictionary: expected_type = pa.dictionary(pa.int32(), pa.string()) - assert inferred_schema.field("key").type == expected_type + assert inferred_schema.field('key').type == expected_type table = factory.finish().to_table().combine_chunks() - actual = table.column("key").chunk(0) - expected = pa.array(["xxx"] * 5 + ["yyy"] * 5).dictionary_encode() + actual = table.column('key').chunk(0) + 
expected = pa.array(['xxx'] * 5 + ['yyy'] * 5).dictionary_encode() assert actual.equals(expected) # ARROW-9345 ensure filtering on the partition field works - table = factory.finish().to_table(filter=ds.field("key") == "xxx") - actual = table.column("key").chunk(0) + table = factory.finish().to_table(filter=ds.field('key') == 'xxx') + actual = table.column('key').chunk(0) expected = expected.slice(0, 5) assert actual.equals(expected) else: - assert inferred_schema.field("key").type == pa.string() + assert inferred_schema.field('key').type == pa.string() def test_partitioning_function(): @@ -1344,9 +1278,8 @@ def test_partitioning_function(): def _create_single_file(base_dir, table=None, row_group_size=None): import pyarrow.parquet as pq - if table is None: - table = pa.table({"a": range(9), "b": [0.0] * 4 + [1.0] * 5}) + table = pa.table({'a': range(9), 'b': [0.] * 4 + [1.] * 5}) path = base_dir / "test.parquet" pq.write_table(table, path, row_group_size=row_group_size) return table, path @@ -1354,11 +1287,10 @@ def _create_single_file(base_dir, table=None, row_group_size=None): def _create_directory_of_files(base_dir): import pyarrow.parquet as pq - - table1 = pa.table({"a": range(9), "b": [0.0] * 4 + [1.0] * 5}) + table1 = pa.table({'a': range(9), 'b': [0.] * 4 + [1.] * 5}) path1 = base_dir / "test1.parquet" pq.write_table(table1, path1) - table2 = pa.table({"a": range(9, 18), "b": [0.0] * 4 + [1.0] * 5}) + table2 = pa.table({'a': range(9, 18), 'b': [0.] * 4 + [1.] * 5}) path2 = base_dir / "test2.parquet" pq.write_table(table2, path2) return (table1, table2), (path1, path2) @@ -1415,9 +1347,13 @@ def test_open_dataset_list_of_files(tempdir): tables, (path1, path2) = _create_directory_of_files(tempdir) table = pa.concat_tables(tables) - datasets = [ds.dataset([path1, path2]), - ds.dataset([str(path1), str(path2)])] - datasets += [pickle.loads(pickle.dumps(d)) for d in datasets] + datasets = [ + ds.dataset([path1, path2]), + ds.dataset([str(path1), str(path2)]) + ] + datasets += [ + pickle.loads(pickle.dumps(d)) for d in datasets + ] for dataset in datasets: assert dataset.schema.equals(table.schema) @@ -1426,7 +1362,7 @@ def test_open_dataset_list_of_files(tempdir): def test_construct_from_single_file(tempdir): - directory = tempdir / "single-file" + directory = tempdir / 'single-file' directory.mkdir() table, path = _create_single_file(directory) relative_path = path.relative_to(directory) @@ -1444,7 +1380,7 @@ def test_construct_from_single_file(tempdir): def test_construct_from_single_directory(tempdir): - directory = tempdir / "single-directory" + directory = tempdir / 'single-directory' directory.mkdir() tables, paths = _create_directory_of_files(directory) @@ -1464,7 +1400,7 @@ def test_construct_from_single_directory(tempdir): def test_construct_from_list_of_files(tempdir): # instantiate from a list of files - directory = tempdir / "list-of-files" + directory = tempdir / 'list-of-files' directory.mkdir() tables, paths = _create_directory_of_files(directory) @@ -1487,19 +1423,18 @@ def test_construct_from_list_of_files(tempdir): def test_construct_from_list_of_mixed_paths_fails(mockfs): # isntantiate from a list of mixed paths files = [ - "subdir/1/xxx/file0.parquet", - "subdir/1/xxx/doesnt-exist.parquet", + 'subdir/1/xxx/file0.parquet', + 'subdir/1/xxx/doesnt-exist.parquet', ] - with pytest.raises(FileNotFoundError, match="doesnt-exist"): + with pytest.raises(FileNotFoundError, match='doesnt-exist'): ds.dataset(files, filesystem=mockfs) def 
test_construct_from_mixed_child_datasets(mockfs): # isntantiate from a list of mixed paths - a = ds.dataset( - ["subdir/1/xxx/file0.parquet", "subdir/2/yyy/file1.parquet"], filesystem=mockfs - ) - b = ds.dataset("subdir", filesystem=mockfs) + a = ds.dataset(['subdir/1/xxx/file0.parquet', + 'subdir/2/yyy/file1.parquet'], filesystem=mockfs) + b = ds.dataset('subdir', filesystem=mockfs) dataset = ds.dataset([a, b]) @@ -1512,10 +1447,8 @@ def test_construct_from_mixed_child_datasets(mockfs): assert len(dataset.children) == 2 for child in dataset.children: - assert child.files == [ - "subdir/1/xxx/file0.parquet", - "subdir/2/yyy/file1.parquet", - ] + assert child.files == ['subdir/1/xxx/file0.parquet', + 'subdir/2/yyy/file1.parquet'] def test_construct_empty_dataset(): @@ -1524,8 +1457,10 @@ def test_construct_empty_dataset(): assert table.num_rows == 0 assert table.num_columns == 0 - empty = ds.dataset([], schema=pa.schema( - [("a", pa.int64()), ("a", pa.string())])) + empty = ds.dataset([], schema=pa.schema([ + ('a', pa.int64()), + ('a', pa.string()) + ])) table = empty.to_table() assert table.num_rows == 0 assert table.num_columns == 2 @@ -1533,13 +1468,17 @@ def test_construct_empty_dataset(): def test_construct_from_invalid_sources_raise(multisourcefs): child1 = ds.FileSystemDatasetFactory( - multisourcefs, fs.FileSelector("/plain"), format=ds.ParquetFileFormat() + multisourcefs, + fs.FileSelector('/plain'), + format=ds.ParquetFileFormat() ) child2 = ds.FileSystemDatasetFactory( - multisourcefs, fs.FileSelector("/schema"), format=ds.ParquetFileFormat() + multisourcefs, + fs.FileSelector('/schema'), + format=ds.ParquetFileFormat() ) - with pytest.raises(TypeError, match="Expected.*FileSystemDatasetFactory"): + with pytest.raises(TypeError, match='Expected.*FileSystemDatasetFactory'): ds.dataset([child1, child2]) expected = ( @@ -1560,8 +1499,7 @@ def test_construct_from_invalid_sources_raise(multisourcefs): @pytest.mark.parquet def test_open_dataset_partitioned_directory(tempdir): import pyarrow.parquet as pq - - table = pa.table({"a": range(9), "b": [0.0] * 4 + [1.0] * 5}) + table = pa.table({'a': range(9), 'b': [0.] * 4 + [1.] 
* 5}) path = tempdir / "dataset" path.mkdir() @@ -1596,15 +1534,13 @@ def test_open_dataset_partitioned_directory(tempdir): dataset = ds.dataset( str(path), partitioning=ds.partitioning( - pa.schema([("part", pa.int8())]), flavor="hive"), - ) + pa.schema([("part", pa.int8())]), flavor="hive")) expected_schema = table.schema.append(pa.field("part", pa.int8())) assert dataset.schema.equals(expected_schema) result = dataset.to_table() expected = full_table.append_column( - "part", pa.array(np.repeat([0, 1, 2], 9), type=pa.int8()) - ) + "part", pa.array(np.repeat([0, 1, 2], 9), type=pa.int8())) assert result.equals(expected) @@ -1651,7 +1587,7 @@ def test_open_union_dataset(tempdir): def test_open_union_dataset_with_additional_kwargs(multisourcefs): - child = ds.dataset("/plain", filesystem=multisourcefs, format="parquet") + child = ds.dataset('/plain', filesystem=multisourcefs, format='parquet') with pytest.raises(ValueError, match="cannot pass any additional"): ds.dataset([child], format="parquet") @@ -1660,34 +1596,31 @@ def test_open_dataset_non_existing_file(): # ARROW-8213: Opening a dataset with a local incorrect path gives confusing # error message with pytest.raises(FileNotFoundError): - ds.dataset("i-am-not-existing.parquet", format="parquet") + ds.dataset('i-am-not-existing.parquet', format='parquet') - with pytest.raises(pa.ArrowInvalid, match="cannot be relative"): - ds.dataset("file:i-am-not-existing.parquet", format="parquet") + with pytest.raises(pa.ArrowInvalid, match='cannot be relative'): + ds.dataset('file:i-am-not-existing.parquet', format='parquet') @pytest.mark.parquet -@pytest.mark.parametrize("partitioning", ["directory", "hive"]) -@pytest.mark.parametrize("null_fallback", ["xyz", None]) -@pytest.mark.parametrize( - "partition_keys", - [ - (["A", "B", "C"], [1, 2, 3]), - ([1, 2, 3], ["A", "B", "C"]), - (["A", "B", "C"], ["D", "E", "F"]), - ([1, 2, 3], [4, 5, 6]), - ([1, None, 3], ["A", "B", "C"]), - ([1, 2, 3], ["A", None, "C"]), - ([None, 2, 3], [None, 2, 3]), - ], -) +@pytest.mark.parametrize('partitioning', ["directory", "hive"]) +@pytest.mark.parametrize('null_fallback', ['xyz', None]) +@pytest.mark.parametrize('partition_keys', [ + (["A", "B", "C"], [1, 2, 3]), + ([1, 2, 3], ["A", "B", "C"]), + (["A", "B", "C"], ["D", "E", "F"]), + ([1, 2, 3], [4, 5, 6]), + ([1, None, 3], ["A", "B", "C"]), + ([1, 2, 3], ["A", None, "C"]), + ([None, 2, 3], [None, 2, 3]), +]) def test_open_dataset_partitioned_dictionary_type( tempdir, partitioning, null_fallback, partition_keys ): # ARROW-9288 / ARROW-9476 import pyarrow.parquet as pq - table = pa.table({"a": range(9), "b": [0.0] * 4 + [1.0] * 5}) + table = pa.table({'a': range(9), 'b': [0.0] * 4 + [1.0] * 5}) if None in partition_keys[0] or None in partition_keys[1]: # Directory partitioning can't handle the first part being null @@ -1695,8 +1628,7 @@ def test_open_dataset_partitioned_dictionary_type( if partitioning == "directory": partitioning = ds.DirectoryPartitioning.discover( - ["part1", "part2"], infer_dictionary=True - ) + ["part1", "part2"], infer_dictionary=True) fmt = "{0}/{1}" null_value = None else: @@ -1728,27 +1660,27 @@ def test_open_dataset_partitioned_dictionary_type( def dict_type(key): value_type = pa.string() if isinstance(key, str) else pa.int32() return pa.dictionary(pa.int32(), value_type) - expected_schema = table.schema.append( pa.field("part1", dict_type(part_keys1[0])) - ).append(pa.field("part2", dict_type(part_keys2[0]))) + ).append( + pa.field("part2", dict_type(part_keys2[0])) + ) assert 
dataset.schema.equals(expected_schema) @pytest.mark.pandas def test_dataset_partitioned_dictionary_type_reconstruct(tempdir): # https://issues.apache.org/jira/browse/ARROW-11400 - table = pa.table({"part": np.repeat(["A", "B"], 5), "col": range(10)}) - part = ds.partitioning(table.select(["part"]).schema, flavor="hive") + table = pa.table({'part': np.repeat(['A', 'B'], 5), 'col': range(10)}) + part = ds.partitioning(table.select(['part']).schema, flavor="hive") ds.write_dataset(table, tempdir, partitioning=part, format="feather") dataset = ds.dataset( - tempdir, - format="feather", - partitioning=ds.HivePartitioning.discover(infer_dictionary=True), + tempdir, format="feather", + partitioning=ds.HivePartitioning.discover(infer_dictionary=True) ) expected = pa.table( - {"col": table["col"], "part": table["part"].dictionary_encode()} + {'col': table['col'], 'part': table['part'].dictionary_encode()} ) assert dataset.to_table().equals(expected) fragment = list(dataset.get_fragments())[0] @@ -1761,10 +1693,8 @@ def test_dataset_partitioned_dictionary_type_reconstruct(tempdir): restored = pickle.loads(pickle.dumps(fragment)) assert restored.to_table(schema=dataset.schema).equals(expected[:5]) # to_pandas call triggers computation of the actual dictionary values - assert ( - restored.to_table(schema=dataset.schema) - .to_pandas() - .equals(expected[:5].to_pandas()) + assert restored.to_table(schema=dataset.schema).to_pandas().equals( + expected[:5].to_pandas() ) assert restored.partition_expression.equals(part_expr) @@ -1775,14 +1705,15 @@ def s3_example_simple(s3_connection, s3_server): import pyarrow.parquet as pq host, port, access_key, secret_key = s3_connection - uri = "s3://{}:{}@mybucket/data.parquet?scheme=http&endpoint_override={}:{}".format( - access_key, secret_key, host, port + uri = ( + "s3://{}:{}@mybucket/data.parquet?scheme=http&endpoint_override={}:{}" + .format(access_key, secret_key, host, port) ) fs, path = FileSystem.from_uri(uri) fs.create_dir("mybucket") - table = pa.table({"a": [1, 2, 3]}) + table = pa.table({'a': [1, 2, 3]}) with fs.open_output_stream("mybucket/data.parquet") as out: pq.write_table(table, out) @@ -1815,7 +1746,9 @@ def test_open_dataset_from_uri_s3_fsspec(s3_example_simple): fs = s3fs.S3FileSystem( key=access_key, secret=secret_key, - client_kwargs={"endpoint_url": "http://{}:{}".format(host, port)}, + client_kwargs={ + 'endpoint_url': 'http://{}:{}'.format(host, port) + } ) # passing as fsspec filesystem @@ -1835,18 +1768,18 @@ def test_open_dataset_from_s3_with_filesystem_uri(s3_connection, s3_server): import pyarrow.parquet as pq host, port, access_key, secret_key = s3_connection - bucket = "theirbucket" - path = "nested/folder/data.parquet" + bucket = 'theirbucket' + path = 'nested/folder/data.parquet' uri = "s3://{}:{}@{}/{}?scheme=http&endpoint_override={}:{}".format( access_key, secret_key, bucket, path, host, port ) fs, path = FileSystem.from_uri(uri) - assert path == "theirbucket/nested/folder/data.parquet" + assert path == 'theirbucket/nested/folder/data.parquet' fs.create_dir(bucket) - table = pa.table({"a": [1, 2, 3]}) + table = pa.table({'a': [1, 2, 3]}) with fs.open_output_stream(path) as out: pq.write_table(table, out) @@ -1855,25 +1788,27 @@ def test_open_dataset_from_s3_with_filesystem_uri(s3_connection, s3_server): assert dataset.to_table().equals(table) # passing filesystem as an uri - template = "s3://{}:{}@{{}}?scheme=http&endpoint_override={}:{}".format( - access_key, secret_key, host, port + template = ( + 
"s3://{}:{}@{{}}?scheme=http&endpoint_override={}:{}".format( + access_key, secret_key, host, port + ) ) cases = [ - ("theirbucket/nested/folder/", "/data.parquet"), - ("theirbucket/nested/folder", "data.parquet"), - ("theirbucket/nested/", "folder/data.parquet"), - ("theirbucket/nested", "folder/data.parquet"), - ("theirbucket", "/nested/folder/data.parquet"), - ("theirbucket", "nested/folder/data.parquet"), + ('theirbucket/nested/folder/', '/data.parquet'), + ('theirbucket/nested/folder', 'data.parquet'), + ('theirbucket/nested/', 'folder/data.parquet'), + ('theirbucket/nested', 'folder/data.parquet'), + ('theirbucket', '/nested/folder/data.parquet'), + ('theirbucket', 'nested/folder/data.parquet'), ] for prefix, path in cases: uri = template.format(prefix) dataset = ds.dataset(path, filesystem=uri, format="parquet") assert dataset.to_table().equals(table) - with pytest.raises(pa.ArrowInvalid, match="Missing bucket name"): - uri = template.format("/") - ds.dataset("/theirbucket/nested/folder/data.parquet", filesystem=uri) + with pytest.raises(pa.ArrowInvalid, match='Missing bucket name'): + uri = template.format('/') + ds.dataset('/theirbucket/nested/folder/data.parquet', filesystem=uri) error = ( "The path component of the filesystem URI must point to a directory " @@ -1881,17 +1816,17 @@ def test_open_dataset_from_s3_with_filesystem_uri(s3_connection, s3_server): "filesystem URI is `{}`" ) - path = "theirbucket/doesnt/exist" + path = 'theirbucket/doesnt/exist' uri = template.format(path) with pytest.raises(ValueError) as exc: - ds.dataset("data.parquet", filesystem=uri) - assert str(exc.value) == error.format("NotFound", path, uri) + ds.dataset('data.parquet', filesystem=uri) + assert str(exc.value) == error.format('NotFound', path, uri) - path = "theirbucket/nested/folder/data.parquet" + path = 'theirbucket/nested/folder/data.parquet' uri = template.format(path) with pytest.raises(ValueError) as exc: - ds.dataset("data.parquet", filesystem=uri) - assert str(exc.value) == error.format("File", path, uri) + ds.dataset('data.parquet', filesystem=uri) + assert str(exc.value) == error.format('File', path, uri) @pytest.mark.parquet @@ -1936,17 +1871,18 @@ def test_filter_timestamp(tempdir): @pytest.mark.parquet def test_filter_implicit_cast(tempdir): # ARROW-7652 - table = pa.table({"a": pa.array([0, 1, 2, 3, 4, 5], type=pa.int8())}) + table = pa.table({'a': pa.array([0, 1, 2, 3, 4, 5], type=pa.int8())}) _, path = _create_single_file(tempdir, table) dataset = ds.dataset(str(path)) - filter_ = ds.field("a") > 2 + filter_ = ds.field('a') > 2 assert len(dataset.to_table(filter=filter_)) == 3 def test_dataset_union(multisourcefs): child = ds.FileSystemDatasetFactory( - multisourcefs, fs.FileSelector("/plain"), format=ds.ParquetFileFormat() + multisourcefs, fs.FileSelector('/plain'), + format=ds.ParquetFileFormat() ) factory = ds.UnionDatasetFactory([child]) @@ -1959,128 +1895,106 @@ def test_dataset_union(multisourcefs): def test_union_dataset_from_other_datasets(tempdir, multisourcefs): - child1 = ds.dataset("/plain", filesystem=multisourcefs, format="parquet") - child2 = ds.dataset( - "/schema", - filesystem=multisourcefs, - format="parquet", - partitioning=["week", "color"], - ) - child3 = ds.dataset( - "/hive", filesystem=multisourcefs, format="parquet", partitioning="hive" - ) + child1 = ds.dataset('/plain', filesystem=multisourcefs, format='parquet') + child2 = ds.dataset('/schema', filesystem=multisourcefs, format='parquet', + partitioning=['week', 'color']) + child3 = ds.dataset('/hive', 
filesystem=multisourcefs, format='parquet', + partitioning='hive') assert child1.schema != child2.schema != child3.schema assembled = ds.dataset([child1, child2, child3]) assert isinstance(assembled, ds.UnionDataset) - msg = "cannot pass any additional arguments" + msg = 'cannot pass any additional arguments' with pytest.raises(ValueError, match=msg): ds.dataset([child1, child2], filesystem=multisourcefs) - expected_schema = pa.schema( - [ - ("date", pa.date32()), - ("index", pa.int64()), - ("value", pa.float64()), - ("color", pa.string()), - ("week", pa.int32()), - ("year", pa.int32()), - ("month", pa.int32()), - ] - ) + expected_schema = pa.schema([ + ('date', pa.date32()), + ('index', pa.int64()), + ('value', pa.float64()), + ('color', pa.string()), + ('week', pa.int32()), + ('year', pa.int32()), + ('month', pa.int32()), + ]) assert assembled.schema.equals(expected_schema) assert assembled.to_table().schema.equals(expected_schema) assembled = ds.dataset([child1, child3]) - expected_schema = pa.schema( - [ - ("date", pa.date32()), - ("index", pa.int64()), - ("value", pa.float64()), - ("color", pa.string()), - ("year", pa.int32()), - ("month", pa.int32()), - ] - ) + expected_schema = pa.schema([ + ('date', pa.date32()), + ('index', pa.int64()), + ('value', pa.float64()), + ('color', pa.string()), + ('year', pa.int32()), + ('month', pa.int32()), + ]) assert assembled.schema.equals(expected_schema) assert assembled.to_table().schema.equals(expected_schema) - expected_schema = pa.schema( - [ - ("month", pa.int32()), - ("color", pa.string()), - ("date", pa.date32()), - ] - ) + expected_schema = pa.schema([ + ('month', pa.int32()), + ('color', pa.string()), + ('date', pa.date32()), + ]) assembled = ds.dataset([child1, child3], schema=expected_schema) assert assembled.to_table().schema.equals(expected_schema) - expected_schema = pa.schema( - [ - ("month", pa.int32()), - ("color", pa.string()), - ("unknown", pa.string()), # fill with nulls - ] - ) + expected_schema = pa.schema([ + ('month', pa.int32()), + ('color', pa.string()), + ('unknown', pa.string()) # fill with nulls + ]) assembled = ds.dataset([child1, child3], schema=expected_schema) assert assembled.to_table().schema.equals(expected_schema) # incompatible schemas, date and index columns have conflicting types - table = pa.table( - [range(9), [0.0] * 4 + [1.0] * 5, "abcdefghj"], names=["date", "value", "index"] - ) + table = pa.table([range(9), [0.] * 4 + [1.] 
* 5, 'abcdefghj'], + names=['date', 'value', 'index']) _, path = _create_single_file(tempdir, table=table) child4 = ds.dataset(path) - with pytest.raises(pa.ArrowInvalid, match="Unable to merge"): + with pytest.raises(pa.ArrowInvalid, match='Unable to merge'): ds.dataset([child1, child4]) def test_dataset_from_a_list_of_local_directories_raises(multisourcefs): - msg = "points to a directory, but only file paths are supported" + msg = 'points to a directory, but only file paths are supported' with pytest.raises(IsADirectoryError, match=msg): - ds.dataset(["/plain", "/schema", "/hive"], filesystem=multisourcefs) + ds.dataset(['/plain', '/schema', '/hive'], filesystem=multisourcefs) def test_union_dataset_filesystem_datasets(multisourcefs): # without partitioning - dataset = ds.dataset( - [ - ds.dataset("/plain", filesystem=multisourcefs), - ds.dataset("/schema", filesystem=multisourcefs), - ds.dataset("/hive", filesystem=multisourcefs), - ] - ) - expected_schema = pa.schema( - [ - ("date", pa.date32()), - ("index", pa.int64()), - ("value", pa.float64()), - ("color", pa.string()), - ] - ) + dataset = ds.dataset([ + ds.dataset('/plain', filesystem=multisourcefs), + ds.dataset('/schema', filesystem=multisourcefs), + ds.dataset('/hive', filesystem=multisourcefs), + ]) + expected_schema = pa.schema([ + ('date', pa.date32()), + ('index', pa.int64()), + ('value', pa.float64()), + ('color', pa.string()), + ]) assert dataset.schema.equals(expected_schema) # with hive partitioning for two hive sources - dataset = ds.dataset( - [ - ds.dataset("/plain", filesystem=multisourcefs), - ds.dataset("/schema", filesystem=multisourcefs), - ds.dataset("/hive", filesystem=multisourcefs, partitioning="hive"), - ] - ) - expected_schema = pa.schema( - [ - ("date", pa.date32()), - ("index", pa.int64()), - ("value", pa.float64()), - ("color", pa.string()), - ("year", pa.int32()), - ("month", pa.int32()), - ] - ) + dataset = ds.dataset([ + ds.dataset('/plain', filesystem=multisourcefs), + ds.dataset('/schema', filesystem=multisourcefs), + ds.dataset('/hive', filesystem=multisourcefs, partitioning='hive') + ]) + expected_schema = pa.schema([ + ('date', pa.date32()), + ('index', pa.int64()), + ('value', pa.float64()), + ('color', pa.string()), + ('year', pa.int32()), + ('month', pa.int32()), + ]) assert dataset.schema.equals(expected_schema) @@ -2088,7 +2002,7 @@ def test_union_dataset_filesystem_datasets(multisourcefs): def test_specified_schema(tempdir): import pyarrow.parquet as pq - table = pa.table({"a": [1, 2, 3], "b": [0.1, 0.2, 0.3]}) + table = pa.table({'a': [1, 2, 3], 'b': [.1, .2, .3]}) pq.write_table(table, tempdir / "data.parquet") def _check_dataset(schema, expected, expected_schema=None): @@ -2111,24 +2025,24 @@ def _check_dataset(schema, expected, expected_schema=None): _check_dataset(schema, expected) # Specifying schema with change column order - schema = pa.schema([("b", "float64"), ("a", "int64")]) - expected = pa.table([[0.1, 0.2, 0.3], [1, 2, 3]], names=["b", "a"]) + schema = pa.schema([('b', 'float64'), ('a', 'int64')]) + expected = pa.table([[.1, .2, .3], [1, 2, 3]], names=['b', 'a']) _check_dataset(schema, expected) # Specifying schema with missing column - schema = pa.schema([("a", "int64")]) - expected = pa.table([[1, 2, 3]], names=["a"]) + schema = pa.schema([('a', 'int64')]) + expected = pa.table([[1, 2, 3]], names=['a']) _check_dataset(schema, expected) # Specifying schema with additional column - schema = pa.schema([("a", "int64"), ("c", "int32")]) - expected = pa.table( - [[1, 2, 3], 
pa.array([None, None, None], type="int32")], names=["a", "c"] - ) + schema = pa.schema([('a', 'int64'), ('c', 'int32')]) + expected = pa.table([[1, 2, 3], + pa.array([None, None, None], type='int32')], + names=['a', 'c']) _check_dataset(schema, expected) # Specifying with incompatible schema - schema = pa.schema([("a", "int32"), ("b", "float64")]) + schema = pa.schema([('a', 'int32'), ('b', 'float64')]) dataset = ds.dataset(str(tempdir / "data.parquet"), schema=schema) assert dataset.schema.equals(schema) with pytest.raises(TypeError): @@ -2136,14 +2050,10 @@ def _check_dataset(schema, expected, expected_schema=None): def test_ipc_format(tempdir): - table = pa.table( - { - "a": pa.array([1, 2, 3], type="int8"), - "b": pa.array([0.1, 0.2, 0.3], type="float64"), - } - ) + table = pa.table({'a': pa.array([1, 2, 3], type="int8"), + 'b': pa.array([.1, .2, .3], type="float64")}) - path = str(tempdir / "test.arrow") + path = str(tempdir / 'test.arrow') with pa.output_stream(path) as sink: writer = pa.RecordBatchFileWriter(sink, table.schema) writer.write_batch(table.to_batches()[0]) @@ -2161,21 +2071,17 @@ def test_ipc_format(tempdir): @pytest.mark.pandas def test_csv_format(tempdir): - table = pa.table( - { - "a": pa.array([1, 2, 3], type="int64"), - "b": pa.array([0.1, 0.2, 0.3], type="float64"), - } - ) + table = pa.table({'a': pa.array([1, 2, 3], type="int64"), + 'b': pa.array([.1, .2, .3], type="float64")}) - path = str(tempdir / "test.csv") + path = str(tempdir / 'test.csv') table.to_pandas().to_csv(path, index=False) dataset = ds.dataset(path, format=ds.CsvFileFormat()) result = dataset.to_table() assert result.equals(table) - dataset = ds.dataset(path, format="csv") + dataset = ds.dataset(path, format='csv') result = dataset.to_table() assert result.equals(table) @@ -2183,12 +2089,8 @@ def test_csv_format(tempdir): def test_feather_format(tempdir): from pyarrow.feather import write_feather - table = pa.table( - { - "a": pa.array([1, 2, 3], type="int8"), - "b": pa.array([0.1, 0.2, 0.3], type="float64"), - } - ) + table = pa.table({'a': pa.array([1, 2, 3], type="int8"), + 'b': pa.array([.1, .2, .3], type="float64")}) basedir = tempdir / "feather_dataset" basedir.mkdir() @@ -2220,15 +2122,16 @@ def _create_parquet_dataset_simple(root_path): metadata_collector = [] for i in range(4): - table = pa.table({"f1": [i] * 10, "f2": np.random.randn(10)}) + table = pa.table({'f1': [i] * 10, 'f2': np.random.randn(10)}) pq.write_to_dataset( table, str(root_path), metadata_collector=metadata_collector ) - metadata_path = str(root_path / "_metadata") + metadata_path = str(root_path / '_metadata') # write _metadata file pq.write_metadata( - table.schema, metadata_path, metadata_collector=metadata_collector + table.schema, metadata_path, + metadata_collector=metadata_collector ) return metadata_path, table @@ -2273,24 +2176,22 @@ def _create_metadata_file(root_path): metadata_collector.append(metadata) metadata_path = root_path / "_metadata" - pq.write_metadata(schema, metadata_path, - metadata_collector=metadata_collector) + pq.write_metadata( + schema, metadata_path, metadata_collector=metadata_collector + ) return metadata_path def _create_parquet_dataset_partitioned(root_path): import pyarrow.parquet as pq - table = pa.table( - [ - pa.array(range(20)), - pa.array(np.random.randn(20)), - pa.array(np.repeat(["a", "b"], 10)), - ], - names=["f1", "f2", "part"], + table = pa.table([ + pa.array(range(20)), pa.array(np.random.randn(20)), + pa.array(np.repeat(['a', 'b'], 10))], + names=["f1", "f2", "part"] ) 
table = table.replace_schema_metadata({"key": "value"}) - pq.write_to_dataset(table, str(root_path), partition_cols=["part"]) + pq.write_to_dataset(table, str(root_path), partition_cols=['part']) return _create_metadata_file(root_path), table @@ -2343,8 +2244,9 @@ def test_parquet_dataset_lazy_filtering(tempdir, open_logging_fs): # creating the dataset should only open the metadata file with assert_opens([metadata_path]): dataset = ds.parquet_dataset( - metadata_path, partitioning=ds.partitioning(flavor="hive"), filesystem=fs - ) + metadata_path, + partitioning=ds.partitioning(flavor="hive"), + filesystem=fs) # materializing fragments should not open any file with assert_opens([]): @@ -2373,7 +2275,7 @@ def test_parquet_dataset_lazy_filtering(tempdir, open_logging_fs): @pytest.mark.pandas def test_dataset_schema_metadata(tempdir): # ARROW-8802 - df = pd.DataFrame({"a": [1, 2, 3]}) + df = pd.DataFrame({'a': [1, 2, 3]}) path = tempdir / "test.parquet" df.to_parquet(path) dataset = ds.dataset(path) @@ -2392,13 +2294,13 @@ def test_filter_mismatching_schema(tempdir): # ARROW-9146 import pyarrow.parquet as pq - table = pa.table({"col": pa.array([1, 2, 3, 4], type="int32")}) + table = pa.table({"col": pa.array([1, 2, 3, 4], type='int32')}) pq.write_table(table, str(tempdir / "data.parquet")) # specifying explicit schema, but that mismatches the schema of the data schema = pa.schema([("col", pa.int64())]) - dataset = ds.dataset(tempdir / "data.parquet", - format="parquet", schema=schema) + dataset = ds.dataset( + tempdir / "data.parquet", format="parquet", schema=schema) # filtering on a column with such type mismatch should give a proper error with pytest.raises(TypeError): @@ -2415,72 +2317,65 @@ def test_dataset_project_only_partition_columns(tempdir): # ARROW-8729 import pyarrow.parquet as pq - table = pa.table({"part": "a a b b".split(), "col": list(range(4))}) + table = pa.table({'part': 'a a b b'.split(), 'col': list(range(4))}) - path = str(tempdir / "test_dataset") - pq.write_to_dataset(table, path, partition_cols=["part"]) - dataset = ds.dataset(path, partitioning="hive") + path = str(tempdir / 'test_dataset') + pq.write_to_dataset(table, path, partition_cols=['part']) + dataset = ds.dataset(path, partitioning='hive') all_cols = dataset.to_table(use_threads=False) - part_only = dataset.to_table(columns=["part"], use_threads=False) + part_only = dataset.to_table(columns=['part'], use_threads=False) - assert all_cols.column("part").equals(part_only.column("part")) + assert all_cols.column('part').equals(part_only.column('part')) @pytest.mark.parquet @pytest.mark.pandas def test_dataset_project_null_column(tempdir): import pandas as pd - - df = pd.DataFrame({"col": np.array([None, None, None], dtype="object")}) + df = pd.DataFrame({"col": np.array([None, None, None], dtype='object')}) f = tempdir / "test_dataset_project_null_column.parquet" df.to_parquet(f, engine="pyarrow") dataset = ds.dataset(f, format="parquet", schema=pa.schema([("col", pa.int64())])) - expected = pa.table({"col": pa.array([None, None, None], pa.int64())}) + expected = pa.table({'col': pa.array([None, None, None], pa.int64())}) assert dataset.to_table().equals(expected) -def _check_dataset_roundtrip( - dataset, base_dir, expected_files, base_dir_path=None, partitioning=None -): +def _check_dataset_roundtrip(dataset, base_dir, expected_files, + base_dir_path=None, partitioning=None): base_dir_path = base_dir_path or base_dir - ds.write_dataset( - dataset, - base_dir, - format="feather", - partitioning=partitioning, - 
use_threads=False, - ) + ds.write_dataset(dataset, base_dir, format="feather", + partitioning=partitioning, use_threads=False) # check that all files are present file_paths = list(base_dir_path.rglob("*")) assert set(file_paths) == set(expected_files) # check that reading back in as dataset gives the same result - dataset2 = ds.dataset(base_dir_path, format="feather", - partitioning=partitioning) + dataset2 = ds.dataset( + base_dir_path, format="feather", partitioning=partitioning) assert dataset2.to_table().equals(dataset.to_table()) @pytest.mark.parquet def test_write_dataset(tempdir): # manually create a written dataset and read as dataset object - directory = tempdir / "single-file" + directory = tempdir / 'single-file' directory.mkdir() _ = _create_single_file(directory) dataset = ds.dataset(directory) # full string path - target = tempdir / "single-file-target" + target = tempdir / 'single-file-target' expected_files = [target / "part-0.feather"] _check_dataset_roundtrip(dataset, str(target), expected_files, target) # pathlib path object - target = tempdir / "single-file-target2" + target = tempdir / 'single-file-target2' expected_files = [target / "part-0.feather"] _check_dataset_roundtrip(dataset, target, expected_files, target) @@ -2492,12 +2387,12 @@ def test_write_dataset(tempdir): # dataset, './single-file-target3', expected_files, target) # Directory of files - directory = tempdir / "single-directory" + directory = tempdir / 'single-directory' directory.mkdir() _ = _create_directory_of_files(directory) dataset = ds.dataset(directory) - target = tempdir / "single-directory-target" + target = tempdir / 'single-directory-target' expected_files = [target / "part-0.feather"] _check_dataset_roundtrip(dataset, str(target), expected_files, target) @@ -2511,32 +2406,28 @@ def test_write_dataset_partitioned(tempdir): dataset = ds.dataset(directory, partitioning=partitioning) # hive partitioning - target = tempdir / "partitioned-hive-target" + target = tempdir / 'partitioned-hive-target' expected_paths = [ - target / "part=a", - target / "part=a" / "part-0.feather", - target / "part=b", - target / "part=b" / "part-1.feather", + target / "part=a", target / "part=a" / "part-0.feather", + target / "part=b", target / "part=b" / "part-1.feather" ] partitioning_schema = ds.partitioning( - pa.schema([("part", pa.string())]), flavor="hive" - ) + pa.schema([("part", pa.string())]), flavor="hive") _check_dataset_roundtrip( - dataset, str(target), expected_paths, target, partitioning=partitioning_schema - ) + dataset, str(target), expected_paths, target, + partitioning=partitioning_schema) # directory partitioning - target = tempdir / "partitioned-dir-target" + target = tempdir / 'partitioned-dir-target' expected_paths = [ - target / "a", - target / "a" / "part-0.feather", - target / "b", - target / "b" / "part-1.feather", + target / "a", target / "a" / "part-0.feather", + target / "b", target / "b" / "part-1.feather" ] - partitioning_schema = ds.partitioning(pa.schema([("part", pa.string())])) + partitioning_schema = ds.partitioning( + pa.schema([("part", pa.string())])) _check_dataset_roundtrip( - dataset, str(target), expected_paths, target, partitioning=partitioning_schema - ) + dataset, str(target), expected_paths, target, + partitioning=partitioning_schema) @pytest.mark.parquet @@ -2547,26 +2438,22 @@ def test_write_dataset_partitioned_dict(tempdir): # directory partitioning, dictionary partition columns dataset = ds.dataset( - directory, partitioning=ds.HivePartitioning.discover( - 
infer_dictionary=True) - ) - target = tempdir / "partitioned-dir-target" + directory, + partitioning=ds.HivePartitioning.discover(infer_dictionary=True)) + target = tempdir / 'partitioned-dir-target' expected_paths = [ - target / "a", - target / "a" / "part-0.feather", - target / "b", - target / "b" / "part-1.feather", + target / "a", target / "a" / "part-0.feather", + target / "b", target / "b" / "part-1.feather" ] - partitioning = ds.partitioning( - pa.schema([dataset.schema.field("part")]), - dictionaries={"part": pa.array(["a", "b"])}, - ) + partitioning = ds.partitioning(pa.schema([ + dataset.schema.field('part')]), + dictionaries={'part': pa.array(['a', 'b'])}) # NB: dictionaries required here since we use partitioning to parse # directories in _check_dataset_roundtrip (not currently required for # the formatting step) _check_dataset_roundtrip( - dataset, str(target), expected_paths, target, partitioning=partitioning - ) + dataset, str(target), expected_paths, target, + partitioning=partitioning) @pytest.mark.parquet @@ -2579,13 +2466,15 @@ def test_write_dataset_use_threads(tempdir): partitioning = ds.partitioning( pa.schema([("part", pa.string())]), flavor="hive") - target1 = tempdir / "partitioned1" + target1 = tempdir / 'partitioned1' ds.write_dataset( - dataset, target1, format="feather", partitioning=partitioning, use_threads=True + dataset, target1, format="feather", partitioning=partitioning, + use_threads=True ) - target2 = tempdir / "partitioned2" + target2 = tempdir / 'partitioned2' ds.write_dataset( - dataset, target2, format="feather", partitioning=partitioning, use_threads=False + dataset, target2, format="feather", partitioning=partitioning, + use_threads=False ) # check that reading in gives same result @@ -2595,19 +2484,14 @@ def test_write_dataset_use_threads(tempdir): def test_write_table(tempdir): - table = pa.table( - [ - pa.array(range(20)), - pa.array(np.random.randn(20)), - pa.array(np.repeat(["a", "b"], 10)), - ], - names=["f1", "f2", "part"], - ) - - base_dir = tempdir / "single" - ds.write_dataset( - table, base_dir, basename_template="dat_{i}.arrow", format="feather" - ) + table = pa.table([ + pa.array(range(20)), pa.array(np.random.randn(20)), + pa.array(np.repeat(['a', 'b'], 10)) + ], names=["f1", "f2", "part"]) + + base_dir = tempdir / 'single' + ds.write_dataset(table, base_dir, + basename_template='dat_{i}.arrow', format="feather") # check that all files are present file_paths = list(base_dir.rglob("*")) expected_paths = [base_dir / "dat_0.arrow"] @@ -2617,22 +2501,16 @@ def test_write_table(tempdir): assert result.equals(table) # with partitioning - base_dir = tempdir / "partitioned" + base_dir = tempdir / 'partitioned' partitioning = ds.partitioning( pa.schema([("part", pa.string())]), flavor="hive") - ds.write_dataset( - table, - base_dir, - format="feather", - basename_template="dat_{i}.arrow", - partitioning=partitioning, - ) + ds.write_dataset(table, base_dir, format="feather", + basename_template='dat_{i}.arrow', + partitioning=partitioning) file_paths = list(base_dir.rglob("*")) expected_paths = [ - base_dir / "part=a", - base_dir / "part=a" / "dat_0.arrow", - base_dir / "part=b", - base_dir / "part=b" / "dat_1.arrow", + base_dir / "part=a", base_dir / "part=a" / "dat_0.arrow", + base_dir / "part=b", base_dir / "part=b" / "dat_1.arrow" ] assert set(file_paths) == set(expected_paths) result = ds.dataset(base_dir, format="ipc", partitioning=partitioning) @@ -2640,66 +2518,59 @@ def test_write_table(tempdir): def 
test_write_table_multiple_fragments(tempdir): - table = pa.table( - [ - pa.array(range(10)), - pa.array(np.random.randn(10)), - pa.array(np.repeat(["a", "b"], 5)), - ], - names=["f1", "f2", "part"], - ) - table = pa.concat_tables([table] * 2) + table = pa.table([ + pa.array(range(10)), pa.array(np.random.randn(10)), + pa.array(np.repeat(['a', 'b'], 5)) + ], names=["f1", "f2", "part"]) + table = pa.concat_tables([table]*2) # Table with multiple batches written as single Fragment by default - base_dir = tempdir / "single" + base_dir = tempdir / 'single' ds.write_dataset(table, base_dir, format="feather") assert set(base_dir.rglob("*")) == set([base_dir / "part-0.feather"]) assert ds.dataset(base_dir, format="ipc").to_table().equals(table) # Same for single-element list of Table - base_dir = tempdir / "single-list" + base_dir = tempdir / 'single-list' ds.write_dataset([table], base_dir, format="feather") assert set(base_dir.rglob("*")) == set([base_dir / "part-0.feather"]) assert ds.dataset(base_dir, format="ipc").to_table().equals(table) # Provide list of batches to write multiple fragments - base_dir = tempdir / "multiple" + base_dir = tempdir / 'multiple' ds.write_dataset(table.to_batches(), base_dir, format="feather") - assert set(base_dir.rglob("*")) == set([base_dir / "part-0.feather"]) + assert set(base_dir.rglob("*")) == set( + [base_dir / "part-0.feather"]) assert ds.dataset(base_dir, format="ipc").to_table().equals(table) # Provide list of tables to write multiple fragments - base_dir = tempdir / "multiple-table" + base_dir = tempdir / 'multiple-table' ds.write_dataset([table, table], base_dir, format="feather") - assert set(base_dir.rglob("*")) == set([base_dir / "part-0.feather"]) - assert ( - ds.dataset(base_dir, format="ipc") - .to_table() - .equals(pa.concat_tables([table] * 2)) + assert set(base_dir.rglob("*")) == set( + [base_dir / "part-0.feather"]) + assert ds.dataset(base_dir, format="ipc").to_table().equals( + pa.concat_tables([table]*2) ) def test_write_table_partitioned_dict(tempdir): # ensure writing table partitioned on a dictionary column works without # specifying the dictionary values explicitly - table = pa.table( - [ - pa.array(range(20)), - pa.array(np.repeat(["a", "b"], 10)).dictionary_encode(), - ], - names=["col", "part"], - ) + table = pa.table([ + pa.array(range(20)), + pa.array(np.repeat(['a', 'b'], 10)).dictionary_encode(), + ], names=['col', 'part']) partitioning = ds.partitioning(table.select(["part"]).schema) base_dir = tempdir / "dataset" - ds.write_dataset(table, base_dir, format="feather", - partitioning=partitioning) + ds.write_dataset( + table, base_dir, format="feather", partitioning=partitioning + ) # check roundtrip partitioning_read = ds.DirectoryPartitioning.discover( - ["part"], infer_dictionary=True - ) + ["part"], infer_dictionary=True) result = ds.dataset( base_dir, format="ipc", partitioning=partitioning_read ).to_table() @@ -2710,18 +2581,14 @@ def test_write_table_partitioned_dict(tempdir): def test_write_dataset_parquet(tempdir): import pyarrow.parquet as pq - table = pa.table( - [ - pa.array(range(20)), - pa.array(np.random.randn(20)), - pa.array(np.repeat(["a", "b"], 10)), - ], - names=["f1", "f2", "part"], - ) + table = pa.table([ + pa.array(range(20)), pa.array(np.random.randn(20)), + pa.array(np.repeat(['a', 'b'], 10)) + ], names=["f1", "f2", "part"]) # using default "parquet" format string - base_dir = tempdir / "parquet_dataset" + base_dir = tempdir / 'parquet_dataset' ds.write_dataset(table, base_dir, format="parquet") # check 
that all files are present file_paths = list(base_dir.rglob("*")) @@ -2735,7 +2602,7 @@ def test_write_dataset_parquet(tempdir): for version in ["1.0", "2.0"]: format = ds.ParquetFileFormat() opts = format.make_write_options(version=version) - base_dir = tempdir / "parquet_dataset_version{0}".format(version) + base_dir = tempdir / 'parquet_dataset_version{0}'.format(version) ds.write_dataset(table, base_dir, format=format, file_options=opts) meta = pq.read_metadata(base_dir / "part-0.parquet") assert meta.format_version == version @@ -2760,12 +2627,12 @@ def test_write_dataset_schema_metadata(tempdir): # ensure that schema metadata gets written from pyarrow import feather - table = pa.table({"a": [1, 2, 3]}) - table = table.replace_schema_metadata({b"key": b"value"}) + table = pa.table({'a': [1, 2, 3]}) + table = table.replace_schema_metadata({b'key': b'value'}) ds.write_dataset(table, tempdir, format="feather") schema = feather.read_table(tempdir / "part-0.feather").schema - assert schema.metadata == {b"key": b"value"} + assert schema.metadata == {b'key': b'value'} @pytest.mark.parquet @@ -2773,12 +2640,12 @@ def test_write_dataset_schema_metadata_parquet(tempdir): # ensure that schema metadata gets written import pyarrow.parquet as pq - table = pa.table({"a": [1, 2, 3]}) - table = table.replace_schema_metadata({b"key": b"value"}) + table = pa.table({'a': [1, 2, 3]}) + table = table.replace_schema_metadata({b'key': b'value'}) ds.write_dataset(table, tempdir, format="parquet") schema = pq.read_table(tempdir / "part-0.parquet").schema - assert schema.metadata == {b"key": b"value"} + assert schema.metadata == {b'key': b'value'} @pytest.mark.parquet @@ -2786,23 +2653,22 @@ def test_write_dataset_schema_metadata_parquet(tempdir): def test_write_dataset_s3(s3_example_simple): # write dataset with s3 filesystem _, _, fs, _, host, port, access_key, secret_key = s3_example_simple - uri_template = "s3://{}:{}@{{}}?scheme=http&endpoint_override={}:{}".format( - access_key, secret_key, host, port + uri_template = ( + "s3://{}:{}@{{}}?scheme=http&endpoint_override={}:{}".format( + access_key, secret_key, host, port) ) - table = pa.table( - [ - pa.array(range(20)), - pa.array(np.random.randn(20)), - pa.array(np.repeat(["a", "b"], 10)), - ], - names=["f1", "f2", "part"], + table = pa.table([ + pa.array(range(20)), pa.array(np.random.randn(20)), + pa.array(np.repeat(['a', 'b'], 10))], + names=["f1", "f2", "part"] ) part = ds.partitioning(pa.schema([("part", pa.string())]), flavor="hive") # writing with filesystem object ds.write_dataset( - table, "mybucket/dataset", filesystem=fs, format="feather", partitioning=part + table, "mybucket/dataset", filesystem=fs, format="feather", + partitioning=part ) # check rountrip result = ds.dataset( From 982f68c1afefeedf1656ebd14f5b51ef0f6c796a Mon Sep 17 00:00:00 2001 From: Weston Pace Date: Mon, 15 Feb 2021 11:12:40 -1000 Subject: [PATCH 18/33] Added more tests, rounded out a few behaviors --- cpp/src/arrow/dataset/partition.cc | 15 ++- cpp/src/arrow/dataset/partition.h | 1 + cpp/src/arrow/dataset/partition_test.cc | 40 ++++++-- python/pyarrow/tests/test_dataset.py | 123 +++++++++++++++++++++--- 4 files changed, 153 insertions(+), 26 deletions(-) diff --git a/cpp/src/arrow/dataset/partition.cc b/cpp/src/arrow/dataset/partition.cc index 2afaf414f9d..46142560c13 100644 --- a/cpp/src/arrow/dataset/partition.cc +++ b/cpp/src/arrow/dataset/partition.cc @@ -98,12 +98,16 @@ Status KeyValuePartitioning::SetDefaultValuesFromKeys(const Expression& expr, return Status::OK(); } 
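// The change below makes ConjunctionFromGroupingRow null-aware: a null scalar
// in a grouping row is turned into is_null(field_ref(name)) instead of an
// equality against a null literal, so a row such as {a: 3, b: null} yields
//   and_(equal(field_ref("a"), literal(3)), is_null(field_ref("b")))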
-inline Expression ConjunctionFromGroupingRow(Scalar* row) { +Expression ConjunctionFromGroupingRow(Scalar* row) { ScalarVector* values = &checked_cast(row)->value; std::vector equality_expressions(values->size()); for (size_t i = 0; i < values->size(); ++i) { const std::string& name = row->type->field(static_cast(i))->name(); - equality_expressions[i] = equal(field_ref(name), literal(std::move(values->at(i)))); + if (values->at(i)->is_valid) { + equality_expressions[i] = equal(field_ref(name), literal(std::move(values->at(i)))); + } else { + equality_expressions[i] = is_null(field_ref(name)); + } } return and_(std::move(equality_expressions)); } @@ -272,7 +276,7 @@ Result DirectoryPartitioning::FormatValues( std::vector segments(static_cast(schema_->num_fields())); for (int i = 0; i < schema_->num_fields(); ++i) { - if (values[i] != nullptr) { + if (values[i] != nullptr && values[i]->is_valid) { segments[i] = values[i]->ToString(); continue; } @@ -432,7 +436,7 @@ std::shared_ptr DirectoryPartitioning::MakeFactory( util::optional HivePartitioning::ParseKey( const std::string& segment, const std::string& null_fallback) { auto name_end = string_view(segment).find_first_of('='); - // Keep for backwards compatibility, this would be produced by arrow <= 3 + // Not round-trippable if (name_end == string_view::npos) { return util::nullopt; } @@ -513,7 +517,8 @@ class HivePartitioningFactory : public KeyValuePartitioningFactory { // drop fields which aren't in field_names_ auto out_schema = SchemaFromColumnNames(schema, field_names_); - return std::make_shared(std::move(out_schema), dictionaries_); + return std::make_shared(std::move(out_schema), dictionaries_, + null_fallback_); } } diff --git a/cpp/src/arrow/dataset/partition.h b/cpp/src/arrow/dataset/partition.h index e5afd00c76d..bc59dfe53c5 100644 --- a/cpp/src/arrow/dataset/partition.h +++ b/cpp/src/arrow/dataset/partition.h @@ -202,6 +202,7 @@ class ARROW_DS_EXPORT HivePartitioning : public KeyValuePartitioning { null_fallback_(null_fallback) {} std::string type_name() const override { return "hive"; } + std::string null_fallback() const { return null_fallback_; } static util::optional ParseKey(const std::string& segment, const std::string& null_fallback); diff --git a/cpp/src/arrow/dataset/partition_test.cc b/cpp/src/arrow/dataset/partition_test.cc index b8dade238c0..4b9c1d222f9 100644 --- a/cpp/src/arrow/dataset/partition_test.cc +++ b/cpp/src/arrow/dataset/partition_test.cc @@ -80,15 +80,19 @@ class TestPartitioning : public ::testing::Test { void AssertPartition(const std::shared_ptr partitioning, const std::shared_ptr full_batch, - const RecordBatchVector& expected_batches) { + const RecordBatchVector& expected_batches, + const std::vector& expected_expressions) { ASSERT_OK_AND_ASSIGN(auto partition_results, partitioning->Partition(full_batch)); std::shared_ptr rest = full_batch; ASSERT_EQ(partition_results.batches.size(), expected_batches.size()); auto max_index = std::min(partition_results.batches.size(), expected_batches.size()); for (std::size_t partition_index = 0; partition_index < max_index; partition_index++) { - std::shared_ptr actual = partition_results.batches[partition_index]; - AssertBatchesEqual(*expected_batches[partition_index], *actual); + std::shared_ptr actual_batch = + partition_results.batches[partition_index]; + AssertBatchesEqual(*expected_batches[partition_index], *actual_batch); + Expression actual_expression = partition_results.expressions[partition_index]; + ASSERT_EQ(expected_expressions[partition_index], 
actual_expression); } } @@ -96,14 +100,15 @@ class TestPartitioning : public ::testing::Test { const std::shared_ptr schema, const std::string& record_batch_json, const std::shared_ptr partitioned_schema, - const std::vector& expected_record_batch_strs) { + const std::vector& expected_record_batch_strs, + const std::vector& expected_expressions) { auto record_batch = RecordBatchFromJSON(schema, record_batch_json); RecordBatchVector expected_batches; for (const auto& expected_record_batch_str : expected_record_batch_strs) { expected_batches.push_back( RecordBatchFromJSON(partitioned_schema, expected_record_batch_str)); } - AssertPartition(partitioning, record_batch, expected_batches); + AssertPartition(partitioning, record_batch, expected_batches, expected_expressions); } void AssertInspectError(const std::vector& paths) { @@ -132,7 +137,7 @@ class TestPartitioning : public ::testing::Test { std::shared_ptr written_schema_; }; -TEST_F(TestPartitioning, Basic) { +TEST_F(TestPartitioning, Partition) { auto partition_schema = schema({field("a", int32()), field("b", utf8())}); auto schema_ = schema({field("a", int32()), field("b", utf8()), field("c", uint32())}); auto remaining_schema = schema({field("c", uint32())}); @@ -147,7 +152,13 @@ TEST_F(TestPartitioning, Basic) { std::vector expected_batches = {R"([{"c": 0}, {"c": 1}])", R"([{"c": 2}])", R"([{"c": 3}, {"c": 5}])", R"([{"c": 4}])"}; - AssertPartition(partitioning, schema_, json, remaining_schema, expected_batches); + std::vector expected_expressions = { + and_(equal(field_ref("a"), literal(3)), equal(field_ref("b"), literal("x"))), + and_(equal(field_ref("a"), literal(1)), is_null(field_ref("b"))), + and_(is_null(field_ref("a")), is_null(field_ref("b"))), + and_(is_null(field_ref("a")), equal(field_ref("b"), literal("z")))}; + AssertPartition(partitioning, schema_, json, remaining_schema, expected_batches, + expected_expressions); } TEST_F(TestPartitioning, StructDictionaryNull) {} @@ -185,6 +196,10 @@ TEST_F(TestPartitioning, DirectoryPartitioningFormat) { equal(field_ref("alpha"), literal(0))), "0/hello"); AssertFormat(equal(field_ref("alpha"), literal(0)), "0"); + AssertFormat(and_(equal(field_ref("alpha"), literal(0)), is_null(field_ref("beta"))), + "0"); + AssertFormatError( + and_(is_null(field_ref("alpha")), equal(field_ref("beta"), literal("hello")))); AssertFormatError(equal(field_ref("beta"), literal("hello"))); AssertFormat(literal(true), ""); @@ -406,6 +421,17 @@ TEST_F(TestPartitioning, HiveDictionaryInference) { {DictStr("alpha"), DictStr("beta")}); } +TEST_F(TestPartitioning, HiveNullFallbackPassedOn) { + HivePartitioningFactoryOptions options; + options.null_fallback = "xyz"; + factory_ = HivePartitioning::MakeFactory(options); + + EXPECT_OK_AND_ASSIGN(auto schema, factory_->Inspect({"/alpha=a/beta=0"})); + EXPECT_OK_AND_ASSIGN(auto partitioning, factory_->Finish(schema)); + ASSERT_EQ("xyz", + std::static_pointer_cast(partitioning)->null_fallback()); +} + TEST_F(TestPartitioning, HiveDictionaryHasUniqueValues) { HivePartitioningFactoryOptions options; options.infer_dictionary = true; diff --git a/python/pyarrow/tests/test_dataset.py b/python/pyarrow/tests/test_dataset.py index b2c1fc9f030..a42dc83769e 100644 --- a/python/pyarrow/tests/test_dataset.py +++ b/python/pyarrow/tests/test_dataset.py @@ -16,8 +16,11 @@ # under the License. 
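# A minimal illustrative sketch (the _sketch_* names are chosen here and are
# not part of the patch) of the null-partition handling exercised by the tests
# added in this commit: HivePartitioning accepts a null_fallback segment, and
# parsing that segment yields an is_null() guard rather than an equality.
import pyarrow as pa
import pyarrow.dataset as ds

_sketch_part = ds.HivePartitioning(
    pa.schema([pa.field('alpha', pa.int64()), pa.field('beta', pa.int64())]),
    null_fallback='xyz'
)
_sketch_expr = _sketch_part.parse('/alpha=xyz/beta=3')
# _sketch_expr is equivalent to:
#     ds.field('alpha').is_null() & (ds.field('beta') == ds.scalar(3))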
import contextlib +import os +import posixpath import pathlib import pickle +from pyarrow.dataset import partitioning import textwrap import numpy as np @@ -381,11 +384,16 @@ def test_partitioning(): with pytest.raises(pa.ArrowInvalid): partitioning.parse('/prefix/3/aaa') + expr = partitioning.parse('/3') + expected = ds.field('group') == 3 + assert expr.equals(expected) + partitioning = ds.HivePartitioning( pa.schema([ pa.field('alpha', pa.int64()), pa.field('beta', pa.int64()) - ]) + ]), + null_fallback='xyz' ) expr = partitioning.parse('/alpha=0/beta=3') expected = ( @@ -394,6 +402,12 @@ def test_partitioning(): ) assert expr.equals(expected) + expr = partitioning.parse('/alpha=xyz/beta=3') + expected = ( + (ds.field('alpha').is_null() & (ds.field('beta') == ds.scalar(3))) + ) + assert expr.equals(expected) + for shouldfail in ['/alpha=one/beta=2', '/alpha=one', '/beta=two']: with pytest.raises(pa.ArrowInvalid): partitioning.parse(shouldfail) @@ -412,7 +426,7 @@ def test_expression_serialization(): d.is_valid(), a.cast(pa.int32(), safe=False), a.cast(pa.int32(), safe=False), a.isin([1, 2, 3]), ds.field('i64') > 5, ds.field('i64') == 5, - ds.field('i64') == 7] + ds.field('i64') == 7, ds.field('i64').is_null()] for expr in all_exprs: assert isinstance(expr, ds.Expression) restored = pickle.loads(pickle.dumps(expr)) @@ -440,7 +454,6 @@ def test_expression_construction(): with pytest.raises(pa.ArrowInvalid): field != {1} - def test_expression_boolean_operators(): # https://issues.apache.org/jira/browse/ARROW-11412 true = ds.scalar(True) @@ -468,6 +481,8 @@ def test_partition_keys(): assert ds._get_partition_keys(nope) == {} assert ds._get_partition_keys(a & nope) == {'a': 'a'} + null = ds.field('a').is_null() + assert ds._get_partition_keys(null) == {'a': None} def test_parquet_read_options(): opts1 = ds.ParquetReadOptions() @@ -1242,6 +1257,45 @@ def test_partitioning_factory_dictionary(mockfs, infer_dictionary): else: assert inferred_schema.field('key').type == pa.string() +def test_dictionary_partitioning_outer_nulls_raises(tempdir): + table = pa.table({'a': [ 'x', 'y', None ], 'b': ['x', 'y', 'z']}) + part = ds.partitioning(pa.schema([pa.field('a', pa.string()), pa.field('b', pa.string())])) + with pytest.raises(pa.ArrowInvalid): + ds.write_dataset(table, tempdir, format='parquet', partitioning=part) + +def _has_subdirs(basedir): + return any([os.path.isdir(os.path.join(basedir, el)) for el in os.listdir(basedir)]) + +def _do_list_all_dirs(basedir, path_so_far, result): + for f in os.listdir(basedir): + true_nested = os.path.join(basedir, f) + if os.path.isdir(true_nested): + norm_nested = posixpath.join(path_so_far, f) + if _has_subdirs(true_nested): + _do_list_all_dirs(true_nested, norm_nested, result) + else: + result.append(norm_nested) + +def _list_all_dirs(basedir): + result = [] + _do_list_all_dirs(basedir, '', result) + return result + +def _check_dataset_directories(tempdir, expected_directories): + actual_directories = set(_list_all_dirs(tempdir)) + assert actual_directories == set(expected_directories) + +def test_dictionary_partitioning_inner_nulls(tempdir): + table = pa.table({'a': [ 'x', 'y', 'z' ], 'b': ['x', 'y', None]}) + part = ds.partitioning(pa.schema([pa.field('a', pa.string()), pa.field('b', pa.string())])) + ds.write_dataset(table, tempdir, format='parquet', partitioning=part) + _check_dataset_directories(tempdir, ['x/x', 'y/y', 'z']) + +def test_hive_partitioning_nulls(tempdir): + table = pa.table({'a': [ 'x', None, 'z' ], 'b': ['x', 'y', None]}) + part = 
ds.HivePartitioning(pa.schema([pa.field('a', pa.string()), pa.field('b', pa.string())]), None, 'xyz') + ds.write_dataset(table, tempdir, format='parquet', partitioning=part) + _check_dataset_directories(tempdir, ['a=x/b=x', 'a=xyz/b=y', 'a=z/b=xyz']) def test_partitioning_function(): schema = pa.schema([("year", pa.int16()), ("month", pa.int8())]) @@ -1605,6 +1659,7 @@ def test_open_dataset_non_existing_file(): @pytest.mark.parquet @pytest.mark.parametrize('partitioning', ["directory", "hive"]) @pytest.mark.parametrize('null_fallback', ['xyz', None]) +@pytest.mark.parametrize('infer_dictionary', [False, True]) @pytest.mark.parametrize('partition_keys', [ (["A", "B", "C"], [1, 2, 3]), ([1, 2, 3], ["A", "B", "C"]), @@ -1614,30 +1669,30 @@ def test_open_dataset_non_existing_file(): ([1, 2, 3], ["A", None, "C"]), ([None, 2, 3], [None, 2, 3]), ]) -def test_open_dataset_partitioned_dictionary_type( - tempdir, partitioning, null_fallback, partition_keys +def test_partition_discovery( + tempdir, partitioning, null_fallback, infer_dictionary, partition_keys ): # ARROW-9288 / ARROW-9476 import pyarrow.parquet as pq table = pa.table({'a': range(9), 'b': [0.0] * 4 + [1.0] * 5}) - if None in partition_keys[0] or None in partition_keys[1]: + if partitioning == "directory" and (None in partition_keys[0] or None in partition_keys[1]): # Directory partitioning can't handle the first part being null return if partitioning == "directory": partitioning = ds.DirectoryPartitioning.discover( - ["part1", "part2"], infer_dictionary=True) + ["part1", "part2"], infer_dictionary=infer_dictionary) fmt = "{0}/{1}" null_value = None else: if null_fallback: partitioning = ds.HivePartitioning.discover( - infer_dictionary=True, null_fallback=null_fallback + infer_dictionary=infer_dictionary, null_fallback=null_fallback ) else: - partitioning = ds.HivePartitioning.discover(infer_dictionary=True) + partitioning = ds.HivePartitioning.discover(infer_dictionary=infer_dictionary) fmt = "part1={0}/part2={1}" if null_fallback: null_value = null_fallback @@ -1657,13 +1712,16 @@ def test_open_dataset_partitioned_dictionary_type( dataset = ds.dataset(str(basepath), partitioning=partitioning) - def dict_type(key): - value_type = pa.string() if isinstance(key, str) else pa.int32() - return pa.dictionary(pa.int32(), value_type) + def expected_type(key): + if infer_dictionary: + value_type = pa.string() if isinstance(key, str) else pa.int32() + return pa.dictionary(pa.int32(), value_type) + else: + return pa.string() if isinstance(key, str) else pa.int32() expected_schema = table.schema.append( - pa.field("part1", dict_type(part_keys1[0])) + pa.field("part1", expected_type(part_keys1[0])) ).append( - pa.field("part2", dict_type(part_keys2[0])) + pa.field("part2", expected_type(part_keys2[0])) ) assert dataset.schema.equals(expected_schema) @@ -2327,7 +2385,44 @@ def test_dataset_project_only_partition_columns(tempdir): part_only = dataset.to_table(columns=['part'], use_threads=False) assert all_cols.column('part').equals(part_only.column('part')) + +@pytest.mark.parquet +@pytest.mark.pandas +def test_write_to_dataset_given_null_just_works(tempdir): + import pyarrow.parquet as pq + + schema = pa.schema([ + pa.field('col', pa.int64()), + pa.field('part', pa.dictionary(pa.int32(), pa.string())) + ]) + table = pa.table({'part': [None, None, 'a', 'a'], 'col': list(range(4))}, schema=schema) + + path = str(tempdir / 'test_dataset') + pq.write_to_dataset(table, path, partition_cols=['part'], use_legacy_dataset=False) + + actual_table = 
pq.read_table(tempdir / 'test_dataset') + # column.equals can handle the difference in chunking but not the fact that `part` + # will have different dictionaries for the two chunks + assert actual_table.column('part').to_pylist() == table.column('part').to_pylist() + assert actual_table.column('col').equals(table.column('col')) + +@pytest.mark.parquet +@pytest.mark.pandas +def test_legacy_write_to_dataset_drops_null(tempdir): + import pyarrow.parquet as pq + + schema = pa.schema([ + pa.field('col', pa.int64()), + pa.field('part', pa.dictionary(pa.int32(), pa.string())) + ]) + table = pa.table({'part': ['a', 'a', None, None], 'col': list(range(4))}, schema=schema) + expected = pa.table({'part': ['a', 'a'], 'col': list(range(2))}, schema=schema) + + path = str(tempdir / 'test_dataset') + pq.write_to_dataset(table, path, partition_cols=['part'], use_legacy_dataset=True) + actual = pq.read_table(tempdir / 'test_dataset') + assert actual == expected @pytest.mark.parquet @pytest.mark.pandas From 07eee3a40b02f2f05d23c65778ac574c36dd5274 Mon Sep 17 00:00:00 2001 From: Weston Pace Date: Mon, 15 Feb 2021 12:07:41 -1000 Subject: [PATCH 19/33] Added tests for SetDefaultValues to ensure it does the correct thing on null --- cpp/src/arrow/dataset/partition_test.cc | 49 +++++++++++++++++++++++++ 1 file changed, 49 insertions(+) diff --git a/cpp/src/arrow/dataset/partition_test.cc b/cpp/src/arrow/dataset/partition_test.cc index 4b9c1d222f9..1573b64dcbe 100644 --- a/cpp/src/arrow/dataset/partition_test.cc +++ b/cpp/src/arrow/dataset/partition_test.cc @@ -458,6 +458,55 @@ TEST_F(TestPartitioning, HiveDictionaryHasUniqueValues) { AssertParseError("/alpha=yosemite"); // not in inspected dictionary } +TEST_F(TestPartitioning, SetDefaultValuesConcrete) { + auto small_schm = schema({field("c", int32())}); + auto schm = schema({field("a", int32()), field("b", utf8())}); + auto full_schm = schema({field("a", int32()), field("b", utf8()), field("c", int32())}); + RecordBatchProjector record_batch_projector(full_schm); + HivePartitioning part(schm); + part.SetDefaultValuesFromKeys( + and_(equal(field_ref("a"), literal(10)), equal(field_ref("b"), literal("y"))), + &record_batch_projector); + + auto in_rb = RecordBatchFromJSON(small_schm, R"([{"c": 0}, + {"c": 1}, + {"c": 2}, + {"c": 3} + ])"); + + EXPECT_OK_AND_ASSIGN(auto out_rb, record_batch_projector.Project(*in_rb)); + auto expected_rb = RecordBatchFromJSON(full_schm, R"([{"a": 10, "b": "y", "c": 0}, + {"a": 10, "b": "y", "c": 1}, + {"a": 10, "b": "y", "c": 2}, + {"a": 10, "b": "y", "c": 3} + ])"); + AssertBatchesEqual(*expected_rb, *out_rb); +} + +TEST_F(TestPartitioning, SetDefaultValuesNull) { + auto small_schm = schema({field("c", int32())}); + auto schm = schema({field("a", int32()), field("b", utf8())}); + auto full_schm = schema({field("a", int32()), field("b", utf8()), field("c", int32())}); + RecordBatchProjector record_batch_projector(full_schm); + HivePartitioning part(schm); + part.SetDefaultValuesFromKeys(and_(is_null(field_ref("a")), is_null(field_ref("b"))), + &record_batch_projector); + + auto in_rb = RecordBatchFromJSON(small_schm, R"([{"c": 0}, + {"c": 1}, + {"c": 2}, + {"c": 3} + ])"); + + EXPECT_OK_AND_ASSIGN(auto out_rb, record_batch_projector.Project(*in_rb)); + auto expected_rb = RecordBatchFromJSON(full_schm, R"([{"a": null, "b": null, "c": 0}, + {"a": null, "b": null, "c": 1}, + {"a": null, "b": null, "c": 2}, + {"a": null, "b": null, "c": 3} + ])"); + AssertBatchesEqual(*expected_rb, *out_rb); +} + TEST_F(TestPartitioning, EtlThenHive) { 
FieldVector etl_fields{field("year", int16()), field("month", int8()), field("day", int8()), field("hour", int8())}; From 8f1792d88b01aa91b20228e39aadc3ec32cc5047 Mon Sep 17 00:00:00 2001 From: Weston Pace Date: Mon, 15 Feb 2021 14:01:12 -1000 Subject: [PATCH 20/33] Cleaned up logic for valid but not known case --- cpp/src/arrow/dataset/partition.cc | 6 +++--- cpp/src/arrow/dataset/partition_test.cc | 14 +++++++------- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/cpp/src/arrow/dataset/partition.cc b/cpp/src/arrow/dataset/partition.cc index 46142560c13..e9c198e3398 100644 --- a/cpp/src/arrow/dataset/partition.cc +++ b/cpp/src/arrow/dataset/partition.cc @@ -88,9 +88,9 @@ Status KeyValuePartitioning::SetDefaultValuesFromKeys(const Expression& expr, if (known_value.concrete()) { RETURN_NOT_OK(projector->SetDefaultValue(match, known_value.datum.scalar())); } else if (known_value.valid) { - return Status::Invalid( - "Partition expression not defined enough to set default value for ", - ref_value.first.name()); + // We know some information about the value but nothing concrete enough to set. Can + // happen if expression is something like is_valid(field_ref("a")) + continue; } else { RETURN_NOT_OK(projector->SetDefaultValue(match, MakeNullScalar(field->type()))); } diff --git a/cpp/src/arrow/dataset/partition_test.cc b/cpp/src/arrow/dataset/partition_test.cc index 1573b64dcbe..39303fc71e1 100644 --- a/cpp/src/arrow/dataset/partition_test.cc +++ b/cpp/src/arrow/dataset/partition_test.cc @@ -464,9 +464,9 @@ TEST_F(TestPartitioning, SetDefaultValuesConcrete) { auto full_schm = schema({field("a", int32()), field("b", utf8()), field("c", int32())}); RecordBatchProjector record_batch_projector(full_schm); HivePartitioning part(schm); - part.SetDefaultValuesFromKeys( - and_(equal(field_ref("a"), literal(10)), equal(field_ref("b"), literal("y"))), - &record_batch_projector); + ARROW_EXPECT_OK(part.SetDefaultValuesFromKeys( + and_(equal(field_ref("a"), literal(10)), is_valid(field_ref("b"))), + &record_batch_projector)); auto in_rb = RecordBatchFromJSON(small_schm, R"([{"c": 0}, {"c": 1}, @@ -475,10 +475,10 @@ TEST_F(TestPartitioning, SetDefaultValuesConcrete) { ])"); EXPECT_OK_AND_ASSIGN(auto out_rb, record_batch_projector.Project(*in_rb)); - auto expected_rb = RecordBatchFromJSON(full_schm, R"([{"a": 10, "b": "y", "c": 0}, - {"a": 10, "b": "y", "c": 1}, - {"a": 10, "b": "y", "c": 2}, - {"a": 10, "b": "y", "c": 3} + auto expected_rb = RecordBatchFromJSON(full_schm, R"([{"a": 10, "b": null, "c": 0}, + {"a": 10, "b": null, "c": 1}, + {"a": 10, "b": null, "c": 2}, + {"a": 10, "b": null, "c": 3} ])"); AssertBatchesEqual(*expected_rb, *out_rb); } From 6f7ced57d06a8b37707694dcda3732ca7834e21a Mon Sep 17 00:00:00 2001 From: Weston Pace Date: Tue, 16 Feb 2021 08:29:20 -1000 Subject: [PATCH 21/33] Fixing compiler warning --- cpp/src/arrow/dataset/partition_test.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/src/arrow/dataset/partition_test.cc b/cpp/src/arrow/dataset/partition_test.cc index 39303fc71e1..aad828cb66d 100644 --- a/cpp/src/arrow/dataset/partition_test.cc +++ b/cpp/src/arrow/dataset/partition_test.cc @@ -489,8 +489,8 @@ TEST_F(TestPartitioning, SetDefaultValuesNull) { auto full_schm = schema({field("a", int32()), field("b", utf8()), field("c", int32())}); RecordBatchProjector record_batch_projector(full_schm); HivePartitioning part(schm); - part.SetDefaultValuesFromKeys(and_(is_null(field_ref("a")), is_null(field_ref("b"))), - &record_batch_projector); + 
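  // With is_null() guards for "a" and "b", SetDefaultValuesFromKeys makes the
  // projector fill those two columns with typed nulls for every projected row
  // (the test's expected batch has "a" and "b" all null).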
ARROW_EXPECT_OK(part.SetDefaultValuesFromKeys( + and_(is_null(field_ref("a")), is_null(field_ref("b"))), &record_batch_projector)); auto in_rb = RecordBatchFromJSON(small_schm, R"([{"c": 0}, {"c": 1}, From 212c9bc148578e6fad5e470b503ad3deb02caa30 Mon Sep 17 00:00:00 2001 From: Weston Pace Date: Tue, 16 Feb 2021 08:49:23 -1000 Subject: [PATCH 22/33] Python lint --- python/pyarrow/tests/test_dataset.py | 63 +++++++++++++++++++--------- 1 file changed, 43 insertions(+), 20 deletions(-) diff --git a/python/pyarrow/tests/test_dataset.py b/python/pyarrow/tests/test_dataset.py index a42dc83769e..e12f802e610 100644 --- a/python/pyarrow/tests/test_dataset.py +++ b/python/pyarrow/tests/test_dataset.py @@ -20,14 +20,12 @@ import posixpath import pathlib import pickle -from pyarrow.dataset import partitioning import textwrap import numpy as np import pytest import pyarrow as pa -import pyarrow.csv import pyarrow.fs as fs from pyarrow.tests.util import change_cwd, _filesystem_uri @@ -454,6 +452,7 @@ def test_expression_construction(): with pytest.raises(pa.ArrowInvalid): field != {1} + def test_expression_boolean_operators(): # https://issues.apache.org/jira/browse/ARROW-11412 true = ds.scalar(True) @@ -484,6 +483,7 @@ def test_partition_keys(): null = ds.field('a').is_null() assert ds._get_partition_keys(null) == {'a': None} + def test_parquet_read_options(): opts1 = ds.ParquetReadOptions() opts2 = ds.ParquetReadOptions(buffer_size=4096, @@ -1257,14 +1257,19 @@ def test_partitioning_factory_dictionary(mockfs, infer_dictionary): else: assert inferred_schema.field('key').type == pa.string() + def test_dictionary_partitioning_outer_nulls_raises(tempdir): - table = pa.table({'a': [ 'x', 'y', None ], 'b': ['x', 'y', 'z']}) - part = ds.partitioning(pa.schema([pa.field('a', pa.string()), pa.field('b', pa.string())])) + table = pa.table({'a': ['x', 'y', None], 'b': ['x', 'y', 'z']}) + part = ds.partitioning( + pa.schema([pa.field('a', pa.string()), pa.field('b', pa.string())])) with pytest.raises(pa.ArrowInvalid): ds.write_dataset(table, tempdir, format='parquet', partitioning=part) + def _has_subdirs(basedir): - return any([os.path.isdir(os.path.join(basedir, el)) for el in os.listdir(basedir)]) + elements = os.listdir(basedir) + return any([os.path.isdir(os.path.join(basedir, el)) for el in elements]) + def _do_list_all_dirs(basedir, path_so_far, result): for f in os.listdir(basedir): @@ -1276,27 +1281,34 @@ def _do_list_all_dirs(basedir, path_so_far, result): else: result.append(norm_nested) + def _list_all_dirs(basedir): result = [] _do_list_all_dirs(basedir, '', result) return result + def _check_dataset_directories(tempdir, expected_directories): actual_directories = set(_list_all_dirs(tempdir)) assert actual_directories == set(expected_directories) + def test_dictionary_partitioning_inner_nulls(tempdir): - table = pa.table({'a': [ 'x', 'y', 'z' ], 'b': ['x', 'y', None]}) - part = ds.partitioning(pa.schema([pa.field('a', pa.string()), pa.field('b', pa.string())])) + table = pa.table({'a': ['x', 'y', 'z'], 'b': ['x', 'y', None]}) + part = ds.partitioning( + pa.schema([pa.field('a', pa.string()), pa.field('b', pa.string())])) ds.write_dataset(table, tempdir, format='parquet', partitioning=part) _check_dataset_directories(tempdir, ['x/x', 'y/y', 'z']) + def test_hive_partitioning_nulls(tempdir): - table = pa.table({'a': [ 'x', None, 'z' ], 'b': ['x', 'y', None]}) - part = ds.HivePartitioning(pa.schema([pa.field('a', pa.string()), pa.field('b', pa.string())]), None, 'xyz') + table = pa.table({'a': ['x', 
None, 'z'], 'b': ['x', 'y', None]}) + part = ds.HivePartitioning(pa.schema( + [pa.field('a', pa.string()), pa.field('b', pa.string())]), None, 'xyz') ds.write_dataset(table, tempdir, format='parquet', partitioning=part) _check_dataset_directories(tempdir, ['a=x/b=x', 'a=xyz/b=y', 'a=z/b=xyz']) + def test_partitioning_function(): schema = pa.schema([("year", pa.int16()), ("month", pa.int8())]) names = ["year", "month"] @@ -1677,7 +1689,8 @@ def test_partition_discovery( table = pa.table({'a': range(9), 'b': [0.0] * 4 + [1.0] * 5}) - if partitioning == "directory" and (None in partition_keys[0] or None in partition_keys[1]): + has_null = None in partition_keys[0] or None in partition_keys[1] + if partitioning == "directory" and has_null: # Directory partitioning can't handle the first part being null return @@ -1692,7 +1705,8 @@ def test_partition_discovery( infer_dictionary=infer_dictionary, null_fallback=null_fallback ) else: - partitioning = ds.HivePartitioning.discover(infer_dictionary=infer_dictionary) + partitioning = ds.HivePartitioning.discover( + infer_dictionary=infer_dictionary) fmt = "part1={0}/part2={1}" if null_fallback: null_value = null_fallback @@ -2385,7 +2399,8 @@ def test_dataset_project_only_partition_columns(tempdir): part_only = dataset.to_table(columns=['part'], use_threads=False) assert all_cols.column('part').equals(part_only.column('part')) - + + @pytest.mark.parquet @pytest.mark.pandas def test_write_to_dataset_given_null_just_works(tempdir): @@ -2395,17 +2410,21 @@ def test_write_to_dataset_given_null_just_works(tempdir): pa.field('col', pa.int64()), pa.field('part', pa.dictionary(pa.int32(), pa.string())) ]) - table = pa.table({'part': [None, None, 'a', 'a'], 'col': list(range(4))}, schema=schema) + table = pa.table({'part': [None, None, 'a', 'a'], + 'col': list(range(4))}, schema=schema) path = str(tempdir / 'test_dataset') - pq.write_to_dataset(table, path, partition_cols=['part'], use_legacy_dataset=False) + pq.write_to_dataset(table, path, partition_cols=[ + 'part'], use_legacy_dataset=False) actual_table = pq.read_table(tempdir / 'test_dataset') - # column.equals can handle the difference in chunking but not the fact that `part` - # will have different dictionaries for the two chunks - assert actual_table.column('part').to_pylist() == table.column('part').to_pylist() + # column.equals can handle the difference in chunking but not the fact + # that `part` will have different dictionaries for the two chunks + assert actual_table.column('part').to_pylist( + ) == table.column('part').to_pylist() assert actual_table.column('col').equals(table.column('col')) + @pytest.mark.parquet @pytest.mark.pandas def test_legacy_write_to_dataset_drops_null(tempdir): @@ -2415,15 +2434,19 @@ def test_legacy_write_to_dataset_drops_null(tempdir): pa.field('col', pa.int64()), pa.field('part', pa.dictionary(pa.int32(), pa.string())) ]) - table = pa.table({'part': ['a', 'a', None, None], 'col': list(range(4))}, schema=schema) - expected = pa.table({'part': ['a', 'a'], 'col': list(range(2))}, schema=schema) + table = pa.table({'part': ['a', 'a', None, None], + 'col': list(range(4))}, schema=schema) + expected = pa.table( + {'part': ['a', 'a'], 'col': list(range(2))}, schema=schema) path = str(tempdir / 'test_dataset') - pq.write_to_dataset(table, path, partition_cols=['part'], use_legacy_dataset=True) + pq.write_to_dataset(table, path, partition_cols=[ + 'part'], use_legacy_dataset=True) actual = pq.read_table(tempdir / 'test_dataset') assert actual == expected + @pytest.mark.parquet 
@pytest.mark.pandas def test_dataset_project_null_column(tempdir): From 9ef4a71697ca2a263e046aa683aa5ae1a783cad3 Mon Sep 17 00:00:00 2001 From: Weston Pace Date: Mon, 22 Feb 2021 09:32:06 -1000 Subject: [PATCH 23/33] Addressing PR comments --- cpp/src/arrow/compute/kernels/vector_hash.cc | 3 +-- cpp/src/arrow/dataset/expression.cc | 4 ---- cpp/src/arrow/dataset/expression.h | 3 --- cpp/src/arrow/dataset/expression_test.cc | 4 ++++ cpp/src/arrow/dataset/partition.cc | 25 ++++++++++---------- cpp/src/arrow/dataset/partition.h | 4 ++-- cpp/src/arrow/dataset/partition_test.cc | 12 ++++++++-- 7 files changed, 30 insertions(+), 25 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/vector_hash.cc b/cpp/src/arrow/compute/kernels/vector_hash.cc index c7b25347624..694c6265825 100644 --- a/cpp/src/arrow/compute/kernels/vector_hash.cc +++ b/cpp/src/arrow/compute/kernels/vector_hash.cc @@ -689,6 +689,7 @@ const FunctionDoc value_counts_doc( "Nulls in the input are ignored."), {"array"}); +const auto kDefaultDictionaryEncodeOptions = DictionaryEncodeOptions::Defaults(); const FunctionDoc dictionary_encode_doc( "Dictionary-encode array", ("Return a dictionary-encoded version of the input array."), {"array"}, @@ -738,8 +739,6 @@ void RegisterVectorHash(FunctionRegistry* registry) { // ---------------------------------------------------------------------- // dictionary_encode - const auto kDefaultDictionaryEncodeOptions = DictionaryEncodeOptions::Defaults(); - base.finalize = DictEncodeFinalize; // Unique and ValueCounts output unchunked arrays base.output_chunked = true; diff --git a/cpp/src/arrow/dataset/expression.cc b/cpp/src/arrow/dataset/expression.cc index ef92ae09fe7..fb62f819121 100644 --- a/cpp/src/arrow/dataset/expression.cc +++ b/cpp/src/arrow/dataset/expression.cc @@ -51,10 +51,6 @@ Expression::Expression(Parameter parameter) Expression literal(Datum lit) { return Expression(std::move(lit)); } -Expression null_literal(const std::shared_ptr& type) { - return Expression(MakeNullScalar(type)); -} - Expression field_ref(FieldRef ref) { return Expression(Expression::Parameter{std::move(ref), {}}); } diff --git a/cpp/src/arrow/dataset/expression.h b/cpp/src/arrow/dataset/expression.h index 1bbcb471015..b6b47fb8a2e 100644 --- a/cpp/src/arrow/dataset/expression.h +++ b/cpp/src/arrow/dataset/expression.h @@ -135,9 +135,6 @@ inline bool operator!=(const Expression& l, const Expression& r) { return !l.Equ ARROW_DS_EXPORT Expression literal(Datum lit); -ARROW_DS_EXPORT -Expression null_literal(const std::shared_ptr& type); - template Expression literal(Arg&& arg) { return literal(Datum(std::forward(arg))); diff --git a/cpp/src/arrow/dataset/expression_test.cc b/cpp/src/arrow/dataset/expression_test.cc index 3aa62319e85..adaf6c3410d 100644 --- a/cpp/src/arrow/dataset/expression_test.cc +++ b/cpp/src/arrow/dataset/expression_test.cc @@ -240,6 +240,10 @@ TEST(Expression, Equality) { call("cast", {field_ref("a")}, compute::CastOptions::Unsafe(int32()))); } +Expression null_literal(const std::shared_ptr& type) { + return Expression(MakeNullScalar(type)); +} + TEST(Expression, Hash) { std::unordered_set set; diff --git a/cpp/src/arrow/dataset/partition.cc b/cpp/src/arrow/dataset/partition.cc index e9c198e3398..9515f631d1a 100644 --- a/cpp/src/arrow/dataset/partition.cc +++ b/cpp/src/arrow/dataset/partition.cc @@ -162,7 +162,7 @@ Result KeyValuePartitioning::ConvertKey(const Key& key) const { std::shared_ptr converted; - if (key.null) { + if (!key.value.has_value()) { return 
is_null(field_ref(field->name())); } else if (field->type()->id() == Type::DICTIONARY) { if (dictionaries_.empty() || dictionaries_[field_index] == nullptr) { @@ -181,7 +181,7 @@ Result KeyValuePartitioning::ConvertKey(const Key& key) const { } // look up the partition value in the dictionary - ARROW_ASSIGN_OR_RAISE(converted, Scalar::Parse(value.dictionary->type(), key.value)); + ARROW_ASSIGN_OR_RAISE(converted, Scalar::Parse(value.dictionary->type(), *key.value)); ARROW_ASSIGN_OR_RAISE(auto index, compute::IndexIn(converted, value.dictionary)); value.index = index.scalar(); if (!value.index->is_valid) { @@ -190,7 +190,7 @@ Result KeyValuePartitioning::ConvertKey(const Key& key) const { } converted = std::make_shared(std::move(value), field->type()); } else { - ARROW_ASSIGN_OR_RAISE(converted, Scalar::Parse(field->type(), key.value)); + ARROW_ASSIGN_OR_RAISE(converted, Scalar::Parse(field->type(), *key.value)); } return equal(field_ref(field->name()), literal(std::move(converted))); @@ -254,7 +254,7 @@ std::vector DirectoryPartitioning::ParseKeys( for (auto&& segment : fs::internal::SplitAbstractPath(path)) { if (i >= schema_->num_fields()) break; - keys.push_back({schema_->field(i++)->name(), std::move(segment), false}); + keys.push_back({schema_->field(i++)->name(), std::move(segment)}); } return keys; @@ -441,11 +441,12 @@ util::optional HivePartitioning::ParseKey( return util::nullopt; } + auto name = segment.substr(0, name_end); auto value = segment.substr(name_end + 1); if (value == null_fallback) { - return Key{segment.substr(0, name_end), "", true}; + return Key{name, util::nullopt}; } - return Key{segment.substr(0, name_end), segment.substr(name_end + 1), false}; + return Key{name, value}; } std::vector HivePartitioning::ParseKeys( @@ -493,8 +494,8 @@ class HivePartitioningFactory : public KeyValuePartitioningFactory { for (auto path : paths) { for (auto&& segment : fs::internal::SplitAbstractPath(path)) { if (auto key = HivePartitioning::ParseKey(segment, null_fallback_)) { - if (!key->null) { - RETURN_NOT_OK(InsertRepr(key->name, key->value)); + if (key->value.has_value()) { + RETURN_NOT_OK(InsertRepr(key->name, *key->value)); } } } @@ -656,10 +657,10 @@ class StructDictionary { Status AddOne(Datum column, std::shared_ptr* fused_indices) { if (column.type()->id() == Type::DICTIONARY) { if (column.null_count() != 0) { - // TODO Optimize this by allowign DictionaryEncode to transfer a null-masked - // dictionary to a null-encoded dictionary. At the moment we decode and then - // encode causing one extra copy, and a potentially expansive decoding copy at - // that. + // TODO(ARROW-11732) Optimize this by allowign DictionaryEncode to transfer a + // null-masked dictionary to a null-encoded dictionary. At the moment we decode + // and then encode causing one extra copy, and a potentially expansive decoding + // copy at that. 
ARROW_ASSIGN_OR_RAISE( auto decoded_dictionary, compute::Cast( diff --git a/cpp/src/arrow/dataset/partition.h b/cpp/src/arrow/dataset/partition.h index bc59dfe53c5..42e1b4c4097 100644 --- a/cpp/src/arrow/dataset/partition.h +++ b/cpp/src/arrow/dataset/partition.h @@ -124,8 +124,8 @@ class ARROW_DS_EXPORT KeyValuePartitioning : public Partitioning { /// An unconverted equality expression consisting of a field name and the representation /// of a scalar value struct Key { - std::string name, value; - bool null; + std::string name; + util::optional value; }; static Status SetDefaultValuesFromKeys(const Expression& expr, diff --git a/cpp/src/arrow/dataset/partition_test.cc b/cpp/src/arrow/dataset/partition_test.cc index aad828cb66d..80d65daf159 100644 --- a/cpp/src/arrow/dataset/partition_test.cc +++ b/cpp/src/arrow/dataset/partition_test.cc @@ -375,7 +375,9 @@ TEST_F(TestPartitioning, HivePartitioningFormat) { } TEST_F(TestPartitioning, DiscoverHiveSchema) { - factory_ = HivePartitioning::MakeFactory(); + auto options = HivePartitioningFactoryOptions(); + options.infer_dictionary = "xyz"; + factory_ = HivePartitioning::MakeFactory(options); // type is int32 if possible AssertInspect({"/alpha=0/beta=1"}, {Int("alpha"), Int("beta")}); @@ -388,6 +390,12 @@ TEST_F(TestPartitioning, DiscoverHiveSchema) { // (...so ensure your partitions are ordered the same for all paths) AssertInspect({"/alpha=0/beta=1", "/beta=2/alpha=3"}, {Int("alpha"), Int("beta")}); + // Null fallback strings shouldn't interfere with type inference + AssertInspect({"/alpha=xyz/beta=x", "/alpha=7/beta=xyz"}, {Int("alpha"), Str("beta")}); + + // Only null strings are inferred as text + AssertInspect({"/alpha=xyz"}, {Str("alpha")}); + // If there are too many digits fall back to string AssertInspect({"/alpha=3760212050"}, {Str("alpha")}); @@ -611,7 +619,7 @@ class RangePartitioning : public Partitioning { } std::smatch matches; - RETURN_NOT_OK(DoRegex(key->value, &matches)); + RETURN_NOT_OK(DoRegex(*key->value, &matches)); auto& min_cmp = matches[1] == "[" ? 
greater_equal : greater; std::string min_repr = matches[2]; From c54c55db45affd5cfb550171980ffedfa7f0b160 Mon Sep 17 00:00:00 2001 From: Weston Pace Date: Mon, 22 Feb 2021 09:06:40 -1000 Subject: [PATCH 24/33] Update cpp/src/arrow/compute/kernels/vector_hash.cc Co-authored-by: Benjamin Kietzman --- cpp/src/arrow/compute/kernels/vector_hash.cc | 1 - 1 file changed, 1 deletion(-) diff --git a/cpp/src/arrow/compute/kernels/vector_hash.cc b/cpp/src/arrow/compute/kernels/vector_hash.cc index 694c6265825..da8e7db1929 100644 --- a/cpp/src/arrow/compute/kernels/vector_hash.cc +++ b/cpp/src/arrow/compute/kernels/vector_hash.cc @@ -578,7 +578,6 @@ std::unique_ptr DictionaryHashInit(KernelContext* ctx, DCHECK(false) << "Unsupported dictionary index type"; break; } - DictionaryEncodeOptions options = DictionaryEncodeOptions::Defaults(); return ::arrow::internal::make_unique(std::move(indices_hasher)); } From 9b0f8eecf9a0c10e655b93562959616b7e6ddeb6 Mon Sep 17 00:00:00 2001 From: Weston Pace Date: Mon, 22 Feb 2021 09:06:47 -1000 Subject: [PATCH 25/33] Update cpp/src/arrow/compute/kernels/vector_hash.cc Co-authored-by: Benjamin Kietzman --- cpp/src/arrow/compute/kernels/vector_hash.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/arrow/compute/kernels/vector_hash.cc b/cpp/src/arrow/compute/kernels/vector_hash.cc index da8e7db1929..754d8fba83b 100644 --- a/cpp/src/arrow/compute/kernels/vector_hash.cc +++ b/cpp/src/arrow/compute/kernels/vector_hash.cc @@ -152,7 +152,7 @@ class ValueCountsAction final : ActionBase { } } - bool ShouldEncodeNulls() { return true; } + constexpr bool ShouldEncodeNulls() const { return true; } private: Int64Builder count_builder_; From ce53d4eaa773ccfbe525c7b7e93efa20af5d024f Mon Sep 17 00:00:00 2001 From: Weston Pace Date: Mon, 22 Feb 2021 09:07:49 -1000 Subject: [PATCH 26/33] Update cpp/src/arrow/compute/kernels/vector_hash_test.cc Co-authored-by: Benjamin Kietzman --- cpp/src/arrow/compute/kernels/vector_hash_test.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/vector_hash_test.cc b/cpp/src/arrow/compute/kernels/vector_hash_test.cc index f4cd7dbf41f..179792e2141 100644 --- a/cpp/src/arrow/compute/kernels/vector_hash_test.cc +++ b/cpp/src/arrow/compute/kernels/vector_hash_test.cc @@ -306,8 +306,8 @@ TEST_F(TestHashKernel, ValueCountsBoolean) { } TEST_F(TestHashKernel, ValueCountsNull) { - CheckValueCounts( - null(), {nullptr, nullptr, nullptr}, {true, false, true}, {nullptr}, {false}, {3}); + CheckValueCounts(ArrayFromJSON(null(), "[null, null, null]"), + ArrayFromJSON(null(), "[null]"), ArrayFromJSON(int64(), "[3]")); } TEST_F(TestHashKernel, DictEncodeBoolean) { From c2aa3ad879f4371974039aa37ad344b34c645467 Mon Sep 17 00:00:00 2001 From: Weston Pace Date: Mon, 22 Feb 2021 09:09:25 -1000 Subject: [PATCH 27/33] Update cpp/src/arrow/dataset/partition_test.cc Co-authored-by: Benjamin Kietzman --- cpp/src/arrow/dataset/partition_test.cc | 2 -- 1 file changed, 2 deletions(-) diff --git a/cpp/src/arrow/dataset/partition_test.cc b/cpp/src/arrow/dataset/partition_test.cc index 80d65daf159..eb8f5aa957b 100644 --- a/cpp/src/arrow/dataset/partition_test.cc +++ b/cpp/src/arrow/dataset/partition_test.cc @@ -161,8 +161,6 @@ TEST_F(TestPartitioning, Partition) { expected_expressions); } -TEST_F(TestPartitioning, StructDictionaryNull) {} - TEST_F(TestPartitioning, DirectoryPartitioning) { partitioning_ = std::make_shared( schema({field("alpha", int32()), field("beta", utf8())})); From 
7d5de82c5edf379fb4bb5184778e9ec6ac493f8a Mon Sep 17 00:00:00 2001 From: Weston Pace Date: Mon, 22 Feb 2021 09:16:25 -1000 Subject: [PATCH 28/33] Update python/pyarrow/_dataset.pyx Co-authored-by: Benjamin Kietzman --- python/pyarrow/_dataset.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/pyarrow/_dataset.pyx b/python/pyarrow/_dataset.pyx index e38ea626d79..104a47b98c5 100644 --- a/python/pyarrow/_dataset.pyx +++ b/python/pyarrow/_dataset.pyx @@ -1594,7 +1594,7 @@ cdef class HivePartitioning(Partitioning): corresponding entry of `dictionaries` must be an array containing every value which may be taken by the corresponding column or an error will be raised in parsing. - null_fallback : str + null_fallback : str, default "__HIVE_DEFAULT_PARTITION__" If any field is None then this fallback will be used as a label Returns From f1a6759da12b36d6a229f51d79c9aa5dfecbba47 Mon Sep 17 00:00:00 2001 From: Weston Pace Date: Mon, 22 Feb 2021 10:11:53 -1000 Subject: [PATCH 29/33] Added test case to probe what happens when inferring a partition column that is only null. Changed it to an error to match directory partitioning --- cpp/src/arrow/dataset/partition.cc | 17 ++++++++++------- cpp/src/arrow/dataset/partition_test.cc | 6 +++--- 2 files changed, 13 insertions(+), 10 deletions(-) diff --git a/cpp/src/arrow/dataset/partition.cc b/cpp/src/arrow/dataset/partition.cc index 9515f631d1a..f96002b36b2 100644 --- a/cpp/src/arrow/dataset/partition.cc +++ b/cpp/src/arrow/dataset/partition.cc @@ -186,7 +186,7 @@ Result KeyValuePartitioning::ConvertKey(const Key& key) const { value.index = index.scalar(); if (!value.index->is_valid) { return Status::Invalid("Dictionary supplied for field ", field->ToString(), - " does not contain '", key.value, "'"); + " does not contain '", *key.value, "'"); } converted = std::make_shared(std::move(value), field->type()); } else { @@ -311,8 +311,13 @@ class KeyValuePartitioningFactory : public PartitioningFactory { return it_inserted.first->second; } - Status InsertRepr(const std::string& name, util::string_view repr) { - return InsertRepr(GetOrInsertField(name), repr); + Status InsertRepr(const std::string& name, util::optional repr) { + auto field_index = GetOrInsertField(name); + if (repr.has_value()) { + return InsertRepr(field_index, *repr); + } else { + return Status::OK(); + } } Status InsertRepr(int index, util::string_view repr) { @@ -333,7 +338,7 @@ class KeyValuePartitioningFactory : public PartitioningFactory { RETURN_NOT_OK(repr_memos_[index]->GetArrayData(0, &reprs)); if (reprs->length == 0) { - return Status::Invalid("No segments were available for field '", name, + return Status::Invalid("No non-null segments were available for field '", name, "'; couldn't infer type"); } @@ -494,9 +499,7 @@ class HivePartitioningFactory : public KeyValuePartitioningFactory { for (auto path : paths) { for (auto&& segment : fs::internal::SplitAbstractPath(path)) { if (auto key = HivePartitioning::ParseKey(segment, null_fallback_)) { - if (key->value.has_value()) { - RETURN_NOT_OK(InsertRepr(key->name, *key->value)); - } + RETURN_NOT_OK(InsertRepr(key->name, key->value)); } } } diff --git a/cpp/src/arrow/dataset/partition_test.cc b/cpp/src/arrow/dataset/partition_test.cc index eb8f5aa957b..75e60f994f0 100644 --- a/cpp/src/arrow/dataset/partition_test.cc +++ b/cpp/src/arrow/dataset/partition_test.cc @@ -374,7 +374,7 @@ TEST_F(TestPartitioning, HivePartitioningFormat) { TEST_F(TestPartitioning, DiscoverHiveSchema) { auto options = 
HivePartitioningFactoryOptions(); - options.infer_dictionary = "xyz"; + options.null_fallback = "xyz"; factory_ = HivePartitioning::MakeFactory(options); // type is int32 if possible @@ -391,8 +391,8 @@ TEST_F(TestPartitioning, DiscoverHiveSchema) { // Null fallback strings shouldn't interfere with type inference AssertInspect({"/alpha=xyz/beta=x", "/alpha=7/beta=xyz"}, {Int("alpha"), Str("beta")}); - // Only null strings are inferred as text - AssertInspect({"/alpha=xyz"}, {Str("alpha")}); + // Cannot infer if the only values are null + AssertInspectError({"/alpha=xyz"}); // If there are too many digits fall back to string AssertInspect({"/alpha=3760212050"}, {Str("alpha")}); From dadbe8b3362346ff2d54e076215b46f44835c976 Mon Sep 17 00:00:00 2001 From: Benjamin Kietzman Date: Fri, 19 Feb 2021 13:01:35 -0500 Subject: [PATCH 30/33] Use null scalars for known-null fields --- cpp/src/arrow/dataset/expression.cc | 82 ++++++++------------ cpp/src/arrow/dataset/expression.h | 24 +----- cpp/src/arrow/dataset/expression_test.cc | 48 ++++++------ cpp/src/arrow/dataset/partition.cc | 60 +++++++------- cpp/src/arrow/dataset/projector.cc | 16 +++- python/pyarrow/_dataset.pyx | 11 +-- python/pyarrow/includes/libarrow_dataset.pxd | 9 +-- python/pyarrow/public-api.pxi | 3 + python/pyarrow/tests/test_dataset.py | 1 + 9 files changed, 108 insertions(+), 146 deletions(-) diff --git a/cpp/src/arrow/dataset/expression.cc b/cpp/src/arrow/dataset/expression.cc index fb62f819121..9764700816c 100644 --- a/cpp/src/arrow/dataset/expression.cc +++ b/cpp/src/arrow/dataset/expression.cc @@ -95,6 +95,8 @@ namespace { std::string PrintDatum(const Datum& datum) { if (datum.is_scalar()) { + if (!datum.scalar()->is_valid) return "null"; + switch (datum.type()->id()) { case Type::STRING: case Type::LARGE_STRING: @@ -110,6 +112,7 @@ std::string PrintDatum(const Datum& datum) { default: break; } + return datum.scalar()->ToString(); } return datum.ToString(); @@ -684,27 +687,27 @@ std::vector GuaranteeConjunctionMembers( // conjunction_members Status ExtractKnownFieldValuesImpl( std::vector* conjunction_members, - std::unordered_map* known_values) { - auto unconsumed_end = std::partition( - conjunction_members->begin(), conjunction_members->end(), - [](const Expression& expr) { - // search for an equality conditions between a field and a literal - auto call = expr.call(); - if (!call) return true; - - if (call->function_name == "equal") { - auto ref = call->arguments[0].field_ref(); - auto lit = call->arguments[1].literal(); - return !(ref && lit); - } - - if (call->function_name == "is_null" || call->function_name == "is_valid") { - auto ref = call->arguments[0].field_ref(); - return !ref; - } - - return true; - }); + std::unordered_map* known_values) { + auto unconsumed_end = + std::partition(conjunction_members->begin(), conjunction_members->end(), + [](const Expression& expr) { + // search for an equality conditions between a field and a literal + auto call = expr.call(); + if (!call) return true; + + if (call->function_name == "equal") { + auto ref = call->arguments[0].field_ref(); + auto lit = call->arguments[1].literal(); + return !(ref && lit); + } + + if (call->function_name == "is_null") { + auto ref = call->arguments[0].field_ref(); + return !ref; + } + + return true; + }); for (auto it = unconsumed_end; it != conjunction_members->end(); ++it) { auto call = CallNotNull(*it); @@ -715,10 +718,7 @@ Status ExtractKnownFieldValuesImpl( known_values->emplace(*ref, *lit); } else if (call->function_name == "is_null") { auto 
ref = call->arguments[0].field_ref(); - known_values->emplace(*ref, false); - } else if (call->function_name == "is_valid") { - auto ref = call->arguments[0].field_ref(); - known_values->emplace(*ref, true); + known_values->emplace(*ref, std::make_shared()); } } @@ -729,16 +729,16 @@ Status ExtractKnownFieldValuesImpl( } // namespace -Result> -ExtractKnownFieldValues(const Expression& guaranteed_true_predicate) { +Result> ExtractKnownFieldValues( + const Expression& guaranteed_true_predicate) { auto conjunction_members = GuaranteeConjunctionMembers(guaranteed_true_predicate); - std::unordered_map known_values; + std::unordered_map known_values; RETURN_NOT_OK(ExtractKnownFieldValuesImpl(&conjunction_members, &known_values)); return known_values; } Result ReplaceFieldsWithKnownValues( - const std::unordered_map& known_values, + const std::unordered_map& known_values, Expression expr) { if (!expr.IsBound()) { return Status::Invalid( @@ -751,11 +751,7 @@ Result ReplaceFieldsWithKnownValues( if (auto ref = expr.field_ref()) { auto it = known_values.find(*ref); if (it != known_values.end()) { - const auto& known_value = it->second; - if (!known_value.concrete()) { - return expr; - } - auto lit = known_value.datum; + Datum lit = it->second; if (expr.type()->id() == Type::DICTIONARY) { if (lit.is_scalar()) { // FIXME the "right" way to support this is adding support for scalars to @@ -775,22 +771,6 @@ Result ReplaceFieldsWithKnownValues( ARROW_ASSIGN_OR_RAISE(lit, compute::Cast(lit, expr.type())); return literal(std::move(lit)); } - } else if (auto call = expr.call()) { - if (call->function_name == "is_null") { - if (auto ref = call->arguments[0].field_ref()) { - auto it = known_values.find(*ref); - if (it != known_values.end()) { - return literal(!it->second.valid); - } - } - } else if (call->function_name == "is_valid") { - if (auto ref = call->arguments[0].field_ref()) { - auto it = known_values.find(*ref); - if (it != known_values.end()) { - return literal(it->second.valid); - } - } - } } return expr; }, @@ -967,7 +947,7 @@ Result SimplifyWithGuarantee(Expression expr, const Expression& guaranteed_true_predicate) { auto conjunction_members = GuaranteeConjunctionMembers(guaranteed_true_predicate); - std::unordered_map known_values; + std::unordered_map known_values; RETURN_NOT_OK(ExtractKnownFieldValuesImpl(&conjunction_members, &known_values)); ARROW_ASSIGN_OR_RAISE(expr, diff --git a/cpp/src/arrow/dataset/expression.h b/cpp/src/arrow/dataset/expression.h index b6b47fb8a2e..8bdcb4a0ffa 100644 --- a/cpp/src/arrow/dataset/expression.h +++ b/cpp/src/arrow/dataset/expression.h @@ -159,27 +159,10 @@ Expression call(std::string function, std::vector arguments, ARROW_DS_EXPORT std::vector FieldsInExpression(const Expression&); -/// Represents either a concrete value or a hint that a field is valid/invalid -struct KnownFieldValue { - Datum datum; - bool valid; - - KnownFieldValue() : datum(), valid(false) {} - KnownFieldValue(const Datum& datum) // NOLINT implicit conversion - : datum(datum), valid(datum.length() != datum.null_count()) {} - KnownFieldValue(bool is_valid) // NOLINT implicit conversion - : datum(), valid(is_valid) {} - - inline bool concrete() const { return datum.kind() != Datum::Kind::NONE; } - bool operator==(const KnownFieldValue& other) const { - return datum == other.datum && valid == other.valid; - } -}; - /// Assemble a mapping from field references to known values. 
ARROW_DS_EXPORT -Result> -ExtractKnownFieldValues(const Expression& guaranteed_true_predicate); +Result> ExtractKnownFieldValues( + const Expression& guaranteed_true_predicate); /// \defgroup expression-passes Functions for modification of Expressions /// @@ -208,8 +191,7 @@ Result FoldConstants(Expression); /// Simplify Expressions by replacing with known values of the fields which it references. ARROW_DS_EXPORT Result ReplaceFieldsWithKnownValues( - const std::unordered_map& known_values, - Expression); + const std::unordered_map& known_values, Expression); /// Simplify an expression by replacing subexpressions based on a guarantee: /// a boolean expression which is guaranteed to evaluate to `true`. For example, this is diff --git a/cpp/src/arrow/dataset/expression_test.cc b/cpp/src/arrow/dataset/expression_test.cc index adaf6c3410d..c837c5be893 100644 --- a/cpp/src/arrow/dataset/expression_test.cc +++ b/cpp/src/arrow/dataset/expression_test.cc @@ -680,9 +680,8 @@ TEST(Expression, FoldConstantsBoolean) { TEST(Expression, ExtractKnownFieldValues) { struct { - void operator()( - Expression guarantee, - std::unordered_map expected) { + void operator()(Expression guarantee, + std::unordered_map expected) { ASSERT_OK_AND_ASSIGN(auto actual, ExtractKnownFieldValues(guarantee)); EXPECT_THAT(actual, UnorderedElementsAreArray(expected)) << " guarantee: " << guarantee.ToString(); @@ -730,20 +729,20 @@ TEST(Expression, ExtractKnownFieldValues) { } TEST(Expression, ReplaceFieldsWithKnownValues) { - auto ExpectReplacesTo = [](Expression expr, - const std::unordered_map& known_values, - Expression unbound_expected) { - ASSERT_OK_AND_ASSIGN(expr, expr.Bind(*kBoringSchema)); - ASSERT_OK_AND_ASSIGN(auto expected, unbound_expected.Bind(*kBoringSchema)); - ASSERT_OK_AND_ASSIGN(auto replaced, ReplaceFieldsWithKnownValues(known_values, expr)); + auto ExpectReplacesTo = + [](Expression expr, + const std::unordered_map& known_values, + Expression unbound_expected) { + ASSERT_OK_AND_ASSIGN(expr, expr.Bind(*kBoringSchema)); + ASSERT_OK_AND_ASSIGN(auto expected, unbound_expected.Bind(*kBoringSchema)); + ASSERT_OK_AND_ASSIGN(auto replaced, + ReplaceFieldsWithKnownValues(known_values, expr)); - EXPECT_EQ(replaced, expected); - ExpectIdenticalIfUnchanged(replaced, expr); - }; + EXPECT_EQ(replaced, expected); + ExpectIdenticalIfUnchanged(replaced, expr); + }; - std::unordered_map i32_is_3{ - {"i32", Datum(3)}}; + std::unordered_map i32_is_3{{"i32", Datum(3)}}; ExpectReplacesTo(literal(1), i32_is_3, literal(1)); @@ -776,13 +775,18 @@ TEST(Expression, ReplaceFieldsWithKnownValues) { literal(2), })); - std::unordered_map a_valid_b_invalid{ - {"a", true}, {"b", false}}; + std::unordered_map i32_valid_str_null{ + {"i32", Datum(3)}, {"str", MakeNullScalar(utf8())}}; + + ExpectReplacesTo(is_null(field_ref("i32")), i32_valid_str_null, is_null(literal(3))); - ExpectReplacesTo(is_null(field_ref("a")), a_valid_b_invalid, literal(false)); - ExpectReplacesTo(is_valid(field_ref("a")), a_valid_b_invalid, literal(true)); - ExpectReplacesTo(is_null(field_ref("b")), a_valid_b_invalid, literal(true)); - ExpectReplacesTo(is_valid(field_ref("b")), a_valid_b_invalid, literal(false)); + ExpectReplacesTo(is_valid(field_ref("i32")), i32_valid_str_null, is_valid(literal(3))); + + ExpectReplacesTo(is_null(field_ref("str")), i32_valid_str_null, + is_null(null_literal(utf8()))); + + ExpectReplacesTo(is_valid(field_ref("str")), i32_valid_str_null, + is_valid(null_literal(utf8()))); } struct { @@ -1046,7 +1050,7 @@ TEST(Expression, 
SimplifyWithGuarantee) { Simplify{is_valid(field_ref("i32"))} .WithGuarantee(is_valid(field_ref("i32"))) - .Expect(literal(true)); + .Expect(is_valid(field_ref("i32"))); } TEST(Expression, SimplifyThenExecute) { diff --git a/cpp/src/arrow/dataset/partition.cc b/cpp/src/arrow/dataset/partition.cc index f96002b36b2..522dbbeb5d2 100644 --- a/cpp/src/arrow/dataset/partition.cc +++ b/cpp/src/arrow/dataset/partition.cc @@ -74,31 +74,20 @@ Status KeyValuePartitioning::SetDefaultValuesFromKeys(const Expression& expr, RecordBatchProjector* projector) { ARROW_ASSIGN_OR_RAISE(auto known_values, ExtractKnownFieldValues(expr)); for (const auto& ref_value : known_values) { - const auto& known_value = ref_value.second; - if (known_value.concrete() && !known_value.datum.is_scalar()) { - return Status::Invalid("non-scalar partition key ", known_value.datum.ToString()); + if (!ref_value.second.is_scalar()) { + return Status::Invalid("non-scalar partition key ", ref_value.second.ToString()); } ARROW_ASSIGN_OR_RAISE(auto match, ref_value.first.FindOneOrNone(*projector->schema())); if (match.empty()) continue; - - const auto& field = projector->schema()->field(match[0]); - if (known_value.concrete()) { - RETURN_NOT_OK(projector->SetDefaultValue(match, known_value.datum.scalar())); - } else if (known_value.valid) { - // We know some information about the value but nothing concrete enough to set. Can - // happen if expression is something like is_valid(field_ref("a")) - continue; - } else { - RETURN_NOT_OK(projector->SetDefaultValue(match, MakeNullScalar(field->type()))); - } + RETURN_NOT_OK(projector->SetDefaultValue(match, ref_value.second.scalar())); } return Status::OK(); } -Expression ConjunctionFromGroupingRow(Scalar* row) { +inline Expression ConjunctionFromGroupingRow(Scalar* row) { ScalarVector* values = &checked_cast(row)->value; std::vector equality_expressions(values->size()); for (size_t i = 0; i < values->size(); ++i) { @@ -213,34 +202,37 @@ Result KeyValuePartitioning::Format(const Expression& expr) const { ARROW_ASSIGN_OR_RAISE(auto known_values, ExtractKnownFieldValues(expr)); for (const auto& ref_value : known_values) { - const auto& known_value = ref_value.second; - if (known_value.concrete() && !known_value.datum.is_scalar()) { - return Status::Invalid("non-scalar partition key ", known_value.datum.ToString()); + if (!ref_value.second.is_scalar()) { + return Status::Invalid("non-scalar partition key ", ref_value.second.ToString()); } ARROW_ASSIGN_OR_RAISE(auto match, ref_value.first.FindOneOrNone(*schema_)); if (match.empty()) continue; - const auto& field = schema_->field(match[0]); - - if (known_value.concrete()) { - auto value = known_value.datum.scalar(); - if (!value->type->Equals(field->type())) { - return Status::TypeError("scalar ", value->ToString(), " (of type ", *value->type, - ") is invalid for ", field->ToString()); - } + auto value = ref_value.second.scalar(); - if (value->type->id() == Type::DICTIONARY) { - ARROW_ASSIGN_OR_RAISE( - value, checked_cast(*value).GetEncodedValue()); + const auto& field = schema_->field(match[0]); + if (!value->type->Equals(field->type())) { + if (value->is_valid) { + auto maybe_converted = compute::Cast(value, field->type()); + if (!maybe_converted.ok()) { + return Status::TypeError("Error converting scalar ", value->ToString(), + " (of type ", *value->type, + ") to a partition key for ", field->ToString(), ": ", + maybe_converted.status().message()); + } + value = maybe_converted->scalar(); + } else { + value = MakeNullScalar(field->type()); } + } 
- values[match[0]] = std::move(value); - } else { - if (!known_value.valid) { - values[match[0]] = MakeNullScalar(field->type()); - } + if (value->type->id() == Type::DICTIONARY) { + ARROW_ASSIGN_OR_RAISE( + value, checked_cast(*value).GetEncodedValue()); } + + values[match[0]] = std::move(value); } return FormatValues(values); diff --git a/cpp/src/arrow/dataset/projector.cc b/cpp/src/arrow/dataset/projector.cc index 2ba679ce6e7..ba0eb2ddff5 100644 --- a/cpp/src/arrow/dataset/projector.cc +++ b/cpp/src/arrow/dataset/projector.cc @@ -23,6 +23,7 @@ #include #include "arrow/array.h" +#include "arrow/compute/cast.h" #include "arrow/dataset/type_fwd.h" #include "arrow/record_batch.h" #include "arrow/result.h" @@ -88,9 +89,18 @@ Status RecordBatchProjector::SetDefaultValue(FieldRef ref, auto field_type = to_->field(index)->type(); if (!field_type->Equals(scalar->type)) { - return Status::TypeError("field ", to_->field(index)->ToString(), - " cannot be materialized from scalar of type ", - *scalar->type); + if (scalar->is_valid) { + auto maybe_converted = compute::Cast(scalar, field_type); + if (!maybe_converted.ok()) { + return Status::TypeError("Field ", to_->field(index)->ToString(), + " cannot be materialized from scalar of type ", + *scalar->type, + ". Cast error: ", maybe_converted.status().message()); + } + scalar = maybe_converted->scalar(); + } else { + scalar = MakeNullScalar(field_type); + } } scalars_[index] = std::move(scalar); diff --git a/python/pyarrow/_dataset.pyx b/python/pyarrow/_dataset.pyx index 104a47b98c5..1c4e5d302c5 100644 --- a/python/pyarrow/_dataset.pyx +++ b/python/pyarrow/_dataset.pyx @@ -2361,17 +2361,14 @@ def _get_partition_keys(Expression partition_expression): """ cdef: CExpression expr = partition_expression.unwrap() - pair[CFieldRef, CKnownFieldValue] ref_val + pair[CFieldRef, CDatum] ref_val out = {} for ref_val in GetResultValue(CExtractKnownFieldValues(expr)): assert ref_val.first.name() != nullptr - if ref_val.second.valid: - assert ref_val.second.datum.kind() == DatumType_SCALAR - val = pyarrow_wrap_scalar(ref_val.second.datum.scalar()) - out[frombytes(deref(ref_val.first.name()))] = val.as_py() - else: - out[frombytes(deref(ref_val.first.name()))] = None + assert ref_val.second.kind() == DatumType_SCALAR + val = pyarrow_wrap_scalar(ref_val.second.scalar()) + out[frombytes(deref(ref_val.first.name()))] = val.as_py() return out diff --git a/python/pyarrow/includes/libarrow_dataset.pxd b/python/pyarrow/includes/libarrow_dataset.pxd index 2127b3dccff..93bc0edddc1 100644 --- a/python/pyarrow/includes/libarrow_dataset.pxd +++ b/python/pyarrow/includes/libarrow_dataset.pxd @@ -315,14 +315,7 @@ cdef extern from "arrow/dataset/api.h" namespace "arrow::dataset" nogil: const CExpression& partition_expression, CRecordBatchProjector* projector) - cdef cppclass CKnownFieldValue "arrow::dataset::KnownFieldValue": - CDatum datum - c_bool valid - CKnownFieldValue(CDatum datum) - CKnownFieldValue(c_bool valid) - c_bool operator==(const CKnownFieldValue&) const - - cdef CResult[unordered_map[CFieldRef, CKnownFieldValue, CFieldRefHash]] \ + cdef CResult[unordered_map[CFieldRef, CDatum, CFieldRefHash]] \ CExtractKnownFieldValues "arrow::dataset::ExtractKnownFieldValues"( const CExpression& partition_expression) diff --git a/python/pyarrow/public-api.pxi b/python/pyarrow/public-api.pxi index aa738f9aaea..998af512c55 100644 --- a/python/pyarrow/public-api.pxi +++ b/python/pyarrow/public-api.pxi @@ -251,6 +251,9 @@ cdef api object pyarrow_wrap_scalar(const shared_ptr[CScalar]& 
sp_scalar):
     if data_type == NULL:
         raise ValueError('Scalar data type was NULL')
 
+    if data_type.id() == _Type_NA:
+        return _NULL
+
     if data_type.id() not in _scalar_classes:
         raise ValueError('Scalar type not supported')
 
diff --git a/python/pyarrow/tests/test_dataset.py b/python/pyarrow/tests/test_dataset.py
index e12f802e610..67870b1e70d 100644
--- a/python/pyarrow/tests/test_dataset.py
+++ b/python/pyarrow/tests/test_dataset.py
@@ -26,6 +26,7 @@
 import pytest
 
 import pyarrow as pa
+import pyarrow.csv
 import pyarrow.fs as fs
 
 from pyarrow.tests.util import change_cwd, _filesystem_uri

From d3bfe09500d40a8c95874734cfd34bad507ec540 Mon Sep 17 00:00:00 2001
From: Weston Pace
Date: Mon, 22 Feb 2021 14:57:43 -1000
Subject: [PATCH 31/33] constexpr not supported in this context in all gcc versions due to gcc bugs. Pulling out for a second

---
 cpp/src/arrow/compute/kernels/vector_hash.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cpp/src/arrow/compute/kernels/vector_hash.cc b/cpp/src/arrow/compute/kernels/vector_hash.cc
index 754d8fba83b..de4d3ee3022 100644
--- a/cpp/src/arrow/compute/kernels/vector_hash.cc
+++ b/cpp/src/arrow/compute/kernels/vector_hash.cc
@@ -152,7 +152,7 @@ class ValueCountsAction final : ActionBase {
     }
   }
 
-  constexpr bool ShouldEncodeNulls() const { return true; }
+  bool ShouldEncodeNulls() const { return true; }
 
  private:
   Int64Builder count_builder_;

From f18c70110ddc04cde6099ae04fd9e05a83c24ff8 Mon Sep 17 00:00:00 2001
From: Weston Pace
Date: Tue, 23 Feb 2021 09:22:37 -1000
Subject: [PATCH 32/33] Missed one of the merge conflicts

---
 python/pyarrow/tests/test_dataset.py | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/python/pyarrow/tests/test_dataset.py b/python/pyarrow/tests/test_dataset.py
index 67870b1e70d..57179f391de 100644
--- a/python/pyarrow/tests/test_dataset.py
+++ b/python/pyarrow/tests/test_dataset.py
@@ -548,12 +548,8 @@ def test_file_format_pickling():
         'subdir/2/yyy/file1.parquet',
     ]
 ])
-<<<<<<< HEAD
 @pytest.mark.parametrize('pre_buffer', [False, True])
 def test_filesystem_factory(mockfs, paths_or_selector, pre_buffer):
-=======
-def test_filesystem_factory(mockfs, paths_or_selector):
->>>>>>> Final lint pass. Turns out I was relying on black which was messing up everything
     format = ds.ParquetFileFormat(
         read_options=ds.ParquetReadOptions(dictionary_columns={"str"},
                                            pre_buffer=pre_buffer)

From 591021e53ea141a006bd6a30c7be6966becf040e Mon Sep 17 00:00:00 2001
From: Weston Pace
Date: Tue, 23 Feb 2021 10:51:53 -1000
Subject: [PATCH 33/33] Putting in suggestion from Ben. It got lost on rebase / force-push

---
 cpp/src/arrow/dataset/expression.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cpp/src/arrow/dataset/expression.cc b/cpp/src/arrow/dataset/expression.cc
index 9764700816c..5ddb270451a 100644
--- a/cpp/src/arrow/dataset/expression.cc
+++ b/cpp/src/arrow/dataset/expression.cc
@@ -718,7 +718,7 @@ Status ExtractKnownFieldValuesImpl(
       known_values->emplace(*ref, *lit);
     } else if (call->function_name == "is_null") {
       auto ref = call->arguments[0].field_ref();
-      known_values->emplace(*ref, std::make_shared<NullScalar>());
+      known_values->emplace(*ref, Datum(std::make_shared<NullScalar>()));
     }
   }
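
A minimal usage sketch of the null-aware Hive partitioning exercised by the new Python tests in this series. The positional HivePartitioning(schema, dictionaries, null_fallback) arguments, the 'xyz' fallback label, the expected directory names and the _get_partition_keys behaviour are taken from the tests above; the output path is illustrative only.

    import pyarrow as pa
    import pyarrow.dataset as ds

    # Both partition columns contain nulls; nulls are written under the
    # configured fallback label instead of being dropped.
    table = pa.table({'a': ['x', None, 'z'], 'b': ['x', 'y', None]})
    part = ds.HivePartitioning(
        pa.schema([pa.field('a', pa.string()), pa.field('b', pa.string())]),
        None,    # dictionaries
        'xyz')   # null_fallback label used for null partition keys

    # '/tmp/hive_null_demo' is an illustrative output path.
    ds.write_dataset(table, '/tmp/hive_null_demo', format='parquet',
                     partitioning=part)
    # Expected directories: a=x/b=x, a=xyz/b=y, a=z/b=xyz

    # A partition expression that pins a field to null now reports the
    # key as None:
    print(ds._get_partition_keys(ds.field('a').is_null()))   # {'a': None}

When no null_fallback is given, the docstring added in this series names "__HIVE_DEFAULT_PARTITION__" as the default label.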