From febebf33afb7180078c4a188154c6f07dcd209e5 Mon Sep 17 00:00:00 2001
From: David Li
Date: Thu, 11 Mar 2021 12:06:44 -0500
Subject: [PATCH 1/3] ARROW-11260: [C++][Dataset] Don't require dictionaries
when specifying explicit partition schema
---
cpp/src/arrow/dataset/expression.cc | 1 +
cpp/src/arrow/dataset/partition.cc | 55 +++++++++++----
cpp/src/arrow/dataset/partition.h | 4 ++
cpp/src/arrow/dataset/partition_test.cc | 74 ++++++++++++++++++++
python/pyarrow/_dataset.pyx | 27 +++++--
python/pyarrow/includes/libarrow_dataset.pxd | 2 +
python/pyarrow/tests/test_dataset.py | 46 ++++++++++++
7 files changed, 192 insertions(+), 17 deletions(-)
diff --git a/cpp/src/arrow/dataset/expression.cc b/cpp/src/arrow/dataset/expression.cc
index 6e71aa17e74..236d3bb3347 100644
--- a/cpp/src/arrow/dataset/expression.cc
+++ b/cpp/src/arrow/dataset/expression.cc
@@ -33,6 +33,7 @@
#include "arrow/util/optional.h"
#include "arrow/util/string.h"
#include "arrow/util/value_parsing.h"
+#include "arrow/visitor_inline.h"
namespace arrow {
diff --git a/cpp/src/arrow/dataset/partition.cc b/cpp/src/arrow/dataset/partition.cc
index ec974787cae..37c48cb35bb 100644
--- a/cpp/src/arrow/dataset/partition.cc
+++ b/cpp/src/arrow/dataset/partition.cc
@@ -145,8 +145,8 @@ Result<Expression> KeyValuePartitioning::ConvertKey(const Key& key) const {
DictionaryScalar::ValueType value;
value.dictionary = dictionaries_[field_index];
- if (!value.dictionary->type()->Equals(
- checked_cast<const DictionaryType&>(*field->type()).value_type())) {
+ const auto& dictionary_type = checked_cast<const DictionaryType&>(*field->type());
+ if (!value.dictionary->type()->Equals(dictionary_type.value_type())) {
return Status::TypeError("Dictionary supplied for field ", field->ToString(),
" had incorrect type ",
value.dictionary->type()->ToString());
@@ -155,6 +155,8 @@ Result<Expression> KeyValuePartitioning::ConvertKey(const Key& key) const {
// look up the partition value in the dictionary
ARROW_ASSIGN_OR_RAISE(converted, Scalar::Parse(value.dictionary->type(), *key.value));
ARROW_ASSIGN_OR_RAISE(auto index, compute::IndexIn(converted, value.dictionary));
+ auto to_index_type = compute::CastOptions::Safe(dictionary_type.index_type());
+ ARROW_ASSIGN_OR_RAISE(index, compute::Cast(index, to_index_type));
value.index = index.scalar();
if (!value.index->is_valid) {
return Status::Invalid("Dictionary supplied for field ", field->ToString(),
@@ -300,10 +302,18 @@ class KeyValuePartitioningFactory : public PartitioningFactory {
return repr_memos_[index]->GetOrInsert(repr, &dummy);
}
- Result<std::shared_ptr<Schema>> DoInpsect() {
+ Result<std::shared_ptr<Schema>> DoInspect() {
dictionaries_.assign(name_to_index_.size(), nullptr);
std::vector<std::shared_ptr<Field>> fields(name_to_index_.size());
+ if (options_.schema) {
+ const auto requested_size = options_.schema->fields().size();
+ const auto inferred_size = fields.size();
+ if (inferred_size != requested_size) {
+ return Status::Invalid("Requested schema has ", requested_size,
+ " fields, but only ", inferred_size, " were detected");
+ }
+ }
for (const auto& name_index : name_to_index_) {
const auto& name = name_index.first;
@@ -317,15 +327,34 @@ class KeyValuePartitioningFactory : public PartitioningFactory {
"'; couldn't infer type");
}
- // try casting to int32, otherwise bail and just use the string reprs
- auto dict = compute::Cast(reprs, int32()).ValueOr(reprs).make_array();
- auto type = dict->type();
- if (options_.infer_dictionary) {
- // wrap the inferred type in dictionary()
- type = dictionary(int32(), std::move(type));
+ std::shared_ptr<Field> current_field;
+ std::shared_ptr<Array> dict;
+ if (options_.schema) {
+ // if we have a schema, use the schema type.
+ current_field = options_.schema->field(index);
+ auto cast_target = current_field->type();
+ if (is_dictionary(cast_target->id())) {
+ cast_target = checked_pointer_cast<DictionaryType>(cast_target)->value_type();
+ }
+ auto maybe_dict = compute::Cast(reprs, cast_target);
+ if (!maybe_dict.ok()) {
+ return Status::Invalid("Could not cast segments for partition field ",
+ current_field->name(), " to requested type ",
+ current_field->type()->ToString(),
+ " because: ", maybe_dict.status());
+ }
+ dict = maybe_dict.ValueOrDie().make_array();
+ } else {
+ // try casting to int32, otherwise bail and just use the string reprs
+ dict = compute::Cast(reprs, int32()).ValueOr(reprs).make_array();
+ auto type = dict->type();
+ if (options_.infer_dictionary) {
+ // wrap the inferred type in dictionary()
+ type = dictionary(int32(), std::move(type));
+ }
+ current_field = field(name, std::move(type));
}
-
- fields[index] = field(name, std::move(type));
+ fields[index] = current_field;
dictionaries_[index] = std::move(dict);
}
@@ -379,7 +408,7 @@ class DirectoryPartitioningFactory : public KeyValuePartitioningFactory {
}
}
- return DoInpsect();
+ return DoInspect();
}
Result<std::shared_ptr<Partitioning>> Finish(
@@ -480,7 +509,7 @@ class HivePartitioningFactory : public KeyValuePartitioningFactory {
}
field_names_ = FieldNames();
- return DoInpsect();
+ return DoInspect();
}
Result<std::shared_ptr<Partitioning>> Finish(
diff --git a/cpp/src/arrow/dataset/partition.h b/cpp/src/arrow/dataset/partition.h
index eff1f2609e8..1561fcefceb 100644
--- a/cpp/src/arrow/dataset/partition.h
+++ b/cpp/src/arrow/dataset/partition.h
@@ -90,6 +90,10 @@ struct PartitioningFactoryOptions {
/// columns, and Expressions parsed by the finished Partitioning will include
/// dictionaries of all unique inspected values for each field.
bool infer_dictionary = false;
+ /// Optionally, an expected schema can be provided, in which case inference
+ /// will only check discovered fields against the schema and update internal
+ /// state (such as dictionaries).
+ std::shared_ptr<Schema> schema = nullptr;
};
struct HivePartitioningFactoryOptions : PartitioningFactoryOptions {
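The option above is surfaced in the Python bindings later in this patch as the schema argument to discover(). A minimal sketch of the intended usage, with illustrative field names and types:

import pyarrow as pa
import pyarrow.dataset as ds

# Provide the expected schema up front: discovery validates each partition
# segment against these types instead of inferring them, and dictionary
# fields no longer require their dictionaries to be supplied explicitly.
expected = pa.schema([
    ("year", pa.int16()),
    ("month", pa.dictionary(pa.int8(), pa.string())),
])
factory = ds.HivePartitioning.discover(schema=expected)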
diff --git a/cpp/src/arrow/dataset/partition_test.cc b/cpp/src/arrow/dataset/partition_test.cc
index cf97507deac..456b2852311 100644
--- a/cpp/src/arrow/dataset/partition_test.cc
+++ b/cpp/src/arrow/dataset/partition_test.cc
@@ -225,6 +225,19 @@ TEST_F(TestPartitioning, DirectoryPartitioningFormatDictionary) {
AssertFormat(equal(field_ref("alpha"), literal(dict_hello)), "hello");
}
+TEST_F(TestPartitioning, DirectoryPartitioningFormatDictionaryCustomIndex) {
+ // Make sure a non-int32 index type is properly cast to, else we fail a CHECK when
+ // we construct a dictionary array with the wrong index type
+ auto dict_type = dictionary(int8(), utf8());
+ auto dictionary = ArrayFromJSON(utf8(), R"(["hello", "world"])");
+ partitioning_ = std::make_shared<DirectoryPartitioning>(
+ schema({field("alpha", dict_type)}), ArrayVector{dictionary});
+ written_schema_ = partitioning_->schema();
+
+ ASSERT_OK_AND_ASSIGN(auto dict_hello, MakeScalar("hello")->CastTo(dict_type));
+ AssertFormat(equal(field_ref("alpha"), literal(dict_hello)), "hello");
+}
+
TEST_F(TestPartitioning, DirectoryPartitioningWithTemporal) {
for (auto temporal : {timestamp(TimeUnit::SECOND), date32()}) {
partitioning_ = std::make_shared<DirectoryPartitioning>(
@@ -464,6 +477,67 @@ TEST_F(TestPartitioning, HiveDictionaryHasUniqueValues) {
AssertParseError("/alpha=yosemite"); // not in inspected dictionary
}
+TEST_F(TestPartitioning, ExistingSchemaDirectory) {
+ // Infer dictionary values but with a given schema
+ auto dict_type = dictionary(int8(), utf8());
+ PartitioningFactoryOptions options;
+ options.schema = schema({field("alpha", int64()), field("beta", dict_type)});
+ factory_ = DirectoryPartitioning::MakeFactory({"alpha", "beta"}, options);
+
+ AssertInspect({"/0/1"}, options.schema->fields());
+ AssertInspect({"/0/1/what"}, options.schema->fields());
+
+ // fail if any segment is not parseable as schema type
+ EXPECT_RAISES_WITH_MESSAGE_THAT(Invalid, ::testing::HasSubstr("Failed to parse string"),
+ factory_->Inspect({"/0/1", "/hello/1"}));
+ factory_ = DirectoryPartitioning::MakeFactory({"alpha", "beta"}, options);
+
+ // Now we don't fail since our type is large enough
+ AssertInspect({"/3760212050/1"}, options.schema->fields());
+ // If there are still too many digits, fail
+ EXPECT_RAISES_WITH_MESSAGE_THAT(Invalid, ::testing::HasSubstr("Failed to parse string"),
+ factory_->Inspect({"/1038581385102940193760212050/1"}));
+ factory_ = DirectoryPartitioning::MakeFactory({"alpha", "beta"}, options);
+
+ AssertInspect({"/0/1", "/2"}, options.schema->fields());
+}
+
+TEST_F(TestPartitioning, ExistingSchemaHive) {
+ // Infer dictionary values but with a given schema
+ auto dict_type = dictionary(int8(), utf8());
+ HivePartitioningFactoryOptions options;
+ options.schema = schema({field("a", int64()), field("b", dict_type)});
+ factory_ = HivePartitioning::MakeFactory(options);
+
+ AssertInspect({"/a=0/b=1"}, options.schema->fields());
+ AssertInspect({"/a=0/b=1/what"}, options.schema->fields());
+ AssertInspect({"/a=0", "/b=1"}, options.schema->fields());
+
+ // fail if any segment for field a is not parseable as schema type
+ EXPECT_RAISES_WITH_MESSAGE_THAT(
+ Invalid,
+ ::testing::HasSubstr(
+ "Could not cast segments for partition field a to requested type int64"),
+ factory_->Inspect({"/a=0/b=1", "/a=hello/b=1"}));
+ factory_ = HivePartitioning::MakeFactory(options);
+
+ EXPECT_RAISES_WITH_MESSAGE_THAT(
+ Invalid,
+ ::testing::HasSubstr("Requested schema has 2 fields, but only 1 were detected"),
+ factory_->Inspect({"/a=0", "/a=hello"}));
+ factory_ = HivePartitioning::MakeFactory(options);
+
+ // Now we don't fail since our type is large enough
+ AssertInspect({"/a=3760212050/b=1"}, options.schema->fields());
+ // If there are still too many digits, fail
+ EXPECT_RAISES_WITH_MESSAGE_THAT(
+ Invalid, ::testing::HasSubstr("Failed to parse string"),
+ factory_->Inspect({"/a=1038581385102940193760212050/b=1"}));
+ factory_ = HivePartitioning::MakeFactory(options);
+
+ AssertInspect({"/a=0/b=1", "/b=2"}, options.schema->fields());
+}
+
TEST_F(TestPartitioning, EtlThenHive) {
FieldVector etl_fields{field("year", int16()), field("month", int8()),
field("day", int8()), field("hour", int8())};
diff --git a/python/pyarrow/_dataset.pyx b/python/pyarrow/_dataset.pyx
index 1c4e5d302c5..871e7d6a6cb 100644
--- a/python/pyarrow/_dataset.pyx
+++ b/python/pyarrow/_dataset.pyx
@@ -1528,8 +1528,9 @@ cdef class DirectoryPartitioning(Partitioning):
self.directory_partitioning = sp.get()
@staticmethod
- def discover(field_names, infer_dictionary=False,
- max_partition_dictionary_size=0):
+ def discover(field_names=None, infer_dictionary=False,
+ max_partition_dictionary_size=0,
+ schema=None):
"""
Discover a DirectoryPartitioning.
@@ -1537,6 +1538,7 @@ cdef class DirectoryPartitioning(Partitioning):
----------
field_names : list of str
The names to associate with the values from the subdirectory names.
+ If schema is given, the field names are taken from the schema instead.
infer_dictionary : bool, default False
When inferring a schema for partition fields, yield dictionary
encoded types instead of plain types. This can be more efficient
@@ -1547,6 +1549,9 @@ cdef class DirectoryPartitioning(Partitioning):
Synonymous with infer_dictionary for backwards compatibility with
1.0: setting this to -1 or None is equivalent to passing
infer_dictionary=True.
+ schema : Schema, default None
+ Do not infer the schema, but confirm that partition values match
+ this schema and infer dictionary values as appropriate.
Returns
-------
@@ -1566,7 +1571,14 @@ cdef class DirectoryPartitioning(Partitioning):
if infer_dictionary:
c_options.infer_dictionary = True
- c_field_names = [tobytes(s) for s in field_names]
+ if schema:
+ c_options.schema = pyarrow_unwrap_schema(schema)
+ c_field_names = [tobytes(f.name) for f in schema]
+ elif not field_names:
+ raise TypeError(
+ "field_names must be passed if schema is not given")
+ else:
+ c_field_names = [tobytes(s) for s in field_names]
return PartitioningFactory.wrap(
CDirectoryPartitioning.MakeFactory(c_field_names, c_options))
@@ -1637,7 +1649,8 @@ cdef class HivePartitioning(Partitioning):
@staticmethod
def discover(infer_dictionary=False,
max_partition_dictionary_size=0,
- null_fallback="__HIVE_DEFAULT_PARTITION__"):
+ null_fallback="__HIVE_DEFAULT_PARTITION__",
+ schema=None):
"""
Discover a HivePartitioning.
@@ -1657,6 +1670,9 @@ cdef class HivePartitioning(Partitioning):
When inferring a schema for partition fields this value will be
replaced by null. The default is set to __HIVE_DEFAULT_PARTITION__
for compatibility with Spark
+ schema : Schema, default None
+ Do not infer the schema, but confirm that partition values match
+ this schema and infer dictionary values as appropriate.
Returns
-------
@@ -1677,6 +1693,9 @@ cdef class HivePartitioning(Partitioning):
c_options.null_fallback = tobytes(null_fallback)
+ if schema:
+ c_options.schema = pyarrow_unwrap_schema(schema)
+
return PartitioningFactory.wrap(
CHivePartitioning.MakeFactory(c_options))
diff --git a/python/pyarrow/includes/libarrow_dataset.pxd b/python/pyarrow/includes/libarrow_dataset.pxd
index bbe545cf794..8eb2d0f1e7a 100644
--- a/python/pyarrow/includes/libarrow_dataset.pxd
+++ b/python/pyarrow/includes/libarrow_dataset.pxd
@@ -268,11 +268,13 @@ cdef extern from "arrow/dataset/api.h" namespace "arrow::dataset" nogil:
cdef cppclass CPartitioningFactoryOptions \
"arrow::dataset::PartitioningFactoryOptions":
c_bool infer_dictionary
+ shared_ptr[CSchema] schema
cdef cppclass CHivePartitioningFactoryOptions \
"arrow::dataset::HivePartitioningFactoryOptions":
c_bool infer_dictionary
c_string null_fallback
+ shared_ptr[CSchema] schema
cdef cppclass CPartitioningFactory "arrow::dataset::PartitioningFactory":
pass
diff --git a/python/pyarrow/tests/test_dataset.py b/python/pyarrow/tests/test_dataset.py
index 09cad5d917b..3560e6032ac 100644
--- a/python/pyarrow/tests/test_dataset.py
+++ b/python/pyarrow/tests/test_dataset.py
@@ -1339,6 +1339,52 @@ def test_partitioning_function():
ds.partitioning(schema, flavor="unsupported")
+def test_directory_partitioning_dictionary_key(mockfs):
+ # ARROW-8088 specifying partition key as dictionary type
+ schema = pa.schema([
+ pa.field('group', pa.dictionary(pa.int8(), pa.int32())),
+ pa.field('key', pa.dictionary(pa.int8(), pa.string()))
+ ])
+ part = ds.DirectoryPartitioning.discover(schema=schema)
+
+ dataset = ds.dataset(
+ "subdir", format="parquet", filesystem=mockfs, partitioning=part
+ )
+ table = dataset.to_table()
+
+ assert table.column('group').type.equals(schema.types[0])
+ assert table.column('group').to_pylist() == [1] * 5 + [2] * 5
+ assert table.column('key').type.equals(schema.types[1])
+ assert table.column('key').to_pylist() == ['xxx'] * 5 + ['yyy'] * 5
+
+
+def test_hive_partitioning_dictionary_key(multisourcefs):
+ # ARROW-8088 specifying partition key as dictionary type
+ schema = pa.schema([
+ pa.field('year', pa.dictionary(pa.int8(), pa.int16())),
+ pa.field('month', pa.dictionary(pa.int8(), pa.int16()))
+ ])
+ part = ds.HivePartitioning.discover(schema=schema)
+
+ dataset = ds.dataset(
+ "hive", format="parquet", filesystem=multisourcefs, partitioning=part
+ )
+ table = dataset.to_table()
+
+ year_dictionary = list(range(2006, 2011))
+ month_dictionary = list(range(1, 13))
+ assert table.column('year').type.equals(schema.types[0])
+ for chunk in table.column('year').chunks:
+ actual = chunk.dictionary.to_pylist()
+ actual.sort()
+ assert actual == year_dictionary
+ assert table.column('month').type.equals(schema.types[1])
+ for chunk in table.column('month').chunks:
+ actual = chunk.dictionary.to_pylist()
+ actual.sort()
+ assert actual == month_dictionary
+
+
def _create_single_file(base_dir, table=None, row_group_size=None):
import pyarrow.parquet as pq
if table is None:
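End to end, the behavior added in this patch can be exercised roughly as follows; a sketch in which the directory layout, file names, and values are invented for illustration:

import pathlib
import tempfile

import pyarrow as pa
import pyarrow.dataset as ds
import pyarrow.parquet as pq

# Lay out a tiny partitioned dataset: <group>/<key>/part.parquet
base = pathlib.Path(tempfile.mkdtemp())
(base / "1" / "xxx").mkdir(parents=True)
pq.write_table(pa.table({"value": [1.0, 2.0]}),
               str(base / "1" / "xxx" / "part.parquet"))

# Partition columns come back dictionary-encoded without having to
# enumerate the dictionary values up front.
schema = pa.schema([
    ("group", pa.dictionary(pa.int8(), pa.int32())),
    ("key", pa.dictionary(pa.int8(), pa.string())),
])
part = ds.DirectoryPartitioning.discover(schema=schema)
table = ds.dataset(str(base), format="parquet", partitioning=part).to_table()
assert table.column("group").type.equals(schema.types[0])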
From 097fd3dbb9e9ab5f2c54a90609557b97bc8e881b Mon Sep 17 00:00:00 2001
From: David Li
Date: Thu, 11 Mar 2021 16:55:51 -0500
Subject: [PATCH 2/3] Apply suggestions from code review
Co-authored-by: Benjamin Kietzman
---
cpp/src/arrow/dataset/expression.cc | 1 -
cpp/src/arrow/dataset/partition.cc | 2 +-
cpp/src/arrow/dataset/partition.h | 2 +-
python/pyarrow/_dataset.pyx | 15 +++++++++------
4 files changed, 11 insertions(+), 9 deletions(-)
diff --git a/cpp/src/arrow/dataset/expression.cc b/cpp/src/arrow/dataset/expression.cc
index 236d3bb3347..6e71aa17e74 100644
--- a/cpp/src/arrow/dataset/expression.cc
+++ b/cpp/src/arrow/dataset/expression.cc
@@ -33,7 +33,6 @@
#include "arrow/util/optional.h"
#include "arrow/util/string.h"
#include "arrow/util/value_parsing.h"
-#include "arrow/visitor_inline.h"
namespace arrow {
diff --git a/cpp/src/arrow/dataset/partition.cc b/cpp/src/arrow/dataset/partition.cc
index 37c48cb35bb..ec4a28c8a0e 100644
--- a/cpp/src/arrow/dataset/partition.cc
+++ b/cpp/src/arrow/dataset/partition.cc
@@ -354,7 +354,7 @@ class KeyValuePartitioningFactory : public PartitioningFactory {
}
current_field = field(name, std::move(type));
}
- fields[index] = current_field;
+ fields[index] = std::move(current_field);
dictionaries_[index] = std::move(dict);
}
diff --git a/cpp/src/arrow/dataset/partition.h b/cpp/src/arrow/dataset/partition.h
index 1561fcefceb..c49ac5e923e 100644
--- a/cpp/src/arrow/dataset/partition.h
+++ b/cpp/src/arrow/dataset/partition.h
@@ -93,7 +93,7 @@ struct PartitioningFactoryOptions {
/// Optionally, an expected schema can be provided, in which case inference
/// will only check discovered fields against the schema and update internal
/// state (such as dictionaries).
- std::shared_ptr<Schema> schema = nullptr;
+ std::shared_ptr<Schema> schema;
};
struct HivePartitioningFactoryOptions : PartitioningFactoryOptions {
diff --git a/python/pyarrow/_dataset.pyx b/python/pyarrow/_dataset.pyx
index 871e7d6a6cb..f1e5168c7be 100644
--- a/python/pyarrow/_dataset.pyx
+++ b/python/pyarrow/_dataset.pyx
@@ -1550,8 +1550,9 @@ cdef class DirectoryPartitioning(Partitioning):
1.0: setting this to -1 or None is equivalent to passing
infer_dictionary=True.
schema : Schema, default None
- Do not infer the schema, but confirm that partition values match
- this schema and infer dictionary values as appropriate.
+ Use this schema instead of inferring a schema from partition
+ values. Partition values will be validated against this schema
+ before accumulation into the Partitioning's dictionary.
Returns
-------
@@ -1575,8 +1576,9 @@ cdef class DirectoryPartitioning(Partitioning):
c_options.schema = pyarrow_unwrap_schema(schema)
c_field_names = [tobytes(f.name) for f in schema]
elif not field_names:
- raise TypeError(
- "field_names must be passed if schema is not given")
+ raise ValueError(
+ "Neither field_names nor schema was passed; "
+ "cannot infer field_names")
else:
c_field_names = [tobytes(s) for s in field_names]
return PartitioningFactory.wrap(
@@ -1671,8 +1673,9 @@ cdef class HivePartitioning(Partitioning):
replaced by null. The default is set to __HIVE_DEFAULT_PARTITION__
for compatibility with Spark
schema : Schema, default None
- Do not infer the schema, but confirm that partition values match
- this schema and infer dictionary values as appropriate.
+ Use this schema instead of inferring a schema from partition
+ values. Partition values will be validated against this schema
+ before accumulation into the Partitioning's dictionary.
Returns
-------
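For reference, the error path revised in this patch behaves as below (a minimal sketch):

import pyarrow.dataset as ds

# Neither field_names nor schema: discovery cannot name the fields.
try:
    ds.DirectoryPartitioning.discover()
except ValueError as exc:
    assert "Neither field_names nor schema was passed" in str(exc)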
From f1c2f351209bf0f5d0200d426abe196c7f1391f5 Mon Sep 17 00:00:00 2001
From: David Li
Date: Fri, 12 Mar 2021 15:17:42 -0500
Subject: [PATCH 3/3] ARROW-11260: [Python][Dataset] Don't require dictionaries
when specifying explicit partition schema
---
python/pyarrow/dataset.py | 31 ++++++++++++++++++++++++++--
python/pyarrow/tests/test_dataset.py | 4 ++++
2 files changed, 33 insertions(+), 2 deletions(-)
diff --git a/python/pyarrow/dataset.py b/python/pyarrow/dataset.py
index 2042a522918..a2cb87a1f7a 100644
--- a/python/pyarrow/dataset.py
+++ b/python/pyarrow/dataset.py
@@ -122,11 +122,13 @@ def partitioning(schema=None, field_names=None, flavor=None,
flavor : str, default None
The default is DirectoryPartitioning. Specify ``flavor="hive"`` for
a HivePartitioning.
- dictionaries : List[Array]
+ dictionaries : Dict[str, Array]
If the type of any field of `schema` is a dictionary type, the
corresponding entry of `dictionaries` must be an array containing
every value which may be taken by the corresponding column or an
- error will be raised in parsing.
+ error will be raised in parsing. Alternatively, pass `infer` to have
+ Arrow discover the dictionary values, in which case a
+ PartitioningFactory is returned.
Returns
-------
@@ -146,6 +148,27 @@ def partitioning(schema=None, field_names=None, flavor=None,
For paths like "/2009/June", the year will be inferred as int32 while month
will be inferred as string.
+ Specify a Schema with dictionary encoding, providing dictionary values:
+
+ >>> partitioning(
+ ... pa.schema([
+ ... ("year", pa.int16()),
+ ... ("month", pa.dictionary(pa.int8(), pa.string()))
+ ... ]),
+ ... dictionaries={
+ ... "month": pa.array(["January", "February", "March"]),
+ ... })
+
+ Alternatively, specify a Schema with dictionary encoding, but have Arrow
+ infer the dictionary values:
+
+ >>> partitioning(
+ ... pa.schema([
+ ... ("year", pa.int16()),
+ ... ("month", pa.dictionary(pa.int8(), pa.string()))
+ ... ]),
+ ... dictionaries="infer")
+
Create a Hive scheme for a path like "/year=2009/month=11":
>>> partitioning(
@@ -164,6 +187,8 @@ def partitioning(schema=None, field_names=None, flavor=None,
if field_names is not None:
raise ValueError(
"Cannot specify both 'schema' and 'field_names'")
+ if dictionaries == 'infer':
+ return DirectoryPartitioning.discover(schema=schema)
return DirectoryPartitioning(schema, dictionaries)
elif field_names is not None:
if isinstance(field_names, list):
@@ -181,6 +206,8 @@ def partitioning(schema=None, field_names=None, flavor=None,
raise ValueError("Cannot specify 'field_names' for flavor 'hive'")
elif schema is not None:
if isinstance(schema, pa.Schema):
+ if dictionaries == 'infer':
+ return HivePartitioning.discover(schema=schema)
return HivePartitioning(schema, dictionaries)
else:
raise ValueError(
diff --git a/python/pyarrow/tests/test_dataset.py b/python/pyarrow/tests/test_dataset.py
index 3560e6032ac..7c544af075c 100644
--- a/python/pyarrow/tests/test_dataset.py
+++ b/python/pyarrow/tests/test_dataset.py
@@ -1313,6 +1313,8 @@ def test_partitioning_function():
# default DirectoryPartitioning
part = ds.partitioning(schema)
assert isinstance(part, ds.DirectoryPartitioning)
+ part = ds.partitioning(schema, dictionaries="infer")
+ assert isinstance(part, ds.PartitioningFactory)
part = ds.partitioning(field_names=names)
assert isinstance(part, ds.PartitioningFactory)
# needs schema or list of names
@@ -1326,6 +1328,8 @@ def test_partitioning_function():
# Hive partitioning
part = ds.partitioning(schema, flavor="hive")
assert isinstance(part, ds.HivePartitioning)
+ part = ds.partitioning(schema, dictionaries="infer", flavor="hive")
+ assert isinstance(part, ds.PartitioningFactory)
part = ds.partitioning(flavor="hive")
assert isinstance(part, ds.PartitioningFactory)
# cannot pass list of names
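With this patch applied, both spellings from the updated partitioning() docstring work; a short sketch combining them:

import pyarrow as pa
import pyarrow.dataset as ds

schema = pa.schema([
    ("year", pa.int16()),
    ("month", pa.dictionary(pa.int8(), pa.string())),
])

# Explicit dictionary values yield a concrete Partitioning:
explicit = ds.partitioning(
    schema,
    dictionaries={"month": pa.array(["January", "February", "March"])})
assert isinstance(explicit, ds.DirectoryPartitioning)

# dictionaries="infer" defers to discovery and returns a factory:
inferred = ds.partitioning(schema, dictionaries="infer", flavor="hive")
assert isinstance(inferred, ds.PartitioningFactory)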