From 2c3cfb0b3e4932abda358d15ebc2cca13649dd06 Mon Sep 17 00:00:00 2001
From: Sanjiban Sengupta <sanjiban.sg@gmail.com>
Date: Mon, 25 Apr 2022 04:12:46 +0530
Subject: [PATCH 01/24] fix: use StripPrefix() for FilenamePartitioning

---
 cpp/src/arrow/dataset/discovery.cc | 7 ++++++-
 cpp/src/arrow/dataset/partition.cc | 7 ++++++-
 cpp/src/arrow/dataset/partition.h  | 5 +++++
 python/pyarrow/dataset.py          | 1 -
 4 files changed, 17 insertions(+), 3 deletions(-)
diff --git a/cpp/src/arrow/dataset/discovery.cc b/cpp/src/arrow/dataset/discovery.cc
index f8382aa4052..2632b05ffa0 100644
--- a/cpp/src/arrow/dataset/discovery.cc
+++ b/cpp/src/arrow/dataset/discovery.cc
@@ -278,8 +278,13 @@ Result<std::shared_ptr<Dataset>> FileSystemDatasetFactory::Finish(FinishOptions
   }
 
   std::vector<std::shared_ptr<FileFragment>> fragments;
+  std::string fixed_path;
   for (const auto& info : files_) {
-    auto fixed_path = StripPrefixAndFilename(info.path(), options_.partition_base_dir);
+    if (partitioning->type_name() == "filename") {
+      fixed_path = StripPrefix(info.path(), options_.partition_base_dir);
+    } else {
+      fixed_path = StripPrefixAndFilename(info.path(), options_.partition_base_dir);
+    }
     ARROW_ASSIGN_OR_RAISE(auto partition, partitioning->Parse(fixed_path));
     ARROW_ASSIGN_OR_RAISE(auto fragment, format_->MakeFragment({info, fs_}, partition));
     fragments.push_back(fragment);
diff --git a/cpp/src/arrow/dataset/partition.cc b/cpp/src/arrow/dataset/partition.cc
index 763ea0bc6ce..f175b047fc4 100644
--- a/cpp/src/arrow/dataset/partition.cc
+++ b/cpp/src/arrow/dataset/partition.cc
@@ -805,9 +805,14 @@ std::shared_ptr<PartitioningFactory> HivePartitioning::MakeFactory(
   return std::shared_ptr<PartitioningFactory>(new HivePartitioningFactory(options));
 }
 
-std::string StripPrefixAndFilename(const std::string& path, const std::string& prefix) {
+std::string StripPrefix(const std::string& path, const std::string& prefix) {
   auto maybe_base_less = fs::internal::RemoveAncestor(prefix, path);
   auto base_less = maybe_base_less ? std::string(*maybe_base_less) : path;
+  return base_less;
+}
+
+std::string StripPrefixAndFilename(const std::string& path, const std::string& prefix) {
+  auto base_less = StripPrefix(path, prefix);
   auto basename_filename = fs::internal::GetAbstractPathParent(base_less);
   return basename_filename.first;
 }
diff --git a/cpp/src/arrow/dataset/partition.h b/cpp/src/arrow/dataset/partition.h
index 6a679f34822..d2ea5db061a 100644
--- a/cpp/src/arrow/dataset/partition.h
+++ b/cpp/src/arrow/dataset/partition.h
@@ -358,6 +358,11 @@ class ARROW_DS_EXPORT FilenamePartitioning : public KeyValuePartitioning {
   Result<PartitionPathFormat> FormatValues(const ScalarVector& values) const override;
 };
 
+/// \brief Remove only the prefix of a path.
+///
+/// e.g., `StripPrefix("/data/2019_c.txt", "/data") -> "2019_c.txt"`
+ARROW_DS_EXPORT std::string StripPrefix(const std::string& path,
+                                        const std::string& prefix);
 /// \brief Remove a prefix and the filename of a path.
 ///
 /// e.g., `StripPrefixAndFilename("/data/year=2019/c.txt", "/data") -> "year=2019"`
diff --git a/python/pyarrow/dataset.py b/python/pyarrow/dataset.py
index b22a3d032f0..0498dc7ab6f 100644
--- a/python/pyarrow/dataset.py
+++ b/python/pyarrow/dataset.py
@@ -218,7 +218,6 @@ def partitioning(schema=None, field_names=None, flavor=None,
                 "For the default directory flavor, need to specify "
                 "a Schema or a list of field names")
     if flavor == "filename":
-        # default flavor
         if schema is not None:
             if field_names is not None:
                 raise ValueError(

From ba752aae67b074a2ee10a7e9fb7296606de4ef8a Mon Sep 17 00:00:00 2001
From: Sanjiban Sengupta <sanjiban.sg@gmail.com>
Date: Mon, 25 Apr 2022 05:20:42 +0530
Subject: [PATCH 02/24] test: roundtrip test for partitioning

---
 python/pyarrow/tests/test_dataset.py | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)

diff --git a/python/pyarrow/tests/test_dataset.py b/python/pyarrow/tests/test_dataset.py
index 3bc905f2f97..56bc19f4d48 100644
--- a/python/pyarrow/tests/test_dataset.py
+++ b/python/pyarrow/tests/test_dataset.py
@@ -662,6 +662,24 @@ def test_partitioning():
     assert partitioning.dictionaries[1].to_pylist() == [
         "first", "second", "third"]
 
+    # test partitioning roundtrip
+    table = pa.table([
+        pa.array(range(20)), pa.array(np.random.randn(20)),
+        pa.array(np.repeat(['a', 'b'], 10))],
+        names=["f1", "f2", "part"]
+    )
+    partitioning_schema = pa.schema([("part", pa.string())])
+    for klass in [ds.DirectoryPartitioning, ds.HivePartitioning,
+                  ds.FilenamePartitioning]:
+        with tempfile.TemporaryDirectory() as tempdir:
+            partitioning = klass(partitioning_schema)
+            ds.write_dataset(table, tempdir,
+                             format='ipc', partitioning=partitioning)
+            load_back = ds.dataset(tempdir, format='ipc',
+                                   partitioning=partitioning)
+            load_back_table = load_back.to_table()
+            assert load_back_table.equals(table)
+
 
 def test_expression_arithmetic_operators():
     dataset = ds.dataset(pa.table({'a': [1, 2, 3], 'b': [2, 2, 2]}))

From d666ce97ea0ac5a870b8a30c7374aeeb91204113 Mon Sep 17 00:00:00 2001
From: Sanjiban Sengupta <sanjiban.sg@gmail.com>
Date: Wed, 27 Apr 2022 01:31:34 +0530
Subject: [PATCH 03/24] feat: modify Parse() to accept both directory and
 filename prefix

---
 cpp/src/arrow/dataset/discovery.cc           |  9 ++--
 cpp/src/arrow/dataset/file_parquet.cc        |  6 ++-
 cpp/src/arrow/dataset/partition.cc           | 47 +++++++++-----------
 cpp/src/arrow/dataset/partition.h            | 40 +++++++++--------
 cpp/src/arrow/dataset/partition_test.cc      | 22 ++++-----
 python/pyarrow/_dataset.pyx                  |  4 +-
 python/pyarrow/includes/libarrow_dataset.pxd |  2 +-
 python/pyarrow/tests/test_dataset.py         |  4 +-
 8 files changed, 66 insertions(+), 68 deletions(-)

diff --git a/cpp/src/arrow/dataset/discovery.cc b/cpp/src/arrow/dataset/discovery.cc
index 2632b05ffa0..a46ea926535 100644
--- a/cpp/src/arrow/dataset/discovery.cc
+++ b/cpp/src/arrow/dataset/discovery.cc
@@ -280,12 +280,9 @@ Result<std::shared_ptr<Dataset>> FileSystemDatasetFactory::Finish(FinishOptions
   std::vector<std::shared_ptr<FileFragment>> fragments;
   std::string fixed_path;
   for (const auto& info : files_) {
-    if (partitioning->type_name() == "filename") {
-      fixed_path = StripPrefix(info.path(), options_.partition_base_dir);
-    } else {
-      fixed_path = StripPrefixAndFilename(info.path(), options_.partition_base_dir);
-    }
-    ARROW_ASSIGN_OR_RAISE(auto partition, partitioning->Parse(fixed_path));
+    auto fixed_path = StripPrefixAndFilename(info.path(), options_.partition_base_dir);
+    ARROW_ASSIGN_OR_RAISE(auto partition,
+                          partitioning->Parse(fixed_path.directory, fixed_path.prefix));
     ARROW_ASSIGN_OR_RAISE(auto fragment, format_->MakeFragment({info, fs_}, partition));
     fragments.push_back(fragment);
   }
diff --git a/cpp/src/arrow/dataset/file_parquet.cc b/cpp/src/arrow/dataset/file_parquet.cc
index 9f914b57f01..0340b3e08dc 100644
--- a/cpp/src/arrow/dataset/file_parquet.cc
+++ b/cpp/src/arrow/dataset/file_parquet.cc
@@ -894,7 +894,8 @@ ParquetDatasetFactory::CollectParquetFragments(const Partitioning& partitioning)
     auto row_groups = Iota(metadata_subset->num_row_groups());
 
     auto partition_expression =
-        partitioning.Parse(StripPrefixAndFilename(path, options_.partition_base_dir))
+        partitioning
+            .Parse(StripPrefixAndFilename(path, options_.partition_base_dir).directory)
             .ValueOr(compute::literal(true));
 
     ARROW_ASSIGN_OR_RAISE(
@@ -920,7 +921,8 @@ Result<std::vector<std::shared_ptr<Schema>>> ParquetDatasetFactory::InspectSchem
 
     size_t i = 0;
     for (const auto& e : paths_with_row_group_ids_) {
-      stripped[i++] = StripPrefixAndFilename(e.first, options_.partition_base_dir);
+      stripped[i++] =
+          StripPrefixAndFilename(e.first, options_.partition_base_dir).directory;
     }
     ARROW_ASSIGN_OR_RAISE(auto partition_schema, factory->Inspect(stripped));
 
diff --git a/cpp/src/arrow/dataset/partition.cc b/cpp/src/arrow/dataset/partition.cc
index f175b047fc4..67797a99daa 100644
--- a/cpp/src/arrow/dataset/partition.cc
+++ b/cpp/src/arrow/dataset/partition.cc
@@ -82,7 +82,8 @@ std::shared_ptr<Partitioning> Partitioning::Default() {
 
     std::string type_name() const override { return "default"; }
 
-    Result<compute::Expression> Parse(const std::string& path) const override {
+    Result<compute::Expression> Parse(const std::string& directory,
+                                      const std::string& prefix) const override {
       return compute::literal(true);
     }
 
@@ -250,10 +251,11 @@ Result<compute::Expression> KeyValuePartitioning::ConvertKey(const Key& key) con
                         compute::literal(std::move(converted)));
 }
 
-Result<compute::Expression> KeyValuePartitioning::Parse(const std::string& path) const {
+Result<compute::Expression> KeyValuePartitioning::Parse(const std::string& directory,
+                                                        const std::string& prefix) const {
   std::vector<compute::Expression> expressions;
 
-  ARROW_ASSIGN_OR_RAISE(auto parsed, ParseKeys(path));
+  ARROW_ASSIGN_OR_RAISE(auto parsed, ParseKeys(directory, prefix));
   for (const Key& key : parsed) {
     ARROW_ASSIGN_OR_RAISE(auto expr, ConvertKey(key));
     if (expr == compute::literal(true)) continue;
@@ -263,7 +265,7 @@ Result<compute::Expression> KeyValuePartitioning::Parse(const std::string& path)
   return and_(std::move(expressions));
 }
 
-Result<Partitioning::PartitionPathFormat> KeyValuePartitioning::Format(
+Result<PartitionPathFormat> KeyValuePartitioning::Format(
     const compute::Expression& expr) const {
   ScalarVector values{static_cast<size_t>(schema_->num_fields()), nullptr};
 
@@ -376,8 +378,8 @@ DirectoryPartitioning::DirectoryPartitioning(std::shared_ptr<Schema> schema,
 }
 
 Result<std::vector<KeyValuePartitioning::Key>> DirectoryPartitioning::ParseKeys(
-    const std::string& path) const {
-  std::vector<std::string> segments = fs::internal::SplitAbstractPath(path);
+    const std::string& directory, const std::string& prefix) const {
+  std::vector<std::string> segments = fs::internal::SplitAbstractPath(directory);
   return ParsePartitionSegments(segments);
 }
 
@@ -389,24 +391,24 @@ FilenamePartitioning::FilenamePartitioning(std::shared_ptr<Schema> schema,
 }
 
 Result<std::vector<KeyValuePartitioning::Key>> FilenamePartitioning::ParseKeys(
-    const std::string& path) const {
+    const std::string& directory, const std::string& prefix) const {
   std::vector<std::string> segments =
-      fs::internal::SplitAbstractPath(StripNonPrefix(path), kFilenamePartitionSep);
+      fs::internal::SplitAbstractPath(StripNonPrefix(prefix), kFilenamePartitionSep);
   return ParsePartitionSegments(segments);
 }
 
-Result<Partitioning::PartitionPathFormat> DirectoryPartitioning::FormatValues(
+Result<PartitionPathFormat> DirectoryPartitioning::FormatValues(
     const ScalarVector& values) const {
   std::vector<std::string> segments;
   ARROW_ASSIGN_OR_RAISE(segments, FormatPartitionSegments(values));
   return PartitionPathFormat{fs::internal::JoinAbstractPath(std::move(segments)), ""};
 }
 
-Result<Partitioning::PartitionPathFormat> FilenamePartitioning::FormatValues(
+Result<PartitionPathFormat> FilenamePartitioning::FormatValues(
     const ScalarVector& values) const {
   std::vector<std::string> segments;
   ARROW_ASSIGN_OR_RAISE(segments, FormatPartitionSegments(values));
-  return Partitioning::PartitionPathFormat{
+  return PartitionPathFormat{
       "", fs::internal::JoinAbstractPath(std::move(segments), kFilenamePartitionSep) +
               kFilenamePartitionSep};
 }
@@ -718,10 +720,10 @@ Result<util::optional<KeyValuePartitioning::Key>> HivePartitioning::ParseKey(
 }
 
 Result<std::vector<KeyValuePartitioning::Key>> HivePartitioning::ParseKeys(
-    const std::string& path) const {
+    const std::string& directory, const std::string& prefix) const {
   std::vector<Key> keys;
 
-  for (const auto& segment : fs::internal::SplitAbstractPath(path)) {
+  for (const auto& segment : fs::internal::SplitAbstractPath(directory)) {
     ARROW_ASSIGN_OR_RAISE(auto maybe_key, ParseKey(segment, hive_options_));
     if (auto key = maybe_key) {
       keys.push_back(std::move(*key));
@@ -731,7 +733,7 @@ Result<std::vector<KeyValuePartitioning::Key>> HivePartitioning::ParseKeys(
   return keys;
 }
 
-Result<Partitioning::PartitionPathFormat> HivePartitioning::FormatValues(
+Result<PartitionPathFormat> HivePartitioning::FormatValues(
     const ScalarVector& values) const {
   std::vector<std::string> segments(static_cast<size_t>(schema_->num_fields()));
 
@@ -749,8 +751,7 @@ Result<Partitioning::PartitionPathFormat> HivePartitioning::FormatValues(
     }
   }
 
-  return Partitioning::PartitionPathFormat{
-      fs::internal::JoinAbstractPath(std::move(segments)), ""};
+  return PartitionPathFormat{fs::internal::JoinAbstractPath(std::move(segments)), ""};
 }
 
 class HivePartitioningFactory : public KeyValuePartitioningFactory {
@@ -805,16 +806,12 @@ std::shared_ptr<PartitioningFactory> HivePartitioning::MakeFactory(
   return std::shared_ptr<PartitioningFactory>(new HivePartitioningFactory(options));
 }
 
-std::string StripPrefix(const std::string& path, const std::string& prefix) {
+PartitionPathFormat StripPrefixAndFilename(const std::string& path,
+                                           const std::string& prefix) {
   auto maybe_base_less = fs::internal::RemoveAncestor(prefix, path);
   auto base_less = maybe_base_less ? std::string(*maybe_base_less) : path;
-  return base_less;
-}
-
-std::string StripPrefixAndFilename(const std::string& path, const std::string& prefix) {
-  auto base_less = StripPrefix(path, prefix);
   auto basename_filename = fs::internal::GetAbstractPathParent(base_less);
-  return basename_filename.first;
+  return PartitionPathFormat{basename_filename.first, basename_filename.second};
 }
 
 std::vector<std::string> StripPrefixAndFilename(const std::vector<std::string>& paths,
@@ -822,7 +819,7 @@ std::vector<std::string> StripPrefixAndFilename(const std::vector<std::string>&
   std::vector<std::string> result;
   result.reserve(paths.size());
   for (const auto& path : paths) {
-    result.emplace_back(StripPrefixAndFilename(path, prefix));
+    result.emplace_back(StripPrefixAndFilename(path, prefix).directory);
   }
   return result;
 }
@@ -832,7 +829,7 @@ std::vector<std::string> StripPrefixAndFilename(const std::vector<fs::FileInfo>&
   std::vector<std::string> result;
   result.reserve(files.size());
   for (const auto& info : files) {
-    result.emplace_back(StripPrefixAndFilename(info.path(), prefix));
+    result.emplace_back(StripPrefixAndFilename(info.path(), prefix).directory);
   }
   return result;
 }
diff --git a/cpp/src/arrow/dataset/partition.h b/cpp/src/arrow/dataset/partition.h
index d2ea5db061a..b76bc7f9ab1 100644
--- a/cpp/src/arrow/dataset/partition.h
+++ b/cpp/src/arrow/dataset/partition.h
@@ -38,6 +38,10 @@ namespace dataset {
 
 constexpr char kFilenamePartitionSep = '_';
 
+struct PartitionPathFormat {
+  std::string directory, prefix;
+};
+
 // ----------------------------------------------------------------------
 // Partitioning
 
@@ -76,11 +80,8 @@ class ARROW_DS_EXPORT Partitioning {
       const std::shared_ptr<RecordBatch>& batch) const = 0;
 
   /// \brief Parse a path into a partition expression
-  virtual Result<compute::Expression> Parse(const std::string& path) const = 0;
-
-  struct PartitionPathFormat {
-    std::string directory, prefix;
-  };
+  virtual Result<compute::Expression> Parse(const std::string& directory = "",
+                                            const std::string& prefix = "") const = 0;
 
   virtual Result<PartitionPathFormat> Format(const compute::Expression& expr) const = 0;
 
@@ -174,7 +175,8 @@ class ARROW_DS_EXPORT KeyValuePartitioning : public Partitioning {
   Result<PartitionedBatches> Partition(
       const std::shared_ptr<RecordBatch>& batch) const override;
 
-  Result<compute::Expression> Parse(const std::string& path) const override;
+  Result<compute::Expression> Parse(const std::string& directory = "",
+                                    const std::string& prefix = "") const override;
 
   Result<PartitionPathFormat> Format(const compute::Expression& expr) const override;
 
@@ -191,7 +193,8 @@ class ARROW_DS_EXPORT KeyValuePartitioning : public Partitioning {
     }
   }
 
-  virtual Result<std::vector<Key>> ParseKeys(const std::string& path) const = 0;
+  virtual Result<std::vector<Key>> ParseKeys(const std::string& directory,
+                                             const std::string& prefix) const = 0;
 
   virtual Result<PartitionPathFormat> FormatValues(const ScalarVector& values) const = 0;
 
@@ -231,7 +234,8 @@ class ARROW_DS_EXPORT DirectoryPartitioning : public KeyValuePartitioning {
       std::vector<std::string> field_names, PartitioningFactoryOptions = {});
 
  private:
-  Result<std::vector<Key>> ParseKeys(const std::string& path) const override;
+  Result<std::vector<Key>> ParseKeys(const std::string& directory,
+                                     const std::string& prefix) const override;
 
   Result<PartitionPathFormat> FormatValues(const ScalarVector& values) const override;
 };
@@ -288,7 +292,8 @@ class ARROW_DS_EXPORT HivePartitioning : public KeyValuePartitioning {
 
  private:
   const HivePartitioningOptions hive_options_;
-  Result<std::vector<Key>> ParseKeys(const std::string& path) const override;
+  Result<std::vector<Key>> ParseKeys(const std::string& directory,
+                                     const std::string& prefix) const override;
 
   Result<PartitionPathFormat> FormatValues(const ScalarVector& values) const override;
 };
@@ -310,8 +315,9 @@ class ARROW_DS_EXPORT FunctionPartitioning : public Partitioning {
 
   std::string type_name() const override { return name_; }
 
-  Result<compute::Expression> Parse(const std::string& path) const override {
-    return parse_impl_(path);
+  Result<compute::Expression> Parse(const std::string& directory = "",
+                                    const std::string& prefix = "") const override {
+    return parse_impl_(directory);
   }
 
   Result<PartitionPathFormat> Format(const compute::Expression& expr) const override {
@@ -353,21 +359,17 @@ class ARROW_DS_EXPORT FilenamePartitioning : public KeyValuePartitioning {
       std::vector<std::string> field_names, PartitioningFactoryOptions = {});
 
  private:
-  Result<std::vector<Key>> ParseKeys(const std::string& path) const override;
+  Result<std::vector<Key>> ParseKeys(const std::string& directory,
+                                     const std::string& prefix) const override;
 
   Result<PartitionPathFormat> FormatValues(const ScalarVector& values) const override;
 };
 
-/// \brief Remove only the prefix of a path.
-///
-/// e.g., `StripPrefix("/data/2019_c.txt", "/data") -> "2019_c.txt"`
-ARROW_DS_EXPORT std::string StripPrefix(const std::string& path,
-                                        const std::string& prefix);
 /// \brief Remove a prefix and the filename of a path.
 ///
 /// e.g., `StripPrefixAndFilename("/data/year=2019/c.txt", "/data") -> "year=2019"`
-ARROW_DS_EXPORT std::string StripPrefixAndFilename(const std::string& path,
-                                                   const std::string& prefix);
+ARROW_DS_EXPORT PartitionPathFormat StripPrefixAndFilename(const std::string& path,
+                                                           const std::string& prefix);
 
 /// \brief Vector version of StripPrefixAndFilename.
 ARROW_DS_EXPORT std::vector<std::string> StripPrefixAndFilename(
diff --git a/cpp/src/arrow/dataset/partition_test.cc b/cpp/src/arrow/dataset/partition_test.cc
index 87f6f89ccd9..51939ac7092 100644
--- a/cpp/src/arrow/dataset/partition_test.cc
+++ b/cpp/src/arrow/dataset/partition_test.cc
@@ -873,11 +873,12 @@ class RangePartitioning : public Partitioning {
 
   std::string type_name() const override { return "range"; }
 
-  Result<compute::Expression> Parse(const std::string& path) const override {
+  Result<compute::Expression> Parse(const std::string& directory,
+                                    const std::string& prefix) const override {
     std::vector<compute::Expression> ranges;
 
     HivePartitioningOptions options;
-    for (auto segment : fs::internal::SplitAbstractPath(path)) {
+    for (auto segment : fs::internal::SplitAbstractPath(directory)) {
       ARROW_ASSIGN_OR_RAISE(auto key, HivePartitioning::ParseKey(segment, options));
       if (!key) {
         return Status::Invalid("can't parse '", segment, "' as a range");
@@ -919,9 +920,8 @@ class RangePartitioning : public Partitioning {
     return Status::OK();
   }
 
-  Result<Partitioning::PartitionPathFormat> Format(
-      const compute::Expression&) const override {
-    return Partitioning::PartitionPathFormat{"", ""};
+  Result<PartitionPathFormat> Format(const compute::Expression&) const override {
+    return PartitionPathFormat{"", ""};
   }
   Result<PartitionedBatches> Partition(
       const std::shared_ptr<RecordBatch>&) const override {
@@ -943,12 +943,12 @@ TEST_F(TestPartitioning, Range) {
 }
 
 TEST(TestStripPrefixAndFilename, Basic) {
-  ASSERT_EQ(StripPrefixAndFilename("", ""), "");
-  ASSERT_EQ(StripPrefixAndFilename("a.csv", ""), "");
-  ASSERT_EQ(StripPrefixAndFilename("a/b.csv", ""), "a");
-  ASSERT_EQ(StripPrefixAndFilename("/a/b/c.csv", "/a"), "b");
-  ASSERT_EQ(StripPrefixAndFilename("/a/b/c/d.csv", "/a"), "b/c");
-  ASSERT_EQ(StripPrefixAndFilename("/a/b/c.csv", "/a/b"), "");
+  ASSERT_EQ(StripPrefixAndFilename("", "").directory, "");
+  ASSERT_EQ(StripPrefixAndFilename("a.csv", "").directory, "");
+  ASSERT_EQ(StripPrefixAndFilename("a/b.csv", "").directory, "a");
+  ASSERT_EQ(StripPrefixAndFilename("/a/b/c.csv", "/a").directory, "b");
+  ASSERT_EQ(StripPrefixAndFilename("/a/b/c/d.csv", "/a").directory, "b/c");
+  ASSERT_EQ(StripPrefixAndFilename("/a/b/c.csv", "/a/b").directory, "");
 
   std::vector<std::string> input{"/data/year=2019/file.parquet",
                                  "/data/year=2019/month=12/file.parquet",
diff --git a/python/pyarrow/_dataset.pyx b/python/pyarrow/_dataset.pyx
index 10aeafbdbaa..6b1b8a4ced3 100644
--- a/python/pyarrow/_dataset.pyx
+++ b/python/pyarrow/_dataset.pyx
@@ -1319,9 +1319,9 @@ cdef class Partitioning(_Weakrefable):
     cdef inline shared_ptr[CPartitioning] unwrap(self):
         return self.wrapped
 
-    def parse(self, path):
+    def parse(self, directory="", prefix=""):
         cdef CResult[CExpression] result
-        result = self.partitioning.Parse(tobytes(path))
+        result = self.partitioning.Parse(tobytes(directory), tobytes(prefix))
         return Expression.wrap(GetResultValue(result))
 
     @property
diff --git a/python/pyarrow/includes/libarrow_dataset.pxd b/python/pyarrow/includes/libarrow_dataset.pxd
index 64e59e05613..bd7375aad81 100644
--- a/python/pyarrow/includes/libarrow_dataset.pxd
+++ b/python/pyarrow/includes/libarrow_dataset.pxd
@@ -279,7 +279,7 @@ cdef extern from "arrow/dataset/api.h" namespace "arrow::dataset" nogil:
 
     cdef cppclass CPartitioning "arrow::dataset::Partitioning":
         c_string type_name() const
-        CResult[CExpression] Parse(const c_string & path) const
+        CResult[CExpression] Parse(const c_string & directory, const c_string & prefix) const
         const shared_ptr[CSchema] & schema()
 
     cdef cppclass CSegmentEncoding" arrow::dataset::SegmentEncoding":
diff --git a/python/pyarrow/tests/test_dataset.py b/python/pyarrow/tests/test_dataset.py
index 56bc19f4d48..ca9ed3f2a1d 100644
--- a/python/pyarrow/tests/test_dataset.py
+++ b/python/pyarrow/tests/test_dataset.py
@@ -629,14 +629,14 @@ def test_partitioning():
     )
     assert len(partitioning.dictionaries) == 2
     assert all(x is None for x in partitioning.dictionaries)
-    expr = partitioning.parse('3_3.14_')
+    expr = partitioning.parse('', '3_3.14_')
     assert isinstance(expr, ds.Expression)
 
     expected = (ds.field('group') == 3) & (ds.field('key') == 3.14)
     assert expr.equals(expected)
 
     with pytest.raises(pa.ArrowInvalid):
-        partitioning.parse('prefix_3_aaa_')
+        partitioning.parse('', 'prefix_3_aaa_')
 
     partitioning = ds.DirectoryPartitioning(
         pa.schema([

From 8da2d5ef434c009256fadf1ac24ea8cf2ede3b5f Mon Sep 17 00:00:00 2001
From: Sanjiban Sengupta <sanjiban.sg@gmail.com>
Date: Wed, 27 Apr 2022 18:52:45 +0530
Subject: [PATCH 04/24] feat: ARROW_EXPORT PartitionPathFormat

---
 cpp/src/arrow/dataset/partition.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cpp/src/arrow/dataset/partition.h b/cpp/src/arrow/dataset/partition.h
index b76bc7f9ab1..a8af1291ddc 100644
--- a/cpp/src/arrow/dataset/partition.h
+++ b/cpp/src/arrow/dataset/partition.h
@@ -38,7 +38,7 @@ namespace dataset {
 
 constexpr char kFilenamePartitionSep = '_';
 
-struct PartitionPathFormat {
+struct ARROW_DS_EXPORT PartitionPathFormat {
   std::string directory, prefix;
 };
 

From ab1e3873c1a656e5a45b20ecee5c2d8f9ce5cd8e Mon Sep 17 00:00:00 2001
From: Sanjiban Sengupta <sanjiban.sg@gmail.com>
Date: Wed, 27 Apr 2022 18:53:35 +0530
Subject: [PATCH 05/24] fix: remove shadowed variable fixed_path

---
 cpp/src/arrow/dataset/discovery.cc | 1 -
 1 file changed, 1 deletion(-)

diff --git a/cpp/src/arrow/dataset/discovery.cc b/cpp/src/arrow/dataset/discovery.cc
index a46ea926535..25e1f72482a 100644
--- a/cpp/src/arrow/dataset/discovery.cc
+++ b/cpp/src/arrow/dataset/discovery.cc
@@ -278,7 +278,6 @@ Result<std::shared_ptr<Dataset>> FileSystemDatasetFactory::Finish(FinishOptions
   }
 
   std::vector<std::shared_ptr<FileFragment>> fragments;
-  std::string fixed_path;
   for (const auto& info : files_) {
     auto fixed_path = StripPrefixAndFilename(info.path(), options_.partition_base_dir);
     ARROW_ASSIGN_OR_RAISE(auto partition,

From 52ec5eec611a0d606ae76bc14a667d4d334e09a1 Mon Sep 17 00:00:00 2001
From: Sanjiban Sengupta <sanjiban.sg@gmail.com>
Date: Thu, 28 Apr 2022 13:45:45 +0530
Subject: [PATCH 06/24] feat: use PartitionPathFormat as argument to Parse
 methods

---
 cpp/src/arrow/dataset/discovery.cc      |  2 +-
 cpp/src/arrow/dataset/file_parquet.cc   |  2 +-
 cpp/src/arrow/dataset/partition.cc      | 20 +++++++++-----------
 cpp/src/arrow/dataset/partition.h       | 23 ++++++++---------------
 cpp/src/arrow/dataset/partition_test.cc | 23 +++++++++++------------
 5 files changed, 30 insertions(+), 40 deletions(-)

diff --git a/cpp/src/arrow/dataset/discovery.cc b/cpp/src/arrow/dataset/discovery.cc
index 25e1f72482a..668624ac09a 100644
--- a/cpp/src/arrow/dataset/discovery.cc
+++ b/cpp/src/arrow/dataset/discovery.cc
@@ -281,7 +281,7 @@ Result<std::shared_ptr<Dataset>> FileSystemDatasetFactory::Finish(FinishOptions
   for (const auto& info : files_) {
     auto fixed_path = StripPrefixAndFilename(info.path(), options_.partition_base_dir);
     ARROW_ASSIGN_OR_RAISE(auto partition,
-                          partitioning->Parse(fixed_path.directory, fixed_path.prefix));
+                          partitioning->Parse(fixed_path));
     ARROW_ASSIGN_OR_RAISE(auto fragment, format_->MakeFragment({info, fs_}, partition));
     fragments.push_back(fragment);
   }
diff --git a/cpp/src/arrow/dataset/file_parquet.cc b/cpp/src/arrow/dataset/file_parquet.cc
index 0340b3e08dc..8f9590a6e17 100644
--- a/cpp/src/arrow/dataset/file_parquet.cc
+++ b/cpp/src/arrow/dataset/file_parquet.cc
@@ -895,7 +895,7 @@ ParquetDatasetFactory::CollectParquetFragments(const Partitioning& partitioning)
 
     auto partition_expression =
         partitioning
-            .Parse(StripPrefixAndFilename(path, options_.partition_base_dir).directory)
+            .Parse(StripPrefixAndFilename(path, options_.partition_base_dir))
             .ValueOr(compute::literal(true));
 
     ARROW_ASSIGN_OR_RAISE(
diff --git a/cpp/src/arrow/dataset/partition.cc b/cpp/src/arrow/dataset/partition.cc
index 67797a99daa..861fd6374ee 100644
--- a/cpp/src/arrow/dataset/partition.cc
+++ b/cpp/src/arrow/dataset/partition.cc
@@ -82,8 +82,7 @@ std::shared_ptr<Partitioning> Partitioning::Default() {
 
     std::string type_name() const override { return "default"; }
 
-    Result<compute::Expression> Parse(const std::string& directory,
-                                      const std::string& prefix) const override {
+    Result<compute::Expression> Parse(const PartitionPathFormat& path) const override {
       return compute::literal(true);
     }
 
@@ -251,11 +250,10 @@ Result<compute::Expression> KeyValuePartitioning::ConvertKey(const Key& key) con
                         compute::literal(std::move(converted)));
 }
 
-Result<compute::Expression> KeyValuePartitioning::Parse(const std::string& directory,
-                                                        const std::string& prefix) const {
+Result<compute::Expression> KeyValuePartitioning::Parse(const PartitionPathFormat& path) const {
   std::vector<compute::Expression> expressions;
 
-  ARROW_ASSIGN_OR_RAISE(auto parsed, ParseKeys(directory, prefix));
+  ARROW_ASSIGN_OR_RAISE(auto parsed, ParseKeys(path));
   for (const Key& key : parsed) {
     ARROW_ASSIGN_OR_RAISE(auto expr, ConvertKey(key));
     if (expr == compute::literal(true)) continue;
@@ -378,8 +376,8 @@ DirectoryPartitioning::DirectoryPartitioning(std::shared_ptr<Schema> schema,
 }
 
 Result<std::vector<KeyValuePartitioning::Key>> DirectoryPartitioning::ParseKeys(
-    const std::string& directory, const std::string& prefix) const {
-  std::vector<std::string> segments = fs::internal::SplitAbstractPath(directory);
+    const PartitionPathFormat& path) const {
+  std::vector<std::string> segments = fs::internal::SplitAbstractPath(path.directory);
   return ParsePartitionSegments(segments);
 }
 
@@ -391,9 +389,9 @@ FilenamePartitioning::FilenamePartitioning(std::shared_ptr<Schema> schema,
 }
 
 Result<std::vector<KeyValuePartitioning::Key>> FilenamePartitioning::ParseKeys(
-    const std::string& directory, const std::string& prefix) const {
+    const PartitionPathFormat& path) const {
   std::vector<std::string> segments =
-      fs::internal::SplitAbstractPath(StripNonPrefix(prefix), kFilenamePartitionSep);
+      fs::internal::SplitAbstractPath(StripNonPrefix(path.prefix), kFilenamePartitionSep);
   return ParsePartitionSegments(segments);
 }
 
@@ -720,10 +718,10 @@ Result<util::optional<KeyValuePartitioning::Key>> HivePartitioning::ParseKey(
 }
 
 Result<std::vector<KeyValuePartitioning::Key>> HivePartitioning::ParseKeys(
-    const std::string& directory, const std::string& prefix) const {
+    const PartitionPathFormat& path) const {
   std::vector<Key> keys;
 
-  for (const auto& segment : fs::internal::SplitAbstractPath(directory)) {
+  for (const auto& segment : fs::internal::SplitAbstractPath(path.directory)) {
     ARROW_ASSIGN_OR_RAISE(auto maybe_key, ParseKey(segment, hive_options_));
     if (auto key = maybe_key) {
       keys.push_back(std::move(*key));
diff --git a/cpp/src/arrow/dataset/partition.h b/cpp/src/arrow/dataset/partition.h
index a8af1291ddc..897581809eb 100644
--- a/cpp/src/arrow/dataset/partition.h
+++ b/cpp/src/arrow/dataset/partition.h
@@ -80,8 +80,7 @@ class ARROW_DS_EXPORT Partitioning {
       const std::shared_ptr<RecordBatch>& batch) const = 0;
 
   /// \brief Parse a path into a partition expression
-  virtual Result<compute::Expression> Parse(const std::string& directory = "",
-                                            const std::string& prefix = "") const = 0;
+  virtual Result<compute::Expression> Parse(const PartitionPathFormat& path) const = 0;
 
   virtual Result<PartitionPathFormat> Format(const compute::Expression& expr) const = 0;
 
@@ -175,8 +174,7 @@ class ARROW_DS_EXPORT KeyValuePartitioning : public Partitioning {
   Result<PartitionedBatches> Partition(
       const std::shared_ptr<RecordBatch>& batch) const override;
 
-  Result<compute::Expression> Parse(const std::string& directory = "",
-                                    const std::string& prefix = "") const override;
+  Result<compute::Expression> Parse(const PartitionPathFormat& path) const override;
 
   Result<PartitionPathFormat> Format(const compute::Expression& expr) const override;
 
@@ -193,8 +191,7 @@ class ARROW_DS_EXPORT KeyValuePartitioning : public Partitioning {
     }
   }
 
-  virtual Result<std::vector<Key>> ParseKeys(const std::string& directory,
-                                             const std::string& prefix) const = 0;
+  virtual Result<std::vector<Key>> ParseKeys(const PartitionPathFormat& path) const = 0;
 
   virtual Result<PartitionPathFormat> FormatValues(const ScalarVector& values) const = 0;
 
@@ -234,8 +231,7 @@ class ARROW_DS_EXPORT DirectoryPartitioning : public KeyValuePartitioning {
       std::vector<std::string> field_names, PartitioningFactoryOptions = {});
 
  private:
-  Result<std::vector<Key>> ParseKeys(const std::string& directory,
-                                     const std::string& prefix) const override;
+  Result<std::vector<Key>> ParseKeys(const PartitionPathFormat& path) const override;
 
   Result<PartitionPathFormat> FormatValues(const ScalarVector& values) const override;
 };
@@ -292,8 +288,7 @@ class ARROW_DS_EXPORT HivePartitioning : public KeyValuePartitioning {
 
  private:
   const HivePartitioningOptions hive_options_;
-  Result<std::vector<Key>> ParseKeys(const std::string& directory,
-                                     const std::string& prefix) const override;
+  Result<std::vector<Key>> ParseKeys(const PartitionPathFormat& path) const override;
 
   Result<PartitionPathFormat> FormatValues(const ScalarVector& values) const override;
 };
@@ -315,9 +310,8 @@ class ARROW_DS_EXPORT FunctionPartitioning : public Partitioning {
 
   std::string type_name() const override { return name_; }
 
-  Result<compute::Expression> Parse(const std::string& directory = "",
-                                    const std::string& prefix = "") const override {
-    return parse_impl_(directory);
+  Result<compute::Expression> Parse(const PartitionPathFormat& path) const override {
+    return parse_impl_(path.directory);
   }
 
   Result<PartitionPathFormat> Format(const compute::Expression& expr) const override {
@@ -359,8 +353,7 @@ class ARROW_DS_EXPORT FilenamePartitioning : public KeyValuePartitioning {
       std::vector<std::string> field_names, PartitioningFactoryOptions = {});
 
  private:
-  Result<std::vector<Key>> ParseKeys(const std::string& directory,
-                                     const std::string& prefix) const override;
+  Result<std::vector<Key>> ParseKeys(const PartitionPathFormat& path) const override;
 
   Result<PartitionPathFormat> FormatValues(const ScalarVector& values) const override;
 };
diff --git a/cpp/src/arrow/dataset/partition_test.cc b/cpp/src/arrow/dataset/partition_test.cc
index 51939ac7092..efb1ecf6b25 100644
--- a/cpp/src/arrow/dataset/partition_test.cc
+++ b/cpp/src/arrow/dataset/partition_test.cc
@@ -44,11 +44,11 @@ namespace dataset {
 class TestPartitioning : public ::testing::Test {
  public:
   void AssertParseError(const std::string& path) {
-    ASSERT_RAISES(Invalid, partitioning_->Parse(path));
+    ASSERT_RAISES(Invalid, partitioning_->Parse(PartitionPathFormat{path,""}));
   }
 
   void AssertParse(const std::string& path, compute::Expression expected) {
-    ASSERT_OK_AND_ASSIGN(auto parsed, partitioning_->Parse(path));
+    ASSERT_OK_AND_ASSIGN(auto parsed, partitioning_->Parse(PartitionPathFormat{path,""}));
     ASSERT_EQ(parsed, expected);
   }
 
@@ -68,7 +68,7 @@ class TestPartitioning : public ::testing::Test {
     // ensure the formatted path round trips the relevant components of the partition
     // expression: roundtripped should be a subset of expr
     ASSERT_OK_AND_ASSIGN(compute::Expression roundtripped,
-                         partitioning_->Parse(formatted.directory));
+                         partitioning_->Parse(formatted));
 
     ASSERT_OK_AND_ASSIGN(roundtripped, roundtripped.Bind(*written_schema_));
     ASSERT_OK_AND_ASSIGN(auto simplified, SimplifyWithGuarantee(roundtripped, expr));
@@ -656,7 +656,7 @@ TEST_F(TestPartitioning, UrlEncodedDirectory) {
   EXPECT_RAISES_WITH_MESSAGE_THAT(Invalid, ::testing::HasSubstr("was not valid UTF-8"),
                                   factory_->Inspect({"/%AF/%BF/%CF"}));
   EXPECT_RAISES_WITH_MESSAGE_THAT(Invalid, ::testing::HasSubstr("was not valid UTF-8"),
-                                  partitioning_->Parse({"/%AF/%BF/%CF"}));
+                                  partitioning_->Parse({"/%AF/%BF/%CF",""}));
 
   options.segment_encoding = SegmentEncoding::None;
   options.schema =
@@ -706,7 +706,7 @@ TEST_F(TestPartitioning, UrlEncodedHive) {
   EXPECT_RAISES_WITH_MESSAGE_THAT(Invalid, ::testing::HasSubstr("was not valid UTF-8"),
                                   factory_->Inspect({"/date=%AF/time=%BF/str=%CF"}));
   EXPECT_RAISES_WITH_MESSAGE_THAT(Invalid, ::testing::HasSubstr("was not valid UTF-8"),
-                                  partitioning_->Parse({"/date=%AF/time=%BF/str=%CF"}));
+                                  partitioning_->Parse({"/date=%AF/time=%BF/str=%CF",""}));
 
   options.segment_encoding = SegmentEncoding::None;
   options.schema =
@@ -729,7 +729,7 @@ TEST_F(TestPartitioning, UrlEncodedHive) {
                                   factory_->Inspect({"/date=\xAF/time=\xBF/str=\xCF"}));
   EXPECT_RAISES_WITH_MESSAGE_THAT(
       Invalid, ::testing::HasSubstr("was not valid UTF-8"),
-      partitioning_->Parse({"/date=\xAF/time=\xBF/str=\xCF"}));
+      partitioning_->Parse({"/date=\xAF/time=\xBF/str=\xCF",""}));
 }
 
 TEST_F(TestPartitioning, UrlEncodedHiveWithKeyEncoded) {
@@ -776,7 +776,7 @@ TEST_F(TestPartitioning, UrlEncodedHiveWithKeyEncoded) {
       factory_->Inspect({"/%AF=2021-05-04/time=2021-05-04 07%3A27%3A00/str=%24"}));
   EXPECT_RAISES_WITH_MESSAGE_THAT(
       Invalid, ::testing::HasSubstr("was not valid UTF-8"),
-      partitioning_->Parse({"/%AF=2021-05-04/%BF=2021-05-04 07%3A27%3A00/str=%24"}));
+      partitioning_->Parse({"/%AF=2021-05-04/%BF=2021-05-04 07%3A27%3A00/str=%24",""}));
 }
 
 TEST_F(TestPartitioning, EtlThenHive) {
@@ -801,12 +801,12 @@ TEST_F(TestPartitioning, EtlThenHive) {
         auto etl_segments_end = segments.begin() + etl_fields.size();
         auto etl_path =
             fs::internal::JoinAbstractPath(segments.begin(), etl_segments_end);
-        ARROW_ASSIGN_OR_RAISE(auto etl_expr, etl_part.Parse(etl_path));
+        ARROW_ASSIGN_OR_RAISE(auto etl_expr, etl_part.Parse({etl_path,""}));
 
         auto alphabeta_segments_end = etl_segments_end + alphabeta_fields.size();
         auto alphabeta_path =
             fs::internal::JoinAbstractPath(etl_segments_end, alphabeta_segments_end);
-        ARROW_ASSIGN_OR_RAISE(auto alphabeta_expr, alphabeta_part.Parse(alphabeta_path));
+        ARROW_ASSIGN_OR_RAISE(auto alphabeta_expr, alphabeta_part.Parse({alphabeta_path,""}));
 
         return and_(etl_expr, alphabeta_expr);
       });
@@ -873,12 +873,11 @@ class RangePartitioning : public Partitioning {
 
   std::string type_name() const override { return "range"; }
 
-  Result<compute::Expression> Parse(const std::string& directory,
-                                    const std::string& prefix) const override {
+  Result<compute::Expression> Parse(const PartitionPathFormat& path) const override {
     std::vector<compute::Expression> ranges;
 
     HivePartitioningOptions options;
-    for (auto segment : fs::internal::SplitAbstractPath(directory)) {
+    for (auto segment : fs::internal::SplitAbstractPath(path.directory)) {
       ARROW_ASSIGN_OR_RAISE(auto key, HivePartitioning::ParseKey(segment, options));
       if (!key) {
         return Status::Invalid("can't parse '", segment, "' as a range");

From ecaffb7413b2a83d610d64b277b5394a79c0c54a Mon Sep 17 00:00:00 2001
From: Sanjiban Sengupta <sanjiban.sg@gmail.com>
Date: Fri, 29 Apr 2022 14:08:16 +0530
Subject: [PATCH 07/24] test: Parse tests to use PartitionPathArgument object

---
 cpp/src/arrow/dataset/partition_test.cc | 107 ++++++++++++++----------
 cpp/src/arrow/filesystem/path_util.cc   |   8 +-
 2 files changed, 65 insertions(+), 50 deletions(-)

diff --git a/cpp/src/arrow/dataset/partition_test.cc b/cpp/src/arrow/dataset/partition_test.cc
index efb1ecf6b25..4209152e6f0 100644
--- a/cpp/src/arrow/dataset/partition_test.cc
+++ b/cpp/src/arrow/dataset/partition_test.cc
@@ -43,12 +43,12 @@ namespace dataset {
 
 class TestPartitioning : public ::testing::Test {
  public:
-  void AssertParseError(const std::string& path) {
-    ASSERT_RAISES(Invalid, partitioning_->Parse(PartitionPathFormat{path,""}));
+  void AssertParseError(const PartitionPathFormat& path) {
+    ASSERT_RAISES(Invalid, partitioning_->Parse(path));
   }
 
-  void AssertParse(const std::string& path, compute::Expression expected) {
-    ASSERT_OK_AND_ASSIGN(auto parsed, partitioning_->Parse(PartitionPathFormat{path,""}));
+  void AssertParse(const PartitionPathFormat& path, compute::Expression expected) {
+    ASSERT_OK_AND_ASSIGN(auto parsed, partitioning_->Parse(path));
     ASSERT_EQ(parsed, expected);
   }
 
@@ -69,7 +69,6 @@ class TestPartitioning : public ::testing::Test {
     // expression: roundtripped should be a subset of expr
     ASSERT_OK_AND_ASSIGN(compute::Expression roundtripped,
                          partitioning_->Parse(formatted));
-
     ASSERT_OK_AND_ASSIGN(roundtripped, roundtripped.Bind(*written_schema_));
     ASSERT_OK_AND_ASSIGN(auto simplified, SimplifyWithGuarantee(roundtripped, expr));
     ASSERT_EQ(simplified, literal(true));
@@ -188,19 +187,35 @@ TEST_F(TestPartitioning, DirectoryPartitioning) {
   partitioning_ = std::make_shared<DirectoryPartitioning>(
       schema({field("alpha", int32()), field("beta", utf8())}));
 
-  AssertParse("/0/hello", and_(equal(field_ref("alpha"), literal(0)),
+  AssertParse({"/0/hello",""}, and_(equal(field_ref("alpha"), literal(0)),
                                equal(field_ref("beta"), literal("hello"))));
-  AssertParse("/3", equal(field_ref("alpha"), literal(3)));
-  AssertParseError("/world/0");    // reversed order
-  AssertParseError("/0.0/foo");    // invalid alpha
-  AssertParseError("/3.25");       // invalid alpha with missing beta
-  AssertParse("", literal(true));  // no segments to parse
+  AssertParse({"/3",""}, equal(field_ref("alpha"), literal(3)));
+  AssertParseError({"/world/0",""});    // reversed order
+  AssertParseError({"/0.0/foo",""});    // invalid alpha
+  AssertParseError({"/3.25",""});       // invalid alpha with missing beta
+  AssertParse({"",""}, literal(true));  // no segments to parse
 
   // gotcha someday:
-  AssertParse("/0/dat.parquet", and_(equal(field_ref("alpha"), literal(0)),
+  AssertParse({"/0/dat.parquet",""}, and_(equal(field_ref("alpha"), literal(0)),
                                      equal(field_ref("beta"), literal("dat.parquet"))));
 
-  AssertParse("/0/foo/ignored=2341", and_(equal(field_ref("alpha"), literal(0)),
+  AssertParse({"/0/foo/ignored=2341",""}, and_(equal(field_ref("alpha"), literal(0)),
+                                          equal(field_ref("beta"), literal("foo"))));
+}
+
+TEST_F(TestPartitioning, FilenamePartitioning) {
+  partitioning_ = std::make_shared<FilenamePartitioning>(
+      schema({field("alpha", int32()), field("beta", utf8())}));
+
+  AssertParse({"","0_hello_"}, and_(equal(field_ref("alpha"), literal(0)),
+                               equal(field_ref("beta"), literal("hello"))));
+  AssertParse({"","0_"},equal(field_ref("alpha"), literal(0)));
+  AssertParseError({"","world_0_"});    // reversed order
+  AssertParseError({"","0.0_foo_"});    // invalid alpha
+  AssertParseError({"","3.25_"});       // invalid alpha with missing beta
+  AssertParse({"",""}, literal(true));  // no segments to parse
+
+  AssertParse({"","0_foo_ignored=2341"}, and_(equal(field_ref("alpha"), literal(0)),
                                           equal(field_ref("beta"), literal("foo"))));
 }
 
@@ -267,7 +282,7 @@ TEST_F(TestPartitioning, DirectoryPartitioningWithTemporal) {
         schema({field("year", int32()), field("month", int8()), field("day", temporal)}));
 
     ASSERT_OK_AND_ASSIGN(auto day, StringScalar("2020-06-08").CastTo(temporal));
-    AssertParse("/2020/06/2020-06-08",
+    AssertParse({"/2020/06/2020-06-08",""},
                 and_({equal(field_ref("year"), literal(2020)),
                       equal(field_ref("month"), literal<int8_t>(6)),
                       equal(field_ref("day"), literal(day))}));
@@ -369,10 +384,10 @@ TEST_F(TestPartitioning, DictionaryHasUniqueValues) {
         std::make_shared<DictionaryScalar>(index_and_dictionary, alpha->type());
 
     auto path = "/" + expected_dictionary->GetString(i);
-    AssertParse(path, equal(field_ref("alpha"), literal(dictionary_scalar)));
+    AssertParse({path,""}, equal(field_ref("alpha"), literal(dictionary_scalar)));
   }
 
-  AssertParseError("/yosemite");  // not in inspected dictionary
+  AssertParseError({"/yosemite",""});  // not in inspected dictionary
 }
 
 TEST_F(TestPartitioning, DiscoverSchemaSegfault) {
@@ -385,27 +400,27 @@ TEST_F(TestPartitioning, HivePartitioning) {
   partitioning_ = std::make_shared<HivePartitioning>(
       schema({field("alpha", int32()), field("beta", float32())}), ArrayVector(), "xyz");
 
-  AssertParse("/alpha=0/beta=3.25", and_(equal(field_ref("alpha"), literal(0)),
+  AssertParse({"/alpha=0/beta=3.25",""}, and_(equal(field_ref("alpha"), literal(0)),
                                          equal(field_ref("beta"), literal(3.25f))));
-  AssertParse("/beta=3.25/alpha=0", and_(equal(field_ref("beta"), literal(3.25f)),
+  AssertParse({"/beta=3.25/alpha=0",""}, and_(equal(field_ref("beta"), literal(3.25f)),
                                          equal(field_ref("alpha"), literal(0))));
-  AssertParse("/alpha=0", equal(field_ref("alpha"), literal(0)));
-  AssertParse("/alpha=xyz/beta=3.25", and_(is_null(field_ref("alpha")),
+  AssertParse({"/alpha=0",""}, equal(field_ref("alpha"), literal(0)));
+  AssertParse({"/alpha=xyz/beta=3.25",""}, and_(is_null(field_ref("alpha")),
                                            equal(field_ref("beta"), literal(3.25f))));
-  AssertParse("/beta=3.25", equal(field_ref("beta"), literal(3.25f)));
-  AssertParse("", literal(true));
+  AssertParse({"/beta=3.25",""}, equal(field_ref("beta"), literal(3.25f)));
+  AssertParse({"",""}, literal(true));
 
-  AssertParse("/alpha=0/unexpected/beta=3.25",
+  AssertParse({"/alpha=0/unexpected/beta=3.25",""},
               and_(equal(field_ref("alpha"), literal(0)),
                    equal(field_ref("beta"), literal(3.25f))));
 
-  AssertParse("/alpha=0/beta=3.25/ignored=2341",
+  AssertParse({"/alpha=0/beta=3.25/ignored=2341",""},
               and_(equal(field_ref("alpha"), literal(0)),
                    equal(field_ref("beta"), literal(3.25f))));
 
-  AssertParse("/ignored=2341", literal(true));
+  AssertParse({"/ignored=2341",""}, literal(true));
 
-  AssertParseError("/alpha=0.0/beta=3.25");  // conversion of "0.0" to int32 fails
+  AssertParseError({"/alpha=0.0/beta=3.25",""});  // conversion of "0.0" to int32 fails
 }
 
 TEST_F(TestPartitioning, HivePartitioningFormat) {
@@ -543,10 +558,10 @@ TEST_F(TestPartitioning, HiveDictionaryHasUniqueValues) {
         std::make_shared<DictionaryScalar>(index_and_dictionary, alpha->type());
 
     auto path = "/alpha=" + expected_dictionary->GetString(i);
-    AssertParse(path, equal(field_ref("alpha"), literal(dictionary_scalar)));
+    AssertParse({path,""}, equal(field_ref("alpha"), literal(dictionary_scalar)));
   }
 
-  AssertParseError("/alpha=yosemite");  // not in inspected dictionary
+  AssertParseError({"/alpha=yosemite",""});  // not in inspected dictionary
 }
 
 TEST_F(TestPartitioning, ExistingSchemaDirectory) {
@@ -647,7 +662,7 @@ TEST_F(TestPartitioning, UrlEncodedDirectory) {
   auto date = std::make_shared<TimestampScalar>(1620086400, ts);
   auto time = std::make_shared<TimestampScalar>(1620113220, ts);
   partitioning_ = std::make_shared<DirectoryPartitioning>(options.schema, ArrayVector());
-  AssertParse("/2021-05-04 00%3A00%3A00/2021-05-04 07%3A27%3A00/%24",
+  AssertParse({"/2021-05-04 00%3A00%3A00/2021-05-04 07%3A27%3A00/%24",""},
               and_({equal(field_ref("date"), literal(date)),
                     equal(field_ref("time"), literal(time)),
                     equal(field_ref("str"), literal("$"))}));
@@ -667,7 +682,7 @@ TEST_F(TestPartitioning, UrlEncodedDirectory) {
                 options.schema->fields());
   partitioning_ = std::make_shared<DirectoryPartitioning>(
       options.schema, ArrayVector(), options.AsPartitioningOptions());
-  AssertParse("/2021-05-04 00%3A00%3A00/2021-05-04 07%3A27%3A00/%24",
+  AssertParse({"/2021-05-04 00%3A00%3A00/2021-05-04 07%3A27%3A00/%24",""},
               and_({equal(field_ref("date"), literal("2021-05-04 00%3A00%3A00")),
                     equal(field_ref("time"), literal("2021-05-04 07%3A27%3A00")),
                     equal(field_ref("str"), literal("%24"))}));
@@ -690,15 +705,15 @@ TEST_F(TestPartitioning, UrlEncodedHive) {
   auto time = std::make_shared<TimestampScalar>(1620113220, ts);
   partitioning_ = std::make_shared<HivePartitioning>(options.schema, ArrayVector(),
                                                      options.AsHivePartitioningOptions());
-  AssertParse("/date=2021-05-04 00:00:00/time=2021-05-04 07:27:00/str=$",
+  AssertParse({"/date=2021-05-04 00:00:00/time=2021-05-04 07:27:00/str=$",""},
               and_({equal(field_ref("date"), literal(date)),
                     equal(field_ref("time"), literal(time)), is_null(field_ref("str"))}));
-  AssertParse("/date=2021-05-04 00:00:00/time=2021-05-04 07:27:00/str=%E3%81%8F%E3%81%BE",
+  AssertParse({"/date=2021-05-04 00:00:00/time=2021-05-04 07:27:00/str=%E3%81%8F%E3%81%BE",""},
               and_({equal(field_ref("date"), literal(date)),
                     equal(field_ref("time"), literal(time)),
                     equal(field_ref("str"), literal("\xE3\x81\x8F\xE3\x81\xBE"))}));
   // URL-encoded null fallback value
-  AssertParse("/date=2021-05-04 00%3A00%3A00/time=2021-05-04 07%3A27%3A00/str=%24",
+  AssertParse({"/date=2021-05-04 00%3A00%3A00/time=2021-05-04 07%3A27%3A00/str=%24",""},
               and_({equal(field_ref("date"), literal(date)),
                     equal(field_ref("time"), literal(time)), is_null(field_ref("str"))}));
 
@@ -719,7 +734,7 @@ TEST_F(TestPartitioning, UrlEncodedHive) {
       options.schema->fields());
   partitioning_ = std::make_shared<HivePartitioning>(options.schema, ArrayVector(),
                                                      options.AsHivePartitioningOptions());
-  AssertParse("/date=2021-05-04 00%3A00%3A00/time=2021-05-04 07%3A27%3A00/str=%24",
+  AssertParse({"/date=2021-05-04 00%3A00%3A00/time=2021-05-04 07%3A27%3A00/str=%24",""},
               and_({equal(field_ref("date"), literal("2021-05-04 00%3A00%3A00")),
                     equal(field_ref("time"), literal("2021-05-04 07%3A27%3A00")),
                     equal(field_ref("str"), literal("%24"))}));
@@ -753,20 +768,20 @@ TEST_F(TestPartitioning, UrlEncodedHiveWithKeyEncoded) {
   partitioning_ = std::make_shared<HivePartitioning>(options.schema, ArrayVector(),
                                                      options.AsHivePartitioningOptions());
   AssertParse(
-      "/test%27%3B%20date=2021-05-04 00:00:00/test%27%3B%20time=2021-05-04 "
-      "07:27:00/str=$",
+      {"/test%27%3B%20date=2021-05-04 00:00:00/test%27%3B%20time=2021-05-04 "
+      "07:27:00/str=$",""},
       and_({equal(field_ref("test'; date"), literal(date)),
             equal(field_ref("test'; time"), literal(time)), is_null(field_ref("str"))}));
   AssertParse(
-      "/test%27%3B%20date=2021-05-04 00:00:00/test%27%3B%20time=2021-05-04 "
-      "07:27:00/str=%E3%81%8F%E3%81%BE",
+      {"/test%27%3B%20date=2021-05-04 00:00:00/test%27%3B%20time=2021-05-04 "
+      "07:27:00/str=%E3%81%8F%E3%81%BE",""},
       and_({equal(field_ref("test'; date"), literal(date)),
             equal(field_ref("test'; time"), literal(time)),
             equal(field_ref("str"), literal("\xE3\x81\x8F\xE3\x81\xBE"))}));
   // URL-encoded null fallback value
   AssertParse(
-      "/test%27%3B%20date=2021-05-04 00%3A00%3A00/test%27%3B%20time=2021-05-04 "
-      "07%3A27%3A00/str=%24",
+      {"/test%27%3B%20date=2021-05-04 00%3A00%3A00/test%27%3B%20time=2021-05-04 "
+      "07%3A27%3A00/str=%24",""},
       and_({equal(field_ref("test'; date"), literal(date)),
             equal(field_ref("test'; time"), literal(time)), is_null(field_ref("str"))}));
 
@@ -811,7 +826,7 @@ TEST_F(TestPartitioning, EtlThenHive) {
         return and_(etl_expr, alphabeta_expr);
       });
 
-  AssertParse("/1999/12/31/00/alpha=0/beta=3.25",
+  AssertParse({"/1999/12/31/00/alpha=0/beta=3.25",""},
               and_({equal(field_ref("year"), literal<int16_t>(1999)),
                     equal(field_ref("month"), literal<int8_t>(12)),
                     equal(field_ref("day"), literal<int8_t>(31)),
@@ -819,7 +834,7 @@ TEST_F(TestPartitioning, EtlThenHive) {
                     and_(equal(field_ref("alpha"), literal<int32_t>(0)),
                          equal(field_ref("beta"), literal<float>(3.25f)))}));
 
-  AssertParseError("/20X6/03/21/05/alpha=0/beta=3.25");
+  AssertParseError({"/20X6/03/21/05/alpha=0/beta=3.25",""});
 }
 
 TEST_F(TestPartitioning, Set) {
@@ -860,9 +875,9 @@ TEST_F(TestPartitioning, Set) {
   auto x_in = [&](std::vector<int32_t> set) {
     return call("is_in", {field_ref("x")}, compute::SetLookupOptions{ints(set)});
   };
-  AssertParse("/x in [1]", x_in({1}));
-  AssertParse("/x in [1 4 5]", x_in({1, 4, 5}));
-  AssertParse("/x in []", x_in({}));
+  AssertParse({"/x in [1]",""}, x_in({1}));
+  AssertParse({"/x in [1 4 5]",""}, x_in({1, 4, 5}));
+  AssertParse({"/x in []",""}, x_in({}));
 }
 
 // An adhoc partitioning which parses segments like "/x=[-3.25, 0.0)"
@@ -932,7 +947,7 @@ TEST_F(TestPartitioning, Range) {
   partitioning_ = std::make_shared<RangePartitioning>(
       schema({field("x", float64()), field("y", float64()), field("z", float64())}));
 
-  AssertParse("/x=[-1.5 0.0)/y=[0.0 1.5)/z=(1.5 3.0]",
+  AssertParse({"/x=[-1.5 0.0)/y=[0.0 1.5)/z=(1.5 3.0]",""},
               and_({and_(greater_equal(field_ref("x"), literal(-1.5)),
                          less(field_ref("x"), literal(0.0))),
                     and_(greater_equal(field_ref("y"), literal(0.0)),
diff --git a/cpp/src/arrow/filesystem/path_util.cc b/cpp/src/arrow/filesystem/path_util.cc
index 83ff069ecff..04484dd5bda 100644
--- a/cpp/src/arrow/filesystem/path_util.cc
+++ b/cpp/src/arrow/filesystem/path_util.cc
@@ -34,12 +34,12 @@ namespace internal {
 std::vector<std::string> SplitAbstractPath(const std::string& path, char sep) {
   std::vector<std::string> parts;
   auto v = util::string_view(path);
-  // Strip trailing slash
-  if (v.length() > 0 && v.back() == kSep) {
+  // Strip trailing separator
+  if (v.length() > 0 && v.back() == sep) {
     v = v.substr(0, v.length() - 1);
   }
-  // Strip leading slash
-  if (v.length() > 0 && v.front() == kSep) {
+  // Strip leading separator
+  if (v.length() > 0 && v.front() == sep) {
     v = v.substr(1);
   }
   if (v.length() == 0) {

From 9d277cd54786d5e656aa7e1b752f046321ef01c7 Mon Sep 17 00:00:00 2001
From: Sanjiban Sengupta <sanjiban.sg@gmail.com>
Date: Fri, 29 Apr 2022 21:48:18 +0530
Subject: [PATCH 08/24] feat: modify Parse() method in PyArrow to use
 PartitionPathFormat

---
 cpp/src/arrow/dataset/discovery.cc           |   3 +-
 cpp/src/arrow/dataset/file_parquet.cc        |   3 +-
 cpp/src/arrow/dataset/partition.cc           |   5 +-
 cpp/src/arrow/dataset/partition_test.cc      | 146 ++++++++++---------
 python/pyarrow/_dataset.pyx                  |   5 +-
 python/pyarrow/includes/libarrow_dataset.pxd |   6 +-
 6 files changed, 91 insertions(+), 77 deletions(-)

diff --git a/cpp/src/arrow/dataset/discovery.cc b/cpp/src/arrow/dataset/discovery.cc
index 668624ac09a..f8382aa4052 100644
--- a/cpp/src/arrow/dataset/discovery.cc
+++ b/cpp/src/arrow/dataset/discovery.cc
@@ -280,8 +280,7 @@ Result<std::shared_ptr<Dataset>> FileSystemDatasetFactory::Finish(FinishOptions
   std::vector<std::shared_ptr<FileFragment>> fragments;
   for (const auto& info : files_) {
     auto fixed_path = StripPrefixAndFilename(info.path(), options_.partition_base_dir);
-    ARROW_ASSIGN_OR_RAISE(auto partition,
-                          partitioning->Parse(fixed_path));
+    ARROW_ASSIGN_OR_RAISE(auto partition, partitioning->Parse(fixed_path));
     ARROW_ASSIGN_OR_RAISE(auto fragment, format_->MakeFragment({info, fs_}, partition));
     fragments.push_back(fragment);
   }
diff --git a/cpp/src/arrow/dataset/file_parquet.cc b/cpp/src/arrow/dataset/file_parquet.cc
index 8f9590a6e17..a1e9698538e 100644
--- a/cpp/src/arrow/dataset/file_parquet.cc
+++ b/cpp/src/arrow/dataset/file_parquet.cc
@@ -894,8 +894,7 @@ ParquetDatasetFactory::CollectParquetFragments(const Partitioning& partitioning)
     auto row_groups = Iota(metadata_subset->num_row_groups());
 
     auto partition_expression =
-        partitioning
-            .Parse(StripPrefixAndFilename(path, options_.partition_base_dir))
+        partitioning.Parse(StripPrefixAndFilename(path, options_.partition_base_dir))
             .ValueOr(compute::literal(true));
 
     ARROW_ASSIGN_OR_RAISE(
diff --git a/cpp/src/arrow/dataset/partition.cc b/cpp/src/arrow/dataset/partition.cc
index 861fd6374ee..3a9294fe0d5 100644
--- a/cpp/src/arrow/dataset/partition.cc
+++ b/cpp/src/arrow/dataset/partition.cc
@@ -250,7 +250,8 @@ Result<compute::Expression> KeyValuePartitioning::ConvertKey(const Key& key) con
                         compute::literal(std::move(converted)));
 }
 
-Result<compute::Expression> KeyValuePartitioning::Parse(const PartitionPathFormat& path) const {
+Result<compute::Expression> KeyValuePartitioning::Parse(
+    const PartitionPathFormat& path) const {
   std::vector<compute::Expression> expressions;
 
   ARROW_ASSIGN_OR_RAISE(auto parsed, ParseKeys(path));
@@ -809,7 +810,7 @@ PartitionPathFormat StripPrefixAndFilename(const std::string& path,
   auto maybe_base_less = fs::internal::RemoveAncestor(prefix, path);
   auto base_less = maybe_base_less ? std::string(*maybe_base_less) : path;
   auto basename_filename = fs::internal::GetAbstractPathParent(base_less);
-  return PartitionPathFormat{basename_filename.first, basename_filename.second};
+  return PartitionPathFormat{std::move(basename_filename.first), std::move(basename_filename.second)};
 }
 
 std::vector<std::string> StripPrefixAndFilename(const std::vector<std::string>& paths,
diff --git a/cpp/src/arrow/dataset/partition_test.cc b/cpp/src/arrow/dataset/partition_test.cc
index 4209152e6f0..728ad83931a 100644
--- a/cpp/src/arrow/dataset/partition_test.cc
+++ b/cpp/src/arrow/dataset/partition_test.cc
@@ -187,36 +187,38 @@ TEST_F(TestPartitioning, DirectoryPartitioning) {
   partitioning_ = std::make_shared<DirectoryPartitioning>(
       schema({field("alpha", int32()), field("beta", utf8())}));
 
-  AssertParse({"/0/hello",""}, and_(equal(field_ref("alpha"), literal(0)),
-                               equal(field_ref("beta"), literal("hello"))));
-  AssertParse({"/3",""}, equal(field_ref("alpha"), literal(3)));
-  AssertParseError({"/world/0",""});    // reversed order
-  AssertParseError({"/0.0/foo",""});    // invalid alpha
-  AssertParseError({"/3.25",""});       // invalid alpha with missing beta
-  AssertParse({"",""}, literal(true));  // no segments to parse
+  AssertParse({"/0/hello", ""}, and_(equal(field_ref("alpha"), literal(0)),
+                                     equal(field_ref("beta"), literal("hello"))));
+  AssertParse({"/3", ""}, equal(field_ref("alpha"), literal(3)));
+  AssertParseError({"/world/0", ""});    // reversed order
+  AssertParseError({"/0.0/foo", ""});    // invalid alpha
+  AssertParseError({"/3.25", ""});       // invalid alpha with missing beta
+  AssertParse({"", ""}, literal(true));  // no segments to parse
 
   // gotcha someday:
-  AssertParse({"/0/dat.parquet",""}, and_(equal(field_ref("alpha"), literal(0)),
-                                     equal(field_ref("beta"), literal("dat.parquet"))));
+  AssertParse({"/0/dat.parquet", ""},
+              and_(equal(field_ref("alpha"), literal(0)),
+                   equal(field_ref("beta"), literal("dat.parquet"))));
 
-  AssertParse({"/0/foo/ignored=2341",""}, and_(equal(field_ref("alpha"), literal(0)),
-                                          equal(field_ref("beta"), literal("foo"))));
+  AssertParse({"/0/foo/ignored=2341", ""},
+              and_(equal(field_ref("alpha"), literal(0)),
+                   equal(field_ref("beta"), literal("foo"))));
 }
 
 TEST_F(TestPartitioning, FilenamePartitioning) {
   partitioning_ = std::make_shared<FilenamePartitioning>(
       schema({field("alpha", int32()), field("beta", utf8())}));
 
-  AssertParse({"","0_hello_"}, and_(equal(field_ref("alpha"), literal(0)),
-                               equal(field_ref("beta"), literal("hello"))));
-  AssertParse({"","0_"},equal(field_ref("alpha"), literal(0)));
-  AssertParseError({"","world_0_"});    // reversed order
-  AssertParseError({"","0.0_foo_"});    // invalid alpha
-  AssertParseError({"","3.25_"});       // invalid alpha with missing beta
-  AssertParse({"",""}, literal(true));  // no segments to parse
+  AssertParse({"", "0_hello_"}, and_(equal(field_ref("alpha"), literal(0)),
+                                     equal(field_ref("beta"), literal("hello"))));
+  AssertParse({"", "0_"}, equal(field_ref("alpha"), literal(0)));
+  AssertParseError({"", "world_0_"});    // reversed order
+  AssertParseError({"", "0.0_foo_"});    // invalid alpha
+  AssertParseError({"", "3.25_"});       // invalid alpha with missing beta
+  AssertParse({"", ""}, literal(true));  // no segments to parse
 
-  AssertParse({"","0_foo_ignored=2341"}, and_(equal(field_ref("alpha"), literal(0)),
-                                          equal(field_ref("beta"), literal("foo"))));
+  AssertParse({"", "0_foo_ignored=2341"}, and_(equal(field_ref("alpha"), literal(0)),
+                                               equal(field_ref("beta"), literal("foo"))));
 }
 
 TEST_F(TestPartitioning, DirectoryPartitioningFormat) {
@@ -282,7 +284,7 @@ TEST_F(TestPartitioning, DirectoryPartitioningWithTemporal) {
         schema({field("year", int32()), field("month", int8()), field("day", temporal)}));
 
     ASSERT_OK_AND_ASSIGN(auto day, StringScalar("2020-06-08").CastTo(temporal));
-    AssertParse({"/2020/06/2020-06-08",""},
+    AssertParse({"/2020/06/2020-06-08", ""},
                 and_({equal(field_ref("year"), literal(2020)),
                       equal(field_ref("month"), literal<int8_t>(6)),
                       equal(field_ref("day"), literal(day))}));
@@ -384,10 +386,10 @@ TEST_F(TestPartitioning, DictionaryHasUniqueValues) {
         std::make_shared<DictionaryScalar>(index_and_dictionary, alpha->type());
 
     auto path = "/" + expected_dictionary->GetString(i);
-    AssertParse({path,""}, equal(field_ref("alpha"), literal(dictionary_scalar)));
+    AssertParse({path, ""}, equal(field_ref("alpha"), literal(dictionary_scalar)));
   }
 
-  AssertParseError({"/yosemite",""});  // not in inspected dictionary
+  AssertParseError({"/yosemite", ""});  // not in inspected dictionary
 }
 
 TEST_F(TestPartitioning, DiscoverSchemaSegfault) {
@@ -400,27 +402,28 @@ TEST_F(TestPartitioning, HivePartitioning) {
   partitioning_ = std::make_shared<HivePartitioning>(
       schema({field("alpha", int32()), field("beta", float32())}), ArrayVector(), "xyz");
 
-  AssertParse({"/alpha=0/beta=3.25",""}, and_(equal(field_ref("alpha"), literal(0)),
-                                         equal(field_ref("beta"), literal(3.25f))));
-  AssertParse({"/beta=3.25/alpha=0",""}, and_(equal(field_ref("beta"), literal(3.25f)),
-                                         equal(field_ref("alpha"), literal(0))));
-  AssertParse({"/alpha=0",""}, equal(field_ref("alpha"), literal(0)));
-  AssertParse({"/alpha=xyz/beta=3.25",""}, and_(is_null(field_ref("alpha")),
-                                           equal(field_ref("beta"), literal(3.25f))));
-  AssertParse({"/beta=3.25",""}, equal(field_ref("beta"), literal(3.25f)));
-  AssertParse({"",""}, literal(true));
-
-  AssertParse({"/alpha=0/unexpected/beta=3.25",""},
+  AssertParse({"/alpha=0/beta=3.25", ""}, and_(equal(field_ref("alpha"), literal(0)),
+                                               equal(field_ref("beta"), literal(3.25f))));
+  AssertParse({"/beta=3.25/alpha=0", ""}, and_(equal(field_ref("beta"), literal(3.25f)),
+                                               equal(field_ref("alpha"), literal(0))));
+  AssertParse({"/alpha=0", ""}, equal(field_ref("alpha"), literal(0)));
+  AssertParse(
+      {"/alpha=xyz/beta=3.25", ""},
+      and_(is_null(field_ref("alpha")), equal(field_ref("beta"), literal(3.25f))));
+  AssertParse({"/beta=3.25", ""}, equal(field_ref("beta"), literal(3.25f)));
+  AssertParse({"", ""}, literal(true));
+
+  AssertParse({"/alpha=0/beta=3.25/ignored=2341", ""},
               and_(equal(field_ref("alpha"), literal(0)),
                    equal(field_ref("beta"), literal(3.25f))));
 
-  AssertParse({"/alpha=0/beta=3.25/ignored=2341",""},
+  AssertParse({"/alpha=0/beta=3.25/ignored=2341", ""},
               and_(equal(field_ref("alpha"), literal(0)),
                    equal(field_ref("beta"), literal(3.25f))));
 
-  AssertParse({"/ignored=2341",""}, literal(true));
+  AssertParse({"/ignored=2341", ""}, literal(true));
 
-  AssertParseError({"/alpha=0.0/beta=3.25",""});  // conversion of "0.0" to int32 fails
+  AssertParseError({"/alpha=0.0/beta=3.25", ""});  // conversion of "0.0" to int32 fails
 }
 
 TEST_F(TestPartitioning, HivePartitioningFormat) {
@@ -558,10 +561,10 @@ TEST_F(TestPartitioning, HiveDictionaryHasUniqueValues) {
         std::make_shared<DictionaryScalar>(index_and_dictionary, alpha->type());
 
     auto path = "/alpha=" + expected_dictionary->GetString(i);
-    AssertParse({path,""}, equal(field_ref("alpha"), literal(dictionary_scalar)));
+    AssertParse({path, ""}, equal(field_ref("alpha"), literal(dictionary_scalar)));
   }
 
-  AssertParseError({"/alpha=yosemite",""});  // not in inspected dictionary
+  AssertParseError({"/alpha=yosemite", ""});  // not in inspected dictionary
 }
 
 TEST_F(TestPartitioning, ExistingSchemaDirectory) {
@@ -662,7 +665,7 @@ TEST_F(TestPartitioning, UrlEncodedDirectory) {
   auto date = std::make_shared<TimestampScalar>(1620086400, ts);
   auto time = std::make_shared<TimestampScalar>(1620113220, ts);
   partitioning_ = std::make_shared<DirectoryPartitioning>(options.schema, ArrayVector());
-  AssertParse({"/2021-05-04 00%3A00%3A00/2021-05-04 07%3A27%3A00/%24",""},
+  AssertParse({"/2021-05-04 00%3A00%3A00/2021-05-04 07%3A27%3A00/%24", ""},
               and_({equal(field_ref("date"), literal(date)),
                     equal(field_ref("time"), literal(time)),
                     equal(field_ref("str"), literal("$"))}));
@@ -671,7 +674,7 @@ TEST_F(TestPartitioning, UrlEncodedDirectory) {
   EXPECT_RAISES_WITH_MESSAGE_THAT(Invalid, ::testing::HasSubstr("was not valid UTF-8"),
                                   factory_->Inspect({"/%AF/%BF/%CF"}));
   EXPECT_RAISES_WITH_MESSAGE_THAT(Invalid, ::testing::HasSubstr("was not valid UTF-8"),
-                                  partitioning_->Parse({"/%AF/%BF/%CF",""}));
+                                  partitioning_->Parse({"/%AF/%BF/%CF", ""}));
 
   options.segment_encoding = SegmentEncoding::None;
   options.schema =
@@ -682,7 +685,7 @@ TEST_F(TestPartitioning, UrlEncodedDirectory) {
                 options.schema->fields());
   partitioning_ = std::make_shared<DirectoryPartitioning>(
       options.schema, ArrayVector(), options.AsPartitioningOptions());
-  AssertParse({"/2021-05-04 00%3A00%3A00/2021-05-04 07%3A27%3A00/%24",""},
+  AssertParse({"/2021-05-04 00%3A00%3A00/2021-05-04 07%3A27%3A00/%24", ""},
               and_({equal(field_ref("date"), literal("2021-05-04 00%3A00%3A00")),
                     equal(field_ref("time"), literal("2021-05-04 07%3A27%3A00")),
                     equal(field_ref("str"), literal("%24"))}));
@@ -705,23 +708,25 @@ TEST_F(TestPartitioning, UrlEncodedHive) {
   auto time = std::make_shared<TimestampScalar>(1620113220, ts);
   partitioning_ = std::make_shared<HivePartitioning>(options.schema, ArrayVector(),
                                                      options.AsHivePartitioningOptions());
-  AssertParse({"/date=2021-05-04 00:00:00/time=2021-05-04 07:27:00/str=$",""},
+  AssertParse({"/date=2021-05-04 00:00:00/time=2021-05-04 07:27:00/str=$", ""},
               and_({equal(field_ref("date"), literal(date)),
                     equal(field_ref("time"), literal(time)), is_null(field_ref("str"))}));
-  AssertParse({"/date=2021-05-04 00:00:00/time=2021-05-04 07:27:00/str=%E3%81%8F%E3%81%BE",""},
-              and_({equal(field_ref("date"), literal(date)),
-                    equal(field_ref("time"), literal(time)),
-                    equal(field_ref("str"), literal("\xE3\x81\x8F\xE3\x81\xBE"))}));
+  AssertParse(
+      {"/date=2021-05-04 00:00:00/time=2021-05-04 07:27:00/str=%E3%81%8F%E3%81%BE", ""},
+      and_({equal(field_ref("date"), literal(date)),
+            equal(field_ref("time"), literal(time)),
+            equal(field_ref("str"), literal("\xE3\x81\x8F\xE3\x81\xBE"))}));
   // URL-encoded null fallback value
-  AssertParse({"/date=2021-05-04 00%3A00%3A00/time=2021-05-04 07%3A27%3A00/str=%24",""},
+  AssertParse({"/date=2021-05-04 00%3A00%3A00/time=2021-05-04 07%3A27%3A00/str=%24", ""},
               and_({equal(field_ref("date"), literal(date)),
                     equal(field_ref("time"), literal(time)), is_null(field_ref("str"))}));
 
   // Invalid UTF-8
   EXPECT_RAISES_WITH_MESSAGE_THAT(Invalid, ::testing::HasSubstr("was not valid UTF-8"),
                                   factory_->Inspect({"/date=%AF/time=%BF/str=%CF"}));
-  EXPECT_RAISES_WITH_MESSAGE_THAT(Invalid, ::testing::HasSubstr("was not valid UTF-8"),
-                                  partitioning_->Parse({"/date=%AF/time=%BF/str=%CF",""}));
+  EXPECT_RAISES_WITH_MESSAGE_THAT(
+      Invalid, ::testing::HasSubstr("was not valid UTF-8"),
+      partitioning_->Parse({"/date=%AF/time=%BF/str=%CF", ""}));
 
   options.segment_encoding = SegmentEncoding::None;
   options.schema =
@@ -734,7 +739,7 @@ TEST_F(TestPartitioning, UrlEncodedHive) {
       options.schema->fields());
   partitioning_ = std::make_shared<HivePartitioning>(options.schema, ArrayVector(),
                                                      options.AsHivePartitioningOptions());
-  AssertParse({"/date=2021-05-04 00%3A00%3A00/time=2021-05-04 07%3A27%3A00/str=%24",""},
+  AssertParse({"/date=2021-05-04 00%3A00%3A00/time=2021-05-04 07%3A27%3A00/str=%24", ""},
               and_({equal(field_ref("date"), literal("2021-05-04 00%3A00%3A00")),
                     equal(field_ref("time"), literal("2021-05-04 07%3A27%3A00")),
                     equal(field_ref("str"), literal("%24"))}));
@@ -744,7 +749,7 @@ TEST_F(TestPartitioning, UrlEncodedHive) {
                                   factory_->Inspect({"/date=\xAF/time=\xBF/str=\xCF"}));
   EXPECT_RAISES_WITH_MESSAGE_THAT(
       Invalid, ::testing::HasSubstr("was not valid UTF-8"),
-      partitioning_->Parse({"/date=\xAF/time=\xBF/str=\xCF",""}));
+      partitioning_->Parse({"/date=\xAF/time=\xBF/str=\xCF", ""}));
 }
 
 TEST_F(TestPartitioning, UrlEncodedHiveWithKeyEncoded) {
@@ -769,19 +774,21 @@ TEST_F(TestPartitioning, UrlEncodedHiveWithKeyEncoded) {
                                                      options.AsHivePartitioningOptions());
   AssertParse(
       {"/test%27%3B%20date=2021-05-04 00:00:00/test%27%3B%20time=2021-05-04 "
-      "07:27:00/str=$",""},
+       "07:27:00/str=$",
+       ""},
       and_({equal(field_ref("test'; date"), literal(date)),
             equal(field_ref("test'; time"), literal(time)), is_null(field_ref("str"))}));
-  AssertParse(
-      {"/test%27%3B%20date=2021-05-04 00:00:00/test%27%3B%20time=2021-05-04 "
-      "07:27:00/str=%E3%81%8F%E3%81%BE",""},
-      and_({equal(field_ref("test'; date"), literal(date)),
-            equal(field_ref("test'; time"), literal(time)),
-            equal(field_ref("str"), literal("\xE3\x81\x8F\xE3\x81\xBE"))}));
+  AssertParse({"/test%27%3B%20date=2021-05-04 00:00:00/test%27%3B%20time=2021-05-04 "
+               "07:27:00/str=%E3%81%8F%E3%81%BE",
+               ""},
+              and_({equal(field_ref("test'; date"), literal(date)),
+                    equal(field_ref("test'; time"), literal(time)),
+                    equal(field_ref("str"), literal("\xE3\x81\x8F\xE3\x81\xBE"))}));
   // URL-encoded null fallback value
   AssertParse(
       {"/test%27%3B%20date=2021-05-04 00%3A00%3A00/test%27%3B%20time=2021-05-04 "
-      "07%3A27%3A00/str=%24",""},
+       "07%3A27%3A00/str=%24",
+       ""},
       and_({equal(field_ref("test'; date"), literal(date)),
             equal(field_ref("test'; time"), literal(time)), is_null(field_ref("str"))}));
 
@@ -791,7 +798,7 @@ TEST_F(TestPartitioning, UrlEncodedHiveWithKeyEncoded) {
       factory_->Inspect({"/%AF=2021-05-04/time=2021-05-04 07%3A27%3A00/str=%24"}));
   EXPECT_RAISES_WITH_MESSAGE_THAT(
       Invalid, ::testing::HasSubstr("was not valid UTF-8"),
-      partitioning_->Parse({"/%AF=2021-05-04/%BF=2021-05-04 07%3A27%3A00/str=%24",""}));
+      partitioning_->Parse({"/%AF=2021-05-04/%BF=2021-05-04 07%3A27%3A00/str=%24", ""}));
 }
 
 TEST_F(TestPartitioning, EtlThenHive) {
@@ -816,17 +823,18 @@ TEST_F(TestPartitioning, EtlThenHive) {
         auto etl_segments_end = segments.begin() + etl_fields.size();
         auto etl_path =
             fs::internal::JoinAbstractPath(segments.begin(), etl_segments_end);
-        ARROW_ASSIGN_OR_RAISE(auto etl_expr, etl_part.Parse({etl_path,""}));
+        ARROW_ASSIGN_OR_RAISE(auto etl_expr, etl_part.Parse({etl_path, ""}));
 
         auto alphabeta_segments_end = etl_segments_end + alphabeta_fields.size();
         auto alphabeta_path =
             fs::internal::JoinAbstractPath(etl_segments_end, alphabeta_segments_end);
-        ARROW_ASSIGN_OR_RAISE(auto alphabeta_expr, alphabeta_part.Parse({alphabeta_path,""}));
+        ARROW_ASSIGN_OR_RAISE(auto alphabeta_expr,
+                              alphabeta_part.Parse({alphabeta_path, ""}));
 
         return and_(etl_expr, alphabeta_expr);
       });
 
-  AssertParse({"/1999/12/31/00/alpha=0/beta=3.25",""},
+  AssertParse({"/1999/12/31/00/alpha=0/beta=3.25", ""},
               and_({equal(field_ref("year"), literal<int16_t>(1999)),
                     equal(field_ref("month"), literal<int8_t>(12)),
                     equal(field_ref("day"), literal<int8_t>(31)),
@@ -834,7 +842,7 @@ TEST_F(TestPartitioning, EtlThenHive) {
                     and_(equal(field_ref("alpha"), literal<int32_t>(0)),
                          equal(field_ref("beta"), literal<float>(3.25f)))}));
 
-  AssertParseError({"/20X6/03/21/05/alpha=0/beta=3.25",""});
+  AssertParseError({"/20X6/03/21/05/alpha=0/beta=3.25", ""});
 }
 
 TEST_F(TestPartitioning, Set) {
@@ -875,9 +883,9 @@ TEST_F(TestPartitioning, Set) {
   auto x_in = [&](std::vector<int32_t> set) {
     return call("is_in", {field_ref("x")}, compute::SetLookupOptions{ints(set)});
   };
-  AssertParse({"/x in [1]",""}, x_in({1}));
-  AssertParse({"/x in [1 4 5]",""}, x_in({1, 4, 5}));
-  AssertParse({"/x in []",""}, x_in({}));
+  AssertParse({"/x in [1]", ""}, x_in({1}));
+  AssertParse({"/x in [1 4 5]", ""}, x_in({1, 4, 5}));
+  AssertParse({"/x in []", ""}, x_in({}));
 }
 
 // An adhoc partitioning which parses segments like "/x=[-3.25, 0.0)"
@@ -947,7 +955,7 @@ TEST_F(TestPartitioning, Range) {
   partitioning_ = std::make_shared<RangePartitioning>(
       schema({field("x", float64()), field("y", float64()), field("z", float64())}));
 
-  AssertParse({"/x=[-1.5 0.0)/y=[0.0 1.5)/z=(1.5 3.0]",""},
+  AssertParse({"/x=[-1.5 0.0)/y=[0.0 1.5)/z=(1.5 3.0]", ""},
               and_({and_(greater_equal(field_ref("x"), literal(-1.5)),
                          less(field_ref("x"), literal(0.0))),
                     and_(greater_equal(field_ref("y"), literal(0.0)),
diff --git a/python/pyarrow/_dataset.pyx b/python/pyarrow/_dataset.pyx
index 6b1b8a4ced3..7fff11b9231 100644
--- a/python/pyarrow/_dataset.pyx
+++ b/python/pyarrow/_dataset.pyx
@@ -1321,7 +1321,10 @@ cdef class Partitioning(_Weakrefable):
 
     def parse(self, directory="", prefix=""):
         cdef CResult[CExpression] result
-        result = self.partitioning.Parse(tobytes(directory), tobytes(prefix))
+        cdef CPartitionPathFormat path
+        path.directory = tobytes(directory)
+        path.prefix = tobytes(prefix)
+        result = self.partitioning.Parse(path)
         return Expression.wrap(GetResultValue(result))
 
     @property
diff --git a/python/pyarrow/includes/libarrow_dataset.pxd b/python/pyarrow/includes/libarrow_dataset.pxd
index bd7375aad81..71b76c6c48c 100644
--- a/python/pyarrow/includes/libarrow_dataset.pxd
+++ b/python/pyarrow/includes/libarrow_dataset.pxd
@@ -277,9 +277,13 @@ cdef extern from "arrow/dataset/api.h" namespace "arrow::dataset" nogil:
         CCSVConvertOptions convert_options
         CCSVReadOptions read_options
 
+    cdef cppclass CPartitionPathFormat "arrow::dataset::PartitionPathFormat":
+        c_string directory
+        c_string prefix
+
     cdef cppclass CPartitioning "arrow::dataset::Partitioning":
         c_string type_name() const
-        CResult[CExpression] Parse(const c_string & directory, const c_string & prefix) const
+        CResult[CExpression] Parse(const CPartitionPathFormat & path) const
         const shared_ptr[CSchema] & schema()
 
     cdef cppclass CSegmentEncoding" arrow::dataset::SegmentEncoding":

From 6d8c4f272ea7275fbf9403e0051bcdca62b86f45 Mon Sep 17 00:00:00 2001
From: Sanjiban Sengupta <sanjiban.sg@gmail.com>
Date: Fri, 29 Apr 2022 21:52:55 +0530
Subject: [PATCH 09/24] docs: update docstring of StripPrefixAndFilename

---
 cpp/src/arrow/dataset/partition.cc | 3 ++-
 cpp/src/arrow/dataset/partition.h  | 5 +++--
 2 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/cpp/src/arrow/dataset/partition.cc b/cpp/src/arrow/dataset/partition.cc
index 3a9294fe0d5..5fba5610381 100644
--- a/cpp/src/arrow/dataset/partition.cc
+++ b/cpp/src/arrow/dataset/partition.cc
@@ -810,7 +810,8 @@ PartitionPathFormat StripPrefixAndFilename(const std::string& path,
   auto maybe_base_less = fs::internal::RemoveAncestor(prefix, path);
   auto base_less = maybe_base_less ? std::string(*maybe_base_less) : path;
   auto basename_filename = fs::internal::GetAbstractPathParent(base_less);
-  return PartitionPathFormat{std::move(basename_filename.first), std::move(basename_filename.second)};
+  return PartitionPathFormat{std::move(basename_filename.first),
+                             std::move(basename_filename.second)};
 }
 
 std::vector<std::string> StripPrefixAndFilename(const std::vector<std::string>& paths,
diff --git a/cpp/src/arrow/dataset/partition.h b/cpp/src/arrow/dataset/partition.h
index 897581809eb..e4846ef7d08 100644
--- a/cpp/src/arrow/dataset/partition.h
+++ b/cpp/src/arrow/dataset/partition.h
@@ -358,9 +358,10 @@ class ARROW_DS_EXPORT FilenamePartitioning : public KeyValuePartitioning {
   Result<PartitionPathFormat> FormatValues(const ScalarVector& values) const override;
 };
 
-/// \brief Remove a prefix and the filename of a path.
+/// \brief Extracts the directory and filename and removes the prefix of a path
 ///
-/// e.g., `StripPrefixAndFilename("/data/year=2019/c.txt", "/data") -> "year=2019"`
+/// e.g., `StripPrefixAndFilename("/data/year=2019/c.txt", "/data") ->
+/// {"year=2019","c.txt"}`
 ARROW_DS_EXPORT PartitionPathFormat StripPrefixAndFilename(const std::string& path,
                                                            const std::string& prefix);
 

From 6876f21589aacee059178d67b597dff73428153f Mon Sep 17 00:00:00 2001
From: Sanjiban Sengupta <sanjiban.sg@gmail.com>
Date: Fri, 29 Apr 2022 23:56:25 +0530
Subject: [PATCH 10/24] refactor: rename prefix to filename in
 PartitionPathFormat

---
 cpp/src/arrow/dataset/partition.cc           | 4 ++--
 cpp/src/arrow/dataset/partition.h            | 2 +-
 cpp/src/arrow/dataset/partition_test.cc      | 4 ++--
 python/pyarrow/_dataset.pyx                  | 4 ++--
 python/pyarrow/includes/libarrow_dataset.pxd | 2 +-
 5 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/cpp/src/arrow/dataset/partition.cc b/cpp/src/arrow/dataset/partition.cc
index 5fba5610381..6e636b88cc0 100644
--- a/cpp/src/arrow/dataset/partition.cc
+++ b/cpp/src/arrow/dataset/partition.cc
@@ -391,8 +391,8 @@ FilenamePartitioning::FilenamePartitioning(std::shared_ptr<Schema> schema,
 
 Result<std::vector<KeyValuePartitioning::Key>> FilenamePartitioning::ParseKeys(
     const PartitionPathFormat& path) const {
-  std::vector<std::string> segments =
-      fs::internal::SplitAbstractPath(StripNonPrefix(path.prefix), kFilenamePartitionSep);
+  std::vector<std::string> segments = fs::internal::SplitAbstractPath(
+      StripNonPrefix(path.filename), kFilenamePartitionSep);
   return ParsePartitionSegments(segments);
 }
 
diff --git a/cpp/src/arrow/dataset/partition.h b/cpp/src/arrow/dataset/partition.h
index e4846ef7d08..e18c1d6ddbf 100644
--- a/cpp/src/arrow/dataset/partition.h
+++ b/cpp/src/arrow/dataset/partition.h
@@ -39,7 +39,7 @@ namespace dataset {
 constexpr char kFilenamePartitionSep = '_';
 
 struct ARROW_DS_EXPORT PartitionPathFormat {
-  std::string directory, prefix;
+  std::string directory, filename;
 };
 
 // ----------------------------------------------------------------------
diff --git a/cpp/src/arrow/dataset/partition_test.cc b/cpp/src/arrow/dataset/partition_test.cc
index 728ad83931a..157a9c3af44 100644
--- a/cpp/src/arrow/dataset/partition_test.cc
+++ b/cpp/src/arrow/dataset/partition_test.cc
@@ -58,12 +58,12 @@ class TestPartitioning : public ::testing::Test {
   }
 
   void AssertFormat(compute::Expression expr, const std::string& expected_directory,
-                    const std::string& expected_prefix = "") {
+                    const std::string& expected_filename = "") {
     // formatted partition expressions are bound to the schema of the dataset being
     // written
     ASSERT_OK_AND_ASSIGN(auto formatted, partitioning_->Format(expr));
     ASSERT_EQ(formatted.directory, expected_directory);
-    ASSERT_EQ(formatted.prefix, expected_prefix);
+    ASSERT_EQ(formatted.filename, expected_filename);
 
     // ensure the formatted path round trips the relevant components of the partition
     // expression: roundtripped should be a subset of expr
diff --git a/python/pyarrow/_dataset.pyx b/python/pyarrow/_dataset.pyx
index 7fff11b9231..d75caeacdc4 100644
--- a/python/pyarrow/_dataset.pyx
+++ b/python/pyarrow/_dataset.pyx
@@ -1319,11 +1319,11 @@ cdef class Partitioning(_Weakrefable):
     cdef inline shared_ptr[CPartitioning] unwrap(self):
         return self.wrapped
 
-    def parse(self, directory="", prefix=""):
+    def parse(self, directory="", filename=""):
         cdef CResult[CExpression] result
         cdef CPartitionPathFormat path
         path.directory = tobytes(directory)
-        path.prefix = tobytes(prefix)
+        path.filename = tobytes(filename)
         result = self.partitioning.Parse(path)
         return Expression.wrap(GetResultValue(result))
 
diff --git a/python/pyarrow/includes/libarrow_dataset.pxd b/python/pyarrow/includes/libarrow_dataset.pxd
index 71b76c6c48c..8a831a3a734 100644
--- a/python/pyarrow/includes/libarrow_dataset.pxd
+++ b/python/pyarrow/includes/libarrow_dataset.pxd
@@ -279,7 +279,7 @@ cdef extern from "arrow/dataset/api.h" namespace "arrow::dataset" nogil:
 
     cdef cppclass CPartitionPathFormat "arrow::dataset::PartitionPathFormat":
         c_string directory
-        c_string prefix
+        c_string filename
 
     cdef cppclass CPartitioning "arrow::dataset::Partitioning":
         c_string type_name() const

From ec9357956bac0d19c0a66e1402214a7e1c4435ea Mon Sep 17 00:00:00 2001
From: Sanjiban Sengupta <sanjiban.sg@gmail.com>
Date: Sat, 30 Apr 2022 03:38:43 +0530
Subject: [PATCH 11/24] feat: Inspect() methods to use PartitionPathFormat

---
 cpp/src/arrow/dataset/file_parquet.cc   |   5 +-
 cpp/src/arrow/dataset/partition.cc      |  32 ++--
 cpp/src/arrow/dataset/partition.h       |   9 +-
 cpp/src/arrow/dataset/partition_test.cc | 217 +++++++++++++-----------
 4 files changed, 145 insertions(+), 118 deletions(-)

diff --git a/cpp/src/arrow/dataset/file_parquet.cc b/cpp/src/arrow/dataset/file_parquet.cc
index a1e9698538e..cd0f1973081 100644
--- a/cpp/src/arrow/dataset/file_parquet.cc
+++ b/cpp/src/arrow/dataset/file_parquet.cc
@@ -916,12 +916,11 @@ Result<std::vector<std::shared_ptr<Schema>>> ParquetDatasetFactory::InspectSchem
 
   if (auto factory = options_.partitioning.factory()) {
     // Gather paths found in RowGroups' ColumnChunks.
-    std::vector<std::string> stripped(paths_with_row_group_ids_.size());
+    std::vector<PartitionPathFormat> stripped(paths_with_row_group_ids_.size());
 
     size_t i = 0;
     for (const auto& e : paths_with_row_group_ids_) {
-      stripped[i++] =
-          StripPrefixAndFilename(e.first, options_.partition_base_dir).directory;
+      stripped[i++] = StripPrefixAndFilename(e.first, options_.partition_base_dir);
     }
     ARROW_ASSIGN_OR_RAISE(auto partition_schema, factory->Inspect(stripped));
 
diff --git a/cpp/src/arrow/dataset/partition.cc b/cpp/src/arrow/dataset/partition.cc
index 6e636b88cc0..5d7457141cf 100644
--- a/cpp/src/arrow/dataset/partition.cc
+++ b/cpp/src/arrow/dataset/partition.cc
@@ -581,10 +581,10 @@ class DirectoryPartitioningFactory : public KeyValuePartitioningFactory {
   std::string type_name() const override { return "directory"; }
 
   Result<std::shared_ptr<Schema>> Inspect(
-      const std::vector<std::string>& paths) override {
+      const std::vector<PartitionPathFormat>& paths) override {
     for (auto path : paths) {
       std::vector<std::string> segments;
-      segments = fs::internal::SplitAbstractPath(path);
+      segments = fs::internal::SplitAbstractPath(path.directory);
       RETURN_NOT_OK(InspectPartitionSegments(segments, field_names_));
     }
     return DoInspect();
@@ -628,11 +628,11 @@ class FilenamePartitioningFactory : public KeyValuePartitioningFactory {
   std::string type_name() const override { return "filename"; }
 
   Result<std::shared_ptr<Schema>> Inspect(
-      const std::vector<std::string>& paths) override {
+      const std::vector<PartitionPathFormat>& paths) override {
     for (const auto& path : paths) {
       std::vector<std::string> segments;
-      segments =
-          fs::internal::SplitAbstractPath(StripNonPrefix(path), kFilenamePartitionSep);
+      segments = fs::internal::SplitAbstractPath(StripNonPrefix(path.filename),
+                                                 kFilenamePartitionSep);
       RETURN_NOT_OK(InspectPartitionSegments(segments, field_names_));
     }
     return DoInspect();
@@ -761,10 +761,10 @@ class HivePartitioningFactory : public KeyValuePartitioningFactory {
   std::string type_name() const override { return "hive"; }
 
   Result<std::shared_ptr<Schema>> Inspect(
-      const std::vector<std::string>& paths) override {
+      const std::vector<PartitionPathFormat>& paths) override {
     auto options = options_.AsHivePartitioningOptions();
     for (auto path : paths) {
-      for (auto&& segment : fs::internal::SplitAbstractPath(path)) {
+      for (auto&& segment : fs::internal::SplitAbstractPath(path.directory)) {
         ARROW_ASSIGN_OR_RAISE(auto maybe_key,
                               HivePartitioning::ParseKey(segment, options));
         if (auto key = maybe_key) {
@@ -814,28 +814,28 @@ PartitionPathFormat StripPrefixAndFilename(const std::string& path,
                              std::move(basename_filename.second)};
 }
 
-std::vector<std::string> StripPrefixAndFilename(const std::vector<std::string>& paths,
-                                                const std::string& prefix) {
-  std::vector<std::string> result;
+std::vector<PartitionPathFormat> StripPrefixAndFilename(
+    const std::vector<std::string>& paths, const std::string& prefix) {
+  std::vector<PartitionPathFormat> result;
   result.reserve(paths.size());
   for (const auto& path : paths) {
-    result.emplace_back(StripPrefixAndFilename(path, prefix).directory);
+    result.emplace_back(StripPrefixAndFilename(path, prefix));
   }
   return result;
 }
 
-std::vector<std::string> StripPrefixAndFilename(const std::vector<fs::FileInfo>& files,
-                                                const std::string& prefix) {
-  std::vector<std::string> result;
+std::vector<PartitionPathFormat> StripPrefixAndFilename(
+    const std::vector<fs::FileInfo>& files, const std::string& prefix) {
+  std::vector<PartitionPathFormat> result;
   result.reserve(files.size());
   for (const auto& info : files) {
-    result.emplace_back(StripPrefixAndFilename(info.path(), prefix).directory);
+    result.emplace_back(StripPrefixAndFilename(info.path(), prefix));
   }
   return result;
 }
 
 Result<std::shared_ptr<Schema>> PartitioningOrFactory::GetOrInferSchema(
-    const std::vector<std::string>& paths) {
+    const std::vector<PartitionPathFormat>& paths) {
   if (auto part = partitioning()) {
     return part->schema();
   }
diff --git a/cpp/src/arrow/dataset/partition.h b/cpp/src/arrow/dataset/partition.h
index e18c1d6ddbf..fe20e837619 100644
--- a/cpp/src/arrow/dataset/partition.h
+++ b/cpp/src/arrow/dataset/partition.h
@@ -152,7 +152,7 @@ class ARROW_DS_EXPORT PartitioningFactory {
   /// Get the schema for the resulting Partitioning.
   /// This may reset internal state, for example dictionaries of unique representations.
   virtual Result<std::shared_ptr<Schema>> Inspect(
-      const std::vector<std::string>& paths) = 0;
+      const std::vector<PartitionPathFormat>& paths) = 0;
 
   /// Create a partitioning using the provided schema
   /// (fields may be dropped).
@@ -366,11 +366,11 @@ ARROW_DS_EXPORT PartitionPathFormat StripPrefixAndFilename(const std::string& pa
                                                            const std::string& prefix);
 
 /// \brief Vector version of StripPrefixAndFilename.
-ARROW_DS_EXPORT std::vector<std::string> StripPrefixAndFilename(
+ARROW_DS_EXPORT std::vector<PartitionPathFormat> StripPrefixAndFilename(
     const std::vector<std::string>& paths, const std::string& prefix);
 
 /// \brief Vector version of StripPrefixAndFilename.
-ARROW_DS_EXPORT std::vector<std::string> StripPrefixAndFilename(
+ARROW_DS_EXPORT std::vector<PartitionPathFormat> StripPrefixAndFilename(
     const std::vector<fs::FileInfo>& files, const std::string& prefix);
 
 /// \brief Either a Partitioning or a PartitioningFactory
@@ -397,7 +397,8 @@ class ARROW_DS_EXPORT PartitioningOrFactory {
   const std::shared_ptr<PartitioningFactory>& factory() const { return factory_; }
 
   /// \brief Get the partition schema, inferring it with the given factory if needed.
-  Result<std::shared_ptr<Schema>> GetOrInferSchema(const std::vector<std::string>& paths);
+  Result<std::shared_ptr<Schema>> GetOrInferSchema(
+      const std::vector<PartitionPathFormat>& paths);
 
  private:
   std::shared_ptr<PartitioningFactory> factory_;
diff --git a/cpp/src/arrow/dataset/partition_test.cc b/cpp/src/arrow/dataset/partition_test.cc
index 157a9c3af44..fe84da843cf 100644
--- a/cpp/src/arrow/dataset/partition_test.cc
+++ b/cpp/src/arrow/dataset/partition_test.cc
@@ -74,7 +74,7 @@ class TestPartitioning : public ::testing::Test {
     ASSERT_EQ(simplified, literal(true));
   }
 
-  void AssertInspect(const std::vector<std::string>& paths,
+  void AssertInspect(const std::vector<PartitionPathFormat>& paths,
                      const std::vector<std::shared_ptr<Field>>& expected) {
     ASSERT_OK_AND_ASSIGN(auto actual, factory_->Inspect(paths));
     ASSERT_EQ(*actual, Schema(expected));
@@ -122,7 +122,7 @@ class TestPartitioning : public ::testing::Test {
     AssertPartition(partitioning, record_batch, expected_batches, expected_expressions);
   }
 
-  void AssertInspectError(const std::vector<std::string>& paths) {
+  void AssertInspectError(const std::vector<PartitionPathFormat>& paths) {
     ASSERT_RAISES(Invalid, factory_->Inspect(paths));
   }
 
@@ -295,39 +295,39 @@ TEST_F(TestPartitioning, DiscoverSchemaDirectory) {
   factory_ = DirectoryPartitioning::MakeFactory({"alpha", "beta"});
 
   // type is int32 if possible
-  AssertInspect({"/0/1"}, {Int("alpha"), Int("beta")});
+  AssertInspect({{"/0/1", ""}}, {Int("alpha"), Int("beta")});
 
   // extra segments are ignored
-  AssertInspect({"/0/1/what"}, {Int("alpha"), Int("beta")});
+  AssertInspect({{"/0/1/what", ""}}, {Int("alpha"), Int("beta")});
 
   // fall back to string if any segment for field alpha is not parseable as int
-  AssertInspect({"/0/1", "/hello/1"}, {Str("alpha"), Int("beta")});
+  AssertInspect({{"/0/1", ""}, {"/hello/1", ""}}, {Str("alpha"), Int("beta")});
 
   // If there are too many digits fall back to string
-  AssertInspect({"/3760212050/1"}, {Str("alpha"), Int("beta")});
+  AssertInspect({{"/3760212050/1", ""}}, {Str("alpha"), Int("beta")});
 
   // missing segment for beta doesn't cause an error or fallback
-  AssertInspect({"/0/1", "/hello"}, {Str("alpha"), Int("beta")});
+  AssertInspect({{"/0/1", ""}, {"/hello", ""}}, {Str("alpha"), Int("beta")});
 }
 
 TEST_F(TestPartitioning, DiscoverSchemaFilename) {
   factory_ = FilenamePartitioning::MakeFactory({"alpha", "beta"});
 
   // type is int32 if possible
-  AssertInspect({"0_1_"}, {Int("alpha"), Int("beta")});
+  AssertInspect({{"", "0_1_"}}, {Int("alpha"), Int("beta")});
 
   // extra segments are ignored
-  AssertInspect({"0_1_what_"}, {Int("alpha"), Int("beta")});
+  AssertInspect({{"", "0_1_what_"}}, {Int("alpha"), Int("beta")});
 
   // fall back to string if any segment for field alpha is not parseable as int
-  AssertInspect({"0_1_", "hello_1_"}, {Str("alpha"), Int("beta")});
+  AssertInspect({{"", "0_1_"}, {"", "hello_1_"}}, {Str("alpha"), Int("beta")});
 
   // If there are too many digits fall back to string
-  AssertInspect({"3760212050_1_"}, {Str("alpha"), Int("beta")});
+  AssertInspect({{"", "3760212050_1_"}}, {Str("alpha"), Int("beta")});
 
   // Invalid syntax
-  AssertInspectError({"234-12"});
-  AssertInspectError({"hello"});
+  AssertInspectError({{"", "234-12"}});
+  AssertInspectError({{"", "hello"}});
 }
 
 TEST_F(TestPartitioning, DirectoryDictionaryInference) {
@@ -336,18 +336,21 @@ TEST_F(TestPartitioning, DirectoryDictionaryInference) {
   factory_ = DirectoryPartitioning::MakeFactory({"alpha", "beta"}, options);
 
   // type is still int32 if possible
-  AssertInspect({"/0/1"}, {DictInt("alpha"), DictInt("beta")});
+  AssertInspect({{"/0/1", ""}}, {DictInt("alpha"), DictInt("beta")});
 
   // If there are too many digits fall back to string
-  AssertInspect({"/3760212050/1"}, {DictStr("alpha"), DictInt("beta")});
+  AssertInspect({{"/3760212050/1", ""}}, {DictStr("alpha"), DictInt("beta")});
 
   // successful dictionary inference
-  AssertInspect({"/a/0"}, {DictStr("alpha"), DictInt("beta")});
-  AssertInspect({"/a/0", "/a/1"}, {DictStr("alpha"), DictInt("beta")});
-  AssertInspect({"/a/0", "/a"}, {DictStr("alpha"), DictInt("beta")});
-  AssertInspect({"/0/a", "/1"}, {DictInt("alpha"), DictStr("beta")});
-  AssertInspect({"/a/0", "/b/0", "/a/1", "/b/1"}, {DictStr("alpha"), DictInt("beta")});
-  AssertInspect({"/a/-", "/b/-", "/a/_", "/b/_"}, {DictStr("alpha"), DictStr("beta")});
+  AssertInspect({{"/a/0", ""}}, {DictStr("alpha"), DictInt("beta")});
+  AssertInspect({{"/a/0", ""}, {"/a/1", ""}}, {DictStr("alpha"), DictInt("beta")});
+  AssertInspect({{"/a/0", ""}, {"/a", ""}}, {DictStr("alpha"), DictInt("beta")});
+  AssertInspect({{"/a/0", ""}, {"/b/0", ""}, {"/a/1", ""}, {"/b/1", ""}},
+                {DictStr("alpha"), DictInt("beta")});
+  AssertInspect({{"/a/-", ""}, {"/b/-", ""}, {"/a/_", ""}, {"/b/_", ""}},
+                {DictStr("alpha"), DictStr("beta")});
+  AssertInspect({{"/a/-", ""}, {"/b/-", ""}, {"/a/_", ""}, {"/b/_", ""}},
+                {DictStr("alpha"), DictStr("beta")});
 }
 
 TEST_F(TestPartitioning, FilenameDictionaryInference) {
@@ -356,15 +359,16 @@ TEST_F(TestPartitioning, FilenameDictionaryInference) {
   factory_ = FilenamePartitioning::MakeFactory({"alpha", "beta"}, options);
 
   // type is still int32 if possible
-  AssertInspect({"0_1_"}, {DictInt("alpha"), DictInt("beta")});
+  AssertInspect({{"", "0_1_"}}, {DictInt("alpha"), DictInt("beta")});
 
   // If there are too many digits fall back to string
-  AssertInspect({"3760212050_1_"}, {DictStr("alpha"), DictInt("beta")});
+  AssertInspect({{"", "3760212050_1_"}}, {DictStr("alpha"), DictInt("beta")});
 
   // successful dictionary inference
-  AssertInspect({"a_0_"}, {DictStr("alpha"), DictInt("beta")});
-  AssertInspect({"a_0_", "a_1_"}, {DictStr("alpha"), DictInt("beta")});
-  AssertInspect({"a_0_", "b_0_", "a_1_", "b_1_"}, {DictStr("alpha"), DictInt("beta")});
+  AssertInspect({{"", "a_0_"}}, {DictStr("alpha"), DictInt("beta")});
+  AssertInspect({{"", "a_0_"}, {"", "a_1_"}}, {DictStr("alpha"), DictInt("beta")});
+  AssertInspect({{"", "a_0_"}, {"", "b_0_"}, {"", "a_1_"}, {"", "b_1_"}},
+                {DictStr("alpha"), DictInt("beta")});
 }
 
 TEST_F(TestPartitioning, DictionaryHasUniqueValues) {
@@ -373,7 +377,8 @@ TEST_F(TestPartitioning, DictionaryHasUniqueValues) {
   factory_ = DirectoryPartitioning::MakeFactory({"alpha"}, options);
 
   auto alpha = DictStr("alpha");
-  AssertInspect({"/a", "/b", "/a", "/b", "/c", "/a"}, {alpha});
+  AssertInspect({{"/a", ""}, {"/b", ""}, {"/a", ""}, {"/b", ""}, {"/c", ""}, {"/a", ""}},
+                {alpha});
   ASSERT_OK_AND_ASSIGN(auto partitioning, factory_->Finish(schema({alpha})));
 
   auto expected_dictionary =
@@ -395,7 +400,7 @@ TEST_F(TestPartitioning, DictionaryHasUniqueValues) {
 TEST_F(TestPartitioning, DiscoverSchemaSegfault) {
   // ARROW-7638
   factory_ = DirectoryPartitioning::MakeFactory({"alpha", "beta"});
-  AssertInspectError({"oops.txt"});
+  AssertInspectError({{"oops.txt", ""}});
 }
 
 TEST_F(TestPartitioning, HivePartitioning) {
@@ -481,27 +486,29 @@ TEST_F(TestPartitioning, DiscoverHiveSchema) {
   factory_ = HivePartitioning::MakeFactory(options);
 
   // type is int32 if possible
-  AssertInspect({"/alpha=0/beta=1"}, {Int("alpha"), Int("beta")});
+  AssertInspect({{"/alpha=0/beta=1", ""}}, {Int("alpha"), Int("beta")});
 
   // extra segments are ignored
-  AssertInspect({"/gamma=0/unexpected/delta=1/dat.parquet"},
+  AssertInspect({{"/gamma=0/unexpected/delta=1/dat.parquet", ""}},
                 {Int("gamma"), Int("delta")});
 
   // schema field names are in order of first occurrence
   // (...so ensure your partitions are ordered the same for all paths)
-  AssertInspect({"/alpha=0/beta=1", "/beta=2/alpha=3"}, {Int("alpha"), Int("beta")});
+  AssertInspect({{"/alpha=0/beta=1", ""}, {"/beta=2/alpha=3", ""}},
+                {Int("alpha"), Int("beta")});
 
   // Null fallback strings shouldn't interfere with type inference
-  AssertInspect({"/alpha=xyz/beta=x", "/alpha=7/beta=xyz"}, {Int("alpha"), Str("beta")});
+  AssertInspect({{"/alpha=xyz/beta=x", ""}, {"/alpha=7/beta=xyz", ""}},
+                {Int("alpha"), Str("beta")});
 
   // Cannot infer if the only values are null
-  AssertInspectError({"/alpha=xyz"});
+  AssertInspectError({{"/alpha=xyz", ""}});
 
   // If there are too many digits fall back to string
-  AssertInspect({"/alpha=3760212050"}, {Str("alpha")});
+  AssertInspect({{"/alpha=3760212050", ""}}, {Str("alpha")});
 
   // missing path segments will not cause an error
-  AssertInspect({"/alpha=0/beta=1", "/beta=2/alpha=3", "/gamma=what"},
+  AssertInspect({{"/alpha=0/beta=1", ""}, {"/beta=2/alpha=3", ""}, {"/gamma=what", ""}},
                 {Int("alpha"), Int("beta"), Str("gamma")});
 }
 
@@ -512,22 +519,27 @@ TEST_F(TestPartitioning, HiveDictionaryInference) {
   factory_ = HivePartitioning::MakeFactory(options);
 
   // type is still int32 if possible
-  AssertInspect({"/alpha=0/beta=1"}, {DictInt("alpha"), DictInt("beta")});
+  AssertInspect({{"/alpha=0/beta=1", ""}}, {DictInt("alpha"), DictInt("beta")});
 
   // If there are too many digits fall back to string
-  AssertInspect({"/alpha=3760212050"}, {DictStr("alpha")});
+  AssertInspect({{"/alpha=3760212050", ""}}, {DictStr("alpha")});
 
   // successful dictionary inference
-  AssertInspect({"/alpha=a/beta=0"}, {DictStr("alpha"), DictInt("beta")});
-  AssertInspect({"/alpha=a/beta=0", "/alpha=a/1"}, {DictStr("alpha"), DictInt("beta")});
-  AssertInspect({"/alpha=a/beta=0", "/alpha=xyz/beta=xyz"},
+  AssertInspect({{"/alpha=a/beta=0", ""}}, {DictStr("alpha"), DictInt("beta")});
+  AssertInspect({{"/alpha=a/beta=0", ""}, {"/alpha=a/1", ""}},
                 {DictStr("alpha"), DictInt("beta")});
-  AssertInspect(
-      {"/alpha=a/beta=0", "/alpha=b/beta=0", "/alpha=a/beta=1", "/alpha=b/beta=1"},
-      {DictStr("alpha"), DictInt("beta")});
-  AssertInspect(
-      {"/alpha=a/beta=-", "/alpha=b/beta=-", "/alpha=a/beta=_", "/alpha=b/beta=_"},
-      {DictStr("alpha"), DictStr("beta")});
+  AssertInspect({{"/alpha=a/beta=0", ""}, {"/alpha=xyz/beta=xyz", ""}},
+                {DictStr("alpha"), DictInt("beta")});
+  AssertInspect({{"/alpha=a/beta=0", ""},
+                 {"/alpha=b/beta=0", ""},
+                 {"/alpha=a/beta=1", ""},
+                 {"/alpha=b/beta=1", ""}},
+                {DictStr("alpha"), DictInt("beta")});
+  AssertInspect({{"/alpha=a/beta=-", ""},
+                 {"/alpha=b/beta=-", ""},
+                 {"/alpha=a/beta=_", ""},
+                 {"/alpha=b/beta=_", ""}},
+                {DictStr("alpha"), DictStr("beta")});
 }
 
 TEST_F(TestPartitioning, HiveNullFallbackPassedOn) {
@@ -535,7 +547,7 @@ TEST_F(TestPartitioning, HiveNullFallbackPassedOn) {
   options.null_fallback = "xyz";
   factory_ = HivePartitioning::MakeFactory(options);
 
-  EXPECT_OK_AND_ASSIGN(auto schema, factory_->Inspect({"/alpha=a/beta=0"}));
+  EXPECT_OK_AND_ASSIGN(auto schema, factory_->Inspect({{"/alpha=a/beta=0", ""}}));
   EXPECT_OK_AND_ASSIGN(auto partitioning, factory_->Finish(schema));
   ASSERT_EQ("xyz",
             std::static_pointer_cast<HivePartitioning>(partitioning)->null_fallback());
@@ -547,7 +559,12 @@ TEST_F(TestPartitioning, HiveDictionaryHasUniqueValues) {
   factory_ = HivePartitioning::MakeFactory(options);
 
   auto alpha = DictStr("alpha");
-  AssertInspect({"/alpha=a", "/alpha=b", "/alpha=a", "/alpha=b", "/alpha=c", "/alpha=a"},
+  AssertInspect({{"/alpha=a", ""},
+                 {"/alpha=b", ""},
+                 {"/alpha=a", ""},
+                 {"/alpha=b", ""},
+                 {"/alpha=c", ""},
+                 {"/alpha=a", ""}},
                 {alpha});
   ASSERT_OK_AND_ASSIGN(auto partitioning, factory_->Finish(schema({alpha})));
 
@@ -574,22 +591,23 @@ TEST_F(TestPartitioning, ExistingSchemaDirectory) {
   options.schema = schema({field("alpha", int64()), field("beta", dict_type)});
   factory_ = DirectoryPartitioning::MakeFactory({"alpha", "beta"}, options);
 
-  AssertInspect({"/0/1"}, options.schema->fields());
-  AssertInspect({"/0/1/what"}, options.schema->fields());
+  AssertInspect({{"/0/1", ""}}, options.schema->fields());
+  AssertInspect({{"/0/1/what", ""}}, options.schema->fields());
 
   // fail if any segment is not parseable as schema type
   EXPECT_RAISES_WITH_MESSAGE_THAT(Invalid, ::testing::HasSubstr("Failed to parse string"),
-                                  factory_->Inspect({"/0/1", "/hello/1"}));
+                                  factory_->Inspect({{"/0/1", ""}, {"/hello/1", ""}}));
   factory_ = DirectoryPartitioning::MakeFactory({"alpha", "beta"}, options);
 
   // Now we don't fail since our type is large enough
-  AssertInspect({"/3760212050/1"}, options.schema->fields());
+  AssertInspect({{"/3760212050/1", ""}}, options.schema->fields());
   // If there are still too many digits, fail
-  EXPECT_RAISES_WITH_MESSAGE_THAT(Invalid, ::testing::HasSubstr("Failed to parse string"),
-                                  factory_->Inspect({"/1038581385102940193760212050/1"}));
+  EXPECT_RAISES_WITH_MESSAGE_THAT(
+      Invalid, ::testing::HasSubstr("Failed to parse string"),
+      factory_->Inspect({{"/1038581385102940193760212050/1", ""}}));
   factory_ = DirectoryPartitioning::MakeFactory({"alpha", "beta"}, options);
 
-  AssertInspect({"/0/1", "/2"}, options.schema->fields());
+  AssertInspect({{"/0/1", ""}, {"/2", ""}}, options.schema->fields());
 }
 
 TEST_F(TestPartitioning, ExistingSchemaHive) {
@@ -599,33 +617,33 @@ TEST_F(TestPartitioning, ExistingSchemaHive) {
   options.schema = schema({field("a", int64()), field("b", dict_type)});
   factory_ = HivePartitioning::MakeFactory(options);
 
-  AssertInspect({"/a=0/b=1"}, options.schema->fields());
-  AssertInspect({"/a=0/b=1/what"}, options.schema->fields());
-  AssertInspect({"/a=0", "/b=1"}, options.schema->fields());
+  AssertInspect({{"/a=0/b=1", ""}}, options.schema->fields());
+  AssertInspect({{"/a=0/b=1/what", ""}}, options.schema->fields());
+  AssertInspect({{"/a=0", ""}, {"/b=1", ""}}, options.schema->fields());
 
   // fail if any segment for field alpha is not parseable as schema type
   EXPECT_RAISES_WITH_MESSAGE_THAT(
       Invalid,
       ::testing::HasSubstr(
           "Could not cast segments for partition field a to requested type int64"),
-      factory_->Inspect({"/a=0/b=1", "/a=hello/b=1"}));
+      factory_->Inspect({{"/a=0/b=1", ""}, {"/a=hello/b=1", ""}}));
   factory_ = HivePartitioning::MakeFactory(options);
 
   EXPECT_RAISES_WITH_MESSAGE_THAT(
       Invalid,
       ::testing::HasSubstr("Requested schema has 2 fields, but only 1 were detected"),
-      factory_->Inspect({"/a=0", "/a=hello"}));
+      factory_->Inspect({{"/a=0", ""}, {"/a=hello", ""}}));
   factory_ = HivePartitioning::MakeFactory(options);
 
   // Now we don't fail since our type is large enough
-  AssertInspect({"/a=3760212050/b=1"}, options.schema->fields());
+  AssertInspect({{"/a=3760212050/b=1", ""}}, options.schema->fields());
   // If there are still too many digits, fail
   EXPECT_RAISES_WITH_MESSAGE_THAT(
       Invalid, ::testing::HasSubstr("Failed to parse string"),
-      factory_->Inspect({"/a=1038581385102940193760212050/b=1"}));
+      factory_->Inspect({{"/a=1038581385102940193760212050/b=1", ""}}));
   factory_ = HivePartitioning::MakeFactory(options);
 
-  AssertInspect({"/a=0/b=1", "/b=2"}, options.schema->fields());
+  AssertInspect({{"/a=0/b=1", ""}, {"/b=2", ""}}, options.schema->fields());
 }
 
 TEST_F(TestPartitioning, ExistingSchemaFilename) {
@@ -635,22 +653,23 @@ TEST_F(TestPartitioning, ExistingSchemaFilename) {
   options.schema = schema({field("alpha", int64()), field("beta", dict_type)});
   factory_ = FilenamePartitioning::MakeFactory({"alpha", "beta"}, options);
 
-  AssertInspect({"0_1_"}, options.schema->fields());
-  AssertInspect({"0_1_what_"}, options.schema->fields());
+  AssertInspect({{"", "0_1_"}}, options.schema->fields());
+  AssertInspect({{"", "0_1_what_"}}, options.schema->fields());
 
   // fail if any segment is not parseable as schema type
   EXPECT_RAISES_WITH_MESSAGE_THAT(Invalid, ::testing::HasSubstr("Failed to parse string"),
-                                  factory_->Inspect({"0_1_", "hello_1_"}));
+                                  factory_->Inspect({{"", "0_1_"}, {"", "hello_1_"}}));
   factory_ = FilenamePartitioning::MakeFactory({"alpha", "beta"}, options);
 
   // Now we don't fail since our type is large enough
-  AssertInspect({"3760212050_1_"}, options.schema->fields());
+  AssertInspect({{"", "3760212050_1_"}}, options.schema->fields());
   // If there are still too many digits, fail
-  EXPECT_RAISES_WITH_MESSAGE_THAT(Invalid, ::testing::HasSubstr("Failed to parse string"),
-                                  factory_->Inspect({"1038581385102940193760212050_1_"}));
+  EXPECT_RAISES_WITH_MESSAGE_THAT(
+      Invalid, ::testing::HasSubstr("Failed to parse string"),
+      factory_->Inspect({{"", "1038581385102940193760212050_1_"}}));
   factory_ = FilenamePartitioning::MakeFactory({"alpha", "beta"}, options);
 
-  AssertInspect({"0_1_", "2_"}, options.schema->fields());
+  AssertInspect({{"", "0_1_"}, {"", "2_"}}, options.schema->fields());
 }
 
 TEST_F(TestPartitioning, UrlEncodedDirectory) {
@@ -659,8 +678,8 @@ TEST_F(TestPartitioning, UrlEncodedDirectory) {
   options.schema = schema({field("date", ts), field("time", ts), field("str", utf8())});
   factory_ = DirectoryPartitioning::MakeFactory(options.schema->field_names(), options);
 
-  AssertInspect({"/2021-05-04 00:00:00/2021-05-04 07:27:00/%24",
-                 "/2021-05-04 00%3A00%3A00/2021-05-04 07%3A27%3A00/foo"},
+  AssertInspect({{"/2021-05-04 00:00:00/2021-05-04 07:27:00/%24", ""},
+                 {"/2021-05-04 00%3A00%3A00/2021-05-04 07%3A27%3A00/foo", ""}},
                 options.schema->fields());
   auto date = std::make_shared<TimestampScalar>(1620086400, ts);
   auto time = std::make_shared<TimestampScalar>(1620113220, ts);
@@ -672,7 +691,7 @@ TEST_F(TestPartitioning, UrlEncodedDirectory) {
 
   // Invalid UTF-8
   EXPECT_RAISES_WITH_MESSAGE_THAT(Invalid, ::testing::HasSubstr("was not valid UTF-8"),
-                                  factory_->Inspect({"/%AF/%BF/%CF"}));
+                                  factory_->Inspect({{"/%AF/%BF/%CF", ""}}));
   EXPECT_RAISES_WITH_MESSAGE_THAT(Invalid, ::testing::HasSubstr("was not valid UTF-8"),
                                   partitioning_->Parse({"/%AF/%BF/%CF", ""}));
 
@@ -680,8 +699,8 @@ TEST_F(TestPartitioning, UrlEncodedDirectory) {
   options.schema =
       schema({field("date", utf8()), field("time", utf8()), field("str", utf8())});
   factory_ = DirectoryPartitioning::MakeFactory(options.schema->field_names(), options);
-  AssertInspect({"/2021-05-04 00:00:00/2021-05-04 07:27:00/%E3%81%8F%E3%81%BE",
-                 "/2021-05-04 00%3A00%3A00/2021-05-04 07%3A27%3A00/foo"},
+  AssertInspect({{"/2021-05-04 00:00:00/2021-05-04 07:27:00/%E3%81%8F%E3%81%BE", ""},
+                 {"/2021-05-04 00%3A00%3A00/2021-05-04 07%3A27%3A00/foo", ""}},
                 options.schema->fields());
   partitioning_ = std::make_shared<DirectoryPartitioning>(
       options.schema, ArrayVector(), options.AsPartitioningOptions());
@@ -699,9 +718,9 @@ TEST_F(TestPartitioning, UrlEncodedHive) {
   factory_ = HivePartitioning::MakeFactory(options);
 
   AssertInspect(
-      {"/date=2021-05-04 00:00:00/time=2021-05-04 07:27:00/str=$",
-       "/date=2021-05-04 00:00:00/time=2021-05-04 07:27:00/str=%E3%81%8F%E3%81%BE",
-       "/date=2021-05-04 00%3A00%3A00/time=2021-05-04 07%3A27%3A00/str=%24"},
+      {{"/date=2021-05-04 00:00:00/time=2021-05-04 07:27:00/str=$", ""},
+       {"/date=2021-05-04 00:00:00/time=2021-05-04 07:27:00/str=%E3%81%8F%E3%81%BE", ""},
+       {"/date=2021-05-04 00%3A00%3A00/time=2021-05-04 07%3A27%3A00/str=%24", ""}},
       options.schema->fields());
 
   auto date = std::make_shared<TimestampScalar>(1620086400, ts);
@@ -722,8 +741,9 @@ TEST_F(TestPartitioning, UrlEncodedHive) {
                     equal(field_ref("time"), literal(time)), is_null(field_ref("str"))}));
 
   // Invalid UTF-8
-  EXPECT_RAISES_WITH_MESSAGE_THAT(Invalid, ::testing::HasSubstr("was not valid UTF-8"),
-                                  factory_->Inspect({"/date=%AF/time=%BF/str=%CF"}));
+  EXPECT_RAISES_WITH_MESSAGE_THAT(
+      Invalid, ::testing::HasSubstr("was not valid UTF-8"),
+      factory_->Inspect({{"/date=%AF/time=%BF/str=%CF", ""}}));
   EXPECT_RAISES_WITH_MESSAGE_THAT(
       Invalid, ::testing::HasSubstr("was not valid UTF-8"),
       partitioning_->Parse({"/date=%AF/time=%BF/str=%CF", ""}));
@@ -733,9 +753,9 @@ TEST_F(TestPartitioning, UrlEncodedHive) {
       schema({field("date", utf8()), field("time", utf8()), field("str", utf8())});
   factory_ = HivePartitioning::MakeFactory(options);
   AssertInspect(
-      {"/date=2021-05-04 00:00:00/time=2021-05-04 07:27:00/str=$",
-       "/date=2021-05-04 00:00:00/time=2021-05-04 07:27:00/str=%E3%81%8F%E3%81%BE",
-       "/date=2021-05-04 00%3A00%3A00/time=2021-05-04 07%3A27%3A00/str=%24"},
+      {{"/date=2021-05-04 00:00:00/time=2021-05-04 07:27:00/str=$", ""},
+       {"/date=2021-05-04 00:00:00/time=2021-05-04 07:27:00/str=%E3%81%8F%E3%81%BE", ""},
+       {"/date=2021-05-04 00%3A00%3A00/time=2021-05-04 07%3A27%3A00/str=%24", ""}},
       options.schema->fields());
   partitioning_ = std::make_shared<HivePartitioning>(options.schema, ArrayVector(),
                                                      options.AsHivePartitioningOptions());
@@ -745,8 +765,9 @@ TEST_F(TestPartitioning, UrlEncodedHive) {
                     equal(field_ref("str"), literal("%24"))}));
 
   // Invalid UTF-8
-  EXPECT_RAISES_WITH_MESSAGE_THAT(Invalid, ::testing::HasSubstr("was not valid UTF-8"),
-                                  factory_->Inspect({"/date=\xAF/time=\xBF/str=\xCF"}));
+  EXPECT_RAISES_WITH_MESSAGE_THAT(
+      Invalid, ::testing::HasSubstr("was not valid UTF-8"),
+      factory_->Inspect({{"/date=\xAF/time=\xBF/str=\xCF", ""}}));
   EXPECT_RAISES_WITH_MESSAGE_THAT(
       Invalid, ::testing::HasSubstr("was not valid UTF-8"),
       partitioning_->Parse({"/date=\xAF/time=\xBF/str=\xCF", ""}));
@@ -760,12 +781,15 @@ TEST_F(TestPartitioning, UrlEncodedHiveWithKeyEncoded) {
   options.null_fallback = "$";
   factory_ = HivePartitioning::MakeFactory(options);
 
-  AssertInspect({"/test%27%3B%20date=2021-05-04 00:00:00/test%27%3B%20time=2021-05-04 "
-                 "07:27:00/str=$",
-                 "/test%27%3B%20date=2021-05-04 00:00:00/test%27%3B%20time=2021-05-04 "
-                 "07:27:00/str=%E3%81%8F%E3%81%BE",
-                 "/test%27%3B%20date=2021-05-04 "
-                 "00%3A00%3A00/test%27%3B%20time=2021-05-04 07%3A27%3A00/str=%24"},
+  AssertInspect({{"/test%27%3B%20date=2021-05-04 00:00:00/test%27%3B%20time=2021-05-04 "
+                  "07:27:00/str=$",
+                  ""},
+                 {"/test%27%3B%20date=2021-05-04 00:00:00/test%27%3B%20time=2021-05-04 "
+                  "07:27:00/str=%E3%81%8F%E3%81%BE",
+                  ""},
+                 {"/test%27%3B%20date=2021-05-04 "
+                  "00%3A00%3A00/test%27%3B%20time=2021-05-04 07%3A27%3A00/str=%24",
+                  ""}},
                 options.schema->fields());
 
   auto date = std::make_shared<TimestampScalar>(1620086400, ts);
@@ -795,7 +819,7 @@ TEST_F(TestPartitioning, UrlEncodedHiveWithKeyEncoded) {
   // Invalid UTF-8
   EXPECT_RAISES_WITH_MESSAGE_THAT(
       Invalid, ::testing::HasSubstr("was not valid UTF-8"),
-      factory_->Inspect({"/%AF=2021-05-04/time=2021-05-04 07%3A27%3A00/str=%24"}));
+      factory_->Inspect({{"/%AF=2021-05-04/time=2021-05-04 07%3A27%3A00/str=%24", ""}}));
   EXPECT_RAISES_WITH_MESSAGE_THAT(
       Invalid, ::testing::HasSubstr("was not valid UTF-8"),
       partitioning_->Parse({"/%AF=2021-05-04/%BF=2021-05-04 07%3A27%3A00/str=%24", ""}));
@@ -975,9 +999,12 @@ TEST(TestStripPrefixAndFilename, Basic) {
   std::vector<std::string> input{"/data/year=2019/file.parquet",
                                  "/data/year=2019/month=12/file.parquet",
                                  "/data/year=2019/month=12/day=01/file.parquet"};
-  EXPECT_THAT(StripPrefixAndFilename(input, "/data"),
-              testing::ElementsAre("year=2019", "year=2019/month=12",
-                                   "year=2019/month=12/day=01"));
+  auto paths = StripPrefixAndFilename(input, "/data");
+  std::vector<std::string> path_directories;
+  std::transform(paths.begin(), paths.end(), std::back_inserter(path_directories),
+                 [](PartitionPathFormat const& path) { return path.directory; });
+  EXPECT_THAT(path_directories, testing::ElementsAre("year=2019", "year=2019/month=12",
+                                                     "year=2019/month=12/day=01"));
 }
 
 }  // namespace dataset

From 83ea5fdc0d733734d93c2b60a2c84f5dc6e48f14 Mon Sep 17 00:00:00 2001
From: Sanjiban Sengupta <sanjiban.sg@gmail.com>
Date: Sat, 30 Apr 2022 04:38:07 +0530
Subject: [PATCH 12/24] fix: use PartitionPathFormat in
 dataset___PartitioningFactory__Inspect for R

---
 r/src/dataset.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/r/src/dataset.cpp b/r/src/dataset.cpp
index aa44a0721e3..2abc7af2953 100644
--- a/r/src/dataset.cpp
+++ b/r/src/dataset.cpp
@@ -386,7 +386,7 @@ std::shared_ptr<ds::PartitioningFactory> dataset___HivePartitioning__MakeFactory
 // [[dataset::export]]
 std::shared_ptr<arrow::Schema> dataset___PartitioningFactory__Inspect(
     const std::shared_ptr<ds::PartitioningFactory>& factory,
-    const std::vector<std::string>& paths) {
+    const std::vector<arrow::dataset::PartitionPathFormat>& paths) {
   return ValueOrStop(factory->Inspect(paths));
 }
 

From 5b5cf95082cc5f41c8f5082e6cb203460831706b Mon Sep 17 00:00:00 2001
From: Sanjiban Sengupta <sanjiban.sg@gmail.com>
Date: Tue, 3 May 2022 21:51:43 +0530
Subject: [PATCH 13/24] fix: using PartitionPathFormat in arrowExports for R

---
 .RData                 | Bin 0 -> 48 bytes
 r/src/arrowExports.cpp |   4 ++--
 r/src/arrow_types.h    |   1 +
 3 files changed, 3 insertions(+), 2 deletions(-)
 create mode 100644 .RData

diff --git a/.RData b/.RData
new file mode 100644
index 0000000000000000000000000000000000000000..e9cee15b7b2568b8d14df240eb4cb675ca76b36d
GIT binary patch
literal 48
zcmb2|=3oE=X6~X+gJ)e2k`fXU(h?GxCarN$W6sX#n7xj5!bg6F?=#lDQ~)Xj09iQ@
A4FCWD

literal 0
HcmV?d00001

diff --git a/r/src/arrowExports.cpp b/r/src/arrowExports.cpp
index 312b778aada..c878225ed25 100644
--- a/r/src/arrowExports.cpp
+++ b/r/src/arrowExports.cpp
@@ -1767,11 +1767,11 @@ extern "C" SEXP _arrow_dataset___HivePartitioning__MakeFactory(SEXP null_fallbac
 
 // dataset.cpp
 #if defined(ARROW_R_WITH_DATASET)
-std::shared_ptr<arrow::Schema> dataset___PartitioningFactory__Inspect(const std::shared_ptr<ds::PartitioningFactory>& factory, const std::vector<std::string>& paths);
+std::shared_ptr<arrow::Schema> dataset___PartitioningFactory__Inspect(const std::shared_ptr<ds::PartitioningFactory>& factory, const std::vector<ds::PartitionPathFormat>& paths);
 extern "C" SEXP _arrow_dataset___PartitioningFactory__Inspect(SEXP factory_sexp, SEXP paths_sexp){
 BEGIN_CPP11
 	arrow::r::Input<const std::shared_ptr<ds::PartitioningFactory>&>::type factory(factory_sexp);
-	arrow::r::Input<const std::vector<std::string>&>::type paths(paths_sexp);
+	arrow::r::Input<const std::vector<ds::PartitionPathFormat>&>::type paths(paths_sexp);
 	return cpp11::as_sexp(dataset___PartitioningFactory__Inspect(factory, paths));
 END_CPP11
 }
diff --git a/r/src/arrow_types.h b/r/src/arrow_types.h
index d9fee37e7f1..5a931ccb8de 100644
--- a/r/src/arrow_types.h
+++ b/r/src/arrow_types.h
@@ -36,6 +36,7 @@
 
 #if defined(ARROW_R_WITH_DATASET)
 #include <arrow/dataset/type_fwd.h>
+#include <arrow/dataset/partition.h>
 #endif
 
 #include <arrow/filesystem/type_fwd.h>

From c5260651d636b0a010e624182b495f79bf6615a5 Mon Sep 17 00:00:00 2001
From: Sanjiban Sengupta <sanjiban.sg@gmail.com>
Date: Wed, 4 May 2022 07:28:38 +0530
Subject: [PATCH 14/24] fix: creating PartitionPathFormat object only in
 dataset___PartitioningFactory__Inspect

---
 r/src/arrowExports.cpp | 4 ++--
 r/src/arrow_types.h    | 1 -
 r/src/dataset.cpp      | 8 ++++++--
 3 files changed, 8 insertions(+), 5 deletions(-)

diff --git a/r/src/arrowExports.cpp b/r/src/arrowExports.cpp
index c878225ed25..14eb6eb397a 100644
--- a/r/src/arrowExports.cpp
+++ b/r/src/arrowExports.cpp
@@ -1767,11 +1767,11 @@ extern "C" SEXP _arrow_dataset___HivePartitioning__MakeFactory(SEXP null_fallbac
 
 // dataset.cpp
 #if defined(ARROW_R_WITH_DATASET)
-std::shared_ptr<arrow::Schema> dataset___PartitioningFactory__Inspect(const std::shared_ptr<ds::PartitioningFactory>& factory, const std::vector<ds::PartitionPathFormat>& paths);
+std::shared_ptr<arrow::Schema> dataset___PartitioningFactory__Inspect(const std::shared_ptr<ds::PartitioningFactory>& factory, const std::vector<std::pair<std::string, std::string>>& paths);
 extern "C" SEXP _arrow_dataset___PartitioningFactory__Inspect(SEXP factory_sexp, SEXP paths_sexp){
 BEGIN_CPP11
 	arrow::r::Input<const std::shared_ptr<ds::PartitioningFactory>&>::type factory(factory_sexp);
-	arrow::r::Input<const std::vector<ds::PartitionPathFormat>&>::type paths(paths_sexp);
+	arrow::r::Input<const std::vector<std::pair<std::string, std::string>>&>::type paths(paths_sexp);
 	return cpp11::as_sexp(dataset___PartitioningFactory__Inspect(factory, paths));
 END_CPP11
 }
diff --git a/r/src/arrow_types.h b/r/src/arrow_types.h
index 5a931ccb8de..d9fee37e7f1 100644
--- a/r/src/arrow_types.h
+++ b/r/src/arrow_types.h
@@ -36,7 +36,6 @@
 
 #if defined(ARROW_R_WITH_DATASET)
 #include <arrow/dataset/type_fwd.h>
-#include <arrow/dataset/partition.h>
 #endif
 
 #include <arrow/filesystem/type_fwd.h>
diff --git a/r/src/dataset.cpp b/r/src/dataset.cpp
index 2abc7af2953..7a3273e9abc 100644
--- a/r/src/dataset.cpp
+++ b/r/src/dataset.cpp
@@ -386,8 +386,12 @@ std::shared_ptr<ds::PartitioningFactory> dataset___HivePartitioning__MakeFactory
 // [[dataset::export]]
 std::shared_ptr<arrow::Schema> dataset___PartitioningFactory__Inspect(
     const std::shared_ptr<ds::PartitioningFactory>& factory,
-    const std::vector<arrow::dataset::PartitionPathFormat>& paths) {
-  return ValueOrStop(factory->Inspect(paths));
+    const std::vector<std::pair<std::string, std::string>>& paths) {
+  std::vector<ds::PartitionPathFormat> partition_paths(paths.size());
+  for (auto& path : paths) {
+    partition_paths.emplace_back(ds::PartitionPathFormat{path.first, path.second});
+  }
+  return ValueOrStop(factory->Inspect(partition_paths));
 }
 
 // [[dataset::export]]

From c7d1136eddb4b0576995a050077e1df61ca01218 Mon Sep 17 00:00:00 2001
From: Sanjiban Sengupta <sanjiban.sg@gmail.com>
Date: Thu, 5 May 2022 13:49:22 +0530
Subject: [PATCH 15/24] fix: Remove R implementation, instead put check in
 InspectSchemas

---
 .RData                                  | Bin 48 -> 0 bytes
 cpp/src/arrow/dataset/file_parquet.cc   |  15 +-
 cpp/src/arrow/dataset/partition.cc      |  32 ++--
 cpp/src/arrow/dataset/partition.h       |   9 +-
 cpp/src/arrow/dataset/partition_test.cc | 217 +++++++++++-------------
 r/autobrew                              |  59 +++++++
 r/src/arrowExports.cpp                  |   4 +-
 r/src/arrow_types.h                     |   1 +
 r/src/dataset.cpp                       |   8 +-
 9 files changed, 191 insertions(+), 154 deletions(-)
 delete mode 100644 .RData
 create mode 100644 r/autobrew

diff --git a/.RData b/.RData
deleted file mode 100644
index e9cee15b7b2568b8d14df240eb4cb675ca76b36d..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 48
zcmb2|=3oE=X6~X+gJ)e2k`fXU(h?GxCarN$W6sX#n7xj5!bg6F?=#lDQ~)Xj09iQ@
A4FCWD

diff --git a/cpp/src/arrow/dataset/file_parquet.cc b/cpp/src/arrow/dataset/file_parquet.cc
index cd0f1973081..4b1ded92a4a 100644
--- a/cpp/src/arrow/dataset/file_parquet.cc
+++ b/cpp/src/arrow/dataset/file_parquet.cc
@@ -916,11 +916,20 @@ Result<std::vector<std::shared_ptr<Schema>>> ParquetDatasetFactory::InspectSchem
 
   if (auto factory = options_.partitioning.factory()) {
     // Gather paths found in RowGroups' ColumnChunks.
-    std::vector<PartitionPathFormat> stripped(paths_with_row_group_ids_.size());
+    std::vector<std::string> stripped(paths_with_row_group_ids_.size());
 
     size_t i = 0;
-    for (const auto& e : paths_with_row_group_ids_) {
-      stripped[i++] = StripPrefixAndFilename(e.first, options_.partition_base_dir);
+
+    if (options_.partitioning.factory()->type_name() == "filename") {
+      for (const auto& e : paths_with_row_group_ids_) {
+        stripped[i++] =
+            StripPrefixAndFilename(e.first, options_.partition_base_dir).filename;
+      }
+    } else {
+      for (const auto& e : paths_with_row_group_ids_) {
+        stripped[i++] =
+            StripPrefixAndFilename(e.first, options_.partition_base_dir).directory;
+      }
     }
     ARROW_ASSIGN_OR_RAISE(auto partition_schema, factory->Inspect(stripped));
 
diff --git a/cpp/src/arrow/dataset/partition.cc b/cpp/src/arrow/dataset/partition.cc
index 5d7457141cf..6e636b88cc0 100644
--- a/cpp/src/arrow/dataset/partition.cc
+++ b/cpp/src/arrow/dataset/partition.cc
@@ -581,10 +581,10 @@ class DirectoryPartitioningFactory : public KeyValuePartitioningFactory {
   std::string type_name() const override { return "directory"; }
 
   Result<std::shared_ptr<Schema>> Inspect(
-      const std::vector<PartitionPathFormat>& paths) override {
+      const std::vector<std::string>& paths) override {
     for (auto path : paths) {
       std::vector<std::string> segments;
-      segments = fs::internal::SplitAbstractPath(path.directory);
+      segments = fs::internal::SplitAbstractPath(path);
       RETURN_NOT_OK(InspectPartitionSegments(segments, field_names_));
     }
     return DoInspect();
@@ -628,11 +628,11 @@ class FilenamePartitioningFactory : public KeyValuePartitioningFactory {
   std::string type_name() const override { return "filename"; }
 
   Result<std::shared_ptr<Schema>> Inspect(
-      const std::vector<PartitionPathFormat>& paths) override {
+      const std::vector<std::string>& paths) override {
     for (const auto& path : paths) {
       std::vector<std::string> segments;
-      segments = fs::internal::SplitAbstractPath(StripNonPrefix(path.filename),
-                                                 kFilenamePartitionSep);
+      segments =
+          fs::internal::SplitAbstractPath(StripNonPrefix(path), kFilenamePartitionSep);
       RETURN_NOT_OK(InspectPartitionSegments(segments, field_names_));
     }
     return DoInspect();
@@ -761,10 +761,10 @@ class HivePartitioningFactory : public KeyValuePartitioningFactory {
   std::string type_name() const override { return "hive"; }
 
   Result<std::shared_ptr<Schema>> Inspect(
-      const std::vector<PartitionPathFormat>& paths) override {
+      const std::vector<std::string>& paths) override {
     auto options = options_.AsHivePartitioningOptions();
     for (auto path : paths) {
-      for (auto&& segment : fs::internal::SplitAbstractPath(path.directory)) {
+      for (auto&& segment : fs::internal::SplitAbstractPath(path)) {
         ARROW_ASSIGN_OR_RAISE(auto maybe_key,
                               HivePartitioning::ParseKey(segment, options));
         if (auto key = maybe_key) {
@@ -814,28 +814,28 @@ PartitionPathFormat StripPrefixAndFilename(const std::string& path,
                              std::move(basename_filename.second)};
 }
 
-std::vector<PartitionPathFormat> StripPrefixAndFilename(
-    const std::vector<std::string>& paths, const std::string& prefix) {
-  std::vector<PartitionPathFormat> result;
+std::vector<std::string> StripPrefixAndFilename(const std::vector<std::string>& paths,
+                                                const std::string& prefix) {
+  std::vector<std::string> result;
   result.reserve(paths.size());
   for (const auto& path : paths) {
-    result.emplace_back(StripPrefixAndFilename(path, prefix));
+    result.emplace_back(StripPrefixAndFilename(path, prefix).directory);
   }
   return result;
 }
 
-std::vector<PartitionPathFormat> StripPrefixAndFilename(
-    const std::vector<fs::FileInfo>& files, const std::string& prefix) {
-  std::vector<PartitionPathFormat> result;
+std::vector<std::string> StripPrefixAndFilename(const std::vector<fs::FileInfo>& files,
+                                                const std::string& prefix) {
+  std::vector<std::string> result;
   result.reserve(files.size());
   for (const auto& info : files) {
-    result.emplace_back(StripPrefixAndFilename(info.path(), prefix));
+    result.emplace_back(StripPrefixAndFilename(info.path(), prefix).directory);
   }
   return result;
 }
 
 Result<std::shared_ptr<Schema>> PartitioningOrFactory::GetOrInferSchema(
-    const std::vector<PartitionPathFormat>& paths) {
+    const std::vector<std::string>& paths) {
   if (auto part = partitioning()) {
     return part->schema();
   }
diff --git a/cpp/src/arrow/dataset/partition.h b/cpp/src/arrow/dataset/partition.h
index fe20e837619..e18c1d6ddbf 100644
--- a/cpp/src/arrow/dataset/partition.h
+++ b/cpp/src/arrow/dataset/partition.h
@@ -152,7 +152,7 @@ class ARROW_DS_EXPORT PartitioningFactory {
   /// Get the schema for the resulting Partitioning.
   /// This may reset internal state, for example dictionaries of unique representations.
   virtual Result<std::shared_ptr<Schema>> Inspect(
-      const std::vector<PartitionPathFormat>& paths) = 0;
+      const std::vector<std::string>& paths) = 0;
 
   /// Create a partitioning using the provided schema
   /// (fields may be dropped).
@@ -366,11 +366,11 @@ ARROW_DS_EXPORT PartitionPathFormat StripPrefixAndFilename(const std::string& pa
                                                            const std::string& prefix);
 
 /// \brief Vector version of StripPrefixAndFilename.
-ARROW_DS_EXPORT std::vector<PartitionPathFormat> StripPrefixAndFilename(
+ARROW_DS_EXPORT std::vector<std::string> StripPrefixAndFilename(
     const std::vector<std::string>& paths, const std::string& prefix);
 
 /// \brief Vector version of StripPrefixAndFilename.
-ARROW_DS_EXPORT std::vector<PartitionPathFormat> StripPrefixAndFilename(
+ARROW_DS_EXPORT std::vector<std::string> StripPrefixAndFilename(
     const std::vector<fs::FileInfo>& files, const std::string& prefix);
 
 /// \brief Either a Partitioning or a PartitioningFactory
@@ -397,8 +397,7 @@ class ARROW_DS_EXPORT PartitioningOrFactory {
   const std::shared_ptr<PartitioningFactory>& factory() const { return factory_; }
 
   /// \brief Get the partition schema, inferring it with the given factory if needed.
-  Result<std::shared_ptr<Schema>> GetOrInferSchema(
-      const std::vector<PartitionPathFormat>& paths);
+  Result<std::shared_ptr<Schema>> GetOrInferSchema(const std::vector<std::string>& paths);
 
  private:
   std::shared_ptr<PartitioningFactory> factory_;
diff --git a/cpp/src/arrow/dataset/partition_test.cc b/cpp/src/arrow/dataset/partition_test.cc
index fe84da843cf..157a9c3af44 100644
--- a/cpp/src/arrow/dataset/partition_test.cc
+++ b/cpp/src/arrow/dataset/partition_test.cc
@@ -74,7 +74,7 @@ class TestPartitioning : public ::testing::Test {
     ASSERT_EQ(simplified, literal(true));
   }
 
-  void AssertInspect(const std::vector<PartitionPathFormat>& paths,
+  void AssertInspect(const std::vector<std::string>& paths,
                      const std::vector<std::shared_ptr<Field>>& expected) {
     ASSERT_OK_AND_ASSIGN(auto actual, factory_->Inspect(paths));
     ASSERT_EQ(*actual, Schema(expected));
@@ -122,7 +122,7 @@ class TestPartitioning : public ::testing::Test {
     AssertPartition(partitioning, record_batch, expected_batches, expected_expressions);
   }
 
-  void AssertInspectError(const std::vector<PartitionPathFormat>& paths) {
+  void AssertInspectError(const std::vector<std::string>& paths) {
     ASSERT_RAISES(Invalid, factory_->Inspect(paths));
   }
 
@@ -295,39 +295,39 @@ TEST_F(TestPartitioning, DiscoverSchemaDirectory) {
   factory_ = DirectoryPartitioning::MakeFactory({"alpha", "beta"});
 
   // type is int32 if possible
-  AssertInspect({{"/0/1", ""}}, {Int("alpha"), Int("beta")});
+  AssertInspect({"/0/1"}, {Int("alpha"), Int("beta")});
 
   // extra segments are ignored
-  AssertInspect({{"/0/1/what", ""}}, {Int("alpha"), Int("beta")});
+  AssertInspect({"/0/1/what"}, {Int("alpha"), Int("beta")});
 
   // fall back to string if any segment for field alpha is not parseable as int
-  AssertInspect({{"/0/1", ""}, {"/hello/1", ""}}, {Str("alpha"), Int("beta")});
+  AssertInspect({"/0/1", "/hello/1"}, {Str("alpha"), Int("beta")});
 
   // If there are too many digits fall back to string
-  AssertInspect({{"/3760212050/1", ""}}, {Str("alpha"), Int("beta")});
+  AssertInspect({"/3760212050/1"}, {Str("alpha"), Int("beta")});
 
   // missing segment for beta doesn't cause an error or fallback
-  AssertInspect({{"/0/1", ""}, {"/hello", ""}}, {Str("alpha"), Int("beta")});
+  AssertInspect({"/0/1", "/hello"}, {Str("alpha"), Int("beta")});
 }
 
 TEST_F(TestPartitioning, DiscoverSchemaFilename) {
   factory_ = FilenamePartitioning::MakeFactory({"alpha", "beta"});
 
   // type is int32 if possible
-  AssertInspect({{"", "0_1_"}}, {Int("alpha"), Int("beta")});
+  AssertInspect({"0_1_"}, {Int("alpha"), Int("beta")});
 
   // extra segments are ignored
-  AssertInspect({{"", "0_1_what_"}}, {Int("alpha"), Int("beta")});
+  AssertInspect({"0_1_what_"}, {Int("alpha"), Int("beta")});
 
   // fall back to string if any segment for field alpha is not parseable as int
-  AssertInspect({{"", "0_1_"}, {"", "hello_1_"}}, {Str("alpha"), Int("beta")});
+  AssertInspect({"0_1_", "hello_1_"}, {Str("alpha"), Int("beta")});
 
   // If there are too many digits fall back to string
-  AssertInspect({{"", "3760212050_1_"}}, {Str("alpha"), Int("beta")});
+  AssertInspect({"3760212050_1_"}, {Str("alpha"), Int("beta")});
 
   // Invalid syntax
-  AssertInspectError({{"", "234-12"}});
-  AssertInspectError({{"", "hello"}});
+  AssertInspectError({"234-12"});
+  AssertInspectError({"hello"});
 }
 
 TEST_F(TestPartitioning, DirectoryDictionaryInference) {
@@ -336,21 +336,18 @@ TEST_F(TestPartitioning, DirectoryDictionaryInference) {
   factory_ = DirectoryPartitioning::MakeFactory({"alpha", "beta"}, options);
 
   // type is still int32 if possible
-  AssertInspect({{"/0/1", ""}}, {DictInt("alpha"), DictInt("beta")});
+  AssertInspect({"/0/1"}, {DictInt("alpha"), DictInt("beta")});
 
   // If there are too many digits fall back to string
-  AssertInspect({{"/3760212050/1", ""}}, {DictStr("alpha"), DictInt("beta")});
+  AssertInspect({"/3760212050/1"}, {DictStr("alpha"), DictInt("beta")});
 
   // successful dictionary inference
-  AssertInspect({{"/a/0", ""}}, {DictStr("alpha"), DictInt("beta")});
-  AssertInspect({{"/a/0", ""}, {"/a/1", ""}}, {DictStr("alpha"), DictInt("beta")});
-  AssertInspect({{"/a/0", ""}, {"/a", ""}}, {DictStr("alpha"), DictInt("beta")});
-  AssertInspect({{"/a/0", ""}, {"/b/0", ""}, {"/a/1", ""}, {"/b/1", ""}},
-                {DictStr("alpha"), DictInt("beta")});
-  AssertInspect({{"/a/-", ""}, {"/b/-", ""}, {"/a/_", ""}, {"/b/_", ""}},
-                {DictStr("alpha"), DictStr("beta")});
-  AssertInspect({{"/a/-", ""}, {"/b/-", ""}, {"/a/_", ""}, {"/b/_", ""}},
-                {DictStr("alpha"), DictStr("beta")});
+  AssertInspect({"/a/0"}, {DictStr("alpha"), DictInt("beta")});
+  AssertInspect({"/a/0", "/a/1"}, {DictStr("alpha"), DictInt("beta")});
+  AssertInspect({"/a/0", "/a"}, {DictStr("alpha"), DictInt("beta")});
+  AssertInspect({"/0/a", "/1"}, {DictInt("alpha"), DictStr("beta")});
+  AssertInspect({"/a/0", "/b/0", "/a/1", "/b/1"}, {DictStr("alpha"), DictInt("beta")});
+  AssertInspect({"/a/-", "/b/-", "/a/_", "/b/_"}, {DictStr("alpha"), DictStr("beta")});
 }
 
 TEST_F(TestPartitioning, FilenameDictionaryInference) {
@@ -359,16 +356,15 @@ TEST_F(TestPartitioning, FilenameDictionaryInference) {
   factory_ = FilenamePartitioning::MakeFactory({"alpha", "beta"}, options);
 
   // type is still int32 if possible
-  AssertInspect({{"", "0_1_"}}, {DictInt("alpha"), DictInt("beta")});
+  AssertInspect({"0_1_"}, {DictInt("alpha"), DictInt("beta")});
 
   // If there are too many digits fall back to string
-  AssertInspect({{"", "3760212050_1_"}}, {DictStr("alpha"), DictInt("beta")});
+  AssertInspect({"3760212050_1_"}, {DictStr("alpha"), DictInt("beta")});
 
   // successful dictionary inference
-  AssertInspect({{"", "a_0_"}}, {DictStr("alpha"), DictInt("beta")});
-  AssertInspect({{"", "a_0_"}, {"", "a_1_"}}, {DictStr("alpha"), DictInt("beta")});
-  AssertInspect({{"", "a_0_"}, {"", "b_0_"}, {"", "a_1_"}, {"", "b_1_"}},
-                {DictStr("alpha"), DictInt("beta")});
+  AssertInspect({"a_0_"}, {DictStr("alpha"), DictInt("beta")});
+  AssertInspect({"a_0_", "a_1_"}, {DictStr("alpha"), DictInt("beta")});
+  AssertInspect({"a_0_", "b_0_", "a_1_", "b_1_"}, {DictStr("alpha"), DictInt("beta")});
 }
 
 TEST_F(TestPartitioning, DictionaryHasUniqueValues) {
@@ -377,8 +373,7 @@ TEST_F(TestPartitioning, DictionaryHasUniqueValues) {
   factory_ = DirectoryPartitioning::MakeFactory({"alpha"}, options);
 
   auto alpha = DictStr("alpha");
-  AssertInspect({{"/a", ""}, {"/b", ""}, {"/a", ""}, {"/b", ""}, {"/c", ""}, {"/a", ""}},
-                {alpha});
+  AssertInspect({"/a", "/b", "/a", "/b", "/c", "/a"}, {alpha});
   ASSERT_OK_AND_ASSIGN(auto partitioning, factory_->Finish(schema({alpha})));
 
   auto expected_dictionary =
@@ -400,7 +395,7 @@ TEST_F(TestPartitioning, DictionaryHasUniqueValues) {
 TEST_F(TestPartitioning, DiscoverSchemaSegfault) {
   // ARROW-7638
   factory_ = DirectoryPartitioning::MakeFactory({"alpha", "beta"});
-  AssertInspectError({{"oops.txt", ""}});
+  AssertInspectError({"oops.txt"});
 }
 
 TEST_F(TestPartitioning, HivePartitioning) {
@@ -486,29 +481,27 @@ TEST_F(TestPartitioning, DiscoverHiveSchema) {
   factory_ = HivePartitioning::MakeFactory(options);
 
   // type is int32 if possible
-  AssertInspect({{"/alpha=0/beta=1", ""}}, {Int("alpha"), Int("beta")});
+  AssertInspect({"/alpha=0/beta=1"}, {Int("alpha"), Int("beta")});
 
   // extra segments are ignored
-  AssertInspect({{"/gamma=0/unexpected/delta=1/dat.parquet", ""}},
+  AssertInspect({"/gamma=0/unexpected/delta=1/dat.parquet"},
                 {Int("gamma"), Int("delta")});
 
   // schema field names are in order of first occurrence
   // (...so ensure your partitions are ordered the same for all paths)
-  AssertInspect({{"/alpha=0/beta=1", ""}, {"/beta=2/alpha=3", ""}},
-                {Int("alpha"), Int("beta")});
+  AssertInspect({"/alpha=0/beta=1", "/beta=2/alpha=3"}, {Int("alpha"), Int("beta")});
 
   // Null fallback strings shouldn't interfere with type inference
-  AssertInspect({{"/alpha=xyz/beta=x", ""}, {"/alpha=7/beta=xyz", ""}},
-                {Int("alpha"), Str("beta")});
+  AssertInspect({"/alpha=xyz/beta=x", "/alpha=7/beta=xyz"}, {Int("alpha"), Str("beta")});
 
   // Cannot infer if the only values are null
-  AssertInspectError({{"/alpha=xyz", ""}});
+  AssertInspectError({"/alpha=xyz"});
 
   // If there are too many digits fall back to string
-  AssertInspect({{"/alpha=3760212050", ""}}, {Str("alpha")});
+  AssertInspect({"/alpha=3760212050"}, {Str("alpha")});
 
   // missing path segments will not cause an error
-  AssertInspect({{"/alpha=0/beta=1", ""}, {"/beta=2/alpha=3", ""}, {"/gamma=what", ""}},
+  AssertInspect({"/alpha=0/beta=1", "/beta=2/alpha=3", "/gamma=what"},
                 {Int("alpha"), Int("beta"), Str("gamma")});
 }
 
@@ -519,27 +512,22 @@ TEST_F(TestPartitioning, HiveDictionaryInference) {
   factory_ = HivePartitioning::MakeFactory(options);
 
   // type is still int32 if possible
-  AssertInspect({{"/alpha=0/beta=1", ""}}, {DictInt("alpha"), DictInt("beta")});
+  AssertInspect({"/alpha=0/beta=1"}, {DictInt("alpha"), DictInt("beta")});
 
   // If there are too many digits fall back to string
-  AssertInspect({{"/alpha=3760212050", ""}}, {DictStr("alpha")});
+  AssertInspect({"/alpha=3760212050"}, {DictStr("alpha")});
 
   // successful dictionary inference
-  AssertInspect({{"/alpha=a/beta=0", ""}}, {DictStr("alpha"), DictInt("beta")});
-  AssertInspect({{"/alpha=a/beta=0", ""}, {"/alpha=a/1", ""}},
-                {DictStr("alpha"), DictInt("beta")});
-  AssertInspect({{"/alpha=a/beta=0", ""}, {"/alpha=xyz/beta=xyz", ""}},
-                {DictStr("alpha"), DictInt("beta")});
-  AssertInspect({{"/alpha=a/beta=0", ""},
-                 {"/alpha=b/beta=0", ""},
-                 {"/alpha=a/beta=1", ""},
-                 {"/alpha=b/beta=1", ""}},
+  AssertInspect({"/alpha=a/beta=0"}, {DictStr("alpha"), DictInt("beta")});
+  AssertInspect({"/alpha=a/beta=0", "/alpha=a/1"}, {DictStr("alpha"), DictInt("beta")});
+  AssertInspect({"/alpha=a/beta=0", "/alpha=xyz/beta=xyz"},
                 {DictStr("alpha"), DictInt("beta")});
-  AssertInspect({{"/alpha=a/beta=-", ""},
-                 {"/alpha=b/beta=-", ""},
-                 {"/alpha=a/beta=_", ""},
-                 {"/alpha=b/beta=_", ""}},
-                {DictStr("alpha"), DictStr("beta")});
+  AssertInspect(
+      {"/alpha=a/beta=0", "/alpha=b/beta=0", "/alpha=a/beta=1", "/alpha=b/beta=1"},
+      {DictStr("alpha"), DictInt("beta")});
+  AssertInspect(
+      {"/alpha=a/beta=-", "/alpha=b/beta=-", "/alpha=a/beta=_", "/alpha=b/beta=_"},
+      {DictStr("alpha"), DictStr("beta")});
 }
 
 TEST_F(TestPartitioning, HiveNullFallbackPassedOn) {
@@ -547,7 +535,7 @@ TEST_F(TestPartitioning, HiveNullFallbackPassedOn) {
   options.null_fallback = "xyz";
   factory_ = HivePartitioning::MakeFactory(options);
 
-  EXPECT_OK_AND_ASSIGN(auto schema, factory_->Inspect({{"/alpha=a/beta=0", ""}}));
+  EXPECT_OK_AND_ASSIGN(auto schema, factory_->Inspect({"/alpha=a/beta=0"}));
   EXPECT_OK_AND_ASSIGN(auto partitioning, factory_->Finish(schema));
   ASSERT_EQ("xyz",
             std::static_pointer_cast<HivePartitioning>(partitioning)->null_fallback());
@@ -559,12 +547,7 @@ TEST_F(TestPartitioning, HiveDictionaryHasUniqueValues) {
   factory_ = HivePartitioning::MakeFactory(options);
 
   auto alpha = DictStr("alpha");
-  AssertInspect({{"/alpha=a", ""},
-                 {"/alpha=b", ""},
-                 {"/alpha=a", ""},
-                 {"/alpha=b", ""},
-                 {"/alpha=c", ""},
-                 {"/alpha=a", ""}},
+  AssertInspect({"/alpha=a", "/alpha=b", "/alpha=a", "/alpha=b", "/alpha=c", "/alpha=a"},
                 {alpha});
   ASSERT_OK_AND_ASSIGN(auto partitioning, factory_->Finish(schema({alpha})));
 
@@ -591,23 +574,22 @@ TEST_F(TestPartitioning, ExistingSchemaDirectory) {
   options.schema = schema({field("alpha", int64()), field("beta", dict_type)});
   factory_ = DirectoryPartitioning::MakeFactory({"alpha", "beta"}, options);
 
-  AssertInspect({{"/0/1", ""}}, options.schema->fields());
-  AssertInspect({{"/0/1/what", ""}}, options.schema->fields());
+  AssertInspect({"/0/1"}, options.schema->fields());
+  AssertInspect({"/0/1/what"}, options.schema->fields());
 
   // fail if any segment is not parseable as schema type
   EXPECT_RAISES_WITH_MESSAGE_THAT(Invalid, ::testing::HasSubstr("Failed to parse string"),
-                                  factory_->Inspect({{"/0/1", ""}, {"/hello/1", ""}}));
+                                  factory_->Inspect({"/0/1", "/hello/1"}));
   factory_ = DirectoryPartitioning::MakeFactory({"alpha", "beta"}, options);
 
   // Now we don't fail since our type is large enough
-  AssertInspect({{"/3760212050/1", ""}}, options.schema->fields());
+  AssertInspect({"/3760212050/1"}, options.schema->fields());
   // If there are still too many digits, fail
-  EXPECT_RAISES_WITH_MESSAGE_THAT(
-      Invalid, ::testing::HasSubstr("Failed to parse string"),
-      factory_->Inspect({{"/1038581385102940193760212050/1", ""}}));
+  EXPECT_RAISES_WITH_MESSAGE_THAT(Invalid, ::testing::HasSubstr("Failed to parse string"),
+                                  factory_->Inspect({"/1038581385102940193760212050/1"}));
   factory_ = DirectoryPartitioning::MakeFactory({"alpha", "beta"}, options);
 
-  AssertInspect({{"/0/1", ""}, {"/2", ""}}, options.schema->fields());
+  AssertInspect({"/0/1", "/2"}, options.schema->fields());
 }
 
 TEST_F(TestPartitioning, ExistingSchemaHive) {
@@ -617,33 +599,33 @@ TEST_F(TestPartitioning, ExistingSchemaHive) {
   options.schema = schema({field("a", int64()), field("b", dict_type)});
   factory_ = HivePartitioning::MakeFactory(options);
 
-  AssertInspect({{"/a=0/b=1", ""}}, options.schema->fields());
-  AssertInspect({{"/a=0/b=1/what", ""}}, options.schema->fields());
-  AssertInspect({{"/a=0", ""}, {"/b=1", ""}}, options.schema->fields());
+  AssertInspect({"/a=0/b=1"}, options.schema->fields());
+  AssertInspect({"/a=0/b=1/what"}, options.schema->fields());
+  AssertInspect({"/a=0", "/b=1"}, options.schema->fields());
 
   // fail if any segment for field alpha is not parseable as schema type
   EXPECT_RAISES_WITH_MESSAGE_THAT(
       Invalid,
       ::testing::HasSubstr(
           "Could not cast segments for partition field a to requested type int64"),
-      factory_->Inspect({{"/a=0/b=1", ""}, {"/a=hello/b=1", ""}}));
+      factory_->Inspect({"/a=0/b=1", "/a=hello/b=1"}));
   factory_ = HivePartitioning::MakeFactory(options);
 
   EXPECT_RAISES_WITH_MESSAGE_THAT(
       Invalid,
       ::testing::HasSubstr("Requested schema has 2 fields, but only 1 were detected"),
-      factory_->Inspect({{"/a=0", ""}, {"/a=hello", ""}}));
+      factory_->Inspect({"/a=0", "/a=hello"}));
   factory_ = HivePartitioning::MakeFactory(options);
 
   // Now we don't fail since our type is large enough
-  AssertInspect({{"/a=3760212050/b=1", ""}}, options.schema->fields());
+  AssertInspect({"/a=3760212050/b=1"}, options.schema->fields());
   // If there are still too many digits, fail
   EXPECT_RAISES_WITH_MESSAGE_THAT(
       Invalid, ::testing::HasSubstr("Failed to parse string"),
-      factory_->Inspect({{"/a=1038581385102940193760212050/b=1", ""}}));
+      factory_->Inspect({"/a=1038581385102940193760212050/b=1"}));
   factory_ = HivePartitioning::MakeFactory(options);
 
-  AssertInspect({{"/a=0/b=1", ""}, {"/b=2", ""}}, options.schema->fields());
+  AssertInspect({"/a=0/b=1", "/b=2"}, options.schema->fields());
 }
 
 TEST_F(TestPartitioning, ExistingSchemaFilename) {
@@ -653,23 +635,22 @@ TEST_F(TestPartitioning, ExistingSchemaFilename) {
   options.schema = schema({field("alpha", int64()), field("beta", dict_type)});
   factory_ = FilenamePartitioning::MakeFactory({"alpha", "beta"}, options);
 
-  AssertInspect({{"", "0_1_"}}, options.schema->fields());
-  AssertInspect({{"", "0_1_what_"}}, options.schema->fields());
+  AssertInspect({"0_1_"}, options.schema->fields());
+  AssertInspect({"0_1_what_"}, options.schema->fields());
 
   // fail if any segment is not parseable as schema type
   EXPECT_RAISES_WITH_MESSAGE_THAT(Invalid, ::testing::HasSubstr("Failed to parse string"),
-                                  factory_->Inspect({{"", "0_1_"}, {"", "hello_1_"}}));
+                                  factory_->Inspect({"0_1_", "hello_1_"}));
   factory_ = FilenamePartitioning::MakeFactory({"alpha", "beta"}, options);
 
   // Now we don't fail since our type is large enough
-  AssertInspect({{"", "3760212050_1_"}}, options.schema->fields());
+  AssertInspect({"3760212050_1_"}, options.schema->fields());
   // If there are still too many digits, fail
-  EXPECT_RAISES_WITH_MESSAGE_THAT(
-      Invalid, ::testing::HasSubstr("Failed to parse string"),
-      factory_->Inspect({{"", "1038581385102940193760212050_1_"}}));
+  EXPECT_RAISES_WITH_MESSAGE_THAT(Invalid, ::testing::HasSubstr("Failed to parse string"),
+                                  factory_->Inspect({"1038581385102940193760212050_1_"}));
   factory_ = FilenamePartitioning::MakeFactory({"alpha", "beta"}, options);
 
-  AssertInspect({{"", "0_1_"}, {"", "2_"}}, options.schema->fields());
+  AssertInspect({"0_1_", "2_"}, options.schema->fields());
 }
 
 TEST_F(TestPartitioning, UrlEncodedDirectory) {
@@ -678,8 +659,8 @@ TEST_F(TestPartitioning, UrlEncodedDirectory) {
   options.schema = schema({field("date", ts), field("time", ts), field("str", utf8())});
   factory_ = DirectoryPartitioning::MakeFactory(options.schema->field_names(), options);
 
-  AssertInspect({{"/2021-05-04 00:00:00/2021-05-04 07:27:00/%24", ""},
-                 {"/2021-05-04 00%3A00%3A00/2021-05-04 07%3A27%3A00/foo", ""}},
+  AssertInspect({"/2021-05-04 00:00:00/2021-05-04 07:27:00/%24",
+                 "/2021-05-04 00%3A00%3A00/2021-05-04 07%3A27%3A00/foo"},
                 options.schema->fields());
   auto date = std::make_shared<TimestampScalar>(1620086400, ts);
   auto time = std::make_shared<TimestampScalar>(1620113220, ts);
@@ -691,7 +672,7 @@ TEST_F(TestPartitioning, UrlEncodedDirectory) {
 
   // Invalid UTF-8
   EXPECT_RAISES_WITH_MESSAGE_THAT(Invalid, ::testing::HasSubstr("was not valid UTF-8"),
-                                  factory_->Inspect({{"/%AF/%BF/%CF", ""}}));
+                                  factory_->Inspect({"/%AF/%BF/%CF"}));
   EXPECT_RAISES_WITH_MESSAGE_THAT(Invalid, ::testing::HasSubstr("was not valid UTF-8"),
                                   partitioning_->Parse({"/%AF/%BF/%CF", ""}));
 
@@ -699,8 +680,8 @@ TEST_F(TestPartitioning, UrlEncodedDirectory) {
   options.schema =
       schema({field("date", utf8()), field("time", utf8()), field("str", utf8())});
   factory_ = DirectoryPartitioning::MakeFactory(options.schema->field_names(), options);
-  AssertInspect({{"/2021-05-04 00:00:00/2021-05-04 07:27:00/%E3%81%8F%E3%81%BE", ""},
-                 {"/2021-05-04 00%3A00%3A00/2021-05-04 07%3A27%3A00/foo", ""}},
+  AssertInspect({"/2021-05-04 00:00:00/2021-05-04 07:27:00/%E3%81%8F%E3%81%BE",
+                 "/2021-05-04 00%3A00%3A00/2021-05-04 07%3A27%3A00/foo"},
                 options.schema->fields());
   partitioning_ = std::make_shared<DirectoryPartitioning>(
       options.schema, ArrayVector(), options.AsPartitioningOptions());
@@ -718,9 +699,9 @@ TEST_F(TestPartitioning, UrlEncodedHive) {
   factory_ = HivePartitioning::MakeFactory(options);
 
   AssertInspect(
-      {{"/date=2021-05-04 00:00:00/time=2021-05-04 07:27:00/str=$", ""},
-       {"/date=2021-05-04 00:00:00/time=2021-05-04 07:27:00/str=%E3%81%8F%E3%81%BE", ""},
-       {"/date=2021-05-04 00%3A00%3A00/time=2021-05-04 07%3A27%3A00/str=%24", ""}},
+      {"/date=2021-05-04 00:00:00/time=2021-05-04 07:27:00/str=$",
+       "/date=2021-05-04 00:00:00/time=2021-05-04 07:27:00/str=%E3%81%8F%E3%81%BE",
+       "/date=2021-05-04 00%3A00%3A00/time=2021-05-04 07%3A27%3A00/str=%24"},
       options.schema->fields());
 
   auto date = std::make_shared<TimestampScalar>(1620086400, ts);
@@ -741,9 +722,8 @@ TEST_F(TestPartitioning, UrlEncodedHive) {
                     equal(field_ref("time"), literal(time)), is_null(field_ref("str"))}));
 
   // Invalid UTF-8
-  EXPECT_RAISES_WITH_MESSAGE_THAT(
-      Invalid, ::testing::HasSubstr("was not valid UTF-8"),
-      factory_->Inspect({{"/date=%AF/time=%BF/str=%CF", ""}}));
+  EXPECT_RAISES_WITH_MESSAGE_THAT(Invalid, ::testing::HasSubstr("was not valid UTF-8"),
+                                  factory_->Inspect({"/date=%AF/time=%BF/str=%CF"}));
   EXPECT_RAISES_WITH_MESSAGE_THAT(
       Invalid, ::testing::HasSubstr("was not valid UTF-8"),
       partitioning_->Parse({"/date=%AF/time=%BF/str=%CF", ""}));
@@ -753,9 +733,9 @@ TEST_F(TestPartitioning, UrlEncodedHive) {
       schema({field("date", utf8()), field("time", utf8()), field("str", utf8())});
   factory_ = HivePartitioning::MakeFactory(options);
   AssertInspect(
-      {{"/date=2021-05-04 00:00:00/time=2021-05-04 07:27:00/str=$", ""},
-       {"/date=2021-05-04 00:00:00/time=2021-05-04 07:27:00/str=%E3%81%8F%E3%81%BE", ""},
-       {"/date=2021-05-04 00%3A00%3A00/time=2021-05-04 07%3A27%3A00/str=%24", ""}},
+      {"/date=2021-05-04 00:00:00/time=2021-05-04 07:27:00/str=$",
+       "/date=2021-05-04 00:00:00/time=2021-05-04 07:27:00/str=%E3%81%8F%E3%81%BE",
+       "/date=2021-05-04 00%3A00%3A00/time=2021-05-04 07%3A27%3A00/str=%24"},
       options.schema->fields());
   partitioning_ = std::make_shared<HivePartitioning>(options.schema, ArrayVector(),
                                                      options.AsHivePartitioningOptions());
@@ -765,9 +745,8 @@ TEST_F(TestPartitioning, UrlEncodedHive) {
                     equal(field_ref("str"), literal("%24"))}));
 
   // Invalid UTF-8
-  EXPECT_RAISES_WITH_MESSAGE_THAT(
-      Invalid, ::testing::HasSubstr("was not valid UTF-8"),
-      factory_->Inspect({{"/date=\xAF/time=\xBF/str=\xCF", ""}}));
+  EXPECT_RAISES_WITH_MESSAGE_THAT(Invalid, ::testing::HasSubstr("was not valid UTF-8"),
+                                  factory_->Inspect({"/date=\xAF/time=\xBF/str=\xCF"}));
   EXPECT_RAISES_WITH_MESSAGE_THAT(
       Invalid, ::testing::HasSubstr("was not valid UTF-8"),
       partitioning_->Parse({"/date=\xAF/time=\xBF/str=\xCF", ""}));
@@ -781,15 +760,12 @@ TEST_F(TestPartitioning, UrlEncodedHiveWithKeyEncoded) {
   options.null_fallback = "$";
   factory_ = HivePartitioning::MakeFactory(options);
 
-  AssertInspect({{"/test%27%3B%20date=2021-05-04 00:00:00/test%27%3B%20time=2021-05-04 "
-                  "07:27:00/str=$",
-                  ""},
-                 {"/test%27%3B%20date=2021-05-04 00:00:00/test%27%3B%20time=2021-05-04 "
-                  "07:27:00/str=%E3%81%8F%E3%81%BE",
-                  ""},
-                 {"/test%27%3B%20date=2021-05-04 "
-                  "00%3A00%3A00/test%27%3B%20time=2021-05-04 07%3A27%3A00/str=%24",
-                  ""}},
+  AssertInspect({"/test%27%3B%20date=2021-05-04 00:00:00/test%27%3B%20time=2021-05-04 "
+                 "07:27:00/str=$",
+                 "/test%27%3B%20date=2021-05-04 00:00:00/test%27%3B%20time=2021-05-04 "
+                 "07:27:00/str=%E3%81%8F%E3%81%BE",
+                 "/test%27%3B%20date=2021-05-04 "
+                 "00%3A00%3A00/test%27%3B%20time=2021-05-04 07%3A27%3A00/str=%24"},
                 options.schema->fields());
 
   auto date = std::make_shared<TimestampScalar>(1620086400, ts);
@@ -819,7 +795,7 @@ TEST_F(TestPartitioning, UrlEncodedHiveWithKeyEncoded) {
   // Invalid UTF-8
   EXPECT_RAISES_WITH_MESSAGE_THAT(
       Invalid, ::testing::HasSubstr("was not valid UTF-8"),
-      factory_->Inspect({{"/%AF=2021-05-04/time=2021-05-04 07%3A27%3A00/str=%24", ""}}));
+      factory_->Inspect({"/%AF=2021-05-04/time=2021-05-04 07%3A27%3A00/str=%24"}));
   EXPECT_RAISES_WITH_MESSAGE_THAT(
       Invalid, ::testing::HasSubstr("was not valid UTF-8"),
       partitioning_->Parse({"/%AF=2021-05-04/%BF=2021-05-04 07%3A27%3A00/str=%24", ""}));
@@ -999,12 +975,9 @@ TEST(TestStripPrefixAndFilename, Basic) {
   std::vector<std::string> input{"/data/year=2019/file.parquet",
                                  "/data/year=2019/month=12/file.parquet",
                                  "/data/year=2019/month=12/day=01/file.parquet"};
-  auto paths = StripPrefixAndFilename(input, "/data");
-  std::vector<std::string> path_directories;
-  std::transform(paths.begin(), paths.end(), std::back_inserter(path_directories),
-                 [](PartitionPathFormat const& path) { return path.directory; });
-  EXPECT_THAT(path_directories, testing::ElementsAre("year=2019", "year=2019/month=12",
-                                                     "year=2019/month=12/day=01"));
+  EXPECT_THAT(StripPrefixAndFilename(input, "/data"),
+              testing::ElementsAre("year=2019", "year=2019/month=12",
+                                   "year=2019/month=12/day=01"));
 }
 
 }  // namespace dataset
diff --git a/r/autobrew b/r/autobrew
new file mode 100644
index 00000000000..e653909eb57
--- /dev/null
+++ b/r/autobrew
@@ -0,0 +1,59 @@
+# Set the default bundle version in case we don't have the exact package version
+VERSION="${VERSION:0:5}"
+if [ "$VERSION" != "3.0.0" ] && [ "$VERSION" != "4.0.1" ] && [ "$VERSION" != "5.0.0" ] && [ "$VERSION" != "6.0.0" ] && [ "$VERSION" != "6.0.1" ] && [ "$VERSION" != "7.0.0" ]; then
+VERSION="7.0.0"
+fi
+
+# Find a bundle
+if [ $(arch | grep arm) ]; then
+bottle="https://autobrew.github.io/archive/arm64_big_sur/apache-arrow-${VERSION}-arm64_big_sur.tar.xz"
+# See: https://github.com/Homebrew/homebrew-core/issues/76537
+#elif [[ ${OSTYPE:6:2} -ge 20 ]]; then
+#bottle="https://autobrew.github.io/archive/big_sur/apache-arrow-${VERSION}-big_sur.tar.xz"
+elif [[ ${OSTYPE:6:2} -lt 17 ]] && [[ ${VERSION:0:1} -lt 7 ]]; then
+bottle="https://autobrew.github.io/archive/el_capitan/apache-arrow-${VERSION}-el_capitan.tar.xz"
+else
+bottle="https://autobrew.github.io/archive/high_sierra/apache-arrow-${VERSION}-high_sierra.tar.xz"
+fi
+
+# Skip if disabled
+if [ "$DISABLE_AUTOBREW" ]; then return 0; fi
+
+# Debug
+echo "Using autobrew bundle: $(basename $bottle)"
+
+# General setup
+BREWDIR="$PWD/.deps"
+mkdir -p $BREWDIR
+curl -sSL $bottle -o libs.tar.xz
+tar -xf libs.tar.xz --strip 1 -C $BREWDIR
+rm -f libs.tar.xz
+
+# Hardcoded flags
+AWS_LIBS="-laws-cpp-sdk-config -laws-cpp-sdk-transfer -laws-cpp-sdk-identity-management -laws-cpp-sdk-cognito-identity -laws-cpp-sdk-sts -laws-cpp-sdk-s3 -laws-cpp-sdk-core -laws-c-event-stream -laws-checksums -laws-c-common"
+
+# Aws-sdk-cpp 1.9.xxx requires extra flags
+if [ -f "$BREWDIR/lib/libaws-crt-cpp.a" ]; then
+echo "Linking to aws-sdk-cpp 1.9"
+AWS_LIBS="$AWS_LIBS -laws-crt-cpp -laws-c-io -laws-c-s3 -laws-c-auth -laws-c-http -laws-c-cal -laws-c-compression -laws-c-mqtt"
+fi
+
+PKG_LIBS="-L$BREWDIR/lib -lparquet -larrow_dataset -larrow -larrow_bundled_dependencies -lthrift -llz4 -lsnappy -lzstd $AWS_LIBS -lpthread -lcurl"
+PKG_CFLAGS="-I$BREWDIR/include -DARROW_R_WITH_PARQUET -DARROW_R_WITH_DATASET -DARROW_R_WITH_JSON -DARROW_R_WITH_S3"
+
+# Prevent linking against other libs in /usr/local/lib
+for FILE in $BREWDIR/lib/*.a; do
+  BASENAME=$(basename $FILE)
+  LIBNAME=$(echo "${BASENAME%.*}" | cut -c4-)
+  cp -f $FILE $BREWDIR/lib/libbrew$LIBNAME.a
+  PKG_LIBS=$(echo $PKG_LIBS | sed "s/-l$LIBNAME /-lbrew$LIBNAME /g")
+done
+
+# Cleanup
+echo "rm -Rf .deps" >> cleanup
+chmod +x cleanup
+
+# Disable tests on R 3.6 due to buggy custom CRAN compiler
+if [ "${R_VERSION:0:1}" = "3" ]; then
+  rm -f tests/testthat.R
+fi
diff --git a/r/src/arrowExports.cpp b/r/src/arrowExports.cpp
index 14eb6eb397a..312b778aada 100644
--- a/r/src/arrowExports.cpp
+++ b/r/src/arrowExports.cpp
@@ -1767,11 +1767,11 @@ extern "C" SEXP _arrow_dataset___HivePartitioning__MakeFactory(SEXP null_fallbac
 
 // dataset.cpp
 #if defined(ARROW_R_WITH_DATASET)
-std::shared_ptr<arrow::Schema> dataset___PartitioningFactory__Inspect(const std::shared_ptr<ds::PartitioningFactory>& factory, const std::vector<std::pair<std::string, std::string>>& paths);
+std::shared_ptr<arrow::Schema> dataset___PartitioningFactory__Inspect(const std::shared_ptr<ds::PartitioningFactory>& factory, const std::vector<std::string>& paths);
 extern "C" SEXP _arrow_dataset___PartitioningFactory__Inspect(SEXP factory_sexp, SEXP paths_sexp){
 BEGIN_CPP11
 	arrow::r::Input<const std::shared_ptr<ds::PartitioningFactory>&>::type factory(factory_sexp);
-	arrow::r::Input<const std::vector<std::pair<std::string, std::string>>&>::type paths(paths_sexp);
+	arrow::r::Input<const std::vector<std::string>&>::type paths(paths_sexp);
 	return cpp11::as_sexp(dataset___PartitioningFactory__Inspect(factory, paths));
 END_CPP11
 }
diff --git a/r/src/arrow_types.h b/r/src/arrow_types.h
index d9fee37e7f1..5a931ccb8de 100644
--- a/r/src/arrow_types.h
+++ b/r/src/arrow_types.h
@@ -36,6 +36,7 @@
 
 #if defined(ARROW_R_WITH_DATASET)
 #include <arrow/dataset/type_fwd.h>
+#include <arrow/dataset/partition.h>
 #endif
 
 #include <arrow/filesystem/type_fwd.h>
diff --git a/r/src/dataset.cpp b/r/src/dataset.cpp
index 7a3273e9abc..aa44a0721e3 100644
--- a/r/src/dataset.cpp
+++ b/r/src/dataset.cpp
@@ -386,12 +386,8 @@ std::shared_ptr<ds::PartitioningFactory> dataset___HivePartitioning__MakeFactory
 // [[dataset::export]]
 std::shared_ptr<arrow::Schema> dataset___PartitioningFactory__Inspect(
     const std::shared_ptr<ds::PartitioningFactory>& factory,
-    const std::vector<std::pair<std::string, std::string>>& paths) {
-  std::vector<ds::PartitionPathFormat> partition_paths(paths.size());
-  for (auto& path : paths) {
-    partition_paths.emplace_back(ds::PartitionPathFormat{path.first, path.second});
-  }
-  return ValueOrStop(factory->Inspect(partition_paths));
+    const std::vector<std::string>& paths) {
+  return ValueOrStop(factory->Inspect(paths));
 }
 
 // [[dataset::export]]

From 267ac94b5d61ee6ca428048df35489a004725162 Mon Sep 17 00:00:00 2001
From: Sanjiban Sengupta <sanjiban.sg@gmail.com>
Date: Thu, 5 May 2022 19:16:44 +0530
Subject: [PATCH 16/24] fix: GetOrInferSchema() to have PartitionPathFormat in
 argument

---
 cpp/src/arrow/dataset/partition.cc      | 29 +++++++-----
 cpp/src/arrow/dataset/partition.h       |  7 +--
 cpp/src/arrow/dataset/partition_test.cc |  9 ++--
 r/autobrew                              | 59 -------------------------
 4 files changed, 29 insertions(+), 75 deletions(-)
 delete mode 100644 r/autobrew

diff --git a/cpp/src/arrow/dataset/partition.cc b/cpp/src/arrow/dataset/partition.cc
index 6e636b88cc0..49f50622df1 100644
--- a/cpp/src/arrow/dataset/partition.cc
+++ b/cpp/src/arrow/dataset/partition.cc
@@ -814,33 +814,42 @@ PartitionPathFormat StripPrefixAndFilename(const std::string& path,
                              std::move(basename_filename.second)};
 }
 
-std::vector<std::string> StripPrefixAndFilename(const std::vector<std::string>& paths,
-                                                const std::string& prefix) {
-  std::vector<std::string> result;
+std::vector<PartitionPathFormat> StripPrefixAndFilename(
+    const std::vector<std::string>& paths, const std::string& prefix) {
+  std::vector<PartitionPathFormat> result;
   result.reserve(paths.size());
   for (const auto& path : paths) {
-    result.emplace_back(StripPrefixAndFilename(path, prefix).directory);
+    result.emplace_back(StripPrefixAndFilename(path, prefix));
   }
   return result;
 }
 
-std::vector<std::string> StripPrefixAndFilename(const std::vector<fs::FileInfo>& files,
-                                                const std::string& prefix) {
-  std::vector<std::string> result;
+std::vector<PartitionPathFormat> StripPrefixAndFilename(
+    const std::vector<fs::FileInfo>& files, const std::string& prefix) {
+  std::vector<PartitionPathFormat> result;
   result.reserve(files.size());
   for (const auto& info : files) {
-    result.emplace_back(StripPrefixAndFilename(info.path(), prefix).directory);
+    result.emplace_back(StripPrefixAndFilename(info.path(), prefix));
   }
   return result;
 }
 
 Result<std::shared_ptr<Schema>> PartitioningOrFactory::GetOrInferSchema(
-    const std::vector<std::string>& paths) {
+    const std::vector<PartitionPathFormat>& paths) {
+
   if (auto part = partitioning()) {
     return part->schema();
   }
+  std::vector<std::string> partition_paths(paths.size());
+  if (factory()->type_name() == "filename") {
+    std::transform(paths.begin(), paths.end(), std::back_inserter(partition_paths),
+                   [](PartitionPathFormat const& path) { return path.filename; });
+  } else {
+    std::transform(paths.begin(), paths.end(), std::back_inserter(partition_paths),
+                   [](PartitionPathFormat const& path) { return path.directory; });
+  }
 
-  return factory()->Inspect(paths);
+  return factory()->Inspect(partition_paths);
 }
 
 }  // namespace dataset
diff --git a/cpp/src/arrow/dataset/partition.h b/cpp/src/arrow/dataset/partition.h
index e18c1d6ddbf..f5368eaa788 100644
--- a/cpp/src/arrow/dataset/partition.h
+++ b/cpp/src/arrow/dataset/partition.h
@@ -366,11 +366,11 @@ ARROW_DS_EXPORT PartitionPathFormat StripPrefixAndFilename(const std::string& pa
                                                            const std::string& prefix);
 
 /// \brief Vector version of StripPrefixAndFilename.
-ARROW_DS_EXPORT std::vector<std::string> StripPrefixAndFilename(
+ARROW_DS_EXPORT std::vector<PartitionPathFormat> StripPrefixAndFilename(
     const std::vector<std::string>& paths, const std::string& prefix);
 
 /// \brief Vector version of StripPrefixAndFilename.
-ARROW_DS_EXPORT std::vector<std::string> StripPrefixAndFilename(
+ARROW_DS_EXPORT std::vector<PartitionPathFormat> StripPrefixAndFilename(
     const std::vector<fs::FileInfo>& files, const std::string& prefix);
 
 /// \brief Either a Partitioning or a PartitioningFactory
@@ -397,7 +397,8 @@ class ARROW_DS_EXPORT PartitioningOrFactory {
   const std::shared_ptr<PartitioningFactory>& factory() const { return factory_; }
 
   /// \brief Get the partition schema, inferring it with the given factory if needed.
-  Result<std::shared_ptr<Schema>> GetOrInferSchema(const std::vector<std::string>& paths);
+  Result<std::shared_ptr<Schema>> GetOrInferSchema(
+      const std::vector<PartitionPathFormat>& paths);
 
  private:
   std::shared_ptr<PartitioningFactory> factory_;
diff --git a/cpp/src/arrow/dataset/partition_test.cc b/cpp/src/arrow/dataset/partition_test.cc
index 157a9c3af44..d872094ef89 100644
--- a/cpp/src/arrow/dataset/partition_test.cc
+++ b/cpp/src/arrow/dataset/partition_test.cc
@@ -975,9 +975,12 @@ TEST(TestStripPrefixAndFilename, Basic) {
   std::vector<std::string> input{"/data/year=2019/file.parquet",
                                  "/data/year=2019/month=12/file.parquet",
                                  "/data/year=2019/month=12/day=01/file.parquet"};
-  EXPECT_THAT(StripPrefixAndFilename(input, "/data"),
-              testing::ElementsAre("year=2019", "year=2019/month=12",
-                                   "year=2019/month=12/day=01"));
+  auto paths = StripPrefixAndFilename(input, "/data");
+  std::vector<std::string> path_directories;
+  std::transform(paths.begin(), paths.end(), std::back_inserter(path_directories),
+                 [](PartitionPathFormat const& path) { return path.directory; });
+  EXPECT_THAT(path_directories, testing::ElementsAre("year=2019", "year=2019/month=12",
+                                                     "year=2019/month=12/day=01"));
 }
 
 }  // namespace dataset
diff --git a/r/autobrew b/r/autobrew
deleted file mode 100644
index e653909eb57..00000000000
--- a/r/autobrew
+++ /dev/null
@@ -1,59 +0,0 @@
-# Set the default bundle version in case we don't have the exact package version
-VERSION="${VERSION:0:5}"
-if [ "$VERSION" != "3.0.0" ] && [ "$VERSION" != "4.0.1" ] && [ "$VERSION" != "5.0.0" ] && [ "$VERSION" != "6.0.0" ] && [ "$VERSION" != "6.0.1" ] && [ "$VERSION" != "7.0.0" ]; then
-VERSION="7.0.0"
-fi
-
-# Find a bundle
-if [ $(arch | grep arm) ]; then
-bottle="https://autobrew.github.io/archive/arm64_big_sur/apache-arrow-${VERSION}-arm64_big_sur.tar.xz"
-# See: https://github.com/Homebrew/homebrew-core/issues/76537
-#elif [[ ${OSTYPE:6:2} -ge 20 ]]; then
-#bottle="https://autobrew.github.io/archive/big_sur/apache-arrow-${VERSION}-big_sur.tar.xz"
-elif [[ ${OSTYPE:6:2} -lt 17 ]] && [[ ${VERSION:0:1} -lt 7 ]]; then
-bottle="https://autobrew.github.io/archive/el_capitan/apache-arrow-${VERSION}-el_capitan.tar.xz"
-else
-bottle="https://autobrew.github.io/archive/high_sierra/apache-arrow-${VERSION}-high_sierra.tar.xz"
-fi
-
-# Skip if disabled
-if [ "$DISABLE_AUTOBREW" ]; then return 0; fi
-
-# Debug
-echo "Using autobrew bundle: $(basename $bottle)"
-
-# General setup
-BREWDIR="$PWD/.deps"
-mkdir -p $BREWDIR
-curl -sSL $bottle -o libs.tar.xz
-tar -xf libs.tar.xz --strip 1 -C $BREWDIR
-rm -f libs.tar.xz
-
-# Hardcoded flags
-AWS_LIBS="-laws-cpp-sdk-config -laws-cpp-sdk-transfer -laws-cpp-sdk-identity-management -laws-cpp-sdk-cognito-identity -laws-cpp-sdk-sts -laws-cpp-sdk-s3 -laws-cpp-sdk-core -laws-c-event-stream -laws-checksums -laws-c-common"
-
-# Aws-sdk-cpp 1.9.xxx requires extra flags
-if [ -f "$BREWDIR/lib/libaws-crt-cpp.a" ]; then
-echo "Linking to aws-sdk-cpp 1.9"
-AWS_LIBS="$AWS_LIBS -laws-crt-cpp -laws-c-io -laws-c-s3 -laws-c-auth -laws-c-http -laws-c-cal -laws-c-compression -laws-c-mqtt"
-fi
-
-PKG_LIBS="-L$BREWDIR/lib -lparquet -larrow_dataset -larrow -larrow_bundled_dependencies -lthrift -llz4 -lsnappy -lzstd $AWS_LIBS -lpthread -lcurl"
-PKG_CFLAGS="-I$BREWDIR/include -DARROW_R_WITH_PARQUET -DARROW_R_WITH_DATASET -DARROW_R_WITH_JSON -DARROW_R_WITH_S3"
-
-# Prevent linking against other libs in /usr/local/lib
-for FILE in $BREWDIR/lib/*.a; do
-  BASENAME=$(basename $FILE)
-  LIBNAME=$(echo "${BASENAME%.*}" | cut -c4-)
-  cp -f $FILE $BREWDIR/lib/libbrew$LIBNAME.a
-  PKG_LIBS=$(echo $PKG_LIBS | sed "s/-l$LIBNAME /-lbrew$LIBNAME /g")
-done
-
-# Cleanup
-echo "rm -Rf .deps" >> cleanup
-chmod +x cleanup
-
-# Disable tests on R 3.6 due to buggy custom CRAN compiler
-if [ "${R_VERSION:0:1}" = "3" ]; then
-  rm -f tests/testthat.R
-fi

From 084fd96cb4acae99239366a29fa9b785721d2163 Mon Sep 17 00:00:00 2001
From: Sanjiban Sengupta <sanjiban.sg@gmail.com>
Date: Thu, 5 May 2022 19:29:21 +0530
Subject: [PATCH 17/24] refactor: remove including partition.h in
 r/src/arrow_types.h

---
 r/src/arrow_types.h | 1 -
 1 file changed, 1 deletion(-)

diff --git a/r/src/arrow_types.h b/r/src/arrow_types.h
index 5a931ccb8de..d9fee37e7f1 100644
--- a/r/src/arrow_types.h
+++ b/r/src/arrow_types.h
@@ -36,7 +36,6 @@
 
 #if defined(ARROW_R_WITH_DATASET)
 #include <arrow/dataset/type_fwd.h>
-#include <arrow/dataset/partition.h>
 #endif
 
 #include <arrow/filesystem/type_fwd.h>

From 6133121d88d6569f4c2d1af2de2d7535a81abdc1 Mon Sep 17 00:00:00 2001
From: Sanjiban Sengupta <sanjiban.sg@gmail.com>
Date: Thu, 5 May 2022 19:53:28 +0530
Subject: [PATCH 18/24] fix: cpp lint

---
 cpp/src/arrow/dataset/partition.cc | 1 -
 1 file changed, 1 deletion(-)

diff --git a/cpp/src/arrow/dataset/partition.cc b/cpp/src/arrow/dataset/partition.cc
index 49f50622df1..732f668b01f 100644
--- a/cpp/src/arrow/dataset/partition.cc
+++ b/cpp/src/arrow/dataset/partition.cc
@@ -836,7 +836,6 @@ std::vector<PartitionPathFormat> StripPrefixAndFilename(
 
 Result<std::shared_ptr<Schema>> PartitioningOrFactory::GetOrInferSchema(
     const std::vector<PartitionPathFormat>& paths) {
-
   if (auto part = partitioning()) {
     return part->schema();
   }

From 14b27118a1c28ff9e2471443486f353d014b7de2 Mon Sep 17 00:00:00 2001
From: Sanjiban Sengupta <sanjiban.sg@gmail.com>
Date: Mon, 23 May 2022 12:52:30 +0530
Subject: [PATCH 19/24] feat: Parse() method to take the complete path

---
 cpp/src/arrow/dataset/discovery.cc      |   2 +-
 cpp/src/arrow/dataset/file_base.cc      |  12 +--
 cpp/src/arrow/dataset/file_parquet.cc   |  12 +--
 cpp/src/arrow/dataset/partition.cc      |  46 ++++----
 cpp/src/arrow/dataset/partition.h       |  26 ++---
 cpp/src/arrow/dataset/partition_test.cc | 137 ++++++++++++------------
 6 files changed, 110 insertions(+), 125 deletions(-)

diff --git a/cpp/src/arrow/dataset/discovery.cc b/cpp/src/arrow/dataset/discovery.cc
index f8382aa4052..25fa7ff2b70 100644
--- a/cpp/src/arrow/dataset/discovery.cc
+++ b/cpp/src/arrow/dataset/discovery.cc
@@ -279,7 +279,7 @@ Result<std::shared_ptr<Dataset>> FileSystemDatasetFactory::Finish(FinishOptions
 
   std::vector<std::shared_ptr<FileFragment>> fragments;
   for (const auto& info : files_) {
-    auto fixed_path = StripPrefixAndFilename(info.path(), options_.partition_base_dir);
+    auto fixed_path = StripPrefix(info.path(), options_.partition_base_dir);
     ARROW_ASSIGN_OR_RAISE(auto partition, partitioning->Parse(fixed_path));
     ARROW_ASSIGN_OR_RAISE(auto fragment, format_->MakeFragment({info, fs_}, partition));
     fragments.push_back(fragment);
diff --git a/cpp/src/arrow/dataset/file_base.cc b/cpp/src/arrow/dataset/file_base.cc
index 10277810575..ef5b7dbde4e 100644
--- a/cpp/src/arrow/dataset/file_base.cc
+++ b/cpp/src/arrow/dataset/file_base.cc
@@ -273,7 +273,7 @@ namespace {
 Status WriteBatch(std::shared_ptr<RecordBatch> batch, compute::Expression guarantee,
                   FileSystemDatasetWriteOptions write_options,
                   std::function<Status(std::shared_ptr<RecordBatch>,
-                                       const Partitioning::PartitionPathFormat&)>
+                                       const PartitionPathFormat&)>
                       write) {
   ARROW_ASSIGN_OR_RAISE(auto groups, write_options.partitioning->Partition(batch));
   batch.reset();  // drop to hopefully conserve memory
@@ -292,7 +292,7 @@ Status WriteBatch(std::shared_ptr<RecordBatch> batch, compute::Expression guaran
   for (std::size_t index = 0; index < groups.batches.size(); index++) {
     auto partition_expression = and_(groups.expressions[index], guarantee);
     auto next_batch = groups.batches[index];
-    Partitioning::PartitionPathFormat destination;
+    PartitionPathFormat destination;
     ARROW_ASSIGN_OR_RAISE(destination,
                           write_options.partitioning->Format(partition_expression));
     RETURN_NOT_OK(write(next_batch, destination));
@@ -337,10 +337,10 @@ class DatasetWritingSinkNodeConsumer : public compute::SinkNodeConsumer {
     return WriteBatch(
         batch, guarantee, write_options_,
         [this](std::shared_ptr<RecordBatch> next_batch,
-               const Partitioning::PartitionPathFormat& destination) {
+               const PartitionPathFormat& destination) {
           return task_group_.AddTask([this, next_batch, destination] {
             Future<> has_room = dataset_writer_->WriteRecordBatch(
-                next_batch, destination.directory, destination.prefix);
+                next_batch, destination.directory, destination.filename);
             if (!has_room.is_finished()) {
               // We don't have to worry about sequencing backpressure here since
               // task_group_ serves as our sequencer.  If batches continue to arrive after
@@ -481,11 +481,11 @@ class TeeNode : public compute::MapNode {
                         compute::Expression guarantee) {
     return WriteBatch(batch, guarantee, write_options_,
                       [this](std::shared_ptr<RecordBatch> next_batch,
-                             const Partitioning::PartitionPathFormat& destination) {
+                             const PartitionPathFormat& destination) {
                         return task_group_.AddTask([this, next_batch, destination] {
                           util::tracing::Span span;
                           Future<> has_room = dataset_writer_->WriteRecordBatch(
-                              next_batch, destination.directory, destination.prefix);
+                              next_batch, destination.directory, destination.filename);
                           if (!has_room.is_finished()) {
                             this->Pause();
                             return has_room.Then([this] { this->Resume(); });
diff --git a/cpp/src/arrow/dataset/file_parquet.cc b/cpp/src/arrow/dataset/file_parquet.cc
index 4b1ded92a4a..0ab72a877ba 100644
--- a/cpp/src/arrow/dataset/file_parquet.cc
+++ b/cpp/src/arrow/dataset/file_parquet.cc
@@ -919,18 +919,10 @@ Result<std::vector<std::shared_ptr<Schema>>> ParquetDatasetFactory::InspectSchem
     std::vector<std::string> stripped(paths_with_row_group_ids_.size());
 
     size_t i = 0;
-
-    if (options_.partitioning.factory()->type_name() == "filename") {
-      for (const auto& e : paths_with_row_group_ids_) {
+    for (const auto& e : paths_with_row_group_ids_) {
         stripped[i++] =
-            StripPrefixAndFilename(e.first, options_.partition_base_dir).filename;
+            StripPrefixAndFilename(e.first, options_.partition_base_dir);
       }
-    } else {
-      for (const auto& e : paths_with_row_group_ids_) {
-        stripped[i++] =
-            StripPrefixAndFilename(e.first, options_.partition_base_dir).directory;
-      }
-    }
     ARROW_ASSIGN_OR_RAISE(auto partition_schema, factory->Inspect(stripped));
 
     schemas.push_back(std::move(partition_schema));
diff --git a/cpp/src/arrow/dataset/partition.cc b/cpp/src/arrow/dataset/partition.cc
index 732f668b01f..4c0426e579c 100644
--- a/cpp/src/arrow/dataset/partition.cc
+++ b/cpp/src/arrow/dataset/partition.cc
@@ -82,7 +82,7 @@ std::shared_ptr<Partitioning> Partitioning::Default() {
 
     std::string type_name() const override { return "default"; }
 
-    Result<compute::Expression> Parse(const PartitionPathFormat& path) const override {
+    Result<compute::Expression> Parse(const std::string& path) const override {
       return compute::literal(true);
     }
 
@@ -251,7 +251,7 @@ Result<compute::Expression> KeyValuePartitioning::ConvertKey(const Key& key) con
 }
 
 Result<compute::Expression> KeyValuePartitioning::Parse(
-    const PartitionPathFormat& path) const {
+    const std::string& path) const {
   std::vector<compute::Expression> expressions;
 
   ARROW_ASSIGN_OR_RAISE(auto parsed, ParseKeys(path));
@@ -377,8 +377,8 @@ DirectoryPartitioning::DirectoryPartitioning(std::shared_ptr<Schema> schema,
 }
 
 Result<std::vector<KeyValuePartitioning::Key>> DirectoryPartitioning::ParseKeys(
-    const PartitionPathFormat& path) const {
-  std::vector<std::string> segments = fs::internal::SplitAbstractPath(path.directory);
+    const std::string& path) const {
+  std::vector<std::string> segments = fs::internal::SplitAbstractPath(fs::internal::GetAbstractPathParent(path).first);
   return ParsePartitionSegments(segments);
 }
 
@@ -390,9 +390,9 @@ FilenamePartitioning::FilenamePartitioning(std::shared_ptr<Schema> schema,
 }
 
 Result<std::vector<KeyValuePartitioning::Key>> FilenamePartitioning::ParseKeys(
-    const PartitionPathFormat& path) const {
+    const std::string& path) const {
   std::vector<std::string> segments = fs::internal::SplitAbstractPath(
-      StripNonPrefix(path.filename), kFilenamePartitionSep);
+      StripNonPrefix(path), kFilenamePartitionSep);
   return ParsePartitionSegments(segments);
 }
 
@@ -719,10 +719,10 @@ Result<util::optional<KeyValuePartitioning::Key>> HivePartitioning::ParseKey(
 }
 
 Result<std::vector<KeyValuePartitioning::Key>> HivePartitioning::ParseKeys(
-    const PartitionPathFormat& path) const {
+    const std::string& path) const {
   std::vector<Key> keys;
 
-  for (const auto& segment : fs::internal::SplitAbstractPath(path.directory)) {
+  for (const auto& segment : fs::internal::SplitAbstractPath(fs::internal::GetAbstractPathParent(path).first)) {
     ARROW_ASSIGN_OR_RAISE(auto maybe_key, ParseKey(segment, hive_options_));
     if (auto key = maybe_key) {
       keys.push_back(std::move(*key));
@@ -805,18 +805,20 @@ std::shared_ptr<PartitioningFactory> HivePartitioning::MakeFactory(
   return std::shared_ptr<PartitioningFactory>(new HivePartitioningFactory(options));
 }
 
-PartitionPathFormat StripPrefixAndFilename(const std::string& path,
-                                           const std::string& prefix) {
+std::string StripPrefix(const std::string& path, const std::string& prefix) {
   auto maybe_base_less = fs::internal::RemoveAncestor(prefix, path);
   auto base_less = maybe_base_less ? std::string(*maybe_base_less) : path;
-  auto basename_filename = fs::internal::GetAbstractPathParent(base_less);
-  return PartitionPathFormat{std::move(basename_filename.first),
-                             std::move(basename_filename.second)};
+  return base_less;
 }
 
-std::vector<PartitionPathFormat> StripPrefixAndFilename(
+std::string StripPrefixAndFilename(const std::string& path, const std::string& prefix) {
+  auto base_less = StripPrefix(path, prefix);
+  auto basename_filename = fs::internal::GetAbstractPathParent(base_less);
+  return basename_filename.first;
+}
+std::vector<std::string> StripPrefixAndFilename(
     const std::vector<std::string>& paths, const std::string& prefix) {
-  std::vector<PartitionPathFormat> result;
+  std::vector<std::string> result;
   result.reserve(paths.size());
   for (const auto& path : paths) {
     result.emplace_back(StripPrefixAndFilename(path, prefix));
@@ -824,9 +826,9 @@ std::vector<PartitionPathFormat> StripPrefixAndFilename(
   return result;
 }
 
-std::vector<PartitionPathFormat> StripPrefixAndFilename(
+std::vector<std::string> StripPrefixAndFilename(
     const std::vector<fs::FileInfo>& files, const std::string& prefix) {
-  std::vector<PartitionPathFormat> result;
+  std::vector<std::string> result;
   result.reserve(files.size());
   for (const auto& info : files) {
     result.emplace_back(StripPrefixAndFilename(info.path(), prefix));
@@ -835,19 +837,11 @@ std::vector<PartitionPathFormat> StripPrefixAndFilename(
 }
 
 Result<std::shared_ptr<Schema>> PartitioningOrFactory::GetOrInferSchema(
-    const std::vector<PartitionPathFormat>& paths) {
+    const std::vector<std::string>& paths) {
   if (auto part = partitioning()) {
     return part->schema();
   }
   std::vector<std::string> partition_paths(paths.size());
-  if (factory()->type_name() == "filename") {
-    std::transform(paths.begin(), paths.end(), std::back_inserter(partition_paths),
-                   [](PartitionPathFormat const& path) { return path.filename; });
-  } else {
-    std::transform(paths.begin(), paths.end(), std::back_inserter(partition_paths),
-                   [](PartitionPathFormat const& path) { return path.directory; });
-  }
-
   return factory()->Inspect(partition_paths);
 }
 
diff --git a/cpp/src/arrow/dataset/partition.h b/cpp/src/arrow/dataset/partition.h
index f5368eaa788..0bfef786531 100644
--- a/cpp/src/arrow/dataset/partition.h
+++ b/cpp/src/arrow/dataset/partition.h
@@ -80,7 +80,7 @@ class ARROW_DS_EXPORT Partitioning {
       const std::shared_ptr<RecordBatch>& batch) const = 0;
 
   /// \brief Parse a path into a partition expression
-  virtual Result<compute::Expression> Parse(const PartitionPathFormat& path) const = 0;
+  virtual Result<compute::Expression> Parse(const std::string& path) const = 0;
 
   virtual Result<PartitionPathFormat> Format(const compute::Expression& expr) const = 0;
 
@@ -174,7 +174,7 @@ class ARROW_DS_EXPORT KeyValuePartitioning : public Partitioning {
   Result<PartitionedBatches> Partition(
       const std::shared_ptr<RecordBatch>& batch) const override;
 
-  Result<compute::Expression> Parse(const PartitionPathFormat& path) const override;
+  Result<compute::Expression> Parse(const std::string& path) const override;
 
   Result<PartitionPathFormat> Format(const compute::Expression& expr) const override;
 
@@ -191,7 +191,7 @@ class ARROW_DS_EXPORT KeyValuePartitioning : public Partitioning {
     }
   }
 
-  virtual Result<std::vector<Key>> ParseKeys(const PartitionPathFormat& path) const = 0;
+  virtual Result<std::vector<Key>> ParseKeys(const std::string& path) const = 0;
 
   virtual Result<PartitionPathFormat> FormatValues(const ScalarVector& values) const = 0;
 
@@ -231,7 +231,7 @@ class ARROW_DS_EXPORT DirectoryPartitioning : public KeyValuePartitioning {
       std::vector<std::string> field_names, PartitioningFactoryOptions = {});
 
  private:
-  Result<std::vector<Key>> ParseKeys(const PartitionPathFormat& path) const override;
+  Result<std::vector<Key>> ParseKeys(const std::string& path) const override;
 
   Result<PartitionPathFormat> FormatValues(const ScalarVector& values) const override;
 };
@@ -288,7 +288,7 @@ class ARROW_DS_EXPORT HivePartitioning : public KeyValuePartitioning {
 
  private:
   const HivePartitioningOptions hive_options_;
-  Result<std::vector<Key>> ParseKeys(const PartitionPathFormat& path) const override;
+  Result<std::vector<Key>> ParseKeys(const std::string& path) const override;
 
   Result<PartitionPathFormat> FormatValues(const ScalarVector& values) const override;
 };
@@ -310,8 +310,8 @@ class ARROW_DS_EXPORT FunctionPartitioning : public Partitioning {
 
   std::string type_name() const override { return name_; }
 
-  Result<compute::Expression> Parse(const PartitionPathFormat& path) const override {
-    return parse_impl_(path.directory);
+  Result<compute::Expression> Parse(const std::string& path) const override {
+    return parse_impl_(path);
   }
 
   Result<PartitionPathFormat> Format(const compute::Expression& expr) const override {
@@ -353,24 +353,26 @@ class ARROW_DS_EXPORT FilenamePartitioning : public KeyValuePartitioning {
       std::vector<std::string> field_names, PartitioningFactoryOptions = {});
 
  private:
-  Result<std::vector<Key>> ParseKeys(const PartitionPathFormat& path) const override;
+  Result<std::vector<Key>> ParseKeys(const std::string& path) const override;
 
   Result<PartitionPathFormat> FormatValues(const ScalarVector& values) const override;
 };
 
+ARROW_DS_EXPORT std::string StripPrefix(const std::string& path, const std::string& prefix);
+
 /// \brief Extracts the directory and filename and removes the prefix of a path
 ///
 /// e.g., `StripPrefixAndFilename("/data/year=2019/c.txt", "/data") ->
 /// {"year=2019","c.txt"}`
-ARROW_DS_EXPORT PartitionPathFormat StripPrefixAndFilename(const std::string& path,
+ARROW_DS_EXPORT std::string StripPrefixAndFilename(const std::string& path,
                                                            const std::string& prefix);
 
 /// \brief Vector version of StripPrefixAndFilename.
-ARROW_DS_EXPORT std::vector<PartitionPathFormat> StripPrefixAndFilename(
+ARROW_DS_EXPORT std::vector<std::string> StripPrefixAndFilename(
     const std::vector<std::string>& paths, const std::string& prefix);
 
 /// \brief Vector version of StripPrefixAndFilename.
-ARROW_DS_EXPORT std::vector<PartitionPathFormat> StripPrefixAndFilename(
+ARROW_DS_EXPORT std::vector<std::string> StripPrefixAndFilename(
     const std::vector<fs::FileInfo>& files, const std::string& prefix);
 
 /// \brief Either a Partitioning or a PartitioningFactory
@@ -398,7 +400,7 @@ class ARROW_DS_EXPORT PartitioningOrFactory {
 
   /// \brief Get the partition schema, inferring it with the given factory if needed.
   Result<std::shared_ptr<Schema>> GetOrInferSchema(
-      const std::vector<PartitionPathFormat>& paths);
+      const std::vector<std::string>& paths);
 
  private:
   std::shared_ptr<PartitioningFactory> factory_;
diff --git a/cpp/src/arrow/dataset/partition_test.cc b/cpp/src/arrow/dataset/partition_test.cc
index d872094ef89..1198dd9486b 100644
--- a/cpp/src/arrow/dataset/partition_test.cc
+++ b/cpp/src/arrow/dataset/partition_test.cc
@@ -43,11 +43,11 @@ namespace dataset {
 
 class TestPartitioning : public ::testing::Test {
  public:
-  void AssertParseError(const PartitionPathFormat& path) {
+  void AssertParseError(const std::string& path) {
     ASSERT_RAISES(Invalid, partitioning_->Parse(path));
   }
 
-  void AssertParse(const PartitionPathFormat& path, compute::Expression expected) {
+  void AssertParse(const std::string& path, compute::Expression expected) {
     ASSERT_OK_AND_ASSIGN(auto parsed, partitioning_->Parse(path));
     ASSERT_EQ(parsed, expected);
   }
@@ -64,11 +64,14 @@ class TestPartitioning : public ::testing::Test {
     ASSERT_OK_AND_ASSIGN(auto formatted, partitioning_->Format(expr));
     ASSERT_EQ(formatted.directory, expected_directory);
     ASSERT_EQ(formatted.filename, expected_filename);
-
+    
+    // if ((formatted.filename).empty()){
+    //   formatted.filename = "format.parquet";
+    // }
     // ensure the formatted path round trips the relevant components of the partition
     // expression: roundtripped should be a subset of expr
     ASSERT_OK_AND_ASSIGN(compute::Expression roundtripped,
-                         partitioning_->Parse(formatted));
+                         partitioning_->Parse(formatted.directory+formatted.filename));
     ASSERT_OK_AND_ASSIGN(roundtripped, roundtripped.Bind(*written_schema_));
     ASSERT_OK_AND_ASSIGN(auto simplified, SimplifyWithGuarantee(roundtripped, expr));
     ASSERT_EQ(simplified, literal(true));
@@ -187,20 +190,20 @@ TEST_F(TestPartitioning, DirectoryPartitioning) {
   partitioning_ = std::make_shared<DirectoryPartitioning>(
       schema({field("alpha", int32()), field("beta", utf8())}));
 
-  AssertParse({"/0/hello", ""}, and_(equal(field_ref("alpha"), literal(0)),
+  AssertParse("/0/hello", and_(equal(field_ref("alpha"), literal(0)),
                                      equal(field_ref("beta"), literal("hello"))));
-  AssertParse({"/3", ""}, equal(field_ref("alpha"), literal(3)));
-  AssertParseError({"/world/0", ""});    // reversed order
-  AssertParseError({"/0.0/foo", ""});    // invalid alpha
-  AssertParseError({"/3.25", ""});       // invalid alpha with missing beta
-  AssertParse({"", ""}, literal(true));  // no segments to parse
+  AssertParse("/3", equal(field_ref("alpha"), literal(3)));
+  AssertParseError("/world/0");    // reversed order
+  AssertParseError("/0.0/foo");    // invalid alpha
+  AssertParseError("/3.25");       // invalid alpha with missing beta
+  AssertParse("", literal(true));  // no segments to parse
 
   // gotcha someday:
-  AssertParse({"/0/dat.parquet", ""},
+  AssertParse("/0/dat.parquet",
               and_(equal(field_ref("alpha"), literal(0)),
                    equal(field_ref("beta"), literal("dat.parquet"))));
 
-  AssertParse({"/0/foo/ignored=2341", ""},
+  AssertParse("/0/foo/ignored=2341",
               and_(equal(field_ref("alpha"), literal(0)),
                    equal(field_ref("beta"), literal("foo"))));
 }
@@ -209,15 +212,15 @@ TEST_F(TestPartitioning, FilenamePartitioning) {
   partitioning_ = std::make_shared<FilenamePartitioning>(
       schema({field("alpha", int32()), field("beta", utf8())}));
 
-  AssertParse({"", "0_hello_"}, and_(equal(field_ref("alpha"), literal(0)),
+  AssertParse("0_hello_", and_(equal(field_ref("alpha"), literal(0)),
                                      equal(field_ref("beta"), literal("hello"))));
-  AssertParse({"", "0_"}, equal(field_ref("alpha"), literal(0)));
-  AssertParseError({"", "world_0_"});    // reversed order
-  AssertParseError({"", "0.0_foo_"});    // invalid alpha
-  AssertParseError({"", "3.25_"});       // invalid alpha with missing beta
-  AssertParse({"", ""}, literal(true));  // no segments to parse
+  AssertParse("0_", equal(field_ref("alpha"), literal(0)));
+  AssertParseError("world_0_");    // reversed order
+  AssertParseError("0.0_foo_");    // invalid alpha
+  AssertParseError("3.25_");       // invalid alpha with missing beta
+  AssertParse("", literal(true));  // no segments to parse
 
-  AssertParse({"", "0_foo_ignored=2341"}, and_(equal(field_ref("alpha"), literal(0)),
+  AssertParse("0_foo_ignored=2341", and_(equal(field_ref("alpha"), literal(0)),
                                                equal(field_ref("beta"), literal("foo"))));
 }
 
@@ -284,7 +287,7 @@ TEST_F(TestPartitioning, DirectoryPartitioningWithTemporal) {
         schema({field("year", int32()), field("month", int8()), field("day", temporal)}));
 
     ASSERT_OK_AND_ASSIGN(auto day, StringScalar("2020-06-08").CastTo(temporal));
-    AssertParse({"/2020/06/2020-06-08", ""},
+    AssertParse("/2020/06/2020-06-08",
                 and_({equal(field_ref("year"), literal(2020)),
                       equal(field_ref("month"), literal<int8_t>(6)),
                       equal(field_ref("day"), literal(day))}));
@@ -386,10 +389,10 @@ TEST_F(TestPartitioning, DictionaryHasUniqueValues) {
         std::make_shared<DictionaryScalar>(index_and_dictionary, alpha->type());
 
     auto path = "/" + expected_dictionary->GetString(i);
-    AssertParse({path, ""}, equal(field_ref("alpha"), literal(dictionary_scalar)));
+    AssertParse(path, equal(field_ref("alpha"), literal(dictionary_scalar)));
   }
 
-  AssertParseError({"/yosemite", ""});  // not in inspected dictionary
+  AssertParseError("/yosemite");  // not in inspected dictionary
 }
 
 TEST_F(TestPartitioning, DiscoverSchemaSegfault) {
@@ -402,28 +405,28 @@ TEST_F(TestPartitioning, HivePartitioning) {
   partitioning_ = std::make_shared<HivePartitioning>(
       schema({field("alpha", int32()), field("beta", float32())}), ArrayVector(), "xyz");
 
-  AssertParse({"/alpha=0/beta=3.25", ""}, and_(equal(field_ref("alpha"), literal(0)),
+  AssertParse("/alpha=0/beta=3.25", and_(equal(field_ref("alpha"), literal(0)),
                                                equal(field_ref("beta"), literal(3.25f))));
-  AssertParse({"/beta=3.25/alpha=0", ""}, and_(equal(field_ref("beta"), literal(3.25f)),
+  AssertParse("/beta=3.25/alpha=0", and_(equal(field_ref("beta"), literal(3.25f)),
                                                equal(field_ref("alpha"), literal(0))));
-  AssertParse({"/alpha=0", ""}, equal(field_ref("alpha"), literal(0)));
+  AssertParse("/alpha=0", equal(field_ref("alpha"), literal(0)));
   AssertParse(
-      {"/alpha=xyz/beta=3.25", ""},
+      "/alpha=xyz/beta=3.25",
       and_(is_null(field_ref("alpha")), equal(field_ref("beta"), literal(3.25f))));
-  AssertParse({"/beta=3.25", ""}, equal(field_ref("beta"), literal(3.25f)));
-  AssertParse({"", ""}, literal(true));
+  AssertParse("/beta=3.25", equal(field_ref("beta"), literal(3.25f)));
+  AssertParse("", literal(true));
 
-  AssertParse({"/alpha=0/beta=3.25/ignored=2341", ""},
+  AssertParse("/alpha=0/beta=3.25/ignored=2341",
               and_(equal(field_ref("alpha"), literal(0)),
                    equal(field_ref("beta"), literal(3.25f))));
 
-  AssertParse({"/alpha=0/beta=3.25/ignored=2341", ""},
+  AssertParse("/alpha=0/beta=3.25/ignored=2341",
               and_(equal(field_ref("alpha"), literal(0)),
                    equal(field_ref("beta"), literal(3.25f))));
 
-  AssertParse({"/ignored=2341", ""}, literal(true));
+  AssertParse("/ignored=2341", literal(true));
 
-  AssertParseError({"/alpha=0.0/beta=3.25", ""});  // conversion of "0.0" to int32 fails
+  AssertParseError("/alpha=0.0/beta=3.25");  // conversion of "0.0" to int32 fails
 }
 
 TEST_F(TestPartitioning, HivePartitioningFormat) {
@@ -561,10 +564,10 @@ TEST_F(TestPartitioning, HiveDictionaryHasUniqueValues) {
         std::make_shared<DictionaryScalar>(index_and_dictionary, alpha->type());
 
     auto path = "/alpha=" + expected_dictionary->GetString(i);
-    AssertParse({path, ""}, equal(field_ref("alpha"), literal(dictionary_scalar)));
+    AssertParse(path, equal(field_ref("alpha"), literal(dictionary_scalar)));
   }
 
-  AssertParseError({"/alpha=yosemite", ""});  // not in inspected dictionary
+  AssertParseError("/alpha=yosemite");  // not in inspected dictionary
 }
 
 TEST_F(TestPartitioning, ExistingSchemaDirectory) {
@@ -665,7 +668,7 @@ TEST_F(TestPartitioning, UrlEncodedDirectory) {
   auto date = std::make_shared<TimestampScalar>(1620086400, ts);
   auto time = std::make_shared<TimestampScalar>(1620113220, ts);
   partitioning_ = std::make_shared<DirectoryPartitioning>(options.schema, ArrayVector());
-  AssertParse({"/2021-05-04 00%3A00%3A00/2021-05-04 07%3A27%3A00/%24", ""},
+  AssertParse("/2021-05-04 00%3A00%3A00/2021-05-04 07%3A27%3A00/%24",
               and_({equal(field_ref("date"), literal(date)),
                     equal(field_ref("time"), literal(time)),
                     equal(field_ref("str"), literal("$"))}));
@@ -674,7 +677,7 @@ TEST_F(TestPartitioning, UrlEncodedDirectory) {
   EXPECT_RAISES_WITH_MESSAGE_THAT(Invalid, ::testing::HasSubstr("was not valid UTF-8"),
                                   factory_->Inspect({"/%AF/%BF/%CF"}));
   EXPECT_RAISES_WITH_MESSAGE_THAT(Invalid, ::testing::HasSubstr("was not valid UTF-8"),
-                                  partitioning_->Parse({"/%AF/%BF/%CF", ""}));
+                                  partitioning_->Parse("/%AF/%BF/%CF"));
 
   options.segment_encoding = SegmentEncoding::None;
   options.schema =
@@ -685,7 +688,7 @@ TEST_F(TestPartitioning, UrlEncodedDirectory) {
                 options.schema->fields());
   partitioning_ = std::make_shared<DirectoryPartitioning>(
       options.schema, ArrayVector(), options.AsPartitioningOptions());
-  AssertParse({"/2021-05-04 00%3A00%3A00/2021-05-04 07%3A27%3A00/%24", ""},
+  AssertParse("/2021-05-04 00%3A00%3A00/2021-05-04 07%3A27%3A00/%24",
               and_({equal(field_ref("date"), literal("2021-05-04 00%3A00%3A00")),
                     equal(field_ref("time"), literal("2021-05-04 07%3A27%3A00")),
                     equal(field_ref("str"), literal("%24"))}));
@@ -708,16 +711,16 @@ TEST_F(TestPartitioning, UrlEncodedHive) {
   auto time = std::make_shared<TimestampScalar>(1620113220, ts);
   partitioning_ = std::make_shared<HivePartitioning>(options.schema, ArrayVector(),
                                                      options.AsHivePartitioningOptions());
-  AssertParse({"/date=2021-05-04 00:00:00/time=2021-05-04 07:27:00/str=$", ""},
+  AssertParse("/date=2021-05-04 00:00:00/time=2021-05-04 07:27:00/str=$",
               and_({equal(field_ref("date"), literal(date)),
                     equal(field_ref("time"), literal(time)), is_null(field_ref("str"))}));
   AssertParse(
-      {"/date=2021-05-04 00:00:00/time=2021-05-04 07:27:00/str=%E3%81%8F%E3%81%BE", ""},
+      "/date=2021-05-04 00:00:00/time=2021-05-04 07:27:00/str=%E3%81%8F%E3%81%BE",
       and_({equal(field_ref("date"), literal(date)),
             equal(field_ref("time"), literal(time)),
             equal(field_ref("str"), literal("\xE3\x81\x8F\xE3\x81\xBE"))}));
   // URL-encoded null fallback value
-  AssertParse({"/date=2021-05-04 00%3A00%3A00/time=2021-05-04 07%3A27%3A00/str=%24", ""},
+  AssertParse("/date=2021-05-04 00%3A00%3A00/time=2021-05-04 07%3A27%3A00/str=%24",
               and_({equal(field_ref("date"), literal(date)),
                     equal(field_ref("time"), literal(time)), is_null(field_ref("str"))}));
 
@@ -726,7 +729,7 @@ TEST_F(TestPartitioning, UrlEncodedHive) {
                                   factory_->Inspect({"/date=%AF/time=%BF/str=%CF"}));
   EXPECT_RAISES_WITH_MESSAGE_THAT(
       Invalid, ::testing::HasSubstr("was not valid UTF-8"),
-      partitioning_->Parse({"/date=%AF/time=%BF/str=%CF", ""}));
+      partitioning_->Parse("/date=%AF/time=%BF/str=%CF"));
 
   options.segment_encoding = SegmentEncoding::None;
   options.schema =
@@ -739,7 +742,7 @@ TEST_F(TestPartitioning, UrlEncodedHive) {
       options.schema->fields());
   partitioning_ = std::make_shared<HivePartitioning>(options.schema, ArrayVector(),
                                                      options.AsHivePartitioningOptions());
-  AssertParse({"/date=2021-05-04 00%3A00%3A00/time=2021-05-04 07%3A27%3A00/str=%24", ""},
+  AssertParse("/date=2021-05-04 00%3A00%3A00/time=2021-05-04 07%3A27%3A00/str=%24",
               and_({equal(field_ref("date"), literal("2021-05-04 00%3A00%3A00")),
                     equal(field_ref("time"), literal("2021-05-04 07%3A27%3A00")),
                     equal(field_ref("str"), literal("%24"))}));
@@ -749,7 +752,7 @@ TEST_F(TestPartitioning, UrlEncodedHive) {
                                   factory_->Inspect({"/date=\xAF/time=\xBF/str=\xCF"}));
   EXPECT_RAISES_WITH_MESSAGE_THAT(
       Invalid, ::testing::HasSubstr("was not valid UTF-8"),
-      partitioning_->Parse({"/date=\xAF/time=\xBF/str=\xCF", ""}));
+      partitioning_->Parse("/date=\xAF/time=\xBF/str=\xCF"));
 }
 
 TEST_F(TestPartitioning, UrlEncodedHiveWithKeyEncoded) {
@@ -773,22 +776,19 @@ TEST_F(TestPartitioning, UrlEncodedHiveWithKeyEncoded) {
   partitioning_ = std::make_shared<HivePartitioning>(options.schema, ArrayVector(),
                                                      options.AsHivePartitioningOptions());
   AssertParse(
-      {"/test%27%3B%20date=2021-05-04 00:00:00/test%27%3B%20time=2021-05-04 "
+      "/test%27%3B%20date=2021-05-04 00:00:00/test%27%3B%20time=2021-05-04 "
        "07:27:00/str=$",
-       ""},
       and_({equal(field_ref("test'; date"), literal(date)),
             equal(field_ref("test'; time"), literal(time)), is_null(field_ref("str"))}));
-  AssertParse({"/test%27%3B%20date=2021-05-04 00:00:00/test%27%3B%20time=2021-05-04 "
+  AssertParse("/test%27%3B%20date=2021-05-04 00:00:00/test%27%3B%20time=2021-05-04 "
                "07:27:00/str=%E3%81%8F%E3%81%BE",
-               ""},
               and_({equal(field_ref("test'; date"), literal(date)),
                     equal(field_ref("test'; time"), literal(time)),
                     equal(field_ref("str"), literal("\xE3\x81\x8F\xE3\x81\xBE"))}));
   // URL-encoded null fallback value
   AssertParse(
-      {"/test%27%3B%20date=2021-05-04 00%3A00%3A00/test%27%3B%20time=2021-05-04 "
+      "/test%27%3B%20date=2021-05-04 00%3A00%3A00/test%27%3B%20time=2021-05-04 "
        "07%3A27%3A00/str=%24",
-       ""},
       and_({equal(field_ref("test'; date"), literal(date)),
             equal(field_ref("test'; time"), literal(time)), is_null(field_ref("str"))}));
 
@@ -798,7 +798,7 @@ TEST_F(TestPartitioning, UrlEncodedHiveWithKeyEncoded) {
       factory_->Inspect({"/%AF=2021-05-04/time=2021-05-04 07%3A27%3A00/str=%24"}));
   EXPECT_RAISES_WITH_MESSAGE_THAT(
       Invalid, ::testing::HasSubstr("was not valid UTF-8"),
-      partitioning_->Parse({"/%AF=2021-05-04/%BF=2021-05-04 07%3A27%3A00/str=%24", ""}));
+      partitioning_->Parse("/%AF=2021-05-04/%BF=2021-05-04 07%3A27%3A00/str=%24"));
 }
 
 TEST_F(TestPartitioning, EtlThenHive) {
@@ -823,18 +823,18 @@ TEST_F(TestPartitioning, EtlThenHive) {
         auto etl_segments_end = segments.begin() + etl_fields.size();
         auto etl_path =
             fs::internal::JoinAbstractPath(segments.begin(), etl_segments_end);
-        ARROW_ASSIGN_OR_RAISE(auto etl_expr, etl_part.Parse({etl_path, ""}));
+        ARROW_ASSIGN_OR_RAISE(auto etl_expr, etl_part.Parse(etl_path));
 
         auto alphabeta_segments_end = etl_segments_end + alphabeta_fields.size();
         auto alphabeta_path =
             fs::internal::JoinAbstractPath(etl_segments_end, alphabeta_segments_end);
         ARROW_ASSIGN_OR_RAISE(auto alphabeta_expr,
-                              alphabeta_part.Parse({alphabeta_path, ""}));
+                              alphabeta_part.Parse(alphabeta_path));
 
         return and_(etl_expr, alphabeta_expr);
       });
 
-  AssertParse({"/1999/12/31/00/alpha=0/beta=3.25", ""},
+  AssertParse("/1999/12/31/00/alpha=0/beta=3.25",
               and_({equal(field_ref("year"), literal<int16_t>(1999)),
                     equal(field_ref("month"), literal<int8_t>(12)),
                     equal(field_ref("day"), literal<int8_t>(31)),
@@ -842,7 +842,7 @@ TEST_F(TestPartitioning, EtlThenHive) {
                     and_(equal(field_ref("alpha"), literal<int32_t>(0)),
                          equal(field_ref("beta"), literal<float>(3.25f)))}));
 
-  AssertParseError({"/20X6/03/21/05/alpha=0/beta=3.25", ""});
+  AssertParseError("/20X6/03/21/05/alpha=0/beta=3.25");
 }
 
 TEST_F(TestPartitioning, Set) {
@@ -883,9 +883,9 @@ TEST_F(TestPartitioning, Set) {
   auto x_in = [&](std::vector<int32_t> set) {
     return call("is_in", {field_ref("x")}, compute::SetLookupOptions{ints(set)});
   };
-  AssertParse({"/x in [1]", ""}, x_in({1}));
-  AssertParse({"/x in [1 4 5]", ""}, x_in({1, 4, 5}));
-  AssertParse({"/x in []", ""}, x_in({}));
+  AssertParse("/x in [1]", x_in({1}));
+  AssertParse("/x in [1 4 5]", x_in({1, 4, 5}));
+  AssertParse("/x in []", x_in({}));
 }
 
 // An adhoc partitioning which parses segments like "/x=[-3.25, 0.0)"
@@ -896,11 +896,11 @@ class RangePartitioning : public Partitioning {
 
   std::string type_name() const override { return "range"; }
 
-  Result<compute::Expression> Parse(const PartitionPathFormat& path) const override {
+  Result<compute::Expression> Parse(const std::string& path) const override {
     std::vector<compute::Expression> ranges;
 
     HivePartitioningOptions options;
-    for (auto segment : fs::internal::SplitAbstractPath(path.directory)) {
+    for (auto segment : fs::internal::SplitAbstractPath(path)) {
       ARROW_ASSIGN_OR_RAISE(auto key, HivePartitioning::ParseKey(segment, options));
       if (!key) {
         return Status::Invalid("can't parse '", segment, "' as a range");
@@ -955,7 +955,7 @@ TEST_F(TestPartitioning, Range) {
   partitioning_ = std::make_shared<RangePartitioning>(
       schema({field("x", float64()), field("y", float64()), field("z", float64())}));
 
-  AssertParse({"/x=[-1.5 0.0)/y=[0.0 1.5)/z=(1.5 3.0]", ""},
+  AssertParse("/x=[-1.5 0.0)/y=[0.0 1.5)/z=(1.5 3.0]",
               and_({and_(greater_equal(field_ref("x"), literal(-1.5)),
                          less(field_ref("x"), literal(0.0))),
                     and_(greater_equal(field_ref("y"), literal(0.0)),
@@ -965,21 +965,18 @@ TEST_F(TestPartitioning, Range) {
 }
 
 TEST(TestStripPrefixAndFilename, Basic) {
-  ASSERT_EQ(StripPrefixAndFilename("", "").directory, "");
-  ASSERT_EQ(StripPrefixAndFilename("a.csv", "").directory, "");
-  ASSERT_EQ(StripPrefixAndFilename("a/b.csv", "").directory, "a");
-  ASSERT_EQ(StripPrefixAndFilename("/a/b/c.csv", "/a").directory, "b");
-  ASSERT_EQ(StripPrefixAndFilename("/a/b/c/d.csv", "/a").directory, "b/c");
-  ASSERT_EQ(StripPrefixAndFilename("/a/b/c.csv", "/a/b").directory, "");
+  ASSERT_EQ(StripPrefixAndFilename("", ""), "");
+  ASSERT_EQ(StripPrefixAndFilename("a.csv", ""), "");
+  ASSERT_EQ(StripPrefixAndFilename("a/b.csv", ""), "a");
+  ASSERT_EQ(StripPrefixAndFilename("/a/b/c.csv", "/a"), "b");
+  ASSERT_EQ(StripPrefixAndFilename("/a/b/c/d.csv", "/a"), "b/c");
+  ASSERT_EQ(StripPrefixAndFilename("/a/b/c.csv", "/a/b"), "");
 
   std::vector<std::string> input{"/data/year=2019/file.parquet",
                                  "/data/year=2019/month=12/file.parquet",
                                  "/data/year=2019/month=12/day=01/file.parquet"};
   auto paths = StripPrefixAndFilename(input, "/data");
-  std::vector<std::string> path_directories;
-  std::transform(paths.begin(), paths.end(), std::back_inserter(path_directories),
-                 [](PartitionPathFormat const& path) { return path.directory; });
-  EXPECT_THAT(path_directories, testing::ElementsAre("year=2019", "year=2019/month=12",
+  EXPECT_THAT(paths, testing::ElementsAre("year=2019", "year=2019/month=12",
                                                      "year=2019/month=12/day=01"));
 }
 

From a524acca1f0dcd689bb4115ec7c375ba001dfb3f Mon Sep 17 00:00:00 2001
From: Sanjiban Sengupta <sanjiban.sg@gmail.com>
Date: Mon, 23 May 2022 12:57:20 +0530
Subject: [PATCH 20/24] feat: [Python] Parse method to take the entire path

---
 python/pyarrow/_dataset.pyx                  | 6 ++----
 python/pyarrow/includes/libarrow_dataset.pxd | 6 +-----
 2 files changed, 3 insertions(+), 9 deletions(-)

diff --git a/python/pyarrow/_dataset.pyx b/python/pyarrow/_dataset.pyx
index d75caeacdc4..7e3fe0bfe21 100644
--- a/python/pyarrow/_dataset.pyx
+++ b/python/pyarrow/_dataset.pyx
@@ -1319,11 +1319,9 @@ cdef class Partitioning(_Weakrefable):
     cdef inline shared_ptr[CPartitioning] unwrap(self):
         return self.wrapped
 
-    def parse(self, directory="", filename=""):
+    def parse(self, path):
         cdef CResult[CExpression] result
-        cdef CPartitionPathFormat path
-        path.directory = tobytes(directory)
-        path.filename = tobytes(filename)
+        path = tobytes(path)
         result = self.partitioning.Parse(path)
         return Expression.wrap(GetResultValue(result))
 
diff --git a/python/pyarrow/includes/libarrow_dataset.pxd b/python/pyarrow/includes/libarrow_dataset.pxd
index 8a831a3a734..64e59e05613 100644
--- a/python/pyarrow/includes/libarrow_dataset.pxd
+++ b/python/pyarrow/includes/libarrow_dataset.pxd
@@ -277,13 +277,9 @@ cdef extern from "arrow/dataset/api.h" namespace "arrow::dataset" nogil:
         CCSVConvertOptions convert_options
         CCSVReadOptions read_options
 
-    cdef cppclass CPartitionPathFormat "arrow::dataset::PartitionPathFormat":
-        c_string directory
-        c_string filename
-
     cdef cppclass CPartitioning "arrow::dataset::Partitioning":
         c_string type_name() const
-        CResult[CExpression] Parse(const CPartitionPathFormat & path) const
+        CResult[CExpression] Parse(const c_string & path) const
         const shared_ptr[CSchema] & schema()
 
     cdef cppclass CSegmentEncoding" arrow::dataset::SegmentEncoding":

From e08b2e5f704285dbac47a916be3543899cc78196 Mon Sep 17 00:00:00 2001
From: Sanjiban Sengupta <sanjiban.sg@gmail.com>
Date: Mon, 23 May 2022 14:19:03 +0530
Subject: [PATCH 21/24] fix: python test changes for new parse method signature

---
 cpp/src/arrow/dataset/file_parquet.cc |  2 +-
 cpp/src/arrow/dataset/partition.cc    |  3 +--
 python/pyarrow/_dataset.pyx           |  3 +--
 python/pyarrow/tests/test_dataset.py  | 14 +++++++-------
 4 files changed, 10 insertions(+), 12 deletions(-)

diff --git a/cpp/src/arrow/dataset/file_parquet.cc b/cpp/src/arrow/dataset/file_parquet.cc
index 0ab72a877ba..0361520892c 100644
--- a/cpp/src/arrow/dataset/file_parquet.cc
+++ b/cpp/src/arrow/dataset/file_parquet.cc
@@ -894,7 +894,7 @@ ParquetDatasetFactory::CollectParquetFragments(const Partitioning& partitioning)
     auto row_groups = Iota(metadata_subset->num_row_groups());
 
     auto partition_expression =
-        partitioning.Parse(StripPrefixAndFilename(path, options_.partition_base_dir))
+        partitioning.Parse(StripPrefix(path, options_.partition_base_dir))
             .ValueOr(compute::literal(true));
 
     ARROW_ASSIGN_OR_RAISE(
diff --git a/cpp/src/arrow/dataset/partition.cc b/cpp/src/arrow/dataset/partition.cc
index 4c0426e579c..73d4f4b433f 100644
--- a/cpp/src/arrow/dataset/partition.cc
+++ b/cpp/src/arrow/dataset/partition.cc
@@ -841,8 +841,7 @@ Result<std::shared_ptr<Schema>> PartitioningOrFactory::GetOrInferSchema(
   if (auto part = partitioning()) {
     return part->schema();
   }
-  std::vector<std::string> partition_paths(paths.size());
-  return factory()->Inspect(partition_paths);
+  return factory()->Inspect(paths);
 }
 
 }  // namespace dataset
diff --git a/python/pyarrow/_dataset.pyx b/python/pyarrow/_dataset.pyx
index 7e3fe0bfe21..10aeafbdbaa 100644
--- a/python/pyarrow/_dataset.pyx
+++ b/python/pyarrow/_dataset.pyx
@@ -1321,8 +1321,7 @@ cdef class Partitioning(_Weakrefable):
 
     def parse(self, path):
         cdef CResult[CExpression] result
-        path = tobytes(path)
-        result = self.partitioning.Parse(path)
+        result = self.partitioning.Parse(tobytes(path))
         return Expression.wrap(GetResultValue(result))
 
     @property
diff --git a/python/pyarrow/tests/test_dataset.py b/python/pyarrow/tests/test_dataset.py
index ca9ed3f2a1d..288301fd4b4 100644
--- a/python/pyarrow/tests/test_dataset.py
+++ b/python/pyarrow/tests/test_dataset.py
@@ -582,7 +582,7 @@ def test_partitioning():
     )
     assert len(partitioning.dictionaries) == 2
     assert all(x is None for x in partitioning.dictionaries)
-    expr = partitioning.parse('/3/3.14')
+    expr = partitioning.parse('/3/3.14/')
     assert isinstance(expr, ds.Expression)
 
     expected = (ds.field('group') == 3) & (ds.field('key') == 3.14)
@@ -591,7 +591,7 @@ def test_partitioning():
     with pytest.raises(pa.ArrowInvalid):
         partitioning.parse('/prefix/3/aaa')
 
-    expr = partitioning.parse('/3')
+    expr = partitioning.parse('/3/')
     expected = ds.field('group') == 3
     assert expr.equals(expected)
 
@@ -604,20 +604,20 @@ def test_partitioning():
     )
     assert len(partitioning.dictionaries) == 2
     assert all(x is None for x in partitioning.dictionaries)
-    expr = partitioning.parse('/alpha=0/beta=3')
+    expr = partitioning.parse('/alpha=0/beta=3/')
     expected = (
         (ds.field('alpha') == ds.scalar(0)) &
         (ds.field('beta') == ds.scalar(3))
     )
     assert expr.equals(expected)
 
-    expr = partitioning.parse('/alpha=xyz/beta=3')
+    expr = partitioning.parse('/alpha=xyz/beta=3/')
     expected = (
         (ds.field('alpha').is_null() & (ds.field('beta') == ds.scalar(3)))
     )
     assert expr.equals(expected)
 
-    for shouldfail in ['/alpha=one/beta=2', '/alpha=one', '/beta=two']:
+    for shouldfail in ['/alpha=one/beta=2/', '/alpha=one/', '/beta=two/']:
         with pytest.raises(pa.ArrowInvalid):
             partitioning.parse(shouldfail)
 
@@ -629,14 +629,14 @@ def test_partitioning():
     )
     assert len(partitioning.dictionaries) == 2
     assert all(x is None for x in partitioning.dictionaries)
-    expr = partitioning.parse('', '3_3.14_')
+    expr = partitioning.parse('3_3.14_')
     assert isinstance(expr, ds.Expression)
 
     expected = (ds.field('group') == 3) & (ds.field('key') == 3.14)
     assert expr.equals(expected)
 
     with pytest.raises(pa.ArrowInvalid):
-        partitioning.parse('', 'prefix_3_aaa_')
+        partitioning.parse('prefix_3_aaa_')
 
     partitioning = ds.DirectoryPartitioning(
         pa.schema([

From 1a61b0ca1891471d2d277797b37e44e0ca4d5134 Mon Sep 17 00:00:00 2001
From: Sanjiban Sengupta <sanjiban.sg@gmail.com>
Date: Mon, 23 May 2022 14:34:04 +0530
Subject: [PATCH 22/24] fix: cpp & python lint

---
 cpp/src/arrow/dataset/file_base.cc      | 10 ++--
 cpp/src/arrow/dataset/file_parquet.cc   |  5 +-
 cpp/src/arrow/dataset/partition.cc      | 22 ++++----
 cpp/src/arrow/dataset/partition.h       |  8 +--
 cpp/src/arrow/dataset/partition_test.cc | 68 +++++++++++--------------
 5 files changed, 54 insertions(+), 59 deletions(-)

diff --git a/cpp/src/arrow/dataset/file_base.cc b/cpp/src/arrow/dataset/file_base.cc
index ef5b7dbde4e..2e05706bbbc 100644
--- a/cpp/src/arrow/dataset/file_base.cc
+++ b/cpp/src/arrow/dataset/file_base.cc
@@ -270,11 +270,11 @@ Future<> FileWriter::Finish() {
 
 namespace {
 
-Status WriteBatch(std::shared_ptr<RecordBatch> batch, compute::Expression guarantee,
-                  FileSystemDatasetWriteOptions write_options,
-                  std::function<Status(std::shared_ptr<RecordBatch>,
-                                       const PartitionPathFormat&)>
-                      write) {
+Status WriteBatch(
+    std::shared_ptr<RecordBatch> batch, compute::Expression guarantee,
+    FileSystemDatasetWriteOptions write_options,
+    std::function<Status(std::shared_ptr<RecordBatch>, const PartitionPathFormat&)>
+        write) {
   ARROW_ASSIGN_OR_RAISE(auto groups, write_options.partitioning->Partition(batch));
   batch.reset();  // drop to hopefully conserve memory
 
diff --git a/cpp/src/arrow/dataset/file_parquet.cc b/cpp/src/arrow/dataset/file_parquet.cc
index 0361520892c..f2a3903208f 100644
--- a/cpp/src/arrow/dataset/file_parquet.cc
+++ b/cpp/src/arrow/dataset/file_parquet.cc
@@ -920,9 +920,8 @@ Result<std::vector<std::shared_ptr<Schema>>> ParquetDatasetFactory::InspectSchem
 
     size_t i = 0;
     for (const auto& e : paths_with_row_group_ids_) {
-        stripped[i++] =
-            StripPrefixAndFilename(e.first, options_.partition_base_dir);
-      }
+      stripped[i++] = StripPrefixAndFilename(e.first, options_.partition_base_dir);
+    }
     ARROW_ASSIGN_OR_RAISE(auto partition_schema, factory->Inspect(stripped));
 
     schemas.push_back(std::move(partition_schema));
diff --git a/cpp/src/arrow/dataset/partition.cc b/cpp/src/arrow/dataset/partition.cc
index 73d4f4b433f..d37dfd97d39 100644
--- a/cpp/src/arrow/dataset/partition.cc
+++ b/cpp/src/arrow/dataset/partition.cc
@@ -250,8 +250,7 @@ Result<compute::Expression> KeyValuePartitioning::ConvertKey(const Key& key) con
                         compute::literal(std::move(converted)));
 }
 
-Result<compute::Expression> KeyValuePartitioning::Parse(
-    const std::string& path) const {
+Result<compute::Expression> KeyValuePartitioning::Parse(const std::string& path) const {
   std::vector<compute::Expression> expressions;
 
   ARROW_ASSIGN_OR_RAISE(auto parsed, ParseKeys(path));
@@ -378,7 +377,8 @@ DirectoryPartitioning::DirectoryPartitioning(std::shared_ptr<Schema> schema,
 
 Result<std::vector<KeyValuePartitioning::Key>> DirectoryPartitioning::ParseKeys(
     const std::string& path) const {
-  std::vector<std::string> segments = fs::internal::SplitAbstractPath(fs::internal::GetAbstractPathParent(path).first);
+  std::vector<std::string> segments =
+      fs::internal::SplitAbstractPath(fs::internal::GetAbstractPathParent(path).first);
   return ParsePartitionSegments(segments);
 }
 
@@ -391,8 +391,8 @@ FilenamePartitioning::FilenamePartitioning(std::shared_ptr<Schema> schema,
 
 Result<std::vector<KeyValuePartitioning::Key>> FilenamePartitioning::ParseKeys(
     const std::string& path) const {
-  std::vector<std::string> segments = fs::internal::SplitAbstractPath(
-      StripNonPrefix(path), kFilenamePartitionSep);
+  std::vector<std::string> segments =
+      fs::internal::SplitAbstractPath(StripNonPrefix(path), kFilenamePartitionSep);
   return ParsePartitionSegments(segments);
 }
 
@@ -722,7 +722,8 @@ Result<std::vector<KeyValuePartitioning::Key>> HivePartitioning::ParseKeys(
     const std::string& path) const {
   std::vector<Key> keys;
 
-  for (const auto& segment : fs::internal::SplitAbstractPath(fs::internal::GetAbstractPathParent(path).first)) {
+  for (const auto& segment :
+       fs::internal::SplitAbstractPath(fs::internal::GetAbstractPathParent(path).first)) {
     ARROW_ASSIGN_OR_RAISE(auto maybe_key, ParseKey(segment, hive_options_));
     if (auto key = maybe_key) {
       keys.push_back(std::move(*key));
@@ -816,8 +817,9 @@ std::string StripPrefixAndFilename(const std::string& path, const std::string& p
   auto basename_filename = fs::internal::GetAbstractPathParent(base_less);
   return basename_filename.first;
 }
-std::vector<std::string> StripPrefixAndFilename(
-    const std::vector<std::string>& paths, const std::string& prefix) {
+
+std::vector<std::string> StripPrefixAndFilename(const std::vector<std::string>& paths,
+                                                const std::string& prefix) {
   std::vector<std::string> result;
   result.reserve(paths.size());
   for (const auto& path : paths) {
@@ -826,8 +828,8 @@ std::vector<std::string> StripPrefixAndFilename(
   return result;
 }
 
-std::vector<std::string> StripPrefixAndFilename(
-    const std::vector<fs::FileInfo>& files, const std::string& prefix) {
+std::vector<std::string> StripPrefixAndFilename(const std::vector<fs::FileInfo>& files,
+                                                const std::string& prefix) {
   std::vector<std::string> result;
   result.reserve(files.size());
   for (const auto& info : files) {
diff --git a/cpp/src/arrow/dataset/partition.h b/cpp/src/arrow/dataset/partition.h
index 0bfef786531..cd225f13787 100644
--- a/cpp/src/arrow/dataset/partition.h
+++ b/cpp/src/arrow/dataset/partition.h
@@ -358,14 +358,15 @@ class ARROW_DS_EXPORT FilenamePartitioning : public KeyValuePartitioning {
   Result<PartitionPathFormat> FormatValues(const ScalarVector& values) const override;
 };
 
-ARROW_DS_EXPORT std::string StripPrefix(const std::string& path, const std::string& prefix);
+ARROW_DS_EXPORT std::string StripPrefix(const std::string& path,
+                                        const std::string& prefix);
 
 /// \brief Extracts the directory and filename and removes the prefix of a path
 ///
 /// e.g., `StripPrefixAndFilename("/data/year=2019/c.txt", "/data") ->
 /// {"year=2019","c.txt"}`
 ARROW_DS_EXPORT std::string StripPrefixAndFilename(const std::string& path,
-                                                           const std::string& prefix);
+                                                   const std::string& prefix);
 
 /// \brief Vector version of StripPrefixAndFilename.
 ARROW_DS_EXPORT std::vector<std::string> StripPrefixAndFilename(
@@ -399,8 +400,7 @@ class ARROW_DS_EXPORT PartitioningOrFactory {
   const std::shared_ptr<PartitioningFactory>& factory() const { return factory_; }
 
   /// \brief Get the partition schema, inferring it with the given factory if needed.
-  Result<std::shared_ptr<Schema>> GetOrInferSchema(
-      const std::vector<std::string>& paths);
+  Result<std::shared_ptr<Schema>> GetOrInferSchema(const std::vector<std::string>& paths);
 
  private:
   std::shared_ptr<PartitioningFactory> factory_;
diff --git a/cpp/src/arrow/dataset/partition_test.cc b/cpp/src/arrow/dataset/partition_test.cc
index 1198dd9486b..b023b1d5bc5 100644
--- a/cpp/src/arrow/dataset/partition_test.cc
+++ b/cpp/src/arrow/dataset/partition_test.cc
@@ -64,14 +64,14 @@ class TestPartitioning : public ::testing::Test {
     ASSERT_OK_AND_ASSIGN(auto formatted, partitioning_->Format(expr));
     ASSERT_EQ(formatted.directory, expected_directory);
     ASSERT_EQ(formatted.filename, expected_filename);
-    
+
     // if ((formatted.filename).empty()){
     //   formatted.filename = "format.parquet";
     // }
     // ensure the formatted path round trips the relevant components of the partition
     // expression: roundtripped should be a subset of expr
     ASSERT_OK_AND_ASSIGN(compute::Expression roundtripped,
-                         partitioning_->Parse(formatted.directory+formatted.filename));
+                         partitioning_->Parse(formatted.directory + formatted.filename));
     ASSERT_OK_AND_ASSIGN(roundtripped, roundtripped.Bind(*written_schema_));
     ASSERT_OK_AND_ASSIGN(auto simplified, SimplifyWithGuarantee(roundtripped, expr));
     ASSERT_EQ(simplified, literal(true));
@@ -191,7 +191,7 @@ TEST_F(TestPartitioning, DirectoryPartitioning) {
       schema({field("alpha", int32()), field("beta", utf8())}));
 
   AssertParse("/0/hello", and_(equal(field_ref("alpha"), literal(0)),
-                                     equal(field_ref("beta"), literal("hello"))));
+                               equal(field_ref("beta"), literal("hello"))));
   AssertParse("/3", equal(field_ref("alpha"), literal(3)));
   AssertParseError("/world/0");    // reversed order
   AssertParseError("/0.0/foo");    // invalid alpha
@@ -199,13 +199,11 @@ TEST_F(TestPartitioning, DirectoryPartitioning) {
   AssertParse("", literal(true));  // no segments to parse
 
   // gotcha someday:
-  AssertParse("/0/dat.parquet",
-              and_(equal(field_ref("alpha"), literal(0)),
-                   equal(field_ref("beta"), literal("dat.parquet"))));
+  AssertParse("/0/dat.parquet", and_(equal(field_ref("alpha"), literal(0)),
+                                     equal(field_ref("beta"), literal("dat.parquet"))));
 
-  AssertParse("/0/foo/ignored=2341",
-              and_(equal(field_ref("alpha"), literal(0)),
-                   equal(field_ref("beta"), literal("foo"))));
+  AssertParse("/0/foo/ignored=2341", and_(equal(field_ref("alpha"), literal(0)),
+                                          equal(field_ref("beta"), literal("foo"))));
 }
 
 TEST_F(TestPartitioning, FilenamePartitioning) {
@@ -213,7 +211,7 @@ TEST_F(TestPartitioning, FilenamePartitioning) {
       schema({field("alpha", int32()), field("beta", utf8())}));
 
   AssertParse("0_hello_", and_(equal(field_ref("alpha"), literal(0)),
-                                     equal(field_ref("beta"), literal("hello"))));
+                               equal(field_ref("beta"), literal("hello"))));
   AssertParse("0_", equal(field_ref("alpha"), literal(0)));
   AssertParseError("world_0_");    // reversed order
   AssertParseError("0.0_foo_");    // invalid alpha
@@ -221,7 +219,7 @@ TEST_F(TestPartitioning, FilenamePartitioning) {
   AssertParse("", literal(true));  // no segments to parse
 
   AssertParse("0_foo_ignored=2341", and_(equal(field_ref("alpha"), literal(0)),
-                                               equal(field_ref("beta"), literal("foo"))));
+                                         equal(field_ref("beta"), literal("foo"))));
 }
 
 TEST_F(TestPartitioning, DirectoryPartitioningFormat) {
@@ -406,13 +404,12 @@ TEST_F(TestPartitioning, HivePartitioning) {
       schema({field("alpha", int32()), field("beta", float32())}), ArrayVector(), "xyz");
 
   AssertParse("/alpha=0/beta=3.25", and_(equal(field_ref("alpha"), literal(0)),
-                                               equal(field_ref("beta"), literal(3.25f))));
+                                         equal(field_ref("beta"), literal(3.25f))));
   AssertParse("/beta=3.25/alpha=0", and_(equal(field_ref("beta"), literal(3.25f)),
-                                               equal(field_ref("alpha"), literal(0))));
+                                         equal(field_ref("alpha"), literal(0))));
   AssertParse("/alpha=0", equal(field_ref("alpha"), literal(0)));
-  AssertParse(
-      "/alpha=xyz/beta=3.25",
-      and_(is_null(field_ref("alpha")), equal(field_ref("beta"), literal(3.25f))));
+  AssertParse("/alpha=xyz/beta=3.25", and_(is_null(field_ref("alpha")),
+                                           equal(field_ref("beta"), literal(3.25f))));
   AssertParse("/beta=3.25", equal(field_ref("beta"), literal(3.25f)));
   AssertParse("", literal(true));
 
@@ -714,11 +711,10 @@ TEST_F(TestPartitioning, UrlEncodedHive) {
   AssertParse("/date=2021-05-04 00:00:00/time=2021-05-04 07:27:00/str=$",
               and_({equal(field_ref("date"), literal(date)),
                     equal(field_ref("time"), literal(time)), is_null(field_ref("str"))}));
-  AssertParse(
-      "/date=2021-05-04 00:00:00/time=2021-05-04 07:27:00/str=%E3%81%8F%E3%81%BE",
-      and_({equal(field_ref("date"), literal(date)),
-            equal(field_ref("time"), literal(time)),
-            equal(field_ref("str"), literal("\xE3\x81\x8F\xE3\x81\xBE"))}));
+  AssertParse("/date=2021-05-04 00:00:00/time=2021-05-04 07:27:00/str=%E3%81%8F%E3%81%BE",
+              and_({equal(field_ref("date"), literal(date)),
+                    equal(field_ref("time"), literal(time)),
+                    equal(field_ref("str"), literal("\xE3\x81\x8F\xE3\x81\xBE"))}));
   // URL-encoded null fallback value
   AssertParse("/date=2021-05-04 00%3A00%3A00/time=2021-05-04 07%3A27%3A00/str=%24",
               and_({equal(field_ref("date"), literal(date)),
@@ -727,9 +723,8 @@ TEST_F(TestPartitioning, UrlEncodedHive) {
   // Invalid UTF-8
   EXPECT_RAISES_WITH_MESSAGE_THAT(Invalid, ::testing::HasSubstr("was not valid UTF-8"),
                                   factory_->Inspect({"/date=%AF/time=%BF/str=%CF"}));
-  EXPECT_RAISES_WITH_MESSAGE_THAT(
-      Invalid, ::testing::HasSubstr("was not valid UTF-8"),
-      partitioning_->Parse("/date=%AF/time=%BF/str=%CF"));
+  EXPECT_RAISES_WITH_MESSAGE_THAT(Invalid, ::testing::HasSubstr("was not valid UTF-8"),
+                                  partitioning_->Parse("/date=%AF/time=%BF/str=%CF"));
 
   options.segment_encoding = SegmentEncoding::None;
   options.schema =
@@ -750,9 +745,8 @@ TEST_F(TestPartitioning, UrlEncodedHive) {
   // Invalid UTF-8
   EXPECT_RAISES_WITH_MESSAGE_THAT(Invalid, ::testing::HasSubstr("was not valid UTF-8"),
                                   factory_->Inspect({"/date=\xAF/time=\xBF/str=\xCF"}));
-  EXPECT_RAISES_WITH_MESSAGE_THAT(
-      Invalid, ::testing::HasSubstr("was not valid UTF-8"),
-      partitioning_->Parse("/date=\xAF/time=\xBF/str=\xCF"));
+  EXPECT_RAISES_WITH_MESSAGE_THAT(Invalid, ::testing::HasSubstr("was not valid UTF-8"),
+                                  partitioning_->Parse("/date=\xAF/time=\xBF/str=\xCF"));
 }
 
 TEST_F(TestPartitioning, UrlEncodedHiveWithKeyEncoded) {
@@ -777,18 +771,19 @@ TEST_F(TestPartitioning, UrlEncodedHiveWithKeyEncoded) {
                                                      options.AsHivePartitioningOptions());
   AssertParse(
       "/test%27%3B%20date=2021-05-04 00:00:00/test%27%3B%20time=2021-05-04 "
-       "07:27:00/str=$",
+      "07:27:00/str=$",
       and_({equal(field_ref("test'; date"), literal(date)),
             equal(field_ref("test'; time"), literal(time)), is_null(field_ref("str"))}));
-  AssertParse("/test%27%3B%20date=2021-05-04 00:00:00/test%27%3B%20time=2021-05-04 "
-               "07:27:00/str=%E3%81%8F%E3%81%BE",
-              and_({equal(field_ref("test'; date"), literal(date)),
-                    equal(field_ref("test'; time"), literal(time)),
-                    equal(field_ref("str"), literal("\xE3\x81\x8F\xE3\x81\xBE"))}));
+  AssertParse(
+      "/test%27%3B%20date=2021-05-04 00:00:00/test%27%3B%20time=2021-05-04 "
+      "07:27:00/str=%E3%81%8F%E3%81%BE",
+      and_({equal(field_ref("test'; date"), literal(date)),
+            equal(field_ref("test'; time"), literal(time)),
+            equal(field_ref("str"), literal("\xE3\x81\x8F\xE3\x81\xBE"))}));
   // URL-encoded null fallback value
   AssertParse(
       "/test%27%3B%20date=2021-05-04 00%3A00%3A00/test%27%3B%20time=2021-05-04 "
-       "07%3A27%3A00/str=%24",
+      "07%3A27%3A00/str=%24",
       and_({equal(field_ref("test'; date"), literal(date)),
             equal(field_ref("test'; time"), literal(time)), is_null(field_ref("str"))}));
 
@@ -828,8 +823,7 @@ TEST_F(TestPartitioning, EtlThenHive) {
         auto alphabeta_segments_end = etl_segments_end + alphabeta_fields.size();
         auto alphabeta_path =
             fs::internal::JoinAbstractPath(etl_segments_end, alphabeta_segments_end);
-        ARROW_ASSIGN_OR_RAISE(auto alphabeta_expr,
-                              alphabeta_part.Parse(alphabeta_path));
+        ARROW_ASSIGN_OR_RAISE(auto alphabeta_expr, alphabeta_part.Parse(alphabeta_path));
 
         return and_(etl_expr, alphabeta_expr);
       });
@@ -977,7 +971,7 @@ TEST(TestStripPrefixAndFilename, Basic) {
                                  "/data/year=2019/month=12/day=01/file.parquet"};
   auto paths = StripPrefixAndFilename(input, "/data");
   EXPECT_THAT(paths, testing::ElementsAre("year=2019", "year=2019/month=12",
-                                                     "year=2019/month=12/day=01"));
+                                          "year=2019/month=12/day=01"));
 }
 
 }  // namespace dataset

From 2ae2ea3a60da519d466ff77dbfcf3186a4c2c806 Mon Sep 17 00:00:00 2001
From: Sanjiban Sengupta <sanjiban.sg@gmail.com>
Date: Tue, 24 May 2022 04:58:59 +0530
Subject: [PATCH 23/24] fix: modify tests to have correct parse strings

---
 cpp/src/arrow/dataset/partition_test.cc | 87 +++++++++++++------------
 1 file changed, 44 insertions(+), 43 deletions(-)

diff --git a/cpp/src/arrow/dataset/partition_test.cc b/cpp/src/arrow/dataset/partition_test.cc
index b023b1d5bc5..86b8c4f0b9d 100644
--- a/cpp/src/arrow/dataset/partition_test.cc
+++ b/cpp/src/arrow/dataset/partition_test.cc
@@ -190,17 +190,17 @@ TEST_F(TestPartitioning, DirectoryPartitioning) {
   partitioning_ = std::make_shared<DirectoryPartitioning>(
       schema({field("alpha", int32()), field("beta", utf8())}));
 
-  AssertParse("/0/hello", and_(equal(field_ref("alpha"), literal(0)),
-                               equal(field_ref("beta"), literal("hello"))));
-  AssertParse("/3", equal(field_ref("alpha"), literal(3)));
-  AssertParseError("/world/0");    // reversed order
-  AssertParseError("/0.0/foo");    // invalid alpha
-  AssertParseError("/3.25");       // invalid alpha with missing beta
+  AssertParse("/0/hello/", and_(equal(field_ref("alpha"), literal(0)),
+                                equal(field_ref("beta"), literal("hello"))));
+  AssertParse("/3/", equal(field_ref("alpha"), literal(3)));
+  AssertParseError("/world/0/");   // reversed order
+  AssertParseError("/0.0/foo/");   // invalid alpha
+  AssertParseError("/3.25/");      // invalid alpha with missing beta
   AssertParse("", literal(true));  // no segments to parse
 
   // gotcha someday:
-  AssertParse("/0/dat.parquet", and_(equal(field_ref("alpha"), literal(0)),
-                                     equal(field_ref("beta"), literal("dat.parquet"))));
+  AssertParse("/0/dat.parquet/", and_(equal(field_ref("alpha"), literal(0)),
+                                      equal(field_ref("beta"), literal("dat.parquet"))));
 
   AssertParse("/0/foo/ignored=2341", and_(equal(field_ref("alpha"), literal(0)),
                                           equal(field_ref("beta"), literal("foo"))));
@@ -285,7 +285,7 @@ TEST_F(TestPartitioning, DirectoryPartitioningWithTemporal) {
         schema({field("year", int32()), field("month", int8()), field("day", temporal)}));
 
     ASSERT_OK_AND_ASSIGN(auto day, StringScalar("2020-06-08").CastTo(temporal));
-    AssertParse("/2020/06/2020-06-08",
+    AssertParse("/2020/06/2020-06-08/",
                 and_({equal(field_ref("year"), literal(2020)),
                       equal(field_ref("month"), literal<int8_t>(6)),
                       equal(field_ref("day"), literal(day))}));
@@ -386,11 +386,11 @@ TEST_F(TestPartitioning, DictionaryHasUniqueValues) {
     auto dictionary_scalar =
         std::make_shared<DictionaryScalar>(index_and_dictionary, alpha->type());
 
-    auto path = "/" + expected_dictionary->GetString(i);
+    auto path = "/" + expected_dictionary->GetString(i) + "/";
     AssertParse(path, equal(field_ref("alpha"), literal(dictionary_scalar)));
   }
 
-  AssertParseError("/yosemite");  // not in inspected dictionary
+  AssertParseError("/yosemite/");  // not in inspected dictionary
 }
 
 TEST_F(TestPartitioning, DiscoverSchemaSegfault) {
@@ -403,27 +403,27 @@ TEST_F(TestPartitioning, HivePartitioning) {
   partitioning_ = std::make_shared<HivePartitioning>(
       schema({field("alpha", int32()), field("beta", float32())}), ArrayVector(), "xyz");
 
-  AssertParse("/alpha=0/beta=3.25", and_(equal(field_ref("alpha"), literal(0)),
-                                         equal(field_ref("beta"), literal(3.25f))));
-  AssertParse("/beta=3.25/alpha=0", and_(equal(field_ref("beta"), literal(3.25f)),
-                                         equal(field_ref("alpha"), literal(0))));
-  AssertParse("/alpha=0", equal(field_ref("alpha"), literal(0)));
-  AssertParse("/alpha=xyz/beta=3.25", and_(is_null(field_ref("alpha")),
-                                           equal(field_ref("beta"), literal(3.25f))));
-  AssertParse("/beta=3.25", equal(field_ref("beta"), literal(3.25f)));
+  AssertParse("/alpha=0/beta=3.25/", and_(equal(field_ref("alpha"), literal(0)),
+                                          equal(field_ref("beta"), literal(3.25f))));
+  AssertParse("/beta=3.25/alpha=0/", and_(equal(field_ref("beta"), literal(3.25f)),
+                                          equal(field_ref("alpha"), literal(0))));
+  AssertParse("/alpha=0/", equal(field_ref("alpha"), literal(0)));
+  AssertParse("/alpha=xyz/beta=3.25/", and_(is_null(field_ref("alpha")),
+                                            equal(field_ref("beta"), literal(3.25f))));
+  AssertParse("/beta=3.25/", equal(field_ref("beta"), literal(3.25f)));
   AssertParse("", literal(true));
 
-  AssertParse("/alpha=0/beta=3.25/ignored=2341",
+  AssertParse("/alpha=0/beta=3.25/ignored=2341/",
               and_(equal(field_ref("alpha"), literal(0)),
                    equal(field_ref("beta"), literal(3.25f))));
 
-  AssertParse("/alpha=0/beta=3.25/ignored=2341",
+  AssertParse("/alpha=0/beta=3.25/ignored=2341/",
               and_(equal(field_ref("alpha"), literal(0)),
                    equal(field_ref("beta"), literal(3.25f))));
 
-  AssertParse("/ignored=2341", literal(true));
+  AssertParse("/ignored=2341/", literal(true));
 
-  AssertParseError("/alpha=0.0/beta=3.25");  // conversion of "0.0" to int32 fails
+  AssertParseError("/alpha=0.0/beta=3.25/");  // conversion of "0.0" to int32 fails
 }
 
 TEST_F(TestPartitioning, HivePartitioningFormat) {
@@ -560,11 +560,11 @@ TEST_F(TestPartitioning, HiveDictionaryHasUniqueValues) {
     auto dictionary_scalar =
         std::make_shared<DictionaryScalar>(index_and_dictionary, alpha->type());
 
-    auto path = "/alpha=" + expected_dictionary->GetString(i);
+    auto path = "/alpha=" + expected_dictionary->GetString(i) + "/";
     AssertParse(path, equal(field_ref("alpha"), literal(dictionary_scalar)));
   }
 
-  AssertParseError("/alpha=yosemite");  // not in inspected dictionary
+  AssertParseError("/alpha=yosemite/");  // not in inspected dictionary
 }
 
 TEST_F(TestPartitioning, ExistingSchemaDirectory) {
@@ -665,7 +665,7 @@ TEST_F(TestPartitioning, UrlEncodedDirectory) {
   auto date = std::make_shared<TimestampScalar>(1620086400, ts);
   auto time = std::make_shared<TimestampScalar>(1620113220, ts);
   partitioning_ = std::make_shared<DirectoryPartitioning>(options.schema, ArrayVector());
-  AssertParse("/2021-05-04 00%3A00%3A00/2021-05-04 07%3A27%3A00/%24",
+  AssertParse("/2021-05-04 00%3A00%3A00/2021-05-04 07%3A27%3A00/%24/",
               and_({equal(field_ref("date"), literal(date)),
                     equal(field_ref("time"), literal(time)),
                     equal(field_ref("str"), literal("$"))}));
@@ -685,7 +685,7 @@ TEST_F(TestPartitioning, UrlEncodedDirectory) {
                 options.schema->fields());
   partitioning_ = std::make_shared<DirectoryPartitioning>(
       options.schema, ArrayVector(), options.AsPartitioningOptions());
-  AssertParse("/2021-05-04 00%3A00%3A00/2021-05-04 07%3A27%3A00/%24",
+  AssertParse("/2021-05-04 00%3A00%3A00/2021-05-04 07%3A27%3A00/%24/",
               and_({equal(field_ref("date"), literal("2021-05-04 00%3A00%3A00")),
                     equal(field_ref("time"), literal("2021-05-04 07%3A27%3A00")),
                     equal(field_ref("str"), literal("%24"))}));
@@ -708,15 +708,16 @@ TEST_F(TestPartitioning, UrlEncodedHive) {
   auto time = std::make_shared<TimestampScalar>(1620113220, ts);
   partitioning_ = std::make_shared<HivePartitioning>(options.schema, ArrayVector(),
                                                      options.AsHivePartitioningOptions());
-  AssertParse("/date=2021-05-04 00:00:00/time=2021-05-04 07:27:00/str=$",
+  AssertParse("/date=2021-05-04 00:00:00/time=2021-05-04 07:27:00/str=$/",
               and_({equal(field_ref("date"), literal(date)),
                     equal(field_ref("time"), literal(time)), is_null(field_ref("str"))}));
-  AssertParse("/date=2021-05-04 00:00:00/time=2021-05-04 07:27:00/str=%E3%81%8F%E3%81%BE",
-              and_({equal(field_ref("date"), literal(date)),
-                    equal(field_ref("time"), literal(time)),
-                    equal(field_ref("str"), literal("\xE3\x81\x8F\xE3\x81\xBE"))}));
+  AssertParse(
+      "/date=2021-05-04 00:00:00/time=2021-05-04 07:27:00/str=%E3%81%8F%E3%81%BE/",
+      and_({equal(field_ref("date"), literal(date)),
+            equal(field_ref("time"), literal(time)),
+            equal(field_ref("str"), literal("\xE3\x81\x8F\xE3\x81\xBE"))}));
   // URL-encoded null fallback value
-  AssertParse("/date=2021-05-04 00%3A00%3A00/time=2021-05-04 07%3A27%3A00/str=%24",
+  AssertParse("/date=2021-05-04 00%3A00%3A00/time=2021-05-04 07%3A27%3A00/str=%24/",
               and_({equal(field_ref("date"), literal(date)),
                     equal(field_ref("time"), literal(time)), is_null(field_ref("str"))}));
 
@@ -737,7 +738,7 @@ TEST_F(TestPartitioning, UrlEncodedHive) {
       options.schema->fields());
   partitioning_ = std::make_shared<HivePartitioning>(options.schema, ArrayVector(),
                                                      options.AsHivePartitioningOptions());
-  AssertParse("/date=2021-05-04 00%3A00%3A00/time=2021-05-04 07%3A27%3A00/str=%24",
+  AssertParse("/date=2021-05-04 00%3A00%3A00/time=2021-05-04 07%3A27%3A00/str=%24/",
               and_({equal(field_ref("date"), literal("2021-05-04 00%3A00%3A00")),
                     equal(field_ref("time"), literal("2021-05-04 07%3A27%3A00")),
                     equal(field_ref("str"), literal("%24"))}));
@@ -771,19 +772,19 @@ TEST_F(TestPartitioning, UrlEncodedHiveWithKeyEncoded) {
                                                      options.AsHivePartitioningOptions());
   AssertParse(
       "/test%27%3B%20date=2021-05-04 00:00:00/test%27%3B%20time=2021-05-04 "
-      "07:27:00/str=$",
+      "07:27:00/str=$/",
       and_({equal(field_ref("test'; date"), literal(date)),
             equal(field_ref("test'; time"), literal(time)), is_null(field_ref("str"))}));
   AssertParse(
       "/test%27%3B%20date=2021-05-04 00:00:00/test%27%3B%20time=2021-05-04 "
-      "07:27:00/str=%E3%81%8F%E3%81%BE",
+      "07:27:00/str=%E3%81%8F%E3%81%BE/",
       and_({equal(field_ref("test'; date"), literal(date)),
             equal(field_ref("test'; time"), literal(time)),
             equal(field_ref("str"), literal("\xE3\x81\x8F\xE3\x81\xBE"))}));
   // URL-encoded null fallback value
   AssertParse(
       "/test%27%3B%20date=2021-05-04 00%3A00%3A00/test%27%3B%20time=2021-05-04 "
-      "07%3A27%3A00/str=%24",
+      "07%3A27%3A00/str=%24/",
       and_({equal(field_ref("test'; date"), literal(date)),
             equal(field_ref("test'; time"), literal(time)), is_null(field_ref("str"))}));
 
@@ -793,7 +794,7 @@ TEST_F(TestPartitioning, UrlEncodedHiveWithKeyEncoded) {
       factory_->Inspect({"/%AF=2021-05-04/time=2021-05-04 07%3A27%3A00/str=%24"}));
   EXPECT_RAISES_WITH_MESSAGE_THAT(
       Invalid, ::testing::HasSubstr("was not valid UTF-8"),
-      partitioning_->Parse("/%AF=2021-05-04/%BF=2021-05-04 07%3A27%3A00/str=%24"));
+      partitioning_->Parse("/%AF=2021-05-04/%BF=2021-05-04 07%3A27%3A00/str=%24/"));
 }
 
 TEST_F(TestPartitioning, EtlThenHive) {
@@ -818,17 +819,17 @@ TEST_F(TestPartitioning, EtlThenHive) {
         auto etl_segments_end = segments.begin() + etl_fields.size();
         auto etl_path =
             fs::internal::JoinAbstractPath(segments.begin(), etl_segments_end);
-        ARROW_ASSIGN_OR_RAISE(auto etl_expr, etl_part.Parse(etl_path));
+        ARROW_ASSIGN_OR_RAISE(auto etl_expr, etl_part.Parse(etl_path + "/"));
 
         auto alphabeta_segments_end = etl_segments_end + alphabeta_fields.size();
         auto alphabeta_path =
             fs::internal::JoinAbstractPath(etl_segments_end, alphabeta_segments_end);
-        ARROW_ASSIGN_OR_RAISE(auto alphabeta_expr, alphabeta_part.Parse(alphabeta_path));
-
+        ARROW_ASSIGN_OR_RAISE(auto alphabeta_expr,
+                              alphabeta_part.Parse(alphabeta_path + "/"));
         return and_(etl_expr, alphabeta_expr);
       });
 
-  AssertParse("/1999/12/31/00/alpha=0/beta=3.25",
+  AssertParse("/1999/12/31/00/alpha=0/beta=3.25/",
               and_({equal(field_ref("year"), literal<int16_t>(1999)),
                     equal(field_ref("month"), literal<int8_t>(12)),
                     equal(field_ref("day"), literal<int8_t>(31)),
@@ -836,7 +837,7 @@ TEST_F(TestPartitioning, EtlThenHive) {
                     and_(equal(field_ref("alpha"), literal<int32_t>(0)),
                          equal(field_ref("beta"), literal<float>(3.25f)))}));
 
-  AssertParseError("/20X6/03/21/05/alpha=0/beta=3.25");
+  AssertParseError("/20X6/03/21/05/alpha=0/beta=3.25/");
 }
 
 TEST_F(TestPartitioning, Set) {

From 718c924f1d6965e89806c9369b3f41ffff15c24a Mon Sep 17 00:00:00 2001
From: Sanjiban Sengupta <sanjiban.sg@gmail.com>
Date: Thu, 26 May 2022 13:36:35 +0530
Subject: [PATCH 24/24] fix: processing segments in ParseKeys method of
 FunctionPartitioning

---
 cpp/src/arrow/dataset/partition.cc | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/cpp/src/arrow/dataset/partition.cc b/cpp/src/arrow/dataset/partition.cc
index d37dfd97d39..6b4d601db01 100644
--- a/cpp/src/arrow/dataset/partition.cc
+++ b/cpp/src/arrow/dataset/partition.cc
@@ -391,8 +391,9 @@ FilenamePartitioning::FilenamePartitioning(std::shared_ptr<Schema> schema,
 
 Result<std::vector<KeyValuePartitioning::Key>> FilenamePartitioning::ParseKeys(
     const std::string& path) const {
-  std::vector<std::string> segments =
-      fs::internal::SplitAbstractPath(StripNonPrefix(path), kFilenamePartitionSep);
+  std::vector<std::string> segments = fs::internal::SplitAbstractPath(
+      StripNonPrefix(fs::internal::GetAbstractPathParent(path).second),
+      kFilenamePartitionSep);
   return ParsePartitionSegments(segments);
 }