From 2c3cfb0b3e4932abda358d15ebc2cca13649dd06 Mon Sep 17 00:00:00 2001 From: Sanjiban Sengupta Date: Mon, 25 Apr 2022 04:12:46 +0530 Subject: [PATCH 01/24] fix: use StripPrefix() for FilenamePartitioning --- cpp/src/arrow/dataset/discovery.cc | 7 ++++++- cpp/src/arrow/dataset/partition.cc | 7 ++++++- cpp/src/arrow/dataset/partition.h | 5 +++++ python/pyarrow/dataset.py | 1 - 4 files changed, 17 insertions(+), 3 deletions(-) diff --git a/cpp/src/arrow/dataset/discovery.cc b/cpp/src/arrow/dataset/discovery.cc index f8382aa4052..2632b05ffa0 100644 --- a/cpp/src/arrow/dataset/discovery.cc +++ b/cpp/src/arrow/dataset/discovery.cc @@ -278,8 +278,13 @@ Result> FileSystemDatasetFactory::Finish(FinishOptions } std::vector> fragments; + std::string fixed_path; for (const auto& info : files_) { - auto fixed_path = StripPrefixAndFilename(info.path(), options_.partition_base_dir); + if (partitioning->type_name() == "filename") { + fixed_path = StripPrefix(info.path(), options_.partition_base_dir); + } else { + fixed_path = StripPrefixAndFilename(info.path(), options_.partition_base_dir); + } ARROW_ASSIGN_OR_RAISE(auto partition, partitioning->Parse(fixed_path)); ARROW_ASSIGN_OR_RAISE(auto fragment, format_->MakeFragment({info, fs_}, partition)); fragments.push_back(fragment); diff --git a/cpp/src/arrow/dataset/partition.cc b/cpp/src/arrow/dataset/partition.cc index 763ea0bc6ce..f175b047fc4 100644 --- a/cpp/src/arrow/dataset/partition.cc +++ b/cpp/src/arrow/dataset/partition.cc @@ -805,9 +805,14 @@ std::shared_ptr HivePartitioning::MakeFactory( return std::shared_ptr(new HivePartitioningFactory(options)); } -std::string StripPrefixAndFilename(const std::string& path, const std::string& prefix) { +std::string StripPrefix(const std::string& path, const std::string& prefix) { auto maybe_base_less = fs::internal::RemoveAncestor(prefix, path); auto base_less = maybe_base_less ? std::string(*maybe_base_less) : path; + return base_less; +} + +std::string StripPrefixAndFilename(const std::string& path, const std::string& prefix) { + auto base_less = StripPrefix(path, prefix); auto basename_filename = fs::internal::GetAbstractPathParent(base_less); return basename_filename.first; } diff --git a/cpp/src/arrow/dataset/partition.h b/cpp/src/arrow/dataset/partition.h index 6a679f34822..d2ea5db061a 100644 --- a/cpp/src/arrow/dataset/partition.h +++ b/cpp/src/arrow/dataset/partition.h @@ -358,6 +358,11 @@ class ARROW_DS_EXPORT FilenamePartitioning : public KeyValuePartitioning { Result FormatValues(const ScalarVector& values) const override; }; +/// \brief Remove only the prefix of a path. +/// +/// e.g., `StripPrefix("/data/2019_c.txt", "/data") -> "2019_c.txt"` +ARROW_DS_EXPORT std::string StripPrefix(const std::string& path, + const std::string& prefix); /// \brief Remove a prefix and the filename of a path. /// /// e.g., `StripPrefixAndFilename("/data/year=2019/c.txt", "/data") -> "year=2019"` diff --git a/python/pyarrow/dataset.py b/python/pyarrow/dataset.py index b22a3d032f0..0498dc7ab6f 100644 --- a/python/pyarrow/dataset.py +++ b/python/pyarrow/dataset.py @@ -218,7 +218,6 @@ def partitioning(schema=None, field_names=None, flavor=None, "For the default directory flavor, need to specify " "a Schema or a list of field names") if flavor == "filename": - # default flavor if schema is not None: if field_names is not None: raise ValueError( From ba752aae67b074a2ee10a7e9fb7296606de4ef8a Mon Sep 17 00:00:00 2001 From: Sanjiban Sengupta Date: Mon, 25 Apr 2022 05:20:42 +0530 Subject: [PATCH 02/24] test: roundtrip test for partitioning --- python/pyarrow/tests/test_dataset.py | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/python/pyarrow/tests/test_dataset.py b/python/pyarrow/tests/test_dataset.py index 3bc905f2f97..56bc19f4d48 100644 --- a/python/pyarrow/tests/test_dataset.py +++ b/python/pyarrow/tests/test_dataset.py @@ -662,6 +662,24 @@ def test_partitioning(): assert partitioning.dictionaries[1].to_pylist() == [ "first", "second", "third"] + # test partitioning roundtrip + table = pa.table([ + pa.array(range(20)), pa.array(np.random.randn(20)), + pa.array(np.repeat(['a', 'b'], 10))], + names=["f1", "f2", "part"] + ) + partitioning_schema = pa.schema([("part", pa.string())]) + for klass in [ds.DirectoryPartitioning, ds.HivePartitioning, + ds.FilenamePartitioning]: + with tempfile.TemporaryDirectory() as tempdir: + partitioning = klass(partitioning_schema) + ds.write_dataset(table, tempdir, + format='ipc', partitioning=partitioning) + load_back = ds.dataset(tempdir, format='ipc', + partitioning=partitioning) + load_back_table = load_back.to_table() + assert load_back_table.equals(table) + def test_expression_arithmetic_operators(): dataset = ds.dataset(pa.table({'a': [1, 2, 3], 'b': [2, 2, 2]})) From d666ce97ea0ac5a870b8a30c7374aeeb91204113 Mon Sep 17 00:00:00 2001 From: Sanjiban Sengupta Date: Wed, 27 Apr 2022 01:31:34 +0530 Subject: [PATCH 03/24] feat: modify Parse() to accept both directory and filename prefix --- cpp/src/arrow/dataset/discovery.cc | 9 ++-- cpp/src/arrow/dataset/file_parquet.cc | 6 ++- cpp/src/arrow/dataset/partition.cc | 47 +++++++++----------- cpp/src/arrow/dataset/partition.h | 40 +++++++++-------- cpp/src/arrow/dataset/partition_test.cc | 22 ++++----- python/pyarrow/_dataset.pyx | 4 +- python/pyarrow/includes/libarrow_dataset.pxd | 2 +- python/pyarrow/tests/test_dataset.py | 4 +- 8 files changed, 66 insertions(+), 68 deletions(-) diff --git a/cpp/src/arrow/dataset/discovery.cc b/cpp/src/arrow/dataset/discovery.cc index 2632b05ffa0..a46ea926535 100644 --- a/cpp/src/arrow/dataset/discovery.cc +++ b/cpp/src/arrow/dataset/discovery.cc @@ -280,12 +280,9 @@ Result> FileSystemDatasetFactory::Finish(FinishOptions std::vector> fragments; std::string fixed_path; for (const auto& info : files_) { - if (partitioning->type_name() == "filename") { - fixed_path = StripPrefix(info.path(), options_.partition_base_dir); - } else { - fixed_path = StripPrefixAndFilename(info.path(), options_.partition_base_dir); - } - ARROW_ASSIGN_OR_RAISE(auto partition, partitioning->Parse(fixed_path)); + auto fixed_path = StripPrefixAndFilename(info.path(), options_.partition_base_dir); + ARROW_ASSIGN_OR_RAISE(auto partition, + partitioning->Parse(fixed_path.directory, fixed_path.prefix)); ARROW_ASSIGN_OR_RAISE(auto fragment, format_->MakeFragment({info, fs_}, partition)); fragments.push_back(fragment); } diff --git a/cpp/src/arrow/dataset/file_parquet.cc b/cpp/src/arrow/dataset/file_parquet.cc index 9f914b57f01..0340b3e08dc 100644 --- a/cpp/src/arrow/dataset/file_parquet.cc +++ b/cpp/src/arrow/dataset/file_parquet.cc @@ -894,7 +894,8 @@ ParquetDatasetFactory::CollectParquetFragments(const Partitioning& partitioning) auto row_groups = Iota(metadata_subset->num_row_groups()); auto partition_expression = - partitioning.Parse(StripPrefixAndFilename(path, options_.partition_base_dir)) + partitioning + .Parse(StripPrefixAndFilename(path, options_.partition_base_dir).directory) .ValueOr(compute::literal(true)); ARROW_ASSIGN_OR_RAISE( @@ -920,7 +921,8 @@ Result>> ParquetDatasetFactory::InspectSchem size_t i = 0; for (const auto& e : paths_with_row_group_ids_) { - stripped[i++] = StripPrefixAndFilename(e.first, options_.partition_base_dir); + stripped[i++] = + StripPrefixAndFilename(e.first, options_.partition_base_dir).directory; } ARROW_ASSIGN_OR_RAISE(auto partition_schema, factory->Inspect(stripped)); diff --git a/cpp/src/arrow/dataset/partition.cc b/cpp/src/arrow/dataset/partition.cc index f175b047fc4..67797a99daa 100644 --- a/cpp/src/arrow/dataset/partition.cc +++ b/cpp/src/arrow/dataset/partition.cc @@ -82,7 +82,8 @@ std::shared_ptr Partitioning::Default() { std::string type_name() const override { return "default"; } - Result Parse(const std::string& path) const override { + Result Parse(const std::string& directory, + const std::string& prefix) const override { return compute::literal(true); } @@ -250,10 +251,11 @@ Result KeyValuePartitioning::ConvertKey(const Key& key) con compute::literal(std::move(converted))); } -Result KeyValuePartitioning::Parse(const std::string& path) const { +Result KeyValuePartitioning::Parse(const std::string& directory, + const std::string& prefix) const { std::vector expressions; - ARROW_ASSIGN_OR_RAISE(auto parsed, ParseKeys(path)); + ARROW_ASSIGN_OR_RAISE(auto parsed, ParseKeys(directory, prefix)); for (const Key& key : parsed) { ARROW_ASSIGN_OR_RAISE(auto expr, ConvertKey(key)); if (expr == compute::literal(true)) continue; @@ -263,7 +265,7 @@ Result KeyValuePartitioning::Parse(const std::string& path) return and_(std::move(expressions)); } -Result KeyValuePartitioning::Format( +Result KeyValuePartitioning::Format( const compute::Expression& expr) const { ScalarVector values{static_cast(schema_->num_fields()), nullptr}; @@ -376,8 +378,8 @@ DirectoryPartitioning::DirectoryPartitioning(std::shared_ptr schema, } Result> DirectoryPartitioning::ParseKeys( - const std::string& path) const { - std::vector segments = fs::internal::SplitAbstractPath(path); + const std::string& directory, const std::string& prefix) const { + std::vector segments = fs::internal::SplitAbstractPath(directory); return ParsePartitionSegments(segments); } @@ -389,24 +391,24 @@ FilenamePartitioning::FilenamePartitioning(std::shared_ptr schema, } Result> FilenamePartitioning::ParseKeys( - const std::string& path) const { + const std::string& directory, const std::string& prefix) const { std::vector segments = - fs::internal::SplitAbstractPath(StripNonPrefix(path), kFilenamePartitionSep); + fs::internal::SplitAbstractPath(StripNonPrefix(prefix), kFilenamePartitionSep); return ParsePartitionSegments(segments); } -Result DirectoryPartitioning::FormatValues( +Result DirectoryPartitioning::FormatValues( const ScalarVector& values) const { std::vector segments; ARROW_ASSIGN_OR_RAISE(segments, FormatPartitionSegments(values)); return PartitionPathFormat{fs::internal::JoinAbstractPath(std::move(segments)), ""}; } -Result FilenamePartitioning::FormatValues( +Result FilenamePartitioning::FormatValues( const ScalarVector& values) const { std::vector segments; ARROW_ASSIGN_OR_RAISE(segments, FormatPartitionSegments(values)); - return Partitioning::PartitionPathFormat{ + return PartitionPathFormat{ "", fs::internal::JoinAbstractPath(std::move(segments), kFilenamePartitionSep) + kFilenamePartitionSep}; } @@ -718,10 +720,10 @@ Result> HivePartitioning::ParseKey( } Result> HivePartitioning::ParseKeys( - const std::string& path) const { + const std::string& directory, const std::string& prefix) const { std::vector keys; - for (const auto& segment : fs::internal::SplitAbstractPath(path)) { + for (const auto& segment : fs::internal::SplitAbstractPath(directory)) { ARROW_ASSIGN_OR_RAISE(auto maybe_key, ParseKey(segment, hive_options_)); if (auto key = maybe_key) { keys.push_back(std::move(*key)); @@ -731,7 +733,7 @@ Result> HivePartitioning::ParseKeys( return keys; } -Result HivePartitioning::FormatValues( +Result HivePartitioning::FormatValues( const ScalarVector& values) const { std::vector segments(static_cast(schema_->num_fields())); @@ -749,8 +751,7 @@ Result HivePartitioning::FormatValues( } } - return Partitioning::PartitionPathFormat{ - fs::internal::JoinAbstractPath(std::move(segments)), ""}; + return PartitionPathFormat{fs::internal::JoinAbstractPath(std::move(segments)), ""}; } class HivePartitioningFactory : public KeyValuePartitioningFactory { @@ -805,16 +806,12 @@ std::shared_ptr HivePartitioning::MakeFactory( return std::shared_ptr(new HivePartitioningFactory(options)); } -std::string StripPrefix(const std::string& path, const std::string& prefix) { +PartitionPathFormat StripPrefixAndFilename(const std::string& path, + const std::string& prefix) { auto maybe_base_less = fs::internal::RemoveAncestor(prefix, path); auto base_less = maybe_base_less ? std::string(*maybe_base_less) : path; - return base_less; -} - -std::string StripPrefixAndFilename(const std::string& path, const std::string& prefix) { - auto base_less = StripPrefix(path, prefix); auto basename_filename = fs::internal::GetAbstractPathParent(base_less); - return basename_filename.first; + return PartitionPathFormat{basename_filename.first, basename_filename.second}; } std::vector StripPrefixAndFilename(const std::vector& paths, @@ -822,7 +819,7 @@ std::vector StripPrefixAndFilename(const std::vector& std::vector result; result.reserve(paths.size()); for (const auto& path : paths) { - result.emplace_back(StripPrefixAndFilename(path, prefix)); + result.emplace_back(StripPrefixAndFilename(path, prefix).directory); } return result; } @@ -832,7 +829,7 @@ std::vector StripPrefixAndFilename(const std::vector& std::vector result; result.reserve(files.size()); for (const auto& info : files) { - result.emplace_back(StripPrefixAndFilename(info.path(), prefix)); + result.emplace_back(StripPrefixAndFilename(info.path(), prefix).directory); } return result; } diff --git a/cpp/src/arrow/dataset/partition.h b/cpp/src/arrow/dataset/partition.h index d2ea5db061a..b76bc7f9ab1 100644 --- a/cpp/src/arrow/dataset/partition.h +++ b/cpp/src/arrow/dataset/partition.h @@ -38,6 +38,10 @@ namespace dataset { constexpr char kFilenamePartitionSep = '_'; +struct PartitionPathFormat { + std::string directory, prefix; +}; + // ---------------------------------------------------------------------- // Partitioning @@ -76,11 +80,8 @@ class ARROW_DS_EXPORT Partitioning { const std::shared_ptr& batch) const = 0; /// \brief Parse a path into a partition expression - virtual Result Parse(const std::string& path) const = 0; - - struct PartitionPathFormat { - std::string directory, prefix; - }; + virtual Result Parse(const std::string& directory = "", + const std::string& prefix = "") const = 0; virtual Result Format(const compute::Expression& expr) const = 0; @@ -174,7 +175,8 @@ class ARROW_DS_EXPORT KeyValuePartitioning : public Partitioning { Result Partition( const std::shared_ptr& batch) const override; - Result Parse(const std::string& path) const override; + Result Parse(const std::string& directory = "", + const std::string& prefix = "") const override; Result Format(const compute::Expression& expr) const override; @@ -191,7 +193,8 @@ class ARROW_DS_EXPORT KeyValuePartitioning : public Partitioning { } } - virtual Result> ParseKeys(const std::string& path) const = 0; + virtual Result> ParseKeys(const std::string& directory, + const std::string& prefix) const = 0; virtual Result FormatValues(const ScalarVector& values) const = 0; @@ -231,7 +234,8 @@ class ARROW_DS_EXPORT DirectoryPartitioning : public KeyValuePartitioning { std::vector field_names, PartitioningFactoryOptions = {}); private: - Result> ParseKeys(const std::string& path) const override; + Result> ParseKeys(const std::string& directory, + const std::string& prefix) const override; Result FormatValues(const ScalarVector& values) const override; }; @@ -288,7 +292,8 @@ class ARROW_DS_EXPORT HivePartitioning : public KeyValuePartitioning { private: const HivePartitioningOptions hive_options_; - Result> ParseKeys(const std::string& path) const override; + Result> ParseKeys(const std::string& directory, + const std::string& prefix) const override; Result FormatValues(const ScalarVector& values) const override; }; @@ -310,8 +315,9 @@ class ARROW_DS_EXPORT FunctionPartitioning : public Partitioning { std::string type_name() const override { return name_; } - Result Parse(const std::string& path) const override { - return parse_impl_(path); + Result Parse(const std::string& directory = "", + const std::string& prefix = "") const override { + return parse_impl_(directory); } Result Format(const compute::Expression& expr) const override { @@ -353,21 +359,17 @@ class ARROW_DS_EXPORT FilenamePartitioning : public KeyValuePartitioning { std::vector field_names, PartitioningFactoryOptions = {}); private: - Result> ParseKeys(const std::string& path) const override; + Result> ParseKeys(const std::string& directory, + const std::string& prefix) const override; Result FormatValues(const ScalarVector& values) const override; }; -/// \brief Remove only the prefix of a path. -/// -/// e.g., `StripPrefix("/data/2019_c.txt", "/data") -> "2019_c.txt"` -ARROW_DS_EXPORT std::string StripPrefix(const std::string& path, - const std::string& prefix); /// \brief Remove a prefix and the filename of a path. /// /// e.g., `StripPrefixAndFilename("/data/year=2019/c.txt", "/data") -> "year=2019"` -ARROW_DS_EXPORT std::string StripPrefixAndFilename(const std::string& path, - const std::string& prefix); +ARROW_DS_EXPORT PartitionPathFormat StripPrefixAndFilename(const std::string& path, + const std::string& prefix); /// \brief Vector version of StripPrefixAndFilename. ARROW_DS_EXPORT std::vector StripPrefixAndFilename( diff --git a/cpp/src/arrow/dataset/partition_test.cc b/cpp/src/arrow/dataset/partition_test.cc index 87f6f89ccd9..51939ac7092 100644 --- a/cpp/src/arrow/dataset/partition_test.cc +++ b/cpp/src/arrow/dataset/partition_test.cc @@ -873,11 +873,12 @@ class RangePartitioning : public Partitioning { std::string type_name() const override { return "range"; } - Result Parse(const std::string& path) const override { + Result Parse(const std::string& directory, + const std::string& prefix) const override { std::vector ranges; HivePartitioningOptions options; - for (auto segment : fs::internal::SplitAbstractPath(path)) { + for (auto segment : fs::internal::SplitAbstractPath(directory)) { ARROW_ASSIGN_OR_RAISE(auto key, HivePartitioning::ParseKey(segment, options)); if (!key) { return Status::Invalid("can't parse '", segment, "' as a range"); @@ -919,9 +920,8 @@ class RangePartitioning : public Partitioning { return Status::OK(); } - Result Format( - const compute::Expression&) const override { - return Partitioning::PartitionPathFormat{"", ""}; + Result Format(const compute::Expression&) const override { + return PartitionPathFormat{"", ""}; } Result Partition( const std::shared_ptr&) const override { @@ -943,12 +943,12 @@ TEST_F(TestPartitioning, Range) { } TEST(TestStripPrefixAndFilename, Basic) { - ASSERT_EQ(StripPrefixAndFilename("", ""), ""); - ASSERT_EQ(StripPrefixAndFilename("a.csv", ""), ""); - ASSERT_EQ(StripPrefixAndFilename("a/b.csv", ""), "a"); - ASSERT_EQ(StripPrefixAndFilename("/a/b/c.csv", "/a"), "b"); - ASSERT_EQ(StripPrefixAndFilename("/a/b/c/d.csv", "/a"), "b/c"); - ASSERT_EQ(StripPrefixAndFilename("/a/b/c.csv", "/a/b"), ""); + ASSERT_EQ(StripPrefixAndFilename("", "").directory, ""); + ASSERT_EQ(StripPrefixAndFilename("a.csv", "").directory, ""); + ASSERT_EQ(StripPrefixAndFilename("a/b.csv", "").directory, "a"); + ASSERT_EQ(StripPrefixAndFilename("/a/b/c.csv", "/a").directory, "b"); + ASSERT_EQ(StripPrefixAndFilename("/a/b/c/d.csv", "/a").directory, "b/c"); + ASSERT_EQ(StripPrefixAndFilename("/a/b/c.csv", "/a/b").directory, ""); std::vector input{"/data/year=2019/file.parquet", "/data/year=2019/month=12/file.parquet", diff --git a/python/pyarrow/_dataset.pyx b/python/pyarrow/_dataset.pyx index 10aeafbdbaa..6b1b8a4ced3 100644 --- a/python/pyarrow/_dataset.pyx +++ b/python/pyarrow/_dataset.pyx @@ -1319,9 +1319,9 @@ cdef class Partitioning(_Weakrefable): cdef inline shared_ptr[CPartitioning] unwrap(self): return self.wrapped - def parse(self, path): + def parse(self, directory="", prefix=""): cdef CResult[CExpression] result - result = self.partitioning.Parse(tobytes(path)) + result = self.partitioning.Parse(tobytes(directory), tobytes(prefix)) return Expression.wrap(GetResultValue(result)) @property diff --git a/python/pyarrow/includes/libarrow_dataset.pxd b/python/pyarrow/includes/libarrow_dataset.pxd index 64e59e05613..bd7375aad81 100644 --- a/python/pyarrow/includes/libarrow_dataset.pxd +++ b/python/pyarrow/includes/libarrow_dataset.pxd @@ -279,7 +279,7 @@ cdef extern from "arrow/dataset/api.h" namespace "arrow::dataset" nogil: cdef cppclass CPartitioning "arrow::dataset::Partitioning": c_string type_name() const - CResult[CExpression] Parse(const c_string & path) const + CResult[CExpression] Parse(const c_string & directory, const c_string & prefix) const const shared_ptr[CSchema] & schema() cdef cppclass CSegmentEncoding" arrow::dataset::SegmentEncoding": diff --git a/python/pyarrow/tests/test_dataset.py b/python/pyarrow/tests/test_dataset.py index 56bc19f4d48..ca9ed3f2a1d 100644 --- a/python/pyarrow/tests/test_dataset.py +++ b/python/pyarrow/tests/test_dataset.py @@ -629,14 +629,14 @@ def test_partitioning(): ) assert len(partitioning.dictionaries) == 2 assert all(x is None for x in partitioning.dictionaries) - expr = partitioning.parse('3_3.14_') + expr = partitioning.parse('', '3_3.14_') assert isinstance(expr, ds.Expression) expected = (ds.field('group') == 3) & (ds.field('key') == 3.14) assert expr.equals(expected) with pytest.raises(pa.ArrowInvalid): - partitioning.parse('prefix_3_aaa_') + partitioning.parse('', 'prefix_3_aaa_') partitioning = ds.DirectoryPartitioning( pa.schema([ From 8da2d5ef434c009256fadf1ac24ea8cf2ede3b5f Mon Sep 17 00:00:00 2001 From: Sanjiban Sengupta Date: Wed, 27 Apr 2022 18:52:45 +0530 Subject: [PATCH 04/24] feat: ARROW_EXPORT PartitionPathFormat --- cpp/src/arrow/dataset/partition.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/arrow/dataset/partition.h b/cpp/src/arrow/dataset/partition.h index b76bc7f9ab1..a8af1291ddc 100644 --- a/cpp/src/arrow/dataset/partition.h +++ b/cpp/src/arrow/dataset/partition.h @@ -38,7 +38,7 @@ namespace dataset { constexpr char kFilenamePartitionSep = '_'; -struct PartitionPathFormat { +struct ARROW_DS_EXPORT PartitionPathFormat { std::string directory, prefix; }; From ab1e3873c1a656e5a45b20ecee5c2d8f9ce5cd8e Mon Sep 17 00:00:00 2001 From: Sanjiban Sengupta Date: Wed, 27 Apr 2022 18:53:35 +0530 Subject: [PATCH 05/24] fix: remove shadowed variable fixed_path --- cpp/src/arrow/dataset/discovery.cc | 1 - 1 file changed, 1 deletion(-) diff --git a/cpp/src/arrow/dataset/discovery.cc b/cpp/src/arrow/dataset/discovery.cc index a46ea926535..25e1f72482a 100644 --- a/cpp/src/arrow/dataset/discovery.cc +++ b/cpp/src/arrow/dataset/discovery.cc @@ -278,7 +278,6 @@ Result> FileSystemDatasetFactory::Finish(FinishOptions } std::vector> fragments; - std::string fixed_path; for (const auto& info : files_) { auto fixed_path = StripPrefixAndFilename(info.path(), options_.partition_base_dir); ARROW_ASSIGN_OR_RAISE(auto partition, From 52ec5eec611a0d606ae76bc14a667d4d334e09a1 Mon Sep 17 00:00:00 2001 From: Sanjiban Sengupta Date: Thu, 28 Apr 2022 13:45:45 +0530 Subject: [PATCH 06/24] feat: use PartitionPathFormat as argument to Parse methods --- cpp/src/arrow/dataset/discovery.cc | 2 +- cpp/src/arrow/dataset/file_parquet.cc | 2 +- cpp/src/arrow/dataset/partition.cc | 20 +++++++++----------- cpp/src/arrow/dataset/partition.h | 23 ++++++++--------------- cpp/src/arrow/dataset/partition_test.cc | 23 +++++++++++------------ 5 files changed, 30 insertions(+), 40 deletions(-) diff --git a/cpp/src/arrow/dataset/discovery.cc b/cpp/src/arrow/dataset/discovery.cc index 25e1f72482a..668624ac09a 100644 --- a/cpp/src/arrow/dataset/discovery.cc +++ b/cpp/src/arrow/dataset/discovery.cc @@ -281,7 +281,7 @@ Result> FileSystemDatasetFactory::Finish(FinishOptions for (const auto& info : files_) { auto fixed_path = StripPrefixAndFilename(info.path(), options_.partition_base_dir); ARROW_ASSIGN_OR_RAISE(auto partition, - partitioning->Parse(fixed_path.directory, fixed_path.prefix)); + partitioning->Parse(fixed_path)); ARROW_ASSIGN_OR_RAISE(auto fragment, format_->MakeFragment({info, fs_}, partition)); fragments.push_back(fragment); } diff --git a/cpp/src/arrow/dataset/file_parquet.cc b/cpp/src/arrow/dataset/file_parquet.cc index 0340b3e08dc..8f9590a6e17 100644 --- a/cpp/src/arrow/dataset/file_parquet.cc +++ b/cpp/src/arrow/dataset/file_parquet.cc @@ -895,7 +895,7 @@ ParquetDatasetFactory::CollectParquetFragments(const Partitioning& partitioning) auto partition_expression = partitioning - .Parse(StripPrefixAndFilename(path, options_.partition_base_dir).directory) + .Parse(StripPrefixAndFilename(path, options_.partition_base_dir)) .ValueOr(compute::literal(true)); ARROW_ASSIGN_OR_RAISE( diff --git a/cpp/src/arrow/dataset/partition.cc b/cpp/src/arrow/dataset/partition.cc index 67797a99daa..861fd6374ee 100644 --- a/cpp/src/arrow/dataset/partition.cc +++ b/cpp/src/arrow/dataset/partition.cc @@ -82,8 +82,7 @@ std::shared_ptr Partitioning::Default() { std::string type_name() const override { return "default"; } - Result Parse(const std::string& directory, - const std::string& prefix) const override { + Result Parse(const PartitionPathFormat& path) const override { return compute::literal(true); } @@ -251,11 +250,10 @@ Result KeyValuePartitioning::ConvertKey(const Key& key) con compute::literal(std::move(converted))); } -Result KeyValuePartitioning::Parse(const std::string& directory, - const std::string& prefix) const { +Result KeyValuePartitioning::Parse(const PartitionPathFormat& path) const { std::vector expressions; - ARROW_ASSIGN_OR_RAISE(auto parsed, ParseKeys(directory, prefix)); + ARROW_ASSIGN_OR_RAISE(auto parsed, ParseKeys(path)); for (const Key& key : parsed) { ARROW_ASSIGN_OR_RAISE(auto expr, ConvertKey(key)); if (expr == compute::literal(true)) continue; @@ -378,8 +376,8 @@ DirectoryPartitioning::DirectoryPartitioning(std::shared_ptr schema, } Result> DirectoryPartitioning::ParseKeys( - const std::string& directory, const std::string& prefix) const { - std::vector segments = fs::internal::SplitAbstractPath(directory); + const PartitionPathFormat& path) const { + std::vector segments = fs::internal::SplitAbstractPath(path.directory); return ParsePartitionSegments(segments); } @@ -391,9 +389,9 @@ FilenamePartitioning::FilenamePartitioning(std::shared_ptr schema, } Result> FilenamePartitioning::ParseKeys( - const std::string& directory, const std::string& prefix) const { + const PartitionPathFormat& path) const { std::vector segments = - fs::internal::SplitAbstractPath(StripNonPrefix(prefix), kFilenamePartitionSep); + fs::internal::SplitAbstractPath(StripNonPrefix(path.prefix), kFilenamePartitionSep); return ParsePartitionSegments(segments); } @@ -720,10 +718,10 @@ Result> HivePartitioning::ParseKey( } Result> HivePartitioning::ParseKeys( - const std::string& directory, const std::string& prefix) const { + const PartitionPathFormat& path) const { std::vector keys; - for (const auto& segment : fs::internal::SplitAbstractPath(directory)) { + for (const auto& segment : fs::internal::SplitAbstractPath(path.directory)) { ARROW_ASSIGN_OR_RAISE(auto maybe_key, ParseKey(segment, hive_options_)); if (auto key = maybe_key) { keys.push_back(std::move(*key)); diff --git a/cpp/src/arrow/dataset/partition.h b/cpp/src/arrow/dataset/partition.h index a8af1291ddc..897581809eb 100644 --- a/cpp/src/arrow/dataset/partition.h +++ b/cpp/src/arrow/dataset/partition.h @@ -80,8 +80,7 @@ class ARROW_DS_EXPORT Partitioning { const std::shared_ptr& batch) const = 0; /// \brief Parse a path into a partition expression - virtual Result Parse(const std::string& directory = "", - const std::string& prefix = "") const = 0; + virtual Result Parse(const PartitionPathFormat& path) const = 0; virtual Result Format(const compute::Expression& expr) const = 0; @@ -175,8 +174,7 @@ class ARROW_DS_EXPORT KeyValuePartitioning : public Partitioning { Result Partition( const std::shared_ptr& batch) const override; - Result Parse(const std::string& directory = "", - const std::string& prefix = "") const override; + Result Parse(const PartitionPathFormat& path) const override; Result Format(const compute::Expression& expr) const override; @@ -193,8 +191,7 @@ class ARROW_DS_EXPORT KeyValuePartitioning : public Partitioning { } } - virtual Result> ParseKeys(const std::string& directory, - const std::string& prefix) const = 0; + virtual Result> ParseKeys(const PartitionPathFormat& path) const = 0; virtual Result FormatValues(const ScalarVector& values) const = 0; @@ -234,8 +231,7 @@ class ARROW_DS_EXPORT DirectoryPartitioning : public KeyValuePartitioning { std::vector field_names, PartitioningFactoryOptions = {}); private: - Result> ParseKeys(const std::string& directory, - const std::string& prefix) const override; + Result> ParseKeys(const PartitionPathFormat& path) const override; Result FormatValues(const ScalarVector& values) const override; }; @@ -292,8 +288,7 @@ class ARROW_DS_EXPORT HivePartitioning : public KeyValuePartitioning { private: const HivePartitioningOptions hive_options_; - Result> ParseKeys(const std::string& directory, - const std::string& prefix) const override; + Result> ParseKeys(const PartitionPathFormat& path) const override; Result FormatValues(const ScalarVector& values) const override; }; @@ -315,9 +310,8 @@ class ARROW_DS_EXPORT FunctionPartitioning : public Partitioning { std::string type_name() const override { return name_; } - Result Parse(const std::string& directory = "", - const std::string& prefix = "") const override { - return parse_impl_(directory); + Result Parse(const PartitionPathFormat& path) const override { + return parse_impl_(path.directory); } Result Format(const compute::Expression& expr) const override { @@ -359,8 +353,7 @@ class ARROW_DS_EXPORT FilenamePartitioning : public KeyValuePartitioning { std::vector field_names, PartitioningFactoryOptions = {}); private: - Result> ParseKeys(const std::string& directory, - const std::string& prefix) const override; + Result> ParseKeys(const PartitionPathFormat& path) const override; Result FormatValues(const ScalarVector& values) const override; }; diff --git a/cpp/src/arrow/dataset/partition_test.cc b/cpp/src/arrow/dataset/partition_test.cc index 51939ac7092..efb1ecf6b25 100644 --- a/cpp/src/arrow/dataset/partition_test.cc +++ b/cpp/src/arrow/dataset/partition_test.cc @@ -44,11 +44,11 @@ namespace dataset { class TestPartitioning : public ::testing::Test { public: void AssertParseError(const std::string& path) { - ASSERT_RAISES(Invalid, partitioning_->Parse(path)); + ASSERT_RAISES(Invalid, partitioning_->Parse(PartitionPathFormat{path,""})); } void AssertParse(const std::string& path, compute::Expression expected) { - ASSERT_OK_AND_ASSIGN(auto parsed, partitioning_->Parse(path)); + ASSERT_OK_AND_ASSIGN(auto parsed, partitioning_->Parse(PartitionPathFormat{path,""})); ASSERT_EQ(parsed, expected); } @@ -68,7 +68,7 @@ class TestPartitioning : public ::testing::Test { // ensure the formatted path round trips the relevant components of the partition // expression: roundtripped should be a subset of expr ASSERT_OK_AND_ASSIGN(compute::Expression roundtripped, - partitioning_->Parse(formatted.directory)); + partitioning_->Parse(formatted)); ASSERT_OK_AND_ASSIGN(roundtripped, roundtripped.Bind(*written_schema_)); ASSERT_OK_AND_ASSIGN(auto simplified, SimplifyWithGuarantee(roundtripped, expr)); @@ -656,7 +656,7 @@ TEST_F(TestPartitioning, UrlEncodedDirectory) { EXPECT_RAISES_WITH_MESSAGE_THAT(Invalid, ::testing::HasSubstr("was not valid UTF-8"), factory_->Inspect({"/%AF/%BF/%CF"})); EXPECT_RAISES_WITH_MESSAGE_THAT(Invalid, ::testing::HasSubstr("was not valid UTF-8"), - partitioning_->Parse({"/%AF/%BF/%CF"})); + partitioning_->Parse({"/%AF/%BF/%CF",""})); options.segment_encoding = SegmentEncoding::None; options.schema = @@ -706,7 +706,7 @@ TEST_F(TestPartitioning, UrlEncodedHive) { EXPECT_RAISES_WITH_MESSAGE_THAT(Invalid, ::testing::HasSubstr("was not valid UTF-8"), factory_->Inspect({"/date=%AF/time=%BF/str=%CF"})); EXPECT_RAISES_WITH_MESSAGE_THAT(Invalid, ::testing::HasSubstr("was not valid UTF-8"), - partitioning_->Parse({"/date=%AF/time=%BF/str=%CF"})); + partitioning_->Parse({"/date=%AF/time=%BF/str=%CF",""})); options.segment_encoding = SegmentEncoding::None; options.schema = @@ -729,7 +729,7 @@ TEST_F(TestPartitioning, UrlEncodedHive) { factory_->Inspect({"/date=\xAF/time=\xBF/str=\xCF"})); EXPECT_RAISES_WITH_MESSAGE_THAT( Invalid, ::testing::HasSubstr("was not valid UTF-8"), - partitioning_->Parse({"/date=\xAF/time=\xBF/str=\xCF"})); + partitioning_->Parse({"/date=\xAF/time=\xBF/str=\xCF",""})); } TEST_F(TestPartitioning, UrlEncodedHiveWithKeyEncoded) { @@ -776,7 +776,7 @@ TEST_F(TestPartitioning, UrlEncodedHiveWithKeyEncoded) { factory_->Inspect({"/%AF=2021-05-04/time=2021-05-04 07%3A27%3A00/str=%24"})); EXPECT_RAISES_WITH_MESSAGE_THAT( Invalid, ::testing::HasSubstr("was not valid UTF-8"), - partitioning_->Parse({"/%AF=2021-05-04/%BF=2021-05-04 07%3A27%3A00/str=%24"})); + partitioning_->Parse({"/%AF=2021-05-04/%BF=2021-05-04 07%3A27%3A00/str=%24",""})); } TEST_F(TestPartitioning, EtlThenHive) { @@ -801,12 +801,12 @@ TEST_F(TestPartitioning, EtlThenHive) { auto etl_segments_end = segments.begin() + etl_fields.size(); auto etl_path = fs::internal::JoinAbstractPath(segments.begin(), etl_segments_end); - ARROW_ASSIGN_OR_RAISE(auto etl_expr, etl_part.Parse(etl_path)); + ARROW_ASSIGN_OR_RAISE(auto etl_expr, etl_part.Parse({etl_path,""})); auto alphabeta_segments_end = etl_segments_end + alphabeta_fields.size(); auto alphabeta_path = fs::internal::JoinAbstractPath(etl_segments_end, alphabeta_segments_end); - ARROW_ASSIGN_OR_RAISE(auto alphabeta_expr, alphabeta_part.Parse(alphabeta_path)); + ARROW_ASSIGN_OR_RAISE(auto alphabeta_expr, alphabeta_part.Parse({alphabeta_path,""})); return and_(etl_expr, alphabeta_expr); }); @@ -873,12 +873,11 @@ class RangePartitioning : public Partitioning { std::string type_name() const override { return "range"; } - Result Parse(const std::string& directory, - const std::string& prefix) const override { + Result Parse(const PartitionPathFormat& path) const override { std::vector ranges; HivePartitioningOptions options; - for (auto segment : fs::internal::SplitAbstractPath(directory)) { + for (auto segment : fs::internal::SplitAbstractPath(path.directory)) { ARROW_ASSIGN_OR_RAISE(auto key, HivePartitioning::ParseKey(segment, options)); if (!key) { return Status::Invalid("can't parse '", segment, "' as a range"); From ecaffb7413b2a83d610d64b277b5394a79c0c54a Mon Sep 17 00:00:00 2001 From: Sanjiban Sengupta Date: Fri, 29 Apr 2022 14:08:16 +0530 Subject: [PATCH 07/24] test: Parse tests to use PartitionPathArgument object --- cpp/src/arrow/dataset/partition_test.cc | 107 ++++++++++++++---------- cpp/src/arrow/filesystem/path_util.cc | 8 +- 2 files changed, 65 insertions(+), 50 deletions(-) diff --git a/cpp/src/arrow/dataset/partition_test.cc b/cpp/src/arrow/dataset/partition_test.cc index efb1ecf6b25..4209152e6f0 100644 --- a/cpp/src/arrow/dataset/partition_test.cc +++ b/cpp/src/arrow/dataset/partition_test.cc @@ -43,12 +43,12 @@ namespace dataset { class TestPartitioning : public ::testing::Test { public: - void AssertParseError(const std::string& path) { - ASSERT_RAISES(Invalid, partitioning_->Parse(PartitionPathFormat{path,""})); + void AssertParseError(const PartitionPathFormat& path) { + ASSERT_RAISES(Invalid, partitioning_->Parse(path)); } - void AssertParse(const std::string& path, compute::Expression expected) { - ASSERT_OK_AND_ASSIGN(auto parsed, partitioning_->Parse(PartitionPathFormat{path,""})); + void AssertParse(const PartitionPathFormat& path, compute::Expression expected) { + ASSERT_OK_AND_ASSIGN(auto parsed, partitioning_->Parse(path)); ASSERT_EQ(parsed, expected); } @@ -69,7 +69,6 @@ class TestPartitioning : public ::testing::Test { // expression: roundtripped should be a subset of expr ASSERT_OK_AND_ASSIGN(compute::Expression roundtripped, partitioning_->Parse(formatted)); - ASSERT_OK_AND_ASSIGN(roundtripped, roundtripped.Bind(*written_schema_)); ASSERT_OK_AND_ASSIGN(auto simplified, SimplifyWithGuarantee(roundtripped, expr)); ASSERT_EQ(simplified, literal(true)); @@ -188,19 +187,35 @@ TEST_F(TestPartitioning, DirectoryPartitioning) { partitioning_ = std::make_shared( schema({field("alpha", int32()), field("beta", utf8())})); - AssertParse("/0/hello", and_(equal(field_ref("alpha"), literal(0)), + AssertParse({"/0/hello",""}, and_(equal(field_ref("alpha"), literal(0)), equal(field_ref("beta"), literal("hello")))); - AssertParse("/3", equal(field_ref("alpha"), literal(3))); - AssertParseError("/world/0"); // reversed order - AssertParseError("/0.0/foo"); // invalid alpha - AssertParseError("/3.25"); // invalid alpha with missing beta - AssertParse("", literal(true)); // no segments to parse + AssertParse({"/3",""}, equal(field_ref("alpha"), literal(3))); + AssertParseError({"/world/0",""}); // reversed order + AssertParseError({"/0.0/foo",""}); // invalid alpha + AssertParseError({"/3.25",""}); // invalid alpha with missing beta + AssertParse({"",""}, literal(true)); // no segments to parse // gotcha someday: - AssertParse("/0/dat.parquet", and_(equal(field_ref("alpha"), literal(0)), + AssertParse({"/0/dat.parquet",""}, and_(equal(field_ref("alpha"), literal(0)), equal(field_ref("beta"), literal("dat.parquet")))); - AssertParse("/0/foo/ignored=2341", and_(equal(field_ref("alpha"), literal(0)), + AssertParse({"/0/foo/ignored=2341",""}, and_(equal(field_ref("alpha"), literal(0)), + equal(field_ref("beta"), literal("foo")))); +} + +TEST_F(TestPartitioning, FilenamePartitioning) { + partitioning_ = std::make_shared( + schema({field("alpha", int32()), field("beta", utf8())})); + + AssertParse({"","0_hello_"}, and_(equal(field_ref("alpha"), literal(0)), + equal(field_ref("beta"), literal("hello")))); + AssertParse({"","0_"},equal(field_ref("alpha"), literal(0))); + AssertParseError({"","world_0_"}); // reversed order + AssertParseError({"","0.0_foo_"}); // invalid alpha + AssertParseError({"","3.25_"}); // invalid alpha with missing beta + AssertParse({"",""}, literal(true)); // no segments to parse + + AssertParse({"","0_foo_ignored=2341"}, and_(equal(field_ref("alpha"), literal(0)), equal(field_ref("beta"), literal("foo")))); } @@ -267,7 +282,7 @@ TEST_F(TestPartitioning, DirectoryPartitioningWithTemporal) { schema({field("year", int32()), field("month", int8()), field("day", temporal)})); ASSERT_OK_AND_ASSIGN(auto day, StringScalar("2020-06-08").CastTo(temporal)); - AssertParse("/2020/06/2020-06-08", + AssertParse({"/2020/06/2020-06-08",""}, and_({equal(field_ref("year"), literal(2020)), equal(field_ref("month"), literal(6)), equal(field_ref("day"), literal(day))})); @@ -369,10 +384,10 @@ TEST_F(TestPartitioning, DictionaryHasUniqueValues) { std::make_shared(index_and_dictionary, alpha->type()); auto path = "/" + expected_dictionary->GetString(i); - AssertParse(path, equal(field_ref("alpha"), literal(dictionary_scalar))); + AssertParse({path,""}, equal(field_ref("alpha"), literal(dictionary_scalar))); } - AssertParseError("/yosemite"); // not in inspected dictionary + AssertParseError({"/yosemite",""}); // not in inspected dictionary } TEST_F(TestPartitioning, DiscoverSchemaSegfault) { @@ -385,27 +400,27 @@ TEST_F(TestPartitioning, HivePartitioning) { partitioning_ = std::make_shared( schema({field("alpha", int32()), field("beta", float32())}), ArrayVector(), "xyz"); - AssertParse("/alpha=0/beta=3.25", and_(equal(field_ref("alpha"), literal(0)), + AssertParse({"/alpha=0/beta=3.25",""}, and_(equal(field_ref("alpha"), literal(0)), equal(field_ref("beta"), literal(3.25f)))); - AssertParse("/beta=3.25/alpha=0", and_(equal(field_ref("beta"), literal(3.25f)), + AssertParse({"/beta=3.25/alpha=0",""}, and_(equal(field_ref("beta"), literal(3.25f)), equal(field_ref("alpha"), literal(0)))); - AssertParse("/alpha=0", equal(field_ref("alpha"), literal(0))); - AssertParse("/alpha=xyz/beta=3.25", and_(is_null(field_ref("alpha")), + AssertParse({"/alpha=0",""}, equal(field_ref("alpha"), literal(0))); + AssertParse({"/alpha=xyz/beta=3.25",""}, and_(is_null(field_ref("alpha")), equal(field_ref("beta"), literal(3.25f)))); - AssertParse("/beta=3.25", equal(field_ref("beta"), literal(3.25f))); - AssertParse("", literal(true)); + AssertParse({"/beta=3.25",""}, equal(field_ref("beta"), literal(3.25f))); + AssertParse({"",""}, literal(true)); - AssertParse("/alpha=0/unexpected/beta=3.25", + AssertParse({"/alpha=0/unexpected/beta=3.25",""}, and_(equal(field_ref("alpha"), literal(0)), equal(field_ref("beta"), literal(3.25f)))); - AssertParse("/alpha=0/beta=3.25/ignored=2341", + AssertParse({"/alpha=0/beta=3.25/ignored=2341",""}, and_(equal(field_ref("alpha"), literal(0)), equal(field_ref("beta"), literal(3.25f)))); - AssertParse("/ignored=2341", literal(true)); + AssertParse({"/ignored=2341",""}, literal(true)); - AssertParseError("/alpha=0.0/beta=3.25"); // conversion of "0.0" to int32 fails + AssertParseError({"/alpha=0.0/beta=3.25",""}); // conversion of "0.0" to int32 fails } TEST_F(TestPartitioning, HivePartitioningFormat) { @@ -543,10 +558,10 @@ TEST_F(TestPartitioning, HiveDictionaryHasUniqueValues) { std::make_shared(index_and_dictionary, alpha->type()); auto path = "/alpha=" + expected_dictionary->GetString(i); - AssertParse(path, equal(field_ref("alpha"), literal(dictionary_scalar))); + AssertParse({path,""}, equal(field_ref("alpha"), literal(dictionary_scalar))); } - AssertParseError("/alpha=yosemite"); // not in inspected dictionary + AssertParseError({"/alpha=yosemite",""}); // not in inspected dictionary } TEST_F(TestPartitioning, ExistingSchemaDirectory) { @@ -647,7 +662,7 @@ TEST_F(TestPartitioning, UrlEncodedDirectory) { auto date = std::make_shared(1620086400, ts); auto time = std::make_shared(1620113220, ts); partitioning_ = std::make_shared(options.schema, ArrayVector()); - AssertParse("/2021-05-04 00%3A00%3A00/2021-05-04 07%3A27%3A00/%24", + AssertParse({"/2021-05-04 00%3A00%3A00/2021-05-04 07%3A27%3A00/%24",""}, and_({equal(field_ref("date"), literal(date)), equal(field_ref("time"), literal(time)), equal(field_ref("str"), literal("$"))})); @@ -667,7 +682,7 @@ TEST_F(TestPartitioning, UrlEncodedDirectory) { options.schema->fields()); partitioning_ = std::make_shared( options.schema, ArrayVector(), options.AsPartitioningOptions()); - AssertParse("/2021-05-04 00%3A00%3A00/2021-05-04 07%3A27%3A00/%24", + AssertParse({"/2021-05-04 00%3A00%3A00/2021-05-04 07%3A27%3A00/%24",""}, and_({equal(field_ref("date"), literal("2021-05-04 00%3A00%3A00")), equal(field_ref("time"), literal("2021-05-04 07%3A27%3A00")), equal(field_ref("str"), literal("%24"))})); @@ -690,15 +705,15 @@ TEST_F(TestPartitioning, UrlEncodedHive) { auto time = std::make_shared(1620113220, ts); partitioning_ = std::make_shared(options.schema, ArrayVector(), options.AsHivePartitioningOptions()); - AssertParse("/date=2021-05-04 00:00:00/time=2021-05-04 07:27:00/str=$", + AssertParse({"/date=2021-05-04 00:00:00/time=2021-05-04 07:27:00/str=$",""}, and_({equal(field_ref("date"), literal(date)), equal(field_ref("time"), literal(time)), is_null(field_ref("str"))})); - AssertParse("/date=2021-05-04 00:00:00/time=2021-05-04 07:27:00/str=%E3%81%8F%E3%81%BE", + AssertParse({"/date=2021-05-04 00:00:00/time=2021-05-04 07:27:00/str=%E3%81%8F%E3%81%BE",""}, and_({equal(field_ref("date"), literal(date)), equal(field_ref("time"), literal(time)), equal(field_ref("str"), literal("\xE3\x81\x8F\xE3\x81\xBE"))})); // URL-encoded null fallback value - AssertParse("/date=2021-05-04 00%3A00%3A00/time=2021-05-04 07%3A27%3A00/str=%24", + AssertParse({"/date=2021-05-04 00%3A00%3A00/time=2021-05-04 07%3A27%3A00/str=%24",""}, and_({equal(field_ref("date"), literal(date)), equal(field_ref("time"), literal(time)), is_null(field_ref("str"))})); @@ -719,7 +734,7 @@ TEST_F(TestPartitioning, UrlEncodedHive) { options.schema->fields()); partitioning_ = std::make_shared(options.schema, ArrayVector(), options.AsHivePartitioningOptions()); - AssertParse("/date=2021-05-04 00%3A00%3A00/time=2021-05-04 07%3A27%3A00/str=%24", + AssertParse({"/date=2021-05-04 00%3A00%3A00/time=2021-05-04 07%3A27%3A00/str=%24",""}, and_({equal(field_ref("date"), literal("2021-05-04 00%3A00%3A00")), equal(field_ref("time"), literal("2021-05-04 07%3A27%3A00")), equal(field_ref("str"), literal("%24"))})); @@ -753,20 +768,20 @@ TEST_F(TestPartitioning, UrlEncodedHiveWithKeyEncoded) { partitioning_ = std::make_shared(options.schema, ArrayVector(), options.AsHivePartitioningOptions()); AssertParse( - "/test%27%3B%20date=2021-05-04 00:00:00/test%27%3B%20time=2021-05-04 " - "07:27:00/str=$", + {"/test%27%3B%20date=2021-05-04 00:00:00/test%27%3B%20time=2021-05-04 " + "07:27:00/str=$",""}, and_({equal(field_ref("test'; date"), literal(date)), equal(field_ref("test'; time"), literal(time)), is_null(field_ref("str"))})); AssertParse( - "/test%27%3B%20date=2021-05-04 00:00:00/test%27%3B%20time=2021-05-04 " - "07:27:00/str=%E3%81%8F%E3%81%BE", + {"/test%27%3B%20date=2021-05-04 00:00:00/test%27%3B%20time=2021-05-04 " + "07:27:00/str=%E3%81%8F%E3%81%BE",""}, and_({equal(field_ref("test'; date"), literal(date)), equal(field_ref("test'; time"), literal(time)), equal(field_ref("str"), literal("\xE3\x81\x8F\xE3\x81\xBE"))})); // URL-encoded null fallback value AssertParse( - "/test%27%3B%20date=2021-05-04 00%3A00%3A00/test%27%3B%20time=2021-05-04 " - "07%3A27%3A00/str=%24", + {"/test%27%3B%20date=2021-05-04 00%3A00%3A00/test%27%3B%20time=2021-05-04 " + "07%3A27%3A00/str=%24",""}, and_({equal(field_ref("test'; date"), literal(date)), equal(field_ref("test'; time"), literal(time)), is_null(field_ref("str"))})); @@ -811,7 +826,7 @@ TEST_F(TestPartitioning, EtlThenHive) { return and_(etl_expr, alphabeta_expr); }); - AssertParse("/1999/12/31/00/alpha=0/beta=3.25", + AssertParse({"/1999/12/31/00/alpha=0/beta=3.25",""}, and_({equal(field_ref("year"), literal(1999)), equal(field_ref("month"), literal(12)), equal(field_ref("day"), literal(31)), @@ -819,7 +834,7 @@ TEST_F(TestPartitioning, EtlThenHive) { and_(equal(field_ref("alpha"), literal(0)), equal(field_ref("beta"), literal(3.25f)))})); - AssertParseError("/20X6/03/21/05/alpha=0/beta=3.25"); + AssertParseError({"/20X6/03/21/05/alpha=0/beta=3.25",""}); } TEST_F(TestPartitioning, Set) { @@ -860,9 +875,9 @@ TEST_F(TestPartitioning, Set) { auto x_in = [&](std::vector set) { return call("is_in", {field_ref("x")}, compute::SetLookupOptions{ints(set)}); }; - AssertParse("/x in [1]", x_in({1})); - AssertParse("/x in [1 4 5]", x_in({1, 4, 5})); - AssertParse("/x in []", x_in({})); + AssertParse({"/x in [1]",""}, x_in({1})); + AssertParse({"/x in [1 4 5]",""}, x_in({1, 4, 5})); + AssertParse({"/x in []",""}, x_in({})); } // An adhoc partitioning which parses segments like "/x=[-3.25, 0.0)" @@ -932,7 +947,7 @@ TEST_F(TestPartitioning, Range) { partitioning_ = std::make_shared( schema({field("x", float64()), field("y", float64()), field("z", float64())})); - AssertParse("/x=[-1.5 0.0)/y=[0.0 1.5)/z=(1.5 3.0]", + AssertParse({"/x=[-1.5 0.0)/y=[0.0 1.5)/z=(1.5 3.0]",""}, and_({and_(greater_equal(field_ref("x"), literal(-1.5)), less(field_ref("x"), literal(0.0))), and_(greater_equal(field_ref("y"), literal(0.0)), diff --git a/cpp/src/arrow/filesystem/path_util.cc b/cpp/src/arrow/filesystem/path_util.cc index 83ff069ecff..04484dd5bda 100644 --- a/cpp/src/arrow/filesystem/path_util.cc +++ b/cpp/src/arrow/filesystem/path_util.cc @@ -34,12 +34,12 @@ namespace internal { std::vector SplitAbstractPath(const std::string& path, char sep) { std::vector parts; auto v = util::string_view(path); - // Strip trailing slash - if (v.length() > 0 && v.back() == kSep) { + // Strip trailing separator + if (v.length() > 0 && v.back() == sep) { v = v.substr(0, v.length() - 1); } - // Strip leading slash - if (v.length() > 0 && v.front() == kSep) { + // Strip leading separator + if (v.length() > 0 && v.front() == sep) { v = v.substr(1); } if (v.length() == 0) { From 9d277cd54786d5e656aa7e1b752f046321ef01c7 Mon Sep 17 00:00:00 2001 From: Sanjiban Sengupta Date: Fri, 29 Apr 2022 21:48:18 +0530 Subject: [PATCH 08/24] feat: modify Parse() method in PyArrow to use PartitionPathFormat --- cpp/src/arrow/dataset/discovery.cc | 3 +- cpp/src/arrow/dataset/file_parquet.cc | 3 +- cpp/src/arrow/dataset/partition.cc | 5 +- cpp/src/arrow/dataset/partition_test.cc | 146 ++++++++++--------- python/pyarrow/_dataset.pyx | 5 +- python/pyarrow/includes/libarrow_dataset.pxd | 6 +- 6 files changed, 91 insertions(+), 77 deletions(-) diff --git a/cpp/src/arrow/dataset/discovery.cc b/cpp/src/arrow/dataset/discovery.cc index 668624ac09a..f8382aa4052 100644 --- a/cpp/src/arrow/dataset/discovery.cc +++ b/cpp/src/arrow/dataset/discovery.cc @@ -280,8 +280,7 @@ Result> FileSystemDatasetFactory::Finish(FinishOptions std::vector> fragments; for (const auto& info : files_) { auto fixed_path = StripPrefixAndFilename(info.path(), options_.partition_base_dir); - ARROW_ASSIGN_OR_RAISE(auto partition, - partitioning->Parse(fixed_path)); + ARROW_ASSIGN_OR_RAISE(auto partition, partitioning->Parse(fixed_path)); ARROW_ASSIGN_OR_RAISE(auto fragment, format_->MakeFragment({info, fs_}, partition)); fragments.push_back(fragment); } diff --git a/cpp/src/arrow/dataset/file_parquet.cc b/cpp/src/arrow/dataset/file_parquet.cc index 8f9590a6e17..a1e9698538e 100644 --- a/cpp/src/arrow/dataset/file_parquet.cc +++ b/cpp/src/arrow/dataset/file_parquet.cc @@ -894,8 +894,7 @@ ParquetDatasetFactory::CollectParquetFragments(const Partitioning& partitioning) auto row_groups = Iota(metadata_subset->num_row_groups()); auto partition_expression = - partitioning - .Parse(StripPrefixAndFilename(path, options_.partition_base_dir)) + partitioning.Parse(StripPrefixAndFilename(path, options_.partition_base_dir)) .ValueOr(compute::literal(true)); ARROW_ASSIGN_OR_RAISE( diff --git a/cpp/src/arrow/dataset/partition.cc b/cpp/src/arrow/dataset/partition.cc index 861fd6374ee..3a9294fe0d5 100644 --- a/cpp/src/arrow/dataset/partition.cc +++ b/cpp/src/arrow/dataset/partition.cc @@ -250,7 +250,8 @@ Result KeyValuePartitioning::ConvertKey(const Key& key) con compute::literal(std::move(converted))); } -Result KeyValuePartitioning::Parse(const PartitionPathFormat& path) const { +Result KeyValuePartitioning::Parse( + const PartitionPathFormat& path) const { std::vector expressions; ARROW_ASSIGN_OR_RAISE(auto parsed, ParseKeys(path)); @@ -809,7 +810,7 @@ PartitionPathFormat StripPrefixAndFilename(const std::string& path, auto maybe_base_less = fs::internal::RemoveAncestor(prefix, path); auto base_less = maybe_base_less ? std::string(*maybe_base_less) : path; auto basename_filename = fs::internal::GetAbstractPathParent(base_less); - return PartitionPathFormat{basename_filename.first, basename_filename.second}; + return PartitionPathFormat{std::move(basename_filename.first), std::move(basename_filename.second)}; } std::vector StripPrefixAndFilename(const std::vector& paths, diff --git a/cpp/src/arrow/dataset/partition_test.cc b/cpp/src/arrow/dataset/partition_test.cc index 4209152e6f0..728ad83931a 100644 --- a/cpp/src/arrow/dataset/partition_test.cc +++ b/cpp/src/arrow/dataset/partition_test.cc @@ -187,36 +187,38 @@ TEST_F(TestPartitioning, DirectoryPartitioning) { partitioning_ = std::make_shared( schema({field("alpha", int32()), field("beta", utf8())})); - AssertParse({"/0/hello",""}, and_(equal(field_ref("alpha"), literal(0)), - equal(field_ref("beta"), literal("hello")))); - AssertParse({"/3",""}, equal(field_ref("alpha"), literal(3))); - AssertParseError({"/world/0",""}); // reversed order - AssertParseError({"/0.0/foo",""}); // invalid alpha - AssertParseError({"/3.25",""}); // invalid alpha with missing beta - AssertParse({"",""}, literal(true)); // no segments to parse + AssertParse({"/0/hello", ""}, and_(equal(field_ref("alpha"), literal(0)), + equal(field_ref("beta"), literal("hello")))); + AssertParse({"/3", ""}, equal(field_ref("alpha"), literal(3))); + AssertParseError({"/world/0", ""}); // reversed order + AssertParseError({"/0.0/foo", ""}); // invalid alpha + AssertParseError({"/3.25", ""}); // invalid alpha with missing beta + AssertParse({"", ""}, literal(true)); // no segments to parse // gotcha someday: - AssertParse({"/0/dat.parquet",""}, and_(equal(field_ref("alpha"), literal(0)), - equal(field_ref("beta"), literal("dat.parquet")))); + AssertParse({"/0/dat.parquet", ""}, + and_(equal(field_ref("alpha"), literal(0)), + equal(field_ref("beta"), literal("dat.parquet")))); - AssertParse({"/0/foo/ignored=2341",""}, and_(equal(field_ref("alpha"), literal(0)), - equal(field_ref("beta"), literal("foo")))); + AssertParse({"/0/foo/ignored=2341", ""}, + and_(equal(field_ref("alpha"), literal(0)), + equal(field_ref("beta"), literal("foo")))); } TEST_F(TestPartitioning, FilenamePartitioning) { partitioning_ = std::make_shared( schema({field("alpha", int32()), field("beta", utf8())})); - AssertParse({"","0_hello_"}, and_(equal(field_ref("alpha"), literal(0)), - equal(field_ref("beta"), literal("hello")))); - AssertParse({"","0_"},equal(field_ref("alpha"), literal(0))); - AssertParseError({"","world_0_"}); // reversed order - AssertParseError({"","0.0_foo_"}); // invalid alpha - AssertParseError({"","3.25_"}); // invalid alpha with missing beta - AssertParse({"",""}, literal(true)); // no segments to parse + AssertParse({"", "0_hello_"}, and_(equal(field_ref("alpha"), literal(0)), + equal(field_ref("beta"), literal("hello")))); + AssertParse({"", "0_"}, equal(field_ref("alpha"), literal(0))); + AssertParseError({"", "world_0_"}); // reversed order + AssertParseError({"", "0.0_foo_"}); // invalid alpha + AssertParseError({"", "3.25_"}); // invalid alpha with missing beta + AssertParse({"", ""}, literal(true)); // no segments to parse - AssertParse({"","0_foo_ignored=2341"}, and_(equal(field_ref("alpha"), literal(0)), - equal(field_ref("beta"), literal("foo")))); + AssertParse({"", "0_foo_ignored=2341"}, and_(equal(field_ref("alpha"), literal(0)), + equal(field_ref("beta"), literal("foo")))); } TEST_F(TestPartitioning, DirectoryPartitioningFormat) { @@ -282,7 +284,7 @@ TEST_F(TestPartitioning, DirectoryPartitioningWithTemporal) { schema({field("year", int32()), field("month", int8()), field("day", temporal)})); ASSERT_OK_AND_ASSIGN(auto day, StringScalar("2020-06-08").CastTo(temporal)); - AssertParse({"/2020/06/2020-06-08",""}, + AssertParse({"/2020/06/2020-06-08", ""}, and_({equal(field_ref("year"), literal(2020)), equal(field_ref("month"), literal(6)), equal(field_ref("day"), literal(day))})); @@ -384,10 +386,10 @@ TEST_F(TestPartitioning, DictionaryHasUniqueValues) { std::make_shared(index_and_dictionary, alpha->type()); auto path = "/" + expected_dictionary->GetString(i); - AssertParse({path,""}, equal(field_ref("alpha"), literal(dictionary_scalar))); + AssertParse({path, ""}, equal(field_ref("alpha"), literal(dictionary_scalar))); } - AssertParseError({"/yosemite",""}); // not in inspected dictionary + AssertParseError({"/yosemite", ""}); // not in inspected dictionary } TEST_F(TestPartitioning, DiscoverSchemaSegfault) { @@ -400,27 +402,28 @@ TEST_F(TestPartitioning, HivePartitioning) { partitioning_ = std::make_shared( schema({field("alpha", int32()), field("beta", float32())}), ArrayVector(), "xyz"); - AssertParse({"/alpha=0/beta=3.25",""}, and_(equal(field_ref("alpha"), literal(0)), - equal(field_ref("beta"), literal(3.25f)))); - AssertParse({"/beta=3.25/alpha=0",""}, and_(equal(field_ref("beta"), literal(3.25f)), - equal(field_ref("alpha"), literal(0)))); - AssertParse({"/alpha=0",""}, equal(field_ref("alpha"), literal(0))); - AssertParse({"/alpha=xyz/beta=3.25",""}, and_(is_null(field_ref("alpha")), - equal(field_ref("beta"), literal(3.25f)))); - AssertParse({"/beta=3.25",""}, equal(field_ref("beta"), literal(3.25f))); - AssertParse({"",""}, literal(true)); - - AssertParse({"/alpha=0/unexpected/beta=3.25",""}, + AssertParse({"/alpha=0/beta=3.25", ""}, and_(equal(field_ref("alpha"), literal(0)), + equal(field_ref("beta"), literal(3.25f)))); + AssertParse({"/beta=3.25/alpha=0", ""}, and_(equal(field_ref("beta"), literal(3.25f)), + equal(field_ref("alpha"), literal(0)))); + AssertParse({"/alpha=0", ""}, equal(field_ref("alpha"), literal(0))); + AssertParse( + {"/alpha=xyz/beta=3.25", ""}, + and_(is_null(field_ref("alpha")), equal(field_ref("beta"), literal(3.25f)))); + AssertParse({"/beta=3.25", ""}, equal(field_ref("beta"), literal(3.25f))); + AssertParse({"", ""}, literal(true)); + + AssertParse({"/alpha=0/beta=3.25/ignored=2341", ""}, and_(equal(field_ref("alpha"), literal(0)), equal(field_ref("beta"), literal(3.25f)))); - AssertParse({"/alpha=0/beta=3.25/ignored=2341",""}, + AssertParse({"/alpha=0/beta=3.25/ignored=2341", ""}, and_(equal(field_ref("alpha"), literal(0)), equal(field_ref("beta"), literal(3.25f)))); - AssertParse({"/ignored=2341",""}, literal(true)); + AssertParse({"/ignored=2341", ""}, literal(true)); - AssertParseError({"/alpha=0.0/beta=3.25",""}); // conversion of "0.0" to int32 fails + AssertParseError({"/alpha=0.0/beta=3.25", ""}); // conversion of "0.0" to int32 fails } TEST_F(TestPartitioning, HivePartitioningFormat) { @@ -558,10 +561,10 @@ TEST_F(TestPartitioning, HiveDictionaryHasUniqueValues) { std::make_shared(index_and_dictionary, alpha->type()); auto path = "/alpha=" + expected_dictionary->GetString(i); - AssertParse({path,""}, equal(field_ref("alpha"), literal(dictionary_scalar))); + AssertParse({path, ""}, equal(field_ref("alpha"), literal(dictionary_scalar))); } - AssertParseError({"/alpha=yosemite",""}); // not in inspected dictionary + AssertParseError({"/alpha=yosemite", ""}); // not in inspected dictionary } TEST_F(TestPartitioning, ExistingSchemaDirectory) { @@ -662,7 +665,7 @@ TEST_F(TestPartitioning, UrlEncodedDirectory) { auto date = std::make_shared(1620086400, ts); auto time = std::make_shared(1620113220, ts); partitioning_ = std::make_shared(options.schema, ArrayVector()); - AssertParse({"/2021-05-04 00%3A00%3A00/2021-05-04 07%3A27%3A00/%24",""}, + AssertParse({"/2021-05-04 00%3A00%3A00/2021-05-04 07%3A27%3A00/%24", ""}, and_({equal(field_ref("date"), literal(date)), equal(field_ref("time"), literal(time)), equal(field_ref("str"), literal("$"))})); @@ -671,7 +674,7 @@ TEST_F(TestPartitioning, UrlEncodedDirectory) { EXPECT_RAISES_WITH_MESSAGE_THAT(Invalid, ::testing::HasSubstr("was not valid UTF-8"), factory_->Inspect({"/%AF/%BF/%CF"})); EXPECT_RAISES_WITH_MESSAGE_THAT(Invalid, ::testing::HasSubstr("was not valid UTF-8"), - partitioning_->Parse({"/%AF/%BF/%CF",""})); + partitioning_->Parse({"/%AF/%BF/%CF", ""})); options.segment_encoding = SegmentEncoding::None; options.schema = @@ -682,7 +685,7 @@ TEST_F(TestPartitioning, UrlEncodedDirectory) { options.schema->fields()); partitioning_ = std::make_shared( options.schema, ArrayVector(), options.AsPartitioningOptions()); - AssertParse({"/2021-05-04 00%3A00%3A00/2021-05-04 07%3A27%3A00/%24",""}, + AssertParse({"/2021-05-04 00%3A00%3A00/2021-05-04 07%3A27%3A00/%24", ""}, and_({equal(field_ref("date"), literal("2021-05-04 00%3A00%3A00")), equal(field_ref("time"), literal("2021-05-04 07%3A27%3A00")), equal(field_ref("str"), literal("%24"))})); @@ -705,23 +708,25 @@ TEST_F(TestPartitioning, UrlEncodedHive) { auto time = std::make_shared(1620113220, ts); partitioning_ = std::make_shared(options.schema, ArrayVector(), options.AsHivePartitioningOptions()); - AssertParse({"/date=2021-05-04 00:00:00/time=2021-05-04 07:27:00/str=$",""}, + AssertParse({"/date=2021-05-04 00:00:00/time=2021-05-04 07:27:00/str=$", ""}, and_({equal(field_ref("date"), literal(date)), equal(field_ref("time"), literal(time)), is_null(field_ref("str"))})); - AssertParse({"/date=2021-05-04 00:00:00/time=2021-05-04 07:27:00/str=%E3%81%8F%E3%81%BE",""}, - and_({equal(field_ref("date"), literal(date)), - equal(field_ref("time"), literal(time)), - equal(field_ref("str"), literal("\xE3\x81\x8F\xE3\x81\xBE"))})); + AssertParse( + {"/date=2021-05-04 00:00:00/time=2021-05-04 07:27:00/str=%E3%81%8F%E3%81%BE", ""}, + and_({equal(field_ref("date"), literal(date)), + equal(field_ref("time"), literal(time)), + equal(field_ref("str"), literal("\xE3\x81\x8F\xE3\x81\xBE"))})); // URL-encoded null fallback value - AssertParse({"/date=2021-05-04 00%3A00%3A00/time=2021-05-04 07%3A27%3A00/str=%24",""}, + AssertParse({"/date=2021-05-04 00%3A00%3A00/time=2021-05-04 07%3A27%3A00/str=%24", ""}, and_({equal(field_ref("date"), literal(date)), equal(field_ref("time"), literal(time)), is_null(field_ref("str"))})); // Invalid UTF-8 EXPECT_RAISES_WITH_MESSAGE_THAT(Invalid, ::testing::HasSubstr("was not valid UTF-8"), factory_->Inspect({"/date=%AF/time=%BF/str=%CF"})); - EXPECT_RAISES_WITH_MESSAGE_THAT(Invalid, ::testing::HasSubstr("was not valid UTF-8"), - partitioning_->Parse({"/date=%AF/time=%BF/str=%CF",""})); + EXPECT_RAISES_WITH_MESSAGE_THAT( + Invalid, ::testing::HasSubstr("was not valid UTF-8"), + partitioning_->Parse({"/date=%AF/time=%BF/str=%CF", ""})); options.segment_encoding = SegmentEncoding::None; options.schema = @@ -734,7 +739,7 @@ TEST_F(TestPartitioning, UrlEncodedHive) { options.schema->fields()); partitioning_ = std::make_shared(options.schema, ArrayVector(), options.AsHivePartitioningOptions()); - AssertParse({"/date=2021-05-04 00%3A00%3A00/time=2021-05-04 07%3A27%3A00/str=%24",""}, + AssertParse({"/date=2021-05-04 00%3A00%3A00/time=2021-05-04 07%3A27%3A00/str=%24", ""}, and_({equal(field_ref("date"), literal("2021-05-04 00%3A00%3A00")), equal(field_ref("time"), literal("2021-05-04 07%3A27%3A00")), equal(field_ref("str"), literal("%24"))})); @@ -744,7 +749,7 @@ TEST_F(TestPartitioning, UrlEncodedHive) { factory_->Inspect({"/date=\xAF/time=\xBF/str=\xCF"})); EXPECT_RAISES_WITH_MESSAGE_THAT( Invalid, ::testing::HasSubstr("was not valid UTF-8"), - partitioning_->Parse({"/date=\xAF/time=\xBF/str=\xCF",""})); + partitioning_->Parse({"/date=\xAF/time=\xBF/str=\xCF", ""})); } TEST_F(TestPartitioning, UrlEncodedHiveWithKeyEncoded) { @@ -769,19 +774,21 @@ TEST_F(TestPartitioning, UrlEncodedHiveWithKeyEncoded) { options.AsHivePartitioningOptions()); AssertParse( {"/test%27%3B%20date=2021-05-04 00:00:00/test%27%3B%20time=2021-05-04 " - "07:27:00/str=$",""}, + "07:27:00/str=$", + ""}, and_({equal(field_ref("test'; date"), literal(date)), equal(field_ref("test'; time"), literal(time)), is_null(field_ref("str"))})); - AssertParse( - {"/test%27%3B%20date=2021-05-04 00:00:00/test%27%3B%20time=2021-05-04 " - "07:27:00/str=%E3%81%8F%E3%81%BE",""}, - and_({equal(field_ref("test'; date"), literal(date)), - equal(field_ref("test'; time"), literal(time)), - equal(field_ref("str"), literal("\xE3\x81\x8F\xE3\x81\xBE"))})); + AssertParse({"/test%27%3B%20date=2021-05-04 00:00:00/test%27%3B%20time=2021-05-04 " + "07:27:00/str=%E3%81%8F%E3%81%BE", + ""}, + and_({equal(field_ref("test'; date"), literal(date)), + equal(field_ref("test'; time"), literal(time)), + equal(field_ref("str"), literal("\xE3\x81\x8F\xE3\x81\xBE"))})); // URL-encoded null fallback value AssertParse( {"/test%27%3B%20date=2021-05-04 00%3A00%3A00/test%27%3B%20time=2021-05-04 " - "07%3A27%3A00/str=%24",""}, + "07%3A27%3A00/str=%24", + ""}, and_({equal(field_ref("test'; date"), literal(date)), equal(field_ref("test'; time"), literal(time)), is_null(field_ref("str"))})); @@ -791,7 +798,7 @@ TEST_F(TestPartitioning, UrlEncodedHiveWithKeyEncoded) { factory_->Inspect({"/%AF=2021-05-04/time=2021-05-04 07%3A27%3A00/str=%24"})); EXPECT_RAISES_WITH_MESSAGE_THAT( Invalid, ::testing::HasSubstr("was not valid UTF-8"), - partitioning_->Parse({"/%AF=2021-05-04/%BF=2021-05-04 07%3A27%3A00/str=%24",""})); + partitioning_->Parse({"/%AF=2021-05-04/%BF=2021-05-04 07%3A27%3A00/str=%24", ""})); } TEST_F(TestPartitioning, EtlThenHive) { @@ -816,17 +823,18 @@ TEST_F(TestPartitioning, EtlThenHive) { auto etl_segments_end = segments.begin() + etl_fields.size(); auto etl_path = fs::internal::JoinAbstractPath(segments.begin(), etl_segments_end); - ARROW_ASSIGN_OR_RAISE(auto etl_expr, etl_part.Parse({etl_path,""})); + ARROW_ASSIGN_OR_RAISE(auto etl_expr, etl_part.Parse({etl_path, ""})); auto alphabeta_segments_end = etl_segments_end + alphabeta_fields.size(); auto alphabeta_path = fs::internal::JoinAbstractPath(etl_segments_end, alphabeta_segments_end); - ARROW_ASSIGN_OR_RAISE(auto alphabeta_expr, alphabeta_part.Parse({alphabeta_path,""})); + ARROW_ASSIGN_OR_RAISE(auto alphabeta_expr, + alphabeta_part.Parse({alphabeta_path, ""})); return and_(etl_expr, alphabeta_expr); }); - AssertParse({"/1999/12/31/00/alpha=0/beta=3.25",""}, + AssertParse({"/1999/12/31/00/alpha=0/beta=3.25", ""}, and_({equal(field_ref("year"), literal(1999)), equal(field_ref("month"), literal(12)), equal(field_ref("day"), literal(31)), @@ -834,7 +842,7 @@ TEST_F(TestPartitioning, EtlThenHive) { and_(equal(field_ref("alpha"), literal(0)), equal(field_ref("beta"), literal(3.25f)))})); - AssertParseError({"/20X6/03/21/05/alpha=0/beta=3.25",""}); + AssertParseError({"/20X6/03/21/05/alpha=0/beta=3.25", ""}); } TEST_F(TestPartitioning, Set) { @@ -875,9 +883,9 @@ TEST_F(TestPartitioning, Set) { auto x_in = [&](std::vector set) { return call("is_in", {field_ref("x")}, compute::SetLookupOptions{ints(set)}); }; - AssertParse({"/x in [1]",""}, x_in({1})); - AssertParse({"/x in [1 4 5]",""}, x_in({1, 4, 5})); - AssertParse({"/x in []",""}, x_in({})); + AssertParse({"/x in [1]", ""}, x_in({1})); + AssertParse({"/x in [1 4 5]", ""}, x_in({1, 4, 5})); + AssertParse({"/x in []", ""}, x_in({})); } // An adhoc partitioning which parses segments like "/x=[-3.25, 0.0)" @@ -947,7 +955,7 @@ TEST_F(TestPartitioning, Range) { partitioning_ = std::make_shared( schema({field("x", float64()), field("y", float64()), field("z", float64())})); - AssertParse({"/x=[-1.5 0.0)/y=[0.0 1.5)/z=(1.5 3.0]",""}, + AssertParse({"/x=[-1.5 0.0)/y=[0.0 1.5)/z=(1.5 3.0]", ""}, and_({and_(greater_equal(field_ref("x"), literal(-1.5)), less(field_ref("x"), literal(0.0))), and_(greater_equal(field_ref("y"), literal(0.0)), diff --git a/python/pyarrow/_dataset.pyx b/python/pyarrow/_dataset.pyx index 6b1b8a4ced3..7fff11b9231 100644 --- a/python/pyarrow/_dataset.pyx +++ b/python/pyarrow/_dataset.pyx @@ -1321,7 +1321,10 @@ cdef class Partitioning(_Weakrefable): def parse(self, directory="", prefix=""): cdef CResult[CExpression] result - result = self.partitioning.Parse(tobytes(directory), tobytes(prefix)) + cdef CPartitionPathFormat path + path.directory = tobytes(directory) + path.prefix = tobytes(prefix) + result = self.partitioning.Parse(path) return Expression.wrap(GetResultValue(result)) @property diff --git a/python/pyarrow/includes/libarrow_dataset.pxd b/python/pyarrow/includes/libarrow_dataset.pxd index bd7375aad81..71b76c6c48c 100644 --- a/python/pyarrow/includes/libarrow_dataset.pxd +++ b/python/pyarrow/includes/libarrow_dataset.pxd @@ -277,9 +277,13 @@ cdef extern from "arrow/dataset/api.h" namespace "arrow::dataset" nogil: CCSVConvertOptions convert_options CCSVReadOptions read_options + cdef cppclass CPartitionPathFormat "arrow::dataset::PartitionPathFormat": + c_string directory + c_string prefix + cdef cppclass CPartitioning "arrow::dataset::Partitioning": c_string type_name() const - CResult[CExpression] Parse(const c_string & directory, const c_string & prefix) const + CResult[CExpression] Parse(const CPartitionPathFormat & path) const const shared_ptr[CSchema] & schema() cdef cppclass CSegmentEncoding" arrow::dataset::SegmentEncoding": From 6d8c4f272ea7275fbf9403e0051bcdca62b86f45 Mon Sep 17 00:00:00 2001 From: Sanjiban Sengupta Date: Fri, 29 Apr 2022 21:52:55 +0530 Subject: [PATCH 09/24] docs: update docstring of StripPrefixAndFilename --- cpp/src/arrow/dataset/partition.cc | 3 ++- cpp/src/arrow/dataset/partition.h | 5 +++-- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/cpp/src/arrow/dataset/partition.cc b/cpp/src/arrow/dataset/partition.cc index 3a9294fe0d5..5fba5610381 100644 --- a/cpp/src/arrow/dataset/partition.cc +++ b/cpp/src/arrow/dataset/partition.cc @@ -810,7 +810,8 @@ PartitionPathFormat StripPrefixAndFilename(const std::string& path, auto maybe_base_less = fs::internal::RemoveAncestor(prefix, path); auto base_less = maybe_base_less ? std::string(*maybe_base_less) : path; auto basename_filename = fs::internal::GetAbstractPathParent(base_less); - return PartitionPathFormat{std::move(basename_filename.first), std::move(basename_filename.second)}; + return PartitionPathFormat{std::move(basename_filename.first), + std::move(basename_filename.second)}; } std::vector StripPrefixAndFilename(const std::vector& paths, diff --git a/cpp/src/arrow/dataset/partition.h b/cpp/src/arrow/dataset/partition.h index 897581809eb..e4846ef7d08 100644 --- a/cpp/src/arrow/dataset/partition.h +++ b/cpp/src/arrow/dataset/partition.h @@ -358,9 +358,10 @@ class ARROW_DS_EXPORT FilenamePartitioning : public KeyValuePartitioning { Result FormatValues(const ScalarVector& values) const override; }; -/// \brief Remove a prefix and the filename of a path. +/// \brief Extracts the directory and filename and removes the prefix of a path /// -/// e.g., `StripPrefixAndFilename("/data/year=2019/c.txt", "/data") -> "year=2019"` +/// e.g., `StripPrefixAndFilename("/data/year=2019/c.txt", "/data") -> +/// {"year=2019","c.txt"}` ARROW_DS_EXPORT PartitionPathFormat StripPrefixAndFilename(const std::string& path, const std::string& prefix); From 6876f21589aacee059178d67b597dff73428153f Mon Sep 17 00:00:00 2001 From: Sanjiban Sengupta Date: Fri, 29 Apr 2022 23:56:25 +0530 Subject: [PATCH 10/24] refactor: rename prefix to filename in PartitionPathFormat --- cpp/src/arrow/dataset/partition.cc | 4 ++-- cpp/src/arrow/dataset/partition.h | 2 +- cpp/src/arrow/dataset/partition_test.cc | 4 ++-- python/pyarrow/_dataset.pyx | 4 ++-- python/pyarrow/includes/libarrow_dataset.pxd | 2 +- 5 files changed, 8 insertions(+), 8 deletions(-) diff --git a/cpp/src/arrow/dataset/partition.cc b/cpp/src/arrow/dataset/partition.cc index 5fba5610381..6e636b88cc0 100644 --- a/cpp/src/arrow/dataset/partition.cc +++ b/cpp/src/arrow/dataset/partition.cc @@ -391,8 +391,8 @@ FilenamePartitioning::FilenamePartitioning(std::shared_ptr schema, Result> FilenamePartitioning::ParseKeys( const PartitionPathFormat& path) const { - std::vector segments = - fs::internal::SplitAbstractPath(StripNonPrefix(path.prefix), kFilenamePartitionSep); + std::vector segments = fs::internal::SplitAbstractPath( + StripNonPrefix(path.filename), kFilenamePartitionSep); return ParsePartitionSegments(segments); } diff --git a/cpp/src/arrow/dataset/partition.h b/cpp/src/arrow/dataset/partition.h index e4846ef7d08..e18c1d6ddbf 100644 --- a/cpp/src/arrow/dataset/partition.h +++ b/cpp/src/arrow/dataset/partition.h @@ -39,7 +39,7 @@ namespace dataset { constexpr char kFilenamePartitionSep = '_'; struct ARROW_DS_EXPORT PartitionPathFormat { - std::string directory, prefix; + std::string directory, filename; }; // ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/dataset/partition_test.cc b/cpp/src/arrow/dataset/partition_test.cc index 728ad83931a..157a9c3af44 100644 --- a/cpp/src/arrow/dataset/partition_test.cc +++ b/cpp/src/arrow/dataset/partition_test.cc @@ -58,12 +58,12 @@ class TestPartitioning : public ::testing::Test { } void AssertFormat(compute::Expression expr, const std::string& expected_directory, - const std::string& expected_prefix = "") { + const std::string& expected_filename = "") { // formatted partition expressions are bound to the schema of the dataset being // written ASSERT_OK_AND_ASSIGN(auto formatted, partitioning_->Format(expr)); ASSERT_EQ(formatted.directory, expected_directory); - ASSERT_EQ(formatted.prefix, expected_prefix); + ASSERT_EQ(formatted.filename, expected_filename); // ensure the formatted path round trips the relevant components of the partition // expression: roundtripped should be a subset of expr diff --git a/python/pyarrow/_dataset.pyx b/python/pyarrow/_dataset.pyx index 7fff11b9231..d75caeacdc4 100644 --- a/python/pyarrow/_dataset.pyx +++ b/python/pyarrow/_dataset.pyx @@ -1319,11 +1319,11 @@ cdef class Partitioning(_Weakrefable): cdef inline shared_ptr[CPartitioning] unwrap(self): return self.wrapped - def parse(self, directory="", prefix=""): + def parse(self, directory="", filename=""): cdef CResult[CExpression] result cdef CPartitionPathFormat path path.directory = tobytes(directory) - path.prefix = tobytes(prefix) + path.filename = tobytes(filename) result = self.partitioning.Parse(path) return Expression.wrap(GetResultValue(result)) diff --git a/python/pyarrow/includes/libarrow_dataset.pxd b/python/pyarrow/includes/libarrow_dataset.pxd index 71b76c6c48c..8a831a3a734 100644 --- a/python/pyarrow/includes/libarrow_dataset.pxd +++ b/python/pyarrow/includes/libarrow_dataset.pxd @@ -279,7 +279,7 @@ cdef extern from "arrow/dataset/api.h" namespace "arrow::dataset" nogil: cdef cppclass CPartitionPathFormat "arrow::dataset::PartitionPathFormat": c_string directory - c_string prefix + c_string filename cdef cppclass CPartitioning "arrow::dataset::Partitioning": c_string type_name() const From ec9357956bac0d19c0a66e1402214a7e1c4435ea Mon Sep 17 00:00:00 2001 From: Sanjiban Sengupta Date: Sat, 30 Apr 2022 03:38:43 +0530 Subject: [PATCH 11/24] feat: Inspect() methods to use PartitionPathFormat --- cpp/src/arrow/dataset/file_parquet.cc | 5 +- cpp/src/arrow/dataset/partition.cc | 32 ++-- cpp/src/arrow/dataset/partition.h | 9 +- cpp/src/arrow/dataset/partition_test.cc | 217 +++++++++++++----------- 4 files changed, 145 insertions(+), 118 deletions(-) diff --git a/cpp/src/arrow/dataset/file_parquet.cc b/cpp/src/arrow/dataset/file_parquet.cc index a1e9698538e..cd0f1973081 100644 --- a/cpp/src/arrow/dataset/file_parquet.cc +++ b/cpp/src/arrow/dataset/file_parquet.cc @@ -916,12 +916,11 @@ Result>> ParquetDatasetFactory::InspectSchem if (auto factory = options_.partitioning.factory()) { // Gather paths found in RowGroups' ColumnChunks. - std::vector stripped(paths_with_row_group_ids_.size()); + std::vector stripped(paths_with_row_group_ids_.size()); size_t i = 0; for (const auto& e : paths_with_row_group_ids_) { - stripped[i++] = - StripPrefixAndFilename(e.first, options_.partition_base_dir).directory; + stripped[i++] = StripPrefixAndFilename(e.first, options_.partition_base_dir); } ARROW_ASSIGN_OR_RAISE(auto partition_schema, factory->Inspect(stripped)); diff --git a/cpp/src/arrow/dataset/partition.cc b/cpp/src/arrow/dataset/partition.cc index 6e636b88cc0..5d7457141cf 100644 --- a/cpp/src/arrow/dataset/partition.cc +++ b/cpp/src/arrow/dataset/partition.cc @@ -581,10 +581,10 @@ class DirectoryPartitioningFactory : public KeyValuePartitioningFactory { std::string type_name() const override { return "directory"; } Result> Inspect( - const std::vector& paths) override { + const std::vector& paths) override { for (auto path : paths) { std::vector segments; - segments = fs::internal::SplitAbstractPath(path); + segments = fs::internal::SplitAbstractPath(path.directory); RETURN_NOT_OK(InspectPartitionSegments(segments, field_names_)); } return DoInspect(); @@ -628,11 +628,11 @@ class FilenamePartitioningFactory : public KeyValuePartitioningFactory { std::string type_name() const override { return "filename"; } Result> Inspect( - const std::vector& paths) override { + const std::vector& paths) override { for (const auto& path : paths) { std::vector segments; - segments = - fs::internal::SplitAbstractPath(StripNonPrefix(path), kFilenamePartitionSep); + segments = fs::internal::SplitAbstractPath(StripNonPrefix(path.filename), + kFilenamePartitionSep); RETURN_NOT_OK(InspectPartitionSegments(segments, field_names_)); } return DoInspect(); @@ -761,10 +761,10 @@ class HivePartitioningFactory : public KeyValuePartitioningFactory { std::string type_name() const override { return "hive"; } Result> Inspect( - const std::vector& paths) override { + const std::vector& paths) override { auto options = options_.AsHivePartitioningOptions(); for (auto path : paths) { - for (auto&& segment : fs::internal::SplitAbstractPath(path)) { + for (auto&& segment : fs::internal::SplitAbstractPath(path.directory)) { ARROW_ASSIGN_OR_RAISE(auto maybe_key, HivePartitioning::ParseKey(segment, options)); if (auto key = maybe_key) { @@ -814,28 +814,28 @@ PartitionPathFormat StripPrefixAndFilename(const std::string& path, std::move(basename_filename.second)}; } -std::vector StripPrefixAndFilename(const std::vector& paths, - const std::string& prefix) { - std::vector result; +std::vector StripPrefixAndFilename( + const std::vector& paths, const std::string& prefix) { + std::vector result; result.reserve(paths.size()); for (const auto& path : paths) { - result.emplace_back(StripPrefixAndFilename(path, prefix).directory); + result.emplace_back(StripPrefixAndFilename(path, prefix)); } return result; } -std::vector StripPrefixAndFilename(const std::vector& files, - const std::string& prefix) { - std::vector result; +std::vector StripPrefixAndFilename( + const std::vector& files, const std::string& prefix) { + std::vector result; result.reserve(files.size()); for (const auto& info : files) { - result.emplace_back(StripPrefixAndFilename(info.path(), prefix).directory); + result.emplace_back(StripPrefixAndFilename(info.path(), prefix)); } return result; } Result> PartitioningOrFactory::GetOrInferSchema( - const std::vector& paths) { + const std::vector& paths) { if (auto part = partitioning()) { return part->schema(); } diff --git a/cpp/src/arrow/dataset/partition.h b/cpp/src/arrow/dataset/partition.h index e18c1d6ddbf..fe20e837619 100644 --- a/cpp/src/arrow/dataset/partition.h +++ b/cpp/src/arrow/dataset/partition.h @@ -152,7 +152,7 @@ class ARROW_DS_EXPORT PartitioningFactory { /// Get the schema for the resulting Partitioning. /// This may reset internal state, for example dictionaries of unique representations. virtual Result> Inspect( - const std::vector& paths) = 0; + const std::vector& paths) = 0; /// Create a partitioning using the provided schema /// (fields may be dropped). @@ -366,11 +366,11 @@ ARROW_DS_EXPORT PartitionPathFormat StripPrefixAndFilename(const std::string& pa const std::string& prefix); /// \brief Vector version of StripPrefixAndFilename. -ARROW_DS_EXPORT std::vector StripPrefixAndFilename( +ARROW_DS_EXPORT std::vector StripPrefixAndFilename( const std::vector& paths, const std::string& prefix); /// \brief Vector version of StripPrefixAndFilename. -ARROW_DS_EXPORT std::vector StripPrefixAndFilename( +ARROW_DS_EXPORT std::vector StripPrefixAndFilename( const std::vector& files, const std::string& prefix); /// \brief Either a Partitioning or a PartitioningFactory @@ -397,7 +397,8 @@ class ARROW_DS_EXPORT PartitioningOrFactory { const std::shared_ptr& factory() const { return factory_; } /// \brief Get the partition schema, inferring it with the given factory if needed. - Result> GetOrInferSchema(const std::vector& paths); + Result> GetOrInferSchema( + const std::vector& paths); private: std::shared_ptr factory_; diff --git a/cpp/src/arrow/dataset/partition_test.cc b/cpp/src/arrow/dataset/partition_test.cc index 157a9c3af44..fe84da843cf 100644 --- a/cpp/src/arrow/dataset/partition_test.cc +++ b/cpp/src/arrow/dataset/partition_test.cc @@ -74,7 +74,7 @@ class TestPartitioning : public ::testing::Test { ASSERT_EQ(simplified, literal(true)); } - void AssertInspect(const std::vector& paths, + void AssertInspect(const std::vector& paths, const std::vector>& expected) { ASSERT_OK_AND_ASSIGN(auto actual, factory_->Inspect(paths)); ASSERT_EQ(*actual, Schema(expected)); @@ -122,7 +122,7 @@ class TestPartitioning : public ::testing::Test { AssertPartition(partitioning, record_batch, expected_batches, expected_expressions); } - void AssertInspectError(const std::vector& paths) { + void AssertInspectError(const std::vector& paths) { ASSERT_RAISES(Invalid, factory_->Inspect(paths)); } @@ -295,39 +295,39 @@ TEST_F(TestPartitioning, DiscoverSchemaDirectory) { factory_ = DirectoryPartitioning::MakeFactory({"alpha", "beta"}); // type is int32 if possible - AssertInspect({"/0/1"}, {Int("alpha"), Int("beta")}); + AssertInspect({{"/0/1", ""}}, {Int("alpha"), Int("beta")}); // extra segments are ignored - AssertInspect({"/0/1/what"}, {Int("alpha"), Int("beta")}); + AssertInspect({{"/0/1/what", ""}}, {Int("alpha"), Int("beta")}); // fall back to string if any segment for field alpha is not parseable as int - AssertInspect({"/0/1", "/hello/1"}, {Str("alpha"), Int("beta")}); + AssertInspect({{"/0/1", ""}, {"/hello/1", ""}}, {Str("alpha"), Int("beta")}); // If there are too many digits fall back to string - AssertInspect({"/3760212050/1"}, {Str("alpha"), Int("beta")}); + AssertInspect({{"/3760212050/1", ""}}, {Str("alpha"), Int("beta")}); // missing segment for beta doesn't cause an error or fallback - AssertInspect({"/0/1", "/hello"}, {Str("alpha"), Int("beta")}); + AssertInspect({{"/0/1", ""}, {"/hello", ""}}, {Str("alpha"), Int("beta")}); } TEST_F(TestPartitioning, DiscoverSchemaFilename) { factory_ = FilenamePartitioning::MakeFactory({"alpha", "beta"}); // type is int32 if possible - AssertInspect({"0_1_"}, {Int("alpha"), Int("beta")}); + AssertInspect({{"", "0_1_"}}, {Int("alpha"), Int("beta")}); // extra segments are ignored - AssertInspect({"0_1_what_"}, {Int("alpha"), Int("beta")}); + AssertInspect({{"", "0_1_what_"}}, {Int("alpha"), Int("beta")}); // fall back to string if any segment for field alpha is not parseable as int - AssertInspect({"0_1_", "hello_1_"}, {Str("alpha"), Int("beta")}); + AssertInspect({{"", "0_1_"}, {"", "hello_1_"}}, {Str("alpha"), Int("beta")}); // If there are too many digits fall back to string - AssertInspect({"3760212050_1_"}, {Str("alpha"), Int("beta")}); + AssertInspect({{"", "3760212050_1_"}}, {Str("alpha"), Int("beta")}); // Invalid syntax - AssertInspectError({"234-12"}); - AssertInspectError({"hello"}); + AssertInspectError({{"", "234-12"}}); + AssertInspectError({{"", "hello"}}); } TEST_F(TestPartitioning, DirectoryDictionaryInference) { @@ -336,18 +336,21 @@ TEST_F(TestPartitioning, DirectoryDictionaryInference) { factory_ = DirectoryPartitioning::MakeFactory({"alpha", "beta"}, options); // type is still int32 if possible - AssertInspect({"/0/1"}, {DictInt("alpha"), DictInt("beta")}); + AssertInspect({{"/0/1", ""}}, {DictInt("alpha"), DictInt("beta")}); // If there are too many digits fall back to string - AssertInspect({"/3760212050/1"}, {DictStr("alpha"), DictInt("beta")}); + AssertInspect({{"/3760212050/1", ""}}, {DictStr("alpha"), DictInt("beta")}); // successful dictionary inference - AssertInspect({"/a/0"}, {DictStr("alpha"), DictInt("beta")}); - AssertInspect({"/a/0", "/a/1"}, {DictStr("alpha"), DictInt("beta")}); - AssertInspect({"/a/0", "/a"}, {DictStr("alpha"), DictInt("beta")}); - AssertInspect({"/0/a", "/1"}, {DictInt("alpha"), DictStr("beta")}); - AssertInspect({"/a/0", "/b/0", "/a/1", "/b/1"}, {DictStr("alpha"), DictInt("beta")}); - AssertInspect({"/a/-", "/b/-", "/a/_", "/b/_"}, {DictStr("alpha"), DictStr("beta")}); + AssertInspect({{"/a/0", ""}}, {DictStr("alpha"), DictInt("beta")}); + AssertInspect({{"/a/0", ""}, {"/a/1", ""}}, {DictStr("alpha"), DictInt("beta")}); + AssertInspect({{"/a/0", ""}, {"/a", ""}}, {DictStr("alpha"), DictInt("beta")}); + AssertInspect({{"/a/0", ""}, {"/b/0", ""}, {"/a/1", ""}, {"/b/1", ""}}, + {DictStr("alpha"), DictInt("beta")}); + AssertInspect({{"/a/-", ""}, {"/b/-", ""}, {"/a/_", ""}, {"/b/_", ""}}, + {DictStr("alpha"), DictStr("beta")}); + AssertInspect({{"/a/-", ""}, {"/b/-", ""}, {"/a/_", ""}, {"/b/_", ""}}, + {DictStr("alpha"), DictStr("beta")}); } TEST_F(TestPartitioning, FilenameDictionaryInference) { @@ -356,15 +359,16 @@ TEST_F(TestPartitioning, FilenameDictionaryInference) { factory_ = FilenamePartitioning::MakeFactory({"alpha", "beta"}, options); // type is still int32 if possible - AssertInspect({"0_1_"}, {DictInt("alpha"), DictInt("beta")}); + AssertInspect({{"", "0_1_"}}, {DictInt("alpha"), DictInt("beta")}); // If there are too many digits fall back to string - AssertInspect({"3760212050_1_"}, {DictStr("alpha"), DictInt("beta")}); + AssertInspect({{"", "3760212050_1_"}}, {DictStr("alpha"), DictInt("beta")}); // successful dictionary inference - AssertInspect({"a_0_"}, {DictStr("alpha"), DictInt("beta")}); - AssertInspect({"a_0_", "a_1_"}, {DictStr("alpha"), DictInt("beta")}); - AssertInspect({"a_0_", "b_0_", "a_1_", "b_1_"}, {DictStr("alpha"), DictInt("beta")}); + AssertInspect({{"", "a_0_"}}, {DictStr("alpha"), DictInt("beta")}); + AssertInspect({{"", "a_0_"}, {"", "a_1_"}}, {DictStr("alpha"), DictInt("beta")}); + AssertInspect({{"", "a_0_"}, {"", "b_0_"}, {"", "a_1_"}, {"", "b_1_"}}, + {DictStr("alpha"), DictInt("beta")}); } TEST_F(TestPartitioning, DictionaryHasUniqueValues) { @@ -373,7 +377,8 @@ TEST_F(TestPartitioning, DictionaryHasUniqueValues) { factory_ = DirectoryPartitioning::MakeFactory({"alpha"}, options); auto alpha = DictStr("alpha"); - AssertInspect({"/a", "/b", "/a", "/b", "/c", "/a"}, {alpha}); + AssertInspect({{"/a", ""}, {"/b", ""}, {"/a", ""}, {"/b", ""}, {"/c", ""}, {"/a", ""}}, + {alpha}); ASSERT_OK_AND_ASSIGN(auto partitioning, factory_->Finish(schema({alpha}))); auto expected_dictionary = @@ -395,7 +400,7 @@ TEST_F(TestPartitioning, DictionaryHasUniqueValues) { TEST_F(TestPartitioning, DiscoverSchemaSegfault) { // ARROW-7638 factory_ = DirectoryPartitioning::MakeFactory({"alpha", "beta"}); - AssertInspectError({"oops.txt"}); + AssertInspectError({{"oops.txt", ""}}); } TEST_F(TestPartitioning, HivePartitioning) { @@ -481,27 +486,29 @@ TEST_F(TestPartitioning, DiscoverHiveSchema) { factory_ = HivePartitioning::MakeFactory(options); // type is int32 if possible - AssertInspect({"/alpha=0/beta=1"}, {Int("alpha"), Int("beta")}); + AssertInspect({{"/alpha=0/beta=1", ""}}, {Int("alpha"), Int("beta")}); // extra segments are ignored - AssertInspect({"/gamma=0/unexpected/delta=1/dat.parquet"}, + AssertInspect({{"/gamma=0/unexpected/delta=1/dat.parquet", ""}}, {Int("gamma"), Int("delta")}); // schema field names are in order of first occurrence // (...so ensure your partitions are ordered the same for all paths) - AssertInspect({"/alpha=0/beta=1", "/beta=2/alpha=3"}, {Int("alpha"), Int("beta")}); + AssertInspect({{"/alpha=0/beta=1", ""}, {"/beta=2/alpha=3", ""}}, + {Int("alpha"), Int("beta")}); // Null fallback strings shouldn't interfere with type inference - AssertInspect({"/alpha=xyz/beta=x", "/alpha=7/beta=xyz"}, {Int("alpha"), Str("beta")}); + AssertInspect({{"/alpha=xyz/beta=x", ""}, {"/alpha=7/beta=xyz", ""}}, + {Int("alpha"), Str("beta")}); // Cannot infer if the only values are null - AssertInspectError({"/alpha=xyz"}); + AssertInspectError({{"/alpha=xyz", ""}}); // If there are too many digits fall back to string - AssertInspect({"/alpha=3760212050"}, {Str("alpha")}); + AssertInspect({{"/alpha=3760212050", ""}}, {Str("alpha")}); // missing path segments will not cause an error - AssertInspect({"/alpha=0/beta=1", "/beta=2/alpha=3", "/gamma=what"}, + AssertInspect({{"/alpha=0/beta=1", ""}, {"/beta=2/alpha=3", ""}, {"/gamma=what", ""}}, {Int("alpha"), Int("beta"), Str("gamma")}); } @@ -512,22 +519,27 @@ TEST_F(TestPartitioning, HiveDictionaryInference) { factory_ = HivePartitioning::MakeFactory(options); // type is still int32 if possible - AssertInspect({"/alpha=0/beta=1"}, {DictInt("alpha"), DictInt("beta")}); + AssertInspect({{"/alpha=0/beta=1", ""}}, {DictInt("alpha"), DictInt("beta")}); // If there are too many digits fall back to string - AssertInspect({"/alpha=3760212050"}, {DictStr("alpha")}); + AssertInspect({{"/alpha=3760212050", ""}}, {DictStr("alpha")}); // successful dictionary inference - AssertInspect({"/alpha=a/beta=0"}, {DictStr("alpha"), DictInt("beta")}); - AssertInspect({"/alpha=a/beta=0", "/alpha=a/1"}, {DictStr("alpha"), DictInt("beta")}); - AssertInspect({"/alpha=a/beta=0", "/alpha=xyz/beta=xyz"}, + AssertInspect({{"/alpha=a/beta=0", ""}}, {DictStr("alpha"), DictInt("beta")}); + AssertInspect({{"/alpha=a/beta=0", ""}, {"/alpha=a/1", ""}}, {DictStr("alpha"), DictInt("beta")}); - AssertInspect( - {"/alpha=a/beta=0", "/alpha=b/beta=0", "/alpha=a/beta=1", "/alpha=b/beta=1"}, - {DictStr("alpha"), DictInt("beta")}); - AssertInspect( - {"/alpha=a/beta=-", "/alpha=b/beta=-", "/alpha=a/beta=_", "/alpha=b/beta=_"}, - {DictStr("alpha"), DictStr("beta")}); + AssertInspect({{"/alpha=a/beta=0", ""}, {"/alpha=xyz/beta=xyz", ""}}, + {DictStr("alpha"), DictInt("beta")}); + AssertInspect({{"/alpha=a/beta=0", ""}, + {"/alpha=b/beta=0", ""}, + {"/alpha=a/beta=1", ""}, + {"/alpha=b/beta=1", ""}}, + {DictStr("alpha"), DictInt("beta")}); + AssertInspect({{"/alpha=a/beta=-", ""}, + {"/alpha=b/beta=-", ""}, + {"/alpha=a/beta=_", ""}, + {"/alpha=b/beta=_", ""}}, + {DictStr("alpha"), DictStr("beta")}); } TEST_F(TestPartitioning, HiveNullFallbackPassedOn) { @@ -535,7 +547,7 @@ TEST_F(TestPartitioning, HiveNullFallbackPassedOn) { options.null_fallback = "xyz"; factory_ = HivePartitioning::MakeFactory(options); - EXPECT_OK_AND_ASSIGN(auto schema, factory_->Inspect({"/alpha=a/beta=0"})); + EXPECT_OK_AND_ASSIGN(auto schema, factory_->Inspect({{"/alpha=a/beta=0", ""}})); EXPECT_OK_AND_ASSIGN(auto partitioning, factory_->Finish(schema)); ASSERT_EQ("xyz", std::static_pointer_cast(partitioning)->null_fallback()); @@ -547,7 +559,12 @@ TEST_F(TestPartitioning, HiveDictionaryHasUniqueValues) { factory_ = HivePartitioning::MakeFactory(options); auto alpha = DictStr("alpha"); - AssertInspect({"/alpha=a", "/alpha=b", "/alpha=a", "/alpha=b", "/alpha=c", "/alpha=a"}, + AssertInspect({{"/alpha=a", ""}, + {"/alpha=b", ""}, + {"/alpha=a", ""}, + {"/alpha=b", ""}, + {"/alpha=c", ""}, + {"/alpha=a", ""}}, {alpha}); ASSERT_OK_AND_ASSIGN(auto partitioning, factory_->Finish(schema({alpha}))); @@ -574,22 +591,23 @@ TEST_F(TestPartitioning, ExistingSchemaDirectory) { options.schema = schema({field("alpha", int64()), field("beta", dict_type)}); factory_ = DirectoryPartitioning::MakeFactory({"alpha", "beta"}, options); - AssertInspect({"/0/1"}, options.schema->fields()); - AssertInspect({"/0/1/what"}, options.schema->fields()); + AssertInspect({{"/0/1", ""}}, options.schema->fields()); + AssertInspect({{"/0/1/what", ""}}, options.schema->fields()); // fail if any segment is not parseable as schema type EXPECT_RAISES_WITH_MESSAGE_THAT(Invalid, ::testing::HasSubstr("Failed to parse string"), - factory_->Inspect({"/0/1", "/hello/1"})); + factory_->Inspect({{"/0/1", ""}, {"/hello/1", ""}})); factory_ = DirectoryPartitioning::MakeFactory({"alpha", "beta"}, options); // Now we don't fail since our type is large enough - AssertInspect({"/3760212050/1"}, options.schema->fields()); + AssertInspect({{"/3760212050/1", ""}}, options.schema->fields()); // If there are still too many digits, fail - EXPECT_RAISES_WITH_MESSAGE_THAT(Invalid, ::testing::HasSubstr("Failed to parse string"), - factory_->Inspect({"/1038581385102940193760212050/1"})); + EXPECT_RAISES_WITH_MESSAGE_THAT( + Invalid, ::testing::HasSubstr("Failed to parse string"), + factory_->Inspect({{"/1038581385102940193760212050/1", ""}})); factory_ = DirectoryPartitioning::MakeFactory({"alpha", "beta"}, options); - AssertInspect({"/0/1", "/2"}, options.schema->fields()); + AssertInspect({{"/0/1", ""}, {"/2", ""}}, options.schema->fields()); } TEST_F(TestPartitioning, ExistingSchemaHive) { @@ -599,33 +617,33 @@ TEST_F(TestPartitioning, ExistingSchemaHive) { options.schema = schema({field("a", int64()), field("b", dict_type)}); factory_ = HivePartitioning::MakeFactory(options); - AssertInspect({"/a=0/b=1"}, options.schema->fields()); - AssertInspect({"/a=0/b=1/what"}, options.schema->fields()); - AssertInspect({"/a=0", "/b=1"}, options.schema->fields()); + AssertInspect({{"/a=0/b=1", ""}}, options.schema->fields()); + AssertInspect({{"/a=0/b=1/what", ""}}, options.schema->fields()); + AssertInspect({{"/a=0", ""}, {"/b=1", ""}}, options.schema->fields()); // fail if any segment for field alpha is not parseable as schema type EXPECT_RAISES_WITH_MESSAGE_THAT( Invalid, ::testing::HasSubstr( "Could not cast segments for partition field a to requested type int64"), - factory_->Inspect({"/a=0/b=1", "/a=hello/b=1"})); + factory_->Inspect({{"/a=0/b=1", ""}, {"/a=hello/b=1", ""}})); factory_ = HivePartitioning::MakeFactory(options); EXPECT_RAISES_WITH_MESSAGE_THAT( Invalid, ::testing::HasSubstr("Requested schema has 2 fields, but only 1 were detected"), - factory_->Inspect({"/a=0", "/a=hello"})); + factory_->Inspect({{"/a=0", ""}, {"/a=hello", ""}})); factory_ = HivePartitioning::MakeFactory(options); // Now we don't fail since our type is large enough - AssertInspect({"/a=3760212050/b=1"}, options.schema->fields()); + AssertInspect({{"/a=3760212050/b=1", ""}}, options.schema->fields()); // If there are still too many digits, fail EXPECT_RAISES_WITH_MESSAGE_THAT( Invalid, ::testing::HasSubstr("Failed to parse string"), - factory_->Inspect({"/a=1038581385102940193760212050/b=1"})); + factory_->Inspect({{"/a=1038581385102940193760212050/b=1", ""}})); factory_ = HivePartitioning::MakeFactory(options); - AssertInspect({"/a=0/b=1", "/b=2"}, options.schema->fields()); + AssertInspect({{"/a=0/b=1", ""}, {"/b=2", ""}}, options.schema->fields()); } TEST_F(TestPartitioning, ExistingSchemaFilename) { @@ -635,22 +653,23 @@ TEST_F(TestPartitioning, ExistingSchemaFilename) { options.schema = schema({field("alpha", int64()), field("beta", dict_type)}); factory_ = FilenamePartitioning::MakeFactory({"alpha", "beta"}, options); - AssertInspect({"0_1_"}, options.schema->fields()); - AssertInspect({"0_1_what_"}, options.schema->fields()); + AssertInspect({{"", "0_1_"}}, options.schema->fields()); + AssertInspect({{"", "0_1_what_"}}, options.schema->fields()); // fail if any segment is not parseable as schema type EXPECT_RAISES_WITH_MESSAGE_THAT(Invalid, ::testing::HasSubstr("Failed to parse string"), - factory_->Inspect({"0_1_", "hello_1_"})); + factory_->Inspect({{"", "0_1_"}, {"", "hello_1_"}})); factory_ = FilenamePartitioning::MakeFactory({"alpha", "beta"}, options); // Now we don't fail since our type is large enough - AssertInspect({"3760212050_1_"}, options.schema->fields()); + AssertInspect({{"", "3760212050_1_"}}, options.schema->fields()); // If there are still too many digits, fail - EXPECT_RAISES_WITH_MESSAGE_THAT(Invalid, ::testing::HasSubstr("Failed to parse string"), - factory_->Inspect({"1038581385102940193760212050_1_"})); + EXPECT_RAISES_WITH_MESSAGE_THAT( + Invalid, ::testing::HasSubstr("Failed to parse string"), + factory_->Inspect({{"", "1038581385102940193760212050_1_"}})); factory_ = FilenamePartitioning::MakeFactory({"alpha", "beta"}, options); - AssertInspect({"0_1_", "2_"}, options.schema->fields()); + AssertInspect({{"", "0_1_"}, {"", "2_"}}, options.schema->fields()); } TEST_F(TestPartitioning, UrlEncodedDirectory) { @@ -659,8 +678,8 @@ TEST_F(TestPartitioning, UrlEncodedDirectory) { options.schema = schema({field("date", ts), field("time", ts), field("str", utf8())}); factory_ = DirectoryPartitioning::MakeFactory(options.schema->field_names(), options); - AssertInspect({"/2021-05-04 00:00:00/2021-05-04 07:27:00/%24", - "/2021-05-04 00%3A00%3A00/2021-05-04 07%3A27%3A00/foo"}, + AssertInspect({{"/2021-05-04 00:00:00/2021-05-04 07:27:00/%24", ""}, + {"/2021-05-04 00%3A00%3A00/2021-05-04 07%3A27%3A00/foo", ""}}, options.schema->fields()); auto date = std::make_shared(1620086400, ts); auto time = std::make_shared(1620113220, ts); @@ -672,7 +691,7 @@ TEST_F(TestPartitioning, UrlEncodedDirectory) { // Invalid UTF-8 EXPECT_RAISES_WITH_MESSAGE_THAT(Invalid, ::testing::HasSubstr("was not valid UTF-8"), - factory_->Inspect({"/%AF/%BF/%CF"})); + factory_->Inspect({{"/%AF/%BF/%CF", ""}})); EXPECT_RAISES_WITH_MESSAGE_THAT(Invalid, ::testing::HasSubstr("was not valid UTF-8"), partitioning_->Parse({"/%AF/%BF/%CF", ""})); @@ -680,8 +699,8 @@ TEST_F(TestPartitioning, UrlEncodedDirectory) { options.schema = schema({field("date", utf8()), field("time", utf8()), field("str", utf8())}); factory_ = DirectoryPartitioning::MakeFactory(options.schema->field_names(), options); - AssertInspect({"/2021-05-04 00:00:00/2021-05-04 07:27:00/%E3%81%8F%E3%81%BE", - "/2021-05-04 00%3A00%3A00/2021-05-04 07%3A27%3A00/foo"}, + AssertInspect({{"/2021-05-04 00:00:00/2021-05-04 07:27:00/%E3%81%8F%E3%81%BE", ""}, + {"/2021-05-04 00%3A00%3A00/2021-05-04 07%3A27%3A00/foo", ""}}, options.schema->fields()); partitioning_ = std::make_shared( options.schema, ArrayVector(), options.AsPartitioningOptions()); @@ -699,9 +718,9 @@ TEST_F(TestPartitioning, UrlEncodedHive) { factory_ = HivePartitioning::MakeFactory(options); AssertInspect( - {"/date=2021-05-04 00:00:00/time=2021-05-04 07:27:00/str=$", - "/date=2021-05-04 00:00:00/time=2021-05-04 07:27:00/str=%E3%81%8F%E3%81%BE", - "/date=2021-05-04 00%3A00%3A00/time=2021-05-04 07%3A27%3A00/str=%24"}, + {{"/date=2021-05-04 00:00:00/time=2021-05-04 07:27:00/str=$", ""}, + {"/date=2021-05-04 00:00:00/time=2021-05-04 07:27:00/str=%E3%81%8F%E3%81%BE", ""}, + {"/date=2021-05-04 00%3A00%3A00/time=2021-05-04 07%3A27%3A00/str=%24", ""}}, options.schema->fields()); auto date = std::make_shared(1620086400, ts); @@ -722,8 +741,9 @@ TEST_F(TestPartitioning, UrlEncodedHive) { equal(field_ref("time"), literal(time)), is_null(field_ref("str"))})); // Invalid UTF-8 - EXPECT_RAISES_WITH_MESSAGE_THAT(Invalid, ::testing::HasSubstr("was not valid UTF-8"), - factory_->Inspect({"/date=%AF/time=%BF/str=%CF"})); + EXPECT_RAISES_WITH_MESSAGE_THAT( + Invalid, ::testing::HasSubstr("was not valid UTF-8"), + factory_->Inspect({{"/date=%AF/time=%BF/str=%CF", ""}})); EXPECT_RAISES_WITH_MESSAGE_THAT( Invalid, ::testing::HasSubstr("was not valid UTF-8"), partitioning_->Parse({"/date=%AF/time=%BF/str=%CF", ""})); @@ -733,9 +753,9 @@ TEST_F(TestPartitioning, UrlEncodedHive) { schema({field("date", utf8()), field("time", utf8()), field("str", utf8())}); factory_ = HivePartitioning::MakeFactory(options); AssertInspect( - {"/date=2021-05-04 00:00:00/time=2021-05-04 07:27:00/str=$", - "/date=2021-05-04 00:00:00/time=2021-05-04 07:27:00/str=%E3%81%8F%E3%81%BE", - "/date=2021-05-04 00%3A00%3A00/time=2021-05-04 07%3A27%3A00/str=%24"}, + {{"/date=2021-05-04 00:00:00/time=2021-05-04 07:27:00/str=$", ""}, + {"/date=2021-05-04 00:00:00/time=2021-05-04 07:27:00/str=%E3%81%8F%E3%81%BE", ""}, + {"/date=2021-05-04 00%3A00%3A00/time=2021-05-04 07%3A27%3A00/str=%24", ""}}, options.schema->fields()); partitioning_ = std::make_shared(options.schema, ArrayVector(), options.AsHivePartitioningOptions()); @@ -745,8 +765,9 @@ TEST_F(TestPartitioning, UrlEncodedHive) { equal(field_ref("str"), literal("%24"))})); // Invalid UTF-8 - EXPECT_RAISES_WITH_MESSAGE_THAT(Invalid, ::testing::HasSubstr("was not valid UTF-8"), - factory_->Inspect({"/date=\xAF/time=\xBF/str=\xCF"})); + EXPECT_RAISES_WITH_MESSAGE_THAT( + Invalid, ::testing::HasSubstr("was not valid UTF-8"), + factory_->Inspect({{"/date=\xAF/time=\xBF/str=\xCF", ""}})); EXPECT_RAISES_WITH_MESSAGE_THAT( Invalid, ::testing::HasSubstr("was not valid UTF-8"), partitioning_->Parse({"/date=\xAF/time=\xBF/str=\xCF", ""})); @@ -760,12 +781,15 @@ TEST_F(TestPartitioning, UrlEncodedHiveWithKeyEncoded) { options.null_fallback = "$"; factory_ = HivePartitioning::MakeFactory(options); - AssertInspect({"/test%27%3B%20date=2021-05-04 00:00:00/test%27%3B%20time=2021-05-04 " - "07:27:00/str=$", - "/test%27%3B%20date=2021-05-04 00:00:00/test%27%3B%20time=2021-05-04 " - "07:27:00/str=%E3%81%8F%E3%81%BE", - "/test%27%3B%20date=2021-05-04 " - "00%3A00%3A00/test%27%3B%20time=2021-05-04 07%3A27%3A00/str=%24"}, + AssertInspect({{"/test%27%3B%20date=2021-05-04 00:00:00/test%27%3B%20time=2021-05-04 " + "07:27:00/str=$", + ""}, + {"/test%27%3B%20date=2021-05-04 00:00:00/test%27%3B%20time=2021-05-04 " + "07:27:00/str=%E3%81%8F%E3%81%BE", + ""}, + {"/test%27%3B%20date=2021-05-04 " + "00%3A00%3A00/test%27%3B%20time=2021-05-04 07%3A27%3A00/str=%24", + ""}}, options.schema->fields()); auto date = std::make_shared(1620086400, ts); @@ -795,7 +819,7 @@ TEST_F(TestPartitioning, UrlEncodedHiveWithKeyEncoded) { // Invalid UTF-8 EXPECT_RAISES_WITH_MESSAGE_THAT( Invalid, ::testing::HasSubstr("was not valid UTF-8"), - factory_->Inspect({"/%AF=2021-05-04/time=2021-05-04 07%3A27%3A00/str=%24"})); + factory_->Inspect({{"/%AF=2021-05-04/time=2021-05-04 07%3A27%3A00/str=%24", ""}})); EXPECT_RAISES_WITH_MESSAGE_THAT( Invalid, ::testing::HasSubstr("was not valid UTF-8"), partitioning_->Parse({"/%AF=2021-05-04/%BF=2021-05-04 07%3A27%3A00/str=%24", ""})); @@ -975,9 +999,12 @@ TEST(TestStripPrefixAndFilename, Basic) { std::vector input{"/data/year=2019/file.parquet", "/data/year=2019/month=12/file.parquet", "/data/year=2019/month=12/day=01/file.parquet"}; - EXPECT_THAT(StripPrefixAndFilename(input, "/data"), - testing::ElementsAre("year=2019", "year=2019/month=12", - "year=2019/month=12/day=01")); + auto paths = StripPrefixAndFilename(input, "/data"); + std::vector path_directories; + std::transform(paths.begin(), paths.end(), std::back_inserter(path_directories), + [](PartitionPathFormat const& path) { return path.directory; }); + EXPECT_THAT(path_directories, testing::ElementsAre("year=2019", "year=2019/month=12", + "year=2019/month=12/day=01")); } } // namespace dataset From 83ea5fdc0d733734d93c2b60a2c84f5dc6e48f14 Mon Sep 17 00:00:00 2001 From: Sanjiban Sengupta Date: Sat, 30 Apr 2022 04:38:07 +0530 Subject: [PATCH 12/24] fix: use PartitionPathFormat in dataset___PartitioningFactory__Inspect for R --- r/src/dataset.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/r/src/dataset.cpp b/r/src/dataset.cpp index aa44a0721e3..2abc7af2953 100644 --- a/r/src/dataset.cpp +++ b/r/src/dataset.cpp @@ -386,7 +386,7 @@ std::shared_ptr dataset___HivePartitioning__MakeFactory // [[dataset::export]] std::shared_ptr dataset___PartitioningFactory__Inspect( const std::shared_ptr& factory, - const std::vector& paths) { + const std::vector& paths) { return ValueOrStop(factory->Inspect(paths)); } From 5b5cf95082cc5f41c8f5082e6cb203460831706b Mon Sep 17 00:00:00 2001 From: Sanjiban Sengupta Date: Tue, 3 May 2022 21:51:43 +0530 Subject: [PATCH 13/24] fix: using PartitionPathFormat in arrowExports for R --- .RData | Bin 0 -> 48 bytes r/src/arrowExports.cpp | 4 ++-- r/src/arrow_types.h | 1 + 3 files changed, 3 insertions(+), 2 deletions(-) create mode 100644 .RData diff --git a/.RData b/.RData new file mode 100644 index 0000000000000000000000000000000000000000..e9cee15b7b2568b8d14df240eb4cb675ca76b36d GIT binary patch literal 48 zcmb2|=3oE=X6~X+gJ)e2k`fXU(h?GxCarN$W6sX#n7xj5!bg6F?=#lDQ~)Xj09iQ@ A4FCWD literal 0 HcmV?d00001 diff --git a/r/src/arrowExports.cpp b/r/src/arrowExports.cpp index 312b778aada..c878225ed25 100644 --- a/r/src/arrowExports.cpp +++ b/r/src/arrowExports.cpp @@ -1767,11 +1767,11 @@ extern "C" SEXP _arrow_dataset___HivePartitioning__MakeFactory(SEXP null_fallbac // dataset.cpp #if defined(ARROW_R_WITH_DATASET) -std::shared_ptr dataset___PartitioningFactory__Inspect(const std::shared_ptr& factory, const std::vector& paths); +std::shared_ptr dataset___PartitioningFactory__Inspect(const std::shared_ptr& factory, const std::vector& paths); extern "C" SEXP _arrow_dataset___PartitioningFactory__Inspect(SEXP factory_sexp, SEXP paths_sexp){ BEGIN_CPP11 arrow::r::Input&>::type factory(factory_sexp); - arrow::r::Input&>::type paths(paths_sexp); + arrow::r::Input&>::type paths(paths_sexp); return cpp11::as_sexp(dataset___PartitioningFactory__Inspect(factory, paths)); END_CPP11 } diff --git a/r/src/arrow_types.h b/r/src/arrow_types.h index d9fee37e7f1..5a931ccb8de 100644 --- a/r/src/arrow_types.h +++ b/r/src/arrow_types.h @@ -36,6 +36,7 @@ #if defined(ARROW_R_WITH_DATASET) #include +#include #endif #include From c5260651d636b0a010e624182b495f79bf6615a5 Mon Sep 17 00:00:00 2001 From: Sanjiban Sengupta Date: Wed, 4 May 2022 07:28:38 +0530 Subject: [PATCH 14/24] fix: creating PartitionPathFormat object only in dataset___PartitioningFactory__Inspect --- r/src/arrowExports.cpp | 4 ++-- r/src/arrow_types.h | 1 - r/src/dataset.cpp | 8 ++++++-- 3 files changed, 8 insertions(+), 5 deletions(-) diff --git a/r/src/arrowExports.cpp b/r/src/arrowExports.cpp index c878225ed25..14eb6eb397a 100644 --- a/r/src/arrowExports.cpp +++ b/r/src/arrowExports.cpp @@ -1767,11 +1767,11 @@ extern "C" SEXP _arrow_dataset___HivePartitioning__MakeFactory(SEXP null_fallbac // dataset.cpp #if defined(ARROW_R_WITH_DATASET) -std::shared_ptr dataset___PartitioningFactory__Inspect(const std::shared_ptr& factory, const std::vector& paths); +std::shared_ptr dataset___PartitioningFactory__Inspect(const std::shared_ptr& factory, const std::vector>& paths); extern "C" SEXP _arrow_dataset___PartitioningFactory__Inspect(SEXP factory_sexp, SEXP paths_sexp){ BEGIN_CPP11 arrow::r::Input&>::type factory(factory_sexp); - arrow::r::Input&>::type paths(paths_sexp); + arrow::r::Input>&>::type paths(paths_sexp); return cpp11::as_sexp(dataset___PartitioningFactory__Inspect(factory, paths)); END_CPP11 } diff --git a/r/src/arrow_types.h b/r/src/arrow_types.h index 5a931ccb8de..d9fee37e7f1 100644 --- a/r/src/arrow_types.h +++ b/r/src/arrow_types.h @@ -36,7 +36,6 @@ #if defined(ARROW_R_WITH_DATASET) #include -#include #endif #include diff --git a/r/src/dataset.cpp b/r/src/dataset.cpp index 2abc7af2953..7a3273e9abc 100644 --- a/r/src/dataset.cpp +++ b/r/src/dataset.cpp @@ -386,8 +386,12 @@ std::shared_ptr dataset___HivePartitioning__MakeFactory // [[dataset::export]] std::shared_ptr dataset___PartitioningFactory__Inspect( const std::shared_ptr& factory, - const std::vector& paths) { - return ValueOrStop(factory->Inspect(paths)); + const std::vector>& paths) { + std::vector partition_paths(paths.size()); + for (auto& path : paths) { + partition_paths.emplace_back(ds::PartitionPathFormat{path.first, path.second}); + } + return ValueOrStop(factory->Inspect(partition_paths)); } // [[dataset::export]] From c7d1136eddb4b0576995a050077e1df61ca01218 Mon Sep 17 00:00:00 2001 From: Sanjiban Sengupta Date: Thu, 5 May 2022 13:49:22 +0530 Subject: [PATCH 15/24] fix: Remove R implementation, instead put check in InspectSchemas --- .RData | Bin 48 -> 0 bytes cpp/src/arrow/dataset/file_parquet.cc | 15 +- cpp/src/arrow/dataset/partition.cc | 32 ++-- cpp/src/arrow/dataset/partition.h | 9 +- cpp/src/arrow/dataset/partition_test.cc | 217 +++++++++++------------- r/autobrew | 59 +++++++ r/src/arrowExports.cpp | 4 +- r/src/arrow_types.h | 1 + r/src/dataset.cpp | 8 +- 9 files changed, 191 insertions(+), 154 deletions(-) delete mode 100644 .RData create mode 100644 r/autobrew diff --git a/.RData b/.RData deleted file mode 100644 index e9cee15b7b2568b8d14df240eb4cb675ca76b36d..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 48 zcmb2|=3oE=X6~X+gJ)e2k`fXU(h?GxCarN$W6sX#n7xj5!bg6F?=#lDQ~)Xj09iQ@ A4FCWD diff --git a/cpp/src/arrow/dataset/file_parquet.cc b/cpp/src/arrow/dataset/file_parquet.cc index cd0f1973081..4b1ded92a4a 100644 --- a/cpp/src/arrow/dataset/file_parquet.cc +++ b/cpp/src/arrow/dataset/file_parquet.cc @@ -916,11 +916,20 @@ Result>> ParquetDatasetFactory::InspectSchem if (auto factory = options_.partitioning.factory()) { // Gather paths found in RowGroups' ColumnChunks. - std::vector stripped(paths_with_row_group_ids_.size()); + std::vector stripped(paths_with_row_group_ids_.size()); size_t i = 0; - for (const auto& e : paths_with_row_group_ids_) { - stripped[i++] = StripPrefixAndFilename(e.first, options_.partition_base_dir); + + if (options_.partitioning.factory()->type_name() == "filename") { + for (const auto& e : paths_with_row_group_ids_) { + stripped[i++] = + StripPrefixAndFilename(e.first, options_.partition_base_dir).filename; + } + } else { + for (const auto& e : paths_with_row_group_ids_) { + stripped[i++] = + StripPrefixAndFilename(e.first, options_.partition_base_dir).directory; + } } ARROW_ASSIGN_OR_RAISE(auto partition_schema, factory->Inspect(stripped)); diff --git a/cpp/src/arrow/dataset/partition.cc b/cpp/src/arrow/dataset/partition.cc index 5d7457141cf..6e636b88cc0 100644 --- a/cpp/src/arrow/dataset/partition.cc +++ b/cpp/src/arrow/dataset/partition.cc @@ -581,10 +581,10 @@ class DirectoryPartitioningFactory : public KeyValuePartitioningFactory { std::string type_name() const override { return "directory"; } Result> Inspect( - const std::vector& paths) override { + const std::vector& paths) override { for (auto path : paths) { std::vector segments; - segments = fs::internal::SplitAbstractPath(path.directory); + segments = fs::internal::SplitAbstractPath(path); RETURN_NOT_OK(InspectPartitionSegments(segments, field_names_)); } return DoInspect(); @@ -628,11 +628,11 @@ class FilenamePartitioningFactory : public KeyValuePartitioningFactory { std::string type_name() const override { return "filename"; } Result> Inspect( - const std::vector& paths) override { + const std::vector& paths) override { for (const auto& path : paths) { std::vector segments; - segments = fs::internal::SplitAbstractPath(StripNonPrefix(path.filename), - kFilenamePartitionSep); + segments = + fs::internal::SplitAbstractPath(StripNonPrefix(path), kFilenamePartitionSep); RETURN_NOT_OK(InspectPartitionSegments(segments, field_names_)); } return DoInspect(); @@ -761,10 +761,10 @@ class HivePartitioningFactory : public KeyValuePartitioningFactory { std::string type_name() const override { return "hive"; } Result> Inspect( - const std::vector& paths) override { + const std::vector& paths) override { auto options = options_.AsHivePartitioningOptions(); for (auto path : paths) { - for (auto&& segment : fs::internal::SplitAbstractPath(path.directory)) { + for (auto&& segment : fs::internal::SplitAbstractPath(path)) { ARROW_ASSIGN_OR_RAISE(auto maybe_key, HivePartitioning::ParseKey(segment, options)); if (auto key = maybe_key) { @@ -814,28 +814,28 @@ PartitionPathFormat StripPrefixAndFilename(const std::string& path, std::move(basename_filename.second)}; } -std::vector StripPrefixAndFilename( - const std::vector& paths, const std::string& prefix) { - std::vector result; +std::vector StripPrefixAndFilename(const std::vector& paths, + const std::string& prefix) { + std::vector result; result.reserve(paths.size()); for (const auto& path : paths) { - result.emplace_back(StripPrefixAndFilename(path, prefix)); + result.emplace_back(StripPrefixAndFilename(path, prefix).directory); } return result; } -std::vector StripPrefixAndFilename( - const std::vector& files, const std::string& prefix) { - std::vector result; +std::vector StripPrefixAndFilename(const std::vector& files, + const std::string& prefix) { + std::vector result; result.reserve(files.size()); for (const auto& info : files) { - result.emplace_back(StripPrefixAndFilename(info.path(), prefix)); + result.emplace_back(StripPrefixAndFilename(info.path(), prefix).directory); } return result; } Result> PartitioningOrFactory::GetOrInferSchema( - const std::vector& paths) { + const std::vector& paths) { if (auto part = partitioning()) { return part->schema(); } diff --git a/cpp/src/arrow/dataset/partition.h b/cpp/src/arrow/dataset/partition.h index fe20e837619..e18c1d6ddbf 100644 --- a/cpp/src/arrow/dataset/partition.h +++ b/cpp/src/arrow/dataset/partition.h @@ -152,7 +152,7 @@ class ARROW_DS_EXPORT PartitioningFactory { /// Get the schema for the resulting Partitioning. /// This may reset internal state, for example dictionaries of unique representations. virtual Result> Inspect( - const std::vector& paths) = 0; + const std::vector& paths) = 0; /// Create a partitioning using the provided schema /// (fields may be dropped). @@ -366,11 +366,11 @@ ARROW_DS_EXPORT PartitionPathFormat StripPrefixAndFilename(const std::string& pa const std::string& prefix); /// \brief Vector version of StripPrefixAndFilename. -ARROW_DS_EXPORT std::vector StripPrefixAndFilename( +ARROW_DS_EXPORT std::vector StripPrefixAndFilename( const std::vector& paths, const std::string& prefix); /// \brief Vector version of StripPrefixAndFilename. -ARROW_DS_EXPORT std::vector StripPrefixAndFilename( +ARROW_DS_EXPORT std::vector StripPrefixAndFilename( const std::vector& files, const std::string& prefix); /// \brief Either a Partitioning or a PartitioningFactory @@ -397,8 +397,7 @@ class ARROW_DS_EXPORT PartitioningOrFactory { const std::shared_ptr& factory() const { return factory_; } /// \brief Get the partition schema, inferring it with the given factory if needed. - Result> GetOrInferSchema( - const std::vector& paths); + Result> GetOrInferSchema(const std::vector& paths); private: std::shared_ptr factory_; diff --git a/cpp/src/arrow/dataset/partition_test.cc b/cpp/src/arrow/dataset/partition_test.cc index fe84da843cf..157a9c3af44 100644 --- a/cpp/src/arrow/dataset/partition_test.cc +++ b/cpp/src/arrow/dataset/partition_test.cc @@ -74,7 +74,7 @@ class TestPartitioning : public ::testing::Test { ASSERT_EQ(simplified, literal(true)); } - void AssertInspect(const std::vector& paths, + void AssertInspect(const std::vector& paths, const std::vector>& expected) { ASSERT_OK_AND_ASSIGN(auto actual, factory_->Inspect(paths)); ASSERT_EQ(*actual, Schema(expected)); @@ -122,7 +122,7 @@ class TestPartitioning : public ::testing::Test { AssertPartition(partitioning, record_batch, expected_batches, expected_expressions); } - void AssertInspectError(const std::vector& paths) { + void AssertInspectError(const std::vector& paths) { ASSERT_RAISES(Invalid, factory_->Inspect(paths)); } @@ -295,39 +295,39 @@ TEST_F(TestPartitioning, DiscoverSchemaDirectory) { factory_ = DirectoryPartitioning::MakeFactory({"alpha", "beta"}); // type is int32 if possible - AssertInspect({{"/0/1", ""}}, {Int("alpha"), Int("beta")}); + AssertInspect({"/0/1"}, {Int("alpha"), Int("beta")}); // extra segments are ignored - AssertInspect({{"/0/1/what", ""}}, {Int("alpha"), Int("beta")}); + AssertInspect({"/0/1/what"}, {Int("alpha"), Int("beta")}); // fall back to string if any segment for field alpha is not parseable as int - AssertInspect({{"/0/1", ""}, {"/hello/1", ""}}, {Str("alpha"), Int("beta")}); + AssertInspect({"/0/1", "/hello/1"}, {Str("alpha"), Int("beta")}); // If there are too many digits fall back to string - AssertInspect({{"/3760212050/1", ""}}, {Str("alpha"), Int("beta")}); + AssertInspect({"/3760212050/1"}, {Str("alpha"), Int("beta")}); // missing segment for beta doesn't cause an error or fallback - AssertInspect({{"/0/1", ""}, {"/hello", ""}}, {Str("alpha"), Int("beta")}); + AssertInspect({"/0/1", "/hello"}, {Str("alpha"), Int("beta")}); } TEST_F(TestPartitioning, DiscoverSchemaFilename) { factory_ = FilenamePartitioning::MakeFactory({"alpha", "beta"}); // type is int32 if possible - AssertInspect({{"", "0_1_"}}, {Int("alpha"), Int("beta")}); + AssertInspect({"0_1_"}, {Int("alpha"), Int("beta")}); // extra segments are ignored - AssertInspect({{"", "0_1_what_"}}, {Int("alpha"), Int("beta")}); + AssertInspect({"0_1_what_"}, {Int("alpha"), Int("beta")}); // fall back to string if any segment for field alpha is not parseable as int - AssertInspect({{"", "0_1_"}, {"", "hello_1_"}}, {Str("alpha"), Int("beta")}); + AssertInspect({"0_1_", "hello_1_"}, {Str("alpha"), Int("beta")}); // If there are too many digits fall back to string - AssertInspect({{"", "3760212050_1_"}}, {Str("alpha"), Int("beta")}); + AssertInspect({"3760212050_1_"}, {Str("alpha"), Int("beta")}); // Invalid syntax - AssertInspectError({{"", "234-12"}}); - AssertInspectError({{"", "hello"}}); + AssertInspectError({"234-12"}); + AssertInspectError({"hello"}); } TEST_F(TestPartitioning, DirectoryDictionaryInference) { @@ -336,21 +336,18 @@ TEST_F(TestPartitioning, DirectoryDictionaryInference) { factory_ = DirectoryPartitioning::MakeFactory({"alpha", "beta"}, options); // type is still int32 if possible - AssertInspect({{"/0/1", ""}}, {DictInt("alpha"), DictInt("beta")}); + AssertInspect({"/0/1"}, {DictInt("alpha"), DictInt("beta")}); // If there are too many digits fall back to string - AssertInspect({{"/3760212050/1", ""}}, {DictStr("alpha"), DictInt("beta")}); + AssertInspect({"/3760212050/1"}, {DictStr("alpha"), DictInt("beta")}); // successful dictionary inference - AssertInspect({{"/a/0", ""}}, {DictStr("alpha"), DictInt("beta")}); - AssertInspect({{"/a/0", ""}, {"/a/1", ""}}, {DictStr("alpha"), DictInt("beta")}); - AssertInspect({{"/a/0", ""}, {"/a", ""}}, {DictStr("alpha"), DictInt("beta")}); - AssertInspect({{"/a/0", ""}, {"/b/0", ""}, {"/a/1", ""}, {"/b/1", ""}}, - {DictStr("alpha"), DictInt("beta")}); - AssertInspect({{"/a/-", ""}, {"/b/-", ""}, {"/a/_", ""}, {"/b/_", ""}}, - {DictStr("alpha"), DictStr("beta")}); - AssertInspect({{"/a/-", ""}, {"/b/-", ""}, {"/a/_", ""}, {"/b/_", ""}}, - {DictStr("alpha"), DictStr("beta")}); + AssertInspect({"/a/0"}, {DictStr("alpha"), DictInt("beta")}); + AssertInspect({"/a/0", "/a/1"}, {DictStr("alpha"), DictInt("beta")}); + AssertInspect({"/a/0", "/a"}, {DictStr("alpha"), DictInt("beta")}); + AssertInspect({"/0/a", "/1"}, {DictInt("alpha"), DictStr("beta")}); + AssertInspect({"/a/0", "/b/0", "/a/1", "/b/1"}, {DictStr("alpha"), DictInt("beta")}); + AssertInspect({"/a/-", "/b/-", "/a/_", "/b/_"}, {DictStr("alpha"), DictStr("beta")}); } TEST_F(TestPartitioning, FilenameDictionaryInference) { @@ -359,16 +356,15 @@ TEST_F(TestPartitioning, FilenameDictionaryInference) { factory_ = FilenamePartitioning::MakeFactory({"alpha", "beta"}, options); // type is still int32 if possible - AssertInspect({{"", "0_1_"}}, {DictInt("alpha"), DictInt("beta")}); + AssertInspect({"0_1_"}, {DictInt("alpha"), DictInt("beta")}); // If there are too many digits fall back to string - AssertInspect({{"", "3760212050_1_"}}, {DictStr("alpha"), DictInt("beta")}); + AssertInspect({"3760212050_1_"}, {DictStr("alpha"), DictInt("beta")}); // successful dictionary inference - AssertInspect({{"", "a_0_"}}, {DictStr("alpha"), DictInt("beta")}); - AssertInspect({{"", "a_0_"}, {"", "a_1_"}}, {DictStr("alpha"), DictInt("beta")}); - AssertInspect({{"", "a_0_"}, {"", "b_0_"}, {"", "a_1_"}, {"", "b_1_"}}, - {DictStr("alpha"), DictInt("beta")}); + AssertInspect({"a_0_"}, {DictStr("alpha"), DictInt("beta")}); + AssertInspect({"a_0_", "a_1_"}, {DictStr("alpha"), DictInt("beta")}); + AssertInspect({"a_0_", "b_0_", "a_1_", "b_1_"}, {DictStr("alpha"), DictInt("beta")}); } TEST_F(TestPartitioning, DictionaryHasUniqueValues) { @@ -377,8 +373,7 @@ TEST_F(TestPartitioning, DictionaryHasUniqueValues) { factory_ = DirectoryPartitioning::MakeFactory({"alpha"}, options); auto alpha = DictStr("alpha"); - AssertInspect({{"/a", ""}, {"/b", ""}, {"/a", ""}, {"/b", ""}, {"/c", ""}, {"/a", ""}}, - {alpha}); + AssertInspect({"/a", "/b", "/a", "/b", "/c", "/a"}, {alpha}); ASSERT_OK_AND_ASSIGN(auto partitioning, factory_->Finish(schema({alpha}))); auto expected_dictionary = @@ -400,7 +395,7 @@ TEST_F(TestPartitioning, DictionaryHasUniqueValues) { TEST_F(TestPartitioning, DiscoverSchemaSegfault) { // ARROW-7638 factory_ = DirectoryPartitioning::MakeFactory({"alpha", "beta"}); - AssertInspectError({{"oops.txt", ""}}); + AssertInspectError({"oops.txt"}); } TEST_F(TestPartitioning, HivePartitioning) { @@ -486,29 +481,27 @@ TEST_F(TestPartitioning, DiscoverHiveSchema) { factory_ = HivePartitioning::MakeFactory(options); // type is int32 if possible - AssertInspect({{"/alpha=0/beta=1", ""}}, {Int("alpha"), Int("beta")}); + AssertInspect({"/alpha=0/beta=1"}, {Int("alpha"), Int("beta")}); // extra segments are ignored - AssertInspect({{"/gamma=0/unexpected/delta=1/dat.parquet", ""}}, + AssertInspect({"/gamma=0/unexpected/delta=1/dat.parquet"}, {Int("gamma"), Int("delta")}); // schema field names are in order of first occurrence // (...so ensure your partitions are ordered the same for all paths) - AssertInspect({{"/alpha=0/beta=1", ""}, {"/beta=2/alpha=3", ""}}, - {Int("alpha"), Int("beta")}); + AssertInspect({"/alpha=0/beta=1", "/beta=2/alpha=3"}, {Int("alpha"), Int("beta")}); // Null fallback strings shouldn't interfere with type inference - AssertInspect({{"/alpha=xyz/beta=x", ""}, {"/alpha=7/beta=xyz", ""}}, - {Int("alpha"), Str("beta")}); + AssertInspect({"/alpha=xyz/beta=x", "/alpha=7/beta=xyz"}, {Int("alpha"), Str("beta")}); // Cannot infer if the only values are null - AssertInspectError({{"/alpha=xyz", ""}}); + AssertInspectError({"/alpha=xyz"}); // If there are too many digits fall back to string - AssertInspect({{"/alpha=3760212050", ""}}, {Str("alpha")}); + AssertInspect({"/alpha=3760212050"}, {Str("alpha")}); // missing path segments will not cause an error - AssertInspect({{"/alpha=0/beta=1", ""}, {"/beta=2/alpha=3", ""}, {"/gamma=what", ""}}, + AssertInspect({"/alpha=0/beta=1", "/beta=2/alpha=3", "/gamma=what"}, {Int("alpha"), Int("beta"), Str("gamma")}); } @@ -519,27 +512,22 @@ TEST_F(TestPartitioning, HiveDictionaryInference) { factory_ = HivePartitioning::MakeFactory(options); // type is still int32 if possible - AssertInspect({{"/alpha=0/beta=1", ""}}, {DictInt("alpha"), DictInt("beta")}); + AssertInspect({"/alpha=0/beta=1"}, {DictInt("alpha"), DictInt("beta")}); // If there are too many digits fall back to string - AssertInspect({{"/alpha=3760212050", ""}}, {DictStr("alpha")}); + AssertInspect({"/alpha=3760212050"}, {DictStr("alpha")}); // successful dictionary inference - AssertInspect({{"/alpha=a/beta=0", ""}}, {DictStr("alpha"), DictInt("beta")}); - AssertInspect({{"/alpha=a/beta=0", ""}, {"/alpha=a/1", ""}}, - {DictStr("alpha"), DictInt("beta")}); - AssertInspect({{"/alpha=a/beta=0", ""}, {"/alpha=xyz/beta=xyz", ""}}, - {DictStr("alpha"), DictInt("beta")}); - AssertInspect({{"/alpha=a/beta=0", ""}, - {"/alpha=b/beta=0", ""}, - {"/alpha=a/beta=1", ""}, - {"/alpha=b/beta=1", ""}}, + AssertInspect({"/alpha=a/beta=0"}, {DictStr("alpha"), DictInt("beta")}); + AssertInspect({"/alpha=a/beta=0", "/alpha=a/1"}, {DictStr("alpha"), DictInt("beta")}); + AssertInspect({"/alpha=a/beta=0", "/alpha=xyz/beta=xyz"}, {DictStr("alpha"), DictInt("beta")}); - AssertInspect({{"/alpha=a/beta=-", ""}, - {"/alpha=b/beta=-", ""}, - {"/alpha=a/beta=_", ""}, - {"/alpha=b/beta=_", ""}}, - {DictStr("alpha"), DictStr("beta")}); + AssertInspect( + {"/alpha=a/beta=0", "/alpha=b/beta=0", "/alpha=a/beta=1", "/alpha=b/beta=1"}, + {DictStr("alpha"), DictInt("beta")}); + AssertInspect( + {"/alpha=a/beta=-", "/alpha=b/beta=-", "/alpha=a/beta=_", "/alpha=b/beta=_"}, + {DictStr("alpha"), DictStr("beta")}); } TEST_F(TestPartitioning, HiveNullFallbackPassedOn) { @@ -547,7 +535,7 @@ TEST_F(TestPartitioning, HiveNullFallbackPassedOn) { options.null_fallback = "xyz"; factory_ = HivePartitioning::MakeFactory(options); - EXPECT_OK_AND_ASSIGN(auto schema, factory_->Inspect({{"/alpha=a/beta=0", ""}})); + EXPECT_OK_AND_ASSIGN(auto schema, factory_->Inspect({"/alpha=a/beta=0"})); EXPECT_OK_AND_ASSIGN(auto partitioning, factory_->Finish(schema)); ASSERT_EQ("xyz", std::static_pointer_cast(partitioning)->null_fallback()); @@ -559,12 +547,7 @@ TEST_F(TestPartitioning, HiveDictionaryHasUniqueValues) { factory_ = HivePartitioning::MakeFactory(options); auto alpha = DictStr("alpha"); - AssertInspect({{"/alpha=a", ""}, - {"/alpha=b", ""}, - {"/alpha=a", ""}, - {"/alpha=b", ""}, - {"/alpha=c", ""}, - {"/alpha=a", ""}}, + AssertInspect({"/alpha=a", "/alpha=b", "/alpha=a", "/alpha=b", "/alpha=c", "/alpha=a"}, {alpha}); ASSERT_OK_AND_ASSIGN(auto partitioning, factory_->Finish(schema({alpha}))); @@ -591,23 +574,22 @@ TEST_F(TestPartitioning, ExistingSchemaDirectory) { options.schema = schema({field("alpha", int64()), field("beta", dict_type)}); factory_ = DirectoryPartitioning::MakeFactory({"alpha", "beta"}, options); - AssertInspect({{"/0/1", ""}}, options.schema->fields()); - AssertInspect({{"/0/1/what", ""}}, options.schema->fields()); + AssertInspect({"/0/1"}, options.schema->fields()); + AssertInspect({"/0/1/what"}, options.schema->fields()); // fail if any segment is not parseable as schema type EXPECT_RAISES_WITH_MESSAGE_THAT(Invalid, ::testing::HasSubstr("Failed to parse string"), - factory_->Inspect({{"/0/1", ""}, {"/hello/1", ""}})); + factory_->Inspect({"/0/1", "/hello/1"})); factory_ = DirectoryPartitioning::MakeFactory({"alpha", "beta"}, options); // Now we don't fail since our type is large enough - AssertInspect({{"/3760212050/1", ""}}, options.schema->fields()); + AssertInspect({"/3760212050/1"}, options.schema->fields()); // If there are still too many digits, fail - EXPECT_RAISES_WITH_MESSAGE_THAT( - Invalid, ::testing::HasSubstr("Failed to parse string"), - factory_->Inspect({{"/1038581385102940193760212050/1", ""}})); + EXPECT_RAISES_WITH_MESSAGE_THAT(Invalid, ::testing::HasSubstr("Failed to parse string"), + factory_->Inspect({"/1038581385102940193760212050/1"})); factory_ = DirectoryPartitioning::MakeFactory({"alpha", "beta"}, options); - AssertInspect({{"/0/1", ""}, {"/2", ""}}, options.schema->fields()); + AssertInspect({"/0/1", "/2"}, options.schema->fields()); } TEST_F(TestPartitioning, ExistingSchemaHive) { @@ -617,33 +599,33 @@ TEST_F(TestPartitioning, ExistingSchemaHive) { options.schema = schema({field("a", int64()), field("b", dict_type)}); factory_ = HivePartitioning::MakeFactory(options); - AssertInspect({{"/a=0/b=1", ""}}, options.schema->fields()); - AssertInspect({{"/a=0/b=1/what", ""}}, options.schema->fields()); - AssertInspect({{"/a=0", ""}, {"/b=1", ""}}, options.schema->fields()); + AssertInspect({"/a=0/b=1"}, options.schema->fields()); + AssertInspect({"/a=0/b=1/what"}, options.schema->fields()); + AssertInspect({"/a=0", "/b=1"}, options.schema->fields()); // fail if any segment for field alpha is not parseable as schema type EXPECT_RAISES_WITH_MESSAGE_THAT( Invalid, ::testing::HasSubstr( "Could not cast segments for partition field a to requested type int64"), - factory_->Inspect({{"/a=0/b=1", ""}, {"/a=hello/b=1", ""}})); + factory_->Inspect({"/a=0/b=1", "/a=hello/b=1"})); factory_ = HivePartitioning::MakeFactory(options); EXPECT_RAISES_WITH_MESSAGE_THAT( Invalid, ::testing::HasSubstr("Requested schema has 2 fields, but only 1 were detected"), - factory_->Inspect({{"/a=0", ""}, {"/a=hello", ""}})); + factory_->Inspect({"/a=0", "/a=hello"})); factory_ = HivePartitioning::MakeFactory(options); // Now we don't fail since our type is large enough - AssertInspect({{"/a=3760212050/b=1", ""}}, options.schema->fields()); + AssertInspect({"/a=3760212050/b=1"}, options.schema->fields()); // If there are still too many digits, fail EXPECT_RAISES_WITH_MESSAGE_THAT( Invalid, ::testing::HasSubstr("Failed to parse string"), - factory_->Inspect({{"/a=1038581385102940193760212050/b=1", ""}})); + factory_->Inspect({"/a=1038581385102940193760212050/b=1"})); factory_ = HivePartitioning::MakeFactory(options); - AssertInspect({{"/a=0/b=1", ""}, {"/b=2", ""}}, options.schema->fields()); + AssertInspect({"/a=0/b=1", "/b=2"}, options.schema->fields()); } TEST_F(TestPartitioning, ExistingSchemaFilename) { @@ -653,23 +635,22 @@ TEST_F(TestPartitioning, ExistingSchemaFilename) { options.schema = schema({field("alpha", int64()), field("beta", dict_type)}); factory_ = FilenamePartitioning::MakeFactory({"alpha", "beta"}, options); - AssertInspect({{"", "0_1_"}}, options.schema->fields()); - AssertInspect({{"", "0_1_what_"}}, options.schema->fields()); + AssertInspect({"0_1_"}, options.schema->fields()); + AssertInspect({"0_1_what_"}, options.schema->fields()); // fail if any segment is not parseable as schema type EXPECT_RAISES_WITH_MESSAGE_THAT(Invalid, ::testing::HasSubstr("Failed to parse string"), - factory_->Inspect({{"", "0_1_"}, {"", "hello_1_"}})); + factory_->Inspect({"0_1_", "hello_1_"})); factory_ = FilenamePartitioning::MakeFactory({"alpha", "beta"}, options); // Now we don't fail since our type is large enough - AssertInspect({{"", "3760212050_1_"}}, options.schema->fields()); + AssertInspect({"3760212050_1_"}, options.schema->fields()); // If there are still too many digits, fail - EXPECT_RAISES_WITH_MESSAGE_THAT( - Invalid, ::testing::HasSubstr("Failed to parse string"), - factory_->Inspect({{"", "1038581385102940193760212050_1_"}})); + EXPECT_RAISES_WITH_MESSAGE_THAT(Invalid, ::testing::HasSubstr("Failed to parse string"), + factory_->Inspect({"1038581385102940193760212050_1_"})); factory_ = FilenamePartitioning::MakeFactory({"alpha", "beta"}, options); - AssertInspect({{"", "0_1_"}, {"", "2_"}}, options.schema->fields()); + AssertInspect({"0_1_", "2_"}, options.schema->fields()); } TEST_F(TestPartitioning, UrlEncodedDirectory) { @@ -678,8 +659,8 @@ TEST_F(TestPartitioning, UrlEncodedDirectory) { options.schema = schema({field("date", ts), field("time", ts), field("str", utf8())}); factory_ = DirectoryPartitioning::MakeFactory(options.schema->field_names(), options); - AssertInspect({{"/2021-05-04 00:00:00/2021-05-04 07:27:00/%24", ""}, - {"/2021-05-04 00%3A00%3A00/2021-05-04 07%3A27%3A00/foo", ""}}, + AssertInspect({"/2021-05-04 00:00:00/2021-05-04 07:27:00/%24", + "/2021-05-04 00%3A00%3A00/2021-05-04 07%3A27%3A00/foo"}, options.schema->fields()); auto date = std::make_shared(1620086400, ts); auto time = std::make_shared(1620113220, ts); @@ -691,7 +672,7 @@ TEST_F(TestPartitioning, UrlEncodedDirectory) { // Invalid UTF-8 EXPECT_RAISES_WITH_MESSAGE_THAT(Invalid, ::testing::HasSubstr("was not valid UTF-8"), - factory_->Inspect({{"/%AF/%BF/%CF", ""}})); + factory_->Inspect({"/%AF/%BF/%CF"})); EXPECT_RAISES_WITH_MESSAGE_THAT(Invalid, ::testing::HasSubstr("was not valid UTF-8"), partitioning_->Parse({"/%AF/%BF/%CF", ""})); @@ -699,8 +680,8 @@ TEST_F(TestPartitioning, UrlEncodedDirectory) { options.schema = schema({field("date", utf8()), field("time", utf8()), field("str", utf8())}); factory_ = DirectoryPartitioning::MakeFactory(options.schema->field_names(), options); - AssertInspect({{"/2021-05-04 00:00:00/2021-05-04 07:27:00/%E3%81%8F%E3%81%BE", ""}, - {"/2021-05-04 00%3A00%3A00/2021-05-04 07%3A27%3A00/foo", ""}}, + AssertInspect({"/2021-05-04 00:00:00/2021-05-04 07:27:00/%E3%81%8F%E3%81%BE", + "/2021-05-04 00%3A00%3A00/2021-05-04 07%3A27%3A00/foo"}, options.schema->fields()); partitioning_ = std::make_shared( options.schema, ArrayVector(), options.AsPartitioningOptions()); @@ -718,9 +699,9 @@ TEST_F(TestPartitioning, UrlEncodedHive) { factory_ = HivePartitioning::MakeFactory(options); AssertInspect( - {{"/date=2021-05-04 00:00:00/time=2021-05-04 07:27:00/str=$", ""}, - {"/date=2021-05-04 00:00:00/time=2021-05-04 07:27:00/str=%E3%81%8F%E3%81%BE", ""}, - {"/date=2021-05-04 00%3A00%3A00/time=2021-05-04 07%3A27%3A00/str=%24", ""}}, + {"/date=2021-05-04 00:00:00/time=2021-05-04 07:27:00/str=$", + "/date=2021-05-04 00:00:00/time=2021-05-04 07:27:00/str=%E3%81%8F%E3%81%BE", + "/date=2021-05-04 00%3A00%3A00/time=2021-05-04 07%3A27%3A00/str=%24"}, options.schema->fields()); auto date = std::make_shared(1620086400, ts); @@ -741,9 +722,8 @@ TEST_F(TestPartitioning, UrlEncodedHive) { equal(field_ref("time"), literal(time)), is_null(field_ref("str"))})); // Invalid UTF-8 - EXPECT_RAISES_WITH_MESSAGE_THAT( - Invalid, ::testing::HasSubstr("was not valid UTF-8"), - factory_->Inspect({{"/date=%AF/time=%BF/str=%CF", ""}})); + EXPECT_RAISES_WITH_MESSAGE_THAT(Invalid, ::testing::HasSubstr("was not valid UTF-8"), + factory_->Inspect({"/date=%AF/time=%BF/str=%CF"})); EXPECT_RAISES_WITH_MESSAGE_THAT( Invalid, ::testing::HasSubstr("was not valid UTF-8"), partitioning_->Parse({"/date=%AF/time=%BF/str=%CF", ""})); @@ -753,9 +733,9 @@ TEST_F(TestPartitioning, UrlEncodedHive) { schema({field("date", utf8()), field("time", utf8()), field("str", utf8())}); factory_ = HivePartitioning::MakeFactory(options); AssertInspect( - {{"/date=2021-05-04 00:00:00/time=2021-05-04 07:27:00/str=$", ""}, - {"/date=2021-05-04 00:00:00/time=2021-05-04 07:27:00/str=%E3%81%8F%E3%81%BE", ""}, - {"/date=2021-05-04 00%3A00%3A00/time=2021-05-04 07%3A27%3A00/str=%24", ""}}, + {"/date=2021-05-04 00:00:00/time=2021-05-04 07:27:00/str=$", + "/date=2021-05-04 00:00:00/time=2021-05-04 07:27:00/str=%E3%81%8F%E3%81%BE", + "/date=2021-05-04 00%3A00%3A00/time=2021-05-04 07%3A27%3A00/str=%24"}, options.schema->fields()); partitioning_ = std::make_shared(options.schema, ArrayVector(), options.AsHivePartitioningOptions()); @@ -765,9 +745,8 @@ TEST_F(TestPartitioning, UrlEncodedHive) { equal(field_ref("str"), literal("%24"))})); // Invalid UTF-8 - EXPECT_RAISES_WITH_MESSAGE_THAT( - Invalid, ::testing::HasSubstr("was not valid UTF-8"), - factory_->Inspect({{"/date=\xAF/time=\xBF/str=\xCF", ""}})); + EXPECT_RAISES_WITH_MESSAGE_THAT(Invalid, ::testing::HasSubstr("was not valid UTF-8"), + factory_->Inspect({"/date=\xAF/time=\xBF/str=\xCF"})); EXPECT_RAISES_WITH_MESSAGE_THAT( Invalid, ::testing::HasSubstr("was not valid UTF-8"), partitioning_->Parse({"/date=\xAF/time=\xBF/str=\xCF", ""})); @@ -781,15 +760,12 @@ TEST_F(TestPartitioning, UrlEncodedHiveWithKeyEncoded) { options.null_fallback = "$"; factory_ = HivePartitioning::MakeFactory(options); - AssertInspect({{"/test%27%3B%20date=2021-05-04 00:00:00/test%27%3B%20time=2021-05-04 " - "07:27:00/str=$", - ""}, - {"/test%27%3B%20date=2021-05-04 00:00:00/test%27%3B%20time=2021-05-04 " - "07:27:00/str=%E3%81%8F%E3%81%BE", - ""}, - {"/test%27%3B%20date=2021-05-04 " - "00%3A00%3A00/test%27%3B%20time=2021-05-04 07%3A27%3A00/str=%24", - ""}}, + AssertInspect({"/test%27%3B%20date=2021-05-04 00:00:00/test%27%3B%20time=2021-05-04 " + "07:27:00/str=$", + "/test%27%3B%20date=2021-05-04 00:00:00/test%27%3B%20time=2021-05-04 " + "07:27:00/str=%E3%81%8F%E3%81%BE", + "/test%27%3B%20date=2021-05-04 " + "00%3A00%3A00/test%27%3B%20time=2021-05-04 07%3A27%3A00/str=%24"}, options.schema->fields()); auto date = std::make_shared(1620086400, ts); @@ -819,7 +795,7 @@ TEST_F(TestPartitioning, UrlEncodedHiveWithKeyEncoded) { // Invalid UTF-8 EXPECT_RAISES_WITH_MESSAGE_THAT( Invalid, ::testing::HasSubstr("was not valid UTF-8"), - factory_->Inspect({{"/%AF=2021-05-04/time=2021-05-04 07%3A27%3A00/str=%24", ""}})); + factory_->Inspect({"/%AF=2021-05-04/time=2021-05-04 07%3A27%3A00/str=%24"})); EXPECT_RAISES_WITH_MESSAGE_THAT( Invalid, ::testing::HasSubstr("was not valid UTF-8"), partitioning_->Parse({"/%AF=2021-05-04/%BF=2021-05-04 07%3A27%3A00/str=%24", ""})); @@ -999,12 +975,9 @@ TEST(TestStripPrefixAndFilename, Basic) { std::vector input{"/data/year=2019/file.parquet", "/data/year=2019/month=12/file.parquet", "/data/year=2019/month=12/day=01/file.parquet"}; - auto paths = StripPrefixAndFilename(input, "/data"); - std::vector path_directories; - std::transform(paths.begin(), paths.end(), std::back_inserter(path_directories), - [](PartitionPathFormat const& path) { return path.directory; }); - EXPECT_THAT(path_directories, testing::ElementsAre("year=2019", "year=2019/month=12", - "year=2019/month=12/day=01")); + EXPECT_THAT(StripPrefixAndFilename(input, "/data"), + testing::ElementsAre("year=2019", "year=2019/month=12", + "year=2019/month=12/day=01")); } } // namespace dataset diff --git a/r/autobrew b/r/autobrew new file mode 100644 index 00000000000..e653909eb57 --- /dev/null +++ b/r/autobrew @@ -0,0 +1,59 @@ +# Set the default bundle version in case we don't have the exact package version +VERSION="${VERSION:0:5}" +if [ "$VERSION" != "3.0.0" ] && [ "$VERSION" != "4.0.1" ] && [ "$VERSION" != "5.0.0" ] && [ "$VERSION" != "6.0.0" ] && [ "$VERSION" != "6.0.1" ] && [ "$VERSION" != "7.0.0" ]; then +VERSION="7.0.0" +fi + +# Find a bundle +if [ $(arch | grep arm) ]; then +bottle="https://autobrew.github.io/archive/arm64_big_sur/apache-arrow-${VERSION}-arm64_big_sur.tar.xz" +# See: https://github.com/Homebrew/homebrew-core/issues/76537 +#elif [[ ${OSTYPE:6:2} -ge 20 ]]; then +#bottle="https://autobrew.github.io/archive/big_sur/apache-arrow-${VERSION}-big_sur.tar.xz" +elif [[ ${OSTYPE:6:2} -lt 17 ]] && [[ ${VERSION:0:1} -lt 7 ]]; then +bottle="https://autobrew.github.io/archive/el_capitan/apache-arrow-${VERSION}-el_capitan.tar.xz" +else +bottle="https://autobrew.github.io/archive/high_sierra/apache-arrow-${VERSION}-high_sierra.tar.xz" +fi + +# Skip if disabled +if [ "$DISABLE_AUTOBREW" ]; then return 0; fi + +# Debug +echo "Using autobrew bundle: $(basename $bottle)" + +# General setup +BREWDIR="$PWD/.deps" +mkdir -p $BREWDIR +curl -sSL $bottle -o libs.tar.xz +tar -xf libs.tar.xz --strip 1 -C $BREWDIR +rm -f libs.tar.xz + +# Hardcoded flags +AWS_LIBS="-laws-cpp-sdk-config -laws-cpp-sdk-transfer -laws-cpp-sdk-identity-management -laws-cpp-sdk-cognito-identity -laws-cpp-sdk-sts -laws-cpp-sdk-s3 -laws-cpp-sdk-core -laws-c-event-stream -laws-checksums -laws-c-common" + +# Aws-sdk-cpp 1.9.xxx requires extra flags +if [ -f "$BREWDIR/lib/libaws-crt-cpp.a" ]; then +echo "Linking to aws-sdk-cpp 1.9" +AWS_LIBS="$AWS_LIBS -laws-crt-cpp -laws-c-io -laws-c-s3 -laws-c-auth -laws-c-http -laws-c-cal -laws-c-compression -laws-c-mqtt" +fi + +PKG_LIBS="-L$BREWDIR/lib -lparquet -larrow_dataset -larrow -larrow_bundled_dependencies -lthrift -llz4 -lsnappy -lzstd $AWS_LIBS -lpthread -lcurl" +PKG_CFLAGS="-I$BREWDIR/include -DARROW_R_WITH_PARQUET -DARROW_R_WITH_DATASET -DARROW_R_WITH_JSON -DARROW_R_WITH_S3" + +# Prevent linking against other libs in /usr/local/lib +for FILE in $BREWDIR/lib/*.a; do + BASENAME=$(basename $FILE) + LIBNAME=$(echo "${BASENAME%.*}" | cut -c4-) + cp -f $FILE $BREWDIR/lib/libbrew$LIBNAME.a + PKG_LIBS=$(echo $PKG_LIBS | sed "s/-l$LIBNAME /-lbrew$LIBNAME /g") +done + +# Cleanup +echo "rm -Rf .deps" >> cleanup +chmod +x cleanup + +# Disable tests on R 3.6 due to buggy custom CRAN compiler +if [ "${R_VERSION:0:1}" = "3" ]; then + rm -f tests/testthat.R +fi diff --git a/r/src/arrowExports.cpp b/r/src/arrowExports.cpp index 14eb6eb397a..312b778aada 100644 --- a/r/src/arrowExports.cpp +++ b/r/src/arrowExports.cpp @@ -1767,11 +1767,11 @@ extern "C" SEXP _arrow_dataset___HivePartitioning__MakeFactory(SEXP null_fallbac // dataset.cpp #if defined(ARROW_R_WITH_DATASET) -std::shared_ptr dataset___PartitioningFactory__Inspect(const std::shared_ptr& factory, const std::vector>& paths); +std::shared_ptr dataset___PartitioningFactory__Inspect(const std::shared_ptr& factory, const std::vector& paths); extern "C" SEXP _arrow_dataset___PartitioningFactory__Inspect(SEXP factory_sexp, SEXP paths_sexp){ BEGIN_CPP11 arrow::r::Input&>::type factory(factory_sexp); - arrow::r::Input>&>::type paths(paths_sexp); + arrow::r::Input&>::type paths(paths_sexp); return cpp11::as_sexp(dataset___PartitioningFactory__Inspect(factory, paths)); END_CPP11 } diff --git a/r/src/arrow_types.h b/r/src/arrow_types.h index d9fee37e7f1..5a931ccb8de 100644 --- a/r/src/arrow_types.h +++ b/r/src/arrow_types.h @@ -36,6 +36,7 @@ #if defined(ARROW_R_WITH_DATASET) #include +#include #endif #include diff --git a/r/src/dataset.cpp b/r/src/dataset.cpp index 7a3273e9abc..aa44a0721e3 100644 --- a/r/src/dataset.cpp +++ b/r/src/dataset.cpp @@ -386,12 +386,8 @@ std::shared_ptr dataset___HivePartitioning__MakeFactory // [[dataset::export]] std::shared_ptr dataset___PartitioningFactory__Inspect( const std::shared_ptr& factory, - const std::vector>& paths) { - std::vector partition_paths(paths.size()); - for (auto& path : paths) { - partition_paths.emplace_back(ds::PartitionPathFormat{path.first, path.second}); - } - return ValueOrStop(factory->Inspect(partition_paths)); + const std::vector& paths) { + return ValueOrStop(factory->Inspect(paths)); } // [[dataset::export]] From 267ac94b5d61ee6ca428048df35489a004725162 Mon Sep 17 00:00:00 2001 From: Sanjiban Sengupta Date: Thu, 5 May 2022 19:16:44 +0530 Subject: [PATCH 16/24] fix: GetOrInferSchema() to have PartitionPathFormat in argument --- cpp/src/arrow/dataset/partition.cc | 29 +++++++----- cpp/src/arrow/dataset/partition.h | 7 +-- cpp/src/arrow/dataset/partition_test.cc | 9 ++-- r/autobrew | 59 ------------------------- 4 files changed, 29 insertions(+), 75 deletions(-) delete mode 100644 r/autobrew diff --git a/cpp/src/arrow/dataset/partition.cc b/cpp/src/arrow/dataset/partition.cc index 6e636b88cc0..49f50622df1 100644 --- a/cpp/src/arrow/dataset/partition.cc +++ b/cpp/src/arrow/dataset/partition.cc @@ -814,33 +814,42 @@ PartitionPathFormat StripPrefixAndFilename(const std::string& path, std::move(basename_filename.second)}; } -std::vector StripPrefixAndFilename(const std::vector& paths, - const std::string& prefix) { - std::vector result; +std::vector StripPrefixAndFilename( + const std::vector& paths, const std::string& prefix) { + std::vector result; result.reserve(paths.size()); for (const auto& path : paths) { - result.emplace_back(StripPrefixAndFilename(path, prefix).directory); + result.emplace_back(StripPrefixAndFilename(path, prefix)); } return result; } -std::vector StripPrefixAndFilename(const std::vector& files, - const std::string& prefix) { - std::vector result; +std::vector StripPrefixAndFilename( + const std::vector& files, const std::string& prefix) { + std::vector result; result.reserve(files.size()); for (const auto& info : files) { - result.emplace_back(StripPrefixAndFilename(info.path(), prefix).directory); + result.emplace_back(StripPrefixAndFilename(info.path(), prefix)); } return result; } Result> PartitioningOrFactory::GetOrInferSchema( - const std::vector& paths) { + const std::vector& paths) { + if (auto part = partitioning()) { return part->schema(); } + std::vector partition_paths(paths.size()); + if (factory()->type_name() == "filename") { + std::transform(paths.begin(), paths.end(), std::back_inserter(partition_paths), + [](PartitionPathFormat const& path) { return path.filename; }); + } else { + std::transform(paths.begin(), paths.end(), std::back_inserter(partition_paths), + [](PartitionPathFormat const& path) { return path.directory; }); + } - return factory()->Inspect(paths); + return factory()->Inspect(partition_paths); } } // namespace dataset diff --git a/cpp/src/arrow/dataset/partition.h b/cpp/src/arrow/dataset/partition.h index e18c1d6ddbf..f5368eaa788 100644 --- a/cpp/src/arrow/dataset/partition.h +++ b/cpp/src/arrow/dataset/partition.h @@ -366,11 +366,11 @@ ARROW_DS_EXPORT PartitionPathFormat StripPrefixAndFilename(const std::string& pa const std::string& prefix); /// \brief Vector version of StripPrefixAndFilename. -ARROW_DS_EXPORT std::vector StripPrefixAndFilename( +ARROW_DS_EXPORT std::vector StripPrefixAndFilename( const std::vector& paths, const std::string& prefix); /// \brief Vector version of StripPrefixAndFilename. -ARROW_DS_EXPORT std::vector StripPrefixAndFilename( +ARROW_DS_EXPORT std::vector StripPrefixAndFilename( const std::vector& files, const std::string& prefix); /// \brief Either a Partitioning or a PartitioningFactory @@ -397,7 +397,8 @@ class ARROW_DS_EXPORT PartitioningOrFactory { const std::shared_ptr& factory() const { return factory_; } /// \brief Get the partition schema, inferring it with the given factory if needed. - Result> GetOrInferSchema(const std::vector& paths); + Result> GetOrInferSchema( + const std::vector& paths); private: std::shared_ptr factory_; diff --git a/cpp/src/arrow/dataset/partition_test.cc b/cpp/src/arrow/dataset/partition_test.cc index 157a9c3af44..d872094ef89 100644 --- a/cpp/src/arrow/dataset/partition_test.cc +++ b/cpp/src/arrow/dataset/partition_test.cc @@ -975,9 +975,12 @@ TEST(TestStripPrefixAndFilename, Basic) { std::vector input{"/data/year=2019/file.parquet", "/data/year=2019/month=12/file.parquet", "/data/year=2019/month=12/day=01/file.parquet"}; - EXPECT_THAT(StripPrefixAndFilename(input, "/data"), - testing::ElementsAre("year=2019", "year=2019/month=12", - "year=2019/month=12/day=01")); + auto paths = StripPrefixAndFilename(input, "/data"); + std::vector path_directories; + std::transform(paths.begin(), paths.end(), std::back_inserter(path_directories), + [](PartitionPathFormat const& path) { return path.directory; }); + EXPECT_THAT(path_directories, testing::ElementsAre("year=2019", "year=2019/month=12", + "year=2019/month=12/day=01")); } } // namespace dataset diff --git a/r/autobrew b/r/autobrew deleted file mode 100644 index e653909eb57..00000000000 --- a/r/autobrew +++ /dev/null @@ -1,59 +0,0 @@ -# Set the default bundle version in case we don't have the exact package version -VERSION="${VERSION:0:5}" -if [ "$VERSION" != "3.0.0" ] && [ "$VERSION" != "4.0.1" ] && [ "$VERSION" != "5.0.0" ] && [ "$VERSION" != "6.0.0" ] && [ "$VERSION" != "6.0.1" ] && [ "$VERSION" != "7.0.0" ]; then -VERSION="7.0.0" -fi - -# Find a bundle -if [ $(arch | grep arm) ]; then -bottle="https://autobrew.github.io/archive/arm64_big_sur/apache-arrow-${VERSION}-arm64_big_sur.tar.xz" -# See: https://github.com/Homebrew/homebrew-core/issues/76537 -#elif [[ ${OSTYPE:6:2} -ge 20 ]]; then -#bottle="https://autobrew.github.io/archive/big_sur/apache-arrow-${VERSION}-big_sur.tar.xz" -elif [[ ${OSTYPE:6:2} -lt 17 ]] && [[ ${VERSION:0:1} -lt 7 ]]; then -bottle="https://autobrew.github.io/archive/el_capitan/apache-arrow-${VERSION}-el_capitan.tar.xz" -else -bottle="https://autobrew.github.io/archive/high_sierra/apache-arrow-${VERSION}-high_sierra.tar.xz" -fi - -# Skip if disabled -if [ "$DISABLE_AUTOBREW" ]; then return 0; fi - -# Debug -echo "Using autobrew bundle: $(basename $bottle)" - -# General setup -BREWDIR="$PWD/.deps" -mkdir -p $BREWDIR -curl -sSL $bottle -o libs.tar.xz -tar -xf libs.tar.xz --strip 1 -C $BREWDIR -rm -f libs.tar.xz - -# Hardcoded flags -AWS_LIBS="-laws-cpp-sdk-config -laws-cpp-sdk-transfer -laws-cpp-sdk-identity-management -laws-cpp-sdk-cognito-identity -laws-cpp-sdk-sts -laws-cpp-sdk-s3 -laws-cpp-sdk-core -laws-c-event-stream -laws-checksums -laws-c-common" - -# Aws-sdk-cpp 1.9.xxx requires extra flags -if [ -f "$BREWDIR/lib/libaws-crt-cpp.a" ]; then -echo "Linking to aws-sdk-cpp 1.9" -AWS_LIBS="$AWS_LIBS -laws-crt-cpp -laws-c-io -laws-c-s3 -laws-c-auth -laws-c-http -laws-c-cal -laws-c-compression -laws-c-mqtt" -fi - -PKG_LIBS="-L$BREWDIR/lib -lparquet -larrow_dataset -larrow -larrow_bundled_dependencies -lthrift -llz4 -lsnappy -lzstd $AWS_LIBS -lpthread -lcurl" -PKG_CFLAGS="-I$BREWDIR/include -DARROW_R_WITH_PARQUET -DARROW_R_WITH_DATASET -DARROW_R_WITH_JSON -DARROW_R_WITH_S3" - -# Prevent linking against other libs in /usr/local/lib -for FILE in $BREWDIR/lib/*.a; do - BASENAME=$(basename $FILE) - LIBNAME=$(echo "${BASENAME%.*}" | cut -c4-) - cp -f $FILE $BREWDIR/lib/libbrew$LIBNAME.a - PKG_LIBS=$(echo $PKG_LIBS | sed "s/-l$LIBNAME /-lbrew$LIBNAME /g") -done - -# Cleanup -echo "rm -Rf .deps" >> cleanup -chmod +x cleanup - -# Disable tests on R 3.6 due to buggy custom CRAN compiler -if [ "${R_VERSION:0:1}" = "3" ]; then - rm -f tests/testthat.R -fi From 084fd96cb4acae99239366a29fa9b785721d2163 Mon Sep 17 00:00:00 2001 From: Sanjiban Sengupta Date: Thu, 5 May 2022 19:29:21 +0530 Subject: [PATCH 17/24] refactor: remove including partition.h in r/src/arrow_types.h --- r/src/arrow_types.h | 1 - 1 file changed, 1 deletion(-) diff --git a/r/src/arrow_types.h b/r/src/arrow_types.h index 5a931ccb8de..d9fee37e7f1 100644 --- a/r/src/arrow_types.h +++ b/r/src/arrow_types.h @@ -36,7 +36,6 @@ #if defined(ARROW_R_WITH_DATASET) #include -#include #endif #include From 6133121d88d6569f4c2d1af2de2d7535a81abdc1 Mon Sep 17 00:00:00 2001 From: Sanjiban Sengupta Date: Thu, 5 May 2022 19:53:28 +0530 Subject: [PATCH 18/24] fix: cpp lint --- cpp/src/arrow/dataset/partition.cc | 1 - 1 file changed, 1 deletion(-) diff --git a/cpp/src/arrow/dataset/partition.cc b/cpp/src/arrow/dataset/partition.cc index 49f50622df1..732f668b01f 100644 --- a/cpp/src/arrow/dataset/partition.cc +++ b/cpp/src/arrow/dataset/partition.cc @@ -836,7 +836,6 @@ std::vector StripPrefixAndFilename( Result> PartitioningOrFactory::GetOrInferSchema( const std::vector& paths) { - if (auto part = partitioning()) { return part->schema(); } From 14b27118a1c28ff9e2471443486f353d014b7de2 Mon Sep 17 00:00:00 2001 From: Sanjiban Sengupta Date: Mon, 23 May 2022 12:52:30 +0530 Subject: [PATCH 19/24] feat: Parse() method to take the complete path --- cpp/src/arrow/dataset/discovery.cc | 2 +- cpp/src/arrow/dataset/file_base.cc | 12 +-- cpp/src/arrow/dataset/file_parquet.cc | 12 +-- cpp/src/arrow/dataset/partition.cc | 46 ++++---- cpp/src/arrow/dataset/partition.h | 26 ++--- cpp/src/arrow/dataset/partition_test.cc | 137 ++++++++++++------------ 6 files changed, 110 insertions(+), 125 deletions(-) diff --git a/cpp/src/arrow/dataset/discovery.cc b/cpp/src/arrow/dataset/discovery.cc index f8382aa4052..25fa7ff2b70 100644 --- a/cpp/src/arrow/dataset/discovery.cc +++ b/cpp/src/arrow/dataset/discovery.cc @@ -279,7 +279,7 @@ Result> FileSystemDatasetFactory::Finish(FinishOptions std::vector> fragments; for (const auto& info : files_) { - auto fixed_path = StripPrefixAndFilename(info.path(), options_.partition_base_dir); + auto fixed_path = StripPrefix(info.path(), options_.partition_base_dir); ARROW_ASSIGN_OR_RAISE(auto partition, partitioning->Parse(fixed_path)); ARROW_ASSIGN_OR_RAISE(auto fragment, format_->MakeFragment({info, fs_}, partition)); fragments.push_back(fragment); diff --git a/cpp/src/arrow/dataset/file_base.cc b/cpp/src/arrow/dataset/file_base.cc index 10277810575..ef5b7dbde4e 100644 --- a/cpp/src/arrow/dataset/file_base.cc +++ b/cpp/src/arrow/dataset/file_base.cc @@ -273,7 +273,7 @@ namespace { Status WriteBatch(std::shared_ptr batch, compute::Expression guarantee, FileSystemDatasetWriteOptions write_options, std::function, - const Partitioning::PartitionPathFormat&)> + const PartitionPathFormat&)> write) { ARROW_ASSIGN_OR_RAISE(auto groups, write_options.partitioning->Partition(batch)); batch.reset(); // drop to hopefully conserve memory @@ -292,7 +292,7 @@ Status WriteBatch(std::shared_ptr batch, compute::Expression guaran for (std::size_t index = 0; index < groups.batches.size(); index++) { auto partition_expression = and_(groups.expressions[index], guarantee); auto next_batch = groups.batches[index]; - Partitioning::PartitionPathFormat destination; + PartitionPathFormat destination; ARROW_ASSIGN_OR_RAISE(destination, write_options.partitioning->Format(partition_expression)); RETURN_NOT_OK(write(next_batch, destination)); @@ -337,10 +337,10 @@ class DatasetWritingSinkNodeConsumer : public compute::SinkNodeConsumer { return WriteBatch( batch, guarantee, write_options_, [this](std::shared_ptr next_batch, - const Partitioning::PartitionPathFormat& destination) { + const PartitionPathFormat& destination) { return task_group_.AddTask([this, next_batch, destination] { Future<> has_room = dataset_writer_->WriteRecordBatch( - next_batch, destination.directory, destination.prefix); + next_batch, destination.directory, destination.filename); if (!has_room.is_finished()) { // We don't have to worry about sequencing backpressure here since // task_group_ serves as our sequencer. If batches continue to arrive after @@ -481,11 +481,11 @@ class TeeNode : public compute::MapNode { compute::Expression guarantee) { return WriteBatch(batch, guarantee, write_options_, [this](std::shared_ptr next_batch, - const Partitioning::PartitionPathFormat& destination) { + const PartitionPathFormat& destination) { return task_group_.AddTask([this, next_batch, destination] { util::tracing::Span span; Future<> has_room = dataset_writer_->WriteRecordBatch( - next_batch, destination.directory, destination.prefix); + next_batch, destination.directory, destination.filename); if (!has_room.is_finished()) { this->Pause(); return has_room.Then([this] { this->Resume(); }); diff --git a/cpp/src/arrow/dataset/file_parquet.cc b/cpp/src/arrow/dataset/file_parquet.cc index 4b1ded92a4a..0ab72a877ba 100644 --- a/cpp/src/arrow/dataset/file_parquet.cc +++ b/cpp/src/arrow/dataset/file_parquet.cc @@ -919,18 +919,10 @@ Result>> ParquetDatasetFactory::InspectSchem std::vector stripped(paths_with_row_group_ids_.size()); size_t i = 0; - - if (options_.partitioning.factory()->type_name() == "filename") { - for (const auto& e : paths_with_row_group_ids_) { + for (const auto& e : paths_with_row_group_ids_) { stripped[i++] = - StripPrefixAndFilename(e.first, options_.partition_base_dir).filename; + StripPrefixAndFilename(e.first, options_.partition_base_dir); } - } else { - for (const auto& e : paths_with_row_group_ids_) { - stripped[i++] = - StripPrefixAndFilename(e.first, options_.partition_base_dir).directory; - } - } ARROW_ASSIGN_OR_RAISE(auto partition_schema, factory->Inspect(stripped)); schemas.push_back(std::move(partition_schema)); diff --git a/cpp/src/arrow/dataset/partition.cc b/cpp/src/arrow/dataset/partition.cc index 732f668b01f..4c0426e579c 100644 --- a/cpp/src/arrow/dataset/partition.cc +++ b/cpp/src/arrow/dataset/partition.cc @@ -82,7 +82,7 @@ std::shared_ptr Partitioning::Default() { std::string type_name() const override { return "default"; } - Result Parse(const PartitionPathFormat& path) const override { + Result Parse(const std::string& path) const override { return compute::literal(true); } @@ -251,7 +251,7 @@ Result KeyValuePartitioning::ConvertKey(const Key& key) con } Result KeyValuePartitioning::Parse( - const PartitionPathFormat& path) const { + const std::string& path) const { std::vector expressions; ARROW_ASSIGN_OR_RAISE(auto parsed, ParseKeys(path)); @@ -377,8 +377,8 @@ DirectoryPartitioning::DirectoryPartitioning(std::shared_ptr schema, } Result> DirectoryPartitioning::ParseKeys( - const PartitionPathFormat& path) const { - std::vector segments = fs::internal::SplitAbstractPath(path.directory); + const std::string& path) const { + std::vector segments = fs::internal::SplitAbstractPath(fs::internal::GetAbstractPathParent(path).first); return ParsePartitionSegments(segments); } @@ -390,9 +390,9 @@ FilenamePartitioning::FilenamePartitioning(std::shared_ptr schema, } Result> FilenamePartitioning::ParseKeys( - const PartitionPathFormat& path) const { + const std::string& path) const { std::vector segments = fs::internal::SplitAbstractPath( - StripNonPrefix(path.filename), kFilenamePartitionSep); + StripNonPrefix(path), kFilenamePartitionSep); return ParsePartitionSegments(segments); } @@ -719,10 +719,10 @@ Result> HivePartitioning::ParseKey( } Result> HivePartitioning::ParseKeys( - const PartitionPathFormat& path) const { + const std::string& path) const { std::vector keys; - for (const auto& segment : fs::internal::SplitAbstractPath(path.directory)) { + for (const auto& segment : fs::internal::SplitAbstractPath(fs::internal::GetAbstractPathParent(path).first)) { ARROW_ASSIGN_OR_RAISE(auto maybe_key, ParseKey(segment, hive_options_)); if (auto key = maybe_key) { keys.push_back(std::move(*key)); @@ -805,18 +805,20 @@ std::shared_ptr HivePartitioning::MakeFactory( return std::shared_ptr(new HivePartitioningFactory(options)); } -PartitionPathFormat StripPrefixAndFilename(const std::string& path, - const std::string& prefix) { +std::string StripPrefix(const std::string& path, const std::string& prefix) { auto maybe_base_less = fs::internal::RemoveAncestor(prefix, path); auto base_less = maybe_base_less ? std::string(*maybe_base_less) : path; - auto basename_filename = fs::internal::GetAbstractPathParent(base_less); - return PartitionPathFormat{std::move(basename_filename.first), - std::move(basename_filename.second)}; + return base_less; } -std::vector StripPrefixAndFilename( +std::string StripPrefixAndFilename(const std::string& path, const std::string& prefix) { + auto base_less = StripPrefix(path, prefix); + auto basename_filename = fs::internal::GetAbstractPathParent(base_less); + return basename_filename.first; +} +std::vector StripPrefixAndFilename( const std::vector& paths, const std::string& prefix) { - std::vector result; + std::vector result; result.reserve(paths.size()); for (const auto& path : paths) { result.emplace_back(StripPrefixAndFilename(path, prefix)); @@ -824,9 +826,9 @@ std::vector StripPrefixAndFilename( return result; } -std::vector StripPrefixAndFilename( +std::vector StripPrefixAndFilename( const std::vector& files, const std::string& prefix) { - std::vector result; + std::vector result; result.reserve(files.size()); for (const auto& info : files) { result.emplace_back(StripPrefixAndFilename(info.path(), prefix)); @@ -835,19 +837,11 @@ std::vector StripPrefixAndFilename( } Result> PartitioningOrFactory::GetOrInferSchema( - const std::vector& paths) { + const std::vector& paths) { if (auto part = partitioning()) { return part->schema(); } std::vector partition_paths(paths.size()); - if (factory()->type_name() == "filename") { - std::transform(paths.begin(), paths.end(), std::back_inserter(partition_paths), - [](PartitionPathFormat const& path) { return path.filename; }); - } else { - std::transform(paths.begin(), paths.end(), std::back_inserter(partition_paths), - [](PartitionPathFormat const& path) { return path.directory; }); - } - return factory()->Inspect(partition_paths); } diff --git a/cpp/src/arrow/dataset/partition.h b/cpp/src/arrow/dataset/partition.h index f5368eaa788..0bfef786531 100644 --- a/cpp/src/arrow/dataset/partition.h +++ b/cpp/src/arrow/dataset/partition.h @@ -80,7 +80,7 @@ class ARROW_DS_EXPORT Partitioning { const std::shared_ptr& batch) const = 0; /// \brief Parse a path into a partition expression - virtual Result Parse(const PartitionPathFormat& path) const = 0; + virtual Result Parse(const std::string& path) const = 0; virtual Result Format(const compute::Expression& expr) const = 0; @@ -174,7 +174,7 @@ class ARROW_DS_EXPORT KeyValuePartitioning : public Partitioning { Result Partition( const std::shared_ptr& batch) const override; - Result Parse(const PartitionPathFormat& path) const override; + Result Parse(const std::string& path) const override; Result Format(const compute::Expression& expr) const override; @@ -191,7 +191,7 @@ class ARROW_DS_EXPORT KeyValuePartitioning : public Partitioning { } } - virtual Result> ParseKeys(const PartitionPathFormat& path) const = 0; + virtual Result> ParseKeys(const std::string& path) const = 0; virtual Result FormatValues(const ScalarVector& values) const = 0; @@ -231,7 +231,7 @@ class ARROW_DS_EXPORT DirectoryPartitioning : public KeyValuePartitioning { std::vector field_names, PartitioningFactoryOptions = {}); private: - Result> ParseKeys(const PartitionPathFormat& path) const override; + Result> ParseKeys(const std::string& path) const override; Result FormatValues(const ScalarVector& values) const override; }; @@ -288,7 +288,7 @@ class ARROW_DS_EXPORT HivePartitioning : public KeyValuePartitioning { private: const HivePartitioningOptions hive_options_; - Result> ParseKeys(const PartitionPathFormat& path) const override; + Result> ParseKeys(const std::string& path) const override; Result FormatValues(const ScalarVector& values) const override; }; @@ -310,8 +310,8 @@ class ARROW_DS_EXPORT FunctionPartitioning : public Partitioning { std::string type_name() const override { return name_; } - Result Parse(const PartitionPathFormat& path) const override { - return parse_impl_(path.directory); + Result Parse(const std::string& path) const override { + return parse_impl_(path); } Result Format(const compute::Expression& expr) const override { @@ -353,24 +353,26 @@ class ARROW_DS_EXPORT FilenamePartitioning : public KeyValuePartitioning { std::vector field_names, PartitioningFactoryOptions = {}); private: - Result> ParseKeys(const PartitionPathFormat& path) const override; + Result> ParseKeys(const std::string& path) const override; Result FormatValues(const ScalarVector& values) const override; }; +ARROW_DS_EXPORT std::string StripPrefix(const std::string& path, const std::string& prefix); + /// \brief Extracts the directory and filename and removes the prefix of a path /// /// e.g., `StripPrefixAndFilename("/data/year=2019/c.txt", "/data") -> /// {"year=2019","c.txt"}` -ARROW_DS_EXPORT PartitionPathFormat StripPrefixAndFilename(const std::string& path, +ARROW_DS_EXPORT std::string StripPrefixAndFilename(const std::string& path, const std::string& prefix); /// \brief Vector version of StripPrefixAndFilename. -ARROW_DS_EXPORT std::vector StripPrefixAndFilename( +ARROW_DS_EXPORT std::vector StripPrefixAndFilename( const std::vector& paths, const std::string& prefix); /// \brief Vector version of StripPrefixAndFilename. -ARROW_DS_EXPORT std::vector StripPrefixAndFilename( +ARROW_DS_EXPORT std::vector StripPrefixAndFilename( const std::vector& files, const std::string& prefix); /// \brief Either a Partitioning or a PartitioningFactory @@ -398,7 +400,7 @@ class ARROW_DS_EXPORT PartitioningOrFactory { /// \brief Get the partition schema, inferring it with the given factory if needed. Result> GetOrInferSchema( - const std::vector& paths); + const std::vector& paths); private: std::shared_ptr factory_; diff --git a/cpp/src/arrow/dataset/partition_test.cc b/cpp/src/arrow/dataset/partition_test.cc index d872094ef89..1198dd9486b 100644 --- a/cpp/src/arrow/dataset/partition_test.cc +++ b/cpp/src/arrow/dataset/partition_test.cc @@ -43,11 +43,11 @@ namespace dataset { class TestPartitioning : public ::testing::Test { public: - void AssertParseError(const PartitionPathFormat& path) { + void AssertParseError(const std::string& path) { ASSERT_RAISES(Invalid, partitioning_->Parse(path)); } - void AssertParse(const PartitionPathFormat& path, compute::Expression expected) { + void AssertParse(const std::string& path, compute::Expression expected) { ASSERT_OK_AND_ASSIGN(auto parsed, partitioning_->Parse(path)); ASSERT_EQ(parsed, expected); } @@ -64,11 +64,14 @@ class TestPartitioning : public ::testing::Test { ASSERT_OK_AND_ASSIGN(auto formatted, partitioning_->Format(expr)); ASSERT_EQ(formatted.directory, expected_directory); ASSERT_EQ(formatted.filename, expected_filename); - + + // if ((formatted.filename).empty()){ + // formatted.filename = "format.parquet"; + // } // ensure the formatted path round trips the relevant components of the partition // expression: roundtripped should be a subset of expr ASSERT_OK_AND_ASSIGN(compute::Expression roundtripped, - partitioning_->Parse(formatted)); + partitioning_->Parse(formatted.directory+formatted.filename)); ASSERT_OK_AND_ASSIGN(roundtripped, roundtripped.Bind(*written_schema_)); ASSERT_OK_AND_ASSIGN(auto simplified, SimplifyWithGuarantee(roundtripped, expr)); ASSERT_EQ(simplified, literal(true)); @@ -187,20 +190,20 @@ TEST_F(TestPartitioning, DirectoryPartitioning) { partitioning_ = std::make_shared( schema({field("alpha", int32()), field("beta", utf8())})); - AssertParse({"/0/hello", ""}, and_(equal(field_ref("alpha"), literal(0)), + AssertParse("/0/hello", and_(equal(field_ref("alpha"), literal(0)), equal(field_ref("beta"), literal("hello")))); - AssertParse({"/3", ""}, equal(field_ref("alpha"), literal(3))); - AssertParseError({"/world/0", ""}); // reversed order - AssertParseError({"/0.0/foo", ""}); // invalid alpha - AssertParseError({"/3.25", ""}); // invalid alpha with missing beta - AssertParse({"", ""}, literal(true)); // no segments to parse + AssertParse("/3", equal(field_ref("alpha"), literal(3))); + AssertParseError("/world/0"); // reversed order + AssertParseError("/0.0/foo"); // invalid alpha + AssertParseError("/3.25"); // invalid alpha with missing beta + AssertParse("", literal(true)); // no segments to parse // gotcha someday: - AssertParse({"/0/dat.parquet", ""}, + AssertParse("/0/dat.parquet", and_(equal(field_ref("alpha"), literal(0)), equal(field_ref("beta"), literal("dat.parquet")))); - AssertParse({"/0/foo/ignored=2341", ""}, + AssertParse("/0/foo/ignored=2341", and_(equal(field_ref("alpha"), literal(0)), equal(field_ref("beta"), literal("foo")))); } @@ -209,15 +212,15 @@ TEST_F(TestPartitioning, FilenamePartitioning) { partitioning_ = std::make_shared( schema({field("alpha", int32()), field("beta", utf8())})); - AssertParse({"", "0_hello_"}, and_(equal(field_ref("alpha"), literal(0)), + AssertParse("0_hello_", and_(equal(field_ref("alpha"), literal(0)), equal(field_ref("beta"), literal("hello")))); - AssertParse({"", "0_"}, equal(field_ref("alpha"), literal(0))); - AssertParseError({"", "world_0_"}); // reversed order - AssertParseError({"", "0.0_foo_"}); // invalid alpha - AssertParseError({"", "3.25_"}); // invalid alpha with missing beta - AssertParse({"", ""}, literal(true)); // no segments to parse + AssertParse("0_", equal(field_ref("alpha"), literal(0))); + AssertParseError("world_0_"); // reversed order + AssertParseError("0.0_foo_"); // invalid alpha + AssertParseError("3.25_"); // invalid alpha with missing beta + AssertParse("", literal(true)); // no segments to parse - AssertParse({"", "0_foo_ignored=2341"}, and_(equal(field_ref("alpha"), literal(0)), + AssertParse("0_foo_ignored=2341", and_(equal(field_ref("alpha"), literal(0)), equal(field_ref("beta"), literal("foo")))); } @@ -284,7 +287,7 @@ TEST_F(TestPartitioning, DirectoryPartitioningWithTemporal) { schema({field("year", int32()), field("month", int8()), field("day", temporal)})); ASSERT_OK_AND_ASSIGN(auto day, StringScalar("2020-06-08").CastTo(temporal)); - AssertParse({"/2020/06/2020-06-08", ""}, + AssertParse("/2020/06/2020-06-08", and_({equal(field_ref("year"), literal(2020)), equal(field_ref("month"), literal(6)), equal(field_ref("day"), literal(day))})); @@ -386,10 +389,10 @@ TEST_F(TestPartitioning, DictionaryHasUniqueValues) { std::make_shared(index_and_dictionary, alpha->type()); auto path = "/" + expected_dictionary->GetString(i); - AssertParse({path, ""}, equal(field_ref("alpha"), literal(dictionary_scalar))); + AssertParse(path, equal(field_ref("alpha"), literal(dictionary_scalar))); } - AssertParseError({"/yosemite", ""}); // not in inspected dictionary + AssertParseError("/yosemite"); // not in inspected dictionary } TEST_F(TestPartitioning, DiscoverSchemaSegfault) { @@ -402,28 +405,28 @@ TEST_F(TestPartitioning, HivePartitioning) { partitioning_ = std::make_shared( schema({field("alpha", int32()), field("beta", float32())}), ArrayVector(), "xyz"); - AssertParse({"/alpha=0/beta=3.25", ""}, and_(equal(field_ref("alpha"), literal(0)), + AssertParse("/alpha=0/beta=3.25", and_(equal(field_ref("alpha"), literal(0)), equal(field_ref("beta"), literal(3.25f)))); - AssertParse({"/beta=3.25/alpha=0", ""}, and_(equal(field_ref("beta"), literal(3.25f)), + AssertParse("/beta=3.25/alpha=0", and_(equal(field_ref("beta"), literal(3.25f)), equal(field_ref("alpha"), literal(0)))); - AssertParse({"/alpha=0", ""}, equal(field_ref("alpha"), literal(0))); + AssertParse("/alpha=0", equal(field_ref("alpha"), literal(0))); AssertParse( - {"/alpha=xyz/beta=3.25", ""}, + "/alpha=xyz/beta=3.25", and_(is_null(field_ref("alpha")), equal(field_ref("beta"), literal(3.25f)))); - AssertParse({"/beta=3.25", ""}, equal(field_ref("beta"), literal(3.25f))); - AssertParse({"", ""}, literal(true)); + AssertParse("/beta=3.25", equal(field_ref("beta"), literal(3.25f))); + AssertParse("", literal(true)); - AssertParse({"/alpha=0/beta=3.25/ignored=2341", ""}, + AssertParse("/alpha=0/beta=3.25/ignored=2341", and_(equal(field_ref("alpha"), literal(0)), equal(field_ref("beta"), literal(3.25f)))); - AssertParse({"/alpha=0/beta=3.25/ignored=2341", ""}, + AssertParse("/alpha=0/beta=3.25/ignored=2341", and_(equal(field_ref("alpha"), literal(0)), equal(field_ref("beta"), literal(3.25f)))); - AssertParse({"/ignored=2341", ""}, literal(true)); + AssertParse("/ignored=2341", literal(true)); - AssertParseError({"/alpha=0.0/beta=3.25", ""}); // conversion of "0.0" to int32 fails + AssertParseError("/alpha=0.0/beta=3.25"); // conversion of "0.0" to int32 fails } TEST_F(TestPartitioning, HivePartitioningFormat) { @@ -561,10 +564,10 @@ TEST_F(TestPartitioning, HiveDictionaryHasUniqueValues) { std::make_shared(index_and_dictionary, alpha->type()); auto path = "/alpha=" + expected_dictionary->GetString(i); - AssertParse({path, ""}, equal(field_ref("alpha"), literal(dictionary_scalar))); + AssertParse(path, equal(field_ref("alpha"), literal(dictionary_scalar))); } - AssertParseError({"/alpha=yosemite", ""}); // not in inspected dictionary + AssertParseError("/alpha=yosemite"); // not in inspected dictionary } TEST_F(TestPartitioning, ExistingSchemaDirectory) { @@ -665,7 +668,7 @@ TEST_F(TestPartitioning, UrlEncodedDirectory) { auto date = std::make_shared(1620086400, ts); auto time = std::make_shared(1620113220, ts); partitioning_ = std::make_shared(options.schema, ArrayVector()); - AssertParse({"/2021-05-04 00%3A00%3A00/2021-05-04 07%3A27%3A00/%24", ""}, + AssertParse("/2021-05-04 00%3A00%3A00/2021-05-04 07%3A27%3A00/%24", and_({equal(field_ref("date"), literal(date)), equal(field_ref("time"), literal(time)), equal(field_ref("str"), literal("$"))})); @@ -674,7 +677,7 @@ TEST_F(TestPartitioning, UrlEncodedDirectory) { EXPECT_RAISES_WITH_MESSAGE_THAT(Invalid, ::testing::HasSubstr("was not valid UTF-8"), factory_->Inspect({"/%AF/%BF/%CF"})); EXPECT_RAISES_WITH_MESSAGE_THAT(Invalid, ::testing::HasSubstr("was not valid UTF-8"), - partitioning_->Parse({"/%AF/%BF/%CF", ""})); + partitioning_->Parse("/%AF/%BF/%CF")); options.segment_encoding = SegmentEncoding::None; options.schema = @@ -685,7 +688,7 @@ TEST_F(TestPartitioning, UrlEncodedDirectory) { options.schema->fields()); partitioning_ = std::make_shared( options.schema, ArrayVector(), options.AsPartitioningOptions()); - AssertParse({"/2021-05-04 00%3A00%3A00/2021-05-04 07%3A27%3A00/%24", ""}, + AssertParse("/2021-05-04 00%3A00%3A00/2021-05-04 07%3A27%3A00/%24", and_({equal(field_ref("date"), literal("2021-05-04 00%3A00%3A00")), equal(field_ref("time"), literal("2021-05-04 07%3A27%3A00")), equal(field_ref("str"), literal("%24"))})); @@ -708,16 +711,16 @@ TEST_F(TestPartitioning, UrlEncodedHive) { auto time = std::make_shared(1620113220, ts); partitioning_ = std::make_shared(options.schema, ArrayVector(), options.AsHivePartitioningOptions()); - AssertParse({"/date=2021-05-04 00:00:00/time=2021-05-04 07:27:00/str=$", ""}, + AssertParse("/date=2021-05-04 00:00:00/time=2021-05-04 07:27:00/str=$", and_({equal(field_ref("date"), literal(date)), equal(field_ref("time"), literal(time)), is_null(field_ref("str"))})); AssertParse( - {"/date=2021-05-04 00:00:00/time=2021-05-04 07:27:00/str=%E3%81%8F%E3%81%BE", ""}, + "/date=2021-05-04 00:00:00/time=2021-05-04 07:27:00/str=%E3%81%8F%E3%81%BE", and_({equal(field_ref("date"), literal(date)), equal(field_ref("time"), literal(time)), equal(field_ref("str"), literal("\xE3\x81\x8F\xE3\x81\xBE"))})); // URL-encoded null fallback value - AssertParse({"/date=2021-05-04 00%3A00%3A00/time=2021-05-04 07%3A27%3A00/str=%24", ""}, + AssertParse("/date=2021-05-04 00%3A00%3A00/time=2021-05-04 07%3A27%3A00/str=%24", and_({equal(field_ref("date"), literal(date)), equal(field_ref("time"), literal(time)), is_null(field_ref("str"))})); @@ -726,7 +729,7 @@ TEST_F(TestPartitioning, UrlEncodedHive) { factory_->Inspect({"/date=%AF/time=%BF/str=%CF"})); EXPECT_RAISES_WITH_MESSAGE_THAT( Invalid, ::testing::HasSubstr("was not valid UTF-8"), - partitioning_->Parse({"/date=%AF/time=%BF/str=%CF", ""})); + partitioning_->Parse("/date=%AF/time=%BF/str=%CF")); options.segment_encoding = SegmentEncoding::None; options.schema = @@ -739,7 +742,7 @@ TEST_F(TestPartitioning, UrlEncodedHive) { options.schema->fields()); partitioning_ = std::make_shared(options.schema, ArrayVector(), options.AsHivePartitioningOptions()); - AssertParse({"/date=2021-05-04 00%3A00%3A00/time=2021-05-04 07%3A27%3A00/str=%24", ""}, + AssertParse("/date=2021-05-04 00%3A00%3A00/time=2021-05-04 07%3A27%3A00/str=%24", and_({equal(field_ref("date"), literal("2021-05-04 00%3A00%3A00")), equal(field_ref("time"), literal("2021-05-04 07%3A27%3A00")), equal(field_ref("str"), literal("%24"))})); @@ -749,7 +752,7 @@ TEST_F(TestPartitioning, UrlEncodedHive) { factory_->Inspect({"/date=\xAF/time=\xBF/str=\xCF"})); EXPECT_RAISES_WITH_MESSAGE_THAT( Invalid, ::testing::HasSubstr("was not valid UTF-8"), - partitioning_->Parse({"/date=\xAF/time=\xBF/str=\xCF", ""})); + partitioning_->Parse("/date=\xAF/time=\xBF/str=\xCF")); } TEST_F(TestPartitioning, UrlEncodedHiveWithKeyEncoded) { @@ -773,22 +776,19 @@ TEST_F(TestPartitioning, UrlEncodedHiveWithKeyEncoded) { partitioning_ = std::make_shared(options.schema, ArrayVector(), options.AsHivePartitioningOptions()); AssertParse( - {"/test%27%3B%20date=2021-05-04 00:00:00/test%27%3B%20time=2021-05-04 " + "/test%27%3B%20date=2021-05-04 00:00:00/test%27%3B%20time=2021-05-04 " "07:27:00/str=$", - ""}, and_({equal(field_ref("test'; date"), literal(date)), equal(field_ref("test'; time"), literal(time)), is_null(field_ref("str"))})); - AssertParse({"/test%27%3B%20date=2021-05-04 00:00:00/test%27%3B%20time=2021-05-04 " + AssertParse("/test%27%3B%20date=2021-05-04 00:00:00/test%27%3B%20time=2021-05-04 " "07:27:00/str=%E3%81%8F%E3%81%BE", - ""}, and_({equal(field_ref("test'; date"), literal(date)), equal(field_ref("test'; time"), literal(time)), equal(field_ref("str"), literal("\xE3\x81\x8F\xE3\x81\xBE"))})); // URL-encoded null fallback value AssertParse( - {"/test%27%3B%20date=2021-05-04 00%3A00%3A00/test%27%3B%20time=2021-05-04 " + "/test%27%3B%20date=2021-05-04 00%3A00%3A00/test%27%3B%20time=2021-05-04 " "07%3A27%3A00/str=%24", - ""}, and_({equal(field_ref("test'; date"), literal(date)), equal(field_ref("test'; time"), literal(time)), is_null(field_ref("str"))})); @@ -798,7 +798,7 @@ TEST_F(TestPartitioning, UrlEncodedHiveWithKeyEncoded) { factory_->Inspect({"/%AF=2021-05-04/time=2021-05-04 07%3A27%3A00/str=%24"})); EXPECT_RAISES_WITH_MESSAGE_THAT( Invalid, ::testing::HasSubstr("was not valid UTF-8"), - partitioning_->Parse({"/%AF=2021-05-04/%BF=2021-05-04 07%3A27%3A00/str=%24", ""})); + partitioning_->Parse("/%AF=2021-05-04/%BF=2021-05-04 07%3A27%3A00/str=%24")); } TEST_F(TestPartitioning, EtlThenHive) { @@ -823,18 +823,18 @@ TEST_F(TestPartitioning, EtlThenHive) { auto etl_segments_end = segments.begin() + etl_fields.size(); auto etl_path = fs::internal::JoinAbstractPath(segments.begin(), etl_segments_end); - ARROW_ASSIGN_OR_RAISE(auto etl_expr, etl_part.Parse({etl_path, ""})); + ARROW_ASSIGN_OR_RAISE(auto etl_expr, etl_part.Parse(etl_path)); auto alphabeta_segments_end = etl_segments_end + alphabeta_fields.size(); auto alphabeta_path = fs::internal::JoinAbstractPath(etl_segments_end, alphabeta_segments_end); ARROW_ASSIGN_OR_RAISE(auto alphabeta_expr, - alphabeta_part.Parse({alphabeta_path, ""})); + alphabeta_part.Parse(alphabeta_path)); return and_(etl_expr, alphabeta_expr); }); - AssertParse({"/1999/12/31/00/alpha=0/beta=3.25", ""}, + AssertParse("/1999/12/31/00/alpha=0/beta=3.25", and_({equal(field_ref("year"), literal(1999)), equal(field_ref("month"), literal(12)), equal(field_ref("day"), literal(31)), @@ -842,7 +842,7 @@ TEST_F(TestPartitioning, EtlThenHive) { and_(equal(field_ref("alpha"), literal(0)), equal(field_ref("beta"), literal(3.25f)))})); - AssertParseError({"/20X6/03/21/05/alpha=0/beta=3.25", ""}); + AssertParseError("/20X6/03/21/05/alpha=0/beta=3.25"); } TEST_F(TestPartitioning, Set) { @@ -883,9 +883,9 @@ TEST_F(TestPartitioning, Set) { auto x_in = [&](std::vector set) { return call("is_in", {field_ref("x")}, compute::SetLookupOptions{ints(set)}); }; - AssertParse({"/x in [1]", ""}, x_in({1})); - AssertParse({"/x in [1 4 5]", ""}, x_in({1, 4, 5})); - AssertParse({"/x in []", ""}, x_in({})); + AssertParse("/x in [1]", x_in({1})); + AssertParse("/x in [1 4 5]", x_in({1, 4, 5})); + AssertParse("/x in []", x_in({})); } // An adhoc partitioning which parses segments like "/x=[-3.25, 0.0)" @@ -896,11 +896,11 @@ class RangePartitioning : public Partitioning { std::string type_name() const override { return "range"; } - Result Parse(const PartitionPathFormat& path) const override { + Result Parse(const std::string& path) const override { std::vector ranges; HivePartitioningOptions options; - for (auto segment : fs::internal::SplitAbstractPath(path.directory)) { + for (auto segment : fs::internal::SplitAbstractPath(path)) { ARROW_ASSIGN_OR_RAISE(auto key, HivePartitioning::ParseKey(segment, options)); if (!key) { return Status::Invalid("can't parse '", segment, "' as a range"); @@ -955,7 +955,7 @@ TEST_F(TestPartitioning, Range) { partitioning_ = std::make_shared( schema({field("x", float64()), field("y", float64()), field("z", float64())})); - AssertParse({"/x=[-1.5 0.0)/y=[0.0 1.5)/z=(1.5 3.0]", ""}, + AssertParse("/x=[-1.5 0.0)/y=[0.0 1.5)/z=(1.5 3.0]", and_({and_(greater_equal(field_ref("x"), literal(-1.5)), less(field_ref("x"), literal(0.0))), and_(greater_equal(field_ref("y"), literal(0.0)), @@ -965,21 +965,18 @@ TEST_F(TestPartitioning, Range) { } TEST(TestStripPrefixAndFilename, Basic) { - ASSERT_EQ(StripPrefixAndFilename("", "").directory, ""); - ASSERT_EQ(StripPrefixAndFilename("a.csv", "").directory, ""); - ASSERT_EQ(StripPrefixAndFilename("a/b.csv", "").directory, "a"); - ASSERT_EQ(StripPrefixAndFilename("/a/b/c.csv", "/a").directory, "b"); - ASSERT_EQ(StripPrefixAndFilename("/a/b/c/d.csv", "/a").directory, "b/c"); - ASSERT_EQ(StripPrefixAndFilename("/a/b/c.csv", "/a/b").directory, ""); + ASSERT_EQ(StripPrefixAndFilename("", ""), ""); + ASSERT_EQ(StripPrefixAndFilename("a.csv", ""), ""); + ASSERT_EQ(StripPrefixAndFilename("a/b.csv", ""), "a"); + ASSERT_EQ(StripPrefixAndFilename("/a/b/c.csv", "/a"), "b"); + ASSERT_EQ(StripPrefixAndFilename("/a/b/c/d.csv", "/a"), "b/c"); + ASSERT_EQ(StripPrefixAndFilename("/a/b/c.csv", "/a/b"), ""); std::vector input{"/data/year=2019/file.parquet", "/data/year=2019/month=12/file.parquet", "/data/year=2019/month=12/day=01/file.parquet"}; auto paths = StripPrefixAndFilename(input, "/data"); - std::vector path_directories; - std::transform(paths.begin(), paths.end(), std::back_inserter(path_directories), - [](PartitionPathFormat const& path) { return path.directory; }); - EXPECT_THAT(path_directories, testing::ElementsAre("year=2019", "year=2019/month=12", + EXPECT_THAT(paths, testing::ElementsAre("year=2019", "year=2019/month=12", "year=2019/month=12/day=01")); } From a524acca1f0dcd689bb4115ec7c375ba001dfb3f Mon Sep 17 00:00:00 2001 From: Sanjiban Sengupta Date: Mon, 23 May 2022 12:57:20 +0530 Subject: [PATCH 20/24] feat: [Python] Parse method to take the entire path --- python/pyarrow/_dataset.pyx | 6 ++---- python/pyarrow/includes/libarrow_dataset.pxd | 6 +----- 2 files changed, 3 insertions(+), 9 deletions(-) diff --git a/python/pyarrow/_dataset.pyx b/python/pyarrow/_dataset.pyx index d75caeacdc4..7e3fe0bfe21 100644 --- a/python/pyarrow/_dataset.pyx +++ b/python/pyarrow/_dataset.pyx @@ -1319,11 +1319,9 @@ cdef class Partitioning(_Weakrefable): cdef inline shared_ptr[CPartitioning] unwrap(self): return self.wrapped - def parse(self, directory="", filename=""): + def parse(self, path): cdef CResult[CExpression] result - cdef CPartitionPathFormat path - path.directory = tobytes(directory) - path.filename = tobytes(filename) + path = tobytes(path) result = self.partitioning.Parse(path) return Expression.wrap(GetResultValue(result)) diff --git a/python/pyarrow/includes/libarrow_dataset.pxd b/python/pyarrow/includes/libarrow_dataset.pxd index 8a831a3a734..64e59e05613 100644 --- a/python/pyarrow/includes/libarrow_dataset.pxd +++ b/python/pyarrow/includes/libarrow_dataset.pxd @@ -277,13 +277,9 @@ cdef extern from "arrow/dataset/api.h" namespace "arrow::dataset" nogil: CCSVConvertOptions convert_options CCSVReadOptions read_options - cdef cppclass CPartitionPathFormat "arrow::dataset::PartitionPathFormat": - c_string directory - c_string filename - cdef cppclass CPartitioning "arrow::dataset::Partitioning": c_string type_name() const - CResult[CExpression] Parse(const CPartitionPathFormat & path) const + CResult[CExpression] Parse(const c_string & path) const const shared_ptr[CSchema] & schema() cdef cppclass CSegmentEncoding" arrow::dataset::SegmentEncoding": From e08b2e5f704285dbac47a916be3543899cc78196 Mon Sep 17 00:00:00 2001 From: Sanjiban Sengupta Date: Mon, 23 May 2022 14:19:03 +0530 Subject: [PATCH 21/24] fix: python test changes for new parse method signature --- cpp/src/arrow/dataset/file_parquet.cc | 2 +- cpp/src/arrow/dataset/partition.cc | 3 +-- python/pyarrow/_dataset.pyx | 3 +-- python/pyarrow/tests/test_dataset.py | 14 +++++++------- 4 files changed, 10 insertions(+), 12 deletions(-) diff --git a/cpp/src/arrow/dataset/file_parquet.cc b/cpp/src/arrow/dataset/file_parquet.cc index 0ab72a877ba..0361520892c 100644 --- a/cpp/src/arrow/dataset/file_parquet.cc +++ b/cpp/src/arrow/dataset/file_parquet.cc @@ -894,7 +894,7 @@ ParquetDatasetFactory::CollectParquetFragments(const Partitioning& partitioning) auto row_groups = Iota(metadata_subset->num_row_groups()); auto partition_expression = - partitioning.Parse(StripPrefixAndFilename(path, options_.partition_base_dir)) + partitioning.Parse(StripPrefix(path, options_.partition_base_dir)) .ValueOr(compute::literal(true)); ARROW_ASSIGN_OR_RAISE( diff --git a/cpp/src/arrow/dataset/partition.cc b/cpp/src/arrow/dataset/partition.cc index 4c0426e579c..73d4f4b433f 100644 --- a/cpp/src/arrow/dataset/partition.cc +++ b/cpp/src/arrow/dataset/partition.cc @@ -841,8 +841,7 @@ Result> PartitioningOrFactory::GetOrInferSchema( if (auto part = partitioning()) { return part->schema(); } - std::vector partition_paths(paths.size()); - return factory()->Inspect(partition_paths); + return factory()->Inspect(paths); } } // namespace dataset diff --git a/python/pyarrow/_dataset.pyx b/python/pyarrow/_dataset.pyx index 7e3fe0bfe21..10aeafbdbaa 100644 --- a/python/pyarrow/_dataset.pyx +++ b/python/pyarrow/_dataset.pyx @@ -1321,8 +1321,7 @@ cdef class Partitioning(_Weakrefable): def parse(self, path): cdef CResult[CExpression] result - path = tobytes(path) - result = self.partitioning.Parse(path) + result = self.partitioning.Parse(tobytes(path)) return Expression.wrap(GetResultValue(result)) @property diff --git a/python/pyarrow/tests/test_dataset.py b/python/pyarrow/tests/test_dataset.py index ca9ed3f2a1d..288301fd4b4 100644 --- a/python/pyarrow/tests/test_dataset.py +++ b/python/pyarrow/tests/test_dataset.py @@ -582,7 +582,7 @@ def test_partitioning(): ) assert len(partitioning.dictionaries) == 2 assert all(x is None for x in partitioning.dictionaries) - expr = partitioning.parse('/3/3.14') + expr = partitioning.parse('/3/3.14/') assert isinstance(expr, ds.Expression) expected = (ds.field('group') == 3) & (ds.field('key') == 3.14) @@ -591,7 +591,7 @@ def test_partitioning(): with pytest.raises(pa.ArrowInvalid): partitioning.parse('/prefix/3/aaa') - expr = partitioning.parse('/3') + expr = partitioning.parse('/3/') expected = ds.field('group') == 3 assert expr.equals(expected) @@ -604,20 +604,20 @@ def test_partitioning(): ) assert len(partitioning.dictionaries) == 2 assert all(x is None for x in partitioning.dictionaries) - expr = partitioning.parse('/alpha=0/beta=3') + expr = partitioning.parse('/alpha=0/beta=3/') expected = ( (ds.field('alpha') == ds.scalar(0)) & (ds.field('beta') == ds.scalar(3)) ) assert expr.equals(expected) - expr = partitioning.parse('/alpha=xyz/beta=3') + expr = partitioning.parse('/alpha=xyz/beta=3/') expected = ( (ds.field('alpha').is_null() & (ds.field('beta') == ds.scalar(3))) ) assert expr.equals(expected) - for shouldfail in ['/alpha=one/beta=2', '/alpha=one', '/beta=two']: + for shouldfail in ['/alpha=one/beta=2/', '/alpha=one/', '/beta=two/']: with pytest.raises(pa.ArrowInvalid): partitioning.parse(shouldfail) @@ -629,14 +629,14 @@ def test_partitioning(): ) assert len(partitioning.dictionaries) == 2 assert all(x is None for x in partitioning.dictionaries) - expr = partitioning.parse('', '3_3.14_') + expr = partitioning.parse('3_3.14_') assert isinstance(expr, ds.Expression) expected = (ds.field('group') == 3) & (ds.field('key') == 3.14) assert expr.equals(expected) with pytest.raises(pa.ArrowInvalid): - partitioning.parse('', 'prefix_3_aaa_') + partitioning.parse('prefix_3_aaa_') partitioning = ds.DirectoryPartitioning( pa.schema([ From 1a61b0ca1891471d2d277797b37e44e0ca4d5134 Mon Sep 17 00:00:00 2001 From: Sanjiban Sengupta Date: Mon, 23 May 2022 14:34:04 +0530 Subject: [PATCH 22/24] fix: cpp & python lint --- cpp/src/arrow/dataset/file_base.cc | 10 ++-- cpp/src/arrow/dataset/file_parquet.cc | 5 +- cpp/src/arrow/dataset/partition.cc | 22 ++++---- cpp/src/arrow/dataset/partition.h | 8 +-- cpp/src/arrow/dataset/partition_test.cc | 68 +++++++++++-------------- 5 files changed, 54 insertions(+), 59 deletions(-) diff --git a/cpp/src/arrow/dataset/file_base.cc b/cpp/src/arrow/dataset/file_base.cc index ef5b7dbde4e..2e05706bbbc 100644 --- a/cpp/src/arrow/dataset/file_base.cc +++ b/cpp/src/arrow/dataset/file_base.cc @@ -270,11 +270,11 @@ Future<> FileWriter::Finish() { namespace { -Status WriteBatch(std::shared_ptr batch, compute::Expression guarantee, - FileSystemDatasetWriteOptions write_options, - std::function, - const PartitionPathFormat&)> - write) { +Status WriteBatch( + std::shared_ptr batch, compute::Expression guarantee, + FileSystemDatasetWriteOptions write_options, + std::function, const PartitionPathFormat&)> + write) { ARROW_ASSIGN_OR_RAISE(auto groups, write_options.partitioning->Partition(batch)); batch.reset(); // drop to hopefully conserve memory diff --git a/cpp/src/arrow/dataset/file_parquet.cc b/cpp/src/arrow/dataset/file_parquet.cc index 0361520892c..f2a3903208f 100644 --- a/cpp/src/arrow/dataset/file_parquet.cc +++ b/cpp/src/arrow/dataset/file_parquet.cc @@ -920,9 +920,8 @@ Result>> ParquetDatasetFactory::InspectSchem size_t i = 0; for (const auto& e : paths_with_row_group_ids_) { - stripped[i++] = - StripPrefixAndFilename(e.first, options_.partition_base_dir); - } + stripped[i++] = StripPrefixAndFilename(e.first, options_.partition_base_dir); + } ARROW_ASSIGN_OR_RAISE(auto partition_schema, factory->Inspect(stripped)); schemas.push_back(std::move(partition_schema)); diff --git a/cpp/src/arrow/dataset/partition.cc b/cpp/src/arrow/dataset/partition.cc index 73d4f4b433f..d37dfd97d39 100644 --- a/cpp/src/arrow/dataset/partition.cc +++ b/cpp/src/arrow/dataset/partition.cc @@ -250,8 +250,7 @@ Result KeyValuePartitioning::ConvertKey(const Key& key) con compute::literal(std::move(converted))); } -Result KeyValuePartitioning::Parse( - const std::string& path) const { +Result KeyValuePartitioning::Parse(const std::string& path) const { std::vector expressions; ARROW_ASSIGN_OR_RAISE(auto parsed, ParseKeys(path)); @@ -378,7 +377,8 @@ DirectoryPartitioning::DirectoryPartitioning(std::shared_ptr schema, Result> DirectoryPartitioning::ParseKeys( const std::string& path) const { - std::vector segments = fs::internal::SplitAbstractPath(fs::internal::GetAbstractPathParent(path).first); + std::vector segments = + fs::internal::SplitAbstractPath(fs::internal::GetAbstractPathParent(path).first); return ParsePartitionSegments(segments); } @@ -391,8 +391,8 @@ FilenamePartitioning::FilenamePartitioning(std::shared_ptr schema, Result> FilenamePartitioning::ParseKeys( const std::string& path) const { - std::vector segments = fs::internal::SplitAbstractPath( - StripNonPrefix(path), kFilenamePartitionSep); + std::vector segments = + fs::internal::SplitAbstractPath(StripNonPrefix(path), kFilenamePartitionSep); return ParsePartitionSegments(segments); } @@ -722,7 +722,8 @@ Result> HivePartitioning::ParseKeys( const std::string& path) const { std::vector keys; - for (const auto& segment : fs::internal::SplitAbstractPath(fs::internal::GetAbstractPathParent(path).first)) { + for (const auto& segment : + fs::internal::SplitAbstractPath(fs::internal::GetAbstractPathParent(path).first)) { ARROW_ASSIGN_OR_RAISE(auto maybe_key, ParseKey(segment, hive_options_)); if (auto key = maybe_key) { keys.push_back(std::move(*key)); @@ -816,8 +817,9 @@ std::string StripPrefixAndFilename(const std::string& path, const std::string& p auto basename_filename = fs::internal::GetAbstractPathParent(base_less); return basename_filename.first; } -std::vector StripPrefixAndFilename( - const std::vector& paths, const std::string& prefix) { + +std::vector StripPrefixAndFilename(const std::vector& paths, + const std::string& prefix) { std::vector result; result.reserve(paths.size()); for (const auto& path : paths) { @@ -826,8 +828,8 @@ std::vector StripPrefixAndFilename( return result; } -std::vector StripPrefixAndFilename( - const std::vector& files, const std::string& prefix) { +std::vector StripPrefixAndFilename(const std::vector& files, + const std::string& prefix) { std::vector result; result.reserve(files.size()); for (const auto& info : files) { diff --git a/cpp/src/arrow/dataset/partition.h b/cpp/src/arrow/dataset/partition.h index 0bfef786531..cd225f13787 100644 --- a/cpp/src/arrow/dataset/partition.h +++ b/cpp/src/arrow/dataset/partition.h @@ -358,14 +358,15 @@ class ARROW_DS_EXPORT FilenamePartitioning : public KeyValuePartitioning { Result FormatValues(const ScalarVector& values) const override; }; -ARROW_DS_EXPORT std::string StripPrefix(const std::string& path, const std::string& prefix); +ARROW_DS_EXPORT std::string StripPrefix(const std::string& path, + const std::string& prefix); /// \brief Extracts the directory and filename and removes the prefix of a path /// /// e.g., `StripPrefixAndFilename("/data/year=2019/c.txt", "/data") -> /// {"year=2019","c.txt"}` ARROW_DS_EXPORT std::string StripPrefixAndFilename(const std::string& path, - const std::string& prefix); + const std::string& prefix); /// \brief Vector version of StripPrefixAndFilename. ARROW_DS_EXPORT std::vector StripPrefixAndFilename( @@ -399,8 +400,7 @@ class ARROW_DS_EXPORT PartitioningOrFactory { const std::shared_ptr& factory() const { return factory_; } /// \brief Get the partition schema, inferring it with the given factory if needed. - Result> GetOrInferSchema( - const std::vector& paths); + Result> GetOrInferSchema(const std::vector& paths); private: std::shared_ptr factory_; diff --git a/cpp/src/arrow/dataset/partition_test.cc b/cpp/src/arrow/dataset/partition_test.cc index 1198dd9486b..b023b1d5bc5 100644 --- a/cpp/src/arrow/dataset/partition_test.cc +++ b/cpp/src/arrow/dataset/partition_test.cc @@ -64,14 +64,14 @@ class TestPartitioning : public ::testing::Test { ASSERT_OK_AND_ASSIGN(auto formatted, partitioning_->Format(expr)); ASSERT_EQ(formatted.directory, expected_directory); ASSERT_EQ(formatted.filename, expected_filename); - + // if ((formatted.filename).empty()){ // formatted.filename = "format.parquet"; // } // ensure the formatted path round trips the relevant components of the partition // expression: roundtripped should be a subset of expr ASSERT_OK_AND_ASSIGN(compute::Expression roundtripped, - partitioning_->Parse(formatted.directory+formatted.filename)); + partitioning_->Parse(formatted.directory + formatted.filename)); ASSERT_OK_AND_ASSIGN(roundtripped, roundtripped.Bind(*written_schema_)); ASSERT_OK_AND_ASSIGN(auto simplified, SimplifyWithGuarantee(roundtripped, expr)); ASSERT_EQ(simplified, literal(true)); @@ -191,7 +191,7 @@ TEST_F(TestPartitioning, DirectoryPartitioning) { schema({field("alpha", int32()), field("beta", utf8())})); AssertParse("/0/hello", and_(equal(field_ref("alpha"), literal(0)), - equal(field_ref("beta"), literal("hello")))); + equal(field_ref("beta"), literal("hello")))); AssertParse("/3", equal(field_ref("alpha"), literal(3))); AssertParseError("/world/0"); // reversed order AssertParseError("/0.0/foo"); // invalid alpha @@ -199,13 +199,11 @@ TEST_F(TestPartitioning, DirectoryPartitioning) { AssertParse("", literal(true)); // no segments to parse // gotcha someday: - AssertParse("/0/dat.parquet", - and_(equal(field_ref("alpha"), literal(0)), - equal(field_ref("beta"), literal("dat.parquet")))); + AssertParse("/0/dat.parquet", and_(equal(field_ref("alpha"), literal(0)), + equal(field_ref("beta"), literal("dat.parquet")))); - AssertParse("/0/foo/ignored=2341", - and_(equal(field_ref("alpha"), literal(0)), - equal(field_ref("beta"), literal("foo")))); + AssertParse("/0/foo/ignored=2341", and_(equal(field_ref("alpha"), literal(0)), + equal(field_ref("beta"), literal("foo")))); } TEST_F(TestPartitioning, FilenamePartitioning) { @@ -213,7 +211,7 @@ TEST_F(TestPartitioning, FilenamePartitioning) { schema({field("alpha", int32()), field("beta", utf8())})); AssertParse("0_hello_", and_(equal(field_ref("alpha"), literal(0)), - equal(field_ref("beta"), literal("hello")))); + equal(field_ref("beta"), literal("hello")))); AssertParse("0_", equal(field_ref("alpha"), literal(0))); AssertParseError("world_0_"); // reversed order AssertParseError("0.0_foo_"); // invalid alpha @@ -221,7 +219,7 @@ TEST_F(TestPartitioning, FilenamePartitioning) { AssertParse("", literal(true)); // no segments to parse AssertParse("0_foo_ignored=2341", and_(equal(field_ref("alpha"), literal(0)), - equal(field_ref("beta"), literal("foo")))); + equal(field_ref("beta"), literal("foo")))); } TEST_F(TestPartitioning, DirectoryPartitioningFormat) { @@ -406,13 +404,12 @@ TEST_F(TestPartitioning, HivePartitioning) { schema({field("alpha", int32()), field("beta", float32())}), ArrayVector(), "xyz"); AssertParse("/alpha=0/beta=3.25", and_(equal(field_ref("alpha"), literal(0)), - equal(field_ref("beta"), literal(3.25f)))); + equal(field_ref("beta"), literal(3.25f)))); AssertParse("/beta=3.25/alpha=0", and_(equal(field_ref("beta"), literal(3.25f)), - equal(field_ref("alpha"), literal(0)))); + equal(field_ref("alpha"), literal(0)))); AssertParse("/alpha=0", equal(field_ref("alpha"), literal(0))); - AssertParse( - "/alpha=xyz/beta=3.25", - and_(is_null(field_ref("alpha")), equal(field_ref("beta"), literal(3.25f)))); + AssertParse("/alpha=xyz/beta=3.25", and_(is_null(field_ref("alpha")), + equal(field_ref("beta"), literal(3.25f)))); AssertParse("/beta=3.25", equal(field_ref("beta"), literal(3.25f))); AssertParse("", literal(true)); @@ -714,11 +711,10 @@ TEST_F(TestPartitioning, UrlEncodedHive) { AssertParse("/date=2021-05-04 00:00:00/time=2021-05-04 07:27:00/str=$", and_({equal(field_ref("date"), literal(date)), equal(field_ref("time"), literal(time)), is_null(field_ref("str"))})); - AssertParse( - "/date=2021-05-04 00:00:00/time=2021-05-04 07:27:00/str=%E3%81%8F%E3%81%BE", - and_({equal(field_ref("date"), literal(date)), - equal(field_ref("time"), literal(time)), - equal(field_ref("str"), literal("\xE3\x81\x8F\xE3\x81\xBE"))})); + AssertParse("/date=2021-05-04 00:00:00/time=2021-05-04 07:27:00/str=%E3%81%8F%E3%81%BE", + and_({equal(field_ref("date"), literal(date)), + equal(field_ref("time"), literal(time)), + equal(field_ref("str"), literal("\xE3\x81\x8F\xE3\x81\xBE"))})); // URL-encoded null fallback value AssertParse("/date=2021-05-04 00%3A00%3A00/time=2021-05-04 07%3A27%3A00/str=%24", and_({equal(field_ref("date"), literal(date)), @@ -727,9 +723,8 @@ TEST_F(TestPartitioning, UrlEncodedHive) { // Invalid UTF-8 EXPECT_RAISES_WITH_MESSAGE_THAT(Invalid, ::testing::HasSubstr("was not valid UTF-8"), factory_->Inspect({"/date=%AF/time=%BF/str=%CF"})); - EXPECT_RAISES_WITH_MESSAGE_THAT( - Invalid, ::testing::HasSubstr("was not valid UTF-8"), - partitioning_->Parse("/date=%AF/time=%BF/str=%CF")); + EXPECT_RAISES_WITH_MESSAGE_THAT(Invalid, ::testing::HasSubstr("was not valid UTF-8"), + partitioning_->Parse("/date=%AF/time=%BF/str=%CF")); options.segment_encoding = SegmentEncoding::None; options.schema = @@ -750,9 +745,8 @@ TEST_F(TestPartitioning, UrlEncodedHive) { // Invalid UTF-8 EXPECT_RAISES_WITH_MESSAGE_THAT(Invalid, ::testing::HasSubstr("was not valid UTF-8"), factory_->Inspect({"/date=\xAF/time=\xBF/str=\xCF"})); - EXPECT_RAISES_WITH_MESSAGE_THAT( - Invalid, ::testing::HasSubstr("was not valid UTF-8"), - partitioning_->Parse("/date=\xAF/time=\xBF/str=\xCF")); + EXPECT_RAISES_WITH_MESSAGE_THAT(Invalid, ::testing::HasSubstr("was not valid UTF-8"), + partitioning_->Parse("/date=\xAF/time=\xBF/str=\xCF")); } TEST_F(TestPartitioning, UrlEncodedHiveWithKeyEncoded) { @@ -777,18 +771,19 @@ TEST_F(TestPartitioning, UrlEncodedHiveWithKeyEncoded) { options.AsHivePartitioningOptions()); AssertParse( "/test%27%3B%20date=2021-05-04 00:00:00/test%27%3B%20time=2021-05-04 " - "07:27:00/str=$", + "07:27:00/str=$", and_({equal(field_ref("test'; date"), literal(date)), equal(field_ref("test'; time"), literal(time)), is_null(field_ref("str"))})); - AssertParse("/test%27%3B%20date=2021-05-04 00:00:00/test%27%3B%20time=2021-05-04 " - "07:27:00/str=%E3%81%8F%E3%81%BE", - and_({equal(field_ref("test'; date"), literal(date)), - equal(field_ref("test'; time"), literal(time)), - equal(field_ref("str"), literal("\xE3\x81\x8F\xE3\x81\xBE"))})); + AssertParse( + "/test%27%3B%20date=2021-05-04 00:00:00/test%27%3B%20time=2021-05-04 " + "07:27:00/str=%E3%81%8F%E3%81%BE", + and_({equal(field_ref("test'; date"), literal(date)), + equal(field_ref("test'; time"), literal(time)), + equal(field_ref("str"), literal("\xE3\x81\x8F\xE3\x81\xBE"))})); // URL-encoded null fallback value AssertParse( "/test%27%3B%20date=2021-05-04 00%3A00%3A00/test%27%3B%20time=2021-05-04 " - "07%3A27%3A00/str=%24", + "07%3A27%3A00/str=%24", and_({equal(field_ref("test'; date"), literal(date)), equal(field_ref("test'; time"), literal(time)), is_null(field_ref("str"))})); @@ -828,8 +823,7 @@ TEST_F(TestPartitioning, EtlThenHive) { auto alphabeta_segments_end = etl_segments_end + alphabeta_fields.size(); auto alphabeta_path = fs::internal::JoinAbstractPath(etl_segments_end, alphabeta_segments_end); - ARROW_ASSIGN_OR_RAISE(auto alphabeta_expr, - alphabeta_part.Parse(alphabeta_path)); + ARROW_ASSIGN_OR_RAISE(auto alphabeta_expr, alphabeta_part.Parse(alphabeta_path)); return and_(etl_expr, alphabeta_expr); }); @@ -977,7 +971,7 @@ TEST(TestStripPrefixAndFilename, Basic) { "/data/year=2019/month=12/day=01/file.parquet"}; auto paths = StripPrefixAndFilename(input, "/data"); EXPECT_THAT(paths, testing::ElementsAre("year=2019", "year=2019/month=12", - "year=2019/month=12/day=01")); + "year=2019/month=12/day=01")); } } // namespace dataset From 2ae2ea3a60da519d466ff77dbfcf3186a4c2c806 Mon Sep 17 00:00:00 2001 From: Sanjiban Sengupta Date: Tue, 24 May 2022 04:58:59 +0530 Subject: [PATCH 23/24] fix: modify tests to have correct parse strings --- cpp/src/arrow/dataset/partition_test.cc | 87 +++++++++++++------------ 1 file changed, 44 insertions(+), 43 deletions(-) diff --git a/cpp/src/arrow/dataset/partition_test.cc b/cpp/src/arrow/dataset/partition_test.cc index b023b1d5bc5..86b8c4f0b9d 100644 --- a/cpp/src/arrow/dataset/partition_test.cc +++ b/cpp/src/arrow/dataset/partition_test.cc @@ -190,17 +190,17 @@ TEST_F(TestPartitioning, DirectoryPartitioning) { partitioning_ = std::make_shared( schema({field("alpha", int32()), field("beta", utf8())})); - AssertParse("/0/hello", and_(equal(field_ref("alpha"), literal(0)), - equal(field_ref("beta"), literal("hello")))); - AssertParse("/3", equal(field_ref("alpha"), literal(3))); - AssertParseError("/world/0"); // reversed order - AssertParseError("/0.0/foo"); // invalid alpha - AssertParseError("/3.25"); // invalid alpha with missing beta + AssertParse("/0/hello/", and_(equal(field_ref("alpha"), literal(0)), + equal(field_ref("beta"), literal("hello")))); + AssertParse("/3/", equal(field_ref("alpha"), literal(3))); + AssertParseError("/world/0/"); // reversed order + AssertParseError("/0.0/foo/"); // invalid alpha + AssertParseError("/3.25/"); // invalid alpha with missing beta AssertParse("", literal(true)); // no segments to parse // gotcha someday: - AssertParse("/0/dat.parquet", and_(equal(field_ref("alpha"), literal(0)), - equal(field_ref("beta"), literal("dat.parquet")))); + AssertParse("/0/dat.parquet/", and_(equal(field_ref("alpha"), literal(0)), + equal(field_ref("beta"), literal("dat.parquet")))); AssertParse("/0/foo/ignored=2341", and_(equal(field_ref("alpha"), literal(0)), equal(field_ref("beta"), literal("foo")))); @@ -285,7 +285,7 @@ TEST_F(TestPartitioning, DirectoryPartitioningWithTemporal) { schema({field("year", int32()), field("month", int8()), field("day", temporal)})); ASSERT_OK_AND_ASSIGN(auto day, StringScalar("2020-06-08").CastTo(temporal)); - AssertParse("/2020/06/2020-06-08", + AssertParse("/2020/06/2020-06-08/", and_({equal(field_ref("year"), literal(2020)), equal(field_ref("month"), literal(6)), equal(field_ref("day"), literal(day))})); @@ -386,11 +386,11 @@ TEST_F(TestPartitioning, DictionaryHasUniqueValues) { auto dictionary_scalar = std::make_shared(index_and_dictionary, alpha->type()); - auto path = "/" + expected_dictionary->GetString(i); + auto path = "/" + expected_dictionary->GetString(i) + "/"; AssertParse(path, equal(field_ref("alpha"), literal(dictionary_scalar))); } - AssertParseError("/yosemite"); // not in inspected dictionary + AssertParseError("/yosemite/"); // not in inspected dictionary } TEST_F(TestPartitioning, DiscoverSchemaSegfault) { @@ -403,27 +403,27 @@ TEST_F(TestPartitioning, HivePartitioning) { partitioning_ = std::make_shared( schema({field("alpha", int32()), field("beta", float32())}), ArrayVector(), "xyz"); - AssertParse("/alpha=0/beta=3.25", and_(equal(field_ref("alpha"), literal(0)), - equal(field_ref("beta"), literal(3.25f)))); - AssertParse("/beta=3.25/alpha=0", and_(equal(field_ref("beta"), literal(3.25f)), - equal(field_ref("alpha"), literal(0)))); - AssertParse("/alpha=0", equal(field_ref("alpha"), literal(0))); - AssertParse("/alpha=xyz/beta=3.25", and_(is_null(field_ref("alpha")), - equal(field_ref("beta"), literal(3.25f)))); - AssertParse("/beta=3.25", equal(field_ref("beta"), literal(3.25f))); + AssertParse("/alpha=0/beta=3.25/", and_(equal(field_ref("alpha"), literal(0)), + equal(field_ref("beta"), literal(3.25f)))); + AssertParse("/beta=3.25/alpha=0/", and_(equal(field_ref("beta"), literal(3.25f)), + equal(field_ref("alpha"), literal(0)))); + AssertParse("/alpha=0/", equal(field_ref("alpha"), literal(0))); + AssertParse("/alpha=xyz/beta=3.25/", and_(is_null(field_ref("alpha")), + equal(field_ref("beta"), literal(3.25f)))); + AssertParse("/beta=3.25/", equal(field_ref("beta"), literal(3.25f))); AssertParse("", literal(true)); - AssertParse("/alpha=0/beta=3.25/ignored=2341", + AssertParse("/alpha=0/beta=3.25/ignored=2341/", and_(equal(field_ref("alpha"), literal(0)), equal(field_ref("beta"), literal(3.25f)))); - AssertParse("/alpha=0/beta=3.25/ignored=2341", + AssertParse("/alpha=0/beta=3.25/ignored=2341/", and_(equal(field_ref("alpha"), literal(0)), equal(field_ref("beta"), literal(3.25f)))); - AssertParse("/ignored=2341", literal(true)); + AssertParse("/ignored=2341/", literal(true)); - AssertParseError("/alpha=0.0/beta=3.25"); // conversion of "0.0" to int32 fails + AssertParseError("/alpha=0.0/beta=3.25/"); // conversion of "0.0" to int32 fails } TEST_F(TestPartitioning, HivePartitioningFormat) { @@ -560,11 +560,11 @@ TEST_F(TestPartitioning, HiveDictionaryHasUniqueValues) { auto dictionary_scalar = std::make_shared(index_and_dictionary, alpha->type()); - auto path = "/alpha=" + expected_dictionary->GetString(i); + auto path = "/alpha=" + expected_dictionary->GetString(i) + "/"; AssertParse(path, equal(field_ref("alpha"), literal(dictionary_scalar))); } - AssertParseError("/alpha=yosemite"); // not in inspected dictionary + AssertParseError("/alpha=yosemite/"); // not in inspected dictionary } TEST_F(TestPartitioning, ExistingSchemaDirectory) { @@ -665,7 +665,7 @@ TEST_F(TestPartitioning, UrlEncodedDirectory) { auto date = std::make_shared(1620086400, ts); auto time = std::make_shared(1620113220, ts); partitioning_ = std::make_shared(options.schema, ArrayVector()); - AssertParse("/2021-05-04 00%3A00%3A00/2021-05-04 07%3A27%3A00/%24", + AssertParse("/2021-05-04 00%3A00%3A00/2021-05-04 07%3A27%3A00/%24/", and_({equal(field_ref("date"), literal(date)), equal(field_ref("time"), literal(time)), equal(field_ref("str"), literal("$"))})); @@ -685,7 +685,7 @@ TEST_F(TestPartitioning, UrlEncodedDirectory) { options.schema->fields()); partitioning_ = std::make_shared( options.schema, ArrayVector(), options.AsPartitioningOptions()); - AssertParse("/2021-05-04 00%3A00%3A00/2021-05-04 07%3A27%3A00/%24", + AssertParse("/2021-05-04 00%3A00%3A00/2021-05-04 07%3A27%3A00/%24/", and_({equal(field_ref("date"), literal("2021-05-04 00%3A00%3A00")), equal(field_ref("time"), literal("2021-05-04 07%3A27%3A00")), equal(field_ref("str"), literal("%24"))})); @@ -708,15 +708,16 @@ TEST_F(TestPartitioning, UrlEncodedHive) { auto time = std::make_shared(1620113220, ts); partitioning_ = std::make_shared(options.schema, ArrayVector(), options.AsHivePartitioningOptions()); - AssertParse("/date=2021-05-04 00:00:00/time=2021-05-04 07:27:00/str=$", + AssertParse("/date=2021-05-04 00:00:00/time=2021-05-04 07:27:00/str=$/", and_({equal(field_ref("date"), literal(date)), equal(field_ref("time"), literal(time)), is_null(field_ref("str"))})); - AssertParse("/date=2021-05-04 00:00:00/time=2021-05-04 07:27:00/str=%E3%81%8F%E3%81%BE", - and_({equal(field_ref("date"), literal(date)), - equal(field_ref("time"), literal(time)), - equal(field_ref("str"), literal("\xE3\x81\x8F\xE3\x81\xBE"))})); + AssertParse( + "/date=2021-05-04 00:00:00/time=2021-05-04 07:27:00/str=%E3%81%8F%E3%81%BE/", + and_({equal(field_ref("date"), literal(date)), + equal(field_ref("time"), literal(time)), + equal(field_ref("str"), literal("\xE3\x81\x8F\xE3\x81\xBE"))})); // URL-encoded null fallback value - AssertParse("/date=2021-05-04 00%3A00%3A00/time=2021-05-04 07%3A27%3A00/str=%24", + AssertParse("/date=2021-05-04 00%3A00%3A00/time=2021-05-04 07%3A27%3A00/str=%24/", and_({equal(field_ref("date"), literal(date)), equal(field_ref("time"), literal(time)), is_null(field_ref("str"))})); @@ -737,7 +738,7 @@ TEST_F(TestPartitioning, UrlEncodedHive) { options.schema->fields()); partitioning_ = std::make_shared(options.schema, ArrayVector(), options.AsHivePartitioningOptions()); - AssertParse("/date=2021-05-04 00%3A00%3A00/time=2021-05-04 07%3A27%3A00/str=%24", + AssertParse("/date=2021-05-04 00%3A00%3A00/time=2021-05-04 07%3A27%3A00/str=%24/", and_({equal(field_ref("date"), literal("2021-05-04 00%3A00%3A00")), equal(field_ref("time"), literal("2021-05-04 07%3A27%3A00")), equal(field_ref("str"), literal("%24"))})); @@ -771,19 +772,19 @@ TEST_F(TestPartitioning, UrlEncodedHiveWithKeyEncoded) { options.AsHivePartitioningOptions()); AssertParse( "/test%27%3B%20date=2021-05-04 00:00:00/test%27%3B%20time=2021-05-04 " - "07:27:00/str=$", + "07:27:00/str=$/", and_({equal(field_ref("test'; date"), literal(date)), equal(field_ref("test'; time"), literal(time)), is_null(field_ref("str"))})); AssertParse( "/test%27%3B%20date=2021-05-04 00:00:00/test%27%3B%20time=2021-05-04 " - "07:27:00/str=%E3%81%8F%E3%81%BE", + "07:27:00/str=%E3%81%8F%E3%81%BE/", and_({equal(field_ref("test'; date"), literal(date)), equal(field_ref("test'; time"), literal(time)), equal(field_ref("str"), literal("\xE3\x81\x8F\xE3\x81\xBE"))})); // URL-encoded null fallback value AssertParse( "/test%27%3B%20date=2021-05-04 00%3A00%3A00/test%27%3B%20time=2021-05-04 " - "07%3A27%3A00/str=%24", + "07%3A27%3A00/str=%24/", and_({equal(field_ref("test'; date"), literal(date)), equal(field_ref("test'; time"), literal(time)), is_null(field_ref("str"))})); @@ -793,7 +794,7 @@ TEST_F(TestPartitioning, UrlEncodedHiveWithKeyEncoded) { factory_->Inspect({"/%AF=2021-05-04/time=2021-05-04 07%3A27%3A00/str=%24"})); EXPECT_RAISES_WITH_MESSAGE_THAT( Invalid, ::testing::HasSubstr("was not valid UTF-8"), - partitioning_->Parse("/%AF=2021-05-04/%BF=2021-05-04 07%3A27%3A00/str=%24")); + partitioning_->Parse("/%AF=2021-05-04/%BF=2021-05-04 07%3A27%3A00/str=%24/")); } TEST_F(TestPartitioning, EtlThenHive) { @@ -818,17 +819,17 @@ TEST_F(TestPartitioning, EtlThenHive) { auto etl_segments_end = segments.begin() + etl_fields.size(); auto etl_path = fs::internal::JoinAbstractPath(segments.begin(), etl_segments_end); - ARROW_ASSIGN_OR_RAISE(auto etl_expr, etl_part.Parse(etl_path)); + ARROW_ASSIGN_OR_RAISE(auto etl_expr, etl_part.Parse(etl_path + "/")); auto alphabeta_segments_end = etl_segments_end + alphabeta_fields.size(); auto alphabeta_path = fs::internal::JoinAbstractPath(etl_segments_end, alphabeta_segments_end); - ARROW_ASSIGN_OR_RAISE(auto alphabeta_expr, alphabeta_part.Parse(alphabeta_path)); - + ARROW_ASSIGN_OR_RAISE(auto alphabeta_expr, + alphabeta_part.Parse(alphabeta_path + "/")); return and_(etl_expr, alphabeta_expr); }); - AssertParse("/1999/12/31/00/alpha=0/beta=3.25", + AssertParse("/1999/12/31/00/alpha=0/beta=3.25/", and_({equal(field_ref("year"), literal(1999)), equal(field_ref("month"), literal(12)), equal(field_ref("day"), literal(31)), @@ -836,7 +837,7 @@ TEST_F(TestPartitioning, EtlThenHive) { and_(equal(field_ref("alpha"), literal(0)), equal(field_ref("beta"), literal(3.25f)))})); - AssertParseError("/20X6/03/21/05/alpha=0/beta=3.25"); + AssertParseError("/20X6/03/21/05/alpha=0/beta=3.25/"); } TEST_F(TestPartitioning, Set) { From 718c924f1d6965e89806c9369b3f41ffff15c24a Mon Sep 17 00:00:00 2001 From: Sanjiban Sengupta Date: Thu, 26 May 2022 13:36:35 +0530 Subject: [PATCH 24/24] fix: processing segments in ParseKeys method of FunctionPartitioning --- cpp/src/arrow/dataset/partition.cc | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/cpp/src/arrow/dataset/partition.cc b/cpp/src/arrow/dataset/partition.cc index d37dfd97d39..6b4d601db01 100644 --- a/cpp/src/arrow/dataset/partition.cc +++ b/cpp/src/arrow/dataset/partition.cc @@ -391,8 +391,9 @@ FilenamePartitioning::FilenamePartitioning(std::shared_ptr schema, Result> FilenamePartitioning::ParseKeys( const std::string& path) const { - std::vector segments = - fs::internal::SplitAbstractPath(StripNonPrefix(path), kFilenamePartitionSep); + std::vector segments = fs::internal::SplitAbstractPath( + StripNonPrefix(fs::internal::GetAbstractPathParent(path).second), + kFilenamePartitionSep); return ParsePartitionSegments(segments); }