From a4202a931ca5c914e82ca2cc65bb1971a0730798 Mon Sep 17 00:00:00 2001 From: Weston Pace Date: Thu, 21 Jan 2021 11:45:32 -1000 Subject: [PATCH 01/33] Merge/rebase --- cpp/src/arrow/dataset/partition_test.cc | 73 ++++++++++++++++++++----- 1 file changed, 60 insertions(+), 13 deletions(-) diff --git a/cpp/src/arrow/dataset/partition_test.cc b/cpp/src/arrow/dataset/partition_test.cc index 286848d9ae9..a412e9f7b8f 100644 --- a/cpp/src/arrow/dataset/partition_test.cc +++ b/cpp/src/arrow/dataset/partition_test.cc @@ -27,6 +27,7 @@ #include #include "arrow/compute/api_scalar.h" +#include "arrow/compute/api_vector.h" #include "arrow/dataset/scanner_internal.h" #include "arrow/dataset/test_util.h" #include "arrow/filesystem/path_util.h" @@ -77,6 +78,31 @@ class TestPartitioning : public ::testing::Test { ASSERT_OK_AND_ASSIGN(partitioning_, factory_->Finish(actual)); } + void AssertPartition(const std::shared_ptr partitioning, + const std::shared_ptr full_batch, + const std::vector>& expected_partition_indices) { + ASSERT_OK_AND_ASSIGN(auto partition_results, partitioning->Partition(full_batch)); + ASSERT_EQ(partition_results.batches.size(), expected_partition_indices.size()); + auto max_index = + std::min(partition_results.batches.size(), expected_partition_indices.size()); + for (int partition_index = 0; partition_index < max_index; partition_index++) { + std::shared_ptr actual = partition_results.batches[partition_index]; + std::shared_ptr indices_arr; + ChunkedArrayFromVector({expected_partition_indices[partition_index]}, + &indices_arr); + auto expected = compute::Take(full_batch, indices_arr); + ASSERT_EQ(expected, actual); + } + } + + void AssertPartition(const std::shared_ptr partitioning, + const std::shared_ptr schema, + const std::string& record_batch_json, + const std::vector>& expected_partition_indices) { + auto record_batch = RecordBatchFromJSON(schema, record_batch_json); + AssertPartition(partitioning, record_batch, expected_partition_indices); + } + void AssertInspectError(const std::vector& paths) { ASSERT_RAISES(Invalid, factory_->Inspect(paths)); } @@ -103,6 +129,21 @@ class TestPartitioning : public ::testing::Test { std::shared_ptr written_schema_; }; +TEST_F(TestPartitioning, Basic) { + auto schema_ = schema({field("a", int32()), field("b", utf8())}); + auto partitioning = std::make_shared(schema_); + std::string json = R"([{"a": 3, "b": "x"}, + {"a": 3, "b": "x"}, + {"a": 1, "b": null}, + {"a": null, "b": null}, + {"a": null, "b": "z"}, + {"a": null, "b": null} + ])"; + AssertPartition(partitioning, schema_, json, {{0, 1}, {2}, {3, 5}, {4}}); +} + +TEST_F(TestPartitioning, StructDictionaryNull) {} + TEST_F(TestPartitioning, DirectoryPartitioning) { partitioning_ = std::make_shared( schema({field("alpha", int32()), field("beta", utf8())})); @@ -600,20 +641,26 @@ TEST(GroupTest, Basics) { } TEST(GroupTest, WithNulls) { - auto has_nulls = checked_pointer_cast( - ArrayFromJSON(struct_({field("a", utf8()), field("b", int32())}), R"([ - {"a": "ex", "b": 0}, - {"a": null, "b": 0}, - {"a": "why", "b": 0}, - {"a": "ex", "b": 1}, - {"a": "why", "b": 0}, - {"a": "ex", "b": 1}, - {"a": "ex", "b": 0}, - {"a": "why", "b": null} - ])")); - ASSERT_RAISES(NotImplemented, MakeGroupings(*has_nulls)); + AssertGrouping({field("a", utf8()), field("b", int32())}, + R"([ + {"a": "ex", "b": 0}, + {"a": null, "b": 0}, + {"a": null, "b": 0}, + {"a": "ex", "b": 1}, + {"a": null, "b": null}, + {"a": "ex", "b": 1}, + {"a": "ex", "b": 0}, + {"a": "why", "b": null} + ])", + R"([ + {"a": "ex", "b": 0, 
"ids": [0, 6]}, + {"a": null, "b": 0, "ids": [1, 2]}, + {"a": "ex", "b": 1, "ids": [3, 5]}, + {"a": null, "b": null, "ids": [4]}, + {"a": "why", "b": null, "ids": [7]} + ])"); - has_nulls = checked_pointer_cast( + auto has_nulls = checked_pointer_cast( ArrayFromJSON(struct_({field("a", utf8()), field("b", int32())}), R"([ {"a": "ex", "b": 0}, null, From 53853b621cae0c5acb7442d66f2e5891a249b493 Mon Sep 17 00:00:00 2001 From: Weston Pace Date: Mon, 25 Jan 2021 08:15:00 -1000 Subject: [PATCH 02/33] WIP commit --- cpp/src/arrow/compute/api_vector.cc | 5 ++- cpp/src/arrow/compute/api_vector.h | 20 ++++++++- cpp/src/arrow/compute/kernels/vector_hash.cc | 8 +++- cpp/src/arrow/pretty_print.cc | 45 ++++++++++++++++++++ cpp/src/arrow/pretty_print.h | 13 ++++++ 5 files changed, 86 insertions(+), 5 deletions(-) diff --git a/cpp/src/arrow/compute/api_vector.cc b/cpp/src/arrow/compute/api_vector.cc index f5ab46ac603..0082d48112d 100644 --- a/cpp/src/arrow/compute/api_vector.cc +++ b/cpp/src/arrow/compute/api_vector.cc @@ -74,8 +74,9 @@ Result> Unique(const Datum& value, ExecContext* ctx) { return result.make_array(); } -Result DictionaryEncode(const Datum& value, ExecContext* ctx) { - return CallFunction("dictionary_encode", {value}, ctx); +Result DictionaryEncode(const Datum& value, const DictionaryEncodeOptions& options, + ExecContext* ctx) { + return CallFunction("dictionary_encode", {value}, &options, ctx); } const char kValuesFieldName[] = "values"; diff --git a/cpp/src/arrow/compute/api_vector.h b/cpp/src/arrow/compute/api_vector.h index 9e9cad9e5d9..9dcf4df2894 100644 --- a/cpp/src/arrow/compute/api_vector.h +++ b/cpp/src/arrow/compute/api_vector.h @@ -63,6 +63,23 @@ enum class SortOrder { Descending, }; +struct DictionaryEncodeOptions : public FunctionOptions { + /// Configure how null values will be encoded + enum NullEncodingBehavior { + /// the null value will be added to the dictionary with a proper index + ENCODE, + /// the null value will be masked in the indices array + MASK, + }; + + explicit DictionaryEncodeOptions(NullEncodingBehavior null_encoding = MASK) + : null_encoding_behavior(null_encoding) {} + + static DictionaryEncodeOptions Defaults() { return DictionaryEncodeOptions(); } + + NullEncodingBehavior null_encoding_behavior = MASK; +}; + /// \brief One sort key for PartitionNthIndices (TODO) and SortIndices struct ARROW_EXPORT SortKey { explicit SortKey(std::string name, SortOrder order = SortOrder::Ascending) @@ -296,7 +313,8 @@ Result> ValueCounts(const Datum& value, /// \since 1.0.0 /// \note API not yet finalized ARROW_EXPORT -Result DictionaryEncode(const Datum& data, ExecContext* ctx = NULLPTR); +Result DictionaryEncode(const Datum& data, const DictionaryEncodeOptions& options, + ExecContext* ctx = NULLPTR); // ---------------------------------------------------------------------- // Deprecated functions diff --git a/cpp/src/arrow/compute/kernels/vector_hash.cc b/cpp/src/arrow/compute/kernels/vector_hash.cc index 34d18c24a0c..0a4a1aff5f9 100644 --- a/cpp/src/arrow/compute/kernels/vector_hash.cc +++ b/cpp/src/arrow/compute/kernels/vector_hash.cc @@ -641,7 +641,8 @@ const FunctionDoc value_counts_doc( const FunctionDoc dictionary_encode_doc( "Dictionary-encode array", - ("Return a dictionary-encoded version of the input array."), {"array"}); + ("Return a dictionary-encoded version of the input array."), {"array"}, + "DictionaryEncodeOptions"); } // namespace @@ -687,11 +688,14 @@ void RegisterVectorHash(FunctionRegistry* registry) { // 
---------------------------------------------------------------------- // dictionary_encode + const auto kDefaultDictionaryEncodeOptions = DictionaryEncodeOptions::Defaults(); + base.finalize = DictEncodeFinalize; // Unique and ValueCounts output unchunked arrays base.output_chunked = true; auto dict_encode = std::make_shared("dictionary_encode", Arity::Unary(), - &dictionary_encode_doc); + &dictionary_encode_doc, + &kDefaultDictionaryEncodeOptions); AddHashKernels(dict_encode.get(), base, OutputType(DictEncodeOutput)); // Calling dictionary_encode on dictionary input not supported, but if it diff --git a/cpp/src/arrow/pretty_print.cc b/cpp/src/arrow/pretty_print.cc index 8c2ac376d1e..d61e6cde2b6 100644 --- a/cpp/src/arrow/pretty_print.cc +++ b/cpp/src/arrow/pretty_print.cc @@ -670,4 +670,49 @@ Status PrettyPrint(const Schema& schema, const PrettyPrintOptions& options, return Status::OK(); } +void GdbPrintArray(const Array& arr, int indent) { + PrettyPrintOptions options; + options.indent = indent; + auto print_st = PrettyPrint(arr, options, &std::cout); + if (!print_st.ok()) { + std::cout << "Could not print: " << print_st.message(); + } +} + +void GdbPrintRecordBatch(const RecordBatch& rb, int indent) { + PrettyPrintOptions options; + options.indent = indent; + auto print_st = PrettyPrint(rb, options, &std::cout); + if (!print_st.ok()) { + std::cout << "Could not print: " << print_st.message(); + } +} + +void GdbPrintTable(const Table& table, int indent) { + PrettyPrintOptions options; + options.indent = indent; + auto print_st = PrettyPrint(table, options, &std::cout); + if (!print_st.ok()) { + std::cout << "Could not print: " << print_st.message(); + } +} + +void GdbPrintChunkedArray(const ChunkedArray& chunked_arr, int indent) { + PrettyPrintOptions options; + options.indent = indent; + auto print_st = PrettyPrint(chunked_arr, options, &std::cout); + if (!print_st.ok()) { + std::cout << "Could not print: " << print_st.message(); + } +} + +void GdbPrintSchema(const Schema& schema, int indent) { + PrettyPrintOptions options; + options.indent = indent; + auto print_st = PrettyPrint(schema, options, &std::cout); + if (!print_st.ok()) { + std::cout << "Could not print: " << print_st.message(); + } +} + } // namespace arrow diff --git a/cpp/src/arrow/pretty_print.h b/cpp/src/arrow/pretty_print.h index 9d2c72c7186..43948b8f149 100644 --- a/cpp/src/arrow/pretty_print.h +++ b/cpp/src/arrow/pretty_print.h @@ -120,4 +120,17 @@ Status PrettyPrint(const Schema& schema, const PrettyPrintOptions& options, ARROW_EXPORT Status DebugPrint(const Array& arr, int indent); +// These print routines are used in the gdb pretty printers which are +// not capable of passing "out" params and do a poor job of overload resolution + +ARROW_EXPORT void GdbPrintArray(const Array& arr, int indent); + +ARROW_EXPORT void GdbPrintRecordBatch(const RecordBatch& rb, int indent); + +ARROW_EXPORT void GdbPrintTable(const Table& table, int indent); + +ARROW_EXPORT void GdbPrintChunkedArray(const ChunkedArray& chunked_arr, int indent); + +ARROW_EXPORT void GdbPrintSchema(const Schema& schema, int indent); + } // namespace arrow From 570de2c963b26ff21494409921be884229028eb5 Mon Sep 17 00:00:00 2001 From: Weston Pace Date: Mon, 25 Jan 2021 17:24:30 -1000 Subject: [PATCH 03/33] Added tests of vector_hash for inputs with nulls. 
Added ability to specify encoded nulls when encoding a dictionary --- cpp/src/arrow/compute/api_vector.h | 4 +- .../arrow/compute/kernels/scalar_cast_test.cc | 3 +- cpp/src/arrow/compute/kernels/vector_hash.cc | 67 +++++++++++--- .../compute/kernels/vector_hash_benchmark.cc | 6 +- .../arrow/compute/kernels/vector_hash_test.cc | 88 +++++++++++++++++-- cpp/src/arrow/dataset/partition.cc | 14 +-- cpp/src/arrow/dataset/partition_test.cc | 66 ++++++++------ cpp/src/arrow/python/arrow_to_pandas.cc | 9 +- .../parquet/arrow/arrow_reader_writer_test.cc | 4 +- 9 files changed, 199 insertions(+), 62 deletions(-) diff --git a/cpp/src/arrow/compute/api_vector.h b/cpp/src/arrow/compute/api_vector.h index 9dcf4df2894..6a334dffda1 100644 --- a/cpp/src/arrow/compute/api_vector.h +++ b/cpp/src/arrow/compute/api_vector.h @@ -70,9 +70,11 @@ struct DictionaryEncodeOptions : public FunctionOptions { ENCODE, /// the null value will be masked in the indices array MASK, + /// the null value will not be included in the dictionary + SKIP }; - explicit DictionaryEncodeOptions(NullEncodingBehavior null_encoding = MASK) + explicit DictionaryEncodeOptions(NullEncodingBehavior null_encoding = SKIP) : null_encoding_behavior(null_encoding) {} static DictionaryEncodeOptions Defaults() { return DictionaryEncodeOptions(); } diff --git a/cpp/src/arrow/compute/kernels/scalar_cast_test.cc b/cpp/src/arrow/compute/kernels/scalar_cast_test.cc index 99a56346c1b..d84aefa5b19 100644 --- a/cpp/src/arrow/compute/kernels/scalar_cast_test.cc +++ b/cpp/src/arrow/compute/kernels/scalar_cast_test.cc @@ -1472,7 +1472,8 @@ TEST(Cast, FromDictionary) { ASSERT_OK_AND_ASSIGN(auto no_nulls, Take(*dict, *indices)); ASSERT_EQ(no_nulls->null_count(), 0); - ASSERT_OK_AND_ASSIGN(Datum encoded, DictionaryEncode(no_nulls)); + ASSERT_OK_AND_ASSIGN(Datum encoded, + DictionaryEncode(no_nulls, DictionaryEncodeOptions::Defaults())); // Make a new dict array with nullptr bitmap buffer auto data = encoded.array()->Copy(); diff --git a/cpp/src/arrow/compute/kernels/vector_hash.cc b/cpp/src/arrow/compute/kernels/vector_hash.cc index 0a4a1aff5f9..3ea8f905745 100644 --- a/cpp/src/arrow/compute/kernels/vector_hash.cc +++ b/cpp/src/arrow/compute/kernels/vector_hash.cc @@ -173,12 +173,16 @@ class DictEncodeAction final : public ActionBase { template void ObserveNullFound(Index index) { - indices_builder_.UnsafeAppendNull(); + if (index < 0) { + indices_builder_.UnsafeAppendNull(); + } else { + indices_builder_.UnsafeAppend(index); + } } template void ObserveNullNotFound(Index index) { - indices_builder_.UnsafeAppendNull(); + ObserveNullFound(index); } template @@ -206,6 +210,9 @@ class DictEncodeAction final : public ActionBase { class HashKernel : public KernelState { public: + HashKernel() : options_(DictionaryEncodeOptions::Defaults()) {} + explicit HashKernel(const DictionaryEncodeOptions& options) : options_(options) {} + // Reset for another run. 
virtual Status Reset() = 0; @@ -229,6 +236,7 @@ class HashKernel : public KernelState { virtual Status Append(const ArrayData& arr) = 0; protected: + DictionaryEncodeOptions options_; std::mutex lock_; }; @@ -241,8 +249,9 @@ template class RegularHashKernel : public HashKernel { public: - RegularHashKernel(const std::shared_ptr& type, MemoryPool* pool) - : pool_(pool), type_(type), action_(type, pool) {} + RegularHashKernel(const std::shared_ptr& type, + const DictionaryEncodeOptions& options, MemoryPool* pool) + : HashKernel(options), pool_(pool), type_(type), action_(type, pool) {} Status Reset() override { memo_table_.reset(new MemoTable(pool_, 0)); @@ -282,7 +291,9 @@ class RegularHashKernel : public HashKernel { &unused_memo_index); }, [this]() { - if (with_memo_visit_null) { + if (with_memo_visit_null || + options_.null_encoding_behavior == + DictionaryEncodeOptions::NullEncodingBehavior::ENCODE) { auto on_found = [this](int32_t memo_index) { action_.ObserveNullFound(memo_index); }; @@ -345,18 +356,23 @@ class RegularHashKernel : public HashKernel { // ---------------------------------------------------------------------- // Hash kernel implementation for nulls -template +template class NullHashKernel : public HashKernel { public: - NullHashKernel(const std::shared_ptr& type, MemoryPool* pool) + NullHashKernel(const std::shared_ptr& type, + const DictionaryEncodeOptions& options, MemoryPool* pool) : pool_(pool), type_(type), action_(type, pool) {} Status Reset() override { return action_.Reset(); } - Status Append(const ArrayData& arr) override { + Status Append(const ArrayData& arr) override { return DoAppend(arr); } + + template + enable_if_t DoAppend(const ArrayData& arr) { RETURN_NOT_OK(action_.Reserve(arr.length)); for (int64_t i = 0; i < arr.length; ++i) { if (i == 0) { + seen_null_ = true; action_.ObserveNullNotFound(0); } else { action_.ObserveNullFound(0); @@ -365,12 +381,31 @@ class NullHashKernel : public HashKernel { return Status::OK(); } + template + enable_if_t DoAppend(const ArrayData& arr) { + Status s = Status::OK(); + RETURN_NOT_OK(action_.Reserve(arr.length)); + for (int64_t i = 0; i < arr.length; ++i) { + if (seen_null_ == false && i == 0) { + seen_null_ = true; + action_.ObserveNullNotFound(0, &s); + } else { + action_.ObserveNullFound(0); + } + } + return s; + } + Status Flush(Datum* out) override { return action_.Flush(out); } Status FlushFinal(Datum* out) override { return action_.FlushFinal(out); } Status GetDictionary(std::shared_ptr* out) override { - // TODO(wesm): handle null being a valid dictionary value - auto null_array = std::make_shared(0); + std::shared_ptr null_array; + if (seen_null_) { + null_array = std::make_shared(1); + } else { + null_array = std::make_shared(0); + } *out = null_array->data(); return Status::OK(); } @@ -380,6 +415,7 @@ class NullHashKernel : public HashKernel { protected: MemoryPool* pool_; std::shared_ptr type_; + bool seen_null_ = false; Action action_; }; @@ -451,8 +487,12 @@ struct HashKernelTraits> { template std::unique_ptr HashInitImpl(KernelContext* ctx, const KernelInitArgs& args) { using HashKernelType = typename HashKernelTraits::HashKernel; - auto result = ::arrow::internal::make_unique(args.inputs[0].type, - ctx->memory_pool()); + DictionaryEncodeOptions options; + if (auto options_ptr = static_cast(args.options)) { + options = *options_ptr; + } + auto result = ::arrow::internal::make_unique( + args.inputs[0].type, options, ctx->memory_pool()); ctx->SetStatus(result->Reset()); return std::move(result); } @@ 
-507,6 +547,8 @@ KernelInit GetHashInit(Type::type type_id) { } } +using DictionaryEncodeState = OptionsWrapper; + template std::unique_ptr DictionaryHashInit(KernelContext* ctx, const KernelInitArgs& args) { @@ -529,6 +571,7 @@ std::unique_ptr DictionaryHashInit(KernelContext* ctx, DCHECK(false) << "Unsupported dictionary index type"; break; } + DictionaryEncodeOptions options = DictionaryEncodeOptions::Defaults(); return ::arrow::internal::make_unique(std::move(indices_hasher)); } diff --git a/cpp/src/arrow/compute/kernels/vector_hash_benchmark.cc b/cpp/src/arrow/compute/kernels/vector_hash_benchmark.cc index 3be549d05ce..d6b203181eb 100644 --- a/cpp/src/arrow/compute/kernels/vector_hash_benchmark.cc +++ b/cpp/src/arrow/compute/kernels/vector_hash_benchmark.cc @@ -46,7 +46,7 @@ static void BuildDictionary(benchmark::State& state) { // NOLINT non-const refe ArrayFromVector(is_valid, values, &arr); while (state.KeepRunning()) { - ABORT_NOT_OK(DictionaryEncode(arr).status()); + ABORT_NOT_OK(DictionaryEncode(arr, DictionaryEncodeOptions::Defaults()).status()); } state.counters["null_percent"] = static_cast(arr->null_count()) / arr->length() * 100; @@ -73,7 +73,7 @@ static void BuildStringDictionary( ArrayFromVector(data, &arr); while (state.KeepRunning()) { - ABORT_NOT_OK(DictionaryEncode(arr).status()); + ABORT_NOT_OK(DictionaryEncode(arr, DictionaryEncodeOptions::Defaults()).status()); } state.SetBytesProcessed(state.iterations() * total_bytes); state.SetItemsProcessed(state.iterations() * data.size()); @@ -169,7 +169,7 @@ void BenchDictionaryEncode(benchmark::State& state, const ParamType& params) { std::shared_ptr arr; params.GenerateTestData(&arr); while (state.KeepRunning()) { - ABORT_NOT_OK(DictionaryEncode(arr).status()); + ABORT_NOT_OK(DictionaryEncode(arr, DictionaryEncodeOptions::Defaults()).status()); } params.SetMetadata(state); } diff --git a/cpp/src/arrow/compute/kernels/vector_hash_test.cc b/cpp/src/arrow/compute/kernels/vector_hash_test.cc index e9ae4a64d97..4dc106138d7 100644 --- a/cpp/src/arrow/compute/kernels/vector_hash_test.cc +++ b/cpp/src/arrow/compute/kernels/vector_hash_test.cc @@ -126,7 +126,8 @@ void CheckDictEncode(const std::shared_ptr& input, auto type = dictionary(expected_indices->type(), expected_values->type()); DictionaryArray expected(type, expected_indices, expected_values); - ASSERT_OK_AND_ASSIGN(Datum datum_out, DictionaryEncode(input)); + ASSERT_OK_AND_ASSIGN(Datum datum_out, + DictionaryEncode(input, DictionaryEncodeOptions::Defaults())); std::shared_ptr result = MakeArray(datum_out.array()); ASSERT_OK(result->ValidateFull()); @@ -204,7 +205,8 @@ TYPED_TEST(TestHashKernelPrimitive, ZeroChunks) { auto type = TypeTraits::type_singleton(); auto zero_chunks = std::make_shared(ArrayVector{}, type); - ASSERT_OK_AND_ASSIGN(Datum result, DictionaryEncode(zero_chunks)); + ASSERT_OK_AND_ASSIGN( + Datum result, DictionaryEncode(zero_chunks, DictionaryEncodeOptions::Defaults())); ASSERT_EQ(result.kind(), Datum::CHUNKED_ARRAY); AssertChunkedEqual(*result.chunked_array(), @@ -305,6 +307,11 @@ TEST_F(TestHashKernel, ValueCountsBoolean) { ArrayFromJSON(boolean(), "[false]"), ArrayFromJSON(int64(), "[2]")); } +TEST_F(TestHashKernel, ValueCountsNull) { + CheckValueCounts( + null(), {nullptr, nullptr, nullptr}, {true, false, true}, {nullptr}, {false}, {3}); +} + TEST_F(TestHashKernel, DictEncodeBoolean) { CheckDictEncode(boolean(), {true, true, false, true, false}, {true, false, true, true, true}, {true, false}, {}, @@ -365,7 +372,8 @@ 
TYPED_TEST(TestHashKernelBinaryTypes, ZeroChunks) { auto type = this->type(); auto zero_chunks = std::make_shared(ArrayVector{}, type); - ASSERT_OK_AND_ASSIGN(Datum result, DictionaryEncode(zero_chunks)); + ASSERT_OK_AND_ASSIGN( + Datum result, DictionaryEncode(zero_chunks, DictionaryEncodeOptions::Defaults())); ASSERT_EQ(result.kind(), Datum::CHUNKED_ARRAY); AssertChunkedEqual(*result.chunked_array(), @@ -381,7 +389,8 @@ TYPED_TEST(TestHashKernelBinaryTypes, TwoChunks) { ArrayFromJSON(type, "[\"b\"]"), }, type); - ASSERT_OK_AND_ASSIGN(Datum result, DictionaryEncode(two_chunks)); + ASSERT_OK_AND_ASSIGN(Datum result, + DictionaryEncode(two_chunks, DictionaryEncodeOptions::Defaults())); auto dict_type = dictionary(int32(), type); auto dictionary = ArrayFromJSON(type, R"(["a", "b"])"); @@ -542,6 +551,12 @@ TEST_F(TestHashKernel, UniqueDecimal) { {true, false, true, true}, expected, {1, 0, 1}); } +TEST_F(TestHashKernel, UniqueNull) { + CheckUnique(null(), {nullptr, nullptr}, {false, true}, + {nullptr}, {false}); + CheckUnique(null(), {}, {}, {}, {}); +} + TEST_F(TestHashKernel, ValueCountsDecimal) { std::vector values{12, 12, 11, 12}; std::vector expected{12, 0, 11}; @@ -586,6 +601,33 @@ TEST_F(TestHashKernel, DictionaryUniqueAndValueCounts) { auto different_dictionaries = *ChunkedArray::Make({input, input2}); ASSERT_RAISES(Invalid, Unique(different_dictionaries)); ASSERT_RAISES(Invalid, ValueCounts(different_dictionaries)); + + // Dictionary with encoded nulls + auto dict_with_null = ArrayFromJSON(int64(), "[10, null, 30, 40]"); + input = std::make_shared(dict_ty, indices, dict_with_null); + ex_uniques = std::make_shared(dict_ty, ex_indices, dict_with_null); + CheckUnique(input, ex_uniques); + + CheckValueCounts(input, ex_uniques, ex_counts); + + // Dictionary with masked nulls + auto indices_with_null = + ArrayFromJSON(index_ty, "[3, 0, 0, 0, null, null, 3, 0, null, 3, 0, null]"); + auto ex_indices_with_null = ArrayFromJSON(index_ty, "[3, 0, null]"); + ex_uniques = std::make_shared(dict_ty, ex_indices_with_null, dict); + input = std::make_shared(dict_ty, indices_with_null, dict); + CheckUnique(input, ex_uniques); + + CheckValueCounts(input, ex_uniques, ex_counts); + + // Dictionary with encoded AND masked nulls + auto some_indices_with_null = + ArrayFromJSON(index_ty, "[3, 0, 0, 0, 1, 1, 3, 0, null, 3, 0, null]"); + ex_uniques = + std::make_shared(dict_ty, ex_indices_with_null, dict_with_null); + input = std::make_shared(dict_ty, indices_with_null, dict_with_null); + CheckUnique(input, ex_uniques); + CheckValueCounts(input, ex_uniques, ex_counts); } } @@ -640,7 +682,8 @@ TEST_F(TestHashKernel, ChunkedArrayInvoke) { ASSERT_ARRAYS_EQUAL(*ex_counts, *counts->field(1)); // Dictionary encode - ASSERT_OK_AND_ASSIGN(Datum encoded_out, DictionaryEncode(carr)); + ASSERT_OK_AND_ASSIGN(Datum encoded_out, + DictionaryEncode(carr, DictionaryEncodeOptions::Defaults())); ASSERT_EQ(Datum::CHUNKED_ARRAY, encoded_out.kind()); AssertChunkedEqual(*dict_carr, *encoded_out.chunked_array()); @@ -649,13 +692,42 @@ TEST_F(TestHashKernel, ChunkedArrayInvoke) { TEST_F(TestHashKernel, ZeroLengthDictionaryEncode) { // ARROW-7008 auto values = ArrayFromJSON(utf8(), "[]"); - ASSERT_OK_AND_ASSIGN(Datum datum_result, DictionaryEncode(values)); + ASSERT_OK_AND_ASSIGN(Datum datum_result, + DictionaryEncode(values, DictionaryEncodeOptions::Defaults())); std::shared_ptr result = datum_result.make_array(); const auto& dict_result = checked_cast(*result); ASSERT_OK(dict_result.ValidateFull()); } +TEST_F(TestHashKernel, 
NullEncodingSchemes) { + auto values = ArrayFromJSON(uint8(), "[1, 1, null, 2, null]"); + + // Masking should put null in the indices array + auto expected_mask_indices = ArrayFromJSON(int32(), "[0, 0, null, 1, null]"); + auto expected_mask_dictionary = ArrayFromJSON(uint8(), "[1, 2]"); + auto dictionary_type = dictionary(int32(), uint8()); + std::shared_ptr expected = std::make_shared( + dictionary_type, expected_mask_indices, expected_mask_dictionary); + + ASSERT_OK_AND_ASSIGN(Datum datum_result, + DictionaryEncode(values, DictionaryEncodeOptions::Defaults())); + std::shared_ptr result = datum_result.make_array(); + AssertArraysEqual(*expected, *result); + + // Encoding should put null in the dictionary + auto expected_encoded_indices = ArrayFromJSON(int32(), "[0, 0, 1, 2, 1]"); + auto expected_encoded_dict = ArrayFromJSON(uint8(), "[1, null, 2]"); + expected = std::make_shared(dictionary_type, expected_encoded_indices, + expected_encoded_dict); + + auto options = DictionaryEncodeOptions::Defaults(); + options.null_encoding_behavior = DictionaryEncodeOptions::ENCODE; + ASSERT_OK_AND_ASSIGN(datum_result, DictionaryEncode(values, options)); + result = datum_result.make_array(); + AssertArraysEqual(*expected, *result); +} + TEST_F(TestHashKernel, ChunkedArrayZeroChunk) { // ARROW-6857 auto chunked_array = std::make_shared(ArrayVector{}, utf8()); @@ -670,7 +742,9 @@ TEST_F(TestHashKernel, ChunkedArrayZeroChunk) { "[]"); AssertArraysEqual(*expected, *result_array); - ASSERT_OK_AND_ASSIGN(Datum result_datum, DictionaryEncode(chunked_array)); + ASSERT_OK_AND_ASSIGN( + Datum result_datum, + DictionaryEncode(chunked_array, DictionaryEncodeOptions::Defaults())); auto dict_type = dictionary(int32(), chunked_array->type()); ASSERT_EQ(result_datum.kind(), Datum::CHUNKED_ARRAY); diff --git a/cpp/src/arrow/dataset/partition.cc b/cpp/src/arrow/dataset/partition.cc index d6a3723d055..879e28597f5 100644 --- a/cpp/src/arrow/dataset/partition.cc +++ b/cpp/src/arrow/dataset/partition.cc @@ -578,10 +578,6 @@ class StructDictionary { Encoded out{nullptr, std::make_shared()}; for (const auto& column : columns) { - if (column->null_count() != 0) { - return Status::NotImplemented("Grouping on a field with nulls"); - } - RETURN_NOT_OK(out.dictionary->AddOne(column, &out.indices)); } @@ -626,7 +622,11 @@ class StructDictionary { private: Status AddOne(Datum column, std::shared_ptr* fused_indices) { if (column.type()->id() != Type::DICTIONARY) { - ARROW_ASSIGN_OR_RAISE(column, compute::DictionaryEncode(std::move(column))); + compute::DictionaryEncodeOptions options; + options.null_encoding_behavior = + compute::DictionaryEncodeOptions::NullEncodingBehavior::ENCODE; + ARROW_ASSIGN_OR_RAISE(column, + compute::DictionaryEncode(std::move(column), options)); } auto dict_column = column.array_as(); @@ -664,7 +664,9 @@ class StructDictionary { Status RestoreDictionaryEncoding(std::shared_ptr expected_type, Datum* column) { DCHECK_NE(column->type()->id(), Type::DICTIONARY); - ARROW_ASSIGN_OR_RAISE(*column, compute::DictionaryEncode(std::move(*column))); + ARROW_ASSIGN_OR_RAISE( + *column, compute::DictionaryEncode(std::move(*column), + compute::DictionaryEncodeOptions::Defaults())); if (expected_type->index_type()->id() == Type::INT32) { // dictionary_encode has already yielded the expected index_type diff --git a/cpp/src/arrow/dataset/partition_test.cc b/cpp/src/arrow/dataset/partition_test.cc index a412e9f7b8f..876bc77b0ba 100644 --- a/cpp/src/arrow/dataset/partition_test.cc +++ b/cpp/src/arrow/dataset/partition_test.cc 
@@ -80,27 +80,30 @@ class TestPartitioning : public ::testing::Test { void AssertPartition(const std::shared_ptr partitioning, const std::shared_ptr full_batch, - const std::vector>& expected_partition_indices) { + const RecordBatchVector& expected_batches) { ASSERT_OK_AND_ASSIGN(auto partition_results, partitioning->Partition(full_batch)); - ASSERT_EQ(partition_results.batches.size(), expected_partition_indices.size()); - auto max_index = - std::min(partition_results.batches.size(), expected_partition_indices.size()); - for (int partition_index = 0; partition_index < max_index; partition_index++) { + std::shared_ptr rest = full_batch; + ASSERT_EQ(partition_results.batches.size(), expected_batches.size()); + auto max_index = std::min(partition_results.batches.size(), expected_batches.size()); + for (std::size_t partition_index = 0; partition_index < max_index; + partition_index++) { std::shared_ptr actual = partition_results.batches[partition_index]; - std::shared_ptr indices_arr; - ChunkedArrayFromVector({expected_partition_indices[partition_index]}, - &indices_arr); - auto expected = compute::Take(full_batch, indices_arr); - ASSERT_EQ(expected, actual); + AssertBatchesEqual(*expected_batches[partition_index], *actual); } } void AssertPartition(const std::shared_ptr partitioning, const std::shared_ptr schema, const std::string& record_batch_json, - const std::vector>& expected_partition_indices) { + const std::shared_ptr partitioned_schema, + const std::vector& expected_record_batch_strs) { auto record_batch = RecordBatchFromJSON(schema, record_batch_json); - AssertPartition(partitioning, record_batch, expected_partition_indices); + RecordBatchVector expected_batches; + for (const auto& expected_record_batch_str : expected_record_batch_strs) { + expected_batches.push_back( + RecordBatchFromJSON(partitioned_schema, expected_record_batch_str)); + } + AssertPartition(partitioning, record_batch, expected_batches); } void AssertInspectError(const std::vector& paths) { @@ -130,16 +133,21 @@ class TestPartitioning : public ::testing::Test { }; TEST_F(TestPartitioning, Basic) { - auto schema_ = schema({field("a", int32()), field("b", utf8())}); - auto partitioning = std::make_shared(schema_); - std::string json = R"([{"a": 3, "b": "x"}, - {"a": 3, "b": "x"}, - {"a": 1, "b": null}, - {"a": null, "b": null}, - {"a": null, "b": "z"}, - {"a": null, "b": null} + auto partition_schema = schema({field("a", int32()), field("b", utf8())}); + auto schema_ = schema({field("a", int32()), field("b", utf8()), field("c", uint32())}); + auto remaining_schema = schema({field("c", uint32())}); + auto partitioning = std::make_shared(partition_schema); + std::string json = R"([{"a": 3, "b": "x", "c": 0}, + {"a": 3, "b": "x", "c": 1}, + {"a": 1, "b": null, "c": 2}, + {"a": null, "b": null, "c": 3}, + {"a": null, "b": "z", "c": 4}, + {"a": null, "b": null, "c": 5} ])"; - AssertPartition(partitioning, schema_, json, {{0, 1}, {2}, {3, 5}, {4}}); + std::vector expected_batches = {R"([{"c": 0}, {"c": 1}])", R"([{"c": 2}])", + R"([{"c": 3}, {"c": 5}])", + R"([{"c": 4}])"}; + AssertPartition(partitioning, schema_, json, remaining_schema, expected_batches); } TEST_F(TestPartitioning, StructDictionaryNull) {} @@ -643,14 +651,14 @@ TEST(GroupTest, Basics) { TEST(GroupTest, WithNulls) { AssertGrouping({field("a", utf8()), field("b", int32())}, R"([ - {"a": "ex", "b": 0}, - {"a": null, "b": 0}, - {"a": null, "b": 0}, - {"a": "ex", "b": 1}, - {"a": null, "b": null}, - {"a": "ex", "b": 1}, - {"a": "ex", "b": 0}, - {"a": "why", 
"b": null} + {"a": "ex", "b": 0, "id": 0}, + {"a": null, "b": 0, "id": 1}, + {"a": null, "b": 0, "id": 2}, + {"a": "ex", "b": 1, "id": 3}, + {"a": null, "b": null, "id": 4}, + {"a": "ex", "b": 1, "id": 5}, + {"a": "ex", "b": 0, "id": 6}, + {"a": "why", "b": null, "id": 7} ])", R"([ {"a": "ex", "b": 0, "ids": [0, 6]}, diff --git a/cpp/src/arrow/python/arrow_to_pandas.cc b/cpp/src/arrow/python/arrow_to_pandas.cc index 09245285030..cd861deda31 100644 --- a/cpp/src/arrow/python/arrow_to_pandas.cc +++ b/cpp/src/arrow/python/arrow_to_pandas.cc @@ -2183,7 +2183,10 @@ Status ConvertCategoricals(const PandasOptions& options, ChunkedArrayVector* arr "only zero-copy conversions allowed"); } compute::ExecContext ctx(options.pool); - ARROW_ASSIGN_OR_RAISE(Datum out, DictionaryEncode((*arrays)[i], &ctx)); + // TODO: Should we include nulls here? + ARROW_ASSIGN_OR_RAISE( + Datum out, DictionaryEncode((*arrays)[i], + compute::DictionaryEncodeOptions::Defaults(), &ctx)); (*arrays)[i] = out.chunked_array(); (*fields)[i] = (*fields)[i]->WithType((*arrays)[i]->type()); return Status::OK(); @@ -2232,7 +2235,9 @@ Status ConvertChunkedArrayToPandas(const PandasOptions& options, "only zero-copy conversions allowed"); } compute::ExecContext ctx(options.pool); - ARROW_ASSIGN_OR_RAISE(Datum out, DictionaryEncode(arr, &ctx)); + ARROW_ASSIGN_OR_RAISE( + Datum out, + DictionaryEncode(arr, compute::DictionaryEncodeOptions::Defaults(), &ctx)); arr = out.chunked_array(); } diff --git a/cpp/src/parquet/arrow/arrow_reader_writer_test.cc b/cpp/src/parquet/arrow/arrow_reader_writer_test.cc index ca702152d61..45c04ed5f81 100644 --- a/cpp/src/parquet/arrow/arrow_reader_writer_test.cc +++ b/cpp/src/parquet/arrow/arrow_reader_writer_test.cc @@ -79,6 +79,7 @@ using arrow::Status; using arrow::Table; using arrow::TimeUnit; using arrow::compute::DictionaryEncode; +using arrow::compute::DictionaryEncodeOptions; using arrow::internal::checked_cast; using arrow::internal::checked_pointer_cast; using arrow::internal::Iota; @@ -884,7 +885,8 @@ TYPED_TEST(TestParquetIO, SingleColumnOptionalDictionaryWrite) { ASSERT_OK(NullableArray(SMALL_SIZE, 10, kDefaultSeed, &values)); - ASSERT_OK_AND_ASSIGN(Datum out, DictionaryEncode(values)); + ASSERT_OK_AND_ASSIGN(Datum out, + DictionaryEncode(values, DictionaryEncodeOptions::Defaults())); std::shared_ptr dict_values = MakeArray(out.array()); std::shared_ptr schema = MakeSimpleSchema(*dict_values->type(), Repetition::OPTIONAL); From 807c600651eec7f5a4277eacea493b34564ea559 Mon Sep 17 00:00:00 2001 From: Weston Pace Date: Mon, 25 Jan 2021 17:49:10 -1000 Subject: [PATCH 04/33] Prevent using dictionary columns as partition columns. It wouldn't work. 
--- cpp/src/arrow/dataset/partition.cc | 16 ++++++++++------ cpp/src/arrow/python/arrow_to_pandas.cc | 1 - 2 files changed, 10 insertions(+), 7 deletions(-) diff --git a/cpp/src/arrow/dataset/partition.cc b/cpp/src/arrow/dataset/partition.cc index 879e28597f5..bb52e2d8fbd 100644 --- a/cpp/src/arrow/dataset/partition.cc +++ b/cpp/src/arrow/dataset/partition.cc @@ -621,13 +621,17 @@ class StructDictionary { private: Status AddOne(Datum column, std::shared_ptr* fused_indices) { - if (column.type()->id() != Type::DICTIONARY) { - compute::DictionaryEncodeOptions options; - options.null_encoding_behavior = - compute::DictionaryEncodeOptions::NullEncodingBehavior::ENCODE; - ARROW_ASSIGN_OR_RAISE(column, - compute::DictionaryEncode(std::move(column), options)); + if (column.type()->id() == Type::DICTIONARY) { + // compute::DictionaryEncode doesn't support dictionary and, even if it did, it + // would be a null op and return a flat dictionary. In order to group by dictionary + // we would need to be able to create a nested dictionary. + return Status::NotImplemented( + "Cannot use column of type dictionary as grouping criteria"); } + compute::DictionaryEncodeOptions options; + options.null_encoding_behavior = + compute::DictionaryEncodeOptions::NullEncodingBehavior::ENCODE; + ARROW_ASSIGN_OR_RAISE(column, compute::DictionaryEncode(std::move(column), options)); auto dict_column = column.array_as(); dictionaries_.push_back(dict_column->dictionary()); diff --git a/cpp/src/arrow/python/arrow_to_pandas.cc b/cpp/src/arrow/python/arrow_to_pandas.cc index cd861deda31..1c47f9742de 100644 --- a/cpp/src/arrow/python/arrow_to_pandas.cc +++ b/cpp/src/arrow/python/arrow_to_pandas.cc @@ -2183,7 +2183,6 @@ Status ConvertCategoricals(const PandasOptions& options, ChunkedArrayVector* arr "only zero-copy conversions allowed"); } compute::ExecContext ctx(options.pool); - // TODO: Should we include nulls here? 
ARROW_ASSIGN_OR_RAISE( Datum out, DictionaryEncode((*arrays)[i], compute::DictionaryEncodeOptions::Defaults(), &ctx)); From 353ea9d4e2c106de0cc903f3ba8ff3c34b494522 Mon Sep 17 00:00:00 2001 From: Weston Pace Date: Mon, 1 Feb 2021 15:41:54 -1000 Subject: [PATCH 05/33] Addressing PR comments --- cpp/src/arrow/compute/api_vector.h | 25 ++++-- .../arrow/compute/kernels/scalar_cast_test.cc | 3 +- cpp/src/arrow/compute/kernels/vector_hash.cc | 77 ++++++++++--------- .../compute/kernels/vector_hash_benchmark.cc | 6 +- .../arrow/compute/kernels/vector_hash_test.cc | 25 ++---- cpp/src/arrow/dataset/partition.cc | 33 +++++--- cpp/src/arrow/dataset/partition_test.cc | 19 +++++ cpp/src/arrow/pretty_print.cc | 45 ----------- cpp/src/arrow/pretty_print.h | 13 ---- .../parquet/arrow/arrow_reader_writer_test.cc | 3 +- 10 files changed, 114 insertions(+), 135 deletions(-) diff --git a/cpp/src/arrow/compute/api_vector.h b/cpp/src/arrow/compute/api_vector.h index 6a334dffda1..d67568e1567 100644 --- a/cpp/src/arrow/compute/api_vector.h +++ b/cpp/src/arrow/compute/api_vector.h @@ -63,18 +63,17 @@ enum class SortOrder { Descending, }; +/// \brief Options for the dictionary encode function struct DictionaryEncodeOptions : public FunctionOptions { /// Configure how null values will be encoded enum NullEncodingBehavior { /// the null value will be added to the dictionary with a proper index ENCODE, /// the null value will be masked in the indices array - MASK, - /// the null value will not be included in the dictionary - SKIP + MASK }; - explicit DictionaryEncodeOptions(NullEncodingBehavior null_encoding = SKIP) + explicit DictionaryEncodeOptions(NullEncodingBehavior null_encoding = MASK) : null_encoding_behavior(null_encoding) {} static DictionaryEncodeOptions Defaults() { return DictionaryEncodeOptions(); } @@ -308,15 +307,29 @@ Result> ValueCounts(const Datum& value, ExecContext* ctx = NULLPTR); /// \brief Dictionary-encode values in an array-like object +/// +/// Any nulls encountered in the dictionary will be handled according to the +/// specified null encoding behavior. 
+/// +/// For example, given values ["a", "b", null, "a", null] the output will be +/// (null_encoding == ENCODE) Indices: [0, 1, 2, 0, 2] / Dict: ["a", "b", null] +/// (null_encoding == MASK) Indices: [0, 1, null, 0, null] / Dict: ["a", "b"] +/// +/// If the input is already dictionary encoded this function is a no-op unless +/// it needs to modify the null_encoding (TODO) +/// /// \param[in] data array-like input /// \param[in] ctx the function execution context, optional +/// \param[in] options configures null encoding behavior /// \return result with same shape and type as input /// /// \since 1.0.0 /// \note API not yet finalized ARROW_EXPORT -Result DictionaryEncode(const Datum& data, const DictionaryEncodeOptions& options, - ExecContext* ctx = NULLPTR); +Result DictionaryEncode( + const Datum& data, + const DictionaryEncodeOptions& options = DictionaryEncodeOptions::Defaults(), + ExecContext* ctx = NULLPTR); // ---------------------------------------------------------------------- // Deprecated functions diff --git a/cpp/src/arrow/compute/kernels/scalar_cast_test.cc b/cpp/src/arrow/compute/kernels/scalar_cast_test.cc index d84aefa5b19..99a56346c1b 100644 --- a/cpp/src/arrow/compute/kernels/scalar_cast_test.cc +++ b/cpp/src/arrow/compute/kernels/scalar_cast_test.cc @@ -1472,8 +1472,7 @@ TEST(Cast, FromDictionary) { ASSERT_OK_AND_ASSIGN(auto no_nulls, Take(*dict, *indices)); ASSERT_EQ(no_nulls->null_count(), 0); - ASSERT_OK_AND_ASSIGN(Datum encoded, - DictionaryEncode(no_nulls, DictionaryEncodeOptions::Defaults())); + ASSERT_OK_AND_ASSIGN(Datum encoded, DictionaryEncode(no_nulls)); // Make a new dict array with nullptr bitmap buffer auto data = encoded.array()->Copy(); diff --git a/cpp/src/arrow/compute/kernels/vector_hash.cc b/cpp/src/arrow/compute/kernels/vector_hash.cc index 3ea8f905745..c7b25347624 100644 --- a/cpp/src/arrow/compute/kernels/vector_hash.cc +++ b/cpp/src/arrow/compute/kernels/vector_hash.cc @@ -58,7 +58,10 @@ class UniqueAction final : public ActionBase { using ActionBase::ActionBase; static constexpr bool with_error_status = false; - static constexpr bool with_memo_visit_null = true; + + UniqueAction(const std::shared_ptr& type, const FunctionOptions* options, + MemoryPool* pool) + : ActionBase(type, pool) {} Status Reset() { return Status::OK(); } @@ -76,6 +79,8 @@ class UniqueAction final : public ActionBase { template void ObserveNotFound(Index index) {} + bool ShouldEncodeNulls() { return true; } + Status Flush(Datum* out) { return Status::OK(); } Status FlushFinal(Datum* out) { return Status::OK(); } @@ -89,9 +94,9 @@ class ValueCountsAction final : ActionBase { using ActionBase::ActionBase; static constexpr bool with_error_status = true; - static constexpr bool with_memo_visit_null = true; - ValueCountsAction(const std::shared_ptr& type, MemoryPool* pool) + ValueCountsAction(const std::shared_ptr& type, const FunctionOptions* options, + MemoryPool* pool) : ActionBase(type, pool), count_builder_(pool) {} Status Reserve(const int64_t length) { @@ -147,6 +152,8 @@ class ValueCountsAction final : ActionBase { } } + bool ShouldEncodeNulls() { return true; } + private: Int64Builder count_builder_; }; @@ -159,10 +166,14 @@ class DictEncodeAction final : public ActionBase { using ActionBase::ActionBase; static constexpr bool with_error_status = false; - static constexpr bool with_memo_visit_null = false; - DictEncodeAction(const std::shared_ptr& type, MemoryPool* pool) - : ActionBase(type, pool), indices_builder_(pool) {} + DictEncodeAction(const std::shared_ptr& 
type, const FunctionOptions* options, + MemoryPool* pool) + : ActionBase(type, pool), indices_builder_(pool) { + if (auto options_ptr = static_cast(options)) { + encode_options_ = *options_ptr; + } + } Status Reset() { indices_builder_.Reset(); @@ -173,7 +184,7 @@ class DictEncodeAction final : public ActionBase { template void ObserveNullFound(Index index) { - if (index < 0) { + if (encode_options_.null_encoding_behavior == DictionaryEncodeOptions::MASK) { indices_builder_.UnsafeAppendNull(); } else { indices_builder_.UnsafeAppend(index); @@ -195,6 +206,10 @@ class DictEncodeAction final : public ActionBase { ObserveFound(index); } + bool ShouldEncodeNulls() { + return encode_options_.null_encoding_behavior == DictionaryEncodeOptions::ENCODE; + } + Status Flush(Datum* out) { std::shared_ptr result; RETURN_NOT_OK(indices_builder_.FinishInternal(&result)); @@ -206,12 +221,13 @@ class DictEncodeAction final : public ActionBase { private: Int32Builder indices_builder_; + DictionaryEncodeOptions encode_options_; }; class HashKernel : public KernelState { public: - HashKernel() : options_(DictionaryEncodeOptions::Defaults()) {} - explicit HashKernel(const DictionaryEncodeOptions& options) : options_(options) {} + HashKernel() : options_(nullptr) {} + explicit HashKernel(const FunctionOptions* options) : options_(options) {} // Reset for another run. virtual Status Reset() = 0; @@ -236,7 +252,7 @@ class HashKernel : public KernelState { virtual Status Append(const ArrayData& arr) = 0; protected: - DictionaryEncodeOptions options_; + const FunctionOptions* options_; std::mutex lock_; }; @@ -245,13 +261,12 @@ class HashKernel : public KernelState { // (NullType has a separate implementation) template + bool with_error_status = Action::with_error_status> class RegularHashKernel : public HashKernel { public: - RegularHashKernel(const std::shared_ptr& type, - const DictionaryEncodeOptions& options, MemoryPool* pool) - : HashKernel(options), pool_(pool), type_(type), action_(type, pool) {} + RegularHashKernel(const std::shared_ptr& type, const FunctionOptions* options, + MemoryPool* pool) + : HashKernel(options), pool_(pool), type_(type), action_(type, options, pool) {} Status Reset() override { memo_table_.reset(new MemoTable(pool_, 0)); @@ -291,9 +306,7 @@ class RegularHashKernel : public HashKernel { &unused_memo_index); }, [this]() { - if (with_memo_visit_null || - options_.null_encoding_behavior == - DictionaryEncodeOptions::NullEncodingBehavior::ENCODE) { + if (action_.ShouldEncodeNulls()) { auto on_found = [this](int32_t memo_index) { action_.ObserveNullFound(memo_index); }; @@ -329,16 +342,14 @@ class RegularHashKernel : public HashKernel { [this]() { // Null Status s = Status::OK(); - if (with_memo_visit_null) { - auto on_found = [this](int32_t memo_index) { - action_.ObserveNullFound(memo_index); - }; - auto on_not_found = [this, &s](int32_t memo_index) { - action_.ObserveNullNotFound(memo_index, &s); - }; + auto on_found = [this](int32_t memo_index) { + action_.ObserveNullFound(memo_index); + }; + auto on_not_found = [this, &s](int32_t memo_index) { + action_.ObserveNullNotFound(memo_index, &s); + }; + if (action_.ShouldEncodeNulls()) { memo_table_->GetOrInsertNull(std::move(on_found), std::move(on_not_found)); - } else { - action_.ObserveNullNotFound(-1); } return s; }); @@ -359,9 +370,9 @@ class RegularHashKernel : public HashKernel { template class NullHashKernel : public HashKernel { public: - NullHashKernel(const std::shared_ptr& type, - const DictionaryEncodeOptions& options, 
MemoryPool* pool) - : pool_(pool), type_(type), action_(type, pool) {} + NullHashKernel(const std::shared_ptr& type, const FunctionOptions* options, + MemoryPool* pool) + : pool_(pool), type_(type), action_(type, options, pool) {} Status Reset() override { return action_.Reset(); } @@ -487,12 +498,8 @@ struct HashKernelTraits> { template std::unique_ptr HashInitImpl(KernelContext* ctx, const KernelInitArgs& args) { using HashKernelType = typename HashKernelTraits::HashKernel; - DictionaryEncodeOptions options; - if (auto options_ptr = static_cast(args.options)) { - options = *options_ptr; - } auto result = ::arrow::internal::make_unique( - args.inputs[0].type, options, ctx->memory_pool()); + args.inputs[0].type, args.options, ctx->memory_pool()); ctx->SetStatus(result->Reset()); return std::move(result); } diff --git a/cpp/src/arrow/compute/kernels/vector_hash_benchmark.cc b/cpp/src/arrow/compute/kernels/vector_hash_benchmark.cc index d6b203181eb..3be549d05ce 100644 --- a/cpp/src/arrow/compute/kernels/vector_hash_benchmark.cc +++ b/cpp/src/arrow/compute/kernels/vector_hash_benchmark.cc @@ -46,7 +46,7 @@ static void BuildDictionary(benchmark::State& state) { // NOLINT non-const refe ArrayFromVector(is_valid, values, &arr); while (state.KeepRunning()) { - ABORT_NOT_OK(DictionaryEncode(arr, DictionaryEncodeOptions::Defaults()).status()); + ABORT_NOT_OK(DictionaryEncode(arr).status()); } state.counters["null_percent"] = static_cast(arr->null_count()) / arr->length() * 100; @@ -73,7 +73,7 @@ static void BuildStringDictionary( ArrayFromVector(data, &arr); while (state.KeepRunning()) { - ABORT_NOT_OK(DictionaryEncode(arr, DictionaryEncodeOptions::Defaults()).status()); + ABORT_NOT_OK(DictionaryEncode(arr).status()); } state.SetBytesProcessed(state.iterations() * total_bytes); state.SetItemsProcessed(state.iterations() * data.size()); @@ -169,7 +169,7 @@ void BenchDictionaryEncode(benchmark::State& state, const ParamType& params) { std::shared_ptr arr; params.GenerateTestData(&arr); while (state.KeepRunning()) { - ABORT_NOT_OK(DictionaryEncode(arr, DictionaryEncodeOptions::Defaults()).status()); + ABORT_NOT_OK(DictionaryEncode(arr).status()); } params.SetMetadata(state); } diff --git a/cpp/src/arrow/compute/kernels/vector_hash_test.cc b/cpp/src/arrow/compute/kernels/vector_hash_test.cc index 4dc106138d7..f4cd7dbf41f 100644 --- a/cpp/src/arrow/compute/kernels/vector_hash_test.cc +++ b/cpp/src/arrow/compute/kernels/vector_hash_test.cc @@ -126,8 +126,7 @@ void CheckDictEncode(const std::shared_ptr& input, auto type = dictionary(expected_indices->type(), expected_values->type()); DictionaryArray expected(type, expected_indices, expected_values); - ASSERT_OK_AND_ASSIGN(Datum datum_out, - DictionaryEncode(input, DictionaryEncodeOptions::Defaults())); + ASSERT_OK_AND_ASSIGN(Datum datum_out, DictionaryEncode(input)); std::shared_ptr result = MakeArray(datum_out.array()); ASSERT_OK(result->ValidateFull()); @@ -205,8 +204,7 @@ TYPED_TEST(TestHashKernelPrimitive, ZeroChunks) { auto type = TypeTraits::type_singleton(); auto zero_chunks = std::make_shared(ArrayVector{}, type); - ASSERT_OK_AND_ASSIGN( - Datum result, DictionaryEncode(zero_chunks, DictionaryEncodeOptions::Defaults())); + ASSERT_OK_AND_ASSIGN(Datum result, DictionaryEncode(zero_chunks)); ASSERT_EQ(result.kind(), Datum::CHUNKED_ARRAY); AssertChunkedEqual(*result.chunked_array(), @@ -372,8 +370,7 @@ TYPED_TEST(TestHashKernelBinaryTypes, ZeroChunks) { auto type = this->type(); auto zero_chunks = std::make_shared(ArrayVector{}, type); - 
ASSERT_OK_AND_ASSIGN( - Datum result, DictionaryEncode(zero_chunks, DictionaryEncodeOptions::Defaults())); + ASSERT_OK_AND_ASSIGN(Datum result, DictionaryEncode(zero_chunks)); ASSERT_EQ(result.kind(), Datum::CHUNKED_ARRAY); AssertChunkedEqual(*result.chunked_array(), @@ -389,8 +386,7 @@ TYPED_TEST(TestHashKernelBinaryTypes, TwoChunks) { ArrayFromJSON(type, "[\"b\"]"), }, type); - ASSERT_OK_AND_ASSIGN(Datum result, - DictionaryEncode(two_chunks, DictionaryEncodeOptions::Defaults())); + ASSERT_OK_AND_ASSIGN(Datum result, DictionaryEncode(two_chunks)); auto dict_type = dictionary(int32(), type); auto dictionary = ArrayFromJSON(type, R"(["a", "b"])"); @@ -682,8 +678,7 @@ TEST_F(TestHashKernel, ChunkedArrayInvoke) { ASSERT_ARRAYS_EQUAL(*ex_counts, *counts->field(1)); // Dictionary encode - ASSERT_OK_AND_ASSIGN(Datum encoded_out, - DictionaryEncode(carr, DictionaryEncodeOptions::Defaults())); + ASSERT_OK_AND_ASSIGN(Datum encoded_out, DictionaryEncode(carr)); ASSERT_EQ(Datum::CHUNKED_ARRAY, encoded_out.kind()); AssertChunkedEqual(*dict_carr, *encoded_out.chunked_array()); @@ -692,8 +687,7 @@ TEST_F(TestHashKernel, ChunkedArrayInvoke) { TEST_F(TestHashKernel, ZeroLengthDictionaryEncode) { // ARROW-7008 auto values = ArrayFromJSON(utf8(), "[]"); - ASSERT_OK_AND_ASSIGN(Datum datum_result, - DictionaryEncode(values, DictionaryEncodeOptions::Defaults())); + ASSERT_OK_AND_ASSIGN(Datum datum_result, DictionaryEncode(values)); std::shared_ptr result = datum_result.make_array(); const auto& dict_result = checked_cast(*result); @@ -710,8 +704,7 @@ TEST_F(TestHashKernel, NullEncodingSchemes) { std::shared_ptr expected = std::make_shared( dictionary_type, expected_mask_indices, expected_mask_dictionary); - ASSERT_OK_AND_ASSIGN(Datum datum_result, - DictionaryEncode(values, DictionaryEncodeOptions::Defaults())); + ASSERT_OK_AND_ASSIGN(Datum datum_result, DictionaryEncode(values)); std::shared_ptr result = datum_result.make_array(); AssertArraysEqual(*expected, *result); @@ -742,9 +735,7 @@ TEST_F(TestHashKernel, ChunkedArrayZeroChunk) { "[]"); AssertArraysEqual(*expected, *result_array); - ASSERT_OK_AND_ASSIGN( - Datum result_datum, - DictionaryEncode(chunked_array, DictionaryEncodeOptions::Defaults())); + ASSERT_OK_AND_ASSIGN(Datum result_datum, DictionaryEncode(chunked_array)); auto dict_type = dictionary(int32(), chunked_array->type()); ASSERT_EQ(result_datum.kind(), Datum::CHUNKED_ARRAY); diff --git a/cpp/src/arrow/dataset/partition.cc b/cpp/src/arrow/dataset/partition.cc index bb52e2d8fbd..2cd9fac1f3e 100644 --- a/cpp/src/arrow/dataset/partition.cc +++ b/cpp/src/arrow/dataset/partition.cc @@ -622,16 +622,27 @@ class StructDictionary { private: Status AddOne(Datum column, std::shared_ptr* fused_indices) { if (column.type()->id() == Type::DICTIONARY) { - // compute::DictionaryEncode doesn't support dictionary and, even if it did, it - // would be a null op and return a flat dictionary. In order to group by dictionary - // we would need to be able to create a nested dictionary. - return Status::NotImplemented( - "Cannot use column of type dictionary as grouping criteria"); + if (column.null_count() != 0) { + // TODO Optimize this by allowign DictionaryEncode to transfer a null-masked + // dictionary to a null-encoded dictionary. At the moment we decode and then + // encode causing one extra copy, and a potentially expansive decoding copy at + // that. 
+ ARROW_ASSIGN_OR_RAISE( + auto decoded_dictionary, + compute::Cast( + column, + std::static_pointer_cast(column.type())->value_type(), + compute::CastOptions())); + column = decoded_dictionary; + } + } + if (column.type()->id() != Type::DICTIONARY) { + compute::DictionaryEncodeOptions options; + options.null_encoding_behavior = + compute::DictionaryEncodeOptions::NullEncodingBehavior::ENCODE; + ARROW_ASSIGN_OR_RAISE(column, + compute::DictionaryEncode(std::move(column), options)); } - compute::DictionaryEncodeOptions options; - options.null_encoding_behavior = - compute::DictionaryEncodeOptions::NullEncodingBehavior::ENCODE; - ARROW_ASSIGN_OR_RAISE(column, compute::DictionaryEncode(std::move(column), options)); auto dict_column = column.array_as(); dictionaries_.push_back(dict_column->dictionary()); @@ -668,9 +679,7 @@ class StructDictionary { Status RestoreDictionaryEncoding(std::shared_ptr expected_type, Datum* column) { DCHECK_NE(column->type()->id(), Type::DICTIONARY); - ARROW_ASSIGN_OR_RAISE( - *column, compute::DictionaryEncode(std::move(*column), - compute::DictionaryEncodeOptions::Defaults())); + ARROW_ASSIGN_OR_RAISE(*column, compute::DictionaryEncode(std::move(*column))); if (expected_type->index_type()->id() == Type::INT32) { // dictionary_encode has already yielded the expected index_type diff --git a/cpp/src/arrow/dataset/partition_test.cc b/cpp/src/arrow/dataset/partition_test.cc index 876bc77b0ba..e7c4baf85ad 100644 --- a/cpp/src/arrow/dataset/partition_test.cc +++ b/cpp/src/arrow/dataset/partition_test.cc @@ -668,6 +668,25 @@ TEST(GroupTest, WithNulls) { {"a": "why", "b": null, "ids": [7]} ])"); + AssertGrouping({field("a", dictionary(int32(), utf8())), field("b", int32())}, + R"([ + {"a": "ex", "b": 0, "id": 0}, + {"a": null, "b": 0, "id": 1}, + {"a": null, "b": 0, "id": 2}, + {"a": "ex", "b": 1, "id": 3}, + {"a": null, "b": null, "id": 4}, + {"a": "ex", "b": 1, "id": 5}, + {"a": "ex", "b": 0, "id": 6}, + {"a": "why", "b": null, "id": 7} + ])", + R"([ + {"a": "ex", "b": 0, "ids": [0, 6]}, + {"a": null, "b": 0, "ids": [1, 2]}, + {"a": "ex", "b": 1, "ids": [3, 5]}, + {"a": null, "b": null, "ids": [4]}, + {"a": "why", "b": null, "ids": [7]} + ])"); + auto has_nulls = checked_pointer_cast( ArrayFromJSON(struct_({field("a", utf8()), field("b", int32())}), R"([ {"a": "ex", "b": 0}, diff --git a/cpp/src/arrow/pretty_print.cc b/cpp/src/arrow/pretty_print.cc index d61e6cde2b6..8c2ac376d1e 100644 --- a/cpp/src/arrow/pretty_print.cc +++ b/cpp/src/arrow/pretty_print.cc @@ -670,49 +670,4 @@ Status PrettyPrint(const Schema& schema, const PrettyPrintOptions& options, return Status::OK(); } -void GdbPrintArray(const Array& arr, int indent) { - PrettyPrintOptions options; - options.indent = indent; - auto print_st = PrettyPrint(arr, options, &std::cout); - if (!print_st.ok()) { - std::cout << "Could not print: " << print_st.message(); - } -} - -void GdbPrintRecordBatch(const RecordBatch& rb, int indent) { - PrettyPrintOptions options; - options.indent = indent; - auto print_st = PrettyPrint(rb, options, &std::cout); - if (!print_st.ok()) { - std::cout << "Could not print: " << print_st.message(); - } -} - -void GdbPrintTable(const Table& table, int indent) { - PrettyPrintOptions options; - options.indent = indent; - auto print_st = PrettyPrint(table, options, &std::cout); - if (!print_st.ok()) { - std::cout << "Could not print: " << print_st.message(); - } -} - -void GdbPrintChunkedArray(const ChunkedArray& chunked_arr, int indent) { - PrettyPrintOptions options; - options.indent = 
indent; - auto print_st = PrettyPrint(chunked_arr, options, &std::cout); - if (!print_st.ok()) { - std::cout << "Could not print: " << print_st.message(); - } -} - -void GdbPrintSchema(const Schema& schema, int indent) { - PrettyPrintOptions options; - options.indent = indent; - auto print_st = PrettyPrint(schema, options, &std::cout); - if (!print_st.ok()) { - std::cout << "Could not print: " << print_st.message(); - } -} - } // namespace arrow diff --git a/cpp/src/arrow/pretty_print.h b/cpp/src/arrow/pretty_print.h index 43948b8f149..9d2c72c7186 100644 --- a/cpp/src/arrow/pretty_print.h +++ b/cpp/src/arrow/pretty_print.h @@ -120,17 +120,4 @@ Status PrettyPrint(const Schema& schema, const PrettyPrintOptions& options, ARROW_EXPORT Status DebugPrint(const Array& arr, int indent); -// These print routines are used in the gdb pretty printers which are -// not capable of passing "out" params and do a poor job of overload resolution - -ARROW_EXPORT void GdbPrintArray(const Array& arr, int indent); - -ARROW_EXPORT void GdbPrintRecordBatch(const RecordBatch& rb, int indent); - -ARROW_EXPORT void GdbPrintTable(const Table& table, int indent); - -ARROW_EXPORT void GdbPrintChunkedArray(const ChunkedArray& chunked_arr, int indent); - -ARROW_EXPORT void GdbPrintSchema(const Schema& schema, int indent); - } // namespace arrow diff --git a/cpp/src/parquet/arrow/arrow_reader_writer_test.cc b/cpp/src/parquet/arrow/arrow_reader_writer_test.cc index 45c04ed5f81..c6304ec4213 100644 --- a/cpp/src/parquet/arrow/arrow_reader_writer_test.cc +++ b/cpp/src/parquet/arrow/arrow_reader_writer_test.cc @@ -885,8 +885,7 @@ TYPED_TEST(TestParquetIO, SingleColumnOptionalDictionaryWrite) { ASSERT_OK(NullableArray(SMALL_SIZE, 10, kDefaultSeed, &values)); - ASSERT_OK_AND_ASSIGN(Datum out, - DictionaryEncode(values, DictionaryEncodeOptions::Defaults())); + ASSERT_OK_AND_ASSIGN(Datum out, DictionaryEncode(values)); std::shared_ptr dict_values = MakeArray(out.array()); std::shared_ptr schema = MakeSimpleSchema(*dict_values->type(), Repetition::OPTIONAL); From 68cf487a7f4ea4ee1a156233a6509f583538f692 Mon Sep 17 00:00:00 2001 From: Weston Pace Date: Mon, 1 Feb 2021 15:44:51 -1000 Subject: [PATCH 06/33] Taking out an extraneous using that I missed in the last commit --- cpp/src/parquet/arrow/arrow_reader_writer_test.cc | 1 - 1 file changed, 1 deletion(-) diff --git a/cpp/src/parquet/arrow/arrow_reader_writer_test.cc b/cpp/src/parquet/arrow/arrow_reader_writer_test.cc index c6304ec4213..ca702152d61 100644 --- a/cpp/src/parquet/arrow/arrow_reader_writer_test.cc +++ b/cpp/src/parquet/arrow/arrow_reader_writer_test.cc @@ -79,7 +79,6 @@ using arrow::Status; using arrow::Table; using arrow::TimeUnit; using arrow::compute::DictionaryEncode; -using arrow::compute::DictionaryEncodeOptions; using arrow::internal::checked_cast; using arrow::internal::checked_pointer_cast; using arrow::internal::Iota; From ae0b8595ac7a4d22c8276488cfcf7c3405f2fc87 Mon Sep 17 00:00:00 2001 From: Weston Pace Date: Thu, 4 Feb 2021 10:46:49 -1000 Subject: [PATCH 07/33] WIP --- cpp/src/arrow/dataset/expression.cc | 2 ++ cpp/src/arrow/dataset/expression.h | 3 +++ cpp/src/arrow/dataset/partition_test.cc | 3 +++ 3 files changed, 8 insertions(+) diff --git a/cpp/src/arrow/dataset/expression.cc b/cpp/src/arrow/dataset/expression.cc index 56339430ee9..d5bcd3fb0eb 100644 --- a/cpp/src/arrow/dataset/expression.cc +++ b/cpp/src/arrow/dataset/expression.cc @@ -51,6 +51,8 @@ Expression::Expression(Parameter parameter) Expression literal(Datum lit) { return 
Expression(std::move(lit)); } +Expression null_literal() { return Expression(Datum()); } + Expression field_ref(FieldRef ref) { return Expression(Expression::Parameter{std::move(ref), {}}); } diff --git a/cpp/src/arrow/dataset/expression.h b/cpp/src/arrow/dataset/expression.h index 13c714b2d72..33ffdddb8a6 100644 --- a/cpp/src/arrow/dataset/expression.h +++ b/cpp/src/arrow/dataset/expression.h @@ -135,6 +135,9 @@ inline bool operator!=(const Expression& l, const Expression& r) { return !l.Equ ARROW_DS_EXPORT Expression literal(Datum lit); +ARROW_DS_EXPORT +Expression null_literal(); + template Expression literal(Arg&& arg) { return literal(Datum(std::forward(arg))); diff --git a/cpp/src/arrow/dataset/partition_test.cc b/cpp/src/arrow/dataset/partition_test.cc index e7c4baf85ad..91555f9d1fd 100644 --- a/cpp/src/arrow/dataset/partition_test.cc +++ b/cpp/src/arrow/dataset/partition_test.cc @@ -341,6 +341,9 @@ TEST_F(TestPartitioning, HivePartitioningFormat) { equal(field_ref("beta"), literal(3.25f))}), "alpha=0/beta=3.25"); + AssertFormat(equal(field_ref("alpha"), literal(MakeNullScalar(int32()))), + "alpha=_HIVE_DEFAULT_PARTITION_"); + // written_schema_ is incompatible with partitioning_'s schema written_schema_ = schema({field("alpha", utf8()), field("beta", utf8())}); AssertFormatError( From c941bae6ceb74608cb3087e69754daa00e3358bc Mon Sep 17 00:00:00 2001 From: Weston Pace Date: Thu, 4 Feb 2021 23:00:22 -1000 Subject: [PATCH 08/33] Adding the null fallback logic to the python half --- cpp/src/arrow/dataset/partition.cc | 24 +- cpp/src/arrow/dataset/partition.h | 19 +- cpp/src/arrow/dataset/partition_test.cc | 21 +- python/pyarrow/_dataset.pyx | 20 +- python/pyarrow/includes/libarrow_dataset.pxd | 7 +- python/pyarrow/tests/test_dataset.py | 1316 ++++++++++-------- 6 files changed, 781 insertions(+), 626 deletions(-) diff --git a/cpp/src/arrow/dataset/partition.cc b/cpp/src/arrow/dataset/partition.cc index 2cd9fac1f3e..595cce8021d 100644 --- a/cpp/src/arrow/dataset/partition.cc +++ b/cpp/src/arrow/dataset/partition.cc @@ -410,12 +410,17 @@ std::shared_ptr DirectoryPartitioning::MakeFactory( } util::optional HivePartitioning::ParseKey( - const std::string& segment) { + const std::string& segment, const std::string& null_fallback) { auto name_end = string_view(segment).find_first_of('='); + // Keep for backwards compatibility, this would be produced by arrow <= 3 if (name_end == string_view::npos) { return util::nullopt; } + auto value = segment.substr(name_end + 1); + if (value == null_fallback) { + return util::nullopt; + } return Key{segment.substr(0, name_end), segment.substr(name_end + 1)}; } @@ -424,7 +429,7 @@ std::vector HivePartitioning::ParseKeys( std::vector keys; for (const auto& segment : fs::internal::SplitAbstractPath(path)) { - if (auto key = ParseKey(segment)) { + if (auto key = ParseKey(segment, null_fallback_)) { keys.push_back(std::move(*key)); } } @@ -438,12 +443,10 @@ Result HivePartitioning::FormatValues(const ScalarVector& values) c for (int i = 0; i < schema_->num_fields(); ++i) { const std::string& name = schema_->field(i)->name(); - if (values[i] == nullptr) { - if (!NextValid(values, i)) break; - + if (values[i] == nullptr || !values[i]->is_valid) { // If no key is available just provide a placeholder segment to maintain the // field_index <-> path nesting relation - segments[i] = name; + segments[i] = name + "=" + null_fallback_; } else { segments[i] = name + "=" + values[i]->ToString(); } @@ -454,8 +457,8 @@ Result HivePartitioning::FormatValues(const 
ScalarVector& values) c class HivePartitioningFactory : public KeyValuePartitioningFactory { public: - explicit HivePartitioningFactory(PartitioningFactoryOptions options) - : KeyValuePartitioningFactory(options) {} + explicit HivePartitioningFactory(HivePartitioningFactoryOptions options) + : KeyValuePartitioningFactory(options), null_fallback_(options.null_fallback) {} std::string type_name() const override { return "hive"; } @@ -463,7 +466,7 @@ class HivePartitioningFactory : public KeyValuePartitioningFactory { const std::vector& paths) override { for (auto path : paths) { for (auto&& segment : fs::internal::SplitAbstractPath(path)) { - if (auto key = HivePartitioning::ParseKey(segment)) { + if (auto key = HivePartitioning::ParseKey(segment, null_fallback_)) { RETURN_NOT_OK(InsertRepr(key->name, key->value)); } } @@ -491,11 +494,12 @@ class HivePartitioningFactory : public KeyValuePartitioningFactory { } private: + const std::string null_fallback_; std::vector field_names_; }; std::shared_ptr HivePartitioning::MakeFactory( - PartitioningFactoryOptions options) { + HivePartitioningFactoryOptions options) { return std::shared_ptr(new HivePartitioningFactory(options)); } diff --git a/cpp/src/arrow/dataset/partition.h b/cpp/src/arrow/dataset/partition.h index 944434e64f7..5cdf7a1df66 100644 --- a/cpp/src/arrow/dataset/partition.h +++ b/cpp/src/arrow/dataset/partition.h @@ -92,6 +92,11 @@ struct PartitioningFactoryOptions { bool infer_dictionary = false; }; +struct HivePartitioningFactoryOptions : PartitioningFactoryOptions { + /// The hive partitioning scheme maps null to a hard coded fallback string. + std::string null_fallback; +}; + /// \brief PartitioningFactory provides creation of a partitioning when the /// specific schema must be inferred from available paths (no explicit schema is known). class ARROW_DS_EXPORT PartitioningFactory { @@ -175,6 +180,8 @@ class ARROW_DS_EXPORT DirectoryPartitioning : public KeyValuePartitioning { Result FormatValues(const ScalarVector& values) const override; }; +static constexpr char kDefaultHiveNullFallback[] = "__HIVE_DEFAULT_PARTITION__"; + /// \brief Multi-level, directory based partitioning /// originating from Apache Hive with all data files stored in the /// leaf directories. Data is partitioned by static values of a @@ -188,17 +195,21 @@ class ARROW_DS_EXPORT HivePartitioning : public KeyValuePartitioning { public: // If a field in schema is of dictionary type, the corresponding element of dictionaries // must be contain the dictionary of values for that field. 
- explicit HivePartitioning(std::shared_ptr schema, ArrayVector dictionaries = {}) - : KeyValuePartitioning(std::move(schema), std::move(dictionaries)) {} + explicit HivePartitioning(std::shared_ptr schema, ArrayVector dictionaries = {}, + std::string null_fallback = kDefaultHiveNullFallback) + : KeyValuePartitioning(std::move(schema), std::move(dictionaries)), + null_fallback_(null_fallback) {} std::string type_name() const override { return "hive"; } - static util::optional ParseKey(const std::string& segment); + static util::optional ParseKey(const std::string& segment, + const std::string& null_fallback); static std::shared_ptr MakeFactory( - PartitioningFactoryOptions = {}); + HivePartitioningFactoryOptions = {}); private: + const std::string null_fallback_; std::vector ParseKeys(const std::string& path) const override; Result FormatValues(const ScalarVector& values) const override; diff --git a/cpp/src/arrow/dataset/partition_test.cc b/cpp/src/arrow/dataset/partition_test.cc index 91555f9d1fd..2558af293da 100644 --- a/cpp/src/arrow/dataset/partition_test.cc +++ b/cpp/src/arrow/dataset/partition_test.cc @@ -258,6 +258,8 @@ TEST_F(TestPartitioning, DictionaryInference) { // successful dictionary inference AssertInspect({"/a/0"}, {DictStr("alpha"), DictInt("beta")}); AssertInspect({"/a/0", "/a/1"}, {DictStr("alpha"), DictInt("beta")}); + AssertInspect({"/a/0", "/a"}, {DictStr("alpha"), DictInt("beta")}); + AssertInspect({"/0/a", "/1"}, {DictInt("alpha"), DictStr("beta")}); AssertInspect({"/a/0", "/b/0", "/a/1", "/b/1"}, {DictStr("alpha"), DictInt("beta")}); AssertInspect({"/a/-", "/b/-", "/a/_", "/b/_"}, {DictStr("alpha"), DictStr("beta")}); } @@ -320,7 +322,7 @@ TEST_F(TestPartitioning, HivePartitioning) { TEST_F(TestPartitioning, HivePartitioningFormat) { partitioning_ = std::make_shared( - schema({field("alpha", int32()), field("beta", float32())})); + schema({field("alpha", int32()), field("beta", float32())}), ArrayVector(), "xyz"); written_schema_ = partitioning_->schema(); @@ -330,9 +332,9 @@ TEST_F(TestPartitioning, HivePartitioningFormat) { AssertFormat(and_(equal(field_ref("beta"), literal(3.25f)), equal(field_ref("alpha"), literal(0))), "alpha=0/beta=3.25"); - AssertFormat(equal(field_ref("alpha"), literal(0)), "alpha=0"); - AssertFormat(equal(field_ref("beta"), literal(3.25f)), "alpha/beta=3.25"); - AssertFormat(literal(true), ""); + AssertFormat(equal(field_ref("alpha"), literal(0)), "alpha=0/beta=xyz"); + AssertFormat(equal(field_ref("beta"), literal(3.25f)), "alpha=xyz/beta=3.25"); + AssertFormat(literal(true), "alpha=xyz/beta=xyz"); ASSERT_OK_AND_ASSIGN(written_schema_, written_schema_->AddField(0, field("gamma", utf8()))); @@ -342,7 +344,7 @@ TEST_F(TestPartitioning, HivePartitioningFormat) { "alpha=0/beta=3.25"); AssertFormat(equal(field_ref("alpha"), literal(MakeNullScalar(int32()))), - "alpha=_HIVE_DEFAULT_PARTITION_"); + "alpha=xyz/beta=xyz"); // written_schema_ is incompatible with partitioning_'s schema written_schema_ = schema({field("alpha", utf8()), field("beta", utf8())}); @@ -374,8 +376,9 @@ TEST_F(TestPartitioning, DiscoverHiveSchema) { } TEST_F(TestPartitioning, HiveDictionaryInference) { - PartitioningFactoryOptions options; + HivePartitioningFactoryOptions options; options.infer_dictionary = true; + options.null_fallback = "xyz"; factory_ = HivePartitioning::MakeFactory(options); // type is still int32 if possible @@ -387,6 +390,8 @@ TEST_F(TestPartitioning, HiveDictionaryInference) { // successful dictionary inference AssertInspect({"/alpha=a/beta=0"}, 
{DictStr("alpha"), DictInt("beta")}); AssertInspect({"/alpha=a/beta=0", "/alpha=a/1"}, {DictStr("alpha"), DictInt("beta")}); + AssertInspect({"/alpha=a/beta=0", "/alpha=xyz/beta=xyz"}, + {DictStr("alpha"), DictInt("beta")}); AssertInspect( {"/alpha=a/beta=0", "/alpha=b/beta=0", "/alpha=a/beta=1", "/alpha=b/beta=1"}, {DictStr("alpha"), DictInt("beta")}); @@ -396,7 +401,7 @@ TEST_F(TestPartitioning, HiveDictionaryInference) { } TEST_F(TestPartitioning, HiveDictionaryHasUniqueValues) { - PartitioningFactoryOptions options; + HivePartitioningFactoryOptions options; options.infer_dictionary = true; factory_ = HivePartitioning::MakeFactory(options); @@ -519,7 +524,7 @@ class RangePartitioning : public Partitioning { std::vector ranges; for (auto segment : fs::internal::SplitAbstractPath(path)) { - auto key = HivePartitioning::ParseKey(segment); + auto key = HivePartitioning::ParseKey(segment, ""); if (!key) { return Status::Invalid("can't parse '", segment, "' as a range"); } diff --git a/python/pyarrow/_dataset.pyx b/python/pyarrow/_dataset.pyx index c67dbc99d77..5fa2b118ed5 100644 --- a/python/pyarrow/_dataset.pyx +++ b/python/pyarrow/_dataset.pyx @@ -1546,7 +1546,7 @@ cdef class DirectoryPartitioning(Partitioning): Returns ------- - DirectoryPartitioningFactory + PartitioningFactory To be used in the FileSystemFactoryOptions. """ cdef: @@ -1590,6 +1590,8 @@ cdef class HivePartitioning(Partitioning): corresponding entry of `dictionaries` must be an array containing every value which may be taken by the corresponding column or an error will be raised in parsing. + null_fallback : str + If any field is None then this fallback will be used as a label Returns ------- @@ -1608,13 +1610,15 @@ cdef class HivePartitioning(Partitioning): cdef: CHivePartitioning* hive_partitioning - def __init__(self, Schema schema not None, dictionaries=None): + def __init__(self, Schema schema not None, dictionaries=None, null_fallback="__HIVE_DEFAULT_PARTITION__"): cdef: shared_ptr[CHivePartitioning] c_partitioning + c_string c_null_fallback = tobytes(null_fallback) c_partitioning = make_shared[CHivePartitioning]( pyarrow_unwrap_schema(schema), - _partitioning_dictionaries(schema, dictionaries) + _partitioning_dictionaries(schema, dictionaries), + c_null_fallback ) self.init( c_partitioning) @@ -1623,7 +1627,7 @@ cdef class HivePartitioning(Partitioning): self.hive_partitioning = sp.get() @staticmethod - def discover(infer_dictionary=False, max_partition_dictionary_size=0): + def discover(infer_dictionary=False, max_partition_dictionary_size=0, null_fallback="__HIVE_DEFAULT_PARTITION__"): """ Discover a HivePartitioning. @@ -1639,6 +1643,10 @@ cdef class HivePartitioning(Partitioning): Synonymous with infer_dictionary for backwards compatibility with 1.0: setting this to -1 or None is equivalent to passing infer_dictionary=True. + null_fallback : str, default "__HIVE_DEFAULT_PARTITION__" + When inferring a schema for partition fields this value will be + replaced by null. The default is set to __HIVE_DEFAULT_PARTITION__ + for compatibility with Spark Returns ------- @@ -1646,7 +1654,7 @@ cdef class HivePartitioning(Partitioning): To be used in the FileSystemFactoryOptions. 
""" cdef: - CPartitioningFactoryOptions c_options + CHivePartitioningFactoryOptions c_options if max_partition_dictionary_size in {-1, None}: infer_dictionary = True @@ -1657,6 +1665,8 @@ cdef class HivePartitioning(Partitioning): if infer_dictionary: c_options.infer_dictionary = True + c_options.null_fallback = tobytes(null_fallback) + return PartitioningFactory.wrap( CHivePartitioning.MakeFactory(c_options)) diff --git a/python/pyarrow/includes/libarrow_dataset.pxd b/python/pyarrow/includes/libarrow_dataset.pxd index 29f9738dedc..93bc0edddc1 100644 --- a/python/pyarrow/includes/libarrow_dataset.pxd +++ b/python/pyarrow/includes/libarrow_dataset.pxd @@ -274,6 +274,11 @@ cdef extern from "arrow/dataset/api.h" namespace "arrow::dataset" nogil: "arrow::dataset::PartitioningFactoryOptions": c_bool infer_dictionary + cdef cppclass CHivePartitioningFactoryOptions \ + "arrow::dataset::HivePartitioningFactoryOptions": + c_bool infer_dictionary, + c_string null_fallback + cdef cppclass CPartitioningFactory "arrow::dataset::PartitioningFactory": pass @@ -293,7 +298,7 @@ cdef extern from "arrow/dataset/api.h" namespace "arrow::dataset" nogil: @staticmethod shared_ptr[CPartitioningFactory] MakeFactory( - CPartitioningFactoryOptions) + CHivePartitioningFactoryOptions) cdef cppclass CPartitioningOrFactory \ "arrow::dataset::PartitioningOrFactory": diff --git a/python/pyarrow/tests/test_dataset.py b/python/pyarrow/tests/test_dataset.py index 796f6d998e8..48ef421694d 100644 --- a/python/pyarrow/tests/test_dataset.py +++ b/python/pyarrow/tests/test_dataset.py @@ -21,6 +21,7 @@ import textwrap import numpy as np +from numpy.core.fromnumeric import partition import pytest import pyarrow as pa @@ -49,23 +50,25 @@ def _generate_data(n): day = datetime.datetime(2000, 1, 1) interval = datetime.timedelta(days=5) - colors = itertools.cycle(['green', 'blue', 'yellow', 'red', 'orange']) + colors = itertools.cycle(["green", "blue", "yellow", "red", "orange"]) data = [] for i in range(n): data.append((day, i, float(i), next(colors))) day += interval - return pd.DataFrame(data, columns=['date', 'index', 'value', 'color']) + return pd.DataFrame(data, columns=["date", "index", "value", "color"]) def _table_from_pandas(df): - schema = pa.schema([ - pa.field('date', pa.date32()), - pa.field('index', pa.int64()), - pa.field('value', pa.float64()), - pa.field('color', pa.string()), - ]) + schema = pa.schema( + [ + pa.field("date", pa.date32()), + pa.field("index", pa.int64()), + pa.field("value", pa.float64()), + pa.field("color", pa.string()), + ] + ) table = pa.Table.from_pandas(df, schema=schema, preserve_index=False) return table.replace_schema_metadata() @@ -78,26 +81,28 @@ def mockfs(): mockfs = fs._MockFileSystem() directories = [ - 'subdir/1/xxx', - 'subdir/2/yyy', + "subdir/1/xxx", + "subdir/2/yyy", ] for i, directory in enumerate(directories): - path = '{}/file{}.parquet'.format(directory, i) + path = "{}/file{}.parquet".format(directory, i) mockfs.create_dir(directory) with mockfs.open_output_stream(path) as out: data = [ list(range(5)), list(map(float, range(5))), list(map(str, range(5))), - [i] * 5 + [i] * 5, ] - schema = pa.schema([ - pa.field('i64', pa.int64()), - pa.field('f64', pa.float64()), - pa.field('str', pa.string()), - pa.field('const', pa.int64()), - ]) + schema = pa.schema( + [ + pa.field("i64", pa.int64()), + pa.field("f64", pa.float64()), + pa.field("str", pa.string()), + pa.field("const", pa.int64()), + ] + ) batch = pa.record_batch(data, schema=schema) table = pa.Table.from_batches([batch]) @@ 
-138,10 +143,10 @@ def assert_opens(expected_opened): return fs, assert_opens -@pytest.fixture(scope='module') +@pytest.fixture(scope="module") def multisourcefs(request): - request.config.pyarrow.requires('pandas') - request.config.pyarrow.requires('parquet') + request.config.pyarrow.requires("pandas") + request.config.pyarrow.requires("parquet") import pyarrow.parquet as pq df = _generate_data(1000) @@ -153,35 +158,35 @@ def multisourcefs(request): # create a directory containing a flat sequence of parquet files without # any partitioning involved - mockfs.create_dir('plain') + mockfs.create_dir("plain") for i, chunk in enumerate(np.array_split(df_a, 10)): - path = 'plain/chunk-{}.parquet'.format(i) + path = "plain/chunk-{}.parquet".format(i) with mockfs.open_output_stream(path) as out: pq.write_table(_table_from_pandas(chunk), out) # create one with schema partitioning by weekday and color - mockfs.create_dir('schema') + mockfs.create_dir("schema") for part, chunk in df_b.groupby([df_b.date.dt.dayofweek, df_b.color]): - folder = 'schema/{}/{}'.format(*part) - path = '{}/chunk.parquet'.format(folder) + folder = "schema/{}/{}".format(*part) + path = "{}/chunk.parquet".format(folder) mockfs.create_dir(folder) with mockfs.open_output_stream(path) as out: pq.write_table(_table_from_pandas(chunk), out) # create one with hive partitioning by year and month - mockfs.create_dir('hive') + mockfs.create_dir("hive") for part, chunk in df_c.groupby([df_c.date.dt.year, df_c.date.dt.month]): - folder = 'hive/year={}/month={}'.format(*part) - path = '{}/chunk.parquet'.format(folder) + folder = "hive/year={}/month={}".format(*part) + path = "{}/chunk.parquet".format(folder) mockfs.create_dir(folder) with mockfs.open_output_stream(path) as out: pq.write_table(_table_from_pandas(chunk), out) # create one with hive partitioning by color - mockfs.create_dir('hive_color') + mockfs.create_dir("hive_color") for part, chunk in df_d.groupby(["color"]): - folder = 'hive_color/color={}'.format(*part) - path = '{}/chunk.parquet'.format(folder) + folder = "hive_color/color={}".format(*part) + path = "{}/chunk.parquet".format(folder) mockfs.create_dir(folder) with mockfs.open_output_stream(path) as out: pq.write_table(_table_from_pandas(chunk), out) @@ -192,36 +197,40 @@ def multisourcefs(request): @pytest.fixture def dataset(mockfs): format = ds.ParquetFileFormat() - selector = fs.FileSelector('subdir', recursive=True) - options = ds.FileSystemFactoryOptions('subdir') + selector = fs.FileSelector("subdir", recursive=True) + options = ds.FileSystemFactoryOptions("subdir") options.partitioning = ds.DirectoryPartitioning( - pa.schema([ - pa.field('group', pa.int32()), - pa.field('key', pa.string()) - ]) + pa.schema([pa.field("group", pa.int32()), pa.field("key", pa.string())]) ) factory = ds.FileSystemDatasetFactory(mockfs, selector, format, options) return factory.finish() def test_filesystem_dataset(mockfs): - schema = pa.schema([ - pa.field('const', pa.int64()) - ]) + schema = pa.schema([pa.field("const", pa.int64())]) file_format = ds.ParquetFileFormat() - paths = ['subdir/1/xxx/file0.parquet', 'subdir/2/yyy/file1.parquet'] - partitions = [ds.field('part') == x for x in range(1, 3)] - fragments = [file_format.make_fragment(path, mockfs, part) - for path, part in zip(paths, partitions)] - root_partition = ds.field('level') == ds.scalar(1337) + paths = ["subdir/1/xxx/file0.parquet", "subdir/2/yyy/file1.parquet"] + partitions = [ds.field("part") == x for x in range(1, 3)] + fragments = [ + 
file_format.make_fragment(path, mockfs, part) + for path, part in zip(paths, partitions) + ] + root_partition = ds.field("level") == ds.scalar(1337) dataset_from_fragments = ds.FileSystemDataset( - fragments, schema=schema, format=file_format, - filesystem=mockfs, root_partition=root_partition, + fragments, + schema=schema, + format=file_format, + filesystem=mockfs, + root_partition=root_partition, ) dataset_from_paths = ds.FileSystemDataset.from_paths( - paths, schema=schema, format=file_format, filesystem=mockfs, - partitions=partitions, root_partition=root_partition, + paths, + schema=schema, + format=file_format, + filesystem=mockfs, + partitions=partitions, + root_partition=root_partition, ) for dataset in [dataset_from_fragments, dataset_from_paths]: @@ -268,8 +277,9 @@ def test_filesystem_dataset(mockfs): ds.FileSystemDataset(fragments, file_format, schema) # validation of root_partition with pytest.raises(TypeError, match="incorrect type"): - ds.FileSystemDataset(fragments, schema=schema, - format=file_format, root_partition=1) + ds.FileSystemDataset( + fragments, schema=schema, format=file_format, root_partition=1 + ) # missing required argument in from_paths with pytest.raises(TypeError, match="incorrect type"): ds.FileSystemDataset.from_paths(fragments, format=file_format) @@ -277,15 +287,15 @@ def test_filesystem_dataset(mockfs): def test_filesystem_dataset_no_filesystem_interaction(): # ARROW-8283 - schema = pa.schema([ - pa.field('f1', pa.int64()) - ]) + schema = pa.schema([pa.field("f1", pa.int64())]) file_format = ds.IpcFileFormat() - paths = ['nonexistingfile.arrow'] + paths = ["nonexistingfile.arrow"] # creating the dataset itself doesn't raise dataset = ds.FileSystemDataset.from_paths( - paths, schema=schema, format=file_format, + paths, + schema=schema, + format=file_format, filesystem=fs.LocalFileSystem(), ) @@ -317,27 +327,27 @@ def test_dataset(dataset): assert isinstance(table, pa.Table) assert len(table) == 10 - condition = ds.field('i64') == 1 + condition = ds.field("i64") == 1 result = dataset.to_table(use_threads=True, filter=condition).to_pydict() # don't rely on the scanning order - assert result['i64'] == [1, 1] - assert result['f64'] == [1., 1.] 
- assert sorted(result['group']) == [1, 2] - assert sorted(result['key']) == ['xxx', 'yyy'] + assert result["i64"] == [1, 1] + assert result["f64"] == [1.0, 1.0] + assert sorted(result["group"]) == [1, 2] + assert sorted(result["key"]) == ["xxx", "yyy"] def test_scanner(dataset): - scanner = ds.Scanner.from_dataset(dataset, - memory_pool=pa.default_memory_pool()) + scanner = ds.Scanner.from_dataset(dataset, memory_pool=pa.default_memory_pool()) assert isinstance(scanner, ds.Scanner) assert len(list(scanner.scan())) == 2 with pytest.raises(pa.ArrowInvalid): - ds.Scanner.from_dataset(dataset, columns=['unknown']) + ds.Scanner.from_dataset(dataset, columns=["unknown"]) - scanner = ds.Scanner.from_dataset(dataset, columns=['i64'], - memory_pool=pa.default_memory_pool()) + scanner = ds.Scanner.from_dataset( + dataset, columns=["i64"], memory_pool=pa.default_memory_pool() + ) assert isinstance(scanner, ds.Scanner) assert len(list(scanner.scan())) == 2 @@ -358,46 +368,45 @@ def test_abstract_classes(): def test_partitioning(): - schema = pa.schema([ - pa.field('i64', pa.int64()), - pa.field('f64', pa.float64()) - ]) + schema = pa.schema([pa.field("i64", pa.int64()), pa.field("f64", pa.float64())]) for klass in [ds.DirectoryPartitioning, ds.HivePartitioning]: partitioning = klass(schema) assert isinstance(partitioning, ds.Partitioning) partitioning = ds.DirectoryPartitioning( - pa.schema([ - pa.field('group', pa.int64()), - pa.field('key', pa.float64()) - ]) + pa.schema([pa.field("group", pa.int64()), pa.field("key", pa.float64())]) ) - expr = partitioning.parse('/3/3.14') + expr = partitioning.parse("/3/3.14") assert isinstance(expr, ds.Expression) - expected = (ds.field('group') == 3) & (ds.field('key') == 3.14) + expected = (ds.field("group") == 3) & (ds.field("key") == 3.14) assert expr.equals(expected) with pytest.raises(pa.ArrowInvalid): - partitioning.parse('/prefix/3/aaa') + partitioning.parse("/prefix/3/aaa") partitioning = ds.HivePartitioning( - pa.schema([ - pa.field('alpha', pa.int64()), - pa.field('beta', pa.int64()) - ]) - ) - expr = partitioning.parse('/alpha=0/beta=3') - expected = ( - (ds.field('alpha') == ds.scalar(0)) & - (ds.field('beta') == ds.scalar(3)) + pa.schema([pa.field("alpha", pa.int64()), pa.field("beta", pa.int64())]) ) + expr = partitioning.parse("/alpha=0/beta=3") + expected = (ds.field("alpha") == ds.scalar(0)) & (ds.field("beta") == ds.scalar(3)) assert expr.equals(expected) - for shouldfail in ['/alpha=one/beta=2', '/alpha=one', '/beta=two']: + for shouldfail in ["/alpha=one/beta=2", "/alpha=one", "/beta=two"]: with pytest.raises(pa.ArrowInvalid): partitioning.parse(shouldfail) + partitioning = ds.HivePartitioning( + pa.schema([pa.field("alpha", pa.int64()), pa.field("beta", pa.int64())]), + None, + "xyz", + ) + expr = partitioning.parse("/alpha=xyz/beta=3") + expected = (ds.field("alpha") == ds.scalar(None)) & ( + ds.field("beta") == ds.scalar(3) + ) + assert expr.equals(expected) + def test_expression_serialization(): a = ds.scalar(1) @@ -405,14 +414,30 @@ def test_expression_serialization(): c = ds.scalar(True) d = ds.scalar("string") e = ds.scalar(None) - f = ds.scalar({'a': 1}) + f = ds.scalar({"a": 1}) g = ds.scalar(pa.scalar(1)) - all_exprs = [a, b, c, d, e, f, g, a == b, a > b, a & b, a | b, ~c, - d.is_valid(), a.cast(pa.int32(), safe=False), - a.cast(pa.int32(), safe=False), a.isin([1, 2, 3]), - ds.field('i64') > 5, ds.field('i64') == 5, - ds.field('i64') == 7] + all_exprs = [ + a, + b, + c, + d, + e, + f, + g, + a == b, + a > b, + a & b, + a | b, + ~c, 
+ d.is_valid(), + a.cast(pa.int32(), safe=False), + a.cast(pa.int32(), safe=False), + a.isin([1, 2, 3]), + ds.field("i64") > 5, + ds.field("i64") == 5, + ds.field("i64") == 7, + ] for expr in all_exprs: assert isinstance(expr, ds.Expression) restored = pickle.loads(pickle.dumps(expr)) @@ -460,13 +485,13 @@ def test_expression_boolean_operators(): def test_partition_keys(): - a, b, c = [ds.field(f) == f for f in 'abc'] - assert ds._get_partition_keys(a) == {'a': 'a'} - assert ds._get_partition_keys(a & b & c) == {f: f for f in 'abc'} + a, b, c = [ds.field(f) == f for f in "abc"] + assert ds._get_partition_keys(a) == {"a": "a"} + assert ds._get_partition_keys(a & b & c) == {f: f for f in "abc"} - nope = ds.field('d') >= 3 + nope = ds.field("d") >= 3 assert ds._get_partition_keys(nope) == {} - assert ds._get_partition_keys(a & nope) == {'a': 'a'} + assert ds._get_partition_keys(a & nope) == {"a": "a"} def test_parquet_read_options(): @@ -508,25 +533,24 @@ def test_file_format_pickling(): formats = [ ds.IpcFileFormat(), ds.CsvFileFormat(), - ds.CsvFileFormat(pa.csv.ParseOptions(delimiter='\t', - ignore_empty_lines=True)), + ds.CsvFileFormat(pa.csv.ParseOptions(delimiter="\t", ignore_empty_lines=True)), ds.ParquetFileFormat(), ds.ParquetFileFormat( read_options=ds.ParquetReadOptions(use_buffered_stream=True) ), ds.ParquetFileFormat( read_options={ - 'use_buffered_stream': True, - 'buffer_size': 4096, + "use_buffered_stream": True, + "buffer_size": 4096, } - ) + ), ] for file_format in formats: assert pickle.loads(pickle.dumps(file_format)) == file_format -@pytest.mark.parametrize('paths_or_selector', [ - fs.FileSelector('subdir', recursive=True), +@pytest.mark.parametrize( + "paths_or_selector", [ 'subdir/1/xxx/file0.parquet', 'subdir/2/yyy/file1.parquet', @@ -539,34 +563,33 @@ def test_filesystem_factory(mockfs, paths_or_selector, pre_buffer): pre_buffer=pre_buffer) ) - options = ds.FileSystemFactoryOptions('subdir') + options = ds.FileSystemFactoryOptions("subdir") options.partitioning = ds.DirectoryPartitioning( - pa.schema([ - pa.field('group', pa.int32()), - pa.field('key', pa.string()) - ]) + pa.schema([pa.field("group", pa.int32()), pa.field("key", pa.string())]) ) - assert options.partition_base_dir == 'subdir' - assert options.selector_ignore_prefixes == ['.', '_'] + assert options.partition_base_dir == "subdir" + assert options.selector_ignore_prefixes == [".", "_"] assert options.exclude_invalid_files is False - factory = ds.FileSystemDatasetFactory( - mockfs, paths_or_selector, format, options - ) + factory = ds.FileSystemDatasetFactory(mockfs, paths_or_selector, format, options) inspected_schema = factory.inspect() - assert factory.inspect().equals(pa.schema([ - pa.field('i64', pa.int64()), - pa.field('f64', pa.float64()), - pa.field('str', pa.dictionary(pa.int32(), pa.string())), - pa.field('const', pa.int64()), - pa.field('group', pa.int32()), - pa.field('key', pa.string()), - ]), check_metadata=False) + assert factory.inspect().equals( + pa.schema( + [ + pa.field("i64", pa.int64()), + pa.field("f64", pa.float64()), + pa.field("str", pa.dictionary(pa.int32(), pa.string())), + pa.field("const", pa.int64()), + pa.field("group", pa.int32()), + pa.field("key", pa.string()), + ] + ), + check_metadata=False, + ) assert isinstance(factory.inspect_schemas(), list) - assert isinstance(factory.finish(inspected_schema), - ds.FileSystemDataset) + assert isinstance(factory.finish(inspected_schema), ds.FileSystemDataset) assert factory.root_partition.equals(ds.scalar(True)) dataset = 
factory.finish() @@ -578,9 +601,9 @@ def test_filesystem_factory(mockfs, paths_or_selector, pre_buffer): expected_f64 = pa.array([0, 1, 2, 3, 4], type=pa.float64()) expected_str = pa.DictionaryArray.from_arrays( pa.array([0, 1, 2, 3, 4], type=pa.int32()), - pa.array("0 1 2 3 4".split(), type=pa.string()) + pa.array("0 1 2 3 4".split(), type=pa.string()), ) - for task, group, key in zip(scanner.scan(), [1, 2], ['xxx', 'yyy']): + for task, group, key in zip(scanner.scan(), [1, 2], ["xxx", "yyy"]): expected_group = pa.array([group] * 5, type=pa.int32()) expected_key = pa.array([key] * 5, type=pa.string()) expected_const = pa.array([group - 1] * 5, type=pa.int64()) @@ -601,15 +624,15 @@ def test_filesystem_factory(mockfs, paths_or_selector, pre_buffer): def test_make_fragment(multisourcefs): parquet_format = ds.ParquetFileFormat() - dataset = ds.dataset('/plain', filesystem=multisourcefs, - format=parquet_format) + dataset = ds.dataset("/plain", filesystem=multisourcefs, format=parquet_format) for path in dataset.files: fragment = parquet_format.make_fragment(path, multisourcefs) assert fragment.row_groups == [0] - row_group_fragment = parquet_format.make_fragment(path, multisourcefs, - row_groups=[0]) + row_group_fragment = parquet_format.make_fragment( + path, multisourcefs, row_groups=[0] + ) for f in [fragment, row_group_fragment]: assert isinstance(f, ds.ParquetFileFragment) assert f.path == path @@ -618,21 +641,23 @@ def test_make_fragment(multisourcefs): def test_make_csv_fragment_from_buffer(): - content = textwrap.dedent(""" + content = textwrap.dedent( + """ alpha,num,animal a,12,dog b,11,cat c,10,rabbit - """) - buffer = pa.py_buffer(content.encode('utf-8')) + """ + ) + buffer = pa.py_buffer(content.encode("utf-8")) csv_format = ds.CsvFileFormat() fragment = csv_format.make_fragment(buffer) - expected = pa.table([['a', 'b', 'c'], - [12, 11, 10], - ['dog', 'cat', 'rabbit']], - names=['alpha', 'num', 'animal']) + expected = pa.table( + [["a", "b", "c"], [12, 11, 10], ["dog", "cat", "rabbit"]], + names=["alpha", "num", "animal"], + ) assert fragment.to_table().equals(expected) pickled = pickle.loads(pickle.dumps(fragment)) @@ -644,29 +669,26 @@ def test_make_parquet_fragment_from_buffer(): import pyarrow.parquet as pq arrays = [ - pa.array(['a', 'b', 'c']), + pa.array(["a", "b", "c"]), pa.array([12, 11, 10]), - pa.array(['dog', 'cat', 'rabbit']) + pa.array(["dog", "cat", "rabbit"]), ] dictionary_arrays = [ arrays[0].dictionary_encode(), arrays[1], - arrays[2].dictionary_encode() + arrays[2].dictionary_encode(), ] dictionary_format = ds.ParquetFileFormat( read_options=ds.ParquetReadOptions( use_buffered_stream=True, buffer_size=4096, - dictionary_columns=['alpha', 'animal'] + dictionary_columns=["alpha", "animal"], ) ) - cases = [ - (arrays, ds.ParquetFileFormat()), - (dictionary_arrays, dictionary_format) - ] + cases = [(arrays, ds.ParquetFileFormat()), (dictionary_arrays, dictionary_format)] for arrays, format_ in cases: - table = pa.table(arrays, names=['alpha', 'num', 'animal']) + table = pa.table(arrays, names=["alpha", "num", "animal"]) out = pa.BufferOutputStream() pq.write_table(table, out) @@ -683,15 +705,13 @@ def _create_dataset_for_fragments(tempdir, chunk_size=None, filesystem=None): import pyarrow.parquet as pq table = pa.table( - [range(8), [1] * 8, ['a'] * 4 + ['b'] * 4], - names=['f1', 'f2', 'part'] + [range(8), [1] * 8, ["a"] * 4 + ["b"] * 4], names=["f1", "f2", "part"] ) path = str(tempdir / "test_parquet_dataset") # write_to_dataset currently requires pandas - 
pq.write_to_dataset(table, path, - partition_cols=["part"], chunk_size=chunk_size) + pq.write_to_dataset(table, path, partition_cols=["part"], chunk_size=chunk_size) dataset = ds.dataset( path, format="parquet", partitioning="hive", filesystem=filesystem ) @@ -709,11 +729,11 @@ def test_fragments(tempdir): assert len(fragments) == 2 f = fragments[0] - physical_names = ['f1', 'f2'] + physical_names = ["f1", "f2"] # file's schema does not include partition column assert f.physical_schema.names == physical_names assert f.format.inspect(f.path, f.filesystem) == f.physical_schema - assert f.partition_expression.equals(ds.field('part') == 'a') + assert f.partition_expression.equals(ds.field("part") == "a") # By default, the partition column is not part of the schema. result = f.to_table() @@ -723,13 +743,13 @@ def test_fragments(tempdir): # scanning fragment includes partition columns when given the proper # schema. result = f.to_table(schema=dataset.schema) - assert result.column_names == ['f1', 'f2', 'part'] + assert result.column_names == ["f1", "f2", "part"] assert result.equals(table.slice(0, 4)) assert f.physical_schema == result.schema.remove(2) # scanning fragments follow filter predicate - result = f.to_table(schema=dataset.schema, filter=ds.field('f1') < 2) - assert result.column_names == ['f1', 'f2', 'part'] + result = f.to_table(schema=dataset.schema, filter=ds.field("f1") < 2) + assert result.column_names == ["f1", "f2", "part"] @pytest.mark.pandas @@ -738,11 +758,11 @@ def test_fragments_implicit_cast(tempdir): # ARROW-8693 import pyarrow.parquet as pq - table = pa.table([range(8), [1] * 4 + [2] * 4], names=['col', 'part']) + table = pa.table([range(8), [1] * 4 + [2] * 4], names=["col", "part"]) path = str(tempdir / "test_parquet_dataset") pq.write_to_dataset(table, path, partition_cols=["part"]) - part = ds.partitioning(pa.schema([('part', 'int8')]), flavor="hive") + part = ds.partitioning(pa.schema([("part", "int8")]), flavor="hive") dataset = ds.dataset(path, format="parquet", partitioning=part) fragments = dataset.get_fragments(filter=ds.field("part") >= 2) assert len(list(fragments)) == 1 @@ -753,10 +773,8 @@ def test_fragments_implicit_cast(tempdir): def test_fragments_reconstruct(tempdir): table, dataset = _create_dataset_for_fragments(tempdir) - def assert_yields_projected(fragment, row_slice, - columns=None, filter=None): - actual = fragment.to_table( - schema=table.schema, columns=columns, filter=filter) + def assert_yields_projected(fragment, row_slice, columns=None, filter=None): + actual = fragment.to_table(schema=table.schema, columns=columns, filter=filter) column_names = columns if columns else table.column_names assert actual.column_names == column_names @@ -772,40 +790,52 @@ def assert_yields_projected(fragment, row_slice, # manually re-construct a fragment, with explicit schema new_fragment = parquet_format.make_fragment( - fragment.path, fragment.filesystem, - partition_expression=fragment.partition_expression) + fragment.path, + fragment.filesystem, + partition_expression=fragment.partition_expression, + ) assert new_fragment.to_table().equals(fragment.to_table()) assert_yields_projected(new_fragment, (0, 4)) # filter / column projection, inspected schema new_fragment = parquet_format.make_fragment( - fragment.path, fragment.filesystem, - partition_expression=fragment.partition_expression) - assert_yields_projected(new_fragment, (0, 2), filter=ds.field('f1') < 2) + fragment.path, + fragment.filesystem, + partition_expression=fragment.partition_expression, + ) + 
assert_yields_projected(new_fragment, (0, 2), filter=ds.field("f1") < 2) # filter requiring cast / column projection, inspected schema new_fragment = parquet_format.make_fragment( - fragment.path, fragment.filesystem, - partition_expression=fragment.partition_expression) - assert_yields_projected(new_fragment, (0, 2), - columns=['f1'], filter=ds.field('f1') < 2.0) + fragment.path, + fragment.filesystem, + partition_expression=fragment.partition_expression, + ) + assert_yields_projected( + new_fragment, (0, 2), columns=["f1"], filter=ds.field("f1") < 2.0 + ) # filter on the partition column new_fragment = parquet_format.make_fragment( - fragment.path, fragment.filesystem, - partition_expression=fragment.partition_expression) - assert_yields_projected(new_fragment, (0, 4), - filter=ds.field('part') == 'a') + fragment.path, + fragment.filesystem, + partition_expression=fragment.partition_expression, + ) + assert_yields_projected(new_fragment, (0, 4), filter=ds.field("part") == "a") # Fragments don't contain the partition's columns if not provided to the # `to_table(schema=...)` method. - pattern = (r'No match for FieldRef.Name\(part\) in ' + - fragment.physical_schema.to_string(False, False, False)) + pattern = ( + r"No match for FieldRef.Name\(part\) in " + + fragment.physical_schema.to_string(False, False, False) + ) with pytest.raises(ValueError, match=pattern): new_fragment = parquet_format.make_fragment( - fragment.path, fragment.filesystem, - partition_expression=fragment.partition_expression) - new_fragment.to_table(filter=ds.field('part') == 'a') + fragment.path, + fragment.filesystem, + partition_expression=fragment.partition_expression, + ) + new_fragment.to_table(filter=ds.field("part") == "a") @pytest.mark.pandas @@ -819,21 +849,21 @@ def test_fragments_parquet_row_groups(tempdir): row_group_fragments = list(fragment.split_by_row_group()) assert len(row_group_fragments) == fragment.num_row_groups == 2 result = row_group_fragments[0].to_table(schema=dataset.schema) - assert result.column_names == ['f1', 'f2', 'part'] + assert result.column_names == ["f1", "f2", "part"] assert len(result) == 2 assert result.equals(table.slice(0, 2)) assert row_group_fragments[0].row_groups is not None assert row_group_fragments[0].num_row_groups == 1 assert row_group_fragments[0].row_groups[0].statistics == { - 'f1': {'min': 0, 'max': 1}, - 'f2': {'min': 1, 'max': 1}, + "f1": {"min": 0, "max": 1}, + "f2": {"min": 1, "max": 1}, } - fragment = list(dataset.get_fragments(filter=ds.field('f1') < 1))[0] - row_group_fragments = list(fragment.split_by_row_group(ds.field('f1') < 1)) + fragment = list(dataset.get_fragments(filter=ds.field("f1") < 1))[0] + row_group_fragments = list(fragment.split_by_row_group(ds.field("f1") < 1)) assert len(row_group_fragments) == 1 - result = row_group_fragments[0].to_table(filter=ds.field('f1') < 1) + result = row_group_fragments[0].to_table(filter=ds.field("f1") < 1) assert len(result) == 1 @@ -841,15 +871,15 @@ def test_fragments_parquet_row_groups(tempdir): def test_fragments_parquet_num_row_groups(tempdir): import pyarrow.parquet as pq - table = pa.table({'a': range(8)}) + table = pa.table({"a": range(8)}) pq.write_table(table, tempdir / "test.parquet", row_group_size=2) dataset = ds.dataset(tempdir / "test.parquet", format="parquet") original_fragment = list(dataset.get_fragments())[0] # create fragment with subset of row groups fragment = original_fragment.format.make_fragment( - original_fragment.path, original_fragment.filesystem, - row_groups=[1, 3]) + 
original_fragment.path, original_fragment.filesystem, row_groups=[1, 3] + ) assert fragment.num_row_groups == 2 # ensure that parsing metadata preserves correct number of row groups fragment.ensure_complete_metadata() @@ -862,14 +892,16 @@ def test_fragments_parquet_num_row_groups(tempdir): def test_fragments_parquet_row_groups_dictionary(tempdir): import pandas as pd - df = pd.DataFrame(dict(col1=['a', 'b'], col2=[1, 2])) - df['col1'] = df['col1'].astype("category") + df = pd.DataFrame(dict(col1=["a", "b"], col2=[1, 2])) + df["col1"] = df["col1"].astype("category") import pyarrow.parquet as pq + pq.write_table(pa.table(df), tempdir / "test_filter_dictionary.parquet") import pyarrow.dataset as ds - dataset = ds.dataset(tempdir / 'test_filter_dictionary.parquet') + + dataset = ds.dataset(tempdir / "test_filter_dictionary.parquet") result = dataset.to_table(filter=ds.field("col1") == "a") assert (df.iloc[0] == result.to_pandas()).all().all() @@ -879,9 +911,7 @@ def test_fragments_parquet_row_groups_dictionary(tempdir): @pytest.mark.parquet def test_fragments_parquet_ensure_metadata(tempdir, open_logging_fs): fs, assert_opens = open_logging_fs - _, dataset = _create_dataset_for_fragments( - tempdir, chunk_size=2, filesystem=fs - ) + _, dataset = _create_dataset_for_fragments(tempdir, chunk_size=2, filesystem=fs) fragment = list(dataset.get_fragments())[0] # with default discovery, no metadata loaded @@ -931,38 +961,38 @@ def _create_dataset_all_types(tempdir, chunk_size=None): pa.array([1, 10, 42], pa.uint64()), pa.array([1.0, 10.0, 42.0], pa.float32()), pa.array([1.0, 10.0, 42.0], pa.float64()), - pa.array(['a', None, 'z'], pa.utf8()), - pa.array(['a', None, 'z'], pa.binary()), - pa.array([1, 10, 42], pa.timestamp('s')), - pa.array([1, 10, 42], pa.timestamp('ms')), - pa.array([1, 10, 42], pa.timestamp('us')), + pa.array(["a", None, "z"], pa.utf8()), + pa.array(["a", None, "z"], pa.binary()), + pa.array([1, 10, 42], pa.timestamp("s")), + pa.array([1, 10, 42], pa.timestamp("ms")), + pa.array([1, 10, 42], pa.timestamp("us")), pa.array([1, 10, 42], pa.date32()), pa.array([1, 10, 4200000000], pa.date64()), - pa.array([1, 10, 42], pa.time32('s')), - pa.array([1, 10, 42], pa.time64('us')), + pa.array([1, 10, 42], pa.time32("s")), + pa.array([1, 10, 42], pa.time64("us")), ], names=[ - 'boolean', - 'int8', - 'uint8', - 'int16', - 'uint16', - 'int32', - 'uint32', - 'int64', - 'uint64', - 'float', - 'double', - 'utf8', - 'binary', - 'ts[s]', - 'ts[ms]', - 'ts[us]', - 'date32', - 'date64', - 'time32', - 'time64', - ] + "boolean", + "int8", + "uint8", + "int16", + "uint16", + "int32", + "uint32", + "int64", + "uint64", + "float", + "double", + "utf8", + "binary", + "ts[s]", + "ts[ms]", + "ts[us]", + "date32", + "date64", + "time32", + "time64", + ], ) path = str(tempdir / "test_parquet_dataset_all_types") @@ -981,9 +1011,16 @@ def test_parquet_fragment_statistics(tempdir): fragment = list(dataset.get_fragments())[0] import datetime - def dt_s(x): return datetime.datetime(1970, 1, 1, 0, 0, x) - def dt_ms(x): return datetime.datetime(1970, 1, 1, 0, 0, 0, x*1000) - def dt_us(x): return datetime.datetime(1970, 1, 1, 0, 0, 0, x) + + def dt_s(x): + return datetime.datetime(1970, 1, 1, 0, 0, x) + + def dt_ms(x): + return datetime.datetime(1970, 1, 1, 0, 0, 0, x * 1000) + + def dt_us(x): + return datetime.datetime(1970, 1, 1, 0, 0, 0, x) + date = datetime.date time = datetime.time @@ -994,26 +1031,26 @@ def dt_us(x): return datetime.datetime(1970, 1, 1, 0, 0, 0, x) assert row_group.num_rows == 3 assert 
row_group.total_byte_size > 1000 assert row_group.statistics == { - 'boolean': {'min': False, 'max': True}, - 'int8': {'min': 1, 'max': 42}, - 'uint8': {'min': 1, 'max': 42}, - 'int16': {'min': 1, 'max': 42}, - 'uint16': {'min': 1, 'max': 42}, - 'int32': {'min': 1, 'max': 42}, - 'uint32': {'min': 1, 'max': 42}, - 'int64': {'min': 1, 'max': 42}, - 'uint64': {'min': 1, 'max': 42}, - 'float': {'min': 1.0, 'max': 42.0}, - 'double': {'min': 1.0, 'max': 42.0}, - 'utf8': {'min': 'a', 'max': 'z'}, - 'binary': {'min': b'a', 'max': b'z'}, - 'ts[s]': {'min': dt_s(1), 'max': dt_s(42)}, - 'ts[ms]': {'min': dt_ms(1), 'max': dt_ms(42)}, - 'ts[us]': {'min': dt_us(1), 'max': dt_us(42)}, - 'date32': {'min': date(1970, 1, 2), 'max': date(1970, 2, 12)}, - 'date64': {'min': date(1970, 1, 1), 'max': date(1970, 2, 18)}, - 'time32': {'min': time(0, 0, 1), 'max': time(0, 0, 42)}, - 'time64': {'min': time(0, 0, 0, 1), 'max': time(0, 0, 0, 42)}, + "boolean": {"min": False, "max": True}, + "int8": {"min": 1, "max": 42}, + "uint8": {"min": 1, "max": 42}, + "int16": {"min": 1, "max": 42}, + "uint16": {"min": 1, "max": 42}, + "int32": {"min": 1, "max": 42}, + "uint32": {"min": 1, "max": 42}, + "int64": {"min": 1, "max": 42}, + "uint64": {"min": 1, "max": 42}, + "float": {"min": 1.0, "max": 42.0}, + "double": {"min": 1.0, "max": 42.0}, + "utf8": {"min": "a", "max": "z"}, + "binary": {"min": b"a", "max": b"z"}, + "ts[s]": {"min": dt_s(1), "max": dt_s(42)}, + "ts[ms]": {"min": dt_ms(1), "max": dt_ms(42)}, + "ts[us]": {"min": dt_us(1), "max": dt_us(42)}, + "date32": {"min": date(1970, 1, 2), "max": date(1970, 2, 12)}, + "date64": {"min": date(1970, 1, 1), "max": date(1970, 2, 18)}, + "time32": {"min": time(0, 0, 1), "max": time(0, 0, 42)}, + "time64": {"min": time(0, 0, 0, 1), "max": time(0, 0, 0, 42)}, } @@ -1021,7 +1058,7 @@ def dt_us(x): return datetime.datetime(1970, 1, 1, 0, 0, 0, x) def test_parquet_fragment_statistics_nulls(tempdir): import pyarrow.parquet as pq - table = pa.table({'a': [0, 1, None, None], 'b': ['a', 'b', None, None]}) + table = pa.table({"a": [0, 1, None, None], "b": ["a", "b", None, None]}) pq.write_table(table, tempdir / "test.parquet", row_group_size=2) dataset = ds.dataset(tempdir / "test.parquet", format="parquet") @@ -1048,21 +1085,25 @@ def test_fragments_parquet_row_groups_predicate(tempdir): table, dataset = _create_dataset_for_fragments(tempdir, chunk_size=2) fragment = list(dataset.get_fragments())[0] - assert fragment.partition_expression.equals(ds.field('part') == 'a') + assert fragment.partition_expression.equals(ds.field("part") == "a") # predicate may reference a partition field not present in the # physical_schema if an explicit schema is provided to split_by_row_group # filter matches partition_expression: all row groups row_group_fragments = list( - fragment.split_by_row_group(filter=ds.field('part') == 'a', - schema=dataset.schema)) + fragment.split_by_row_group( + filter=ds.field("part") == "a", schema=dataset.schema + ) + ) assert len(row_group_fragments) == 2 # filter contradicts partition_expression: no row groups row_group_fragments = list( - fragment.split_by_row_group(filter=ds.field('part') == 'b', - schema=dataset.schema)) + fragment.split_by_row_group( + filter=ds.field("part") == "b", schema=dataset.schema + ) + ) assert len(row_group_fragments) == 0 @@ -1081,27 +1122,36 @@ def test_fragments_parquet_row_groups_reconstruct(tempdir): # manually re-construct row group fragments new_fragment = parquet_format.make_fragment( - fragment.path, fragment.filesystem, + 
fragment.path, + fragment.filesystem, partition_expression=fragment.partition_expression, - row_groups=[0]) + row_groups=[0], + ) result = new_fragment.to_table() assert result.equals(row_group_fragments[0].to_table()) # manually re-construct a row group fragment with filter/column projection new_fragment = parquet_format.make_fragment( - fragment.path, fragment.filesystem, + fragment.path, + fragment.filesystem, partition_expression=fragment.partition_expression, - row_groups={1}) - result = new_fragment.to_table(schema=table.schema, columns=['f1', 'part'], - filter=ds.field('f1') < 3, ) - assert result.column_names == ['f1', 'part'] + row_groups={1}, + ) + result = new_fragment.to_table( + schema=table.schema, + columns=["f1", "part"], + filter=ds.field("f1") < 3, + ) + assert result.column_names == ["f1", "part"] assert len(result) == 1 # out of bounds row group index new_fragment = parquet_format.make_fragment( - fragment.path, fragment.filesystem, + fragment.path, + fragment.filesystem, partition_expression=fragment.partition_expression, - row_groups={2}) + row_groups={2}, + ) with pytest.raises(IndexError, match="references row group 2"): new_fragment.to_table() @@ -1110,8 +1160,7 @@ def test_fragments_parquet_row_groups_reconstruct(tempdir): @pytest.mark.parquet def test_fragments_parquet_subset_ids(tempdir, open_logging_fs): fs, assert_opens = open_logging_fs - table, dataset = _create_dataset_for_fragments(tempdir, chunk_size=1, - filesystem=fs) + table, dataset = _create_dataset_for_fragments(tempdir, chunk_size=1, filesystem=fs) fragment = list(dataset.get_fragments())[0] # select with row group ids @@ -1138,8 +1187,7 @@ def test_fragments_parquet_subset_ids(tempdir, open_logging_fs): @pytest.mark.parquet def test_fragments_parquet_subset_filter(tempdir, open_logging_fs): fs, assert_opens = open_logging_fs - table, dataset = _create_dataset_for_fragments(tempdir, chunk_size=1, - filesystem=fs) + table, dataset = _create_dataset_for_fragments(tempdir, chunk_size=1, filesystem=fs) fragment = list(dataset.get_fragments())[0] # select with filter @@ -1181,62 +1229,62 @@ def test_fragments_parquet_subset_invalid(tempdir): def test_partitioning_factory(mockfs): - paths_or_selector = fs.FileSelector('subdir', recursive=True) + paths_or_selector = fs.FileSelector("subdir", recursive=True) format = ds.ParquetFileFormat() - options = ds.FileSystemFactoryOptions('subdir') - partitioning_factory = ds.DirectoryPartitioning.discover(['group', 'key']) + options = ds.FileSystemFactoryOptions("subdir") + partitioning_factory = ds.DirectoryPartitioning.discover(["group", "key"]) assert isinstance(partitioning_factory, ds.PartitioningFactory) options.partitioning_factory = partitioning_factory - factory = ds.FileSystemDatasetFactory( - mockfs, paths_or_selector, format, options - ) + factory = ds.FileSystemDatasetFactory(mockfs, paths_or_selector, format, options) inspected_schema = factory.inspect() # i64/f64 from data, group/key from "/1/xxx" and "/2/yyy" paths - expected_schema = pa.schema([ - ("i64", pa.int64()), - ("f64", pa.float64()), - ("str", pa.string()), - ("const", pa.int64()), - ("group", pa.int32()), - ("key", pa.string()), - ]) + expected_schema = pa.schema( + [ + ("i64", pa.int64()), + ("f64", pa.float64()), + ("str", pa.string()), + ("const", pa.int64()), + ("group", pa.int32()), + ("key", pa.string()), + ] + ) assert inspected_schema.equals(expected_schema) hive_partitioning_factory = ds.HivePartitioning.discover() assert isinstance(hive_partitioning_factory, 
ds.PartitioningFactory) -@pytest.mark.parametrize('infer_dictionary', [False, True]) +@pytest.mark.parametrize("infer_dictionary", [False, True]) def test_partitioning_factory_dictionary(mockfs, infer_dictionary): - paths_or_selector = fs.FileSelector('subdir', recursive=True) + paths_or_selector = fs.FileSelector("subdir", recursive=True) format = ds.ParquetFileFormat() - options = ds.FileSystemFactoryOptions('subdir') + options = ds.FileSystemFactoryOptions("subdir") options.partitioning_factory = ds.DirectoryPartitioning.discover( - ['group', 'key'], infer_dictionary=infer_dictionary) + ["group", "key"], infer_dictionary=infer_dictionary + ) - factory = ds.FileSystemDatasetFactory( - mockfs, paths_or_selector, format, options) + factory = ds.FileSystemDatasetFactory(mockfs, paths_or_selector, format, options) inferred_schema = factory.inspect() if infer_dictionary: expected_type = pa.dictionary(pa.int32(), pa.string()) - assert inferred_schema.field('key').type == expected_type + assert inferred_schema.field("key").type == expected_type table = factory.finish().to_table().combine_chunks() - actual = table.column('key').chunk(0) - expected = pa.array(['xxx'] * 5 + ['yyy'] * 5).dictionary_encode() + actual = table.column("key").chunk(0) + expected = pa.array(["xxx"] * 5 + ["yyy"] * 5).dictionary_encode() assert actual.equals(expected) # ARROW-9345 ensure filtering on the partition field works - table = factory.finish().to_table(filter=ds.field('key') == 'xxx') - actual = table.column('key').chunk(0) + table = factory.finish().to_table(filter=ds.field("key") == "xxx") + actual = table.column("key").chunk(0) expected = expected.slice(0, 5) assert actual.equals(expected) else: - assert inferred_schema.field('key').type == pa.string() + assert inferred_schema.field("key").type == pa.string() def test_partitioning_function(): @@ -1274,8 +1322,9 @@ def test_partitioning_function(): def _create_single_file(base_dir, table=None, row_group_size=None): import pyarrow.parquet as pq + if table is None: - table = pa.table({'a': range(9), 'b': [0.] * 4 + [1.] * 5}) + table = pa.table({"a": range(9), "b": [0.0] * 4 + [1.0] * 5}) path = base_dir / "test.parquet" pq.write_table(table, path, row_group_size=row_group_size) return table, path @@ -1283,10 +1332,11 @@ def _create_single_file(base_dir, table=None, row_group_size=None): def _create_directory_of_files(base_dir): import pyarrow.parquet as pq - table1 = pa.table({'a': range(9), 'b': [0.] * 4 + [1.] * 5}) + + table1 = pa.table({"a": range(9), "b": [0.0] * 4 + [1.0] * 5}) path1 = base_dir / "test1.parquet" pq.write_table(table1, path1) - table2 = pa.table({'a': range(9, 18), 'b': [0.] * 4 + [1.] 
* 5}) + table2 = pa.table({"a": range(9, 18), "b": [0.0] * 4 + [1.0] * 5}) path2 = base_dir / "test2.parquet" pq.write_table(table2, path2) return (table1, table2), (path1, path2) @@ -1343,13 +1393,8 @@ def test_open_dataset_list_of_files(tempdir): tables, (path1, path2) = _create_directory_of_files(tempdir) table = pa.concat_tables(tables) - datasets = [ - ds.dataset([path1, path2]), - ds.dataset([str(path1), str(path2)]) - ] - datasets += [ - pickle.loads(pickle.dumps(d)) for d in datasets - ] + datasets = [ds.dataset([path1, path2]), ds.dataset([str(path1), str(path2)])] + datasets += [pickle.loads(pickle.dumps(d)) for d in datasets] for dataset in datasets: assert dataset.schema.equals(table.schema) @@ -1358,7 +1403,7 @@ def test_open_dataset_list_of_files(tempdir): def test_construct_from_single_file(tempdir): - directory = tempdir / 'single-file' + directory = tempdir / "single-file" directory.mkdir() table, path = _create_single_file(directory) relative_path = path.relative_to(directory) @@ -1376,7 +1421,7 @@ def test_construct_from_single_file(tempdir): def test_construct_from_single_directory(tempdir): - directory = tempdir / 'single-directory' + directory = tempdir / "single-directory" directory.mkdir() tables, paths = _create_directory_of_files(directory) @@ -1396,7 +1441,7 @@ def test_construct_from_single_directory(tempdir): def test_construct_from_list_of_files(tempdir): # instantiate from a list of files - directory = tempdir / 'list-of-files' + directory = tempdir / "list-of-files" directory.mkdir() tables, paths = _create_directory_of_files(directory) @@ -1419,18 +1464,19 @@ def test_construct_from_list_of_files(tempdir): def test_construct_from_list_of_mixed_paths_fails(mockfs): # isntantiate from a list of mixed paths files = [ - 'subdir/1/xxx/file0.parquet', - 'subdir/1/xxx/doesnt-exist.parquet', + "subdir/1/xxx/file0.parquet", + "subdir/1/xxx/doesnt-exist.parquet", ] - with pytest.raises(FileNotFoundError, match='doesnt-exist'): + with pytest.raises(FileNotFoundError, match="doesnt-exist"): ds.dataset(files, filesystem=mockfs) def test_construct_from_mixed_child_datasets(mockfs): # isntantiate from a list of mixed paths - a = ds.dataset(['subdir/1/xxx/file0.parquet', - 'subdir/2/yyy/file1.parquet'], filesystem=mockfs) - b = ds.dataset('subdir', filesystem=mockfs) + a = ds.dataset( + ["subdir/1/xxx/file0.parquet", "subdir/2/yyy/file1.parquet"], filesystem=mockfs + ) + b = ds.dataset("subdir", filesystem=mockfs) dataset = ds.dataset([a, b]) @@ -1443,8 +1489,10 @@ def test_construct_from_mixed_child_datasets(mockfs): assert len(dataset.children) == 2 for child in dataset.children: - assert child.files == ['subdir/1/xxx/file0.parquet', - 'subdir/2/yyy/file1.parquet'] + assert child.files == [ + "subdir/1/xxx/file0.parquet", + "subdir/2/yyy/file1.parquet", + ] def test_construct_empty_dataset(): @@ -1453,10 +1501,7 @@ def test_construct_empty_dataset(): assert table.num_rows == 0 assert table.num_columns == 0 - empty = ds.dataset([], schema=pa.schema([ - ('a', pa.int64()), - ('a', pa.string()) - ])) + empty = ds.dataset([], schema=pa.schema([("a", pa.int64()), ("a", pa.string())])) table = empty.to_table() assert table.num_rows == 0 assert table.num_columns == 2 @@ -1464,17 +1509,13 @@ def test_construct_empty_dataset(): def test_construct_from_invalid_sources_raise(multisourcefs): child1 = ds.FileSystemDatasetFactory( - multisourcefs, - fs.FileSelector('/plain'), - format=ds.ParquetFileFormat() + multisourcefs, fs.FileSelector("/plain"), format=ds.ParquetFileFormat() ) 
child2 = ds.FileSystemDatasetFactory( - multisourcefs, - fs.FileSelector('/schema'), - format=ds.ParquetFileFormat() + multisourcefs, fs.FileSelector("/schema"), format=ds.ParquetFileFormat() ) - with pytest.raises(TypeError, match='Expected.*FileSystemDatasetFactory'): + with pytest.raises(TypeError, match="Expected.*FileSystemDatasetFactory"): ds.dataset([child1, child2]) expected = ( @@ -1495,7 +1536,8 @@ def test_construct_from_invalid_sources_raise(multisourcefs): @pytest.mark.parquet def test_open_dataset_partitioned_directory(tempdir): import pyarrow.parquet as pq - table = pa.table({'a': range(9), 'b': [0.] * 4 + [1.] * 5}) + + table = pa.table({"a": range(9), "b": [0.0] * 4 + [1.0] * 5}) path = tempdir / "dataset" path.mkdir() @@ -1510,15 +1552,13 @@ def test_open_dataset_partitioned_directory(tempdir): _check_dataset_from_path(path, full_table) # specify partition scheme with discovery - dataset = ds.dataset( - str(path), partitioning=ds.partitioning(flavor="hive")) + dataset = ds.dataset(str(path), partitioning=ds.partitioning(flavor="hive")) expected_schema = table.schema.append(pa.field("part", pa.int32())) assert dataset.schema.equals(expected_schema) # specify partition scheme with discovery and relative path with change_cwd(tempdir): - dataset = ds.dataset( - "dataset/", partitioning=ds.partitioning(flavor="hive")) + dataset = ds.dataset("dataset/", partitioning=ds.partitioning(flavor="hive")) expected_schema = table.schema.append(pa.field("part", pa.int32())) assert dataset.schema.equals(expected_schema) @@ -1529,14 +1569,15 @@ def test_open_dataset_partitioned_directory(tempdir): # specify partition scheme with explicit scheme dataset = ds.dataset( str(path), - partitioning=ds.partitioning( - pa.schema([("part", pa.int8())]), flavor="hive")) + partitioning=ds.partitioning(pa.schema([("part", pa.int8())]), flavor="hive"), + ) expected_schema = table.schema.append(pa.field("part", pa.int8())) assert dataset.schema.equals(expected_schema) result = dataset.to_table() expected = full_table.append_column( - "part", pa.array(np.repeat([0, 1, 2], 9), type=pa.int8())) + "part", pa.array(np.repeat([0, 1, 2], 9), type=pa.int8()) + ) assert result.equals(expected) @@ -1583,7 +1624,7 @@ def test_open_union_dataset(tempdir): def test_open_union_dataset_with_additional_kwargs(multisourcefs): - child = ds.dataset('/plain', filesystem=multisourcefs, format='parquet') + child = ds.dataset("/plain", filesystem=multisourcefs, format="parquet") with pytest.raises(ValueError, match="cannot pass any additional"): ds.dataset([child], format="parquet") @@ -1592,33 +1633,57 @@ def test_open_dataset_non_existing_file(): # ARROW-8213: Opening a dataset with a local incorrect path gives confusing # error message with pytest.raises(FileNotFoundError): - ds.dataset('i-am-not-existing.parquet', format='parquet') + ds.dataset("i-am-not-existing.parquet", format="parquet") - with pytest.raises(pa.ArrowInvalid, match='cannot be relative'): - ds.dataset('file:i-am-not-existing.parquet', format='parquet') + with pytest.raises(pa.ArrowInvalid, match="cannot be relative"): + ds.dataset("file:i-am-not-existing.parquet", format="parquet") @pytest.mark.parquet -@pytest.mark.parametrize('partitioning', ["directory", "hive"]) -@pytest.mark.parametrize('partition_keys', [ - (["A", "B", "C"], [1, 2, 3]), - ([1, 2, 3], ["A", "B", "C"]), - (["A", "B", "C"], ["D", "E", "F"]), - ([1, 2, 3], [4, 5, 6]), -]) -def test_open_dataset_partitioned_dictionary_type(tempdir, partitioning, - partition_keys): 
+@pytest.mark.parametrize("partitioning", ["directory", "hive"]) +@pytest.mark.parametrize("null_fallback", ["xyz", None]) +@pytest.mark.parametrize( + "partition_keys", + [ + (["A", "B", "C"], [1, 2, 3]), + ([1, 2, 3], ["A", "B", "C"]), + (["A", "B", "C"], ["D", "E", "F"]), + ([1, 2, 3], [4, 5, 6]), + ([1, None, 3], ["A", "B", "C"]), + ([1, 2, 3], ["A", None, "C"]), + ([None, 2, 3], [None, 2, 3]), + ], +) +def test_open_dataset_partitioned_dictionary_type( + tempdir, partitioning, null_fallback, partition_keys +): # ARROW-9288 / ARROW-9476 import pyarrow.parquet as pq - table = pa.table({'a': range(9), 'b': [0.] * 4 + [1.] * 5}) + + table = pa.table({"a": range(9), "b": [0.0] * 4 + [1.0] * 5}) + + if None in partition_keys[0] or None in partition_keys[1]: + # Directory partitioning can't handle the first part being null + return if partitioning == "directory": partitioning = ds.DirectoryPartitioning.discover( - ["part1", "part2"], infer_dictionary=True) + ["part1", "part2"], infer_dictionary=True + ) fmt = "{0}/{1}" + null_value = None else: - partitioning = ds.HivePartitioning.discover(infer_dictionary=True) + if null_fallback: + partitioning = ds.HivePartitioning.discover( + infer_dictionary=True, null_fallback=null_fallback + ) + else: + partitioning = ds.HivePartitioning.discover(infer_dictionary=True) fmt = "part1={0}/part2={1}" + if null_fallback: + null_value = null_fallback + else: + null_value = "__HIVE_DEFAULT_PARTITION__" basepath = tempdir / "dataset" basepath.mkdir() @@ -1626,7 +1691,7 @@ def test_open_dataset_partitioned_dictionary_type(tempdir, partitioning, part_keys1, part_keys2 = partition_keys for part1 in part_keys1: for part2 in part_keys2: - path = basepath / fmt.format(part1, part2) + path = basepath / fmt.format(part1 or null_value, part2 or null_value) path.mkdir(parents=True) pq.write_table(table, path / "test.parquet") @@ -1635,11 +1700,10 @@ def test_open_dataset_partitioned_dictionary_type(tempdir, partitioning, def dict_type(key): value_type = pa.string() if isinstance(key, str) else pa.int32() return pa.dictionary(pa.int32(), value_type) + expected_schema = table.schema.append( pa.field("part1", dict_type(part_keys1[0])) - ).append( - pa.field("part2", dict_type(part_keys2[0])) - ) + ).append(pa.field("part2", dict_type(part_keys2[0]))) assert dataset.schema.equals(expected_schema) @@ -1680,15 +1744,14 @@ def s3_example_simple(s3_connection, s3_server): import pyarrow.parquet as pq host, port, access_key, secret_key = s3_connection - uri = ( - "s3://{}:{}@mybucket/data.parquet?scheme=http&endpoint_override={}:{}" - .format(access_key, secret_key, host, port) + uri = "s3://{}:{}@mybucket/data.parquet?scheme=http&endpoint_override={}:{}".format( + access_key, secret_key, host, port ) fs, path = FileSystem.from_uri(uri) fs.create_dir("mybucket") - table = pa.table({'a': [1, 2, 3]}) + table = pa.table({"a": [1, 2, 3]}) with fs.open_output_stream("mybucket/data.parquet") as out: pq.write_table(table, out) @@ -1721,9 +1784,7 @@ def test_open_dataset_from_uri_s3_fsspec(s3_example_simple): fs = s3fs.S3FileSystem( key=access_key, secret=secret_key, - client_kwargs={ - 'endpoint_url': 'http://{}:{}'.format(host, port) - } + client_kwargs={"endpoint_url": "http://{}:{}".format(host, port)}, ) # passing as fsspec filesystem @@ -1743,18 +1804,18 @@ def test_open_dataset_from_s3_with_filesystem_uri(s3_connection, s3_server): import pyarrow.parquet as pq host, port, access_key, secret_key = s3_connection - bucket = 'theirbucket' - path = 'nested/folder/data.parquet' + 
bucket = "theirbucket" + path = "nested/folder/data.parquet" uri = "s3://{}:{}@{}/{}?scheme=http&endpoint_override={}:{}".format( access_key, secret_key, bucket, path, host, port ) fs, path = FileSystem.from_uri(uri) - assert path == 'theirbucket/nested/folder/data.parquet' + assert path == "theirbucket/nested/folder/data.parquet" fs.create_dir(bucket) - table = pa.table({'a': [1, 2, 3]}) + table = pa.table({"a": [1, 2, 3]}) with fs.open_output_stream(path) as out: pq.write_table(table, out) @@ -1763,27 +1824,25 @@ def test_open_dataset_from_s3_with_filesystem_uri(s3_connection, s3_server): assert dataset.to_table().equals(table) # passing filesystem as an uri - template = ( - "s3://{}:{}@{{}}?scheme=http&endpoint_override={}:{}".format( - access_key, secret_key, host, port - ) + template = "s3://{}:{}@{{}}?scheme=http&endpoint_override={}:{}".format( + access_key, secret_key, host, port ) cases = [ - ('theirbucket/nested/folder/', '/data.parquet'), - ('theirbucket/nested/folder', 'data.parquet'), - ('theirbucket/nested/', 'folder/data.parquet'), - ('theirbucket/nested', 'folder/data.parquet'), - ('theirbucket', '/nested/folder/data.parquet'), - ('theirbucket', 'nested/folder/data.parquet'), + ("theirbucket/nested/folder/", "/data.parquet"), + ("theirbucket/nested/folder", "data.parquet"), + ("theirbucket/nested/", "folder/data.parquet"), + ("theirbucket/nested", "folder/data.parquet"), + ("theirbucket", "/nested/folder/data.parquet"), + ("theirbucket", "nested/folder/data.parquet"), ] for prefix, path in cases: uri = template.format(prefix) dataset = ds.dataset(path, filesystem=uri, format="parquet") assert dataset.to_table().equals(table) - with pytest.raises(pa.ArrowInvalid, match='Missing bucket name'): - uri = template.format('/') - ds.dataset('/theirbucket/nested/folder/data.parquet', filesystem=uri) + with pytest.raises(pa.ArrowInvalid, match="Missing bucket name"): + uri = template.format("/") + ds.dataset("/theirbucket/nested/folder/data.parquet", filesystem=uri) error = ( "The path component of the filesystem URI must point to a directory " @@ -1791,17 +1850,17 @@ def test_open_dataset_from_s3_with_filesystem_uri(s3_connection, s3_server): "filesystem URI is `{}`" ) - path = 'theirbucket/doesnt/exist' + path = "theirbucket/doesnt/exist" uri = template.format(path) with pytest.raises(ValueError) as exc: - ds.dataset('data.parquet', filesystem=uri) - assert str(exc.value) == error.format('NotFound', path, uri) + ds.dataset("data.parquet", filesystem=uri) + assert str(exc.value) == error.format("NotFound", path, uri) - path = 'theirbucket/nested/folder/data.parquet' + path = "theirbucket/nested/folder/data.parquet" uri = template.format(path) with pytest.raises(ValueError) as exc: - ds.dataset('data.parquet', filesystem=uri) - assert str(exc.value) == error.format('File', path, uri) + ds.dataset("data.parquet", filesystem=uri) + assert str(exc.value) == error.format("File", path, uri) @pytest.mark.parquet @@ -1846,18 +1905,17 @@ def test_filter_timestamp(tempdir): @pytest.mark.parquet def test_filter_implicit_cast(tempdir): # ARROW-7652 - table = pa.table({'a': pa.array([0, 1, 2, 3, 4, 5], type=pa.int8())}) + table = pa.table({"a": pa.array([0, 1, 2, 3, 4, 5], type=pa.int8())}) _, path = _create_single_file(tempdir, table) dataset = ds.dataset(str(path)) - filter_ = ds.field('a') > 2 + filter_ = ds.field("a") > 2 assert len(dataset.to_table(filter=filter_)) == 3 def test_dataset_union(multisourcefs): child = ds.FileSystemDatasetFactory( - multisourcefs, fs.FileSelector('/plain'), - 
format=ds.ParquetFileFormat() + multisourcefs, fs.FileSelector("/plain"), format=ds.ParquetFileFormat() ) factory = ds.UnionDatasetFactory([child]) @@ -1870,106 +1928,128 @@ def test_dataset_union(multisourcefs): def test_union_dataset_from_other_datasets(tempdir, multisourcefs): - child1 = ds.dataset('/plain', filesystem=multisourcefs, format='parquet') - child2 = ds.dataset('/schema', filesystem=multisourcefs, format='parquet', - partitioning=['week', 'color']) - child3 = ds.dataset('/hive', filesystem=multisourcefs, format='parquet', - partitioning='hive') + child1 = ds.dataset("/plain", filesystem=multisourcefs, format="parquet") + child2 = ds.dataset( + "/schema", + filesystem=multisourcefs, + format="parquet", + partitioning=["week", "color"], + ) + child3 = ds.dataset( + "/hive", filesystem=multisourcefs, format="parquet", partitioning="hive" + ) assert child1.schema != child2.schema != child3.schema assembled = ds.dataset([child1, child2, child3]) assert isinstance(assembled, ds.UnionDataset) - msg = 'cannot pass any additional arguments' + msg = "cannot pass any additional arguments" with pytest.raises(ValueError, match=msg): ds.dataset([child1, child2], filesystem=multisourcefs) - expected_schema = pa.schema([ - ('date', pa.date32()), - ('index', pa.int64()), - ('value', pa.float64()), - ('color', pa.string()), - ('week', pa.int32()), - ('year', pa.int32()), - ('month', pa.int32()), - ]) + expected_schema = pa.schema( + [ + ("date", pa.date32()), + ("index", pa.int64()), + ("value", pa.float64()), + ("color", pa.string()), + ("week", pa.int32()), + ("year", pa.int32()), + ("month", pa.int32()), + ] + ) assert assembled.schema.equals(expected_schema) assert assembled.to_table().schema.equals(expected_schema) assembled = ds.dataset([child1, child3]) - expected_schema = pa.schema([ - ('date', pa.date32()), - ('index', pa.int64()), - ('value', pa.float64()), - ('color', pa.string()), - ('year', pa.int32()), - ('month', pa.int32()), - ]) + expected_schema = pa.schema( + [ + ("date", pa.date32()), + ("index", pa.int64()), + ("value", pa.float64()), + ("color", pa.string()), + ("year", pa.int32()), + ("month", pa.int32()), + ] + ) assert assembled.schema.equals(expected_schema) assert assembled.to_table().schema.equals(expected_schema) - expected_schema = pa.schema([ - ('month', pa.int32()), - ('color', pa.string()), - ('date', pa.date32()), - ]) + expected_schema = pa.schema( + [ + ("month", pa.int32()), + ("color", pa.string()), + ("date", pa.date32()), + ] + ) assembled = ds.dataset([child1, child3], schema=expected_schema) assert assembled.to_table().schema.equals(expected_schema) - expected_schema = pa.schema([ - ('month', pa.int32()), - ('color', pa.string()), - ('unknown', pa.string()) # fill with nulls - ]) + expected_schema = pa.schema( + [ + ("month", pa.int32()), + ("color", pa.string()), + ("unknown", pa.string()), # fill with nulls + ] + ) assembled = ds.dataset([child1, child3], schema=expected_schema) assert assembled.to_table().schema.equals(expected_schema) # incompatible schemas, date and index columns have conflicting types - table = pa.table([range(9), [0.] * 4 + [1.] 
* 5, 'abcdefghj'], - names=['date', 'value', 'index']) + table = pa.table( + [range(9), [0.0] * 4 + [1.0] * 5, "abcdefghj"], names=["date", "value", "index"] + ) _, path = _create_single_file(tempdir, table=table) child4 = ds.dataset(path) - with pytest.raises(pa.ArrowInvalid, match='Unable to merge'): + with pytest.raises(pa.ArrowInvalid, match="Unable to merge"): ds.dataset([child1, child4]) def test_dataset_from_a_list_of_local_directories_raises(multisourcefs): - msg = 'points to a directory, but only file paths are supported' + msg = "points to a directory, but only file paths are supported" with pytest.raises(IsADirectoryError, match=msg): - ds.dataset(['/plain', '/schema', '/hive'], filesystem=multisourcefs) + ds.dataset(["/plain", "/schema", "/hive"], filesystem=multisourcefs) def test_union_dataset_filesystem_datasets(multisourcefs): # without partitioning - dataset = ds.dataset([ - ds.dataset('/plain', filesystem=multisourcefs), - ds.dataset('/schema', filesystem=multisourcefs), - ds.dataset('/hive', filesystem=multisourcefs), - ]) - expected_schema = pa.schema([ - ('date', pa.date32()), - ('index', pa.int64()), - ('value', pa.float64()), - ('color', pa.string()), - ]) + dataset = ds.dataset( + [ + ds.dataset("/plain", filesystem=multisourcefs), + ds.dataset("/schema", filesystem=multisourcefs), + ds.dataset("/hive", filesystem=multisourcefs), + ] + ) + expected_schema = pa.schema( + [ + ("date", pa.date32()), + ("index", pa.int64()), + ("value", pa.float64()), + ("color", pa.string()), + ] + ) assert dataset.schema.equals(expected_schema) # with hive partitioning for two hive sources - dataset = ds.dataset([ - ds.dataset('/plain', filesystem=multisourcefs), - ds.dataset('/schema', filesystem=multisourcefs), - ds.dataset('/hive', filesystem=multisourcefs, partitioning='hive') - ]) - expected_schema = pa.schema([ - ('date', pa.date32()), - ('index', pa.int64()), - ('value', pa.float64()), - ('color', pa.string()), - ('year', pa.int32()), - ('month', pa.int32()), - ]) + dataset = ds.dataset( + [ + ds.dataset("/plain", filesystem=multisourcefs), + ds.dataset("/schema", filesystem=multisourcefs), + ds.dataset("/hive", filesystem=multisourcefs, partitioning="hive"), + ] + ) + expected_schema = pa.schema( + [ + ("date", pa.date32()), + ("index", pa.int64()), + ("value", pa.float64()), + ("color", pa.string()), + ("year", pa.int32()), + ("month", pa.int32()), + ] + ) assert dataset.schema.equals(expected_schema) @@ -1977,7 +2057,7 @@ def test_union_dataset_filesystem_datasets(multisourcefs): def test_specified_schema(tempdir): import pyarrow.parquet as pq - table = pa.table({'a': [1, 2, 3], 'b': [.1, .2, .3]}) + table = pa.table({"a": [1, 2, 3], "b": [0.1, 0.2, 0.3]}) pq.write_table(table, tempdir / "data.parquet") def _check_dataset(schema, expected, expected_schema=None): @@ -2000,24 +2080,24 @@ def _check_dataset(schema, expected, expected_schema=None): _check_dataset(schema, expected) # Specifying schema with change column order - schema = pa.schema([('b', 'float64'), ('a', 'int64')]) - expected = pa.table([[.1, .2, .3], [1, 2, 3]], names=['b', 'a']) + schema = pa.schema([("b", "float64"), ("a", "int64")]) + expected = pa.table([[0.1, 0.2, 0.3], [1, 2, 3]], names=["b", "a"]) _check_dataset(schema, expected) # Specifying schema with missing column - schema = pa.schema([('a', 'int64')]) - expected = pa.table([[1, 2, 3]], names=['a']) + schema = pa.schema([("a", "int64")]) + expected = pa.table([[1, 2, 3]], names=["a"]) _check_dataset(schema, expected) # Specifying schema with 
additional column - schema = pa.schema([('a', 'int64'), ('c', 'int32')]) - expected = pa.table([[1, 2, 3], - pa.array([None, None, None], type='int32')], - names=['a', 'c']) + schema = pa.schema([("a", "int64"), ("c", "int32")]) + expected = pa.table( + [[1, 2, 3], pa.array([None, None, None], type="int32")], names=["a", "c"] + ) _check_dataset(schema, expected) # Specifying with incompatible schema - schema = pa.schema([('a', 'int32'), ('b', 'float64')]) + schema = pa.schema([("a", "int32"), ("b", "float64")]) dataset = ds.dataset(str(tempdir / "data.parquet"), schema=schema) assert dataset.schema.equals(schema) with pytest.raises(TypeError): @@ -2025,10 +2105,14 @@ def _check_dataset(schema, expected, expected_schema=None): def test_ipc_format(tempdir): - table = pa.table({'a': pa.array([1, 2, 3], type="int8"), - 'b': pa.array([.1, .2, .3], type="float64")}) + table = pa.table( + { + "a": pa.array([1, 2, 3], type="int8"), + "b": pa.array([0.1, 0.2, 0.3], type="float64"), + } + ) - path = str(tempdir / 'test.arrow') + path = str(tempdir / "test.arrow") with pa.output_stream(path) as sink: writer = pa.RecordBatchFileWriter(sink, table.schema) writer.write_batch(table.to_batches()[0]) @@ -2046,17 +2130,21 @@ def test_ipc_format(tempdir): @pytest.mark.pandas def test_csv_format(tempdir): - table = pa.table({'a': pa.array([1, 2, 3], type="int64"), - 'b': pa.array([.1, .2, .3], type="float64")}) + table = pa.table( + { + "a": pa.array([1, 2, 3], type="int64"), + "b": pa.array([0.1, 0.2, 0.3], type="float64"), + } + ) - path = str(tempdir / 'test.csv') + path = str(tempdir / "test.csv") table.to_pandas().to_csv(path, index=False) dataset = ds.dataset(path, format=ds.CsvFileFormat()) result = dataset.to_table() assert result.equals(table) - dataset = ds.dataset(path, format='csv') + dataset = ds.dataset(path, format="csv") result = dataset.to_table() assert result.equals(table) @@ -2064,8 +2152,12 @@ def test_csv_format(tempdir): def test_feather_format(tempdir): from pyarrow.feather import write_feather - table = pa.table({'a': pa.array([1, 2, 3], type="int8"), - 'b': pa.array([.1, .2, .3], type="float64")}) + table = pa.table( + { + "a": pa.array([1, 2, 3], type="int8"), + "b": pa.array([0.1, 0.2, 0.3], type="float64"), + } + ) basedir = tempdir / "feather_dataset" basedir.mkdir() @@ -2097,16 +2189,15 @@ def _create_parquet_dataset_simple(root_path): metadata_collector = [] for i in range(4): - table = pa.table({'f1': [i] * 10, 'f2': np.random.randn(10)}) + table = pa.table({"f1": [i] * 10, "f2": np.random.randn(10)}) pq.write_to_dataset( table, str(root_path), metadata_collector=metadata_collector ) - metadata_path = str(root_path / '_metadata') + metadata_path = str(root_path / "_metadata") # write _metadata file pq.write_metadata( - table.schema, metadata_path, - metadata_collector=metadata_collector + table.schema, metadata_path, metadata_collector=metadata_collector ) return metadata_path, table @@ -2151,22 +2242,23 @@ def _create_metadata_file(root_path): metadata_collector.append(metadata) metadata_path = root_path / "_metadata" - pq.write_metadata( - schema, metadata_path, metadata_collector=metadata_collector - ) + pq.write_metadata(schema, metadata_path, metadata_collector=metadata_collector) return metadata_path def _create_parquet_dataset_partitioned(root_path): import pyarrow.parquet as pq - table = pa.table([ - pa.array(range(20)), pa.array(np.random.randn(20)), - pa.array(np.repeat(['a', 'b'], 10))], - names=["f1", "f2", "part"] + table = pa.table( + [ + pa.array(range(20)), + 
pa.array(np.random.randn(20)), + pa.array(np.repeat(["a", "b"], 10)), + ], + names=["f1", "f2", "part"], ) table = table.replace_schema_metadata({"key": "value"}) - pq.write_to_dataset(table, str(root_path), partition_cols=['part']) + pq.write_to_dataset(table, str(root_path), partition_cols=["part"]) return _create_metadata_file(root_path), table @@ -2219,9 +2311,8 @@ def test_parquet_dataset_lazy_filtering(tempdir, open_logging_fs): # creating the dataset should only open the metadata file with assert_opens([metadata_path]): dataset = ds.parquet_dataset( - metadata_path, - partitioning=ds.partitioning(flavor="hive"), - filesystem=fs) + metadata_path, partitioning=ds.partitioning(flavor="hive"), filesystem=fs + ) # materializing fragments should not open any file with assert_opens([]): @@ -2250,7 +2341,7 @@ def test_parquet_dataset_lazy_filtering(tempdir, open_logging_fs): @pytest.mark.pandas def test_dataset_schema_metadata(tempdir): # ARROW-8802 - df = pd.DataFrame({'a': [1, 2, 3]}) + df = pd.DataFrame({"a": [1, 2, 3]}) path = tempdir / "test.parquet" df.to_parquet(path) dataset = ds.dataset(path) @@ -2269,13 +2360,12 @@ def test_filter_mismatching_schema(tempdir): # ARROW-9146 import pyarrow.parquet as pq - table = pa.table({"col": pa.array([1, 2, 3, 4], type='int32')}) + table = pa.table({"col": pa.array([1, 2, 3, 4], type="int32")}) pq.write_table(table, str(tempdir / "data.parquet")) # specifying explicit schema, but that mismatches the schema of the data schema = pa.schema([("col", pa.int64())]) - dataset = ds.dataset( - tempdir / "data.parquet", format="parquet", schema=schema) + dataset = ds.dataset(tempdir / "data.parquet", format="parquet", schema=schema) # filtering on a column with such type mismatch should give a proper error with pytest.raises(TypeError): @@ -2292,65 +2382,70 @@ def test_dataset_project_only_partition_columns(tempdir): # ARROW-8729 import pyarrow.parquet as pq - table = pa.table({'part': 'a a b b'.split(), 'col': list(range(4))}) + table = pa.table({"part": "a a b b".split(), "col": list(range(4))}) - path = str(tempdir / 'test_dataset') - pq.write_to_dataset(table, path, partition_cols=['part']) - dataset = ds.dataset(path, partitioning='hive') + path = str(tempdir / "test_dataset") + pq.write_to_dataset(table, path, partition_cols=["part"]) + dataset = ds.dataset(path, partitioning="hive") all_cols = dataset.to_table(use_threads=False) - part_only = dataset.to_table(columns=['part'], use_threads=False) + part_only = dataset.to_table(columns=["part"], use_threads=False) - assert all_cols.column('part').equals(part_only.column('part')) + assert all_cols.column("part").equals(part_only.column("part")) @pytest.mark.parquet @pytest.mark.pandas def test_dataset_project_null_column(tempdir): import pandas as pd - df = pd.DataFrame({"col": np.array([None, None, None], dtype='object')}) + + df = pd.DataFrame({"col": np.array([None, None, None], dtype="object")}) f = tempdir / "test_dataset_project_null_column.parquet" df.to_parquet(f, engine="pyarrow") - dataset = ds.dataset(f, format="parquet", - schema=pa.schema([("col", pa.int64())])) - expected = pa.table({'col': pa.array([None, None, None], pa.int64())}) + dataset = ds.dataset(f, format="parquet", schema=pa.schema([("col", pa.int64())])) + expected = pa.table({"col": pa.array([None, None, None], pa.int64())}) assert dataset.to_table().equals(expected) -def _check_dataset_roundtrip(dataset, base_dir, expected_files, - base_dir_path=None, partitioning=None): +def _check_dataset_roundtrip( + dataset, base_dir, 
expected_files, base_dir_path=None, partitioning=None +): base_dir_path = base_dir_path or base_dir - ds.write_dataset(dataset, base_dir, format="feather", - partitioning=partitioning, use_threads=False) + ds.write_dataset( + dataset, + base_dir, + format="feather", + partitioning=partitioning, + use_threads=False, + ) # check that all files are present file_paths = list(base_dir_path.rglob("*")) assert set(file_paths) == set(expected_files) # check that reading back in as dataset gives the same result - dataset2 = ds.dataset( - base_dir_path, format="feather", partitioning=partitioning) + dataset2 = ds.dataset(base_dir_path, format="feather", partitioning=partitioning) assert dataset2.to_table().equals(dataset.to_table()) @pytest.mark.parquet def test_write_dataset(tempdir): # manually create a written dataset and read as dataset object - directory = tempdir / 'single-file' + directory = tempdir / "single-file" directory.mkdir() _ = _create_single_file(directory) dataset = ds.dataset(directory) # full string path - target = tempdir / 'single-file-target' + target = tempdir / "single-file-target" expected_files = [target / "part-0.feather"] _check_dataset_roundtrip(dataset, str(target), expected_files, target) # pathlib path object - target = tempdir / 'single-file-target2' + target = tempdir / "single-file-target2" expected_files = [target / "part-0.feather"] _check_dataset_roundtrip(dataset, target, expected_files, target) @@ -2362,12 +2457,12 @@ def test_write_dataset(tempdir): # dataset, './single-file-target3', expected_files, target) # Directory of files - directory = tempdir / 'single-directory' + directory = tempdir / "single-directory" directory.mkdir() _ = _create_directory_of_files(directory) dataset = ds.dataset(directory) - target = tempdir / 'single-directory-target' + target = tempdir / "single-directory-target" expected_files = [target / "part-0.feather"] _check_dataset_roundtrip(dataset, str(target), expected_files, target) @@ -2381,28 +2476,32 @@ def test_write_dataset_partitioned(tempdir): dataset = ds.dataset(directory, partitioning=partitioning) # hive partitioning - target = tempdir / 'partitioned-hive-target' + target = tempdir / "partitioned-hive-target" expected_paths = [ - target / "part=a", target / "part=a" / "part-0.feather", - target / "part=b", target / "part=b" / "part-1.feather" + target / "part=a", + target / "part=a" / "part-0.feather", + target / "part=b", + target / "part=b" / "part-1.feather", ] partitioning_schema = ds.partitioning( - pa.schema([("part", pa.string())]), flavor="hive") + pa.schema([("part", pa.string())]), flavor="hive" + ) _check_dataset_roundtrip( - dataset, str(target), expected_paths, target, - partitioning=partitioning_schema) + dataset, str(target), expected_paths, target, partitioning=partitioning_schema + ) # directory partitioning - target = tempdir / 'partitioned-dir-target' + target = tempdir / "partitioned-dir-target" expected_paths = [ - target / "a", target / "a" / "part-0.feather", - target / "b", target / "b" / "part-1.feather" + target / "a", + target / "a" / "part-0.feather", + target / "b", + target / "b" / "part-1.feather", ] - partitioning_schema = ds.partitioning( - pa.schema([("part", pa.string())])) + partitioning_schema = ds.partitioning(pa.schema([("part", pa.string())])) _check_dataset_roundtrip( - dataset, str(target), expected_paths, target, - partitioning=partitioning_schema) + dataset, str(target), expected_paths, target, partitioning=partitioning_schema + ) @pytest.mark.parquet @@ -2413,22 +2512,25 @@ 
def test_write_dataset_partitioned_dict(tempdir): # directory partitioning, dictionary partition columns dataset = ds.dataset( - directory, - partitioning=ds.HivePartitioning.discover(infer_dictionary=True)) - target = tempdir / 'partitioned-dir-target' + directory, partitioning=ds.HivePartitioning.discover(infer_dictionary=True) + ) + target = tempdir / "partitioned-dir-target" expected_paths = [ - target / "a", target / "a" / "part-0.feather", - target / "b", target / "b" / "part-1.feather" + target / "a", + target / "a" / "part-0.feather", + target / "b", + target / "b" / "part-1.feather", ] - partitioning = ds.partitioning(pa.schema([ - dataset.schema.field('part')]), - dictionaries={'part': pa.array(['a', 'b'])}) + partitioning = ds.partitioning( + pa.schema([dataset.schema.field("part")]), + dictionaries={"part": pa.array(["a", "b"])}, + ) # NB: dictionaries required here since we use partitioning to parse # directories in _check_dataset_roundtrip (not currently required for # the formatting step) _check_dataset_roundtrip( - dataset, str(target), expected_paths, target, - partitioning=partitioning) + dataset, str(target), expected_paths, target, partitioning=partitioning + ) @pytest.mark.parquet @@ -2438,18 +2540,15 @@ def test_write_dataset_use_threads(tempdir): _ = _create_parquet_dataset_partitioned(directory) dataset = ds.dataset(directory, partitioning="hive") - partitioning = ds.partitioning( - pa.schema([("part", pa.string())]), flavor="hive") + partitioning = ds.partitioning(pa.schema([("part", pa.string())]), flavor="hive") - target1 = tempdir / 'partitioned1' + target1 = tempdir / "partitioned1" ds.write_dataset( - dataset, target1, format="feather", partitioning=partitioning, - use_threads=True + dataset, target1, format="feather", partitioning=partitioning, use_threads=True ) - target2 = tempdir / 'partitioned2' + target2 = tempdir / "partitioned2" ds.write_dataset( - dataset, target2, format="feather", partitioning=partitioning, - use_threads=False + dataset, target2, format="feather", partitioning=partitioning, use_threads=False ) # check that reading in gives same result @@ -2459,14 +2558,19 @@ def test_write_dataset_use_threads(tempdir): def test_write_table(tempdir): - table = pa.table([ - pa.array(range(20)), pa.array(np.random.randn(20)), - pa.array(np.repeat(['a', 'b'], 10)) - ], names=["f1", "f2", "part"]) - - base_dir = tempdir / 'single' - ds.write_dataset(table, base_dir, - basename_template='dat_{i}.arrow', format="feather") + table = pa.table( + [ + pa.array(range(20)), + pa.array(np.random.randn(20)), + pa.array(np.repeat(["a", "b"], 10)), + ], + names=["f1", "f2", "part"], + ) + + base_dir = tempdir / "single" + ds.write_dataset( + table, base_dir, basename_template="dat_{i}.arrow", format="feather" + ) # check that all files are present file_paths = list(base_dir.rglob("*")) expected_paths = [base_dir / "dat_0.arrow"] @@ -2476,16 +2580,21 @@ def test_write_table(tempdir): assert result.equals(table) # with partitioning - base_dir = tempdir / 'partitioned' - partitioning = ds.partitioning( - pa.schema([("part", pa.string())]), flavor="hive") - ds.write_dataset(table, base_dir, format="feather", - basename_template='dat_{i}.arrow', - partitioning=partitioning) + base_dir = tempdir / "partitioned" + partitioning = ds.partitioning(pa.schema([("part", pa.string())]), flavor="hive") + ds.write_dataset( + table, + base_dir, + format="feather", + basename_template="dat_{i}.arrow", + partitioning=partitioning, + ) file_paths = list(base_dir.rglob("*")) 
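    # With hive-flavored partitioning, each distinct "part" value should land in
    # its own part=<value>/ directory; expected_paths below spells that out.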
expected_paths = [ - base_dir / "part=a", base_dir / "part=a" / "dat_0.arrow", - base_dir / "part=b", base_dir / "part=b" / "dat_1.arrow" + base_dir / "part=a", + base_dir / "part=a" / "dat_0.arrow", + base_dir / "part=b", + base_dir / "part=b" / "dat_1.arrow", ] assert set(file_paths) == set(expected_paths) result = ds.dataset(base_dir, format="ipc", partitioning=partitioning) @@ -2493,59 +2602,65 @@ def test_write_table(tempdir): def test_write_table_multiple_fragments(tempdir): - table = pa.table([ - pa.array(range(10)), pa.array(np.random.randn(10)), - pa.array(np.repeat(['a', 'b'], 5)) - ], names=["f1", "f2", "part"]) - table = pa.concat_tables([table]*2) + table = pa.table( + [ + pa.array(range(10)), + pa.array(np.random.randn(10)), + pa.array(np.repeat(["a", "b"], 5)), + ], + names=["f1", "f2", "part"], + ) + table = pa.concat_tables([table] * 2) # Table with multiple batches written as single Fragment by default - base_dir = tempdir / 'single' + base_dir = tempdir / "single" ds.write_dataset(table, base_dir, format="feather") assert set(base_dir.rglob("*")) == set([base_dir / "part-0.feather"]) assert ds.dataset(base_dir, format="ipc").to_table().equals(table) # Same for single-element list of Table - base_dir = tempdir / 'single-list' + base_dir = tempdir / "single-list" ds.write_dataset([table], base_dir, format="feather") assert set(base_dir.rglob("*")) == set([base_dir / "part-0.feather"]) assert ds.dataset(base_dir, format="ipc").to_table().equals(table) # Provide list of batches to write multiple fragments - base_dir = tempdir / 'multiple' + base_dir = tempdir / "multiple" ds.write_dataset(table.to_batches(), base_dir, format="feather") - assert set(base_dir.rglob("*")) == set( - [base_dir / "part-0.feather"]) + assert set(base_dir.rglob("*")) == set([base_dir / "part-0.feather"]) assert ds.dataset(base_dir, format="ipc").to_table().equals(table) # Provide list of tables to write multiple fragments - base_dir = tempdir / 'multiple-table' + base_dir = tempdir / "multiple-table" ds.write_dataset([table, table], base_dir, format="feather") - assert set(base_dir.rglob("*")) == set( - [base_dir / "part-0.feather"]) - assert ds.dataset(base_dir, format="ipc").to_table().equals( - pa.concat_tables([table]*2) + assert set(base_dir.rglob("*")) == set([base_dir / "part-0.feather"]) + assert ( + ds.dataset(base_dir, format="ipc") + .to_table() + .equals(pa.concat_tables([table] * 2)) ) def test_write_table_partitioned_dict(tempdir): # ensure writing table partitioned on a dictionary column works without # specifying the dictionary values explicitly - table = pa.table([ - pa.array(range(20)), - pa.array(np.repeat(['a', 'b'], 10)).dictionary_encode(), - ], names=['col', 'part']) + table = pa.table( + [ + pa.array(range(20)), + pa.array(np.repeat(["a", "b"], 10)).dictionary_encode(), + ], + names=["col", "part"], + ) partitioning = ds.partitioning(table.select(["part"]).schema) base_dir = tempdir / "dataset" - ds.write_dataset( - table, base_dir, format="feather", partitioning=partitioning - ) + ds.write_dataset(table, base_dir, format="feather", partitioning=partitioning) # check roundtrip partitioning_read = ds.DirectoryPartitioning.discover( - ["part"], infer_dictionary=True) + ["part"], infer_dictionary=True + ) result = ds.dataset( base_dir, format="ipc", partitioning=partitioning_read ).to_table() @@ -2556,14 +2671,18 @@ def test_write_table_partitioned_dict(tempdir): def test_write_dataset_parquet(tempdir): import pyarrow.parquet as pq - table = pa.table([ - pa.array(range(20)), 
pa.array(np.random.randn(20)), - pa.array(np.repeat(['a', 'b'], 10)) - ], names=["f1", "f2", "part"]) + table = pa.table( + [ + pa.array(range(20)), + pa.array(np.random.randn(20)), + pa.array(np.repeat(["a", "b"], 10)), + ], + names=["f1", "f2", "part"], + ) # using default "parquet" format string - base_dir = tempdir / 'parquet_dataset' + base_dir = tempdir / "parquet_dataset" ds.write_dataset(table, base_dir, format="parquet") # check that all files are present file_paths = list(base_dir.rglob("*")) @@ -2577,7 +2696,7 @@ def test_write_dataset_parquet(tempdir): for version in ["1.0", "2.0"]: format = ds.ParquetFileFormat() opts = format.make_write_options(version=version) - base_dir = tempdir / 'parquet_dataset_version{0}'.format(version) + base_dir = tempdir / "parquet_dataset_version{0}".format(version) ds.write_dataset(table, base_dir, format=format, file_options=opts) meta = pq.read_metadata(base_dir / "part-0.parquet") assert meta.format_version == version @@ -2602,12 +2721,12 @@ def test_write_dataset_schema_metadata(tempdir): # ensure that schema metadata gets written from pyarrow import feather - table = pa.table({'a': [1, 2, 3]}) - table = table.replace_schema_metadata({b'key': b'value'}) + table = pa.table({"a": [1, 2, 3]}) + table = table.replace_schema_metadata({b"key": b"value"}) ds.write_dataset(table, tempdir, format="feather") schema = feather.read_table(tempdir / "part-0.feather").schema - assert schema.metadata == {b'key': b'value'} + assert schema.metadata == {b"key": b"value"} @pytest.mark.parquet @@ -2615,12 +2734,12 @@ def test_write_dataset_schema_metadata_parquet(tempdir): # ensure that schema metadata gets written import pyarrow.parquet as pq - table = pa.table({'a': [1, 2, 3]}) - table = table.replace_schema_metadata({b'key': b'value'}) + table = pa.table({"a": [1, 2, 3]}) + table = table.replace_schema_metadata({b"key": b"value"}) ds.write_dataset(table, tempdir, format="parquet") schema = pq.read_table(tempdir / "part-0.parquet").schema - assert schema.metadata == {b'key': b'value'} + assert schema.metadata == {b"key": b"value"} @pytest.mark.parquet @@ -2628,22 +2747,23 @@ def test_write_dataset_schema_metadata_parquet(tempdir): def test_write_dataset_s3(s3_example_simple): # write dataset with s3 filesystem _, _, fs, _, host, port, access_key, secret_key = s3_example_simple - uri_template = ( - "s3://{}:{}@{{}}?scheme=http&endpoint_override={}:{}".format( - access_key, secret_key, host, port) + uri_template = "s3://{}:{}@{{}}?scheme=http&endpoint_override={}:{}".format( + access_key, secret_key, host, port ) - table = pa.table([ - pa.array(range(20)), pa.array(np.random.randn(20)), - pa.array(np.repeat(['a', 'b'], 10))], - names=["f1", "f2", "part"] + table = pa.table( + [ + pa.array(range(20)), + pa.array(np.random.randn(20)), + pa.array(np.repeat(["a", "b"], 10)), + ], + names=["f1", "f2", "part"], ) part = ds.partitioning(pa.schema([("part", pa.string())]), flavor="hive") # writing with filesystem object ds.write_dataset( - table, "mybucket/dataset", filesystem=fs, format="feather", - partitioning=part + table, "mybucket/dataset", filesystem=fs, format="feather", partitioning=part ) # check rountrip result = ds.dataset( From d502e05f7ab48dcf51001ffae1a9a5fc07bac792 Mon Sep 17 00:00:00 2001 From: Weston Pace Date: Mon, 8 Feb 2021 07:43:23 -1000 Subject: [PATCH 09/33] WIP --- python/pyarrow/_compute.pyx | 21 ++++++ python/pyarrow/array.pxi | 5 +- python/pyarrow/compute.py | 106 ++++++++++++++++----------- python/pyarrow/includes/libarrow.pxd | 13 ++++ 4 
files changed, 102 insertions(+), 43 deletions(-) diff --git a/python/pyarrow/_compute.pyx b/python/pyarrow/_compute.pyx index e5a19288b87..8dea882334a 100644 --- a/python/pyarrow/_compute.pyx +++ b/python/pyarrow/_compute.pyx @@ -648,6 +648,27 @@ class FilterOptions(_FilterOptions): def __init__(self, null_selection_behavior='drop'): self._set_options(null_selection_behavior) +cdef class _DictionaryEncodeOptions(FunctionOptions): + cdef: + unique_ptr[CDictionaryEncodeOptions] dictionary_encode_options + + cdef const CFunctionOptions* get_options(self) except NULL: + return self.dictionary_encode_options.get() + + def _set_options(self, null_encoding_behavior): + if null_encoding_behavior == 'encode': + self.dictionary_encode_options.reset( + new CDictionaryEncodeOptions(CDictionaryEncodeNullEncodingBehavior_ENCODE)) + elif null_encoding_behavior == 'mask': + self.dictionary_encode_options.reset( + new CDictionaryEncodeOptions(CDictionaryEncodeNullEncodingBehavior_MASK)) + else: + raise ValueError('"{}" is not a valid null_encoding_behavior'.format(null_encoding_behavior)) + +class DictionaryEncodeOptions(_DictionaryEncodeOptions): + def __init__(self, null_encoding_behavior='mask'): + self._set_options(null_encoding_behavior) + cdef class _TakeOptions(FunctionOptions): cdef: diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi index ae9e213b98d..a832b00b1eb 100644 --- a/python/pyarrow/array.pxi +++ b/python/pyarrow/array.pxi @@ -842,11 +842,12 @@ cdef class Array(_PandasConvertible): """ return _pc().call_function('unique', [self]) - def dictionary_encode(self): + def dictionary_encode(self, null_encoding='mask'): """ Compute dictionary-encoded representation of array. """ - return _pc().call_function('dictionary_encode', [self]) + options = _pc().DictionaryEncodeOptions(null_encoding) + return _pc().call_function('dictionary_encode', [self], options) def value_counts(self): """ diff --git a/python/pyarrow/compute.py b/python/pyarrow/compute.py index 616b2de89ec..1483d97a72d 100644 --- a/python/pyarrow/compute.py +++ b/python/pyarrow/compute.py @@ -30,6 +30,7 @@ ArraySortOptions, CastOptions, CountOptions, + DictionaryEncodeOptions, FilterOptions, MatchSubstringOptions, MinMaxOptions, @@ -68,14 +69,14 @@ def _get_arg_names(func): arg_names = ["left", "right"] else: raise NotImplementedError( - f"unsupported arity: {func.arity} (function: {func.name})") + f"unsupported arity: {func.arity} (function: {func.name})" + ) return arg_names def _decorate_compute_function(wrapper, exposed_name, func, option_class): - wrapper.__arrow_compute_function__ = dict(name=func.name, - arity=func.arity) + wrapper.__arrow_compute_function__ = dict(name=func.name, arity=func.arity) wrapper.__name__ = exposed_name wrapper.__qualname__ = exposed_name @@ -85,47 +86,64 @@ def _decorate_compute_function(wrapper, exposed_name, func, option_class): summary = cpp_doc.summary if not summary: arg_str = "arguments" if func.arity > 1 else "argument" - summary = ("Call compute function {!r} with the given {}" - .format(func.name, arg_str)) + summary = "Call compute function {!r} with the given {}".format( + func.name, arg_str + ) description = cpp_doc.description arg_names = _get_arg_names(func) - doc_pieces.append("""\ + doc_pieces.append( + """\ {}. 
- """.format(summary)) + """.format( + summary + ) + ) if description: doc_pieces.append("{}\n\n".format(description)) - doc_pieces.append("""\ + doc_pieces.append( + """\ Parameters ---------- - """) + """ + ) for arg_name in arg_names: - if func.kind in ('vector', 'scalar_aggregate'): - arg_type = 'Array-like' + if func.kind in ("vector", "scalar_aggregate"): + arg_type = "Array-like" else: - arg_type = 'Array-like or scalar-like' - doc_pieces.append("""\ + arg_type = "Array-like or scalar-like" + doc_pieces.append( + """\ {} : {} Argument to compute function - """.format(arg_name, arg_type)) + """.format( + arg_name, arg_type + ) + ) - doc_pieces.append("""\ + doc_pieces.append( + """\ memory_pool : pyarrow.MemoryPool, optional If not passed, will allocate memory from the default memory pool. - """) + """ + ) if option_class is not None: - doc_pieces.append("""\ + doc_pieces.append( + """\ options : pyarrow.compute.{0}, optional Parameters altering compute function semantics **kwargs : optional Parameters for {0} constructor. Either `options` or `**kwargs` can be passed, but not both at the same time. - """.format(option_class.__name__)) + """.format( + option_class.__name__ + ) + ) wrapper.__doc__ = "".join(dedent(s) for s in doc_pieces) return wrapper @@ -138,8 +156,9 @@ def _get_options_class(func): try: return globals()[class_name] except KeyError: - warnings.warn("Python binding for {} not exposed" - .format(class_name), RuntimeWarning) + warnings.warn( + "Python binding for {} not exposed".format(class_name), RuntimeWarning + ) return None @@ -149,8 +168,8 @@ def _handle_options(name, option_class, options, kwargs): return option_class(**kwargs) raise TypeError( "Function {!r} called with both an 'options' argument " - "and additional named arguments" - .format(name)) + "and additional named arguments".format(name) + ) if options is not None: if isinstance(options, dict): @@ -158,20 +177,25 @@ def _handle_options(name, option_class, options, kwargs): elif isinstance(options, option_class): return options raise TypeError( - "Function {!r} expected a {} parameter, got {}" - .format(name, option_class, type(options))) + "Function {!r} expected a {} parameter, got {}".format( + name, option_class, type(options) + ) + ) return options -_wrapper_template = dedent("""\ +_wrapper_template = dedent( + """\ def make_wrapper(func, option_class): def {func_name}({args_sig}{kwonly}, memory_pool=None): return func.call([{args_sig}], None, memory_pool) return {func_name} - """) + """ +) -_wrapper_options_template = dedent("""\ +_wrapper_options_template = dedent( + """\ def make_wrapper(func, option_class): def {func_name}({args_sig}{kwonly}, options=None, memory_pool=None, **kwargs): @@ -179,14 +203,15 @@ def {func_name}({args_sig}{kwonly}, options=None, memory_pool=None, kwargs) return func.call([{args_sig}], options, memory_pool) return {func_name} - """) + """ +) def _wrap_function(name, func): option_class = _get_options_class(func) arg_names = _get_arg_names(func) - args_sig = ', '.join(arg_names) - kwonly = '' if arg_names[-1].startswith('*') else ', *' + args_sig = ", ".join(arg_names) + kwonly = "" if arg_names[-1].startswith("*") else ", *" # Generate templated wrapper, so that the signature matches # the documented argument names. 
@@ -195,9 +220,10 @@ def _wrap_function(name, func): template = _wrapper_options_template else: template = _wrapper_template - exec(template.format(func_name=name, args_sig=args_sig, kwonly=kwonly), - globals(), ns) - wrapper = ns['make_wrapper'](func, option_class) + exec( + template.format(func_name=name, args_sig=args_sig, kwonly=kwonly), globals(), ns + ) + wrapper = ns["make_wrapper"](func, option_class) return _decorate_compute_function(wrapper, name, func, option_class) @@ -213,8 +239,7 @@ def _make_global_functions(): reg = function_registry() # Avoid clashes with Python keywords - rewrites = {'and': 'and_', - 'or': 'or_'} + rewrites = {"and": "and_", "or": "or_"} for cpp_name in reg.list_functions(): name = rewrites.get(cpp_name, cpp_name) @@ -298,8 +323,7 @@ def match_substring(array, pattern): ------- result : pyarrow.Array or pyarrow.ChunkedArray """ - return call_function("match_substring", [array], - MatchSubstringOptions(pattern)) + return call_function("match_substring", [array], MatchSubstringOptions(pattern)) def sum(array): @@ -314,7 +338,7 @@ def sum(array): ------- sum : pyarrow.Scalar """ - return call_function('sum', [array]) + return call_function("sum", [array]) def mode(array, n=1): @@ -346,7 +370,7 @@ def mode(array, n=1): return call_function("mode", [array], options) -def filter(data, mask, null_selection_behavior='drop'): +def filter(data, mask, null_selection_behavior="drop"): """ Select values (or records) from array- or table-like data given boolean filter, where true values are selected. @@ -387,7 +411,7 @@ def filter(data, mask, null_selection_behavior='drop'): ] """ options = FilterOptions(null_selection_behavior) - return call_function('filter', [data, mask], options) + return call_function("filter", [data, mask], options) def take(data, indices, *, boundscheck=True, memory_pool=None): @@ -428,7 +452,7 @@ def take(data, indices, *, boundscheck=True, memory_pool=None): ] """ options = TakeOptions(boundscheck=boundscheck) - return call_function('take', [data, indices], options, memory_pool) + return call_function("take", [data, indices], options, memory_pool) def fill_null(values, fill_value): diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd index e10ef1e3a5e..983ee0df0f1 100644 --- a/python/pyarrow/includes/libarrow.pxd +++ b/python/pyarrow/includes/libarrow.pxd @@ -1802,6 +1802,19 @@ cdef extern from "arrow/compute/api.h" namespace "arrow::compute" nogil: CFilterOptions(CFilterNullSelectionBehavior null_selection) CFilterNullSelectionBehavior null_selection_behavior + enum CDictionaryEncodeNullEncodingBehavior \ + "arrow::compute::DictionaryEncodeOptions::NullEncodingBehavior": + CDictionaryEncodeNullEncodingBehavior_ENCODE \ + "arrow::compute::DictionaryEncodeOptions::ENCODE" + CDictionaryEncodeNullEncodingBehavior_MASK \ + "arrow::compute::DictionaryEncodeOptions::MASK" + + cdef cppclass CDictionaryEncodeOptions \ + "arrow::compute::DictionaryEncodeOptions"(CFunctionOptions): + CDictionaryEncodeOptions() + CDictionaryEncodeOptions(CDictionaryEncodeNullEncodingBehavior null_encoding) + CDictionaryEncodeNullEncodingBehavior null_encoding + cdef cppclass CTakeOptions \ " arrow::compute::TakeOptions"(CFunctionOptions): CTakeOptions(c_bool boundscheck) From 5b18c961ad265d2600fd6fa9c14219cfdc8335b1 Mon Sep 17 00:00:00 2001 From: Weston Pace Date: Wed, 10 Feb 2021 22:51:05 -1000 Subject: [PATCH 10/33] WIP --- cpp/src/arrow/dataset/expression.cc | 4 +++- cpp/src/arrow/dataset/expression.h | 2 +- 
cpp/src/arrow/dataset/expression_test.cc | 9 ++++++++- cpp/src/arrow/dataset/partition.cc | 14 +++++++++----- cpp/src/arrow/dataset/partition.h | 1 + cpp/src/arrow/dataset/partition_test.cc | 23 ++++++++++++++++------- python/pyarrow/table.pxi | 5 +++-- 7 files changed, 41 insertions(+), 17 deletions(-) diff --git a/cpp/src/arrow/dataset/expression.cc b/cpp/src/arrow/dataset/expression.cc index d5bcd3fb0eb..6afe9309c54 100644 --- a/cpp/src/arrow/dataset/expression.cc +++ b/cpp/src/arrow/dataset/expression.cc @@ -51,7 +51,9 @@ Expression::Expression(Parameter parameter) Expression literal(Datum lit) { return Expression(std::move(lit)); } -Expression null_literal() { return Expression(Datum()); } +Expression null_literal(const std::shared_ptr& type) { + return Expression(MakeNullScalar(type)); +} Expression field_ref(FieldRef ref) { return Expression(Expression::Parameter{std::move(ref), {}}); diff --git a/cpp/src/arrow/dataset/expression.h b/cpp/src/arrow/dataset/expression.h index 33ffdddb8a6..79d7f077f23 100644 --- a/cpp/src/arrow/dataset/expression.h +++ b/cpp/src/arrow/dataset/expression.h @@ -136,7 +136,7 @@ ARROW_DS_EXPORT Expression literal(Datum lit); ARROW_DS_EXPORT -Expression null_literal(); +Expression null_literal(const std::shared_ptr& type); template Expression literal(Arg&& arg) { diff --git a/cpp/src/arrow/dataset/expression_test.cc b/cpp/src/arrow/dataset/expression_test.cc index 2f0110255ec..81a9c74fad1 100644 --- a/cpp/src/arrow/dataset/expression_test.cc +++ b/cpp/src/arrow/dataset/expression_test.cc @@ -250,6 +250,9 @@ TEST(Expression, Hash) { EXPECT_FALSE(set.emplace(literal(1)).second) << "already inserted"; EXPECT_TRUE(set.emplace(literal(3)).second); + EXPECT_TRUE(set.emplace(null_literal(int32())).second); + EXPECT_FALSE(set.emplace(null_literal(int32())).second) << "already inserted"; + EXPECT_TRUE(set.emplace(null_literal(float32())).second); // NB: no validation on construction; we couldn't execute // add with zero arguments EXPECT_TRUE(set.emplace(call("add", {})).second); @@ -258,7 +261,7 @@ TEST(Expression, Hash) { // NB: unbound expressions don't check for availability in any registry EXPECT_TRUE(set.emplace(call("widgetify", {})).second); - EXPECT_EQ(set.size(), 6); + EXPECT_EQ(set.size(), 8); } TEST(Expression, IsScalarExpression) { @@ -1013,6 +1016,10 @@ TEST(Expression, SimplifyWithGuarantee) { Simplify{greater(field_ref("dict_i32"), literal(int64_t(1)))} .WithGuarantee(equal(field_ref("dict_i32"), literal(0))) .Expect(false); + + Simplify{null_literal(int32())} + .WithGuarantee(null_literal(int32())) + .Expect(literal(true)); } TEST(Expression, SimplifyThenExecute) { diff --git a/cpp/src/arrow/dataset/partition.cc b/cpp/src/arrow/dataset/partition.cc index 595cce8021d..c5f55d73b69 100644 --- a/cpp/src/arrow/dataset/partition.cc +++ b/cpp/src/arrow/dataset/partition.cc @@ -147,7 +147,9 @@ Result KeyValuePartitioning::ConvertKey(const Key& key) const { std::shared_ptr converted; - if (field->type()->id() == Type::DICTIONARY) { + if (key.null) { + converted = MakeNullScalar(field->type()); + } else if (field->type()->id() == Type::DICTIONARY) { if (dictionaries_.empty() || dictionaries_[field_index] == nullptr) { return Status::Invalid("No dictionary provided for dictionary field ", field->ToString()); @@ -230,7 +232,7 @@ std::vector DirectoryPartitioning::ParseKeys( for (auto&& segment : fs::internal::SplitAbstractPath(path)) { if (i >= schema_->num_fields()) break; - keys.push_back({schema_->field(i++)->name(), std::move(segment)}); + 
keys.push_back({schema_->field(i++)->name(), std::move(segment), false}); } return keys; @@ -419,9 +421,9 @@ util::optional HivePartitioning::ParseKey( auto value = segment.substr(name_end + 1); if (value == null_fallback) { - return util::nullopt; + return Key{segment.substr(0, name_end), "", true}; } - return Key{segment.substr(0, name_end), segment.substr(name_end + 1)}; + return Key{segment.substr(0, name_end), segment.substr(name_end + 1), false}; } std::vector HivePartitioning::ParseKeys( @@ -443,7 +445,9 @@ Result HivePartitioning::FormatValues(const ScalarVector& values) c for (int i = 0; i < schema_->num_fields(); ++i) { const std::string& name = schema_->field(i)->name(); - if (values[i] == nullptr || !values[i]->is_valid) { + if (values[i] == nullptr) { + segments[i] = ""; + } else if (!values[i]->is_valid) { // If no key is available just provide a placeholder segment to maintain the // field_index <-> path nesting relation segments[i] = name + "=" + null_fallback_; diff --git a/cpp/src/arrow/dataset/partition.h b/cpp/src/arrow/dataset/partition.h index 5cdf7a1df66..e5afd00c76d 100644 --- a/cpp/src/arrow/dataset/partition.h +++ b/cpp/src/arrow/dataset/partition.h @@ -125,6 +125,7 @@ class ARROW_DS_EXPORT KeyValuePartitioning : public Partitioning { /// of a scalar value struct Key { std::string name, value; + bool null; }; static Status SetDefaultValuesFromKeys(const Expression& expr, diff --git a/cpp/src/arrow/dataset/partition_test.cc b/cpp/src/arrow/dataset/partition_test.cc index 2558af293da..7e19e4f382d 100644 --- a/cpp/src/arrow/dataset/partition_test.cc +++ b/cpp/src/arrow/dataset/partition_test.cc @@ -297,13 +297,16 @@ TEST_F(TestPartitioning, DiscoverSchemaSegfault) { TEST_F(TestPartitioning, HivePartitioning) { partitioning_ = std::make_shared( - schema({field("alpha", int32()), field("beta", float32())})); + schema({field("alpha", int32()), field("beta", float32())}), ArrayVector(), "xyz"); AssertParse("/alpha=0/beta=3.25", and_(equal(field_ref("alpha"), literal(0)), equal(field_ref("beta"), literal(3.25f)))); AssertParse("/beta=3.25/alpha=0", and_(equal(field_ref("beta"), literal(3.25f)), equal(field_ref("alpha"), literal(0)))); AssertParse("/alpha=0", equal(field_ref("alpha"), literal(0))); + AssertParse("/alpha=xyz/beta=3.25", + and_(equal(field_ref("alpha"), null_literal(int32())), + equal(field_ref("beta"), literal(3.25f)))); AssertParse("/beta=3.25", equal(field_ref("beta"), literal(3.25f))); AssertParse("", literal(true)); @@ -332,9 +335,18 @@ TEST_F(TestPartitioning, HivePartitioningFormat) { AssertFormat(and_(equal(field_ref("beta"), literal(3.25f)), equal(field_ref("alpha"), literal(0))), "alpha=0/beta=3.25"); - AssertFormat(equal(field_ref("alpha"), literal(0)), "alpha=0/beta=xyz"); - AssertFormat(equal(field_ref("beta"), literal(3.25f)), "alpha=xyz/beta=3.25"); - AssertFormat(literal(true), "alpha=xyz/beta=xyz"); + AssertFormat(equal(field_ref("alpha"), literal(0)), "alpha=0"); + AssertFormat(and_(equal(field_ref("alpha"), literal(0)), + equal(field_ref("beta"), null_literal(float32()))), + "alpha=0/beta=xyz"); + AssertFormat(and_(equal(field_ref("alpha"), null_literal(int32())), + equal(field_ref("beta"), literal(3.25f))), + "alpha=xyz/beta=3.25"); + AssertFormat(literal(true), ""); + + AssertFormat(and_(equal(field_ref("alpha"), null_literal(int32())), + equal(field_ref("beta"), null_literal(float32()))), + "alpha=xyz/beta=xyz"); ASSERT_OK_AND_ASSIGN(written_schema_, written_schema_->AddField(0, field("gamma", utf8()))); @@ -343,9 +355,6 @@ 
TEST_F(TestPartitioning, HivePartitioningFormat) { equal(field_ref("beta"), literal(3.25f))}), "alpha=0/beta=3.25"); - AssertFormat(equal(field_ref("alpha"), literal(MakeNullScalar(int32()))), - "alpha=xyz/beta=xyz"); - // written_schema_ is incompatible with partitioning_'s schema written_schema_ = schema({field("alpha", utf8()), field("beta", utf8())}); AssertFormatError( diff --git a/python/pyarrow/table.pxi b/python/pyarrow/table.pxi index c6b0b4180b6..3f1fc28ee60 100644 --- a/python/pyarrow/table.pxi +++ b/python/pyarrow/table.pxi @@ -276,7 +276,7 @@ cdef class ChunkedArray(_PandasConvertible): """ return _pc().cast(self, target_type, safe=safe) - def dictionary_encode(self): + def dictionary_encode(self, null_encoding='mask'): """ Compute dictionary-encoded representation of array @@ -285,7 +285,8 @@ cdef class ChunkedArray(_PandasConvertible): pyarrow.ChunkedArray Same chunking as the input, all chunks share a common dictionary. """ - return _pc().call_function('dictionary_encode', [self]) + options = _pc().DictionaryEncodeOptions(null_encoding) + return _pc().call_function('dictionary_encode', [self], options) def flatten(self, MemoryPool memory_pool=None): """ From 613b286c2d1ed1ad7998a4458d216a9d5d05a83e Mon Sep 17 00:00:00 2001 From: Weston Pace Date: Wed, 10 Feb 2021 23:38:07 -1000 Subject: [PATCH 11/33] WIP --- cpp/src/arrow/dataset/expression_test.cc | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/cpp/src/arrow/dataset/expression_test.cc b/cpp/src/arrow/dataset/expression_test.cc index 81a9c74fad1..6329c5f83e9 100644 --- a/cpp/src/arrow/dataset/expression_test.cc +++ b/cpp/src/arrow/dataset/expression_test.cc @@ -1017,8 +1017,12 @@ TEST(Expression, SimplifyWithGuarantee) { .WithGuarantee(equal(field_ref("dict_i32"), literal(0))) .Expect(false); - Simplify{null_literal(int32())} - .WithGuarantee(null_literal(int32())) + Simplify{equal(field_ref("i32"), literal(7))} + .WithGuarantee(equal(field_ref("i32"), literal(7))) + .Expect(literal(true)); + + Simplify{equal(field_ref("i32"), null_literal(int32()))} + .WithGuarantee(equal(field_ref("i32"), null_literal(int32()))) .Expect(literal(true)); } From 3f4ec252ae6251040fed2d035dcf159d5631f342 Mon Sep 17 00:00:00 2001 From: Weston Pace Date: Thu, 11 Feb 2021 13:43:21 -1000 Subject: [PATCH 12/33] Improved null handling in expression/partition a bit --- cpp/src/arrow/dataset/expression.cc | 90 +++++++++++++++++------- cpp/src/arrow/dataset/expression.h | 26 ++++++- cpp/src/arrow/dataset/expression_test.cc | 51 +++++++++----- cpp/src/arrow/dataset/partition.cc | 56 ++++++++++----- cpp/src/arrow/dataset/partition_test.cc | 17 ++--- 5 files changed, 166 insertions(+), 74 deletions(-) diff --git a/cpp/src/arrow/dataset/expression.cc b/cpp/src/arrow/dataset/expression.cc index 6afe9309c54..ef92ae09fe7 100644 --- a/cpp/src/arrow/dataset/expression.cc +++ b/cpp/src/arrow/dataset/expression.cc @@ -688,30 +688,42 @@ std::vector GuaranteeConjunctionMembers( // conjunction_members Status ExtractKnownFieldValuesImpl( std::vector* conjunction_members, - std::unordered_map* known_values) { - auto unconsumed_end = - std::partition(conjunction_members->begin(), conjunction_members->end(), - [](const Expression& expr) { - // search for an equality conditions between a field and a literal - auto call = expr.call(); - if (!call) return true; - - if (call->function_name == "equal") { - auto ref = call->arguments[0].field_ref(); - auto lit = call->arguments[1].literal(); - return !(ref && lit); - } - - return true; - }); + 
std::unordered_map* known_values) { + auto unconsumed_end = std::partition( + conjunction_members->begin(), conjunction_members->end(), + [](const Expression& expr) { + // search for an equality conditions between a field and a literal + auto call = expr.call(); + if (!call) return true; + + if (call->function_name == "equal") { + auto ref = call->arguments[0].field_ref(); + auto lit = call->arguments[1].literal(); + return !(ref && lit); + } + + if (call->function_name == "is_null" || call->function_name == "is_valid") { + auto ref = call->arguments[0].field_ref(); + return !ref; + } + + return true; + }); for (auto it = unconsumed_end; it != conjunction_members->end(); ++it) { auto call = CallNotNull(*it); - auto ref = call->arguments[0].field_ref(); - auto lit = call->arguments[1].literal(); - - known_values->emplace(*ref, *lit); + if (call->function_name == "equal") { + auto ref = call->arguments[0].field_ref(); + auto lit = call->arguments[1].literal(); + known_values->emplace(*ref, *lit); + } else if (call->function_name == "is_null") { + auto ref = call->arguments[0].field_ref(); + known_values->emplace(*ref, false); + } else if (call->function_name == "is_valid") { + auto ref = call->arguments[0].field_ref(); + known_values->emplace(*ref, true); + } } conjunction_members->erase(unconsumed_end, conjunction_members->end()); @@ -721,16 +733,16 @@ Status ExtractKnownFieldValuesImpl( } // namespace -Result> ExtractKnownFieldValues( - const Expression& guaranteed_true_predicate) { +Result> +ExtractKnownFieldValues(const Expression& guaranteed_true_predicate) { auto conjunction_members = GuaranteeConjunctionMembers(guaranteed_true_predicate); - std::unordered_map known_values; + std::unordered_map known_values; RETURN_NOT_OK(ExtractKnownFieldValuesImpl(&conjunction_members, &known_values)); return known_values; } Result ReplaceFieldsWithKnownValues( - const std::unordered_map& known_values, + const std::unordered_map& known_values, Expression expr) { if (!expr.IsBound()) { return Status::Invalid( @@ -743,7 +755,11 @@ Result ReplaceFieldsWithKnownValues( if (auto ref = expr.field_ref()) { auto it = known_values.find(*ref); if (it != known_values.end()) { - Datum lit = it->second; + const auto& known_value = it->second; + if (!known_value.concrete()) { + return expr; + } + auto lit = known_value.datum; if (expr.type()->id() == Type::DICTIONARY) { if (lit.is_scalar()) { // FIXME the "right" way to support this is adding support for scalars to @@ -760,9 +776,25 @@ Result ReplaceFieldsWithKnownValues( DictionaryScalar::Make(std::move(index), std::move(dictionary))); } } - ARROW_ASSIGN_OR_RAISE(lit, compute::Cast(it->second, expr.type())); + ARROW_ASSIGN_OR_RAISE(lit, compute::Cast(lit, expr.type())); return literal(std::move(lit)); } + } else if (auto call = expr.call()) { + if (call->function_name == "is_null") { + if (auto ref = call->arguments[0].field_ref()) { + auto it = known_values.find(*ref); + if (it != known_values.end()) { + return literal(!it->second.valid); + } + } + } else if (call->function_name == "is_valid") { + if (auto ref = call->arguments[0].field_ref()) { + auto it = known_values.find(*ref); + if (it != known_values.end()) { + return literal(it->second.valid); + } + } + } } return expr; }, @@ -939,7 +971,7 @@ Result SimplifyWithGuarantee(Expression expr, const Expression& guaranteed_true_predicate) { auto conjunction_members = GuaranteeConjunctionMembers(guaranteed_true_predicate); - std::unordered_map known_values; + std::unordered_map known_values; 
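  // known_values collects, from the guarantee's conjunction members, fields that
  // are pinned to a concrete Datum or merely known to be valid/null (see
  // KnownFieldValue); substituting these below lets equality, is_null and
  // is_valid tests on those fields fold down to literals.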
RETURN_NOT_OK(ExtractKnownFieldValuesImpl(&conjunction_members, &known_values)); ARROW_ASSIGN_OR_RAISE(expr, @@ -1226,6 +1258,10 @@ Expression greater_equal(Expression lhs, Expression rhs) { return call("greater_equal", {std::move(lhs), std::move(rhs)}); } +Expression is_null(Expression lhs) { return call("is_null", {std::move(lhs)}); } + +Expression is_valid(Expression lhs) { return call("is_valid", {std::move(lhs)}); } + Expression and_(Expression lhs, Expression rhs) { return call("and_kleene", {std::move(lhs), std::move(rhs)}); } diff --git a/cpp/src/arrow/dataset/expression.h b/cpp/src/arrow/dataset/expression.h index 79d7f077f23..785290e4bb2 100644 --- a/cpp/src/arrow/dataset/expression.h +++ b/cpp/src/arrow/dataset/expression.h @@ -162,10 +162,25 @@ Expression call(std::string function, std::vector arguments, ARROW_DS_EXPORT std::vector FieldsInExpression(const Expression&); +/// Represents either a concrete value or a hint that a field is valid/invalid +struct KnownFieldValue { + Datum datum; + bool valid; + + KnownFieldValue(const Datum& datum) + : datum(datum), valid(datum.length() == datum.null_count()) {} + KnownFieldValue(bool is_valid) : datum(), valid(is_valid) {} + + inline bool concrete() const { return datum.kind() != Datum::Kind::NONE; } + bool operator==(const KnownFieldValue& other) const { + return datum == other.datum && valid == other.valid; + } +}; + /// Assemble a mapping from field references to known values. ARROW_DS_EXPORT -Result> ExtractKnownFieldValues( - const Expression& guaranteed_true_predicate); +Result> +ExtractKnownFieldValues(const Expression& guaranteed_true_predicate); /// \defgroup expression-passes Functions for modification of Expressions /// @@ -194,7 +209,8 @@ Result FoldConstants(Expression); /// Simplify Expressions by replacing with known values of the fields which it references. ARROW_DS_EXPORT Result ReplaceFieldsWithKnownValues( - const std::unordered_map& known_values, Expression); + const std::unordered_map& known_values, + Expression); /// Simplify an expression by replacing subexpressions based on a guarantee: /// a boolean expression which is guaranteed to evaluate to `true`. 
For example, this is @@ -239,6 +255,10 @@ ARROW_DS_EXPORT Expression greater(Expression lhs, Expression rhs); ARROW_DS_EXPORT Expression greater_equal(Expression lhs, Expression rhs); +ARROW_DS_EXPORT Expression is_null(Expression lhs); + +ARROW_DS_EXPORT Expression is_valid(Expression lhs); + ARROW_DS_EXPORT Expression and_(Expression lhs, Expression rhs); ARROW_DS_EXPORT Expression and_(const std::vector&); ARROW_DS_EXPORT Expression or_(Expression lhs, Expression rhs); diff --git a/cpp/src/arrow/dataset/expression_test.cc b/cpp/src/arrow/dataset/expression_test.cc index 6329c5f83e9..3aa62319e85 100644 --- a/cpp/src/arrow/dataset/expression_test.cc +++ b/cpp/src/arrow/dataset/expression_test.cc @@ -606,6 +606,8 @@ TEST(Expression, FoldConstants) { // call against literals (3 + 2 == 5) ExpectFoldsTo(call("add", {literal(3), literal(2)}), literal(5)); + ExpectFoldsTo(call("equal", {literal(3), literal(3)}), literal(true)); + // call against literal and field_ref ExpectFoldsTo(call("add", {literal(3), field_ref("i32")}), call("add", {literal(3), field_ref("i32")})); @@ -674,8 +676,9 @@ TEST(Expression, FoldConstantsBoolean) { TEST(Expression, ExtractKnownFieldValues) { struct { - void operator()(Expression guarantee, - std::unordered_map expected) { + void operator()( + Expression guarantee, + std::unordered_map expected) { ASSERT_OK_AND_ASSIGN(auto actual, ExtractKnownFieldValues(guarantee)); EXPECT_THAT(actual, UnorderedElementsAreArray(expected)) << " guarantee: " << guarantee.ToString(); @@ -723,20 +726,20 @@ TEST(Expression, ExtractKnownFieldValues) { } TEST(Expression, ReplaceFieldsWithKnownValues) { - auto ExpectReplacesTo = - [](Expression expr, - std::unordered_map known_values, - Expression unbound_expected) { - ASSERT_OK_AND_ASSIGN(expr, expr.Bind(*kBoringSchema)); - ASSERT_OK_AND_ASSIGN(auto expected, unbound_expected.Bind(*kBoringSchema)); - ASSERT_OK_AND_ASSIGN(auto replaced, - ReplaceFieldsWithKnownValues(known_values, expr)); + auto ExpectReplacesTo = [](Expression expr, + const std::unordered_map& known_values, + Expression unbound_expected) { + ASSERT_OK_AND_ASSIGN(expr, expr.Bind(*kBoringSchema)); + ASSERT_OK_AND_ASSIGN(auto expected, unbound_expected.Bind(*kBoringSchema)); + ASSERT_OK_AND_ASSIGN(auto replaced, ReplaceFieldsWithKnownValues(known_values, expr)); - EXPECT_EQ(replaced, expected); - ExpectIdenticalIfUnchanged(replaced, expr); - }; + EXPECT_EQ(replaced, expected); + ExpectIdenticalIfUnchanged(replaced, expr); + }; - std::unordered_map i32_is_3{{"i32", Datum(3)}}; + std::unordered_map i32_is_3{ + {"i32", Datum(3)}}; ExpectReplacesTo(literal(1), i32_is_3, literal(1)); @@ -768,6 +771,14 @@ TEST(Expression, ReplaceFieldsWithKnownValues) { }), literal(2), })); + + std::unordered_map a_valid_b_invalid{ + {"a", true}, {"b", false}}; + + ExpectReplacesTo(is_null(field_ref("a")), a_valid_b_invalid, literal(false)); + ExpectReplacesTo(is_valid(field_ref("a")), a_valid_b_invalid, literal(true)); + ExpectReplacesTo(is_null(field_ref("b")), a_valid_b_invalid, literal(true)); + ExpectReplacesTo(is_valid(field_ref("b")), a_valid_b_invalid, literal(false)); } struct { @@ -1021,8 +1032,16 @@ TEST(Expression, SimplifyWithGuarantee) { .WithGuarantee(equal(field_ref("i32"), literal(7))) .Expect(literal(true)); - Simplify{equal(field_ref("i32"), null_literal(int32()))} - .WithGuarantee(equal(field_ref("i32"), null_literal(int32()))) + Simplify{equal(field_ref("i32"), literal(7))} + .WithGuarantee(not_(equal(field_ref("i32"), literal(7)))) + .Expect(equal(field_ref("i32"), 
literal(7))); + + Simplify{is_null(field_ref("i32"))} + .WithGuarantee(is_null(field_ref("i32"))) + .Expect(literal(true)); + + Simplify{is_valid(field_ref("i32"))} + .WithGuarantee(is_valid(field_ref("i32"))) .Expect(literal(true)); } diff --git a/cpp/src/arrow/dataset/partition.cc b/cpp/src/arrow/dataset/partition.cc index c5f55d73b69..2afaf414f9d 100644 --- a/cpp/src/arrow/dataset/partition.cc +++ b/cpp/src/arrow/dataset/partition.cc @@ -74,15 +74,26 @@ Status KeyValuePartitioning::SetDefaultValuesFromKeys(const Expression& expr, RecordBatchProjector* projector) { ARROW_ASSIGN_OR_RAISE(auto known_values, ExtractKnownFieldValues(expr)); for (const auto& ref_value : known_values) { - if (!ref_value.second.is_scalar()) { - return Status::Invalid("non-scalar partition key ", ref_value.second.ToString()); + const auto& known_value = ref_value.second; + if (known_value.concrete() && !known_value.datum.is_scalar()) { + return Status::Invalid("non-scalar partition key ", known_value.datum.ToString()); } ARROW_ASSIGN_OR_RAISE(auto match, ref_value.first.FindOneOrNone(*projector->schema())); if (match.empty()) continue; - RETURN_NOT_OK(projector->SetDefaultValue(match, ref_value.second.scalar())); + + const auto& field = projector->schema()->field(match[0]); + if (known_value.concrete()) { + RETURN_NOT_OK(projector->SetDefaultValue(match, known_value.datum.scalar())); + } else if (known_value.valid) { + return Status::Invalid( + "Partition expression not defined enough to set default value for ", + ref_value.first.name()); + } else { + RETURN_NOT_OK(projector->SetDefaultValue(match, MakeNullScalar(field->type()))); + } } return Status::OK(); } @@ -148,7 +159,7 @@ Result KeyValuePartitioning::ConvertKey(const Key& key) const { std::shared_ptr converted; if (key.null) { - converted = MakeNullScalar(field->type()); + return is_null(field_ref(field->name())); } else if (field->type()->id() == Type::DICTIONARY) { if (dictionaries_.empty() || dictionaries_[field_index] == nullptr) { return Status::Invalid("No dictionary provided for dictionary field ", @@ -198,27 +209,34 @@ Result KeyValuePartitioning::Format(const Expression& expr) const { ARROW_ASSIGN_OR_RAISE(auto known_values, ExtractKnownFieldValues(expr)); for (const auto& ref_value : known_values) { - if (!ref_value.second.is_scalar()) { - return Status::Invalid("non-scalar partition key ", ref_value.second.ToString()); + const auto& known_value = ref_value.second; + if (known_value.concrete() && !known_value.datum.is_scalar()) { + return Status::Invalid("non-scalar partition key ", known_value.datum.ToString()); } ARROW_ASSIGN_OR_RAISE(auto match, ref_value.first.FindOneOrNone(*schema_)); if (match.empty()) continue; - auto value = ref_value.second.scalar(); - const auto& field = schema_->field(match[0]); - if (!value->type->Equals(field->type())) { - return Status::TypeError("scalar ", value->ToString(), " (of type ", *value->type, - ") is invalid for ", field->ToString()); - } - if (value->type->id() == Type::DICTIONARY) { - ARROW_ASSIGN_OR_RAISE( - value, checked_cast(*value).GetEncodedValue()); - } + if (known_value.concrete()) { + auto value = known_value.datum.scalar(); + if (!value->type->Equals(field->type())) { + return Status::TypeError("scalar ", value->ToString(), " (of type ", *value->type, + ") is invalid for ", field->ToString()); + } - values[match[0]] = std::move(value); + if (value->type->id() == Type::DICTIONARY) { + ARROW_ASSIGN_OR_RAISE( + value, checked_cast(*value).GetEncodedValue()); + } + + values[match[0]] = 
std::move(value); + } else { + if (!known_value.valid) { + values[match[0]] = MakeNullScalar(field->type()); + } + } } return FormatValues(values); @@ -471,7 +489,9 @@ class HivePartitioningFactory : public KeyValuePartitioningFactory { for (auto path : paths) { for (auto&& segment : fs::internal::SplitAbstractPath(path)) { if (auto key = HivePartitioning::ParseKey(segment, null_fallback_)) { - RETURN_NOT_OK(InsertRepr(key->name, key->value)); + if (!key->null) { + RETURN_NOT_OK(InsertRepr(key->name, key->value)); + } } } } diff --git a/cpp/src/arrow/dataset/partition_test.cc b/cpp/src/arrow/dataset/partition_test.cc index 7e19e4f382d..b8dade238c0 100644 --- a/cpp/src/arrow/dataset/partition_test.cc +++ b/cpp/src/arrow/dataset/partition_test.cc @@ -304,9 +304,8 @@ TEST_F(TestPartitioning, HivePartitioning) { AssertParse("/beta=3.25/alpha=0", and_(equal(field_ref("beta"), literal(3.25f)), equal(field_ref("alpha"), literal(0)))); AssertParse("/alpha=0", equal(field_ref("alpha"), literal(0))); - AssertParse("/alpha=xyz/beta=3.25", - and_(equal(field_ref("alpha"), null_literal(int32())), - equal(field_ref("beta"), literal(3.25f)))); + AssertParse("/alpha=xyz/beta=3.25", and_(is_null(field_ref("alpha")), + equal(field_ref("beta"), literal(3.25f)))); AssertParse("/beta=3.25", equal(field_ref("beta"), literal(3.25f))); AssertParse("", literal(true)); @@ -336,16 +335,14 @@ TEST_F(TestPartitioning, HivePartitioningFormat) { equal(field_ref("alpha"), literal(0))), "alpha=0/beta=3.25"); AssertFormat(equal(field_ref("alpha"), literal(0)), "alpha=0"); - AssertFormat(and_(equal(field_ref("alpha"), literal(0)), - equal(field_ref("beta"), null_literal(float32()))), + AssertFormat(and_(equal(field_ref("alpha"), literal(0)), is_null(field_ref("beta"))), "alpha=0/beta=xyz"); - AssertFormat(and_(equal(field_ref("alpha"), null_literal(int32())), - equal(field_ref("beta"), literal(3.25f))), - "alpha=xyz/beta=3.25"); + AssertFormat( + and_(is_null(field_ref("alpha")), equal(field_ref("beta"), literal(3.25f))), + "alpha=xyz/beta=3.25"); AssertFormat(literal(true), ""); - AssertFormat(and_(equal(field_ref("alpha"), null_literal(int32())), - equal(field_ref("beta"), null_literal(float32()))), + AssertFormat(and_(is_null(field_ref("alpha")), is_null(field_ref("beta"))), "alpha=xyz/beta=xyz"); ASSERT_OK_AND_ASSIGN(written_schema_, From 79dda1afb264d35f690e5bc2df2e0f4037b1dcb5 Mon Sep 17 00:00:00 2001 From: Weston Pace Date: Thu, 11 Feb 2021 15:15:58 -1000 Subject: [PATCH 13/33] Added the python half of the new extract known values --- cpp/src/arrow/dataset/expression.h | 3 ++- python/pyarrow/_dataset.pyx | 15 +++++++++++---- python/pyarrow/includes/libarrow_dataset.pxd | 9 ++++++++- python/pyarrow/tests/test_dataset.py | 20 +++++++++++++------- 4 files changed, 34 insertions(+), 13 deletions(-) diff --git a/cpp/src/arrow/dataset/expression.h b/cpp/src/arrow/dataset/expression.h index 785290e4bb2..1e895febac6 100644 --- a/cpp/src/arrow/dataset/expression.h +++ b/cpp/src/arrow/dataset/expression.h @@ -167,8 +167,9 @@ struct KnownFieldValue { Datum datum; bool valid; + KnownFieldValue() : datum(), valid(false) {} KnownFieldValue(const Datum& datum) - : datum(datum), valid(datum.length() == datum.null_count()) {} + : datum(datum), valid(datum.length() != datum.null_count()) {} KnownFieldValue(bool is_valid) : datum(), valid(is_valid) {} inline bool concrete() const { return datum.kind() != Datum::Kind::NONE; } diff --git a/python/pyarrow/_dataset.pyx b/python/pyarrow/_dataset.pyx index 5fa2b118ed5..acd5d9602b5 100644 --- 
a/python/pyarrow/_dataset.pyx +++ b/python/pyarrow/_dataset.pyx @@ -206,6 +206,10 @@ cdef class Expression(_Weakrefable): """Checks whether the expression is not-null (valid)""" return Expression._call("is_valid", [self]) + def is_null(self): + """Checks whether the expression is null""" + return Expression._call("is_null", [self]) + def cast(self, type, bint safe=True): """Explicitly change the expression's data type""" cdef shared_ptr[CCastOptions] c_options @@ -2351,14 +2355,17 @@ def _get_partition_keys(Expression partition_expression): """ cdef: CExpression expr = partition_expression.unwrap() - pair[CFieldRef, CDatum] ref_val + pair[CFieldRef, CKnownFieldValue] ref_val out = {} for ref_val in GetResultValue(CExtractKnownFieldValues(expr)): assert ref_val.first.name() != nullptr - assert ref_val.second.kind() == DatumType_SCALAR - val = pyarrow_wrap_scalar(ref_val.second.scalar()) - out[frombytes(deref(ref_val.first.name()))] = val.as_py() + if ref_val.second.valid: + assert ref_val.second.datum.kind() == DatumType_SCALAR + val = pyarrow_wrap_scalar(ref_val.second.datum.scalar()) + out[frombytes(deref(ref_val.first.name()))] = val.as_py() + else: + out[frombytes(deref(ref_val.first.name()))] = None return out diff --git a/python/pyarrow/includes/libarrow_dataset.pxd b/python/pyarrow/includes/libarrow_dataset.pxd index 93bc0edddc1..2127b3dccff 100644 --- a/python/pyarrow/includes/libarrow_dataset.pxd +++ b/python/pyarrow/includes/libarrow_dataset.pxd @@ -315,7 +315,14 @@ cdef extern from "arrow/dataset/api.h" namespace "arrow::dataset" nogil: const CExpression& partition_expression, CRecordBatchProjector* projector) - cdef CResult[unordered_map[CFieldRef, CDatum, CFieldRefHash]] \ + cdef cppclass CKnownFieldValue "arrow::dataset::KnownFieldValue": + CDatum datum + c_bool valid + CKnownFieldValue(CDatum datum) + CKnownFieldValue(c_bool valid) + c_bool operator==(const CKnownFieldValue&) const + + cdef CResult[unordered_map[CFieldRef, CKnownFieldValue, CFieldRefHash]] \ CExtractKnownFieldValues "arrow::dataset::ExtractKnownFieldValues"( const CExpression& partition_expression) diff --git a/python/pyarrow/tests/test_dataset.py b/python/pyarrow/tests/test_dataset.py index 48ef421694d..a2d75f8e4a4 100644 --- a/python/pyarrow/tests/test_dataset.py +++ b/python/pyarrow/tests/test_dataset.py @@ -489,6 +489,9 @@ def test_partition_keys(): assert ds._get_partition_keys(a) == {"a": "a"} assert ds._get_partition_keys(a & b & c) == {f: f for f in "abc"} + null = ds.field("a").is_null() + assert ds._get_partition_keys(null) == {"a": None} + nope = ds.field("d") >= 3 assert ds._get_partition_keys(nope) == {} assert ds._get_partition_keys(a & nope) == {"a": "a"} @@ -1710,16 +1713,17 @@ def dict_type(key): @pytest.mark.pandas def test_dataset_partitioned_dictionary_type_reconstruct(tempdir): # https://issues.apache.org/jira/browse/ARROW-11400 - table = pa.table({'part': np.repeat(['A', 'B'], 5), 'col': range(10)}) - part = ds.partitioning(table.select(['part']).schema, flavor="hive") + table = pa.table({"part": np.repeat(["A", "B"], 5), "col": range(10)}) + part = ds.partitioning(table.select(["part"]).schema, flavor="hive") ds.write_dataset(table, tempdir, partitioning=part, format="feather") dataset = ds.dataset( - tempdir, format="feather", - partitioning=ds.HivePartitioning.discover(infer_dictionary=True) + tempdir, + format="feather", + partitioning=ds.HivePartitioning.discover(infer_dictionary=True), ) expected = pa.table( - {'col': table['col'], 'part': table['part'].dictionary_encode()} + 
{"col": table["col"], "part": table["part"].dictionary_encode()} ) assert dataset.to_table().equals(expected) fragment = list(dataset.get_fragments())[0] @@ -1732,8 +1736,10 @@ def test_dataset_partitioned_dictionary_type_reconstruct(tempdir): restored = pickle.loads(pickle.dumps(fragment)) assert restored.to_table(schema=dataset.schema).equals(expected[:5]) # to_pandas call triggers computation of the actual dictionary values - assert restored.to_table(schema=dataset.schema).to_pandas().equals( - expected[:5].to_pandas() + assert ( + restored.to_table(schema=dataset.schema) + .to_pandas() + .equals(expected[:5].to_pandas()) ) assert restored.partition_expression.equals(part_expr) From de7be7b89508ec355c7f97b84276893ccaa155ed Mon Sep 17 00:00:00 2001 From: Weston Pace Date: Thu, 11 Feb 2021 15:35:01 -1000 Subject: [PATCH 14/33] Lint --- cpp/src/arrow/dataset/expression.h | 5 +- python/pyarrow/_compute.pyx | 5 +- python/pyarrow/compute.py | 6 +- python/pyarrow/includes/libarrow.pxd | 2 +- python/pyarrow/tests/test_dataset.py | 106 ++++++++++++++++++--------- 5 files changed, 82 insertions(+), 42 deletions(-) diff --git a/cpp/src/arrow/dataset/expression.h b/cpp/src/arrow/dataset/expression.h index 1e895febac6..1bbcb471015 100644 --- a/cpp/src/arrow/dataset/expression.h +++ b/cpp/src/arrow/dataset/expression.h @@ -168,9 +168,10 @@ struct KnownFieldValue { bool valid; KnownFieldValue() : datum(), valid(false) {} - KnownFieldValue(const Datum& datum) + KnownFieldValue(const Datum& datum) // NOLINT implicit conversion : datum(datum), valid(datum.length() != datum.null_count()) {} - KnownFieldValue(bool is_valid) : datum(), valid(is_valid) {} + KnownFieldValue(bool is_valid) // NOLINT implicit conversion + : datum(), valid(is_valid) {} inline bool concrete() const { return datum.kind() != Datum::Kind::NONE; } bool operator==(const KnownFieldValue& other) const { diff --git a/python/pyarrow/_compute.pyx b/python/pyarrow/_compute.pyx index 8dea882334a..d3d5dc510a3 100644 --- a/python/pyarrow/_compute.pyx +++ b/python/pyarrow/_compute.pyx @@ -648,6 +648,7 @@ class FilterOptions(_FilterOptions): def __init__(self, null_selection_behavior='drop'): self._set_options(null_selection_behavior) + cdef class _DictionaryEncodeOptions(FunctionOptions): cdef: unique_ptr[CDictionaryEncodeOptions] dictionary_encode_options @@ -663,7 +664,9 @@ cdef class _DictionaryEncodeOptions(FunctionOptions): self.dictionary_encode_options.reset( new CDictionaryEncodeOptions(CDictionaryEncodeNullEncodingBehavior_MASK)) else: - raise ValueError('"{}" is not a valid null_encoding_behavior'.format(null_encoding_behavior)) + raise ValueError('"{}" is not a valid null_encoding_behavior'.format( + null_encoding_behavior)) + class DictionaryEncodeOptions(_DictionaryEncodeOptions): def __init__(self, null_encoding_behavior='mask'): diff --git a/python/pyarrow/compute.py b/python/pyarrow/compute.py index 1483d97a72d..1e437d43d4c 100644 --- a/python/pyarrow/compute.py +++ b/python/pyarrow/compute.py @@ -157,7 +157,8 @@ def _get_options_class(func): return globals()[class_name] except KeyError: warnings.warn( - "Python binding for {} not exposed".format(class_name), RuntimeWarning + "Python binding for {} not exposed".format( + class_name), RuntimeWarning ) return None @@ -221,7 +222,8 @@ def _wrap_function(name, func): else: template = _wrapper_template exec( - template.format(func_name=name, args_sig=args_sig, kwonly=kwonly), globals(), ns + template.format(func_name=name, args_sig=args_sig, + kwonly=kwonly), globals(), ns ) wrapper 
= ns["make_wrapper"](func, option_class) diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd index 983ee0df0f1..6423741ae50 100644 --- a/python/pyarrow/includes/libarrow.pxd +++ b/python/pyarrow/includes/libarrow.pxd @@ -1810,7 +1810,7 @@ cdef extern from "arrow/compute/api.h" namespace "arrow::compute" nogil: "arrow::compute::DictionaryEncodeOptions::MASK" cdef cppclass CDictionaryEncodeOptions \ - "arrow::compute::DictionaryEncodeOptions"(CFunctionOptions): + "arrow::compute::DictionaryEncodeOptions"(CFunctionOptions): CDictionaryEncodeOptions() CDictionaryEncodeOptions(CDictionaryEncodeNullEncodingBehavior null_encoding) CDictionaryEncodeNullEncodingBehavior null_encoding diff --git a/python/pyarrow/tests/test_dataset.py b/python/pyarrow/tests/test_dataset.py index a2d75f8e4a4..3b9eeafef28 100644 --- a/python/pyarrow/tests/test_dataset.py +++ b/python/pyarrow/tests/test_dataset.py @@ -200,7 +200,8 @@ def dataset(mockfs): selector = fs.FileSelector("subdir", recursive=True) options = ds.FileSystemFactoryOptions("subdir") options.partitioning = ds.DirectoryPartitioning( - pa.schema([pa.field("group", pa.int32()), pa.field("key", pa.string())]) + pa.schema([pa.field("group", pa.int32()), + pa.field("key", pa.string())]) ) factory = ds.FileSystemDatasetFactory(mockfs, selector, format, options) return factory.finish() @@ -338,7 +339,8 @@ def test_dataset(dataset): def test_scanner(dataset): - scanner = ds.Scanner.from_dataset(dataset, memory_pool=pa.default_memory_pool()) + scanner = ds.Scanner.from_dataset( + dataset, memory_pool=pa.default_memory_pool()) assert isinstance(scanner, ds.Scanner) assert len(list(scanner.scan())) == 2 @@ -368,13 +370,15 @@ def test_abstract_classes(): def test_partitioning(): - schema = pa.schema([pa.field("i64", pa.int64()), pa.field("f64", pa.float64())]) + schema = pa.schema([pa.field("i64", pa.int64()), + pa.field("f64", pa.float64())]) for klass in [ds.DirectoryPartitioning, ds.HivePartitioning]: partitioning = klass(schema) assert isinstance(partitioning, ds.Partitioning) partitioning = ds.DirectoryPartitioning( - pa.schema([pa.field("group", pa.int64()), pa.field("key", pa.float64())]) + pa.schema([pa.field("group", pa.int64()), + pa.field("key", pa.float64())]) ) expr = partitioning.parse("/3/3.14") assert isinstance(expr, ds.Expression) @@ -386,10 +390,12 @@ def test_partitioning(): partitioning.parse("/prefix/3/aaa") partitioning = ds.HivePartitioning( - pa.schema([pa.field("alpha", pa.int64()), pa.field("beta", pa.int64())]) + pa.schema([pa.field("alpha", pa.int64()), + pa.field("beta", pa.int64())]) ) expr = partitioning.parse("/alpha=0/beta=3") - expected = (ds.field("alpha") == ds.scalar(0)) & (ds.field("beta") == ds.scalar(3)) + expected = (ds.field("alpha") == ds.scalar(0)) & ( + ds.field("beta") == ds.scalar(3)) assert expr.equals(expected) for shouldfail in ["/alpha=one/beta=2", "/alpha=one", "/beta=two"]: @@ -397,7 +403,8 @@ def test_partitioning(): partitioning.parse(shouldfail) partitioning = ds.HivePartitioning( - pa.schema([pa.field("alpha", pa.int64()), pa.field("beta", pa.int64())]), + pa.schema([pa.field("alpha", pa.int64()), + pa.field("beta", pa.int64())]), None, "xyz", ) @@ -536,7 +543,8 @@ def test_file_format_pickling(): formats = [ ds.IpcFileFormat(), ds.CsvFileFormat(), - ds.CsvFileFormat(pa.csv.ParseOptions(delimiter="\t", ignore_empty_lines=True)), + ds.CsvFileFormat(pa.csv.ParseOptions( + delimiter="\t", ignore_empty_lines=True)), ds.ParquetFileFormat(), ds.ParquetFileFormat( 
read_options=ds.ParquetReadOptions(use_buffered_stream=True) @@ -568,13 +576,15 @@ def test_filesystem_factory(mockfs, paths_or_selector, pre_buffer): options = ds.FileSystemFactoryOptions("subdir") options.partitioning = ds.DirectoryPartitioning( - pa.schema([pa.field("group", pa.int32()), pa.field("key", pa.string())]) + pa.schema([pa.field("group", pa.int32()), + pa.field("key", pa.string())]) ) assert options.partition_base_dir == "subdir" assert options.selector_ignore_prefixes == [".", "_"] assert options.exclude_invalid_files is False - factory = ds.FileSystemDatasetFactory(mockfs, paths_or_selector, format, options) + factory = ds.FileSystemDatasetFactory( + mockfs, paths_or_selector, format, options) inspected_schema = factory.inspect() assert factory.inspect().equals( @@ -627,7 +637,8 @@ def test_filesystem_factory(mockfs, paths_or_selector, pre_buffer): def test_make_fragment(multisourcefs): parquet_format = ds.ParquetFileFormat() - dataset = ds.dataset("/plain", filesystem=multisourcefs, format=parquet_format) + dataset = ds.dataset( + "/plain", filesystem=multisourcefs, format=parquet_format) for path in dataset.files: fragment = parquet_format.make_fragment(path, multisourcefs) @@ -689,7 +700,8 @@ def test_make_parquet_fragment_from_buffer(): ) ) - cases = [(arrays, ds.ParquetFileFormat()), (dictionary_arrays, dictionary_format)] + cases = [(arrays, ds.ParquetFileFormat()), + (dictionary_arrays, dictionary_format)] for arrays, format_ in cases: table = pa.table(arrays, names=["alpha", "num", "animal"]) @@ -714,7 +726,8 @@ def _create_dataset_for_fragments(tempdir, chunk_size=None, filesystem=None): path = str(tempdir / "test_parquet_dataset") # write_to_dataset currently requires pandas - pq.write_to_dataset(table, path, partition_cols=["part"], chunk_size=chunk_size) + pq.write_to_dataset(table, path, partition_cols=[ + "part"], chunk_size=chunk_size) dataset = ds.dataset( path, format="parquet", partitioning="hive", filesystem=filesystem ) @@ -777,7 +790,8 @@ def test_fragments_reconstruct(tempdir): table, dataset = _create_dataset_for_fragments(tempdir) def assert_yields_projected(fragment, row_slice, columns=None, filter=None): - actual = fragment.to_table(schema=table.schema, columns=columns, filter=filter) + actual = fragment.to_table( + schema=table.schema, columns=columns, filter=filter) column_names = columns if columns else table.column_names assert actual.column_names == column_names @@ -824,13 +838,14 @@ def assert_yields_projected(fragment, row_slice, columns=None, filter=None): fragment.filesystem, partition_expression=fragment.partition_expression, ) - assert_yields_projected(new_fragment, (0, 4), filter=ds.field("part") == "a") + assert_yields_projected(new_fragment, (0, 4), + filter=ds.field("part") == "a") # Fragments don't contain the partition's columns if not provided to the # `to_table(schema=...)` method. 
pattern = ( - r"No match for FieldRef.Name\(part\) in " - + fragment.physical_schema.to_string(False, False, False) + r"No match for FieldRef.Name\(part\) in " + + fragment.physical_schema.to_string(False, False, False) ) with pytest.raises(ValueError, match=pattern): new_fragment = parquet_format.make_fragment( @@ -914,7 +929,8 @@ def test_fragments_parquet_row_groups_dictionary(tempdir): @pytest.mark.parquet def test_fragments_parquet_ensure_metadata(tempdir, open_logging_fs): fs, assert_opens = open_logging_fs - _, dataset = _create_dataset_for_fragments(tempdir, chunk_size=2, filesystem=fs) + _, dataset = _create_dataset_for_fragments( + tempdir, chunk_size=2, filesystem=fs) fragment = list(dataset.get_fragments())[0] # with default discovery, no metadata loaded @@ -1163,7 +1179,8 @@ def test_fragments_parquet_row_groups_reconstruct(tempdir): @pytest.mark.parquet def test_fragments_parquet_subset_ids(tempdir, open_logging_fs): fs, assert_opens = open_logging_fs - table, dataset = _create_dataset_for_fragments(tempdir, chunk_size=1, filesystem=fs) + table, dataset = _create_dataset_for_fragments( + tempdir, chunk_size=1, filesystem=fs) fragment = list(dataset.get_fragments())[0] # select with row group ids @@ -1190,7 +1207,8 @@ def test_fragments_parquet_subset_ids(tempdir, open_logging_fs): @pytest.mark.parquet def test_fragments_parquet_subset_filter(tempdir, open_logging_fs): fs, assert_opens = open_logging_fs - table, dataset = _create_dataset_for_fragments(tempdir, chunk_size=1, filesystem=fs) + table, dataset = _create_dataset_for_fragments( + tempdir, chunk_size=1, filesystem=fs) fragment = list(dataset.get_fragments())[0] # select with filter @@ -1240,7 +1258,8 @@ def test_partitioning_factory(mockfs): assert isinstance(partitioning_factory, ds.PartitioningFactory) options.partitioning_factory = partitioning_factory - factory = ds.FileSystemDatasetFactory(mockfs, paths_or_selector, format, options) + factory = ds.FileSystemDatasetFactory( + mockfs, paths_or_selector, format, options) inspected_schema = factory.inspect() # i64/f64 from data, group/key from "/1/xxx" and "/2/yyy" paths expected_schema = pa.schema( @@ -1269,7 +1288,8 @@ def test_partitioning_factory_dictionary(mockfs, infer_dictionary): ["group", "key"], infer_dictionary=infer_dictionary ) - factory = ds.FileSystemDatasetFactory(mockfs, paths_or_selector, format, options) + factory = ds.FileSystemDatasetFactory( + mockfs, paths_or_selector, format, options) inferred_schema = factory.inspect() if infer_dictionary: @@ -1396,7 +1416,8 @@ def test_open_dataset_list_of_files(tempdir): tables, (path1, path2) = _create_directory_of_files(tempdir) table = pa.concat_tables(tables) - datasets = [ds.dataset([path1, path2]), ds.dataset([str(path1), str(path2)])] + datasets = [ds.dataset([path1, path2]), + ds.dataset([str(path1), str(path2)])] datasets += [pickle.loads(pickle.dumps(d)) for d in datasets] for dataset in datasets: @@ -1504,7 +1525,8 @@ def test_construct_empty_dataset(): assert table.num_rows == 0 assert table.num_columns == 0 - empty = ds.dataset([], schema=pa.schema([("a", pa.int64()), ("a", pa.string())])) + empty = ds.dataset([], schema=pa.schema( + [("a", pa.int64()), ("a", pa.string())])) table = empty.to_table() assert table.num_rows == 0 assert table.num_columns == 2 @@ -1555,13 +1577,15 @@ def test_open_dataset_partitioned_directory(tempdir): _check_dataset_from_path(path, full_table) # specify partition scheme with discovery - dataset = ds.dataset(str(path), partitioning=ds.partitioning(flavor="hive")) + 
dataset = ds.dataset( + str(path), partitioning=ds.partitioning(flavor="hive")) expected_schema = table.schema.append(pa.field("part", pa.int32())) assert dataset.schema.equals(expected_schema) # specify partition scheme with discovery and relative path with change_cwd(tempdir): - dataset = ds.dataset("dataset/", partitioning=ds.partitioning(flavor="hive")) + dataset = ds.dataset( + "dataset/", partitioning=ds.partitioning(flavor="hive")) expected_schema = table.schema.append(pa.field("part", pa.int32())) assert dataset.schema.equals(expected_schema) @@ -1572,7 +1596,8 @@ def test_open_dataset_partitioned_directory(tempdir): # specify partition scheme with explicit scheme dataset = ds.dataset( str(path), - partitioning=ds.partitioning(pa.schema([("part", pa.int8())]), flavor="hive"), + partitioning=ds.partitioning( + pa.schema([("part", pa.int8())]), flavor="hive"), ) expected_schema = table.schema.append(pa.field("part", pa.int8())) assert dataset.schema.equals(expected_schema) @@ -1694,7 +1719,8 @@ def test_open_dataset_partitioned_dictionary_type( part_keys1, part_keys2 = partition_keys for part1 in part_keys1: for part2 in part_keys2: - path = basepath / fmt.format(part1 or null_value, part2 or null_value) + path = basepath / \ + fmt.format(part1 or null_value, part2 or null_value) path.mkdir(parents=True) pq.write_table(table, path / "test.parquet") @@ -2248,7 +2274,8 @@ def _create_metadata_file(root_path): metadata_collector.append(metadata) metadata_path = root_path / "_metadata" - pq.write_metadata(schema, metadata_path, metadata_collector=metadata_collector) + pq.write_metadata(schema, metadata_path, + metadata_collector=metadata_collector) return metadata_path @@ -2371,7 +2398,8 @@ def test_filter_mismatching_schema(tempdir): # specifying explicit schema, but that mismatches the schema of the data schema = pa.schema([("col", pa.int64())]) - dataset = ds.dataset(tempdir / "data.parquet", format="parquet", schema=schema) + dataset = ds.dataset(tempdir / "data.parquet", + format="parquet", schema=schema) # filtering on a column with such type mismatch should give a proper error with pytest.raises(TypeError): @@ -2410,7 +2438,8 @@ def test_dataset_project_null_column(tempdir): f = tempdir / "test_dataset_project_null_column.parquet" df.to_parquet(f, engine="pyarrow") - dataset = ds.dataset(f, format="parquet", schema=pa.schema([("col", pa.int64())])) + dataset = ds.dataset(f, format="parquet", + schema=pa.schema([("col", pa.int64())])) expected = pa.table({"col": pa.array([None, None, None], pa.int64())}) assert dataset.to_table().equals(expected) @@ -2433,7 +2462,8 @@ def _check_dataset_roundtrip( assert set(file_paths) == set(expected_files) # check that reading back in as dataset gives the same result - dataset2 = ds.dataset(base_dir_path, format="feather", partitioning=partitioning) + dataset2 = ds.dataset(base_dir_path, format="feather", + partitioning=partitioning) assert dataset2.to_table().equals(dataset.to_table()) @@ -2518,7 +2548,8 @@ def test_write_dataset_partitioned_dict(tempdir): # directory partitioning, dictionary partition columns dataset = ds.dataset( - directory, partitioning=ds.HivePartitioning.discover(infer_dictionary=True) + directory, partitioning=ds.HivePartitioning.discover( + infer_dictionary=True) ) target = tempdir / "partitioned-dir-target" expected_paths = [ @@ -2546,7 +2577,8 @@ def test_write_dataset_use_threads(tempdir): _ = _create_parquet_dataset_partitioned(directory) dataset = ds.dataset(directory, partitioning="hive") - partitioning = 
ds.partitioning(pa.schema([("part", pa.string())]), flavor="hive") + partitioning = ds.partitioning( + pa.schema([("part", pa.string())]), flavor="hive") target1 = tempdir / "partitioned1" ds.write_dataset( @@ -2587,7 +2619,8 @@ def test_write_table(tempdir): # with partitioning base_dir = tempdir / "partitioned" - partitioning = ds.partitioning(pa.schema([("part", pa.string())]), flavor="hive") + partitioning = ds.partitioning( + pa.schema([("part", pa.string())]), flavor="hive") ds.write_dataset( table, base_dir, @@ -2661,7 +2694,8 @@ def test_write_table_partitioned_dict(tempdir): partitioning = ds.partitioning(table.select(["part"]).schema) base_dir = tempdir / "dataset" - ds.write_dataset(table, base_dir, format="feather", partitioning=partitioning) + ds.write_dataset(table, base_dir, format="feather", + partitioning=partitioning) # check roundtrip partitioning_read = ds.DirectoryPartitioning.discover( From cd00e59b267aa1267c58e8026ae8e4b4be51d2a0 Mon Sep 17 00:00:00 2001 From: Weston Pace Date: Thu, 11 Feb 2021 15:37:17 -1000 Subject: [PATCH 15/33] Missed a test case --- python/pyarrow/tests/test_dataset.py | 110 +++++++++------------------ 1 file changed, 37 insertions(+), 73 deletions(-) diff --git a/python/pyarrow/tests/test_dataset.py b/python/pyarrow/tests/test_dataset.py index 3b9eeafef28..9bbffbc8d76 100644 --- a/python/pyarrow/tests/test_dataset.py +++ b/python/pyarrow/tests/test_dataset.py @@ -200,8 +200,7 @@ def dataset(mockfs): selector = fs.FileSelector("subdir", recursive=True) options = ds.FileSystemFactoryOptions("subdir") options.partitioning = ds.DirectoryPartitioning( - pa.schema([pa.field("group", pa.int32()), - pa.field("key", pa.string())]) + pa.schema([pa.field("group", pa.int32()), pa.field("key", pa.string())]) ) factory = ds.FileSystemDatasetFactory(mockfs, selector, format, options) return factory.finish() @@ -339,8 +338,7 @@ def test_dataset(dataset): def test_scanner(dataset): - scanner = ds.Scanner.from_dataset( - dataset, memory_pool=pa.default_memory_pool()) + scanner = ds.Scanner.from_dataset(dataset, memory_pool=pa.default_memory_pool()) assert isinstance(scanner, ds.Scanner) assert len(list(scanner.scan())) == 2 @@ -370,15 +368,13 @@ def test_abstract_classes(): def test_partitioning(): - schema = pa.schema([pa.field("i64", pa.int64()), - pa.field("f64", pa.float64())]) + schema = pa.schema([pa.field("i64", pa.int64()), pa.field("f64", pa.float64())]) for klass in [ds.DirectoryPartitioning, ds.HivePartitioning]: partitioning = klass(schema) assert isinstance(partitioning, ds.Partitioning) partitioning = ds.DirectoryPartitioning( - pa.schema([pa.field("group", pa.int64()), - pa.field("key", pa.float64())]) + pa.schema([pa.field("group", pa.int64()), pa.field("key", pa.float64())]) ) expr = partitioning.parse("/3/3.14") assert isinstance(expr, ds.Expression) @@ -390,12 +386,10 @@ def test_partitioning(): partitioning.parse("/prefix/3/aaa") partitioning = ds.HivePartitioning( - pa.schema([pa.field("alpha", pa.int64()), - pa.field("beta", pa.int64())]) + pa.schema([pa.field("alpha", pa.int64()), pa.field("beta", pa.int64())]) ) expr = partitioning.parse("/alpha=0/beta=3") - expected = (ds.field("alpha") == ds.scalar(0)) & ( - ds.field("beta") == ds.scalar(3)) + expected = (ds.field("alpha") == ds.scalar(0)) & (ds.field("beta") == ds.scalar(3)) assert expr.equals(expected) for shouldfail in ["/alpha=one/beta=2", "/alpha=one", "/beta=two"]: @@ -403,15 +397,12 @@ def test_partitioning(): partitioning.parse(shouldfail) partitioning = ds.HivePartitioning( - 
pa.schema([pa.field("alpha", pa.int64()), - pa.field("beta", pa.int64())]), + pa.schema([pa.field("alpha", pa.int64()), pa.field("beta", pa.int64())]), None, "xyz", ) expr = partitioning.parse("/alpha=xyz/beta=3") - expected = (ds.field("alpha") == ds.scalar(None)) & ( - ds.field("beta") == ds.scalar(3) - ) + expected = (ds.field("alpha").is_null()) & (ds.field("beta") == ds.scalar(3)) assert expr.equals(expected) @@ -543,8 +534,7 @@ def test_file_format_pickling(): formats = [ ds.IpcFileFormat(), ds.CsvFileFormat(), - ds.CsvFileFormat(pa.csv.ParseOptions( - delimiter="\t", ignore_empty_lines=True)), + ds.CsvFileFormat(pa.csv.ParseOptions(delimiter="\t", ignore_empty_lines=True)), ds.ParquetFileFormat(), ds.ParquetFileFormat( read_options=ds.ParquetReadOptions(use_buffered_stream=True) @@ -576,15 +566,13 @@ def test_filesystem_factory(mockfs, paths_or_selector, pre_buffer): options = ds.FileSystemFactoryOptions("subdir") options.partitioning = ds.DirectoryPartitioning( - pa.schema([pa.field("group", pa.int32()), - pa.field("key", pa.string())]) + pa.schema([pa.field("group", pa.int32()), pa.field("key", pa.string())]) ) assert options.partition_base_dir == "subdir" assert options.selector_ignore_prefixes == [".", "_"] assert options.exclude_invalid_files is False - factory = ds.FileSystemDatasetFactory( - mockfs, paths_or_selector, format, options) + factory = ds.FileSystemDatasetFactory(mockfs, paths_or_selector, format, options) inspected_schema = factory.inspect() assert factory.inspect().equals( @@ -637,8 +625,7 @@ def test_filesystem_factory(mockfs, paths_or_selector, pre_buffer): def test_make_fragment(multisourcefs): parquet_format = ds.ParquetFileFormat() - dataset = ds.dataset( - "/plain", filesystem=multisourcefs, format=parquet_format) + dataset = ds.dataset("/plain", filesystem=multisourcefs, format=parquet_format) for path in dataset.files: fragment = parquet_format.make_fragment(path, multisourcefs) @@ -700,8 +687,7 @@ def test_make_parquet_fragment_from_buffer(): ) ) - cases = [(arrays, ds.ParquetFileFormat()), - (dictionary_arrays, dictionary_format)] + cases = [(arrays, ds.ParquetFileFormat()), (dictionary_arrays, dictionary_format)] for arrays, format_ in cases: table = pa.table(arrays, names=["alpha", "num", "animal"]) @@ -726,8 +712,7 @@ def _create_dataset_for_fragments(tempdir, chunk_size=None, filesystem=None): path = str(tempdir / "test_parquet_dataset") # write_to_dataset currently requires pandas - pq.write_to_dataset(table, path, partition_cols=[ - "part"], chunk_size=chunk_size) + pq.write_to_dataset(table, path, partition_cols=["part"], chunk_size=chunk_size) dataset = ds.dataset( path, format="parquet", partitioning="hive", filesystem=filesystem ) @@ -790,8 +775,7 @@ def test_fragments_reconstruct(tempdir): table, dataset = _create_dataset_for_fragments(tempdir) def assert_yields_projected(fragment, row_slice, columns=None, filter=None): - actual = fragment.to_table( - schema=table.schema, columns=columns, filter=filter) + actual = fragment.to_table(schema=table.schema, columns=columns, filter=filter) column_names = columns if columns else table.column_names assert actual.column_names == column_names @@ -838,14 +822,13 @@ def assert_yields_projected(fragment, row_slice, columns=None, filter=None): fragment.filesystem, partition_expression=fragment.partition_expression, ) - assert_yields_projected(new_fragment, (0, 4), - filter=ds.field("part") == "a") + assert_yields_projected(new_fragment, (0, 4), filter=ds.field("part") == "a") # Fragments don't contain the 
partition's columns if not provided to the # `to_table(schema=...)` method. pattern = ( - r"No match for FieldRef.Name\(part\) in " + - fragment.physical_schema.to_string(False, False, False) + r"No match for FieldRef.Name\(part\) in " + + fragment.physical_schema.to_string(False, False, False) ) with pytest.raises(ValueError, match=pattern): new_fragment = parquet_format.make_fragment( @@ -929,8 +912,7 @@ def test_fragments_parquet_row_groups_dictionary(tempdir): @pytest.mark.parquet def test_fragments_parquet_ensure_metadata(tempdir, open_logging_fs): fs, assert_opens = open_logging_fs - _, dataset = _create_dataset_for_fragments( - tempdir, chunk_size=2, filesystem=fs) + _, dataset = _create_dataset_for_fragments(tempdir, chunk_size=2, filesystem=fs) fragment = list(dataset.get_fragments())[0] # with default discovery, no metadata loaded @@ -1179,8 +1161,7 @@ def test_fragments_parquet_row_groups_reconstruct(tempdir): @pytest.mark.parquet def test_fragments_parquet_subset_ids(tempdir, open_logging_fs): fs, assert_opens = open_logging_fs - table, dataset = _create_dataset_for_fragments( - tempdir, chunk_size=1, filesystem=fs) + table, dataset = _create_dataset_for_fragments(tempdir, chunk_size=1, filesystem=fs) fragment = list(dataset.get_fragments())[0] # select with row group ids @@ -1207,8 +1188,7 @@ def test_fragments_parquet_subset_ids(tempdir, open_logging_fs): @pytest.mark.parquet def test_fragments_parquet_subset_filter(tempdir, open_logging_fs): fs, assert_opens = open_logging_fs - table, dataset = _create_dataset_for_fragments( - tempdir, chunk_size=1, filesystem=fs) + table, dataset = _create_dataset_for_fragments(tempdir, chunk_size=1, filesystem=fs) fragment = list(dataset.get_fragments())[0] # select with filter @@ -1258,8 +1238,7 @@ def test_partitioning_factory(mockfs): assert isinstance(partitioning_factory, ds.PartitioningFactory) options.partitioning_factory = partitioning_factory - factory = ds.FileSystemDatasetFactory( - mockfs, paths_or_selector, format, options) + factory = ds.FileSystemDatasetFactory(mockfs, paths_or_selector, format, options) inspected_schema = factory.inspect() # i64/f64 from data, group/key from "/1/xxx" and "/2/yyy" paths expected_schema = pa.schema( @@ -1288,8 +1267,7 @@ def test_partitioning_factory_dictionary(mockfs, infer_dictionary): ["group", "key"], infer_dictionary=infer_dictionary ) - factory = ds.FileSystemDatasetFactory( - mockfs, paths_or_selector, format, options) + factory = ds.FileSystemDatasetFactory(mockfs, paths_or_selector, format, options) inferred_schema = factory.inspect() if infer_dictionary: @@ -1416,8 +1394,7 @@ def test_open_dataset_list_of_files(tempdir): tables, (path1, path2) = _create_directory_of_files(tempdir) table = pa.concat_tables(tables) - datasets = [ds.dataset([path1, path2]), - ds.dataset([str(path1), str(path2)])] + datasets = [ds.dataset([path1, path2]), ds.dataset([str(path1), str(path2)])] datasets += [pickle.loads(pickle.dumps(d)) for d in datasets] for dataset in datasets: @@ -1525,8 +1502,7 @@ def test_construct_empty_dataset(): assert table.num_rows == 0 assert table.num_columns == 0 - empty = ds.dataset([], schema=pa.schema( - [("a", pa.int64()), ("a", pa.string())])) + empty = ds.dataset([], schema=pa.schema([("a", pa.int64()), ("a", pa.string())])) table = empty.to_table() assert table.num_rows == 0 assert table.num_columns == 2 @@ -1577,15 +1553,13 @@ def test_open_dataset_partitioned_directory(tempdir): _check_dataset_from_path(path, full_table) # specify partition scheme with discovery - 
dataset = ds.dataset( - str(path), partitioning=ds.partitioning(flavor="hive")) + dataset = ds.dataset(str(path), partitioning=ds.partitioning(flavor="hive")) expected_schema = table.schema.append(pa.field("part", pa.int32())) assert dataset.schema.equals(expected_schema) # specify partition scheme with discovery and relative path with change_cwd(tempdir): - dataset = ds.dataset( - "dataset/", partitioning=ds.partitioning(flavor="hive")) + dataset = ds.dataset("dataset/", partitioning=ds.partitioning(flavor="hive")) expected_schema = table.schema.append(pa.field("part", pa.int32())) assert dataset.schema.equals(expected_schema) @@ -1596,8 +1570,7 @@ def test_open_dataset_partitioned_directory(tempdir): # specify partition scheme with explicit scheme dataset = ds.dataset( str(path), - partitioning=ds.partitioning( - pa.schema([("part", pa.int8())]), flavor="hive"), + partitioning=ds.partitioning(pa.schema([("part", pa.int8())]), flavor="hive"), ) expected_schema = table.schema.append(pa.field("part", pa.int8())) assert dataset.schema.equals(expected_schema) @@ -1719,8 +1692,7 @@ def test_open_dataset_partitioned_dictionary_type( part_keys1, part_keys2 = partition_keys for part1 in part_keys1: for part2 in part_keys2: - path = basepath / \ - fmt.format(part1 or null_value, part2 or null_value) + path = basepath / fmt.format(part1 or null_value, part2 or null_value) path.mkdir(parents=True) pq.write_table(table, path / "test.parquet") @@ -2274,8 +2246,7 @@ def _create_metadata_file(root_path): metadata_collector.append(metadata) metadata_path = root_path / "_metadata" - pq.write_metadata(schema, metadata_path, - metadata_collector=metadata_collector) + pq.write_metadata(schema, metadata_path, metadata_collector=metadata_collector) return metadata_path @@ -2398,8 +2369,7 @@ def test_filter_mismatching_schema(tempdir): # specifying explicit schema, but that mismatches the schema of the data schema = pa.schema([("col", pa.int64())]) - dataset = ds.dataset(tempdir / "data.parquet", - format="parquet", schema=schema) + dataset = ds.dataset(tempdir / "data.parquet", format="parquet", schema=schema) # filtering on a column with such type mismatch should give a proper error with pytest.raises(TypeError): @@ -2438,8 +2408,7 @@ def test_dataset_project_null_column(tempdir): f = tempdir / "test_dataset_project_null_column.parquet" df.to_parquet(f, engine="pyarrow") - dataset = ds.dataset(f, format="parquet", - schema=pa.schema([("col", pa.int64())])) + dataset = ds.dataset(f, format="parquet", schema=pa.schema([("col", pa.int64())])) expected = pa.table({"col": pa.array([None, None, None], pa.int64())}) assert dataset.to_table().equals(expected) @@ -2462,8 +2431,7 @@ def _check_dataset_roundtrip( assert set(file_paths) == set(expected_files) # check that reading back in as dataset gives the same result - dataset2 = ds.dataset(base_dir_path, format="feather", - partitioning=partitioning) + dataset2 = ds.dataset(base_dir_path, format="feather", partitioning=partitioning) assert dataset2.to_table().equals(dataset.to_table()) @@ -2548,8 +2516,7 @@ def test_write_dataset_partitioned_dict(tempdir): # directory partitioning, dictionary partition columns dataset = ds.dataset( - directory, partitioning=ds.HivePartitioning.discover( - infer_dictionary=True) + directory, partitioning=ds.HivePartitioning.discover(infer_dictionary=True) ) target = tempdir / "partitioned-dir-target" expected_paths = [ @@ -2577,8 +2544,7 @@ def test_write_dataset_use_threads(tempdir): _ = _create_parquet_dataset_partitioned(directory) 
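    # read the freshly written partitioned data back with hive partition
    # discovery so it can be re-written below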
dataset = ds.dataset(directory, partitioning="hive") - partitioning = ds.partitioning( - pa.schema([("part", pa.string())]), flavor="hive") + partitioning = ds.partitioning(pa.schema([("part", pa.string())]), flavor="hive") target1 = tempdir / "partitioned1" ds.write_dataset( @@ -2619,8 +2585,7 @@ def test_write_table(tempdir): # with partitioning base_dir = tempdir / "partitioned" - partitioning = ds.partitioning( - pa.schema([("part", pa.string())]), flavor="hive") + partitioning = ds.partitioning(pa.schema([("part", pa.string())]), flavor="hive") ds.write_dataset( table, base_dir, @@ -2694,8 +2659,7 @@ def test_write_table_partitioned_dict(tempdir): partitioning = ds.partitioning(table.select(["part"]).schema) base_dir = tempdir / "dataset" - ds.write_dataset(table, base_dir, format="feather", - partitioning=partitioning) + ds.write_dataset(table, base_dir, format="feather", partitioning=partitioning) # check roundtrip partitioning_read = ds.DirectoryPartitioning.discover( From 4506853455451759a6501e330ab906b962c4aff0 Mon Sep 17 00:00:00 2001 From: Weston Pace Date: Thu, 11 Feb 2021 17:22:43 -1000 Subject: [PATCH 16/33] Re-lint, it appears my IDE is using the wrong style file --- python/pyarrow/tests/test_dataset.py | 109 ++++++++++++++++++--------- 1 file changed, 72 insertions(+), 37 deletions(-) diff --git a/python/pyarrow/tests/test_dataset.py b/python/pyarrow/tests/test_dataset.py index 9bbffbc8d76..89bec21fd86 100644 --- a/python/pyarrow/tests/test_dataset.py +++ b/python/pyarrow/tests/test_dataset.py @@ -200,7 +200,8 @@ def dataset(mockfs): selector = fs.FileSelector("subdir", recursive=True) options = ds.FileSystemFactoryOptions("subdir") options.partitioning = ds.DirectoryPartitioning( - pa.schema([pa.field("group", pa.int32()), pa.field("key", pa.string())]) + pa.schema([pa.field("group", pa.int32()), + pa.field("key", pa.string())]) ) factory = ds.FileSystemDatasetFactory(mockfs, selector, format, options) return factory.finish() @@ -338,7 +339,8 @@ def test_dataset(dataset): def test_scanner(dataset): - scanner = ds.Scanner.from_dataset(dataset, memory_pool=pa.default_memory_pool()) + scanner = ds.Scanner.from_dataset( + dataset, memory_pool=pa.default_memory_pool()) assert isinstance(scanner, ds.Scanner) assert len(list(scanner.scan())) == 2 @@ -368,13 +370,15 @@ def test_abstract_classes(): def test_partitioning(): - schema = pa.schema([pa.field("i64", pa.int64()), pa.field("f64", pa.float64())]) + schema = pa.schema([pa.field("i64", pa.int64()), + pa.field("f64", pa.float64())]) for klass in [ds.DirectoryPartitioning, ds.HivePartitioning]: partitioning = klass(schema) assert isinstance(partitioning, ds.Partitioning) partitioning = ds.DirectoryPartitioning( - pa.schema([pa.field("group", pa.int64()), pa.field("key", pa.float64())]) + pa.schema([pa.field("group", pa.int64()), + pa.field("key", pa.float64())]) ) expr = partitioning.parse("/3/3.14") assert isinstance(expr, ds.Expression) @@ -386,10 +390,12 @@ def test_partitioning(): partitioning.parse("/prefix/3/aaa") partitioning = ds.HivePartitioning( - pa.schema([pa.field("alpha", pa.int64()), pa.field("beta", pa.int64())]) + pa.schema([pa.field("alpha", pa.int64()), + pa.field("beta", pa.int64())]) ) expr = partitioning.parse("/alpha=0/beta=3") - expected = (ds.field("alpha") == ds.scalar(0)) & (ds.field("beta") == ds.scalar(3)) + expected = (ds.field("alpha") == ds.scalar(0)) & ( + ds.field("beta") == ds.scalar(3)) assert expr.equals(expected) for shouldfail in ["/alpha=one/beta=2", "/alpha=one", "/beta=two"]: @@ 
-397,12 +403,14 @@ def test_partitioning(): partitioning.parse(shouldfail) partitioning = ds.HivePartitioning( - pa.schema([pa.field("alpha", pa.int64()), pa.field("beta", pa.int64())]), + pa.schema([pa.field("alpha", pa.int64()), + pa.field("beta", pa.int64())]), None, "xyz", ) expr = partitioning.parse("/alpha=xyz/beta=3") - expected = (ds.field("alpha").is_null()) & (ds.field("beta") == ds.scalar(3)) + expected = (ds.field("alpha").is_null()) & ( + ds.field("beta") == ds.scalar(3)) assert expr.equals(expected) @@ -534,7 +542,8 @@ def test_file_format_pickling(): formats = [ ds.IpcFileFormat(), ds.CsvFileFormat(), - ds.CsvFileFormat(pa.csv.ParseOptions(delimiter="\t", ignore_empty_lines=True)), + ds.CsvFileFormat(pa.csv.ParseOptions( + delimiter="\t", ignore_empty_lines=True)), ds.ParquetFileFormat(), ds.ParquetFileFormat( read_options=ds.ParquetReadOptions(use_buffered_stream=True) @@ -566,13 +575,15 @@ def test_filesystem_factory(mockfs, paths_or_selector, pre_buffer): options = ds.FileSystemFactoryOptions("subdir") options.partitioning = ds.DirectoryPartitioning( - pa.schema([pa.field("group", pa.int32()), pa.field("key", pa.string())]) + pa.schema([pa.field("group", pa.int32()), + pa.field("key", pa.string())]) ) assert options.partition_base_dir == "subdir" assert options.selector_ignore_prefixes == [".", "_"] assert options.exclude_invalid_files is False - factory = ds.FileSystemDatasetFactory(mockfs, paths_or_selector, format, options) + factory = ds.FileSystemDatasetFactory( + mockfs, paths_or_selector, format, options) inspected_schema = factory.inspect() assert factory.inspect().equals( @@ -625,7 +636,8 @@ def test_filesystem_factory(mockfs, paths_or_selector, pre_buffer): def test_make_fragment(multisourcefs): parquet_format = ds.ParquetFileFormat() - dataset = ds.dataset("/plain", filesystem=multisourcefs, format=parquet_format) + dataset = ds.dataset( + "/plain", filesystem=multisourcefs, format=parquet_format) for path in dataset.files: fragment = parquet_format.make_fragment(path, multisourcefs) @@ -687,7 +699,8 @@ def test_make_parquet_fragment_from_buffer(): ) ) - cases = [(arrays, ds.ParquetFileFormat()), (dictionary_arrays, dictionary_format)] + cases = [(arrays, ds.ParquetFileFormat()), + (dictionary_arrays, dictionary_format)] for arrays, format_ in cases: table = pa.table(arrays, names=["alpha", "num", "animal"]) @@ -712,7 +725,8 @@ def _create_dataset_for_fragments(tempdir, chunk_size=None, filesystem=None): path = str(tempdir / "test_parquet_dataset") # write_to_dataset currently requires pandas - pq.write_to_dataset(table, path, partition_cols=["part"], chunk_size=chunk_size) + pq.write_to_dataset(table, path, partition_cols=[ + "part"], chunk_size=chunk_size) dataset = ds.dataset( path, format="parquet", partitioning="hive", filesystem=filesystem ) @@ -775,7 +789,8 @@ def test_fragments_reconstruct(tempdir): table, dataset = _create_dataset_for_fragments(tempdir) def assert_yields_projected(fragment, row_slice, columns=None, filter=None): - actual = fragment.to_table(schema=table.schema, columns=columns, filter=filter) + actual = fragment.to_table( + schema=table.schema, columns=columns, filter=filter) column_names = columns if columns else table.column_names assert actual.column_names == column_names @@ -822,13 +837,14 @@ def assert_yields_projected(fragment, row_slice, columns=None, filter=None): fragment.filesystem, partition_expression=fragment.partition_expression, ) - assert_yields_projected(new_fragment, (0, 4), filter=ds.field("part") == "a") + 
assert_yields_projected(new_fragment, (0, 4), + filter=ds.field("part") == "a") # Fragments don't contain the partition's columns if not provided to the # `to_table(schema=...)` method. pattern = ( - r"No match for FieldRef.Name\(part\) in " - + fragment.physical_schema.to_string(False, False, False) + r"No match for FieldRef.Name\(part\) in " + + fragment.physical_schema.to_string(False, False, False) ) with pytest.raises(ValueError, match=pattern): new_fragment = parquet_format.make_fragment( @@ -912,7 +928,8 @@ def test_fragments_parquet_row_groups_dictionary(tempdir): @pytest.mark.parquet def test_fragments_parquet_ensure_metadata(tempdir, open_logging_fs): fs, assert_opens = open_logging_fs - _, dataset = _create_dataset_for_fragments(tempdir, chunk_size=2, filesystem=fs) + _, dataset = _create_dataset_for_fragments( + tempdir, chunk_size=2, filesystem=fs) fragment = list(dataset.get_fragments())[0] # with default discovery, no metadata loaded @@ -1161,7 +1178,8 @@ def test_fragments_parquet_row_groups_reconstruct(tempdir): @pytest.mark.parquet def test_fragments_parquet_subset_ids(tempdir, open_logging_fs): fs, assert_opens = open_logging_fs - table, dataset = _create_dataset_for_fragments(tempdir, chunk_size=1, filesystem=fs) + table, dataset = _create_dataset_for_fragments( + tempdir, chunk_size=1, filesystem=fs) fragment = list(dataset.get_fragments())[0] # select with row group ids @@ -1188,7 +1206,8 @@ def test_fragments_parquet_subset_ids(tempdir, open_logging_fs): @pytest.mark.parquet def test_fragments_parquet_subset_filter(tempdir, open_logging_fs): fs, assert_opens = open_logging_fs - table, dataset = _create_dataset_for_fragments(tempdir, chunk_size=1, filesystem=fs) + table, dataset = _create_dataset_for_fragments( + tempdir, chunk_size=1, filesystem=fs) fragment = list(dataset.get_fragments())[0] # select with filter @@ -1238,7 +1257,8 @@ def test_partitioning_factory(mockfs): assert isinstance(partitioning_factory, ds.PartitioningFactory) options.partitioning_factory = partitioning_factory - factory = ds.FileSystemDatasetFactory(mockfs, paths_or_selector, format, options) + factory = ds.FileSystemDatasetFactory( + mockfs, paths_or_selector, format, options) inspected_schema = factory.inspect() # i64/f64 from data, group/key from "/1/xxx" and "/2/yyy" paths expected_schema = pa.schema( @@ -1267,7 +1287,8 @@ def test_partitioning_factory_dictionary(mockfs, infer_dictionary): ["group", "key"], infer_dictionary=infer_dictionary ) - factory = ds.FileSystemDatasetFactory(mockfs, paths_or_selector, format, options) + factory = ds.FileSystemDatasetFactory( + mockfs, paths_or_selector, format, options) inferred_schema = factory.inspect() if infer_dictionary: @@ -1394,7 +1415,8 @@ def test_open_dataset_list_of_files(tempdir): tables, (path1, path2) = _create_directory_of_files(tempdir) table = pa.concat_tables(tables) - datasets = [ds.dataset([path1, path2]), ds.dataset([str(path1), str(path2)])] + datasets = [ds.dataset([path1, path2]), + ds.dataset([str(path1), str(path2)])] datasets += [pickle.loads(pickle.dumps(d)) for d in datasets] for dataset in datasets: @@ -1502,7 +1524,8 @@ def test_construct_empty_dataset(): assert table.num_rows == 0 assert table.num_columns == 0 - empty = ds.dataset([], schema=pa.schema([("a", pa.int64()), ("a", pa.string())])) + empty = ds.dataset([], schema=pa.schema( + [("a", pa.int64()), ("a", pa.string())])) table = empty.to_table() assert table.num_rows == 0 assert table.num_columns == 2 @@ -1553,13 +1576,15 @@ def 
test_open_dataset_partitioned_directory(tempdir): _check_dataset_from_path(path, full_table) # specify partition scheme with discovery - dataset = ds.dataset(str(path), partitioning=ds.partitioning(flavor="hive")) + dataset = ds.dataset( + str(path), partitioning=ds.partitioning(flavor="hive")) expected_schema = table.schema.append(pa.field("part", pa.int32())) assert dataset.schema.equals(expected_schema) # specify partition scheme with discovery and relative path with change_cwd(tempdir): - dataset = ds.dataset("dataset/", partitioning=ds.partitioning(flavor="hive")) + dataset = ds.dataset( + "dataset/", partitioning=ds.partitioning(flavor="hive")) expected_schema = table.schema.append(pa.field("part", pa.int32())) assert dataset.schema.equals(expected_schema) @@ -1570,7 +1595,8 @@ def test_open_dataset_partitioned_directory(tempdir): # specify partition scheme with explicit scheme dataset = ds.dataset( str(path), - partitioning=ds.partitioning(pa.schema([("part", pa.int8())]), flavor="hive"), + partitioning=ds.partitioning( + pa.schema([("part", pa.int8())]), flavor="hive"), ) expected_schema = table.schema.append(pa.field("part", pa.int8())) assert dataset.schema.equals(expected_schema) @@ -1692,7 +1718,8 @@ def test_open_dataset_partitioned_dictionary_type( part_keys1, part_keys2 = partition_keys for part1 in part_keys1: for part2 in part_keys2: - path = basepath / fmt.format(part1 or null_value, part2 or null_value) + path = basepath / \ + fmt.format(part1 or null_value, part2 or null_value) path.mkdir(parents=True) pq.write_table(table, path / "test.parquet") @@ -2246,7 +2273,8 @@ def _create_metadata_file(root_path): metadata_collector.append(metadata) metadata_path = root_path / "_metadata" - pq.write_metadata(schema, metadata_path, metadata_collector=metadata_collector) + pq.write_metadata(schema, metadata_path, + metadata_collector=metadata_collector) return metadata_path @@ -2369,7 +2397,8 @@ def test_filter_mismatching_schema(tempdir): # specifying explicit schema, but that mismatches the schema of the data schema = pa.schema([("col", pa.int64())]) - dataset = ds.dataset(tempdir / "data.parquet", format="parquet", schema=schema) + dataset = ds.dataset(tempdir / "data.parquet", + format="parquet", schema=schema) # filtering on a column with such type mismatch should give a proper error with pytest.raises(TypeError): @@ -2408,7 +2437,8 @@ def test_dataset_project_null_column(tempdir): f = tempdir / "test_dataset_project_null_column.parquet" df.to_parquet(f, engine="pyarrow") - dataset = ds.dataset(f, format="parquet", schema=pa.schema([("col", pa.int64())])) + dataset = ds.dataset(f, format="parquet", + schema=pa.schema([("col", pa.int64())])) expected = pa.table({"col": pa.array([None, None, None], pa.int64())}) assert dataset.to_table().equals(expected) @@ -2431,7 +2461,8 @@ def _check_dataset_roundtrip( assert set(file_paths) == set(expected_files) # check that reading back in as dataset gives the same result - dataset2 = ds.dataset(base_dir_path, format="feather", partitioning=partitioning) + dataset2 = ds.dataset(base_dir_path, format="feather", + partitioning=partitioning) assert dataset2.to_table().equals(dataset.to_table()) @@ -2516,7 +2547,8 @@ def test_write_dataset_partitioned_dict(tempdir): # directory partitioning, dictionary partition columns dataset = ds.dataset( - directory, partitioning=ds.HivePartitioning.discover(infer_dictionary=True) + directory, partitioning=ds.HivePartitioning.discover( + infer_dictionary=True) ) target = tempdir / "partitioned-dir-target" 
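    # Editor's note (descriptive aside, not part of the patch): the source
    # dataset above was discovered with infer_dictionary=True, so its "part"
    # column is dictionary-encoded; the expected_paths listed below describe
    # the directory layout that re-writing by that column is expected to
    # produce.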
expected_paths = [ @@ -2544,7 +2576,8 @@ def test_write_dataset_use_threads(tempdir): _ = _create_parquet_dataset_partitioned(directory) dataset = ds.dataset(directory, partitioning="hive") - partitioning = ds.partitioning(pa.schema([("part", pa.string())]), flavor="hive") + partitioning = ds.partitioning( + pa.schema([("part", pa.string())]), flavor="hive") target1 = tempdir / "partitioned1" ds.write_dataset( @@ -2585,7 +2618,8 @@ def test_write_table(tempdir): # with partitioning base_dir = tempdir / "partitioned" - partitioning = ds.partitioning(pa.schema([("part", pa.string())]), flavor="hive") + partitioning = ds.partitioning( + pa.schema([("part", pa.string())]), flavor="hive") ds.write_dataset( table, base_dir, @@ -2659,7 +2693,8 @@ def test_write_table_partitioned_dict(tempdir): partitioning = ds.partitioning(table.select(["part"]).schema) base_dir = tempdir / "dataset" - ds.write_dataset(table, base_dir, format="feather", partitioning=partitioning) + ds.write_dataset(table, base_dir, format="feather", + partitioning=partitioning) # check roundtrip partitioning_read = ds.DirectoryPartitioning.discover( From 3ca5f348636214452bbfac535312d928420b8b12 Mon Sep 17 00:00:00 2001 From: Weston Pace Date: Thu, 11 Feb 2021 21:50:09 -1000 Subject: [PATCH 17/33] Final lint pass. Turns out I was relying on black which was messing up everything --- python/pyarrow/_compute.pyx | 10 +- python/pyarrow/_dataset.pyx | 10 +- python/pyarrow/compute.py | 107 +-- python/pyarrow/includes/libarrow.pxd | 3 +- python/pyarrow/tests/test_dataset.py | 1314 ++++++++++++-------------- 5 files changed, 647 insertions(+), 797 deletions(-) diff --git a/python/pyarrow/_compute.pyx b/python/pyarrow/_compute.pyx index d3d5dc510a3..3cb152aa381 100644 --- a/python/pyarrow/_compute.pyx +++ b/python/pyarrow/_compute.pyx @@ -659,13 +659,15 @@ cdef class _DictionaryEncodeOptions(FunctionOptions): def _set_options(self, null_encoding_behavior): if null_encoding_behavior == 'encode': self.dictionary_encode_options.reset( - new CDictionaryEncodeOptions(CDictionaryEncodeNullEncodingBehavior_ENCODE)) + new CDictionaryEncodeOptions( + CDictionaryEncodeNullEncodingBehavior_ENCODE)) elif null_encoding_behavior == 'mask': self.dictionary_encode_options.reset( - new CDictionaryEncodeOptions(CDictionaryEncodeNullEncodingBehavior_MASK)) + new CDictionaryEncodeOptions( + CDictionaryEncodeNullEncodingBehavior_MASK)) else: - raise ValueError('"{}" is not a valid null_encoding_behavior'.format( - null_encoding_behavior)) + raise ValueError('"{}" is not a valid null_encoding_behavior' + .format(null_encoding_behavior)) class DictionaryEncodeOptions(_DictionaryEncodeOptions): diff --git a/python/pyarrow/_dataset.pyx b/python/pyarrow/_dataset.pyx index acd5d9602b5..e38ea626d79 100644 --- a/python/pyarrow/_dataset.pyx +++ b/python/pyarrow/_dataset.pyx @@ -1614,7 +1614,11 @@ cdef class HivePartitioning(Partitioning): cdef: CHivePartitioning* hive_partitioning - def __init__(self, Schema schema not None, dictionaries=None, null_fallback="__HIVE_DEFAULT_PARTITION__"): + def __init__(self, + Schema schema not None, + dictionaries=None, + null_fallback="__HIVE_DEFAULT_PARTITION__"): + cdef: shared_ptr[CHivePartitioning] c_partitioning c_string c_null_fallback = tobytes(null_fallback) @@ -1631,7 +1635,9 @@ cdef class HivePartitioning(Partitioning): self.hive_partitioning = sp.get() @staticmethod - def discover(infer_dictionary=False, max_partition_dictionary_size=0, null_fallback="__HIVE_DEFAULT_PARTITION__"): + def discover(infer_dictionary=False, + 
max_partition_dictionary_size=0, + null_fallback="__HIVE_DEFAULT_PARTITION__"): """ Discover a HivePartitioning. diff --git a/python/pyarrow/compute.py b/python/pyarrow/compute.py index 1e437d43d4c..3d7f5ecb4c3 100644 --- a/python/pyarrow/compute.py +++ b/python/pyarrow/compute.py @@ -69,14 +69,14 @@ def _get_arg_names(func): arg_names = ["left", "right"] else: raise NotImplementedError( - f"unsupported arity: {func.arity} (function: {func.name})" - ) + f"unsupported arity: {func.arity} (function: {func.name})") return arg_names def _decorate_compute_function(wrapper, exposed_name, func, option_class): - wrapper.__arrow_compute_function__ = dict(name=func.name, arity=func.arity) + wrapper.__arrow_compute_function__ = dict(name=func.name, + arity=func.arity) wrapper.__name__ = exposed_name wrapper.__qualname__ = exposed_name @@ -86,64 +86,47 @@ def _decorate_compute_function(wrapper, exposed_name, func, option_class): summary = cpp_doc.summary if not summary: arg_str = "arguments" if func.arity > 1 else "argument" - summary = "Call compute function {!r} with the given {}".format( - func.name, arg_str - ) + summary = ("Call compute function {!r} with the given {}" + .format(func.name, arg_str)) description = cpp_doc.description arg_names = _get_arg_names(func) - doc_pieces.append( - """\ + doc_pieces.append("""\ {}. - """.format( - summary - ) - ) + """.format(summary)) if description: doc_pieces.append("{}\n\n".format(description)) - doc_pieces.append( - """\ + doc_pieces.append("""\ Parameters ---------- - """ - ) + """) for arg_name in arg_names: - if func.kind in ("vector", "scalar_aggregate"): - arg_type = "Array-like" + if func.kind in ('vector', 'scalar_aggregate'): + arg_type = 'Array-like' else: - arg_type = "Array-like or scalar-like" - doc_pieces.append( - """\ + arg_type = 'Array-like or scalar-like' + doc_pieces.append("""\ {} : {} Argument to compute function - """.format( - arg_name, arg_type - ) - ) + """.format(arg_name, arg_type)) - doc_pieces.append( - """\ + doc_pieces.append("""\ memory_pool : pyarrow.MemoryPool, optional If not passed, will allocate memory from the default memory pool. - """ - ) + """) if option_class is not None: - doc_pieces.append( - """\ + doc_pieces.append("""\ options : pyarrow.compute.{0}, optional Parameters altering compute function semantics **kwargs : optional Parameters for {0} constructor. Either `options` or `**kwargs` can be passed, but not both at the same time. 
- """.format( - option_class.__name__ - ) - ) + """.format(option_class.__name__)) wrapper.__doc__ = "".join(dedent(s) for s in doc_pieces) return wrapper @@ -156,10 +139,8 @@ def _get_options_class(func): try: return globals()[class_name] except KeyError: - warnings.warn( - "Python binding for {} not exposed".format( - class_name), RuntimeWarning - ) + warnings.warn("Python binding for {} not exposed" + .format(class_name), RuntimeWarning) return None @@ -169,8 +150,8 @@ def _handle_options(name, option_class, options, kwargs): return option_class(**kwargs) raise TypeError( "Function {!r} called with both an 'options' argument " - "and additional named arguments".format(name) - ) + "and additional named arguments" + .format(name)) if options is not None: if isinstance(options, dict): @@ -178,25 +159,20 @@ def _handle_options(name, option_class, options, kwargs): elif isinstance(options, option_class): return options raise TypeError( - "Function {!r} expected a {} parameter, got {}".format( - name, option_class, type(options) - ) - ) + "Function {!r} expected a {} parameter, got {}" + .format(name, option_class, type(options))) return options -_wrapper_template = dedent( - """\ +_wrapper_template = dedent("""\ def make_wrapper(func, option_class): def {func_name}({args_sig}{kwonly}, memory_pool=None): return func.call([{args_sig}], None, memory_pool) return {func_name} - """ -) + """) -_wrapper_options_template = dedent( - """\ +_wrapper_options_template = dedent("""\ def make_wrapper(func, option_class): def {func_name}({args_sig}{kwonly}, options=None, memory_pool=None, **kwargs): @@ -204,15 +180,14 @@ def {func_name}({args_sig}{kwonly}, options=None, memory_pool=None, kwargs) return func.call([{args_sig}], options, memory_pool) return {func_name} - """ -) + """) def _wrap_function(name, func): option_class = _get_options_class(func) arg_names = _get_arg_names(func) - args_sig = ", ".join(arg_names) - kwonly = "" if arg_names[-1].startswith("*") else ", *" + args_sig = ', '.join(arg_names) + kwonly = '' if arg_names[-1].startswith('*') else ', *' # Generate templated wrapper, so that the signature matches # the documented argument names. 
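# Editor's note: the hunk above reformats pyarrow's template-and-exec wrapper
# generation. As background, here is a minimal, self-contained sketch of that
# technique (hypothetical names, not code from this patch): a source template
# is filled in with the real argument names and exec'd, so the resulting
# wrapper carries the documented signature instead of a generic *args one.
from textwrap import dedent

_template = dedent("""\
    def make_wrapper(func):
        def {func_name}({args_sig}, *, memory_pool=None):
            # Forward to the wrapped callable; memory_pool is accepted but
            # unused in this simplified sketch.
            return func({args_sig})
        return {func_name}
    """)


def _wrap(name, func, arg_names):
    ns = {}
    exec(_template.format(func_name=name, args_sig=", ".join(arg_names)),
         globals(), ns)
    return ns["make_wrapper"](func)


# The generated wrapper exposes the documented parameter names:
add = _wrap("add", lambda left, right: left + right, ["left", "right"])
assert add(1, 2) == 3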
@@ -221,11 +196,9 @@ def _wrap_function(name, func): template = _wrapper_options_template else: template = _wrapper_template - exec( - template.format(func_name=name, args_sig=args_sig, - kwonly=kwonly), globals(), ns - ) - wrapper = ns["make_wrapper"](func, option_class) + exec(template.format(func_name=name, args_sig=args_sig, kwonly=kwonly), + globals(), ns) + wrapper = ns['make_wrapper'](func, option_class) return _decorate_compute_function(wrapper, name, func, option_class) @@ -241,7 +214,8 @@ def _make_global_functions(): reg = function_registry() # Avoid clashes with Python keywords - rewrites = {"and": "and_", "or": "or_"} + rewrites = {'and': 'and_', + 'or': 'or_'} for cpp_name in reg.list_functions(): name = rewrites.get(cpp_name, cpp_name) @@ -325,7 +299,8 @@ def match_substring(array, pattern): ------- result : pyarrow.Array or pyarrow.ChunkedArray """ - return call_function("match_substring", [array], MatchSubstringOptions(pattern)) + return call_function("match_substring", [array], + MatchSubstringOptions(pattern)) def sum(array): @@ -340,7 +315,7 @@ def sum(array): ------- sum : pyarrow.Scalar """ - return call_function("sum", [array]) + return call_function('sum', [array]) def mode(array, n=1): @@ -372,7 +347,7 @@ def mode(array, n=1): return call_function("mode", [array], options) -def filter(data, mask, null_selection_behavior="drop"): +def filter(data, mask, null_selection_behavior='drop'): """ Select values (or records) from array- or table-like data given boolean filter, where true values are selected. @@ -413,7 +388,7 @@ def filter(data, mask, null_selection_behavior="drop"): ] """ options = FilterOptions(null_selection_behavior) - return call_function("filter", [data, mask], options) + return call_function('filter', [data, mask], options) def take(data, indices, *, boundscheck=True, memory_pool=None): @@ -454,7 +429,7 @@ def take(data, indices, *, boundscheck=True, memory_pool=None): ] """ options = TakeOptions(boundscheck=boundscheck) - return call_function("take", [data, indices], options, memory_pool) + return call_function('take', [data, indices], options, memory_pool) def fill_null(values, fill_value): diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd index 6423741ae50..ba3c3ad7d2b 100644 --- a/python/pyarrow/includes/libarrow.pxd +++ b/python/pyarrow/includes/libarrow.pxd @@ -1812,7 +1812,8 @@ cdef extern from "arrow/compute/api.h" namespace "arrow::compute" nogil: cdef cppclass CDictionaryEncodeOptions \ "arrow::compute::DictionaryEncodeOptions"(CFunctionOptions): CDictionaryEncodeOptions() - CDictionaryEncodeOptions(CDictionaryEncodeNullEncodingBehavior null_encoding) + CDictionaryEncodeOptions( + CDictionaryEncodeNullEncodingBehavior null_encoding) CDictionaryEncodeNullEncodingBehavior null_encoding cdef cppclass CTakeOptions \ diff --git a/python/pyarrow/tests/test_dataset.py b/python/pyarrow/tests/test_dataset.py index 89bec21fd86..b2c1fc9f030 100644 --- a/python/pyarrow/tests/test_dataset.py +++ b/python/pyarrow/tests/test_dataset.py @@ -21,7 +21,6 @@ import textwrap import numpy as np -from numpy.core.fromnumeric import partition import pytest import pyarrow as pa @@ -50,25 +49,23 @@ def _generate_data(n): day = datetime.datetime(2000, 1, 1) interval = datetime.timedelta(days=5) - colors = itertools.cycle(["green", "blue", "yellow", "red", "orange"]) + colors = itertools.cycle(['green', 'blue', 'yellow', 'red', 'orange']) data = [] for i in range(n): data.append((day, i, float(i), next(colors))) day += interval - 
return pd.DataFrame(data, columns=["date", "index", "value", "color"]) + return pd.DataFrame(data, columns=['date', 'index', 'value', 'color']) def _table_from_pandas(df): - schema = pa.schema( - [ - pa.field("date", pa.date32()), - pa.field("index", pa.int64()), - pa.field("value", pa.float64()), - pa.field("color", pa.string()), - ] - ) + schema = pa.schema([ + pa.field('date', pa.date32()), + pa.field('index', pa.int64()), + pa.field('value', pa.float64()), + pa.field('color', pa.string()), + ]) table = pa.Table.from_pandas(df, schema=schema, preserve_index=False) return table.replace_schema_metadata() @@ -81,28 +78,26 @@ def mockfs(): mockfs = fs._MockFileSystem() directories = [ - "subdir/1/xxx", - "subdir/2/yyy", + 'subdir/1/xxx', + 'subdir/2/yyy', ] for i, directory in enumerate(directories): - path = "{}/file{}.parquet".format(directory, i) + path = '{}/file{}.parquet'.format(directory, i) mockfs.create_dir(directory) with mockfs.open_output_stream(path) as out: data = [ list(range(5)), list(map(float, range(5))), list(map(str, range(5))), - [i] * 5, + [i] * 5 ] - schema = pa.schema( - [ - pa.field("i64", pa.int64()), - pa.field("f64", pa.float64()), - pa.field("str", pa.string()), - pa.field("const", pa.int64()), - ] - ) + schema = pa.schema([ + pa.field('i64', pa.int64()), + pa.field('f64', pa.float64()), + pa.field('str', pa.string()), + pa.field('const', pa.int64()), + ]) batch = pa.record_batch(data, schema=schema) table = pa.Table.from_batches([batch]) @@ -143,10 +138,10 @@ def assert_opens(expected_opened): return fs, assert_opens -@pytest.fixture(scope="module") +@pytest.fixture(scope='module') def multisourcefs(request): - request.config.pyarrow.requires("pandas") - request.config.pyarrow.requires("parquet") + request.config.pyarrow.requires('pandas') + request.config.pyarrow.requires('parquet') import pyarrow.parquet as pq df = _generate_data(1000) @@ -158,35 +153,35 @@ def multisourcefs(request): # create a directory containing a flat sequence of parquet files without # any partitioning involved - mockfs.create_dir("plain") + mockfs.create_dir('plain') for i, chunk in enumerate(np.array_split(df_a, 10)): - path = "plain/chunk-{}.parquet".format(i) + path = 'plain/chunk-{}.parquet'.format(i) with mockfs.open_output_stream(path) as out: pq.write_table(_table_from_pandas(chunk), out) # create one with schema partitioning by weekday and color - mockfs.create_dir("schema") + mockfs.create_dir('schema') for part, chunk in df_b.groupby([df_b.date.dt.dayofweek, df_b.color]): - folder = "schema/{}/{}".format(*part) - path = "{}/chunk.parquet".format(folder) + folder = 'schema/{}/{}'.format(*part) + path = '{}/chunk.parquet'.format(folder) mockfs.create_dir(folder) with mockfs.open_output_stream(path) as out: pq.write_table(_table_from_pandas(chunk), out) # create one with hive partitioning by year and month - mockfs.create_dir("hive") + mockfs.create_dir('hive') for part, chunk in df_c.groupby([df_c.date.dt.year, df_c.date.dt.month]): - folder = "hive/year={}/month={}".format(*part) - path = "{}/chunk.parquet".format(folder) + folder = 'hive/year={}/month={}'.format(*part) + path = '{}/chunk.parquet'.format(folder) mockfs.create_dir(folder) with mockfs.open_output_stream(path) as out: pq.write_table(_table_from_pandas(chunk), out) # create one with hive partitioning by color - mockfs.create_dir("hive_color") + mockfs.create_dir('hive_color') for part, chunk in df_d.groupby(["color"]): - folder = "hive_color/color={}".format(*part) - path = "{}/chunk.parquet".format(folder) + 
folder = 'hive_color/color={}'.format(*part) + path = '{}/chunk.parquet'.format(folder) mockfs.create_dir(folder) with mockfs.open_output_stream(path) as out: pq.write_table(_table_from_pandas(chunk), out) @@ -197,41 +192,36 @@ def multisourcefs(request): @pytest.fixture def dataset(mockfs): format = ds.ParquetFileFormat() - selector = fs.FileSelector("subdir", recursive=True) - options = ds.FileSystemFactoryOptions("subdir") + selector = fs.FileSelector('subdir', recursive=True) + options = ds.FileSystemFactoryOptions('subdir') options.partitioning = ds.DirectoryPartitioning( - pa.schema([pa.field("group", pa.int32()), - pa.field("key", pa.string())]) + pa.schema([ + pa.field('group', pa.int32()), + pa.field('key', pa.string()) + ]) ) factory = ds.FileSystemDatasetFactory(mockfs, selector, format, options) return factory.finish() def test_filesystem_dataset(mockfs): - schema = pa.schema([pa.field("const", pa.int64())]) + schema = pa.schema([ + pa.field('const', pa.int64()) + ]) file_format = ds.ParquetFileFormat() - paths = ["subdir/1/xxx/file0.parquet", "subdir/2/yyy/file1.parquet"] - partitions = [ds.field("part") == x for x in range(1, 3)] - fragments = [ - file_format.make_fragment(path, mockfs, part) - for path, part in zip(paths, partitions) - ] - root_partition = ds.field("level") == ds.scalar(1337) + paths = ['subdir/1/xxx/file0.parquet', 'subdir/2/yyy/file1.parquet'] + partitions = [ds.field('part') == x for x in range(1, 3)] + fragments = [file_format.make_fragment(path, mockfs, part) + for path, part in zip(paths, partitions)] + root_partition = ds.field('level') == ds.scalar(1337) dataset_from_fragments = ds.FileSystemDataset( - fragments, - schema=schema, - format=file_format, - filesystem=mockfs, - root_partition=root_partition, + fragments, schema=schema, format=file_format, + filesystem=mockfs, root_partition=root_partition, ) dataset_from_paths = ds.FileSystemDataset.from_paths( - paths, - schema=schema, - format=file_format, - filesystem=mockfs, - partitions=partitions, - root_partition=root_partition, + paths, schema=schema, format=file_format, filesystem=mockfs, + partitions=partitions, root_partition=root_partition, ) for dataset in [dataset_from_fragments, dataset_from_paths]: @@ -278,9 +268,8 @@ def test_filesystem_dataset(mockfs): ds.FileSystemDataset(fragments, file_format, schema) # validation of root_partition with pytest.raises(TypeError, match="incorrect type"): - ds.FileSystemDataset( - fragments, schema=schema, format=file_format, root_partition=1 - ) + ds.FileSystemDataset(fragments, schema=schema, + format=file_format, root_partition=1) # missing required argument in from_paths with pytest.raises(TypeError, match="incorrect type"): ds.FileSystemDataset.from_paths(fragments, format=file_format) @@ -288,15 +277,15 @@ def test_filesystem_dataset(mockfs): def test_filesystem_dataset_no_filesystem_interaction(): # ARROW-8283 - schema = pa.schema([pa.field("f1", pa.int64())]) + schema = pa.schema([ + pa.field('f1', pa.int64()) + ]) file_format = ds.IpcFileFormat() - paths = ["nonexistingfile.arrow"] + paths = ['nonexistingfile.arrow'] # creating the dataset itself doesn't raise dataset = ds.FileSystemDataset.from_paths( - paths, - schema=schema, - format=file_format, + paths, schema=schema, format=file_format, filesystem=fs.LocalFileSystem(), ) @@ -328,28 +317,27 @@ def test_dataset(dataset): assert isinstance(table, pa.Table) assert len(table) == 10 - condition = ds.field("i64") == 1 + condition = ds.field('i64') == 1 result = dataset.to_table(use_threads=True, 
filter=condition).to_pydict() # don't rely on the scanning order - assert result["i64"] == [1, 1] - assert result["f64"] == [1.0, 1.0] - assert sorted(result["group"]) == [1, 2] - assert sorted(result["key"]) == ["xxx", "yyy"] + assert result['i64'] == [1, 1] + assert result['f64'] == [1., 1.] + assert sorted(result['group']) == [1, 2] + assert sorted(result['key']) == ['xxx', 'yyy'] def test_scanner(dataset): - scanner = ds.Scanner.from_dataset( - dataset, memory_pool=pa.default_memory_pool()) + scanner = ds.Scanner.from_dataset(dataset, + memory_pool=pa.default_memory_pool()) assert isinstance(scanner, ds.Scanner) assert len(list(scanner.scan())) == 2 with pytest.raises(pa.ArrowInvalid): - ds.Scanner.from_dataset(dataset, columns=["unknown"]) + ds.Scanner.from_dataset(dataset, columns=['unknown']) - scanner = ds.Scanner.from_dataset( - dataset, columns=["i64"], memory_pool=pa.default_memory_pool() - ) + scanner = ds.Scanner.from_dataset(dataset, columns=['i64'], + memory_pool=pa.default_memory_pool()) assert isinstance(scanner, ds.Scanner) assert len(list(scanner.scan())) == 2 @@ -370,49 +358,46 @@ def test_abstract_classes(): def test_partitioning(): - schema = pa.schema([pa.field("i64", pa.int64()), - pa.field("f64", pa.float64())]) + schema = pa.schema([ + pa.field('i64', pa.int64()), + pa.field('f64', pa.float64()) + ]) for klass in [ds.DirectoryPartitioning, ds.HivePartitioning]: partitioning = klass(schema) assert isinstance(partitioning, ds.Partitioning) partitioning = ds.DirectoryPartitioning( - pa.schema([pa.field("group", pa.int64()), - pa.field("key", pa.float64())]) + pa.schema([ + pa.field('group', pa.int64()), + pa.field('key', pa.float64()) + ]) ) - expr = partitioning.parse("/3/3.14") + expr = partitioning.parse('/3/3.14') assert isinstance(expr, ds.Expression) - expected = (ds.field("group") == 3) & (ds.field("key") == 3.14) + expected = (ds.field('group') == 3) & (ds.field('key') == 3.14) assert expr.equals(expected) with pytest.raises(pa.ArrowInvalid): - partitioning.parse("/prefix/3/aaa") + partitioning.parse('/prefix/3/aaa') partitioning = ds.HivePartitioning( - pa.schema([pa.field("alpha", pa.int64()), - pa.field("beta", pa.int64())]) + pa.schema([ + pa.field('alpha', pa.int64()), + pa.field('beta', pa.int64()) + ]) + ) + expr = partitioning.parse('/alpha=0/beta=3') + expected = ( + (ds.field('alpha') == ds.scalar(0)) & + (ds.field('beta') == ds.scalar(3)) ) - expr = partitioning.parse("/alpha=0/beta=3") - expected = (ds.field("alpha") == ds.scalar(0)) & ( - ds.field("beta") == ds.scalar(3)) assert expr.equals(expected) - for shouldfail in ["/alpha=one/beta=2", "/alpha=one", "/beta=two"]: + for shouldfail in ['/alpha=one/beta=2', '/alpha=one', '/beta=two']: with pytest.raises(pa.ArrowInvalid): partitioning.parse(shouldfail) - partitioning = ds.HivePartitioning( - pa.schema([pa.field("alpha", pa.int64()), - pa.field("beta", pa.int64())]), - None, - "xyz", - ) - expr = partitioning.parse("/alpha=xyz/beta=3") - expected = (ds.field("alpha").is_null()) & ( - ds.field("beta") == ds.scalar(3)) - assert expr.equals(expected) - def test_expression_serialization(): a = ds.scalar(1) @@ -420,30 +405,14 @@ def test_expression_serialization(): c = ds.scalar(True) d = ds.scalar("string") e = ds.scalar(None) - f = ds.scalar({"a": 1}) + f = ds.scalar({'a': 1}) g = ds.scalar(pa.scalar(1)) - all_exprs = [ - a, - b, - c, - d, - e, - f, - g, - a == b, - a > b, - a & b, - a | b, - ~c, - d.is_valid(), - a.cast(pa.int32(), safe=False), - a.cast(pa.int32(), safe=False), - a.isin([1, 2, 
3]),
-        ds.field("i64") > 5,
-        ds.field("i64") == 5,
-        ds.field("i64") == 7,
-    ]
+    all_exprs = [a, b, c, d, e, f, g, a == b, a > b, a & b, a | b, ~c,
+                 d.is_valid(), a.cast(pa.int32(), safe=False),
+                 a.cast(pa.int32(), safe=False), a.isin([1, 2, 3]),
+                 ds.field('i64') > 5, ds.field('i64') == 5,
+                 ds.field('i64') == 7]
     for expr in all_exprs:
         assert isinstance(expr, ds.Expression)
         restored = pickle.loads(pickle.dumps(expr))
@@ -491,16 +460,13 @@ def test_expression_boolean_operators():


 def test_partition_keys():
-    a, b, c = [ds.field(f) == f for f in "abc"]
-    assert ds._get_partition_keys(a) == {"a": "a"}
-    assert ds._get_partition_keys(a & b & c) == {f: f for f in "abc"}
+    a, b, c = [ds.field(f) == f for f in 'abc']
+    assert ds._get_partition_keys(a) == {'a': 'a'}
+    assert ds._get_partition_keys(a & b & c) == {f: f for f in 'abc'}

-    null = ds.field("a").is_null()
-    assert ds._get_partition_keys(null) == {"a": None}
-
-    nope = ds.field("d") >= 3
+    nope = ds.field('d') >= 3
     assert ds._get_partition_keys(nope) == {}
-    assert ds._get_partition_keys(a & nope) == {"a": "a"}
+    assert ds._get_partition_keys(a & nope) == {'a': 'a'}


 def test_parquet_read_options():
@@ -542,66 +508,69 @@ def test_file_format_pickling():
     formats = [
         ds.IpcFileFormat(),
         ds.CsvFileFormat(),
-        ds.CsvFileFormat(pa.csv.ParseOptions(
-            delimiter="\t", ignore_empty_lines=True)),
+        ds.CsvFileFormat(pa.csv.ParseOptions(delimiter='\t',
+                                             ignore_empty_lines=True)),
         ds.ParquetFileFormat(),
         ds.ParquetFileFormat(
             read_options=ds.ParquetReadOptions(use_buffered_stream=True)
         ),
         ds.ParquetFileFormat(
             read_options={
-                "use_buffered_stream": True,
-                "buffer_size": 4096,
+                'use_buffered_stream': True,
+                'buffer_size': 4096,
             }
-        ),
+        )
     ]
     for file_format in formats:
         assert pickle.loads(pickle.dumps(file_format)) == file_format


-@pytest.mark.parametrize(
-    "paths_or_selector",
+@pytest.mark.parametrize('paths_or_selector', [
+    fs.FileSelector('subdir', recursive=True),
     [
         'subdir/1/xxx/file0.parquet',
         'subdir/2/yyy/file1.parquet',
     ]
 ])
 @pytest.mark.parametrize('pre_buffer', [False, True])
 def test_filesystem_factory(mockfs, paths_or_selector, pre_buffer):
     format = ds.ParquetFileFormat(
         read_options=ds.ParquetReadOptions(dictionary_columns={"str"},
                                            pre_buffer=pre_buffer)
     )
-    options = ds.FileSystemFactoryOptions("subdir")
+    options = ds.FileSystemFactoryOptions('subdir')
     options.partitioning = ds.DirectoryPartitioning(
-        pa.schema([pa.field("group", pa.int32()),
-                   pa.field("key", pa.string())])
+        pa.schema([
+            pa.field('group', pa.int32()),
+            pa.field('key', pa.string())
+        ])
     )
-    assert options.partition_base_dir == "subdir"
-    assert options.selector_ignore_prefixes == [".", "_"]
+    assert options.partition_base_dir == 'subdir'
+    assert options.selector_ignore_prefixes == ['.', '_']
     assert options.exclude_invalid_files is False

     factory = ds.FileSystemDatasetFactory(
-        mockfs, paths_or_selector, format, options)
+        mockfs, paths_or_selector, format, options
+    )
     inspected_schema = factory.inspect()

-    assert factory.inspect().equals(
-        pa.schema(
-            [
-                pa.field("i64", pa.int64()),
-                pa.field("f64", pa.float64()),
-                pa.field("str", pa.dictionary(pa.int32(), pa.string())),
-                pa.field("const", pa.int64()),
-                pa.field("group", pa.int32()),
-                pa.field("key", pa.string()),
-            ]
-        ),
-        check_metadata=False,
-    )
+    assert factory.inspect().equals(pa.schema([
+        pa.field('i64', pa.int64()),
+        pa.field('f64', pa.float64()),
+        pa.field('str', pa.dictionary(pa.int32(), pa.string())),
+        pa.field('const', pa.int64()),
+        pa.field('group', pa.int32()),
+        pa.field('key', pa.string()),
+    ]), check_metadata=False)

     assert isinstance(factory.inspect_schemas(), list)
-    assert isinstance(factory.finish(inspected_schema), ds.FileSystemDataset)
+    assert isinstance(factory.finish(inspected_schema),
+                      ds.FileSystemDataset)
     assert factory.root_partition.equals(ds.scalar(True))

     dataset = factory.finish()
@@ -613,9 +582,9 @@ def test_filesystem_factory(mockfs, paths_or_selector, pre_buffer):
     expected_f64 = pa.array([0, 1, 2, 3, 4], type=pa.float64())
     expected_str = pa.DictionaryArray.from_arrays(
         pa.array([0, 1, 2, 3, 4], type=pa.int32()),
-        pa.array("0 1 2 3 4".split(), type=pa.string()),
+        pa.array("0 1 2 3 4".split(), type=pa.string())
     )
-    for task, group, key in zip(scanner.scan(), [1, 2], ["xxx", "yyy"]):
+    for task, group, key in zip(scanner.scan(), [1, 2], ['xxx', 'yyy']):
         expected_group = pa.array([group] * 5, type=pa.int32())
         expected_key = pa.array([key] * 5, type=pa.string())
         expected_const = pa.array([group - 1] * 5, type=pa.int64())
@@ -636,16 +605,15 @@ def test_filesystem_factory(mockfs, paths_or_selector, pre_buffer):

 def test_make_fragment(multisourcefs):
     parquet_format = ds.ParquetFileFormat()
-    dataset = ds.dataset(
-        "/plain", filesystem=multisourcefs, format=parquet_format)
+    dataset = ds.dataset('/plain', filesystem=multisourcefs,
+                         format=parquet_format)

     for path in dataset.files:
         fragment = parquet_format.make_fragment(path, multisourcefs)
         assert fragment.row_groups == [0]

-        row_group_fragment = parquet_format.make_fragment(
-            path, multisourcefs, row_groups=[0]
-        )
+        row_group_fragment = parquet_format.make_fragment(path, multisourcefs,
+                                                          row_groups=[0])
         for f in [fragment, row_group_fragment]:
             assert isinstance(f, ds.ParquetFileFragment)
             assert f.path == path
@@ -654,23 +622,21 @@ def test_make_fragment(multisourcefs):

 def test_make_csv_fragment_from_buffer():
-    content = textwrap.dedent(
-        """
+    content = textwrap.dedent("""
         alpha,num,animal
         a,12,dog
         b,11,cat
         c,10,rabbit
-    """
-    )
-    buffer = pa.py_buffer(content.encode("utf-8"))
+    """)
+    buffer = pa.py_buffer(content.encode('utf-8'))
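    # Editor's note (descriptive aside, not part of the patch): the buffer
    # above holds the CSV bytes in memory, and make_fragment below is given
    # that buffer rather than a file path, so the fragment is read without
    # touching a filesystem; test_make_parquet_fragment_from_buffer exercises
    # the same pattern for Parquet.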
csv_format = ds.CsvFileFormat() fragment = csv_format.make_fragment(buffer) - expected = pa.table( - [["a", "b", "c"], [12, 11, 10], ["dog", "cat", "rabbit"]], - names=["alpha", "num", "animal"], - ) + expected = pa.table([['a', 'b', 'c'], + [12, 11, 10], + ['dog', 'cat', 'rabbit']], + names=['alpha', 'num', 'animal']) assert fragment.to_table().equals(expected) pickled = pickle.loads(pickle.dumps(fragment)) @@ -682,27 +648,29 @@ def test_make_parquet_fragment_from_buffer(): import pyarrow.parquet as pq arrays = [ - pa.array(["a", "b", "c"]), + pa.array(['a', 'b', 'c']), pa.array([12, 11, 10]), - pa.array(["dog", "cat", "rabbit"]), + pa.array(['dog', 'cat', 'rabbit']) ] dictionary_arrays = [ arrays[0].dictionary_encode(), arrays[1], - arrays[2].dictionary_encode(), + arrays[2].dictionary_encode() ] dictionary_format = ds.ParquetFileFormat( read_options=ds.ParquetReadOptions( use_buffered_stream=True, buffer_size=4096, - dictionary_columns=["alpha", "animal"], + dictionary_columns=['alpha', 'animal'] ) ) - cases = [(arrays, ds.ParquetFileFormat()), - (dictionary_arrays, dictionary_format)] + cases = [ + (arrays, ds.ParquetFileFormat()), + (dictionary_arrays, dictionary_format) + ] for arrays, format_ in cases: - table = pa.table(arrays, names=["alpha", "num", "animal"]) + table = pa.table(arrays, names=['alpha', 'num', 'animal']) out = pa.BufferOutputStream() pq.write_table(table, out) @@ -719,14 +687,15 @@ def _create_dataset_for_fragments(tempdir, chunk_size=None, filesystem=None): import pyarrow.parquet as pq table = pa.table( - [range(8), [1] * 8, ["a"] * 4 + ["b"] * 4], names=["f1", "f2", "part"] + [range(8), [1] * 8, ['a'] * 4 + ['b'] * 4], + names=['f1', 'f2', 'part'] ) path = str(tempdir / "test_parquet_dataset") # write_to_dataset currently requires pandas - pq.write_to_dataset(table, path, partition_cols=[ - "part"], chunk_size=chunk_size) + pq.write_to_dataset(table, path, + partition_cols=["part"], chunk_size=chunk_size) dataset = ds.dataset( path, format="parquet", partitioning="hive", filesystem=filesystem ) @@ -744,11 +713,11 @@ def test_fragments(tempdir): assert len(fragments) == 2 f = fragments[0] - physical_names = ["f1", "f2"] + physical_names = ['f1', 'f2'] # file's schema does not include partition column assert f.physical_schema.names == physical_names assert f.format.inspect(f.path, f.filesystem) == f.physical_schema - assert f.partition_expression.equals(ds.field("part") == "a") + assert f.partition_expression.equals(ds.field('part') == 'a') # By default, the partition column is not part of the schema. result = f.to_table() @@ -758,13 +727,13 @@ def test_fragments(tempdir): # scanning fragment includes partition columns when given the proper # schema. 
result = f.to_table(schema=dataset.schema) - assert result.column_names == ["f1", "f2", "part"] + assert result.column_names == ['f1', 'f2', 'part'] assert result.equals(table.slice(0, 4)) assert f.physical_schema == result.schema.remove(2) # scanning fragments follow filter predicate - result = f.to_table(schema=dataset.schema, filter=ds.field("f1") < 2) - assert result.column_names == ["f1", "f2", "part"] + result = f.to_table(schema=dataset.schema, filter=ds.field('f1') < 2) + assert result.column_names == ['f1', 'f2', 'part'] @pytest.mark.pandas @@ -773,11 +742,11 @@ def test_fragments_implicit_cast(tempdir): # ARROW-8693 import pyarrow.parquet as pq - table = pa.table([range(8), [1] * 4 + [2] * 4], names=["col", "part"]) + table = pa.table([range(8), [1] * 4 + [2] * 4], names=['col', 'part']) path = str(tempdir / "test_parquet_dataset") pq.write_to_dataset(table, path, partition_cols=["part"]) - part = ds.partitioning(pa.schema([("part", "int8")]), flavor="hive") + part = ds.partitioning(pa.schema([('part', 'int8')]), flavor="hive") dataset = ds.dataset(path, format="parquet", partitioning=part) fragments = dataset.get_fragments(filter=ds.field("part") >= 2) assert len(list(fragments)) == 1 @@ -788,7 +757,8 @@ def test_fragments_implicit_cast(tempdir): def test_fragments_reconstruct(tempdir): table, dataset = _create_dataset_for_fragments(tempdir) - def assert_yields_projected(fragment, row_slice, columns=None, filter=None): + def assert_yields_projected(fragment, row_slice, + columns=None, filter=None): actual = fragment.to_table( schema=table.schema, columns=columns, filter=filter) column_names = columns if columns else table.column_names @@ -806,53 +776,40 @@ def assert_yields_projected(fragment, row_slice, columns=None, filter=None): # manually re-construct a fragment, with explicit schema new_fragment = parquet_format.make_fragment( - fragment.path, - fragment.filesystem, - partition_expression=fragment.partition_expression, - ) + fragment.path, fragment.filesystem, + partition_expression=fragment.partition_expression) assert new_fragment.to_table().equals(fragment.to_table()) assert_yields_projected(new_fragment, (0, 4)) # filter / column projection, inspected schema new_fragment = parquet_format.make_fragment( - fragment.path, - fragment.filesystem, - partition_expression=fragment.partition_expression, - ) - assert_yields_projected(new_fragment, (0, 2), filter=ds.field("f1") < 2) + fragment.path, fragment.filesystem, + partition_expression=fragment.partition_expression) + assert_yields_projected(new_fragment, (0, 2), filter=ds.field('f1') < 2) # filter requiring cast / column projection, inspected schema new_fragment = parquet_format.make_fragment( - fragment.path, - fragment.filesystem, - partition_expression=fragment.partition_expression, - ) - assert_yields_projected( - new_fragment, (0, 2), columns=["f1"], filter=ds.field("f1") < 2.0 - ) + fragment.path, fragment.filesystem, + partition_expression=fragment.partition_expression) + assert_yields_projected(new_fragment, (0, 2), + columns=['f1'], filter=ds.field('f1') < 2.0) # filter on the partition column new_fragment = parquet_format.make_fragment( - fragment.path, - fragment.filesystem, - partition_expression=fragment.partition_expression, - ) + fragment.path, fragment.filesystem, + partition_expression=fragment.partition_expression) assert_yields_projected(new_fragment, (0, 4), - filter=ds.field("part") == "a") + filter=ds.field('part') == 'a') # Fragments don't contain the partition's columns if not provided to the # 
`to_table(schema=...)` method. - pattern = ( - r"No match for FieldRef.Name\(part\) in " + - fragment.physical_schema.to_string(False, False, False) - ) + pattern = (r'No match for FieldRef.Name\(part\) in ' + + fragment.physical_schema.to_string(False, False, False)) with pytest.raises(ValueError, match=pattern): new_fragment = parquet_format.make_fragment( - fragment.path, - fragment.filesystem, - partition_expression=fragment.partition_expression, - ) - new_fragment.to_table(filter=ds.field("part") == "a") + fragment.path, fragment.filesystem, + partition_expression=fragment.partition_expression) + new_fragment.to_table(filter=ds.field('part') == 'a') @pytest.mark.pandas @@ -866,21 +823,21 @@ def test_fragments_parquet_row_groups(tempdir): row_group_fragments = list(fragment.split_by_row_group()) assert len(row_group_fragments) == fragment.num_row_groups == 2 result = row_group_fragments[0].to_table(schema=dataset.schema) - assert result.column_names == ["f1", "f2", "part"] + assert result.column_names == ['f1', 'f2', 'part'] assert len(result) == 2 assert result.equals(table.slice(0, 2)) assert row_group_fragments[0].row_groups is not None assert row_group_fragments[0].num_row_groups == 1 assert row_group_fragments[0].row_groups[0].statistics == { - "f1": {"min": 0, "max": 1}, - "f2": {"min": 1, "max": 1}, + 'f1': {'min': 0, 'max': 1}, + 'f2': {'min': 1, 'max': 1}, } - fragment = list(dataset.get_fragments(filter=ds.field("f1") < 1))[0] - row_group_fragments = list(fragment.split_by_row_group(ds.field("f1") < 1)) + fragment = list(dataset.get_fragments(filter=ds.field('f1') < 1))[0] + row_group_fragments = list(fragment.split_by_row_group(ds.field('f1') < 1)) assert len(row_group_fragments) == 1 - result = row_group_fragments[0].to_table(filter=ds.field("f1") < 1) + result = row_group_fragments[0].to_table(filter=ds.field('f1') < 1) assert len(result) == 1 @@ -888,15 +845,15 @@ def test_fragments_parquet_row_groups(tempdir): def test_fragments_parquet_num_row_groups(tempdir): import pyarrow.parquet as pq - table = pa.table({"a": range(8)}) + table = pa.table({'a': range(8)}) pq.write_table(table, tempdir / "test.parquet", row_group_size=2) dataset = ds.dataset(tempdir / "test.parquet", format="parquet") original_fragment = list(dataset.get_fragments())[0] # create fragment with subset of row groups fragment = original_fragment.format.make_fragment( - original_fragment.path, original_fragment.filesystem, row_groups=[1, 3] - ) + original_fragment.path, original_fragment.filesystem, + row_groups=[1, 3]) assert fragment.num_row_groups == 2 # ensure that parsing metadata preserves correct number of row groups fragment.ensure_complete_metadata() @@ -909,16 +866,14 @@ def test_fragments_parquet_num_row_groups(tempdir): def test_fragments_parquet_row_groups_dictionary(tempdir): import pandas as pd - df = pd.DataFrame(dict(col1=["a", "b"], col2=[1, 2])) - df["col1"] = df["col1"].astype("category") + df = pd.DataFrame(dict(col1=['a', 'b'], col2=[1, 2])) + df['col1'] = df['col1'].astype("category") import pyarrow.parquet as pq - pq.write_table(pa.table(df), tempdir / "test_filter_dictionary.parquet") import pyarrow.dataset as ds - - dataset = ds.dataset(tempdir / "test_filter_dictionary.parquet") + dataset = ds.dataset(tempdir / 'test_filter_dictionary.parquet') result = dataset.to_table(filter=ds.field("col1") == "a") assert (df.iloc[0] == result.to_pandas()).all().all() @@ -929,7 +884,8 @@ def test_fragments_parquet_row_groups_dictionary(tempdir): def 
test_fragments_parquet_ensure_metadata(tempdir, open_logging_fs): fs, assert_opens = open_logging_fs _, dataset = _create_dataset_for_fragments( - tempdir, chunk_size=2, filesystem=fs) + tempdir, chunk_size=2, filesystem=fs + ) fragment = list(dataset.get_fragments())[0] # with default discovery, no metadata loaded @@ -979,38 +935,38 @@ def _create_dataset_all_types(tempdir, chunk_size=None): pa.array([1, 10, 42], pa.uint64()), pa.array([1.0, 10.0, 42.0], pa.float32()), pa.array([1.0, 10.0, 42.0], pa.float64()), - pa.array(["a", None, "z"], pa.utf8()), - pa.array(["a", None, "z"], pa.binary()), - pa.array([1, 10, 42], pa.timestamp("s")), - pa.array([1, 10, 42], pa.timestamp("ms")), - pa.array([1, 10, 42], pa.timestamp("us")), + pa.array(['a', None, 'z'], pa.utf8()), + pa.array(['a', None, 'z'], pa.binary()), + pa.array([1, 10, 42], pa.timestamp('s')), + pa.array([1, 10, 42], pa.timestamp('ms')), + pa.array([1, 10, 42], pa.timestamp('us')), pa.array([1, 10, 42], pa.date32()), pa.array([1, 10, 4200000000], pa.date64()), - pa.array([1, 10, 42], pa.time32("s")), - pa.array([1, 10, 42], pa.time64("us")), + pa.array([1, 10, 42], pa.time32('s')), + pa.array([1, 10, 42], pa.time64('us')), ], names=[ - "boolean", - "int8", - "uint8", - "int16", - "uint16", - "int32", - "uint32", - "int64", - "uint64", - "float", - "double", - "utf8", - "binary", - "ts[s]", - "ts[ms]", - "ts[us]", - "date32", - "date64", - "time32", - "time64", - ], + 'boolean', + 'int8', + 'uint8', + 'int16', + 'uint16', + 'int32', + 'uint32', + 'int64', + 'uint64', + 'float', + 'double', + 'utf8', + 'binary', + 'ts[s]', + 'ts[ms]', + 'ts[us]', + 'date32', + 'date64', + 'time32', + 'time64', + ] ) path = str(tempdir / "test_parquet_dataset_all_types") @@ -1029,16 +985,9 @@ def test_parquet_fragment_statistics(tempdir): fragment = list(dataset.get_fragments())[0] import datetime - - def dt_s(x): - return datetime.datetime(1970, 1, 1, 0, 0, x) - - def dt_ms(x): - return datetime.datetime(1970, 1, 1, 0, 0, 0, x * 1000) - - def dt_us(x): - return datetime.datetime(1970, 1, 1, 0, 0, 0, x) - + def dt_s(x): return datetime.datetime(1970, 1, 1, 0, 0, x) + def dt_ms(x): return datetime.datetime(1970, 1, 1, 0, 0, 0, x*1000) + def dt_us(x): return datetime.datetime(1970, 1, 1, 0, 0, 0, x) date = datetime.date time = datetime.time @@ -1049,26 +998,26 @@ def dt_us(x): assert row_group.num_rows == 3 assert row_group.total_byte_size > 1000 assert row_group.statistics == { - "boolean": {"min": False, "max": True}, - "int8": {"min": 1, "max": 42}, - "uint8": {"min": 1, "max": 42}, - "int16": {"min": 1, "max": 42}, - "uint16": {"min": 1, "max": 42}, - "int32": {"min": 1, "max": 42}, - "uint32": {"min": 1, "max": 42}, - "int64": {"min": 1, "max": 42}, - "uint64": {"min": 1, "max": 42}, - "float": {"min": 1.0, "max": 42.0}, - "double": {"min": 1.0, "max": 42.0}, - "utf8": {"min": "a", "max": "z"}, - "binary": {"min": b"a", "max": b"z"}, - "ts[s]": {"min": dt_s(1), "max": dt_s(42)}, - "ts[ms]": {"min": dt_ms(1), "max": dt_ms(42)}, - "ts[us]": {"min": dt_us(1), "max": dt_us(42)}, - "date32": {"min": date(1970, 1, 2), "max": date(1970, 2, 12)}, - "date64": {"min": date(1970, 1, 1), "max": date(1970, 2, 18)}, - "time32": {"min": time(0, 0, 1), "max": time(0, 0, 42)}, - "time64": {"min": time(0, 0, 0, 1), "max": time(0, 0, 0, 42)}, + 'boolean': {'min': False, 'max': True}, + 'int8': {'min': 1, 'max': 42}, + 'uint8': {'min': 1, 'max': 42}, + 'int16': {'min': 1, 'max': 42}, + 'uint16': {'min': 1, 'max': 42}, + 'int32': {'min': 1, 'max': 42}, + 'uint32': 
{'min': 1, 'max': 42}, + 'int64': {'min': 1, 'max': 42}, + 'uint64': {'min': 1, 'max': 42}, + 'float': {'min': 1.0, 'max': 42.0}, + 'double': {'min': 1.0, 'max': 42.0}, + 'utf8': {'min': 'a', 'max': 'z'}, + 'binary': {'min': b'a', 'max': b'z'}, + 'ts[s]': {'min': dt_s(1), 'max': dt_s(42)}, + 'ts[ms]': {'min': dt_ms(1), 'max': dt_ms(42)}, + 'ts[us]': {'min': dt_us(1), 'max': dt_us(42)}, + 'date32': {'min': date(1970, 1, 2), 'max': date(1970, 2, 12)}, + 'date64': {'min': date(1970, 1, 1), 'max': date(1970, 2, 18)}, + 'time32': {'min': time(0, 0, 1), 'max': time(0, 0, 42)}, + 'time64': {'min': time(0, 0, 0, 1), 'max': time(0, 0, 0, 42)}, } @@ -1076,7 +1025,7 @@ def dt_us(x): def test_parquet_fragment_statistics_nulls(tempdir): import pyarrow.parquet as pq - table = pa.table({"a": [0, 1, None, None], "b": ["a", "b", None, None]}) + table = pa.table({'a': [0, 1, None, None], 'b': ['a', 'b', None, None]}) pq.write_table(table, tempdir / "test.parquet", row_group_size=2) dataset = ds.dataset(tempdir / "test.parquet", format="parquet") @@ -1103,25 +1052,21 @@ def test_fragments_parquet_row_groups_predicate(tempdir): table, dataset = _create_dataset_for_fragments(tempdir, chunk_size=2) fragment = list(dataset.get_fragments())[0] - assert fragment.partition_expression.equals(ds.field("part") == "a") + assert fragment.partition_expression.equals(ds.field('part') == 'a') # predicate may reference a partition field not present in the # physical_schema if an explicit schema is provided to split_by_row_group # filter matches partition_expression: all row groups row_group_fragments = list( - fragment.split_by_row_group( - filter=ds.field("part") == "a", schema=dataset.schema - ) - ) + fragment.split_by_row_group(filter=ds.field('part') == 'a', + schema=dataset.schema)) assert len(row_group_fragments) == 2 # filter contradicts partition_expression: no row groups row_group_fragments = list( - fragment.split_by_row_group( - filter=ds.field("part") == "b", schema=dataset.schema - ) - ) + fragment.split_by_row_group(filter=ds.field('part') == 'b', + schema=dataset.schema)) assert len(row_group_fragments) == 0 @@ -1140,36 +1085,27 @@ def test_fragments_parquet_row_groups_reconstruct(tempdir): # manually re-construct row group fragments new_fragment = parquet_format.make_fragment( - fragment.path, - fragment.filesystem, + fragment.path, fragment.filesystem, partition_expression=fragment.partition_expression, - row_groups=[0], - ) + row_groups=[0]) result = new_fragment.to_table() assert result.equals(row_group_fragments[0].to_table()) # manually re-construct a row group fragment with filter/column projection new_fragment = parquet_format.make_fragment( - fragment.path, - fragment.filesystem, + fragment.path, fragment.filesystem, partition_expression=fragment.partition_expression, - row_groups={1}, - ) - result = new_fragment.to_table( - schema=table.schema, - columns=["f1", "part"], - filter=ds.field("f1") < 3, - ) - assert result.column_names == ["f1", "part"] + row_groups={1}) + result = new_fragment.to_table(schema=table.schema, columns=['f1', 'part'], + filter=ds.field('f1') < 3, ) + assert result.column_names == ['f1', 'part'] assert len(result) == 1 # out of bounds row group index new_fragment = parquet_format.make_fragment( - fragment.path, - fragment.filesystem, + fragment.path, fragment.filesystem, partition_expression=fragment.partition_expression, - row_groups={2}, - ) + row_groups={2}) with pytest.raises(IndexError, match="references row group 2"): new_fragment.to_table() @@ -1178,8 +1114,8 @@ def 
test_fragments_parquet_row_groups_reconstruct(tempdir): @pytest.mark.parquet def test_fragments_parquet_subset_ids(tempdir, open_logging_fs): fs, assert_opens = open_logging_fs - table, dataset = _create_dataset_for_fragments( - tempdir, chunk_size=1, filesystem=fs) + table, dataset = _create_dataset_for_fragments(tempdir, chunk_size=1, + filesystem=fs) fragment = list(dataset.get_fragments())[0] # select with row group ids @@ -1206,8 +1142,8 @@ def test_fragments_parquet_subset_ids(tempdir, open_logging_fs): @pytest.mark.parquet def test_fragments_parquet_subset_filter(tempdir, open_logging_fs): fs, assert_opens = open_logging_fs - table, dataset = _create_dataset_for_fragments( - tempdir, chunk_size=1, filesystem=fs) + table, dataset = _create_dataset_for_fragments(tempdir, chunk_size=1, + filesystem=fs) fragment = list(dataset.get_fragments())[0] # select with filter @@ -1249,43 +1185,41 @@ def test_fragments_parquet_subset_invalid(tempdir): def test_partitioning_factory(mockfs): - paths_or_selector = fs.FileSelector("subdir", recursive=True) + paths_or_selector = fs.FileSelector('subdir', recursive=True) format = ds.ParquetFileFormat() - options = ds.FileSystemFactoryOptions("subdir") - partitioning_factory = ds.DirectoryPartitioning.discover(["group", "key"]) + options = ds.FileSystemFactoryOptions('subdir') + partitioning_factory = ds.DirectoryPartitioning.discover(['group', 'key']) assert isinstance(partitioning_factory, ds.PartitioningFactory) options.partitioning_factory = partitioning_factory factory = ds.FileSystemDatasetFactory( - mockfs, paths_or_selector, format, options) + mockfs, paths_or_selector, format, options + ) inspected_schema = factory.inspect() # i64/f64 from data, group/key from "/1/xxx" and "/2/yyy" paths - expected_schema = pa.schema( - [ - ("i64", pa.int64()), - ("f64", pa.float64()), - ("str", pa.string()), - ("const", pa.int64()), - ("group", pa.int32()), - ("key", pa.string()), - ] - ) + expected_schema = pa.schema([ + ("i64", pa.int64()), + ("f64", pa.float64()), + ("str", pa.string()), + ("const", pa.int64()), + ("group", pa.int32()), + ("key", pa.string()), + ]) assert inspected_schema.equals(expected_schema) hive_partitioning_factory = ds.HivePartitioning.discover() assert isinstance(hive_partitioning_factory, ds.PartitioningFactory) -@pytest.mark.parametrize("infer_dictionary", [False, True]) +@pytest.mark.parametrize('infer_dictionary', [False, True]) def test_partitioning_factory_dictionary(mockfs, infer_dictionary): - paths_or_selector = fs.FileSelector("subdir", recursive=True) + paths_or_selector = fs.FileSelector('subdir', recursive=True) format = ds.ParquetFileFormat() - options = ds.FileSystemFactoryOptions("subdir") + options = ds.FileSystemFactoryOptions('subdir') options.partitioning_factory = ds.DirectoryPartitioning.discover( - ["group", "key"], infer_dictionary=infer_dictionary - ) + ['group', 'key'], infer_dictionary=infer_dictionary) factory = ds.FileSystemDatasetFactory( mockfs, paths_or_selector, format, options) @@ -1293,20 +1227,20 @@ def test_partitioning_factory_dictionary(mockfs, infer_dictionary): inferred_schema = factory.inspect() if infer_dictionary: expected_type = pa.dictionary(pa.int32(), pa.string()) - assert inferred_schema.field("key").type == expected_type + assert inferred_schema.field('key').type == expected_type table = factory.finish().to_table().combine_chunks() - actual = table.column("key").chunk(0) - expected = pa.array(["xxx"] * 5 + ["yyy"] * 5).dictionary_encode() + actual = table.column('key').chunk(0) + 
expected = pa.array(['xxx'] * 5 + ['yyy'] * 5).dictionary_encode() assert actual.equals(expected) # ARROW-9345 ensure filtering on the partition field works - table = factory.finish().to_table(filter=ds.field("key") == "xxx") - actual = table.column("key").chunk(0) + table = factory.finish().to_table(filter=ds.field('key') == 'xxx') + actual = table.column('key').chunk(0) expected = expected.slice(0, 5) assert actual.equals(expected) else: - assert inferred_schema.field("key").type == pa.string() + assert inferred_schema.field('key').type == pa.string() def test_partitioning_function(): @@ -1344,9 +1278,8 @@ def test_partitioning_function(): def _create_single_file(base_dir, table=None, row_group_size=None): import pyarrow.parquet as pq - if table is None: - table = pa.table({"a": range(9), "b": [0.0] * 4 + [1.0] * 5}) + table = pa.table({'a': range(9), 'b': [0.] * 4 + [1.] * 5}) path = base_dir / "test.parquet" pq.write_table(table, path, row_group_size=row_group_size) return table, path @@ -1354,11 +1287,10 @@ def _create_single_file(base_dir, table=None, row_group_size=None): def _create_directory_of_files(base_dir): import pyarrow.parquet as pq - - table1 = pa.table({"a": range(9), "b": [0.0] * 4 + [1.0] * 5}) + table1 = pa.table({'a': range(9), 'b': [0.] * 4 + [1.] * 5}) path1 = base_dir / "test1.parquet" pq.write_table(table1, path1) - table2 = pa.table({"a": range(9, 18), "b": [0.0] * 4 + [1.0] * 5}) + table2 = pa.table({'a': range(9, 18), 'b': [0.] * 4 + [1.] * 5}) path2 = base_dir / "test2.parquet" pq.write_table(table2, path2) return (table1, table2), (path1, path2) @@ -1415,9 +1347,13 @@ def test_open_dataset_list_of_files(tempdir): tables, (path1, path2) = _create_directory_of_files(tempdir) table = pa.concat_tables(tables) - datasets = [ds.dataset([path1, path2]), - ds.dataset([str(path1), str(path2)])] - datasets += [pickle.loads(pickle.dumps(d)) for d in datasets] + datasets = [ + ds.dataset([path1, path2]), + ds.dataset([str(path1), str(path2)]) + ] + datasets += [ + pickle.loads(pickle.dumps(d)) for d in datasets + ] for dataset in datasets: assert dataset.schema.equals(table.schema) @@ -1426,7 +1362,7 @@ def test_open_dataset_list_of_files(tempdir): def test_construct_from_single_file(tempdir): - directory = tempdir / "single-file" + directory = tempdir / 'single-file' directory.mkdir() table, path = _create_single_file(directory) relative_path = path.relative_to(directory) @@ -1444,7 +1380,7 @@ def test_construct_from_single_file(tempdir): def test_construct_from_single_directory(tempdir): - directory = tempdir / "single-directory" + directory = tempdir / 'single-directory' directory.mkdir() tables, paths = _create_directory_of_files(directory) @@ -1464,7 +1400,7 @@ def test_construct_from_single_directory(tempdir): def test_construct_from_list_of_files(tempdir): # instantiate from a list of files - directory = tempdir / "list-of-files" + directory = tempdir / 'list-of-files' directory.mkdir() tables, paths = _create_directory_of_files(directory) @@ -1487,19 +1423,18 @@ def test_construct_from_list_of_files(tempdir): def test_construct_from_list_of_mixed_paths_fails(mockfs): # isntantiate from a list of mixed paths files = [ - "subdir/1/xxx/file0.parquet", - "subdir/1/xxx/doesnt-exist.parquet", + 'subdir/1/xxx/file0.parquet', + 'subdir/1/xxx/doesnt-exist.parquet', ] - with pytest.raises(FileNotFoundError, match="doesnt-exist"): + with pytest.raises(FileNotFoundError, match='doesnt-exist'): ds.dataset(files, filesystem=mockfs) def 
test_construct_from_mixed_child_datasets(mockfs): # isntantiate from a list of mixed paths - a = ds.dataset( - ["subdir/1/xxx/file0.parquet", "subdir/2/yyy/file1.parquet"], filesystem=mockfs - ) - b = ds.dataset("subdir", filesystem=mockfs) + a = ds.dataset(['subdir/1/xxx/file0.parquet', + 'subdir/2/yyy/file1.parquet'], filesystem=mockfs) + b = ds.dataset('subdir', filesystem=mockfs) dataset = ds.dataset([a, b]) @@ -1512,10 +1447,8 @@ def test_construct_from_mixed_child_datasets(mockfs): assert len(dataset.children) == 2 for child in dataset.children: - assert child.files == [ - "subdir/1/xxx/file0.parquet", - "subdir/2/yyy/file1.parquet", - ] + assert child.files == ['subdir/1/xxx/file0.parquet', + 'subdir/2/yyy/file1.parquet'] def test_construct_empty_dataset(): @@ -1524,8 +1457,10 @@ def test_construct_empty_dataset(): assert table.num_rows == 0 assert table.num_columns == 0 - empty = ds.dataset([], schema=pa.schema( - [("a", pa.int64()), ("a", pa.string())])) + empty = ds.dataset([], schema=pa.schema([ + ('a', pa.int64()), + ('a', pa.string()) + ])) table = empty.to_table() assert table.num_rows == 0 assert table.num_columns == 2 @@ -1533,13 +1468,17 @@ def test_construct_empty_dataset(): def test_construct_from_invalid_sources_raise(multisourcefs): child1 = ds.FileSystemDatasetFactory( - multisourcefs, fs.FileSelector("/plain"), format=ds.ParquetFileFormat() + multisourcefs, + fs.FileSelector('/plain'), + format=ds.ParquetFileFormat() ) child2 = ds.FileSystemDatasetFactory( - multisourcefs, fs.FileSelector("/schema"), format=ds.ParquetFileFormat() + multisourcefs, + fs.FileSelector('/schema'), + format=ds.ParquetFileFormat() ) - with pytest.raises(TypeError, match="Expected.*FileSystemDatasetFactory"): + with pytest.raises(TypeError, match='Expected.*FileSystemDatasetFactory'): ds.dataset([child1, child2]) expected = ( @@ -1560,8 +1499,7 @@ def test_construct_from_invalid_sources_raise(multisourcefs): @pytest.mark.parquet def test_open_dataset_partitioned_directory(tempdir): import pyarrow.parquet as pq - - table = pa.table({"a": range(9), "b": [0.0] * 4 + [1.0] * 5}) + table = pa.table({'a': range(9), 'b': [0.] * 4 + [1.] 
* 5}) path = tempdir / "dataset" path.mkdir() @@ -1596,15 +1534,13 @@ def test_open_dataset_partitioned_directory(tempdir): dataset = ds.dataset( str(path), partitioning=ds.partitioning( - pa.schema([("part", pa.int8())]), flavor="hive"), - ) + pa.schema([("part", pa.int8())]), flavor="hive")) expected_schema = table.schema.append(pa.field("part", pa.int8())) assert dataset.schema.equals(expected_schema) result = dataset.to_table() expected = full_table.append_column( - "part", pa.array(np.repeat([0, 1, 2], 9), type=pa.int8()) - ) + "part", pa.array(np.repeat([0, 1, 2], 9), type=pa.int8())) assert result.equals(expected) @@ -1651,7 +1587,7 @@ def test_open_union_dataset(tempdir): def test_open_union_dataset_with_additional_kwargs(multisourcefs): - child = ds.dataset("/plain", filesystem=multisourcefs, format="parquet") + child = ds.dataset('/plain', filesystem=multisourcefs, format='parquet') with pytest.raises(ValueError, match="cannot pass any additional"): ds.dataset([child], format="parquet") @@ -1660,34 +1596,31 @@ def test_open_dataset_non_existing_file(): # ARROW-8213: Opening a dataset with a local incorrect path gives confusing # error message with pytest.raises(FileNotFoundError): - ds.dataset("i-am-not-existing.parquet", format="parquet") + ds.dataset('i-am-not-existing.parquet', format='parquet') - with pytest.raises(pa.ArrowInvalid, match="cannot be relative"): - ds.dataset("file:i-am-not-existing.parquet", format="parquet") + with pytest.raises(pa.ArrowInvalid, match='cannot be relative'): + ds.dataset('file:i-am-not-existing.parquet', format='parquet') @pytest.mark.parquet -@pytest.mark.parametrize("partitioning", ["directory", "hive"]) -@pytest.mark.parametrize("null_fallback", ["xyz", None]) -@pytest.mark.parametrize( - "partition_keys", - [ - (["A", "B", "C"], [1, 2, 3]), - ([1, 2, 3], ["A", "B", "C"]), - (["A", "B", "C"], ["D", "E", "F"]), - ([1, 2, 3], [4, 5, 6]), - ([1, None, 3], ["A", "B", "C"]), - ([1, 2, 3], ["A", None, "C"]), - ([None, 2, 3], [None, 2, 3]), - ], -) +@pytest.mark.parametrize('partitioning', ["directory", "hive"]) +@pytest.mark.parametrize('null_fallback', ['xyz', None]) +@pytest.mark.parametrize('partition_keys', [ + (["A", "B", "C"], [1, 2, 3]), + ([1, 2, 3], ["A", "B", "C"]), + (["A", "B", "C"], ["D", "E", "F"]), + ([1, 2, 3], [4, 5, 6]), + ([1, None, 3], ["A", "B", "C"]), + ([1, 2, 3], ["A", None, "C"]), + ([None, 2, 3], [None, 2, 3]), +]) def test_open_dataset_partitioned_dictionary_type( tempdir, partitioning, null_fallback, partition_keys ): # ARROW-9288 / ARROW-9476 import pyarrow.parquet as pq - table = pa.table({"a": range(9), "b": [0.0] * 4 + [1.0] * 5}) + table = pa.table({'a': range(9), 'b': [0.0] * 4 + [1.0] * 5}) if None in partition_keys[0] or None in partition_keys[1]: # Directory partitioning can't handle the first part being null @@ -1695,8 +1628,7 @@ def test_open_dataset_partitioned_dictionary_type( if partitioning == "directory": partitioning = ds.DirectoryPartitioning.discover( - ["part1", "part2"], infer_dictionary=True - ) + ["part1", "part2"], infer_dictionary=True) fmt = "{0}/{1}" null_value = None else: @@ -1728,27 +1660,27 @@ def test_open_dataset_partitioned_dictionary_type( def dict_type(key): value_type = pa.string() if isinstance(key, str) else pa.int32() return pa.dictionary(pa.int32(), value_type) - expected_schema = table.schema.append( pa.field("part1", dict_type(part_keys1[0])) - ).append(pa.field("part2", dict_type(part_keys2[0]))) + ).append( + pa.field("part2", dict_type(part_keys2[0])) + ) assert 
dataset.schema.equals(expected_schema) @pytest.mark.pandas def test_dataset_partitioned_dictionary_type_reconstruct(tempdir): # https://issues.apache.org/jira/browse/ARROW-11400 - table = pa.table({"part": np.repeat(["A", "B"], 5), "col": range(10)}) - part = ds.partitioning(table.select(["part"]).schema, flavor="hive") + table = pa.table({'part': np.repeat(['A', 'B'], 5), 'col': range(10)}) + part = ds.partitioning(table.select(['part']).schema, flavor="hive") ds.write_dataset(table, tempdir, partitioning=part, format="feather") dataset = ds.dataset( - tempdir, - format="feather", - partitioning=ds.HivePartitioning.discover(infer_dictionary=True), + tempdir, format="feather", + partitioning=ds.HivePartitioning.discover(infer_dictionary=True) ) expected = pa.table( - {"col": table["col"], "part": table["part"].dictionary_encode()} + {'col': table['col'], 'part': table['part'].dictionary_encode()} ) assert dataset.to_table().equals(expected) fragment = list(dataset.get_fragments())[0] @@ -1761,10 +1693,8 @@ def test_dataset_partitioned_dictionary_type_reconstruct(tempdir): restored = pickle.loads(pickle.dumps(fragment)) assert restored.to_table(schema=dataset.schema).equals(expected[:5]) # to_pandas call triggers computation of the actual dictionary values - assert ( - restored.to_table(schema=dataset.schema) - .to_pandas() - .equals(expected[:5].to_pandas()) + assert restored.to_table(schema=dataset.schema).to_pandas().equals( + expected[:5].to_pandas() ) assert restored.partition_expression.equals(part_expr) @@ -1775,14 +1705,15 @@ def s3_example_simple(s3_connection, s3_server): import pyarrow.parquet as pq host, port, access_key, secret_key = s3_connection - uri = "s3://{}:{}@mybucket/data.parquet?scheme=http&endpoint_override={}:{}".format( - access_key, secret_key, host, port + uri = ( + "s3://{}:{}@mybucket/data.parquet?scheme=http&endpoint_override={}:{}" + .format(access_key, secret_key, host, port) ) fs, path = FileSystem.from_uri(uri) fs.create_dir("mybucket") - table = pa.table({"a": [1, 2, 3]}) + table = pa.table({'a': [1, 2, 3]}) with fs.open_output_stream("mybucket/data.parquet") as out: pq.write_table(table, out) @@ -1815,7 +1746,9 @@ def test_open_dataset_from_uri_s3_fsspec(s3_example_simple): fs = s3fs.S3FileSystem( key=access_key, secret=secret_key, - client_kwargs={"endpoint_url": "http://{}:{}".format(host, port)}, + client_kwargs={ + 'endpoint_url': 'http://{}:{}'.format(host, port) + } ) # passing as fsspec filesystem @@ -1835,18 +1768,18 @@ def test_open_dataset_from_s3_with_filesystem_uri(s3_connection, s3_server): import pyarrow.parquet as pq host, port, access_key, secret_key = s3_connection - bucket = "theirbucket" - path = "nested/folder/data.parquet" + bucket = 'theirbucket' + path = 'nested/folder/data.parquet' uri = "s3://{}:{}@{}/{}?scheme=http&endpoint_override={}:{}".format( access_key, secret_key, bucket, path, host, port ) fs, path = FileSystem.from_uri(uri) - assert path == "theirbucket/nested/folder/data.parquet" + assert path == 'theirbucket/nested/folder/data.parquet' fs.create_dir(bucket) - table = pa.table({"a": [1, 2, 3]}) + table = pa.table({'a': [1, 2, 3]}) with fs.open_output_stream(path) as out: pq.write_table(table, out) @@ -1855,25 +1788,27 @@ def test_open_dataset_from_s3_with_filesystem_uri(s3_connection, s3_server): assert dataset.to_table().equals(table) # passing filesystem as an uri - template = "s3://{}:{}@{{}}?scheme=http&endpoint_override={}:{}".format( - access_key, secret_key, host, port + template = ( + 
"s3://{}:{}@{{}}?scheme=http&endpoint_override={}:{}".format( + access_key, secret_key, host, port + ) ) cases = [ - ("theirbucket/nested/folder/", "/data.parquet"), - ("theirbucket/nested/folder", "data.parquet"), - ("theirbucket/nested/", "folder/data.parquet"), - ("theirbucket/nested", "folder/data.parquet"), - ("theirbucket", "/nested/folder/data.parquet"), - ("theirbucket", "nested/folder/data.parquet"), + ('theirbucket/nested/folder/', '/data.parquet'), + ('theirbucket/nested/folder', 'data.parquet'), + ('theirbucket/nested/', 'folder/data.parquet'), + ('theirbucket/nested', 'folder/data.parquet'), + ('theirbucket', '/nested/folder/data.parquet'), + ('theirbucket', 'nested/folder/data.parquet'), ] for prefix, path in cases: uri = template.format(prefix) dataset = ds.dataset(path, filesystem=uri, format="parquet") assert dataset.to_table().equals(table) - with pytest.raises(pa.ArrowInvalid, match="Missing bucket name"): - uri = template.format("/") - ds.dataset("/theirbucket/nested/folder/data.parquet", filesystem=uri) + with pytest.raises(pa.ArrowInvalid, match='Missing bucket name'): + uri = template.format('/') + ds.dataset('/theirbucket/nested/folder/data.parquet', filesystem=uri) error = ( "The path component of the filesystem URI must point to a directory " @@ -1881,17 +1816,17 @@ def test_open_dataset_from_s3_with_filesystem_uri(s3_connection, s3_server): "filesystem URI is `{}`" ) - path = "theirbucket/doesnt/exist" + path = 'theirbucket/doesnt/exist' uri = template.format(path) with pytest.raises(ValueError) as exc: - ds.dataset("data.parquet", filesystem=uri) - assert str(exc.value) == error.format("NotFound", path, uri) + ds.dataset('data.parquet', filesystem=uri) + assert str(exc.value) == error.format('NotFound', path, uri) - path = "theirbucket/nested/folder/data.parquet" + path = 'theirbucket/nested/folder/data.parquet' uri = template.format(path) with pytest.raises(ValueError) as exc: - ds.dataset("data.parquet", filesystem=uri) - assert str(exc.value) == error.format("File", path, uri) + ds.dataset('data.parquet', filesystem=uri) + assert str(exc.value) == error.format('File', path, uri) @pytest.mark.parquet @@ -1936,17 +1871,18 @@ def test_filter_timestamp(tempdir): @pytest.mark.parquet def test_filter_implicit_cast(tempdir): # ARROW-7652 - table = pa.table({"a": pa.array([0, 1, 2, 3, 4, 5], type=pa.int8())}) + table = pa.table({'a': pa.array([0, 1, 2, 3, 4, 5], type=pa.int8())}) _, path = _create_single_file(tempdir, table) dataset = ds.dataset(str(path)) - filter_ = ds.field("a") > 2 + filter_ = ds.field('a') > 2 assert len(dataset.to_table(filter=filter_)) == 3 def test_dataset_union(multisourcefs): child = ds.FileSystemDatasetFactory( - multisourcefs, fs.FileSelector("/plain"), format=ds.ParquetFileFormat() + multisourcefs, fs.FileSelector('/plain'), + format=ds.ParquetFileFormat() ) factory = ds.UnionDatasetFactory([child]) @@ -1959,128 +1895,106 @@ def test_dataset_union(multisourcefs): def test_union_dataset_from_other_datasets(tempdir, multisourcefs): - child1 = ds.dataset("/plain", filesystem=multisourcefs, format="parquet") - child2 = ds.dataset( - "/schema", - filesystem=multisourcefs, - format="parquet", - partitioning=["week", "color"], - ) - child3 = ds.dataset( - "/hive", filesystem=multisourcefs, format="parquet", partitioning="hive" - ) + child1 = ds.dataset('/plain', filesystem=multisourcefs, format='parquet') + child2 = ds.dataset('/schema', filesystem=multisourcefs, format='parquet', + partitioning=['week', 'color']) + child3 = ds.dataset('/hive', 
filesystem=multisourcefs, format='parquet', + partitioning='hive') assert child1.schema != child2.schema != child3.schema assembled = ds.dataset([child1, child2, child3]) assert isinstance(assembled, ds.UnionDataset) - msg = "cannot pass any additional arguments" + msg = 'cannot pass any additional arguments' with pytest.raises(ValueError, match=msg): ds.dataset([child1, child2], filesystem=multisourcefs) - expected_schema = pa.schema( - [ - ("date", pa.date32()), - ("index", pa.int64()), - ("value", pa.float64()), - ("color", pa.string()), - ("week", pa.int32()), - ("year", pa.int32()), - ("month", pa.int32()), - ] - ) + expected_schema = pa.schema([ + ('date', pa.date32()), + ('index', pa.int64()), + ('value', pa.float64()), + ('color', pa.string()), + ('week', pa.int32()), + ('year', pa.int32()), + ('month', pa.int32()), + ]) assert assembled.schema.equals(expected_schema) assert assembled.to_table().schema.equals(expected_schema) assembled = ds.dataset([child1, child3]) - expected_schema = pa.schema( - [ - ("date", pa.date32()), - ("index", pa.int64()), - ("value", pa.float64()), - ("color", pa.string()), - ("year", pa.int32()), - ("month", pa.int32()), - ] - ) + expected_schema = pa.schema([ + ('date', pa.date32()), + ('index', pa.int64()), + ('value', pa.float64()), + ('color', pa.string()), + ('year', pa.int32()), + ('month', pa.int32()), + ]) assert assembled.schema.equals(expected_schema) assert assembled.to_table().schema.equals(expected_schema) - expected_schema = pa.schema( - [ - ("month", pa.int32()), - ("color", pa.string()), - ("date", pa.date32()), - ] - ) + expected_schema = pa.schema([ + ('month', pa.int32()), + ('color', pa.string()), + ('date', pa.date32()), + ]) assembled = ds.dataset([child1, child3], schema=expected_schema) assert assembled.to_table().schema.equals(expected_schema) - expected_schema = pa.schema( - [ - ("month", pa.int32()), - ("color", pa.string()), - ("unknown", pa.string()), # fill with nulls - ] - ) + expected_schema = pa.schema([ + ('month', pa.int32()), + ('color', pa.string()), + ('unknown', pa.string()) # fill with nulls + ]) assembled = ds.dataset([child1, child3], schema=expected_schema) assert assembled.to_table().schema.equals(expected_schema) # incompatible schemas, date and index columns have conflicting types - table = pa.table( - [range(9), [0.0] * 4 + [1.0] * 5, "abcdefghj"], names=["date", "value", "index"] - ) + table = pa.table([range(9), [0.] * 4 + [1.] 
* 5, 'abcdefghj'], + names=['date', 'value', 'index']) _, path = _create_single_file(tempdir, table=table) child4 = ds.dataset(path) - with pytest.raises(pa.ArrowInvalid, match="Unable to merge"): + with pytest.raises(pa.ArrowInvalid, match='Unable to merge'): ds.dataset([child1, child4]) def test_dataset_from_a_list_of_local_directories_raises(multisourcefs): - msg = "points to a directory, but only file paths are supported" + msg = 'points to a directory, but only file paths are supported' with pytest.raises(IsADirectoryError, match=msg): - ds.dataset(["/plain", "/schema", "/hive"], filesystem=multisourcefs) + ds.dataset(['/plain', '/schema', '/hive'], filesystem=multisourcefs) def test_union_dataset_filesystem_datasets(multisourcefs): # without partitioning - dataset = ds.dataset( - [ - ds.dataset("/plain", filesystem=multisourcefs), - ds.dataset("/schema", filesystem=multisourcefs), - ds.dataset("/hive", filesystem=multisourcefs), - ] - ) - expected_schema = pa.schema( - [ - ("date", pa.date32()), - ("index", pa.int64()), - ("value", pa.float64()), - ("color", pa.string()), - ] - ) + dataset = ds.dataset([ + ds.dataset('/plain', filesystem=multisourcefs), + ds.dataset('/schema', filesystem=multisourcefs), + ds.dataset('/hive', filesystem=multisourcefs), + ]) + expected_schema = pa.schema([ + ('date', pa.date32()), + ('index', pa.int64()), + ('value', pa.float64()), + ('color', pa.string()), + ]) assert dataset.schema.equals(expected_schema) # with hive partitioning for two hive sources - dataset = ds.dataset( - [ - ds.dataset("/plain", filesystem=multisourcefs), - ds.dataset("/schema", filesystem=multisourcefs), - ds.dataset("/hive", filesystem=multisourcefs, partitioning="hive"), - ] - ) - expected_schema = pa.schema( - [ - ("date", pa.date32()), - ("index", pa.int64()), - ("value", pa.float64()), - ("color", pa.string()), - ("year", pa.int32()), - ("month", pa.int32()), - ] - ) + dataset = ds.dataset([ + ds.dataset('/plain', filesystem=multisourcefs), + ds.dataset('/schema', filesystem=multisourcefs), + ds.dataset('/hive', filesystem=multisourcefs, partitioning='hive') + ]) + expected_schema = pa.schema([ + ('date', pa.date32()), + ('index', pa.int64()), + ('value', pa.float64()), + ('color', pa.string()), + ('year', pa.int32()), + ('month', pa.int32()), + ]) assert dataset.schema.equals(expected_schema) @@ -2088,7 +2002,7 @@ def test_union_dataset_filesystem_datasets(multisourcefs): def test_specified_schema(tempdir): import pyarrow.parquet as pq - table = pa.table({"a": [1, 2, 3], "b": [0.1, 0.2, 0.3]}) + table = pa.table({'a': [1, 2, 3], 'b': [.1, .2, .3]}) pq.write_table(table, tempdir / "data.parquet") def _check_dataset(schema, expected, expected_schema=None): @@ -2111,24 +2025,24 @@ def _check_dataset(schema, expected, expected_schema=None): _check_dataset(schema, expected) # Specifying schema with change column order - schema = pa.schema([("b", "float64"), ("a", "int64")]) - expected = pa.table([[0.1, 0.2, 0.3], [1, 2, 3]], names=["b", "a"]) + schema = pa.schema([('b', 'float64'), ('a', 'int64')]) + expected = pa.table([[.1, .2, .3], [1, 2, 3]], names=['b', 'a']) _check_dataset(schema, expected) # Specifying schema with missing column - schema = pa.schema([("a", "int64")]) - expected = pa.table([[1, 2, 3]], names=["a"]) + schema = pa.schema([('a', 'int64')]) + expected = pa.table([[1, 2, 3]], names=['a']) _check_dataset(schema, expected) # Specifying schema with additional column - schema = pa.schema([("a", "int64"), ("c", "int32")]) - expected = pa.table( - [[1, 2, 3], 
pa.array([None, None, None], type="int32")], names=["a", "c"] - ) + schema = pa.schema([('a', 'int64'), ('c', 'int32')]) + expected = pa.table([[1, 2, 3], + pa.array([None, None, None], type='int32')], + names=['a', 'c']) _check_dataset(schema, expected) # Specifying with incompatible schema - schema = pa.schema([("a", "int32"), ("b", "float64")]) + schema = pa.schema([('a', 'int32'), ('b', 'float64')]) dataset = ds.dataset(str(tempdir / "data.parquet"), schema=schema) assert dataset.schema.equals(schema) with pytest.raises(TypeError): @@ -2136,14 +2050,10 @@ def _check_dataset(schema, expected, expected_schema=None): def test_ipc_format(tempdir): - table = pa.table( - { - "a": pa.array([1, 2, 3], type="int8"), - "b": pa.array([0.1, 0.2, 0.3], type="float64"), - } - ) + table = pa.table({'a': pa.array([1, 2, 3], type="int8"), + 'b': pa.array([.1, .2, .3], type="float64")}) - path = str(tempdir / "test.arrow") + path = str(tempdir / 'test.arrow') with pa.output_stream(path) as sink: writer = pa.RecordBatchFileWriter(sink, table.schema) writer.write_batch(table.to_batches()[0]) @@ -2161,21 +2071,17 @@ def test_ipc_format(tempdir): @pytest.mark.pandas def test_csv_format(tempdir): - table = pa.table( - { - "a": pa.array([1, 2, 3], type="int64"), - "b": pa.array([0.1, 0.2, 0.3], type="float64"), - } - ) + table = pa.table({'a': pa.array([1, 2, 3], type="int64"), + 'b': pa.array([.1, .2, .3], type="float64")}) - path = str(tempdir / "test.csv") + path = str(tempdir / 'test.csv') table.to_pandas().to_csv(path, index=False) dataset = ds.dataset(path, format=ds.CsvFileFormat()) result = dataset.to_table() assert result.equals(table) - dataset = ds.dataset(path, format="csv") + dataset = ds.dataset(path, format='csv') result = dataset.to_table() assert result.equals(table) @@ -2183,12 +2089,8 @@ def test_csv_format(tempdir): def test_feather_format(tempdir): from pyarrow.feather import write_feather - table = pa.table( - { - "a": pa.array([1, 2, 3], type="int8"), - "b": pa.array([0.1, 0.2, 0.3], type="float64"), - } - ) + table = pa.table({'a': pa.array([1, 2, 3], type="int8"), + 'b': pa.array([.1, .2, .3], type="float64")}) basedir = tempdir / "feather_dataset" basedir.mkdir() @@ -2220,15 +2122,16 @@ def _create_parquet_dataset_simple(root_path): metadata_collector = [] for i in range(4): - table = pa.table({"f1": [i] * 10, "f2": np.random.randn(10)}) + table = pa.table({'f1': [i] * 10, 'f2': np.random.randn(10)}) pq.write_to_dataset( table, str(root_path), metadata_collector=metadata_collector ) - metadata_path = str(root_path / "_metadata") + metadata_path = str(root_path / '_metadata') # write _metadata file pq.write_metadata( - table.schema, metadata_path, metadata_collector=metadata_collector + table.schema, metadata_path, + metadata_collector=metadata_collector ) return metadata_path, table @@ -2273,24 +2176,22 @@ def _create_metadata_file(root_path): metadata_collector.append(metadata) metadata_path = root_path / "_metadata" - pq.write_metadata(schema, metadata_path, - metadata_collector=metadata_collector) + pq.write_metadata( + schema, metadata_path, metadata_collector=metadata_collector + ) return metadata_path def _create_parquet_dataset_partitioned(root_path): import pyarrow.parquet as pq - table = pa.table( - [ - pa.array(range(20)), - pa.array(np.random.randn(20)), - pa.array(np.repeat(["a", "b"], 10)), - ], - names=["f1", "f2", "part"], + table = pa.table([ + pa.array(range(20)), pa.array(np.random.randn(20)), + pa.array(np.repeat(['a', 'b'], 10))], + names=["f1", "f2", "part"] ) 
table = table.replace_schema_metadata({"key": "value"}) - pq.write_to_dataset(table, str(root_path), partition_cols=["part"]) + pq.write_to_dataset(table, str(root_path), partition_cols=['part']) return _create_metadata_file(root_path), table @@ -2343,8 +2244,9 @@ def test_parquet_dataset_lazy_filtering(tempdir, open_logging_fs): # creating the dataset should only open the metadata file with assert_opens([metadata_path]): dataset = ds.parquet_dataset( - metadata_path, partitioning=ds.partitioning(flavor="hive"), filesystem=fs - ) + metadata_path, + partitioning=ds.partitioning(flavor="hive"), + filesystem=fs) # materializing fragments should not open any file with assert_opens([]): @@ -2373,7 +2275,7 @@ def test_parquet_dataset_lazy_filtering(tempdir, open_logging_fs): @pytest.mark.pandas def test_dataset_schema_metadata(tempdir): # ARROW-8802 - df = pd.DataFrame({"a": [1, 2, 3]}) + df = pd.DataFrame({'a': [1, 2, 3]}) path = tempdir / "test.parquet" df.to_parquet(path) dataset = ds.dataset(path) @@ -2392,13 +2294,13 @@ def test_filter_mismatching_schema(tempdir): # ARROW-9146 import pyarrow.parquet as pq - table = pa.table({"col": pa.array([1, 2, 3, 4], type="int32")}) + table = pa.table({"col": pa.array([1, 2, 3, 4], type='int32')}) pq.write_table(table, str(tempdir / "data.parquet")) # specifying explicit schema, but that mismatches the schema of the data schema = pa.schema([("col", pa.int64())]) - dataset = ds.dataset(tempdir / "data.parquet", - format="parquet", schema=schema) + dataset = ds.dataset( + tempdir / "data.parquet", format="parquet", schema=schema) # filtering on a column with such type mismatch should give a proper error with pytest.raises(TypeError): @@ -2415,72 +2317,65 @@ def test_dataset_project_only_partition_columns(tempdir): # ARROW-8729 import pyarrow.parquet as pq - table = pa.table({"part": "a a b b".split(), "col": list(range(4))}) + table = pa.table({'part': 'a a b b'.split(), 'col': list(range(4))}) - path = str(tempdir / "test_dataset") - pq.write_to_dataset(table, path, partition_cols=["part"]) - dataset = ds.dataset(path, partitioning="hive") + path = str(tempdir / 'test_dataset') + pq.write_to_dataset(table, path, partition_cols=['part']) + dataset = ds.dataset(path, partitioning='hive') all_cols = dataset.to_table(use_threads=False) - part_only = dataset.to_table(columns=["part"], use_threads=False) + part_only = dataset.to_table(columns=['part'], use_threads=False) - assert all_cols.column("part").equals(part_only.column("part")) + assert all_cols.column('part').equals(part_only.column('part')) @pytest.mark.parquet @pytest.mark.pandas def test_dataset_project_null_column(tempdir): import pandas as pd - - df = pd.DataFrame({"col": np.array([None, None, None], dtype="object")}) + df = pd.DataFrame({"col": np.array([None, None, None], dtype='object')}) f = tempdir / "test_dataset_project_null_column.parquet" df.to_parquet(f, engine="pyarrow") dataset = ds.dataset(f, format="parquet", schema=pa.schema([("col", pa.int64())])) - expected = pa.table({"col": pa.array([None, None, None], pa.int64())}) + expected = pa.table({'col': pa.array([None, None, None], pa.int64())}) assert dataset.to_table().equals(expected) -def _check_dataset_roundtrip( - dataset, base_dir, expected_files, base_dir_path=None, partitioning=None -): +def _check_dataset_roundtrip(dataset, base_dir, expected_files, + base_dir_path=None, partitioning=None): base_dir_path = base_dir_path or base_dir - ds.write_dataset( - dataset, - base_dir, - format="feather", - partitioning=partitioning, - 
use_threads=False, - ) + ds.write_dataset(dataset, base_dir, format="feather", + partitioning=partitioning, use_threads=False) # check that all files are present file_paths = list(base_dir_path.rglob("*")) assert set(file_paths) == set(expected_files) # check that reading back in as dataset gives the same result - dataset2 = ds.dataset(base_dir_path, format="feather", - partitioning=partitioning) + dataset2 = ds.dataset( + base_dir_path, format="feather", partitioning=partitioning) assert dataset2.to_table().equals(dataset.to_table()) @pytest.mark.parquet def test_write_dataset(tempdir): # manually create a written dataset and read as dataset object - directory = tempdir / "single-file" + directory = tempdir / 'single-file' directory.mkdir() _ = _create_single_file(directory) dataset = ds.dataset(directory) # full string path - target = tempdir / "single-file-target" + target = tempdir / 'single-file-target' expected_files = [target / "part-0.feather"] _check_dataset_roundtrip(dataset, str(target), expected_files, target) # pathlib path object - target = tempdir / "single-file-target2" + target = tempdir / 'single-file-target2' expected_files = [target / "part-0.feather"] _check_dataset_roundtrip(dataset, target, expected_files, target) @@ -2492,12 +2387,12 @@ def test_write_dataset(tempdir): # dataset, './single-file-target3', expected_files, target) # Directory of files - directory = tempdir / "single-directory" + directory = tempdir / 'single-directory' directory.mkdir() _ = _create_directory_of_files(directory) dataset = ds.dataset(directory) - target = tempdir / "single-directory-target" + target = tempdir / 'single-directory-target' expected_files = [target / "part-0.feather"] _check_dataset_roundtrip(dataset, str(target), expected_files, target) @@ -2511,32 +2406,28 @@ def test_write_dataset_partitioned(tempdir): dataset = ds.dataset(directory, partitioning=partitioning) # hive partitioning - target = tempdir / "partitioned-hive-target" + target = tempdir / 'partitioned-hive-target' expected_paths = [ - target / "part=a", - target / "part=a" / "part-0.feather", - target / "part=b", - target / "part=b" / "part-1.feather", + target / "part=a", target / "part=a" / "part-0.feather", + target / "part=b", target / "part=b" / "part-1.feather" ] partitioning_schema = ds.partitioning( - pa.schema([("part", pa.string())]), flavor="hive" - ) + pa.schema([("part", pa.string())]), flavor="hive") _check_dataset_roundtrip( - dataset, str(target), expected_paths, target, partitioning=partitioning_schema - ) + dataset, str(target), expected_paths, target, + partitioning=partitioning_schema) # directory partitioning - target = tempdir / "partitioned-dir-target" + target = tempdir / 'partitioned-dir-target' expected_paths = [ - target / "a", - target / "a" / "part-0.feather", - target / "b", - target / "b" / "part-1.feather", + target / "a", target / "a" / "part-0.feather", + target / "b", target / "b" / "part-1.feather" ] - partitioning_schema = ds.partitioning(pa.schema([("part", pa.string())])) + partitioning_schema = ds.partitioning( + pa.schema([("part", pa.string())])) _check_dataset_roundtrip( - dataset, str(target), expected_paths, target, partitioning=partitioning_schema - ) + dataset, str(target), expected_paths, target, + partitioning=partitioning_schema) @pytest.mark.parquet @@ -2547,26 +2438,22 @@ def test_write_dataset_partitioned_dict(tempdir): # directory partitioning, dictionary partition columns dataset = ds.dataset( - directory, partitioning=ds.HivePartitioning.discover( - 
infer_dictionary=True) - ) - target = tempdir / "partitioned-dir-target" + directory, + partitioning=ds.HivePartitioning.discover(infer_dictionary=True)) + target = tempdir / 'partitioned-dir-target' expected_paths = [ - target / "a", - target / "a" / "part-0.feather", - target / "b", - target / "b" / "part-1.feather", + target / "a", target / "a" / "part-0.feather", + target / "b", target / "b" / "part-1.feather" ] - partitioning = ds.partitioning( - pa.schema([dataset.schema.field("part")]), - dictionaries={"part": pa.array(["a", "b"])}, - ) + partitioning = ds.partitioning(pa.schema([ + dataset.schema.field('part')]), + dictionaries={'part': pa.array(['a', 'b'])}) # NB: dictionaries required here since we use partitioning to parse # directories in _check_dataset_roundtrip (not currently required for # the formatting step) _check_dataset_roundtrip( - dataset, str(target), expected_paths, target, partitioning=partitioning - ) + dataset, str(target), expected_paths, target, + partitioning=partitioning) @pytest.mark.parquet @@ -2579,13 +2466,15 @@ def test_write_dataset_use_threads(tempdir): partitioning = ds.partitioning( pa.schema([("part", pa.string())]), flavor="hive") - target1 = tempdir / "partitioned1" + target1 = tempdir / 'partitioned1' ds.write_dataset( - dataset, target1, format="feather", partitioning=partitioning, use_threads=True + dataset, target1, format="feather", partitioning=partitioning, + use_threads=True ) - target2 = tempdir / "partitioned2" + target2 = tempdir / 'partitioned2' ds.write_dataset( - dataset, target2, format="feather", partitioning=partitioning, use_threads=False + dataset, target2, format="feather", partitioning=partitioning, + use_threads=False ) # check that reading in gives same result @@ -2595,19 +2484,14 @@ def test_write_dataset_use_threads(tempdir): def test_write_table(tempdir): - table = pa.table( - [ - pa.array(range(20)), - pa.array(np.random.randn(20)), - pa.array(np.repeat(["a", "b"], 10)), - ], - names=["f1", "f2", "part"], - ) - - base_dir = tempdir / "single" - ds.write_dataset( - table, base_dir, basename_template="dat_{i}.arrow", format="feather" - ) + table = pa.table([ + pa.array(range(20)), pa.array(np.random.randn(20)), + pa.array(np.repeat(['a', 'b'], 10)) + ], names=["f1", "f2", "part"]) + + base_dir = tempdir / 'single' + ds.write_dataset(table, base_dir, + basename_template='dat_{i}.arrow', format="feather") # check that all files are present file_paths = list(base_dir.rglob("*")) expected_paths = [base_dir / "dat_0.arrow"] @@ -2617,22 +2501,16 @@ def test_write_table(tempdir): assert result.equals(table) # with partitioning - base_dir = tempdir / "partitioned" + base_dir = tempdir / 'partitioned' partitioning = ds.partitioning( pa.schema([("part", pa.string())]), flavor="hive") - ds.write_dataset( - table, - base_dir, - format="feather", - basename_template="dat_{i}.arrow", - partitioning=partitioning, - ) + ds.write_dataset(table, base_dir, format="feather", + basename_template='dat_{i}.arrow', + partitioning=partitioning) file_paths = list(base_dir.rglob("*")) expected_paths = [ - base_dir / "part=a", - base_dir / "part=a" / "dat_0.arrow", - base_dir / "part=b", - base_dir / "part=b" / "dat_1.arrow", + base_dir / "part=a", base_dir / "part=a" / "dat_0.arrow", + base_dir / "part=b", base_dir / "part=b" / "dat_1.arrow" ] assert set(file_paths) == set(expected_paths) result = ds.dataset(base_dir, format="ipc", partitioning=partitioning) @@ -2640,66 +2518,59 @@ def test_write_table(tempdir): def 
test_write_table_multiple_fragments(tempdir): - table = pa.table( - [ - pa.array(range(10)), - pa.array(np.random.randn(10)), - pa.array(np.repeat(["a", "b"], 5)), - ], - names=["f1", "f2", "part"], - ) - table = pa.concat_tables([table] * 2) + table = pa.table([ + pa.array(range(10)), pa.array(np.random.randn(10)), + pa.array(np.repeat(['a', 'b'], 5)) + ], names=["f1", "f2", "part"]) + table = pa.concat_tables([table]*2) # Table with multiple batches written as single Fragment by default - base_dir = tempdir / "single" + base_dir = tempdir / 'single' ds.write_dataset(table, base_dir, format="feather") assert set(base_dir.rglob("*")) == set([base_dir / "part-0.feather"]) assert ds.dataset(base_dir, format="ipc").to_table().equals(table) # Same for single-element list of Table - base_dir = tempdir / "single-list" + base_dir = tempdir / 'single-list' ds.write_dataset([table], base_dir, format="feather") assert set(base_dir.rglob("*")) == set([base_dir / "part-0.feather"]) assert ds.dataset(base_dir, format="ipc").to_table().equals(table) # Provide list of batches to write multiple fragments - base_dir = tempdir / "multiple" + base_dir = tempdir / 'multiple' ds.write_dataset(table.to_batches(), base_dir, format="feather") - assert set(base_dir.rglob("*")) == set([base_dir / "part-0.feather"]) + assert set(base_dir.rglob("*")) == set( + [base_dir / "part-0.feather"]) assert ds.dataset(base_dir, format="ipc").to_table().equals(table) # Provide list of tables to write multiple fragments - base_dir = tempdir / "multiple-table" + base_dir = tempdir / 'multiple-table' ds.write_dataset([table, table], base_dir, format="feather") - assert set(base_dir.rglob("*")) == set([base_dir / "part-0.feather"]) - assert ( - ds.dataset(base_dir, format="ipc") - .to_table() - .equals(pa.concat_tables([table] * 2)) + assert set(base_dir.rglob("*")) == set( + [base_dir / "part-0.feather"]) + assert ds.dataset(base_dir, format="ipc").to_table().equals( + pa.concat_tables([table]*2) ) def test_write_table_partitioned_dict(tempdir): # ensure writing table partitioned on a dictionary column works without # specifying the dictionary values explicitly - table = pa.table( - [ - pa.array(range(20)), - pa.array(np.repeat(["a", "b"], 10)).dictionary_encode(), - ], - names=["col", "part"], - ) + table = pa.table([ + pa.array(range(20)), + pa.array(np.repeat(['a', 'b'], 10)).dictionary_encode(), + ], names=['col', 'part']) partitioning = ds.partitioning(table.select(["part"]).schema) base_dir = tempdir / "dataset" - ds.write_dataset(table, base_dir, format="feather", - partitioning=partitioning) + ds.write_dataset( + table, base_dir, format="feather", partitioning=partitioning + ) # check roundtrip partitioning_read = ds.DirectoryPartitioning.discover( - ["part"], infer_dictionary=True - ) + ["part"], infer_dictionary=True) result = ds.dataset( base_dir, format="ipc", partitioning=partitioning_read ).to_table() @@ -2710,18 +2581,14 @@ def test_write_table_partitioned_dict(tempdir): def test_write_dataset_parquet(tempdir): import pyarrow.parquet as pq - table = pa.table( - [ - pa.array(range(20)), - pa.array(np.random.randn(20)), - pa.array(np.repeat(["a", "b"], 10)), - ], - names=["f1", "f2", "part"], - ) + table = pa.table([ + pa.array(range(20)), pa.array(np.random.randn(20)), + pa.array(np.repeat(['a', 'b'], 10)) + ], names=["f1", "f2", "part"]) # using default "parquet" format string - base_dir = tempdir / "parquet_dataset" + base_dir = tempdir / 'parquet_dataset' ds.write_dataset(table, base_dir, format="parquet") # check 
that all files are present file_paths = list(base_dir.rglob("*")) @@ -2735,7 +2602,7 @@ def test_write_dataset_parquet(tempdir): for version in ["1.0", "2.0"]: format = ds.ParquetFileFormat() opts = format.make_write_options(version=version) - base_dir = tempdir / "parquet_dataset_version{0}".format(version) + base_dir = tempdir / 'parquet_dataset_version{0}'.format(version) ds.write_dataset(table, base_dir, format=format, file_options=opts) meta = pq.read_metadata(base_dir / "part-0.parquet") assert meta.format_version == version @@ -2760,12 +2627,12 @@ def test_write_dataset_schema_metadata(tempdir): # ensure that schema metadata gets written from pyarrow import feather - table = pa.table({"a": [1, 2, 3]}) - table = table.replace_schema_metadata({b"key": b"value"}) + table = pa.table({'a': [1, 2, 3]}) + table = table.replace_schema_metadata({b'key': b'value'}) ds.write_dataset(table, tempdir, format="feather") schema = feather.read_table(tempdir / "part-0.feather").schema - assert schema.metadata == {b"key": b"value"} + assert schema.metadata == {b'key': b'value'} @pytest.mark.parquet @@ -2773,12 +2640,12 @@ def test_write_dataset_schema_metadata_parquet(tempdir): # ensure that schema metadata gets written import pyarrow.parquet as pq - table = pa.table({"a": [1, 2, 3]}) - table = table.replace_schema_metadata({b"key": b"value"}) + table = pa.table({'a': [1, 2, 3]}) + table = table.replace_schema_metadata({b'key': b'value'}) ds.write_dataset(table, tempdir, format="parquet") schema = pq.read_table(tempdir / "part-0.parquet").schema - assert schema.metadata == {b"key": b"value"} + assert schema.metadata == {b'key': b'value'} @pytest.mark.parquet @@ -2786,23 +2653,22 @@ def test_write_dataset_schema_metadata_parquet(tempdir): def test_write_dataset_s3(s3_example_simple): # write dataset with s3 filesystem _, _, fs, _, host, port, access_key, secret_key = s3_example_simple - uri_template = "s3://{}:{}@{{}}?scheme=http&endpoint_override={}:{}".format( - access_key, secret_key, host, port + uri_template = ( + "s3://{}:{}@{{}}?scheme=http&endpoint_override={}:{}".format( + access_key, secret_key, host, port) ) - table = pa.table( - [ - pa.array(range(20)), - pa.array(np.random.randn(20)), - pa.array(np.repeat(["a", "b"], 10)), - ], - names=["f1", "f2", "part"], + table = pa.table([ + pa.array(range(20)), pa.array(np.random.randn(20)), + pa.array(np.repeat(['a', 'b'], 10))], + names=["f1", "f2", "part"] ) part = ds.partitioning(pa.schema([("part", pa.string())]), flavor="hive") # writing with filesystem object ds.write_dataset( - table, "mybucket/dataset", filesystem=fs, format="feather", partitioning=part + table, "mybucket/dataset", filesystem=fs, format="feather", + partitioning=part ) # check rountrip result = ds.dataset( From 982f68c1afefeedf1656ebd14f5b51ef0f6c796a Mon Sep 17 00:00:00 2001 From: Weston Pace Date: Mon, 15 Feb 2021 11:12:40 -1000 Subject: [PATCH 18/33] Added more tests, rounded out a few behaviors --- cpp/src/arrow/dataset/partition.cc | 15 ++- cpp/src/arrow/dataset/partition.h | 1 + cpp/src/arrow/dataset/partition_test.cc | 40 ++++++-- python/pyarrow/tests/test_dataset.py | 123 +++++++++++++++++++++--- 4 files changed, 153 insertions(+), 26 deletions(-) diff --git a/cpp/src/arrow/dataset/partition.cc b/cpp/src/arrow/dataset/partition.cc index 2afaf414f9d..46142560c13 100644 --- a/cpp/src/arrow/dataset/partition.cc +++ b/cpp/src/arrow/dataset/partition.cc @@ -98,12 +98,16 @@ Status KeyValuePartitioning::SetDefaultValuesFromKeys(const Expression& expr, return Status::OK(); } 
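// The change below makes ConjunctionFromGroupingRow null-aware: a null scalar
// in a grouping row is turned into is_null(field_ref(name)) instead of an
// equality against a null literal, so a row such as {a: 3, b: null} yields
//   and_(equal(field_ref("a"), literal(3)), is_null(field_ref("b")))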
-inline Expression ConjunctionFromGroupingRow(Scalar* row) { +Expression ConjunctionFromGroupingRow(Scalar* row) { ScalarVector* values = &checked_cast(row)->value; std::vector equality_expressions(values->size()); for (size_t i = 0; i < values->size(); ++i) { const std::string& name = row->type->field(static_cast(i))->name(); - equality_expressions[i] = equal(field_ref(name), literal(std::move(values->at(i)))); + if (values->at(i)->is_valid) { + equality_expressions[i] = equal(field_ref(name), literal(std::move(values->at(i)))); + } else { + equality_expressions[i] = is_null(field_ref(name)); + } } return and_(std::move(equality_expressions)); } @@ -272,7 +276,7 @@ Result DirectoryPartitioning::FormatValues( std::vector segments(static_cast(schema_->num_fields())); for (int i = 0; i < schema_->num_fields(); ++i) { - if (values[i] != nullptr) { + if (values[i] != nullptr && values[i]->is_valid) { segments[i] = values[i]->ToString(); continue; } @@ -432,7 +436,7 @@ std::shared_ptr DirectoryPartitioning::MakeFactory( util::optional HivePartitioning::ParseKey( const std::string& segment, const std::string& null_fallback) { auto name_end = string_view(segment).find_first_of('='); - // Keep for backwards compatibility, this would be produced by arrow <= 3 + // Not round-trippable if (name_end == string_view::npos) { return util::nullopt; } @@ -513,7 +517,8 @@ class HivePartitioningFactory : public KeyValuePartitioningFactory { // drop fields which aren't in field_names_ auto out_schema = SchemaFromColumnNames(schema, field_names_); - return std::make_shared(std::move(out_schema), dictionaries_); + return std::make_shared(std::move(out_schema), dictionaries_, + null_fallback_); } } diff --git a/cpp/src/arrow/dataset/partition.h b/cpp/src/arrow/dataset/partition.h index e5afd00c76d..bc59dfe53c5 100644 --- a/cpp/src/arrow/dataset/partition.h +++ b/cpp/src/arrow/dataset/partition.h @@ -202,6 +202,7 @@ class ARROW_DS_EXPORT HivePartitioning : public KeyValuePartitioning { null_fallback_(null_fallback) {} std::string type_name() const override { return "hive"; } + std::string null_fallback() const { return null_fallback_; } static util::optional ParseKey(const std::string& segment, const std::string& null_fallback); diff --git a/cpp/src/arrow/dataset/partition_test.cc b/cpp/src/arrow/dataset/partition_test.cc index b8dade238c0..4b9c1d222f9 100644 --- a/cpp/src/arrow/dataset/partition_test.cc +++ b/cpp/src/arrow/dataset/partition_test.cc @@ -80,15 +80,19 @@ class TestPartitioning : public ::testing::Test { void AssertPartition(const std::shared_ptr partitioning, const std::shared_ptr full_batch, - const RecordBatchVector& expected_batches) { + const RecordBatchVector& expected_batches, + const std::vector& expected_expressions) { ASSERT_OK_AND_ASSIGN(auto partition_results, partitioning->Partition(full_batch)); std::shared_ptr rest = full_batch; ASSERT_EQ(partition_results.batches.size(), expected_batches.size()); auto max_index = std::min(partition_results.batches.size(), expected_batches.size()); for (std::size_t partition_index = 0; partition_index < max_index; partition_index++) { - std::shared_ptr actual = partition_results.batches[partition_index]; - AssertBatchesEqual(*expected_batches[partition_index], *actual); + std::shared_ptr actual_batch = + partition_results.batches[partition_index]; + AssertBatchesEqual(*expected_batches[partition_index], *actual_batch); + Expression actual_expression = partition_results.expressions[partition_index]; + ASSERT_EQ(expected_expressions[partition_index], 
actual_expression); } } @@ -96,14 +100,15 @@ class TestPartitioning : public ::testing::Test { const std::shared_ptr schema, const std::string& record_batch_json, const std::shared_ptr partitioned_schema, - const std::vector& expected_record_batch_strs) { + const std::vector& expected_record_batch_strs, + const std::vector& expected_expressions) { auto record_batch = RecordBatchFromJSON(schema, record_batch_json); RecordBatchVector expected_batches; for (const auto& expected_record_batch_str : expected_record_batch_strs) { expected_batches.push_back( RecordBatchFromJSON(partitioned_schema, expected_record_batch_str)); } - AssertPartition(partitioning, record_batch, expected_batches); + AssertPartition(partitioning, record_batch, expected_batches, expected_expressions); } void AssertInspectError(const std::vector& paths) { @@ -132,7 +137,7 @@ class TestPartitioning : public ::testing::Test { std::shared_ptr written_schema_; }; -TEST_F(TestPartitioning, Basic) { +TEST_F(TestPartitioning, Partition) { auto partition_schema = schema({field("a", int32()), field("b", utf8())}); auto schema_ = schema({field("a", int32()), field("b", utf8()), field("c", uint32())}); auto remaining_schema = schema({field("c", uint32())}); @@ -147,7 +152,13 @@ TEST_F(TestPartitioning, Basic) { std::vector expected_batches = {R"([{"c": 0}, {"c": 1}])", R"([{"c": 2}])", R"([{"c": 3}, {"c": 5}])", R"([{"c": 4}])"}; - AssertPartition(partitioning, schema_, json, remaining_schema, expected_batches); + std::vector expected_expressions = { + and_(equal(field_ref("a"), literal(3)), equal(field_ref("b"), literal("x"))), + and_(equal(field_ref("a"), literal(1)), is_null(field_ref("b"))), + and_(is_null(field_ref("a")), is_null(field_ref("b"))), + and_(is_null(field_ref("a")), equal(field_ref("b"), literal("z")))}; + AssertPartition(partitioning, schema_, json, remaining_schema, expected_batches, + expected_expressions); } TEST_F(TestPartitioning, StructDictionaryNull) {} @@ -185,6 +196,10 @@ TEST_F(TestPartitioning, DirectoryPartitioningFormat) { equal(field_ref("alpha"), literal(0))), "0/hello"); AssertFormat(equal(field_ref("alpha"), literal(0)), "0"); + AssertFormat(and_(equal(field_ref("alpha"), literal(0)), is_null(field_ref("beta"))), + "0"); + AssertFormatError( + and_(is_null(field_ref("alpha")), equal(field_ref("beta"), literal("hello")))); AssertFormatError(equal(field_ref("beta"), literal("hello"))); AssertFormat(literal(true), ""); @@ -406,6 +421,17 @@ TEST_F(TestPartitioning, HiveDictionaryInference) { {DictStr("alpha"), DictStr("beta")}); } +TEST_F(TestPartitioning, HiveNullFallbackPassedOn) { + HivePartitioningFactoryOptions options; + options.null_fallback = "xyz"; + factory_ = HivePartitioning::MakeFactory(options); + + EXPECT_OK_AND_ASSIGN(auto schema, factory_->Inspect({"/alpha=a/beta=0"})); + EXPECT_OK_AND_ASSIGN(auto partitioning, factory_->Finish(schema)); + ASSERT_EQ("xyz", + std::static_pointer_cast(partitioning)->null_fallback()); +} + TEST_F(TestPartitioning, HiveDictionaryHasUniqueValues) { HivePartitioningFactoryOptions options; options.infer_dictionary = true; diff --git a/python/pyarrow/tests/test_dataset.py b/python/pyarrow/tests/test_dataset.py index b2c1fc9f030..a42dc83769e 100644 --- a/python/pyarrow/tests/test_dataset.py +++ b/python/pyarrow/tests/test_dataset.py @@ -16,8 +16,11 @@ # under the License. 
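# A minimal illustrative sketch (the _sketch_* names are chosen here and are
# not part of the patch) of the null-partition handling exercised by the tests
# added in this commit: HivePartitioning accepts a null_fallback segment, and
# parsing that segment yields an is_null() guard rather than an equality.
import pyarrow as pa
import pyarrow.dataset as ds

_sketch_part = ds.HivePartitioning(
    pa.schema([pa.field('alpha', pa.int64()), pa.field('beta', pa.int64())]),
    null_fallback='xyz'
)
_sketch_expr = _sketch_part.parse('/alpha=xyz/beta=3')
# _sketch_expr is equivalent to:
#     ds.field('alpha').is_null() & (ds.field('beta') == ds.scalar(3))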
import contextlib +import os +import posixpath import pathlib import pickle +from pyarrow.dataset import partitioning import textwrap import numpy as np @@ -381,11 +384,16 @@ def test_partitioning(): with pytest.raises(pa.ArrowInvalid): partitioning.parse('/prefix/3/aaa') + expr = partitioning.parse('/3') + expected = ds.field('group') == 3 + assert expr.equals(expected) + partitioning = ds.HivePartitioning( pa.schema([ pa.field('alpha', pa.int64()), pa.field('beta', pa.int64()) - ]) + ]), + null_fallback='xyz' ) expr = partitioning.parse('/alpha=0/beta=3') expected = ( @@ -394,6 +402,12 @@ def test_partitioning(): ) assert expr.equals(expected) + expr = partitioning.parse('/alpha=xyz/beta=3') + expected = ( + (ds.field('alpha').is_null() & (ds.field('beta') == ds.scalar(3))) + ) + assert expr.equals(expected) + for shouldfail in ['/alpha=one/beta=2', '/alpha=one', '/beta=two']: with pytest.raises(pa.ArrowInvalid): partitioning.parse(shouldfail) @@ -412,7 +426,7 @@ def test_expression_serialization(): d.is_valid(), a.cast(pa.int32(), safe=False), a.cast(pa.int32(), safe=False), a.isin([1, 2, 3]), ds.field('i64') > 5, ds.field('i64') == 5, - ds.field('i64') == 7] + ds.field('i64') == 7, ds.field('i64').is_null()] for expr in all_exprs: assert isinstance(expr, ds.Expression) restored = pickle.loads(pickle.dumps(expr)) @@ -440,7 +454,6 @@ def test_expression_construction(): with pytest.raises(pa.ArrowInvalid): field != {1} - def test_expression_boolean_operators(): # https://issues.apache.org/jira/browse/ARROW-11412 true = ds.scalar(True) @@ -468,6 +481,8 @@ def test_partition_keys(): assert ds._get_partition_keys(nope) == {} assert ds._get_partition_keys(a & nope) == {'a': 'a'} + null = ds.field('a').is_null() + assert ds._get_partition_keys(null) == {'a': None} def test_parquet_read_options(): opts1 = ds.ParquetReadOptions() @@ -1242,6 +1257,45 @@ def test_partitioning_factory_dictionary(mockfs, infer_dictionary): else: assert inferred_schema.field('key').type == pa.string() +def test_dictionary_partitioning_outer_nulls_raises(tempdir): + table = pa.table({'a': [ 'x', 'y', None ], 'b': ['x', 'y', 'z']}) + part = ds.partitioning(pa.schema([pa.field('a', pa.string()), pa.field('b', pa.string())])) + with pytest.raises(pa.ArrowInvalid): + ds.write_dataset(table, tempdir, format='parquet', partitioning=part) + +def _has_subdirs(basedir): + return any([os.path.isdir(os.path.join(basedir, el)) for el in os.listdir(basedir)]) + +def _do_list_all_dirs(basedir, path_so_far, result): + for f in os.listdir(basedir): + true_nested = os.path.join(basedir, f) + if os.path.isdir(true_nested): + norm_nested = posixpath.join(path_so_far, f) + if _has_subdirs(true_nested): + _do_list_all_dirs(true_nested, norm_nested, result) + else: + result.append(norm_nested) + +def _list_all_dirs(basedir): + result = [] + _do_list_all_dirs(basedir, '', result) + return result + +def _check_dataset_directories(tempdir, expected_directories): + actual_directories = set(_list_all_dirs(tempdir)) + assert actual_directories == set(expected_directories) + +def test_dictionary_partitioning_inner_nulls(tempdir): + table = pa.table({'a': [ 'x', 'y', 'z' ], 'b': ['x', 'y', None]}) + part = ds.partitioning(pa.schema([pa.field('a', pa.string()), pa.field('b', pa.string())])) + ds.write_dataset(table, tempdir, format='parquet', partitioning=part) + _check_dataset_directories(tempdir, ['x/x', 'y/y', 'z']) + +def test_hive_partitioning_nulls(tempdir): + table = pa.table({'a': [ 'x', None, 'z' ], 'b': ['x', 'y', None]}) + part = 
ds.HivePartitioning(pa.schema([pa.field('a', pa.string()), pa.field('b', pa.string())]), None, 'xyz') + ds.write_dataset(table, tempdir, format='parquet', partitioning=part) + _check_dataset_directories(tempdir, ['a=x/b=x', 'a=xyz/b=y', 'a=z/b=xyz']) def test_partitioning_function(): schema = pa.schema([("year", pa.int16()), ("month", pa.int8())]) @@ -1605,6 +1659,7 @@ def test_open_dataset_non_existing_file(): @pytest.mark.parquet @pytest.mark.parametrize('partitioning', ["directory", "hive"]) @pytest.mark.parametrize('null_fallback', ['xyz', None]) +@pytest.mark.parametrize('infer_dictionary', [False, True]) @pytest.mark.parametrize('partition_keys', [ (["A", "B", "C"], [1, 2, 3]), ([1, 2, 3], ["A", "B", "C"]), @@ -1614,30 +1669,30 @@ def test_open_dataset_non_existing_file(): ([1, 2, 3], ["A", None, "C"]), ([None, 2, 3], [None, 2, 3]), ]) -def test_open_dataset_partitioned_dictionary_type( - tempdir, partitioning, null_fallback, partition_keys +def test_partition_discovery( + tempdir, partitioning, null_fallback, infer_dictionary, partition_keys ): # ARROW-9288 / ARROW-9476 import pyarrow.parquet as pq table = pa.table({'a': range(9), 'b': [0.0] * 4 + [1.0] * 5}) - if None in partition_keys[0] or None in partition_keys[1]: + if partitioning == "directory" and (None in partition_keys[0] or None in partition_keys[1]): # Directory partitioning can't handle the first part being null return if partitioning == "directory": partitioning = ds.DirectoryPartitioning.discover( - ["part1", "part2"], infer_dictionary=True) + ["part1", "part2"], infer_dictionary=infer_dictionary) fmt = "{0}/{1}" null_value = None else: if null_fallback: partitioning = ds.HivePartitioning.discover( - infer_dictionary=True, null_fallback=null_fallback + infer_dictionary=infer_dictionary, null_fallback=null_fallback ) else: - partitioning = ds.HivePartitioning.discover(infer_dictionary=True) + partitioning = ds.HivePartitioning.discover(infer_dictionary=infer_dictionary) fmt = "part1={0}/part2={1}" if null_fallback: null_value = null_fallback @@ -1657,13 +1712,16 @@ def test_open_dataset_partitioned_dictionary_type( dataset = ds.dataset(str(basepath), partitioning=partitioning) - def dict_type(key): - value_type = pa.string() if isinstance(key, str) else pa.int32() - return pa.dictionary(pa.int32(), value_type) + def expected_type(key): + if infer_dictionary: + value_type = pa.string() if isinstance(key, str) else pa.int32() + return pa.dictionary(pa.int32(), value_type) + else: + return pa.string() if isinstance(key, str) else pa.int32() expected_schema = table.schema.append( - pa.field("part1", dict_type(part_keys1[0])) + pa.field("part1", expected_type(part_keys1[0])) ).append( - pa.field("part2", dict_type(part_keys2[0])) + pa.field("part2", expected_type(part_keys2[0])) ) assert dataset.schema.equals(expected_schema) @@ -2327,7 +2385,44 @@ def test_dataset_project_only_partition_columns(tempdir): part_only = dataset.to_table(columns=['part'], use_threads=False) assert all_cols.column('part').equals(part_only.column('part')) + +@pytest.mark.parquet +@pytest.mark.pandas +def test_write_to_dataset_given_null_just_works(tempdir): + import pyarrow.parquet as pq + + schema = pa.schema([ + pa.field('col', pa.int64()), + pa.field('part', pa.dictionary(pa.int32(), pa.string())) + ]) + table = pa.table({'part': [None, None, 'a', 'a'], 'col': list(range(4))}, schema=schema) + + path = str(tempdir / 'test_dataset') + pq.write_to_dataset(table, path, partition_cols=['part'], use_legacy_dataset=False) + + actual_table = 
pq.read_table(tempdir / 'test_dataset') + # column.equals can handle the difference in chunking but not the fact that `part` + # will have different dictionaries for the two chunks + assert actual_table.column('part').to_pylist() == table.column('part').to_pylist() + assert actual_table.column('col').equals(table.column('col')) + +@pytest.mark.parquet +@pytest.mark.pandas +def test_legacy_write_to_dataset_drops_null(tempdir): + import pyarrow.parquet as pq + + schema = pa.schema([ + pa.field('col', pa.int64()), + pa.field('part', pa.dictionary(pa.int32(), pa.string())) + ]) + table = pa.table({'part': ['a', 'a', None, None], 'col': list(range(4))}, schema=schema) + expected = pa.table({'part': ['a', 'a'], 'col': list(range(2))}, schema=schema) + + path = str(tempdir / 'test_dataset') + pq.write_to_dataset(table, path, partition_cols=['part'], use_legacy_dataset=True) + actual = pq.read_table(tempdir / 'test_dataset') + assert actual == expected @pytest.mark.parquet @pytest.mark.pandas From 07eee3a40b02f2f05d23c65778ac574c36dd5274 Mon Sep 17 00:00:00 2001 From: Weston Pace Date: Mon, 15 Feb 2021 12:07:41 -1000 Subject: [PATCH 19/33] Added tests for SetDefaultValues to ensure it does the correct thing on null --- cpp/src/arrow/dataset/partition_test.cc | 49 +++++++++++++++++++++++++ 1 file changed, 49 insertions(+) diff --git a/cpp/src/arrow/dataset/partition_test.cc b/cpp/src/arrow/dataset/partition_test.cc index 4b9c1d222f9..1573b64dcbe 100644 --- a/cpp/src/arrow/dataset/partition_test.cc +++ b/cpp/src/arrow/dataset/partition_test.cc @@ -458,6 +458,55 @@ TEST_F(TestPartitioning, HiveDictionaryHasUniqueValues) { AssertParseError("/alpha=yosemite"); // not in inspected dictionary } +TEST_F(TestPartitioning, SetDefaultValuesConcrete) { + auto small_schm = schema({field("c", int32())}); + auto schm = schema({field("a", int32()), field("b", utf8())}); + auto full_schm = schema({field("a", int32()), field("b", utf8()), field("c", int32())}); + RecordBatchProjector record_batch_projector(full_schm); + HivePartitioning part(schm); + part.SetDefaultValuesFromKeys( + and_(equal(field_ref("a"), literal(10)), equal(field_ref("b"), literal("y"))), + &record_batch_projector); + + auto in_rb = RecordBatchFromJSON(small_schm, R"([{"c": 0}, + {"c": 1}, + {"c": 2}, + {"c": 3} + ])"); + + EXPECT_OK_AND_ASSIGN(auto out_rb, record_batch_projector.Project(*in_rb)); + auto expected_rb = RecordBatchFromJSON(full_schm, R"([{"a": 10, "b": "y", "c": 0}, + {"a": 10, "b": "y", "c": 1}, + {"a": 10, "b": "y", "c": 2}, + {"a": 10, "b": "y", "c": 3} + ])"); + AssertBatchesEqual(*expected_rb, *out_rb); +} + +TEST_F(TestPartitioning, SetDefaultValuesNull) { + auto small_schm = schema({field("c", int32())}); + auto schm = schema({field("a", int32()), field("b", utf8())}); + auto full_schm = schema({field("a", int32()), field("b", utf8()), field("c", int32())}); + RecordBatchProjector record_batch_projector(full_schm); + HivePartitioning part(schm); + part.SetDefaultValuesFromKeys(and_(is_null(field_ref("a")), is_null(field_ref("b"))), + &record_batch_projector); + + auto in_rb = RecordBatchFromJSON(small_schm, R"([{"c": 0}, + {"c": 1}, + {"c": 2}, + {"c": 3} + ])"); + + EXPECT_OK_AND_ASSIGN(auto out_rb, record_batch_projector.Project(*in_rb)); + auto expected_rb = RecordBatchFromJSON(full_schm, R"([{"a": null, "b": null, "c": 0}, + {"a": null, "b": null, "c": 1}, + {"a": null, "b": null, "c": 2}, + {"a": null, "b": null, "c": 3} + ])"); + AssertBatchesEqual(*expected_rb, *out_rb); +} + TEST_F(TestPartitioning, EtlThenHive) { 
FieldVector etl_fields{field("year", int16()), field("month", int8()), field("day", int8()), field("hour", int8())}; From 8f1792d88b01aa91b20228e39aadc3ec32cc5047 Mon Sep 17 00:00:00 2001 From: Weston Pace Date: Mon, 15 Feb 2021 14:01:12 -1000 Subject: [PATCH 20/33] Cleaned up logic for valid but not known case --- cpp/src/arrow/dataset/partition.cc | 6 +++--- cpp/src/arrow/dataset/partition_test.cc | 14 +++++++------- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/cpp/src/arrow/dataset/partition.cc b/cpp/src/arrow/dataset/partition.cc index 46142560c13..e9c198e3398 100644 --- a/cpp/src/arrow/dataset/partition.cc +++ b/cpp/src/arrow/dataset/partition.cc @@ -88,9 +88,9 @@ Status KeyValuePartitioning::SetDefaultValuesFromKeys(const Expression& expr, if (known_value.concrete()) { RETURN_NOT_OK(projector->SetDefaultValue(match, known_value.datum.scalar())); } else if (known_value.valid) { - return Status::Invalid( - "Partition expression not defined enough to set default value for ", - ref_value.first.name()); + // We know some information about the value but nothing concrete enough to set. Can + // happen if expression is something like is_valid(field_ref("a")) + continue; } else { RETURN_NOT_OK(projector->SetDefaultValue(match, MakeNullScalar(field->type()))); } diff --git a/cpp/src/arrow/dataset/partition_test.cc b/cpp/src/arrow/dataset/partition_test.cc index 1573b64dcbe..39303fc71e1 100644 --- a/cpp/src/arrow/dataset/partition_test.cc +++ b/cpp/src/arrow/dataset/partition_test.cc @@ -464,9 +464,9 @@ TEST_F(TestPartitioning, SetDefaultValuesConcrete) { auto full_schm = schema({field("a", int32()), field("b", utf8()), field("c", int32())}); RecordBatchProjector record_batch_projector(full_schm); HivePartitioning part(schm); - part.SetDefaultValuesFromKeys( - and_(equal(field_ref("a"), literal(10)), equal(field_ref("b"), literal("y"))), - &record_batch_projector); + ARROW_EXPECT_OK(part.SetDefaultValuesFromKeys( + and_(equal(field_ref("a"), literal(10)), is_valid(field_ref("b"))), + &record_batch_projector)); auto in_rb = RecordBatchFromJSON(small_schm, R"([{"c": 0}, {"c": 1}, @@ -475,10 +475,10 @@ TEST_F(TestPartitioning, SetDefaultValuesConcrete) { ])"); EXPECT_OK_AND_ASSIGN(auto out_rb, record_batch_projector.Project(*in_rb)); - auto expected_rb = RecordBatchFromJSON(full_schm, R"([{"a": 10, "b": "y", "c": 0}, - {"a": 10, "b": "y", "c": 1}, - {"a": 10, "b": "y", "c": 2}, - {"a": 10, "b": "y", "c": 3} + auto expected_rb = RecordBatchFromJSON(full_schm, R"([{"a": 10, "b": null, "c": 0}, + {"a": 10, "b": null, "c": 1}, + {"a": 10, "b": null, "c": 2}, + {"a": 10, "b": null, "c": 3} ])"); AssertBatchesEqual(*expected_rb, *out_rb); } From 6f7ced57d06a8b37707694dcda3732ca7834e21a Mon Sep 17 00:00:00 2001 From: Weston Pace Date: Tue, 16 Feb 2021 08:29:20 -1000 Subject: [PATCH 21/33] Fixing compiler warning --- cpp/src/arrow/dataset/partition_test.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/src/arrow/dataset/partition_test.cc b/cpp/src/arrow/dataset/partition_test.cc index 39303fc71e1..aad828cb66d 100644 --- a/cpp/src/arrow/dataset/partition_test.cc +++ b/cpp/src/arrow/dataset/partition_test.cc @@ -489,8 +489,8 @@ TEST_F(TestPartitioning, SetDefaultValuesNull) { auto full_schm = schema({field("a", int32()), field("b", utf8()), field("c", int32())}); RecordBatchProjector record_batch_projector(full_schm); HivePartitioning part(schm); - part.SetDefaultValuesFromKeys(and_(is_null(field_ref("a")), is_null(field_ref("b"))), - &record_batch_projector); + 
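  // With is_null() guards for "a" and "b", SetDefaultValuesFromKeys makes the
  // projector fill those two columns with typed nulls for every projected row
  // (the test's expected batch has "a" and "b" all null).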
ARROW_EXPECT_OK(part.SetDefaultValuesFromKeys( + and_(is_null(field_ref("a")), is_null(field_ref("b"))), &record_batch_projector)); auto in_rb = RecordBatchFromJSON(small_schm, R"([{"c": 0}, {"c": 1}, From 212c9bc148578e6fad5e470b503ad3deb02caa30 Mon Sep 17 00:00:00 2001 From: Weston Pace Date: Tue, 16 Feb 2021 08:49:23 -1000 Subject: [PATCH 22/33] Python lint --- python/pyarrow/tests/test_dataset.py | 63 +++++++++++++++++++--------- 1 file changed, 43 insertions(+), 20 deletions(-) diff --git a/python/pyarrow/tests/test_dataset.py b/python/pyarrow/tests/test_dataset.py index a42dc83769e..e12f802e610 100644 --- a/python/pyarrow/tests/test_dataset.py +++ b/python/pyarrow/tests/test_dataset.py @@ -20,14 +20,12 @@ import posixpath import pathlib import pickle -from pyarrow.dataset import partitioning import textwrap import numpy as np import pytest import pyarrow as pa -import pyarrow.csv import pyarrow.fs as fs from pyarrow.tests.util import change_cwd, _filesystem_uri @@ -454,6 +452,7 @@ def test_expression_construction(): with pytest.raises(pa.ArrowInvalid): field != {1} + def test_expression_boolean_operators(): # https://issues.apache.org/jira/browse/ARROW-11412 true = ds.scalar(True) @@ -484,6 +483,7 @@ def test_partition_keys(): null = ds.field('a').is_null() assert ds._get_partition_keys(null) == {'a': None} + def test_parquet_read_options(): opts1 = ds.ParquetReadOptions() opts2 = ds.ParquetReadOptions(buffer_size=4096, @@ -1257,14 +1257,19 @@ def test_partitioning_factory_dictionary(mockfs, infer_dictionary): else: assert inferred_schema.field('key').type == pa.string() + def test_dictionary_partitioning_outer_nulls_raises(tempdir): - table = pa.table({'a': [ 'x', 'y', None ], 'b': ['x', 'y', 'z']}) - part = ds.partitioning(pa.schema([pa.field('a', pa.string()), pa.field('b', pa.string())])) + table = pa.table({'a': ['x', 'y', None], 'b': ['x', 'y', 'z']}) + part = ds.partitioning( + pa.schema([pa.field('a', pa.string()), pa.field('b', pa.string())])) with pytest.raises(pa.ArrowInvalid): ds.write_dataset(table, tempdir, format='parquet', partitioning=part) + def _has_subdirs(basedir): - return any([os.path.isdir(os.path.join(basedir, el)) for el in os.listdir(basedir)]) + elements = os.listdir(basedir) + return any([os.path.isdir(os.path.join(basedir, el)) for el in elements]) + def _do_list_all_dirs(basedir, path_so_far, result): for f in os.listdir(basedir): @@ -1276,27 +1281,34 @@ def _do_list_all_dirs(basedir, path_so_far, result): else: result.append(norm_nested) + def _list_all_dirs(basedir): result = [] _do_list_all_dirs(basedir, '', result) return result + def _check_dataset_directories(tempdir, expected_directories): actual_directories = set(_list_all_dirs(tempdir)) assert actual_directories == set(expected_directories) + def test_dictionary_partitioning_inner_nulls(tempdir): - table = pa.table({'a': [ 'x', 'y', 'z' ], 'b': ['x', 'y', None]}) - part = ds.partitioning(pa.schema([pa.field('a', pa.string()), pa.field('b', pa.string())])) + table = pa.table({'a': ['x', 'y', 'z'], 'b': ['x', 'y', None]}) + part = ds.partitioning( + pa.schema([pa.field('a', pa.string()), pa.field('b', pa.string())])) ds.write_dataset(table, tempdir, format='parquet', partitioning=part) _check_dataset_directories(tempdir, ['x/x', 'y/y', 'z']) + def test_hive_partitioning_nulls(tempdir): - table = pa.table({'a': [ 'x', None, 'z' ], 'b': ['x', 'y', None]}) - part = ds.HivePartitioning(pa.schema([pa.field('a', pa.string()), pa.field('b', pa.string())]), None, 'xyz') + table = pa.table({'a': ['x', 
None, 'z'], 'b': ['x', 'y', None]}) + part = ds.HivePartitioning(pa.schema( + [pa.field('a', pa.string()), pa.field('b', pa.string())]), None, 'xyz') ds.write_dataset(table, tempdir, format='parquet', partitioning=part) _check_dataset_directories(tempdir, ['a=x/b=x', 'a=xyz/b=y', 'a=z/b=xyz']) + def test_partitioning_function(): schema = pa.schema([("year", pa.int16()), ("month", pa.int8())]) names = ["year", "month"] @@ -1677,7 +1689,8 @@ def test_partition_discovery( table = pa.table({'a': range(9), 'b': [0.0] * 4 + [1.0] * 5}) - if partitioning == "directory" and (None in partition_keys[0] or None in partition_keys[1]): + has_null = None in partition_keys[0] or None in partition_keys[1] + if partitioning == "directory" and has_null: # Directory partitioning can't handle the first part being null return @@ -1692,7 +1705,8 @@ def test_partition_discovery( infer_dictionary=infer_dictionary, null_fallback=null_fallback ) else: - partitioning = ds.HivePartitioning.discover(infer_dictionary=infer_dictionary) + partitioning = ds.HivePartitioning.discover( + infer_dictionary=infer_dictionary) fmt = "part1={0}/part2={1}" if null_fallback: null_value = null_fallback @@ -2385,7 +2399,8 @@ def test_dataset_project_only_partition_columns(tempdir): part_only = dataset.to_table(columns=['part'], use_threads=False) assert all_cols.column('part').equals(part_only.column('part')) - + + @pytest.mark.parquet @pytest.mark.pandas def test_write_to_dataset_given_null_just_works(tempdir): @@ -2395,17 +2410,21 @@ def test_write_to_dataset_given_null_just_works(tempdir): pa.field('col', pa.int64()), pa.field('part', pa.dictionary(pa.int32(), pa.string())) ]) - table = pa.table({'part': [None, None, 'a', 'a'], 'col': list(range(4))}, schema=schema) + table = pa.table({'part': [None, None, 'a', 'a'], + 'col': list(range(4))}, schema=schema) path = str(tempdir / 'test_dataset') - pq.write_to_dataset(table, path, partition_cols=['part'], use_legacy_dataset=False) + pq.write_to_dataset(table, path, partition_cols=[ + 'part'], use_legacy_dataset=False) actual_table = pq.read_table(tempdir / 'test_dataset') - # column.equals can handle the difference in chunking but not the fact that `part` - # will have different dictionaries for the two chunks - assert actual_table.column('part').to_pylist() == table.column('part').to_pylist() + # column.equals can handle the difference in chunking but not the fact + # that `part` will have different dictionaries for the two chunks + assert actual_table.column('part').to_pylist( + ) == table.column('part').to_pylist() assert actual_table.column('col').equals(table.column('col')) + @pytest.mark.parquet @pytest.mark.pandas def test_legacy_write_to_dataset_drops_null(tempdir): @@ -2415,15 +2434,19 @@ def test_legacy_write_to_dataset_drops_null(tempdir): pa.field('col', pa.int64()), pa.field('part', pa.dictionary(pa.int32(), pa.string())) ]) - table = pa.table({'part': ['a', 'a', None, None], 'col': list(range(4))}, schema=schema) - expected = pa.table({'part': ['a', 'a'], 'col': list(range(2))}, schema=schema) + table = pa.table({'part': ['a', 'a', None, None], + 'col': list(range(4))}, schema=schema) + expected = pa.table( + {'part': ['a', 'a'], 'col': list(range(2))}, schema=schema) path = str(tempdir / 'test_dataset') - pq.write_to_dataset(table, path, partition_cols=['part'], use_legacy_dataset=True) + pq.write_to_dataset(table, path, partition_cols=[ + 'part'], use_legacy_dataset=True) actual = pq.read_table(tempdir / 'test_dataset') assert actual == expected + @pytest.mark.parquet 
@pytest.mark.pandas def test_dataset_project_null_column(tempdir): From 9ef4a71697ca2a263e046aa683aa5ae1a783cad3 Mon Sep 17 00:00:00 2001 From: Weston Pace Date: Mon, 22 Feb 2021 09:32:06 -1000 Subject: [PATCH 23/33] Addressing PR comments --- cpp/src/arrow/compute/kernels/vector_hash.cc | 3 +-- cpp/src/arrow/dataset/expression.cc | 4 ---- cpp/src/arrow/dataset/expression.h | 3 --- cpp/src/arrow/dataset/expression_test.cc | 4 ++++ cpp/src/arrow/dataset/partition.cc | 25 ++++++++++---------- cpp/src/arrow/dataset/partition.h | 4 ++-- cpp/src/arrow/dataset/partition_test.cc | 12 ++++++++-- 7 files changed, 30 insertions(+), 25 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/vector_hash.cc b/cpp/src/arrow/compute/kernels/vector_hash.cc index c7b25347624..694c6265825 100644 --- a/cpp/src/arrow/compute/kernels/vector_hash.cc +++ b/cpp/src/arrow/compute/kernels/vector_hash.cc @@ -689,6 +689,7 @@ const FunctionDoc value_counts_doc( "Nulls in the input are ignored."), {"array"}); +const auto kDefaultDictionaryEncodeOptions = DictionaryEncodeOptions::Defaults(); const FunctionDoc dictionary_encode_doc( "Dictionary-encode array", ("Return a dictionary-encoded version of the input array."), {"array"}, @@ -738,8 +739,6 @@ void RegisterVectorHash(FunctionRegistry* registry) { // ---------------------------------------------------------------------- // dictionary_encode - const auto kDefaultDictionaryEncodeOptions = DictionaryEncodeOptions::Defaults(); - base.finalize = DictEncodeFinalize; // Unique and ValueCounts output unchunked arrays base.output_chunked = true; diff --git a/cpp/src/arrow/dataset/expression.cc b/cpp/src/arrow/dataset/expression.cc index ef92ae09fe7..fb62f819121 100644 --- a/cpp/src/arrow/dataset/expression.cc +++ b/cpp/src/arrow/dataset/expression.cc @@ -51,10 +51,6 @@ Expression::Expression(Parameter parameter) Expression literal(Datum lit) { return Expression(std::move(lit)); } -Expression null_literal(const std::shared_ptr& type) { - return Expression(MakeNullScalar(type)); -} - Expression field_ref(FieldRef ref) { return Expression(Expression::Parameter{std::move(ref), {}}); } diff --git a/cpp/src/arrow/dataset/expression.h b/cpp/src/arrow/dataset/expression.h index 1bbcb471015..b6b47fb8a2e 100644 --- a/cpp/src/arrow/dataset/expression.h +++ b/cpp/src/arrow/dataset/expression.h @@ -135,9 +135,6 @@ inline bool operator!=(const Expression& l, const Expression& r) { return !l.Equ ARROW_DS_EXPORT Expression literal(Datum lit); -ARROW_DS_EXPORT -Expression null_literal(const std::shared_ptr& type); - template Expression literal(Arg&& arg) { return literal(Datum(std::forward(arg))); diff --git a/cpp/src/arrow/dataset/expression_test.cc b/cpp/src/arrow/dataset/expression_test.cc index 3aa62319e85..adaf6c3410d 100644 --- a/cpp/src/arrow/dataset/expression_test.cc +++ b/cpp/src/arrow/dataset/expression_test.cc @@ -240,6 +240,10 @@ TEST(Expression, Equality) { call("cast", {field_ref("a")}, compute::CastOptions::Unsafe(int32()))); } +Expression null_literal(const std::shared_ptr& type) { + return Expression(MakeNullScalar(type)); +} + TEST(Expression, Hash) { std::unordered_set set; diff --git a/cpp/src/arrow/dataset/partition.cc b/cpp/src/arrow/dataset/partition.cc index e9c198e3398..9515f631d1a 100644 --- a/cpp/src/arrow/dataset/partition.cc +++ b/cpp/src/arrow/dataset/partition.cc @@ -162,7 +162,7 @@ Result KeyValuePartitioning::ConvertKey(const Key& key) const { std::shared_ptr converted; - if (key.null) { + if (!key.value.has_value()) { return 
is_null(field_ref(field->name())); } else if (field->type()->id() == Type::DICTIONARY) { if (dictionaries_.empty() || dictionaries_[field_index] == nullptr) { @@ -181,7 +181,7 @@ Result KeyValuePartitioning::ConvertKey(const Key& key) const { } // look up the partition value in the dictionary - ARROW_ASSIGN_OR_RAISE(converted, Scalar::Parse(value.dictionary->type(), key.value)); + ARROW_ASSIGN_OR_RAISE(converted, Scalar::Parse(value.dictionary->type(), *key.value)); ARROW_ASSIGN_OR_RAISE(auto index, compute::IndexIn(converted, value.dictionary)); value.index = index.scalar(); if (!value.index->is_valid) { @@ -190,7 +190,7 @@ Result KeyValuePartitioning::ConvertKey(const Key& key) const { } converted = std::make_shared(std::move(value), field->type()); } else { - ARROW_ASSIGN_OR_RAISE(converted, Scalar::Parse(field->type(), key.value)); + ARROW_ASSIGN_OR_RAISE(converted, Scalar::Parse(field->type(), *key.value)); } return equal(field_ref(field->name()), literal(std::move(converted))); @@ -254,7 +254,7 @@ std::vector DirectoryPartitioning::ParseKeys( for (auto&& segment : fs::internal::SplitAbstractPath(path)) { if (i >= schema_->num_fields()) break; - keys.push_back({schema_->field(i++)->name(), std::move(segment), false}); + keys.push_back({schema_->field(i++)->name(), std::move(segment)}); } return keys; @@ -441,11 +441,12 @@ util::optional HivePartitioning::ParseKey( return util::nullopt; } + auto name = segment.substr(0, name_end); auto value = segment.substr(name_end + 1); if (value == null_fallback) { - return Key{segment.substr(0, name_end), "", true}; + return Key{name, util::nullopt}; } - return Key{segment.substr(0, name_end), segment.substr(name_end + 1), false}; + return Key{name, value}; } std::vector HivePartitioning::ParseKeys( @@ -493,8 +494,8 @@ class HivePartitioningFactory : public KeyValuePartitioningFactory { for (auto path : paths) { for (auto&& segment : fs::internal::SplitAbstractPath(path)) { if (auto key = HivePartitioning::ParseKey(segment, null_fallback_)) { - if (!key->null) { - RETURN_NOT_OK(InsertRepr(key->name, key->value)); + if (key->value.has_value()) { + RETURN_NOT_OK(InsertRepr(key->name, *key->value)); } } } @@ -656,10 +657,10 @@ class StructDictionary { Status AddOne(Datum column, std::shared_ptr* fused_indices) { if (column.type()->id() == Type::DICTIONARY) { if (column.null_count() != 0) { - // TODO Optimize this by allowign DictionaryEncode to transfer a null-masked - // dictionary to a null-encoded dictionary. At the moment we decode and then - // encode causing one extra copy, and a potentially expansive decoding copy at - // that. + // TODO(ARROW-11732) Optimize this by allowign DictionaryEncode to transfer a + // null-masked dictionary to a null-encoded dictionary. At the moment we decode + // and then encode causing one extra copy, and a potentially expansive decoding + // copy at that. 
ARROW_ASSIGN_OR_RAISE( auto decoded_dictionary, compute::Cast( diff --git a/cpp/src/arrow/dataset/partition.h b/cpp/src/arrow/dataset/partition.h index bc59dfe53c5..42e1b4c4097 100644 --- a/cpp/src/arrow/dataset/partition.h +++ b/cpp/src/arrow/dataset/partition.h @@ -124,8 +124,8 @@ class ARROW_DS_EXPORT KeyValuePartitioning : public Partitioning { /// An unconverted equality expression consisting of a field name and the representation /// of a scalar value struct Key { - std::string name, value; - bool null; + std::string name; + util::optional value; }; static Status SetDefaultValuesFromKeys(const Expression& expr, diff --git a/cpp/src/arrow/dataset/partition_test.cc b/cpp/src/arrow/dataset/partition_test.cc index aad828cb66d..80d65daf159 100644 --- a/cpp/src/arrow/dataset/partition_test.cc +++ b/cpp/src/arrow/dataset/partition_test.cc @@ -375,7 +375,9 @@ TEST_F(TestPartitioning, HivePartitioningFormat) { } TEST_F(TestPartitioning, DiscoverHiveSchema) { - factory_ = HivePartitioning::MakeFactory(); + auto options = HivePartitioningFactoryOptions(); + options.infer_dictionary = "xyz"; + factory_ = HivePartitioning::MakeFactory(options); // type is int32 if possible AssertInspect({"/alpha=0/beta=1"}, {Int("alpha"), Int("beta")}); @@ -388,6 +390,12 @@ TEST_F(TestPartitioning, DiscoverHiveSchema) { // (...so ensure your partitions are ordered the same for all paths) AssertInspect({"/alpha=0/beta=1", "/beta=2/alpha=3"}, {Int("alpha"), Int("beta")}); + // Null fallback strings shouldn't interfere with type inference + AssertInspect({"/alpha=xyz/beta=x", "/alpha=7/beta=xyz"}, {Int("alpha"), Str("beta")}); + + // Only null strings are inferred as text + AssertInspect({"/alpha=xyz"}, {Str("alpha")}); + // If there are too many digits fall back to string AssertInspect({"/alpha=3760212050"}, {Str("alpha")}); @@ -611,7 +619,7 @@ class RangePartitioning : public Partitioning { } std::smatch matches; - RETURN_NOT_OK(DoRegex(key->value, &matches)); + RETURN_NOT_OK(DoRegex(*key->value, &matches)); auto& min_cmp = matches[1] == "[" ? 
greater_equal : greater; std::string min_repr = matches[2]; From c54c55db45affd5cfb550171980ffedfa7f0b160 Mon Sep 17 00:00:00 2001 From: Weston Pace Date: Mon, 22 Feb 2021 09:06:40 -1000 Subject: [PATCH 24/33] Update cpp/src/arrow/compute/kernels/vector_hash.cc Co-authored-by: Benjamin Kietzman --- cpp/src/arrow/compute/kernels/vector_hash.cc | 1 - 1 file changed, 1 deletion(-) diff --git a/cpp/src/arrow/compute/kernels/vector_hash.cc b/cpp/src/arrow/compute/kernels/vector_hash.cc index 694c6265825..da8e7db1929 100644 --- a/cpp/src/arrow/compute/kernels/vector_hash.cc +++ b/cpp/src/arrow/compute/kernels/vector_hash.cc @@ -578,7 +578,6 @@ std::unique_ptr DictionaryHashInit(KernelContext* ctx, DCHECK(false) << "Unsupported dictionary index type"; break; } - DictionaryEncodeOptions options = DictionaryEncodeOptions::Defaults(); return ::arrow::internal::make_unique(std::move(indices_hasher)); } From 9b0f8eecf9a0c10e655b93562959616b7e6ddeb6 Mon Sep 17 00:00:00 2001 From: Weston Pace Date: Mon, 22 Feb 2021 09:06:47 -1000 Subject: [PATCH 25/33] Update cpp/src/arrow/compute/kernels/vector_hash.cc Co-authored-by: Benjamin Kietzman --- cpp/src/arrow/compute/kernels/vector_hash.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/arrow/compute/kernels/vector_hash.cc b/cpp/src/arrow/compute/kernels/vector_hash.cc index da8e7db1929..754d8fba83b 100644 --- a/cpp/src/arrow/compute/kernels/vector_hash.cc +++ b/cpp/src/arrow/compute/kernels/vector_hash.cc @@ -152,7 +152,7 @@ class ValueCountsAction final : ActionBase { } } - bool ShouldEncodeNulls() { return true; } + constexpr bool ShouldEncodeNulls() const { return true; } private: Int64Builder count_builder_; From ce53d4eaa773ccfbe525c7b7e93efa20af5d024f Mon Sep 17 00:00:00 2001 From: Weston Pace Date: Mon, 22 Feb 2021 09:07:49 -1000 Subject: [PATCH 26/33] Update cpp/src/arrow/compute/kernels/vector_hash_test.cc Co-authored-by: Benjamin Kietzman --- cpp/src/arrow/compute/kernels/vector_hash_test.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/vector_hash_test.cc b/cpp/src/arrow/compute/kernels/vector_hash_test.cc index f4cd7dbf41f..179792e2141 100644 --- a/cpp/src/arrow/compute/kernels/vector_hash_test.cc +++ b/cpp/src/arrow/compute/kernels/vector_hash_test.cc @@ -306,8 +306,8 @@ TEST_F(TestHashKernel, ValueCountsBoolean) { } TEST_F(TestHashKernel, ValueCountsNull) { - CheckValueCounts( - null(), {nullptr, nullptr, nullptr}, {true, false, true}, {nullptr}, {false}, {3}); + CheckValueCounts(ArrayFromJSON(null(), "[null, null, null]"), + ArrayFromJSON(null(), "[null]"), ArrayFromJSON(int64(), "[3]")); } TEST_F(TestHashKernel, DictEncodeBoolean) { From c2aa3ad879f4371974039aa37ad344b34c645467 Mon Sep 17 00:00:00 2001 From: Weston Pace Date: Mon, 22 Feb 2021 09:09:25 -1000 Subject: [PATCH 27/33] Update cpp/src/arrow/dataset/partition_test.cc Co-authored-by: Benjamin Kietzman --- cpp/src/arrow/dataset/partition_test.cc | 2 -- 1 file changed, 2 deletions(-) diff --git a/cpp/src/arrow/dataset/partition_test.cc b/cpp/src/arrow/dataset/partition_test.cc index 80d65daf159..eb8f5aa957b 100644 --- a/cpp/src/arrow/dataset/partition_test.cc +++ b/cpp/src/arrow/dataset/partition_test.cc @@ -161,8 +161,6 @@ TEST_F(TestPartitioning, Partition) { expected_expressions); } -TEST_F(TestPartitioning, StructDictionaryNull) {} - TEST_F(TestPartitioning, DirectoryPartitioning) { partitioning_ = std::make_shared( schema({field("alpha", int32()), field("beta", utf8())})); From 
7d5de82c5edf379fb4bb5184778e9ec6ac493f8a Mon Sep 17 00:00:00 2001 From: Weston Pace Date: Mon, 22 Feb 2021 09:16:25 -1000 Subject: [PATCH 28/33] Update python/pyarrow/_dataset.pyx Co-authored-by: Benjamin Kietzman --- python/pyarrow/_dataset.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/pyarrow/_dataset.pyx b/python/pyarrow/_dataset.pyx index e38ea626d79..104a47b98c5 100644 --- a/python/pyarrow/_dataset.pyx +++ b/python/pyarrow/_dataset.pyx @@ -1594,7 +1594,7 @@ cdef class HivePartitioning(Partitioning): corresponding entry of `dictionaries` must be an array containing every value which may be taken by the corresponding column or an error will be raised in parsing. - null_fallback : str + null_fallback : str, default "__HIVE_DEFAULT_PARTITION__" If any field is None then this fallback will be used as a label Returns From f1a6759da12b36d6a229f51d79c9aa5dfecbba47 Mon Sep 17 00:00:00 2001 From: Weston Pace Date: Mon, 22 Feb 2021 10:11:53 -1000 Subject: [PATCH 29/33] Added test case to probe what happens when inferring a partition column that is only null. Changed it to an error to match directory partitioning --- cpp/src/arrow/dataset/partition.cc | 17 ++++++++++------- cpp/src/arrow/dataset/partition_test.cc | 6 +++--- 2 files changed, 13 insertions(+), 10 deletions(-) diff --git a/cpp/src/arrow/dataset/partition.cc b/cpp/src/arrow/dataset/partition.cc index 9515f631d1a..f96002b36b2 100644 --- a/cpp/src/arrow/dataset/partition.cc +++ b/cpp/src/arrow/dataset/partition.cc @@ -186,7 +186,7 @@ Result KeyValuePartitioning::ConvertKey(const Key& key) const { value.index = index.scalar(); if (!value.index->is_valid) { return Status::Invalid("Dictionary supplied for field ", field->ToString(), - " does not contain '", key.value, "'"); + " does not contain '", *key.value, "'"); } converted = std::make_shared(std::move(value), field->type()); } else { @@ -311,8 +311,13 @@ class KeyValuePartitioningFactory : public PartitioningFactory { return it_inserted.first->second; } - Status InsertRepr(const std::string& name, util::string_view repr) { - return InsertRepr(GetOrInsertField(name), repr); + Status InsertRepr(const std::string& name, util::optional repr) { + auto field_index = GetOrInsertField(name); + if (repr.has_value()) { + return InsertRepr(field_index, *repr); + } else { + return Status::OK(); + } } Status InsertRepr(int index, util::string_view repr) { @@ -333,7 +338,7 @@ class KeyValuePartitioningFactory : public PartitioningFactory { RETURN_NOT_OK(repr_memos_[index]->GetArrayData(0, &reprs)); if (reprs->length == 0) { - return Status::Invalid("No segments were available for field '", name, + return Status::Invalid("No non-null segments were available for field '", name, "'; couldn't infer type"); } @@ -494,9 +499,7 @@ class HivePartitioningFactory : public KeyValuePartitioningFactory { for (auto path : paths) { for (auto&& segment : fs::internal::SplitAbstractPath(path)) { if (auto key = HivePartitioning::ParseKey(segment, null_fallback_)) { - if (key->value.has_value()) { - RETURN_NOT_OK(InsertRepr(key->name, *key->value)); - } + RETURN_NOT_OK(InsertRepr(key->name, key->value)); } } } diff --git a/cpp/src/arrow/dataset/partition_test.cc b/cpp/src/arrow/dataset/partition_test.cc index eb8f5aa957b..75e60f994f0 100644 --- a/cpp/src/arrow/dataset/partition_test.cc +++ b/cpp/src/arrow/dataset/partition_test.cc @@ -374,7 +374,7 @@ TEST_F(TestPartitioning, HivePartitioningFormat) { TEST_F(TestPartitioning, DiscoverHiveSchema) { auto options = 
HivePartitioningFactoryOptions(); - options.infer_dictionary = "xyz"; + options.null_fallback = "xyz"; factory_ = HivePartitioning::MakeFactory(options); // type is int32 if possible @@ -391,8 +391,8 @@ TEST_F(TestPartitioning, DiscoverHiveSchema) { // Null fallback strings shouldn't interfere with type inference AssertInspect({"/alpha=xyz/beta=x", "/alpha=7/beta=xyz"}, {Int("alpha"), Str("beta")}); - // Only null strings are inferred as text - AssertInspect({"/alpha=xyz"}, {Str("alpha")}); + // Cannot infer if the only values are null + AssertInspectError({"/alpha=xyz"}); // If there are too many digits fall back to string AssertInspect({"/alpha=3760212050"}, {Str("alpha")}); From dadbe8b3362346ff2d54e076215b46f44835c976 Mon Sep 17 00:00:00 2001 From: Benjamin Kietzman Date: Fri, 19 Feb 2021 13:01:35 -0500 Subject: [PATCH 30/33] Use null scalars for known-null fields --- cpp/src/arrow/dataset/expression.cc | 82 ++++++++------------ cpp/src/arrow/dataset/expression.h | 24 +----- cpp/src/arrow/dataset/expression_test.cc | 48 ++++++------ cpp/src/arrow/dataset/partition.cc | 60 +++++++------- cpp/src/arrow/dataset/projector.cc | 16 +++- python/pyarrow/_dataset.pyx | 11 +-- python/pyarrow/includes/libarrow_dataset.pxd | 9 +-- python/pyarrow/public-api.pxi | 3 + python/pyarrow/tests/test_dataset.py | 1 + 9 files changed, 108 insertions(+), 146 deletions(-) diff --git a/cpp/src/arrow/dataset/expression.cc b/cpp/src/arrow/dataset/expression.cc index fb62f819121..9764700816c 100644 --- a/cpp/src/arrow/dataset/expression.cc +++ b/cpp/src/arrow/dataset/expression.cc @@ -95,6 +95,8 @@ namespace { std::string PrintDatum(const Datum& datum) { if (datum.is_scalar()) { + if (!datum.scalar()->is_valid) return "null"; + switch (datum.type()->id()) { case Type::STRING: case Type::LARGE_STRING: @@ -110,6 +112,7 @@ std::string PrintDatum(const Datum& datum) { default: break; } + return datum.scalar()->ToString(); } return datum.ToString(); @@ -684,27 +687,27 @@ std::vector GuaranteeConjunctionMembers( // conjunction_members Status ExtractKnownFieldValuesImpl( std::vector* conjunction_members, - std::unordered_map* known_values) { - auto unconsumed_end = std::partition( - conjunction_members->begin(), conjunction_members->end(), - [](const Expression& expr) { - // search for an equality conditions between a field and a literal - auto call = expr.call(); - if (!call) return true; - - if (call->function_name == "equal") { - auto ref = call->arguments[0].field_ref(); - auto lit = call->arguments[1].literal(); - return !(ref && lit); - } - - if (call->function_name == "is_null" || call->function_name == "is_valid") { - auto ref = call->arguments[0].field_ref(); - return !ref; - } - - return true; - }); + std::unordered_map* known_values) { + auto unconsumed_end = + std::partition(conjunction_members->begin(), conjunction_members->end(), + [](const Expression& expr) { + // search for an equality conditions between a field and a literal + auto call = expr.call(); + if (!call) return true; + + if (call->function_name == "equal") { + auto ref = call->arguments[0].field_ref(); + auto lit = call->arguments[1].literal(); + return !(ref && lit); + } + + if (call->function_name == "is_null") { + auto ref = call->arguments[0].field_ref(); + return !ref; + } + + return true; + }); for (auto it = unconsumed_end; it != conjunction_members->end(); ++it) { auto call = CallNotNull(*it); @@ -715,10 +718,7 @@ Status ExtractKnownFieldValuesImpl( known_values->emplace(*ref, *lit); } else if (call->function_name == "is_null") { auto 
ref = call->arguments[0].field_ref(); - known_values->emplace(*ref, false); - } else if (call->function_name == "is_valid") { - auto ref = call->arguments[0].field_ref(); - known_values->emplace(*ref, true); + known_values->emplace(*ref, std::make_shared()); } } @@ -729,16 +729,16 @@ Status ExtractKnownFieldValuesImpl( } // namespace -Result> -ExtractKnownFieldValues(const Expression& guaranteed_true_predicate) { +Result> ExtractKnownFieldValues( + const Expression& guaranteed_true_predicate) { auto conjunction_members = GuaranteeConjunctionMembers(guaranteed_true_predicate); - std::unordered_map known_values; + std::unordered_map known_values; RETURN_NOT_OK(ExtractKnownFieldValuesImpl(&conjunction_members, &known_values)); return known_values; } Result ReplaceFieldsWithKnownValues( - const std::unordered_map& known_values, + const std::unordered_map& known_values, Expression expr) { if (!expr.IsBound()) { return Status::Invalid( @@ -751,11 +751,7 @@ Result ReplaceFieldsWithKnownValues( if (auto ref = expr.field_ref()) { auto it = known_values.find(*ref); if (it != known_values.end()) { - const auto& known_value = it->second; - if (!known_value.concrete()) { - return expr; - } - auto lit = known_value.datum; + Datum lit = it->second; if (expr.type()->id() == Type::DICTIONARY) { if (lit.is_scalar()) { // FIXME the "right" way to support this is adding support for scalars to @@ -775,22 +771,6 @@ Result ReplaceFieldsWithKnownValues( ARROW_ASSIGN_OR_RAISE(lit, compute::Cast(lit, expr.type())); return literal(std::move(lit)); } - } else if (auto call = expr.call()) { - if (call->function_name == "is_null") { - if (auto ref = call->arguments[0].field_ref()) { - auto it = known_values.find(*ref); - if (it != known_values.end()) { - return literal(!it->second.valid); - } - } - } else if (call->function_name == "is_valid") { - if (auto ref = call->arguments[0].field_ref()) { - auto it = known_values.find(*ref); - if (it != known_values.end()) { - return literal(it->second.valid); - } - } - } } return expr; }, @@ -967,7 +947,7 @@ Result SimplifyWithGuarantee(Expression expr, const Expression& guaranteed_true_predicate) { auto conjunction_members = GuaranteeConjunctionMembers(guaranteed_true_predicate); - std::unordered_map known_values; + std::unordered_map known_values; RETURN_NOT_OK(ExtractKnownFieldValuesImpl(&conjunction_members, &known_values)); ARROW_ASSIGN_OR_RAISE(expr, diff --git a/cpp/src/arrow/dataset/expression.h b/cpp/src/arrow/dataset/expression.h index b6b47fb8a2e..8bdcb4a0ffa 100644 --- a/cpp/src/arrow/dataset/expression.h +++ b/cpp/src/arrow/dataset/expression.h @@ -159,27 +159,10 @@ Expression call(std::string function, std::vector arguments, ARROW_DS_EXPORT std::vector FieldsInExpression(const Expression&); -/// Represents either a concrete value or a hint that a field is valid/invalid -struct KnownFieldValue { - Datum datum; - bool valid; - - KnownFieldValue() : datum(), valid(false) {} - KnownFieldValue(const Datum& datum) // NOLINT implicit conversion - : datum(datum), valid(datum.length() != datum.null_count()) {} - KnownFieldValue(bool is_valid) // NOLINT implicit conversion - : datum(), valid(is_valid) {} - - inline bool concrete() const { return datum.kind() != Datum::Kind::NONE; } - bool operator==(const KnownFieldValue& other) const { - return datum == other.datum && valid == other.valid; - } -}; - /// Assemble a mapping from field references to known values. 
ARROW_DS_EXPORT -Result> -ExtractKnownFieldValues(const Expression& guaranteed_true_predicate); +Result> ExtractKnownFieldValues( + const Expression& guaranteed_true_predicate); /// \defgroup expression-passes Functions for modification of Expressions /// @@ -208,8 +191,7 @@ Result FoldConstants(Expression); /// Simplify Expressions by replacing with known values of the fields which it references. ARROW_DS_EXPORT Result ReplaceFieldsWithKnownValues( - const std::unordered_map& known_values, - Expression); + const std::unordered_map& known_values, Expression); /// Simplify an expression by replacing subexpressions based on a guarantee: /// a boolean expression which is guaranteed to evaluate to `true`. For example, this is diff --git a/cpp/src/arrow/dataset/expression_test.cc b/cpp/src/arrow/dataset/expression_test.cc index adaf6c3410d..c837c5be893 100644 --- a/cpp/src/arrow/dataset/expression_test.cc +++ b/cpp/src/arrow/dataset/expression_test.cc @@ -680,9 +680,8 @@ TEST(Expression, FoldConstantsBoolean) { TEST(Expression, ExtractKnownFieldValues) { struct { - void operator()( - Expression guarantee, - std::unordered_map expected) { + void operator()(Expression guarantee, + std::unordered_map expected) { ASSERT_OK_AND_ASSIGN(auto actual, ExtractKnownFieldValues(guarantee)); EXPECT_THAT(actual, UnorderedElementsAreArray(expected)) << " guarantee: " << guarantee.ToString(); @@ -730,20 +729,20 @@ TEST(Expression, ExtractKnownFieldValues) { } TEST(Expression, ReplaceFieldsWithKnownValues) { - auto ExpectReplacesTo = [](Expression expr, - const std::unordered_map& known_values, - Expression unbound_expected) { - ASSERT_OK_AND_ASSIGN(expr, expr.Bind(*kBoringSchema)); - ASSERT_OK_AND_ASSIGN(auto expected, unbound_expected.Bind(*kBoringSchema)); - ASSERT_OK_AND_ASSIGN(auto replaced, ReplaceFieldsWithKnownValues(known_values, expr)); + auto ExpectReplacesTo = + [](Expression expr, + const std::unordered_map& known_values, + Expression unbound_expected) { + ASSERT_OK_AND_ASSIGN(expr, expr.Bind(*kBoringSchema)); + ASSERT_OK_AND_ASSIGN(auto expected, unbound_expected.Bind(*kBoringSchema)); + ASSERT_OK_AND_ASSIGN(auto replaced, + ReplaceFieldsWithKnownValues(known_values, expr)); - EXPECT_EQ(replaced, expected); - ExpectIdenticalIfUnchanged(replaced, expr); - }; + EXPECT_EQ(replaced, expected); + ExpectIdenticalIfUnchanged(replaced, expr); + }; - std::unordered_map i32_is_3{ - {"i32", Datum(3)}}; + std::unordered_map i32_is_3{{"i32", Datum(3)}}; ExpectReplacesTo(literal(1), i32_is_3, literal(1)); @@ -776,13 +775,18 @@ TEST(Expression, ReplaceFieldsWithKnownValues) { literal(2), })); - std::unordered_map a_valid_b_invalid{ - {"a", true}, {"b", false}}; + std::unordered_map i32_valid_str_null{ + {"i32", Datum(3)}, {"str", MakeNullScalar(utf8())}}; + + ExpectReplacesTo(is_null(field_ref("i32")), i32_valid_str_null, is_null(literal(3))); - ExpectReplacesTo(is_null(field_ref("a")), a_valid_b_invalid, literal(false)); - ExpectReplacesTo(is_valid(field_ref("a")), a_valid_b_invalid, literal(true)); - ExpectReplacesTo(is_null(field_ref("b")), a_valid_b_invalid, literal(true)); - ExpectReplacesTo(is_valid(field_ref("b")), a_valid_b_invalid, literal(false)); + ExpectReplacesTo(is_valid(field_ref("i32")), i32_valid_str_null, is_valid(literal(3))); + + ExpectReplacesTo(is_null(field_ref("str")), i32_valid_str_null, + is_null(null_literal(utf8()))); + + ExpectReplacesTo(is_valid(field_ref("str")), i32_valid_str_null, + is_valid(null_literal(utf8()))); } struct { @@ -1046,7 +1050,7 @@ TEST(Expression, 
SimplifyWithGuarantee) { Simplify{is_valid(field_ref("i32"))} .WithGuarantee(is_valid(field_ref("i32"))) - .Expect(literal(true)); + .Expect(is_valid(field_ref("i32"))); } TEST(Expression, SimplifyThenExecute) { diff --git a/cpp/src/arrow/dataset/partition.cc b/cpp/src/arrow/dataset/partition.cc index f96002b36b2..522dbbeb5d2 100644 --- a/cpp/src/arrow/dataset/partition.cc +++ b/cpp/src/arrow/dataset/partition.cc @@ -74,31 +74,20 @@ Status KeyValuePartitioning::SetDefaultValuesFromKeys(const Expression& expr, RecordBatchProjector* projector) { ARROW_ASSIGN_OR_RAISE(auto known_values, ExtractKnownFieldValues(expr)); for (const auto& ref_value : known_values) { - const auto& known_value = ref_value.second; - if (known_value.concrete() && !known_value.datum.is_scalar()) { - return Status::Invalid("non-scalar partition key ", known_value.datum.ToString()); + if (!ref_value.second.is_scalar()) { + return Status::Invalid("non-scalar partition key ", ref_value.second.ToString()); } ARROW_ASSIGN_OR_RAISE(auto match, ref_value.first.FindOneOrNone(*projector->schema())); if (match.empty()) continue; - - const auto& field = projector->schema()->field(match[0]); - if (known_value.concrete()) { - RETURN_NOT_OK(projector->SetDefaultValue(match, known_value.datum.scalar())); - } else if (known_value.valid) { - // We know some information about the value but nothing concrete enough to set. Can - // happen if expression is something like is_valid(field_ref("a")) - continue; - } else { - RETURN_NOT_OK(projector->SetDefaultValue(match, MakeNullScalar(field->type()))); - } + RETURN_NOT_OK(projector->SetDefaultValue(match, ref_value.second.scalar())); } return Status::OK(); } -Expression ConjunctionFromGroupingRow(Scalar* row) { +inline Expression ConjunctionFromGroupingRow(Scalar* row) { ScalarVector* values = &checked_cast(row)->value; std::vector equality_expressions(values->size()); for (size_t i = 0; i < values->size(); ++i) { @@ -213,34 +202,37 @@ Result KeyValuePartitioning::Format(const Expression& expr) const { ARROW_ASSIGN_OR_RAISE(auto known_values, ExtractKnownFieldValues(expr)); for (const auto& ref_value : known_values) { - const auto& known_value = ref_value.second; - if (known_value.concrete() && !known_value.datum.is_scalar()) { - return Status::Invalid("non-scalar partition key ", known_value.datum.ToString()); + if (!ref_value.second.is_scalar()) { + return Status::Invalid("non-scalar partition key ", ref_value.second.ToString()); } ARROW_ASSIGN_OR_RAISE(auto match, ref_value.first.FindOneOrNone(*schema_)); if (match.empty()) continue; - const auto& field = schema_->field(match[0]); - - if (known_value.concrete()) { - auto value = known_value.datum.scalar(); - if (!value->type->Equals(field->type())) { - return Status::TypeError("scalar ", value->ToString(), " (of type ", *value->type, - ") is invalid for ", field->ToString()); - } + auto value = ref_value.second.scalar(); - if (value->type->id() == Type::DICTIONARY) { - ARROW_ASSIGN_OR_RAISE( - value, checked_cast(*value).GetEncodedValue()); + const auto& field = schema_->field(match[0]); + if (!value->type->Equals(field->type())) { + if (value->is_valid) { + auto maybe_converted = compute::Cast(value, field->type()); + if (!maybe_converted.ok()) { + return Status::TypeError("Error converting scalar ", value->ToString(), + " (of type ", *value->type, + ") to a partition key for ", field->ToString(), ": ", + maybe_converted.status().message()); + } + value = maybe_converted->scalar(); + } else { + value = MakeNullScalar(field->type()); } + } 
- values[match[0]] = std::move(value); - } else { - if (!known_value.valid) { - values[match[0]] = MakeNullScalar(field->type()); - } + if (value->type->id() == Type::DICTIONARY) { + ARROW_ASSIGN_OR_RAISE( + value, checked_cast(*value).GetEncodedValue()); } + + values[match[0]] = std::move(value); } return FormatValues(values); diff --git a/cpp/src/arrow/dataset/projector.cc b/cpp/src/arrow/dataset/projector.cc index 2ba679ce6e7..ba0eb2ddff5 100644 --- a/cpp/src/arrow/dataset/projector.cc +++ b/cpp/src/arrow/dataset/projector.cc @@ -23,6 +23,7 @@ #include #include "arrow/array.h" +#include "arrow/compute/cast.h" #include "arrow/dataset/type_fwd.h" #include "arrow/record_batch.h" #include "arrow/result.h" @@ -88,9 +89,18 @@ Status RecordBatchProjector::SetDefaultValue(FieldRef ref, auto field_type = to_->field(index)->type(); if (!field_type->Equals(scalar->type)) { - return Status::TypeError("field ", to_->field(index)->ToString(), - " cannot be materialized from scalar of type ", - *scalar->type); + if (scalar->is_valid) { + auto maybe_converted = compute::Cast(scalar, field_type); + if (!maybe_converted.ok()) { + return Status::TypeError("Field ", to_->field(index)->ToString(), + " cannot be materialized from scalar of type ", + *scalar->type, + ". Cast error: ", maybe_converted.status().message()); + } + scalar = maybe_converted->scalar(); + } else { + scalar = MakeNullScalar(field_type); + } } scalars_[index] = std::move(scalar); diff --git a/python/pyarrow/_dataset.pyx b/python/pyarrow/_dataset.pyx index 104a47b98c5..1c4e5d302c5 100644 --- a/python/pyarrow/_dataset.pyx +++ b/python/pyarrow/_dataset.pyx @@ -2361,17 +2361,14 @@ def _get_partition_keys(Expression partition_expression): """ cdef: CExpression expr = partition_expression.unwrap() - pair[CFieldRef, CKnownFieldValue] ref_val + pair[CFieldRef, CDatum] ref_val out = {} for ref_val in GetResultValue(CExtractKnownFieldValues(expr)): assert ref_val.first.name() != nullptr - if ref_val.second.valid: - assert ref_val.second.datum.kind() == DatumType_SCALAR - val = pyarrow_wrap_scalar(ref_val.second.datum.scalar()) - out[frombytes(deref(ref_val.first.name()))] = val.as_py() - else: - out[frombytes(deref(ref_val.first.name()))] = None + assert ref_val.second.kind() == DatumType_SCALAR + val = pyarrow_wrap_scalar(ref_val.second.scalar()) + out[frombytes(deref(ref_val.first.name()))] = val.as_py() return out diff --git a/python/pyarrow/includes/libarrow_dataset.pxd b/python/pyarrow/includes/libarrow_dataset.pxd index 2127b3dccff..93bc0edddc1 100644 --- a/python/pyarrow/includes/libarrow_dataset.pxd +++ b/python/pyarrow/includes/libarrow_dataset.pxd @@ -315,14 +315,7 @@ cdef extern from "arrow/dataset/api.h" namespace "arrow::dataset" nogil: const CExpression& partition_expression, CRecordBatchProjector* projector) - cdef cppclass CKnownFieldValue "arrow::dataset::KnownFieldValue": - CDatum datum - c_bool valid - CKnownFieldValue(CDatum datum) - CKnownFieldValue(c_bool valid) - c_bool operator==(const CKnownFieldValue&) const - - cdef CResult[unordered_map[CFieldRef, CKnownFieldValue, CFieldRefHash]] \ + cdef CResult[unordered_map[CFieldRef, CDatum, CFieldRefHash]] \ CExtractKnownFieldValues "arrow::dataset::ExtractKnownFieldValues"( const CExpression& partition_expression) diff --git a/python/pyarrow/public-api.pxi b/python/pyarrow/public-api.pxi index aa738f9aaea..998af512c55 100644 --- a/python/pyarrow/public-api.pxi +++ b/python/pyarrow/public-api.pxi @@ -251,6 +251,9 @@ cdef api object pyarrow_wrap_scalar(const shared_ptr[CScalar]& 
sp_scalar):
     if data_type == NULL:
         raise ValueError('Scalar data type was NULL')
 
+    if data_type.id() == _Type_NA:
+        return _NULL
+
     if data_type.id() not in _scalar_classes:
         raise ValueError('Scalar type not supported')
 
diff --git a/python/pyarrow/tests/test_dataset.py b/python/pyarrow/tests/test_dataset.py
index e12f802e610..67870b1e70d 100644
--- a/python/pyarrow/tests/test_dataset.py
+++ b/python/pyarrow/tests/test_dataset.py
@@ -26,6 +26,7 @@
 import pytest
 
 import pyarrow as pa
+import pyarrow.csv
 import pyarrow.fs as fs
 
 from pyarrow.tests.util import change_cwd, _filesystem_uri

From d3bfe09500d40a8c95874734cfd34bad507ec540 Mon Sep 17 00:00:00 2001
From: Weston Pace
Date: Mon, 22 Feb 2021 14:57:43 -1000
Subject: [PATCH 31/33] constexpr not supported in this context in all gcc versions due to gcc bugs. Pulling out for a second

---
 cpp/src/arrow/compute/kernels/vector_hash.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cpp/src/arrow/compute/kernels/vector_hash.cc b/cpp/src/arrow/compute/kernels/vector_hash.cc
index 754d8fba83b..de4d3ee3022 100644
--- a/cpp/src/arrow/compute/kernels/vector_hash.cc
+++ b/cpp/src/arrow/compute/kernels/vector_hash.cc
@@ -152,7 +152,7 @@ class ValueCountsAction final : ActionBase {
     }
   }
 
-  constexpr bool ShouldEncodeNulls() const { return true; }
+  bool ShouldEncodeNulls() const { return true; }
 
  private:
   Int64Builder count_builder_;

From f18c70110ddc04cde6099ae04fd9e05a83c24ff8 Mon Sep 17 00:00:00 2001
From: Weston Pace
Date: Tue, 23 Feb 2021 09:22:37 -1000
Subject: [PATCH 32/33] Missed one of the merge conflicts

---
 python/pyarrow/tests/test_dataset.py | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/python/pyarrow/tests/test_dataset.py b/python/pyarrow/tests/test_dataset.py
index 67870b1e70d..57179f391de 100644
--- a/python/pyarrow/tests/test_dataset.py
+++ b/python/pyarrow/tests/test_dataset.py
@@ -548,12 +548,8 @@ def test_file_format_pickling():
         'subdir/2/yyy/file1.parquet',
     ]
 ])
-<<<<<<< HEAD
 @pytest.mark.parametrize('pre_buffer', [False, True])
 def test_filesystem_factory(mockfs, paths_or_selector, pre_buffer):
-=======
-def test_filesystem_factory(mockfs, paths_or_selector):
->>>>>>> Final lint pass. Turns out I was relying on black which was messing up everything
     format = ds.ParquetFileFormat(
         read_options=ds.ParquetReadOptions(dictionary_columns={"str"},
                                            pre_buffer=pre_buffer)

From 591021e53ea141a006bd6a30c7be6966becf040e Mon Sep 17 00:00:00 2001
From: Weston Pace
Date: Tue, 23 Feb 2021 10:51:53 -1000
Subject: [PATCH 33/33] Putting in suggestion from Ben. It got lost on rebase / force-push

---
 cpp/src/arrow/dataset/expression.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cpp/src/arrow/dataset/expression.cc b/cpp/src/arrow/dataset/expression.cc
index 9764700816c..5ddb270451a 100644
--- a/cpp/src/arrow/dataset/expression.cc
+++ b/cpp/src/arrow/dataset/expression.cc
@@ -718,7 +718,7 @@ Status ExtractKnownFieldValuesImpl(
       known_values->emplace(*ref, *lit);
     } else if (call->function_name == "is_null") {
       auto ref = call->arguments[0].field_ref();
-      known_values->emplace(*ref, std::make_shared<NullScalar>());
+      known_values->emplace(*ref, Datum(std::make_shared<NullScalar>()));
     }
   }
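
A minimal usage sketch of the null-aware Hive partitioning exercised by the new Python tests in this series. The positional HivePartitioning(schema, dictionaries, null_fallback) arguments, the 'xyz' fallback label, the expected directory names and the _get_partition_keys behaviour are taken from the tests above; the output path is illustrative only.

    import pyarrow as pa
    import pyarrow.dataset as ds

    # Both partition columns contain nulls; nulls are written under the
    # configured fallback label instead of being dropped.
    table = pa.table({'a': ['x', None, 'z'], 'b': ['x', 'y', None]})
    part = ds.HivePartitioning(
        pa.schema([pa.field('a', pa.string()), pa.field('b', pa.string())]),
        None,    # dictionaries
        'xyz')   # null_fallback label used for null partition keys

    # '/tmp/hive_null_demo' is an illustrative output path.
    ds.write_dataset(table, '/tmp/hive_null_demo', format='parquet',
                     partitioning=part)
    # Expected directories: a=x/b=x, a=xyz/b=y, a=z/b=xyz

    # A partition expression that pins a field to null now reports the
    # key as None:
    print(ds._get_partition_keys(ds.field('a').is_null()))   # {'a': None}

When no null_fallback is given, the docstring added in this series names "__HIVE_DEFAULT_PARTITION__" as the default label.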