From e120ade48a1a46f175ac7c3e3fb0e223205716cb Mon Sep 17 00:00:00 2001 From: Benjamin Kietzman Date: Tue, 17 Mar 2020 09:12:27 -0400 Subject: [PATCH 1/3] ARROW-8088: [C++][Dataset] Support dictionary partition columns --- cpp/src/arrow/array.cc | 14 +++ cpp/src/arrow/builder.cc | 4 +- cpp/src/arrow/dataset/dataset_test.cc | 62 +++++++++++--- cpp/src/arrow/ipc/json_simple.cc | 112 +++++++++++++++++++++++- cpp/src/arrow/ipc/json_simple_test.cc | 112 ++++++++++++++++++++++-- cpp/src/arrow/scalar.cc | 79 ++++++++++------- cpp/src/arrow/scalar.h | 118 +++++++++++++------------- 7 files changed, 386 insertions(+), 115 deletions(-) diff --git a/cpp/src/arrow/array.cc b/cpp/src/arrow/array.cc index 4bf0b8ef4c6..1b387cd7cc6 100644 --- a/cpp/src/arrow/array.cc +++ b/cpp/src/arrow/array.cc @@ -1569,6 +1569,20 @@ class RepeatedArrayFactory { return FinishFixedWidth(value.data(), value.size()); } + Status Visit(const DictionaryType& type) { + std::shared_ptr dictionary, indices; + + const auto& value = checked_cast(scalar_).value; + RETURN_NOT_OK(MakeArrayFromScalar(pool_, *value, 1, &dictionary)); + + ARROW_ASSIGN_OR_RAISE(auto zero, MakeScalar(type.index_type(), 0)); + RETURN_NOT_OK(MakeArrayFromScalar(pool_, *zero, length_, &indices)); + + *out_ = std::make_shared(scalar_.type, std::move(indices), + std::move(dictionary)); + return Status::OK(); + } + Status Visit(const DataType& type) { return Status::NotImplemented("construction from scalar of type ", *scalar_.type); } diff --git a/cpp/src/arrow/builder.cc b/cpp/src/arrow/builder.cc index 0a085aceb5e..d6ee4f1d26a 100644 --- a/cpp/src/arrow/builder.cc +++ b/cpp/src/arrow/builder.cc @@ -35,8 +35,8 @@ class MemoryPool; // Helper functions struct DictionaryBuilderCase { - template - Status Visit(const ValueType&, typename ValueType::c_type* = nullptr) { + template + Status Visit(const ValueType&) { return CreateFor(); } diff --git a/cpp/src/arrow/dataset/dataset_test.cc b/cpp/src/arrow/dataset/dataset_test.cc index 8741d169eab..77806b0e0d0 100644 --- a/cpp/src/arrow/dataset/dataset_test.cc +++ b/cpp/src/arrow/dataset/dataset_test.cc @@ -403,13 +403,21 @@ TEST_F(TestEndToEnd, EndToEndSingleDataset) { AssertTablesEqual(*expected, *table, false, true); } +inline std::shared_ptr SchemaFromNames(const std::vector names) { + std::vector> fields; + for (const auto& name : names) { + fields.push_back(field(name, int32())); + } + + return schema(fields); +} + class TestSchemaUnification : public TestDataset { public: using i32 = util::optional; + using PathAndContent = std::vector>; - void SetUp() { - using PathAndContent = std::vector>; - + void SetUp() override { // The following test creates 2 sources with divergent but compatible // schemas. Each source have a common partitioning where the // fields are not materialized in the data fragments. @@ -443,7 +451,7 @@ class TestSchemaUnification : public TestDataset { auto get_source = [this](std::string base, std::vector paths) -> Result> { - auto resolver = [this](const FileSource& source) -> std::shared_ptr { + auto resolver = [](const FileSource& source) -> std::shared_ptr { auto path = source.path(); // A different schema for each data fragment. if (path == ds1_df1) { @@ -488,15 +496,6 @@ class TestSchemaUnification : public TestDataset { std::make_shared(schema_, DatasetVector{ds1, ds2}); } - std::shared_ptr SchemaFromNames(const std::vector names) { - std::vector> fields; - for (const auto& name : names) { - fields.push_back(field(name, int32())); - } - - return schema(fields); - } - template void AssertScanEquals(std::shared_ptr scanner, const std::vector& expected_rows) { @@ -642,5 +641,42 @@ TEST_F(TestSchemaUnification, SelectMixedColumnsAndFilter) { AssertBuilderEquals(scan_builder, rows); } +TEST(TestDictPartitionColumn, SelectPartitionColumnFilterPhysicalColumn) { + auto partition_field = field("part", dictionary(int32(), utf8())); + auto path = "/dataset/part=one/data.json"; + + auto mock_fs = std::make_shared(fs::kNoTime); + ARROW_EXPECT_OK(mock_fs->CreateFile(path, R"([ {"phy_1": 111, "phy_2": 211} ])", + /* recursive */ true)); + + auto physical_schema = SchemaFromNames({"phy_1", "phy_2"}); + auto format = std::make_shared( + [=](const FileSource&) { return physical_schema; }); + + FileSystemFactoryOptions options; + options.partition_base_dir = "/dataset"; + options.partitioning = std::make_shared(schema({partition_field})); + + ASSERT_OK_AND_ASSIGN(auto factory, + FileSystemDatasetFactory::Make(mock_fs, {path}, format, options)); + + ASSERT_OK_AND_ASSIGN(auto schema, factory->Inspect()); + + ASSERT_OK_AND_ASSIGN(auto dataset, factory->Finish(schema)); + + // Selects re-ordered virtual column with a filter on a physical column + ASSERT_OK_AND_ASSIGN(auto scan_builder, dataset->NewScan()); + ASSERT_OK(scan_builder->Filter("phy_1"_ == 111)); + + ASSERT_OK(scan_builder->Project({"part"})); + + ASSERT_OK_AND_ASSIGN(auto scanner, scan_builder->Finish()); + ASSERT_OK_AND_ASSIGN(auto table, scanner->ToTable()); + AssertArraysEqual(*table->column(0)->chunk(0), + *ArrayFromJSON(partition_field->type(), R"([ + [0, ["one"]] + ])")); +} + } // namespace dataset } // namespace arrow diff --git a/cpp/src/arrow/ipc/json_simple.cc b/cpp/src/arrow/ipc/json_simple.cc index 06320b844d2..16c0c91fcac 100644 --- a/cpp/src/arrow/ipc/json_simple.cc +++ b/cpp/src/arrow/ipc/json_simple.cc @@ -660,7 +660,111 @@ class StructConverter final : public ConcreteConverter { }; // ------------------------------------------------------------------------ -// Converter for struct arrays +// Converter for dictionary arrays + +class DictionaryConverter final : public ConcreteConverter { + public: + explicit DictionaryConverter(const std::shared_ptr& type) { type_ = type; } + + struct FixedDictionaryBuilder : ArrayBuilder { + using ArrayBuilder::ArrayBuilder; + + Status AppendNull() override { return index_builder_->AppendNull(); } + + Status AppendNulls(int64_t length) override { + return index_builder_->AppendNulls(length); + } + + Status FinishInternal(std::shared_ptr* out) override { + if (dictionary_ == nullptr) { + return Status::Invalid("dictionary was never supplied"); + } + + if (max_index_ >= dictionary_->length()) { + return Status::IndexError("An index was out of bounds: ", max_index_, + " cannot refer to an element of ", + dictionary_->ToString()); + } + + RETURN_NOT_OK(index_builder_->FinishInternal(out)); + (*out)->type = type_; + (*out)->dictionary = dictionary_; + + Reset(); + return Status::OK(); + } + + void Reset() override { + index_builder_->Reset(); + dictionary_ = nullptr; + max_index_ = 0; + } + + std::shared_ptr type() const override { return type_; } + + void UpdateMax(const rj::Value& json_index) { + if (json_index.IsNull()) return; + + auto index = json_index.GetInt64(); + if (index > max_index_) max_index_ = index; + } + + int64_t max_index_ = 0; + ArrayBuilder* index_builder_; + std::shared_ptr type_; + std::shared_ptr dictionary_; + }; + + Status Init() override { + auto dictionary_type = checked_cast(type_.get()); + RETURN_NOT_OK(GetConverter(dictionary_type->index_type(), &index_converter_)); + RETURN_NOT_OK(GetConverter(dictionary_type->value_type(), &dictionary_converter_)); + + builder_.reset(new FixedDictionaryBuilder(default_memory_pool())); + builder_->index_builder_ = index_converter_->builder().get(); + builder_->type_ = type_; + + return Status::OK(); + } + + Status AppendNull() override { return builder_->AppendNull(); } + + Status AppendValue(const rj::Value& json_obj) override { + if (json_obj.IsNull()) { + return AppendNull(); + } + + if (!json_obj.IsArray()) { + return JSONTypeError("array", json_obj.GetType()); + } + + if (json_obj.Size() != 1 && json_obj.Size() != 2) { + return Status::Invalid( + "Expected [index] or [index, dictionary], got array of size ", json_obj.Size()); + } + + RETURN_NOT_OK(index_converter_->AppendValue(json_obj[0])); + builder_->UpdateMax(json_obj[0]); + + if (json_obj.Size() == 1) return Status::OK(); + + if (builder_->dictionary_ != nullptr) { + return Status::Invalid("dictionary was already specified"); + } + + RETURN_NOT_OK(dictionary_converter_->AppendValues(json_obj[1])); + return dictionary_converter_->Finish(&builder_->dictionary_); + } + + std::shared_ptr builder() override { return builder_; } + + private: + std::shared_ptr builder_; + std::shared_ptr index_converter_, dictionary_converter_; +}; + +// ------------------------------------------------------------------------ +// Converter for union arrays class UnionConverter final : public ConcreteConverter { public: @@ -701,9 +805,8 @@ class UnionConverter final : public ConcreteConverter { return builder_->AppendNull(); } - // Append a JSON value that is either an array of N elements in order - // or an object mapping struct names to values (omitted struct members - // are mapped to null). + // Append a JSON value that must be a 2-long array, containing the type_id + // and value of the UnionArray's slot. Status AppendValue(const rj::Value& json_obj) override { if (json_obj.IsNull()) { return AppendNull(); @@ -798,6 +901,7 @@ Status GetConverter(const std::shared_ptr& type, SIMPLE_CONVERTER_CASE(Type::FIXED_SIZE_BINARY, FixedSizeBinaryConverter) SIMPLE_CONVERTER_CASE(Type::DECIMAL, DecimalConverter) SIMPLE_CONVERTER_CASE(Type::UNION, UnionConverter) + SIMPLE_CONVERTER_CASE(Type::DICTIONARY, DictionaryConverter) case Type::INTERVAL: { switch (checked_cast(*type).interval_type()) { case IntervalType::MONTHS: diff --git a/cpp/src/arrow/ipc/json_simple_test.cc b/cpp/src/arrow/ipc/json_simple_test.cc index f88dbe63c0c..16e8b485da4 100644 --- a/cpp/src/arrow/ipc/json_simple_test.cc +++ b/cpp/src/arrow/ipc/json_simple_test.cc @@ -1179,10 +1179,14 @@ TEST(TestDenseUnion, Errors) { std::shared_ptr type = union_({field_a, field_b}, {4, 8}, UnionMode::DENSE); std::shared_ptr array; - ASSERT_RAISES(Invalid, ArrayFromJSON(type, "[\"\"]", &array)); - ASSERT_RAISES(Invalid, ArrayFromJSON(type, "[[0, 8]]", &array)); + ASSERT_RAISES(Invalid, ArrayFromJSON(type, "[\"not a valid type_id\"]", &array)); + ASSERT_RAISES(Invalid, + ArrayFromJSON(type, "[[0, 99]]", &array)); // 0 is not one of {4, 8} + ASSERT_RAISES(Invalid, + ArrayFromJSON(type, "[[4, \"\"]]", &array)); // "" is not a valid int8() + + ASSERT_RAISES(Invalid, ArrayFromJSON(type, "[\"not a pair\"]", &array)); ASSERT_RAISES(Invalid, ArrayFromJSON(type, "[[0]]", &array)); - ASSERT_RAISES(Invalid, ArrayFromJSON(type, "[[4, \"\"]]", &array)); ASSERT_RAISES(Invalid, ArrayFromJSON(type, "[[8, true, 1]]", &array)); } @@ -1192,13 +1196,109 @@ TEST(TestSparseUnion, Errors) { std::shared_ptr type = union_({field_a, field_b}, {4, 8}, UnionMode::SPARSE); std::shared_ptr array; - ASSERT_RAISES(Invalid, ArrayFromJSON(type, "[\"\"]", &array)); - ASSERT_RAISES(Invalid, ArrayFromJSON(type, "[[0, 8]]", &array)); - ASSERT_RAISES(Invalid, ArrayFromJSON(type, "[[0]]", &array)); + ASSERT_RAISES(Invalid, ArrayFromJSON(type, "[\"not a valid type_id\"]", &array)); + ASSERT_RAISES(Invalid, ArrayFromJSON(type, "[[0, 99]]", &array)); ASSERT_RAISES(Invalid, ArrayFromJSON(type, "[[4, \"\"]]", &array)); + + ASSERT_RAISES(Invalid, ArrayFromJSON(type, "[\"not a pair\"]", &array)); + ASSERT_RAISES(Invalid, ArrayFromJSON(type, "[[0]]", &array)); ASSERT_RAISES(Invalid, ArrayFromJSON(type, "[[8, true, 1]]", &array)); } +TEST(TestDictionary, Basics) { + auto type = dictionary(int32(), utf8()); + auto array = ArrayFromJSON(type, R"([ + [null, ["whiskey", "tango", "foxtrot"]], + [2], + [1], + [0] + ])"); + + auto expected_indices = ArrayFromJSON(int32(), "[null, 2, 1, 0]"); + auto expected_dictionary = ArrayFromJSON(utf8(), R"(["whiskey", "tango", "foxtrot"])"); + + DictionaryArray expected(type, expected_indices, expected_dictionary); + + ASSERT_ARRAYS_EQUAL(expected, *array); +} + +TEST(TestDictionary, Nested) { + auto type = dictionary(int32(), utf8()); + auto array = ArrayFromJSON(struct_({field("dict", type)}), R"([ + {"dict": [null, ["whiskey", "tango", "foxtrot"]]}, + null, + {"dict": [2]}, + {"dict": [1]}, + null, + {"dict": [0]} + ])"); + + auto expected_indices = ArrayFromJSON(int32(), "[null, null, 2, 1, null, 0]"); + auto expected_dictionary = ArrayFromJSON(utf8(), R"(["whiskey", "tango", "foxtrot"])"); + auto expected_dict_array = + std::make_shared(type, expected_indices, expected_dictionary); + ASSERT_OK_AND_ASSIGN(auto expected_null_bitmap, + BitUtil::BytesToBits({1, 0, 1, 1, 0, 1})); + + ASSERT_OK_AND_ASSIGN(auto expected, StructArray::Make({expected_dict_array}, {"dict"}, + expected_null_bitmap)); + + ASSERT_ARRAYS_EQUAL(*expected, *array); +} + +TEST(TestDictionary, VeryNested) { + auto dict_type = dictionary(int32(), utf8()); + auto struct_type = struct_({field("dict", dict_type)}); + auto type = dictionary(int8(), struct_type); + + auto array = ArrayFromJSON(type, R"([ + [null, [ + {"dict": [null, ["whiskey", "tango", "foxtrot"]]}, + {"dict": [2]}, + {"dict": [1]}, + {"dict": [0]} + ]], + [0], + [1], + [2] + ])"); + + auto expected_dictionary = ArrayFromJSON(struct_type, R"([ + {"dict": [null, ["whiskey", "tango", "foxtrot"]]}, + {"dict": [2]}, + {"dict": [1]}, + {"dict": [0]} + ])"); + + auto expected_indices = ArrayFromJSON(int8(), "[null, 0, 1, 2]"); + auto expected_dict_array = + std::make_shared(type, expected_indices, expected_dictionary); + + DictionaryArray expected(type, expected_indices, expected_dictionary); + + ASSERT_ARRAYS_EQUAL(expected, *array); +} + +TEST(TestDictionary, Errors) { + auto type = dictionary(int32(), utf8()); + std::shared_ptr array; + + ASSERT_RAISES(Invalid, ArrayFromJSON(type, "[[\"not a valid index\"]]", &array)); + ASSERT_RAISES(Invalid, + ArrayFromJSON(type, "[[0, \"not a valid dictionary\"]]", &array)); + ASSERT_RAISES( + Invalid, ArrayFromJSON(type, "[[0], [1], [2]]", &array)); // no dictionary supplied + ASSERT_RAISES(Invalid, ArrayFromJSON(type, "[[0, [\"a\"]], [0, [\"a\"]]]", + &array)); // multiple dictionaries supplied + ASSERT_RAISES(IndexError, ArrayFromJSON(type, "[[0, [\"a\"]], [2]]", + &array)); // index out of bounds + + ASSERT_RAISES(Invalid, + ArrayFromJSON(type, "[\"not an index or index,dict pair\"]", &array)); + ASSERT_RAISES(Invalid, ArrayFromJSON(type, "[[]]", &array)); + ASSERT_RAISES(Invalid, ArrayFromJSON(type, "[[0, [\"\"], false]]", &array)); +} + } // namespace json } // namespace internal } // namespace ipc diff --git a/cpp/src/arrow/scalar.cc b/cpp/src/arrow/scalar.cc index ae53b3c19ed..9ef05271b8d 100644 --- a/cpp/src/arrow/scalar.cc +++ b/cpp/src/arrow/scalar.cc @@ -127,25 +127,31 @@ size_t Scalar::Hash::hash(const Scalar& scalar) { return ScalarHashImpl(scalar). StringScalar::StringScalar(std::string s) : StringScalar(Buffer::FromString(std::move(s))) {} -FixedSizeBinaryScalar::FixedSizeBinaryScalar(const std::shared_ptr& value, - const std::shared_ptr& type) - : BinaryScalar(value, type) { - ARROW_CHECK_EQ(checked_cast(*type).byte_width(), - value->size()); +FixedSizeBinaryScalar::FixedSizeBinaryScalar(std::shared_ptr value, + std::shared_ptr type) + : BinaryScalar(std::move(value), std::move(type)) { + ARROW_CHECK_EQ(checked_cast(*this->type).byte_width(), + this->value->size()); } -BaseListScalar::BaseListScalar(const std::shared_ptr& value, - const std::shared_ptr& type) - : Scalar{type, true}, value(value) {} +BaseListScalar::BaseListScalar(std::shared_ptr value, + std::shared_ptr type) + : Scalar{std::move(type), true}, value(std::move(value)) {} -BaseListScalar::BaseListScalar(const std::shared_ptr& value) - : BaseListScalar(value, value->type()) {} +BaseListScalar::BaseListScalar(std::shared_ptr value) + : Scalar(value->type(), true), value(std::move(value)) {} -FixedSizeListScalar::FixedSizeListScalar(const std::shared_ptr& value, - const std::shared_ptr& type) - : BaseListScalar(value, type) { - ARROW_CHECK_EQ(value->length(), - checked_cast(type.get())->list_size()); +FixedSizeListScalar::FixedSizeListScalar(std::shared_ptr value, + std::shared_ptr type) + : BaseListScalar(std::move(value), std::move(type)) { + ARROW_CHECK_EQ(this->value->length(), + checked_cast(*this->type).list_size()); +} + +DictionaryScalar::DictionaryScalar(std::shared_ptr type) + : Scalar(std::move(type)), + value( + MakeNullScalar(checked_cast(*this->type).value_type())) { } template @@ -173,15 +179,18 @@ struct MakeNullImpl { return Status::OK(); } - const std::shared_ptr& type_; + std::shared_ptr Finish() && { + // Should not fail. + DCHECK_OK(VisitTypeInline(*type_, this)); + return std::move(out_); + } + + std::shared_ptr type_; std::shared_ptr out_; }; -std::shared_ptr MakeNullScalar(const std::shared_ptr& type) { - MakeNullImpl impl = {type, nullptr}; - // Should not fail. - DCHECK_OK(VisitTypeInline(*type, &impl)); - return std::move(impl.out_); +std::shared_ptr MakeNullScalar(std::shared_ptr type) { + return MakeNullImpl{std::move(type), nullptr}.Finish(); } std::string Scalar::ToString() const { @@ -207,7 +216,12 @@ struct ScalarParseImpl { Status Visit(const LargeBinaryType&) { return FinishWithBuffer(); } - Status Visit(const FixedSizeBinaryType& t) { return FinishWithBuffer(); } + Status Visit(const FixedSizeBinaryType&) { return FinishWithBuffer(); } + + Status Visit(const DictionaryType& t) { + ARROW_ASSIGN_OR_RAISE(auto value, Scalar::Parse(t.value_type(), s_)); + return Finish(std::move(value)); + } Status Visit(const DataType& t) { return Status::NotImplemented("parsing scalars of type ", t); @@ -215,26 +229,27 @@ struct ScalarParseImpl { template Status Finish(Arg&& arg) { - return MakeScalar(type_, std::forward(arg)).Value(out_); + return MakeScalar(std::move(type_), std::forward(arg)).Value(&out_); } Status FinishWithBuffer() { return Finish(Buffer::FromString(s_.to_string())); } - ScalarParseImpl(const std::shared_ptr& type, util::string_view s, - std::shared_ptr* out) - : type_(type), s_(s), out_(out) {} + Result> Finish() && { + RETURN_NOT_OK(VisitTypeInline(*type_, this)); + return std::move(out_); + } + + ScalarParseImpl(std::shared_ptr type, util::string_view s) + : type_(std::move(type)), s_(s) {} - const std::shared_ptr& type_; + std::shared_ptr type_; util::string_view s_; - std::shared_ptr* out_; + std::shared_ptr out_; }; Result> Scalar::Parse(const std::shared_ptr& type, util::string_view s) { - std::shared_ptr out; - ScalarParseImpl impl = {type, s, &out}; - RETURN_NOT_OK(VisitTypeInline(*type, &impl)); - return out; + return ScalarParseImpl{type, s}.Finish(); } namespace internal { diff --git a/cpp/src/arrow/scalar.h b/cpp/src/arrow/scalar.h index 4a21689339a..7636ecbe79a 100644 --- a/cpp/src/arrow/scalar.h +++ b/cpp/src/arrow/scalar.h @@ -48,7 +48,7 @@ class Array; struct ARROW_EXPORT Scalar : public util::EqualityComparable { virtual ~Scalar() = default; - explicit Scalar(const std::shared_ptr& type) : type(type), is_valid(false) {} + explicit Scalar(std::shared_ptr type) : type(std::move(type)) {} /// \brief The type of the scalar value std::shared_ptr type; @@ -79,8 +79,8 @@ struct ARROW_EXPORT Scalar : public util::EqualityComparable { Result> CastTo(std::shared_ptr to) const; protected: - Scalar(const std::shared_ptr& type, bool is_valid) - : type(type), is_valid(is_valid) {} + Scalar(std::shared_ptr type, bool is_valid) + : type(std::move(type)), is_valid(is_valid) {} }; /// \brief A scalar value for NullType. Never valid @@ -100,9 +100,9 @@ struct ARROW_EXPORT PrimitiveScalar : public Scalar { using ValueType = CType; // Non-null constructor. - PrimitiveScalar(ValueType value, const std::shared_ptr& type) - : Scalar(type, true), value(value) { - ARROW_CHECK_EQ(type->id(), T::type_id); + PrimitiveScalar(ValueType value, std::shared_ptr type) + : Scalar(std::move(type), true), value(value) { + ARROW_CHECK_EQ(this->type->id(), T::type_id); } explicit PrimitiveScalar(ValueType value) @@ -175,21 +175,19 @@ struct ARROW_EXPORT BaseBinaryScalar : public Scalar { std::shared_ptr value; protected: - BaseBinaryScalar(const std::shared_ptr& value, - const std::shared_ptr& type) - : Scalar{type, true}, value(value) {} + BaseBinaryScalar(std::shared_ptr value, std::shared_ptr type) + : Scalar{std::move(type), true}, value(std::move(value)) {} }; struct ARROW_EXPORT BinaryScalar : public BaseBinaryScalar { using BaseBinaryScalar::BaseBinaryScalar; using TypeClass = BinaryScalar; - BinaryScalar(const std::shared_ptr& value, - const std::shared_ptr& type) - : BaseBinaryScalar(value, type) {} + BinaryScalar(std::shared_ptr value, std::shared_ptr type) + : BaseBinaryScalar(std::move(value), std::move(type)) {} - explicit BinaryScalar(const std::shared_ptr& value) - : BinaryScalar(value, binary()) {} + explicit BinaryScalar(std::shared_ptr value) + : BinaryScalar(std::move(value), binary()) {} BinaryScalar() : BinaryScalar(binary()) {} }; @@ -198,8 +196,8 @@ struct ARROW_EXPORT StringScalar : public BinaryScalar { using BinaryScalar::BinaryScalar; using TypeClass = StringType; - explicit StringScalar(const std::shared_ptr& value) - : StringScalar(value, utf8()) {} + explicit StringScalar(std::shared_ptr value) + : StringScalar(std::move(value), utf8()) {} explicit StringScalar(std::string s); @@ -210,12 +208,11 @@ struct ARROW_EXPORT LargeBinaryScalar : public BaseBinaryScalar { using BaseBinaryScalar::BaseBinaryScalar; using TypeClass = LargeBinaryScalar; - LargeBinaryScalar(const std::shared_ptr& value, - const std::shared_ptr& type) - : BaseBinaryScalar(value, type) {} + LargeBinaryScalar(std::shared_ptr value, std::shared_ptr type) + : BaseBinaryScalar(std::move(value), std::move(type)) {} - explicit LargeBinaryScalar(const std::shared_ptr& value) - : LargeBinaryScalar(value, large_binary()) {} + explicit LargeBinaryScalar(std::shared_ptr value) + : LargeBinaryScalar(std::move(value), large_binary()) {} LargeBinaryScalar() : LargeBinaryScalar(large_binary()) {} }; @@ -224,8 +221,8 @@ struct ARROW_EXPORT LargeStringScalar : public LargeBinaryScalar { using LargeBinaryScalar::LargeBinaryScalar; using TypeClass = LargeStringType; - explicit LargeStringScalar(const std::shared_ptr& value) - : LargeStringScalar(value, large_utf8()) {} + explicit LargeStringScalar(std::shared_ptr value) + : LargeStringScalar(std::move(value), large_utf8()) {} LargeStringScalar() : LargeStringScalar(large_utf8()) {} }; @@ -233,11 +230,9 @@ struct ARROW_EXPORT LargeStringScalar : public LargeBinaryScalar { struct ARROW_EXPORT FixedSizeBinaryScalar : public BinaryScalar { using TypeClass = FixedSizeBinaryType; - FixedSizeBinaryScalar(const std::shared_ptr& value, - const std::shared_ptr& type); + FixedSizeBinaryScalar(std::shared_ptr value, std::shared_ptr type); - explicit FixedSizeBinaryScalar(const std::shared_ptr& type) - : BinaryScalar(type) {} + explicit FixedSizeBinaryScalar(std::shared_ptr type) : BinaryScalar(type) {} }; template @@ -246,10 +241,11 @@ struct ARROW_EXPORT TemporalScalar : public Scalar { using TypeClass = T; using ValueType = typename T::c_type; - TemporalScalar(ValueType value, const std::shared_ptr& type) - : Scalar(type, true), value(value) {} + TemporalScalar(ValueType value, std::shared_ptr type) + : Scalar(std::move(type), true), value(value) {} - explicit TemporalScalar(const std::shared_ptr& type) : Scalar(type, false) {} + explicit TemporalScalar(std::shared_ptr type) + : Scalar(std::move(type), false) {} ValueType value; }; @@ -260,7 +256,7 @@ struct ARROW_EXPORT DateScalar : public TemporalScalar { using ValueType = typename TemporalScalar::ValueType; explicit DateScalar(ValueType value) - : TemporalScalar(value, TypeTraits::type_singleton()) {} + : TemporalScalar(std::move(value), TypeTraits::type_singleton()) {} DateScalar() : TemporalScalar(TypeTraits::type_singleton()) {} }; @@ -316,8 +312,8 @@ struct ARROW_EXPORT Decimal128Scalar : public Scalar { using TypeClass = Decimal128Type; using ValueType = Decimal128; - Decimal128Scalar(const Decimal128& value, const std::shared_ptr& type) - : Scalar(type, true), value(std::move(value)) {} + Decimal128Scalar(Decimal128 value, std::shared_ptr type) + : Scalar(std::move(type), true), value(value) {} Decimal128 value; }; @@ -326,10 +322,9 @@ struct ARROW_EXPORT BaseListScalar : public Scalar { using Scalar::Scalar; using ValueType = std::shared_ptr; - BaseListScalar(const std::shared_ptr& value, - const std::shared_ptr& type); + BaseListScalar(std::shared_ptr value, std::shared_ptr type); - explicit BaseListScalar(const std::shared_ptr& value); + explicit BaseListScalar(std::shared_ptr value); std::shared_ptr value; }; @@ -353,8 +348,7 @@ struct ARROW_EXPORT FixedSizeListScalar : public BaseListScalar { using TypeClass = FixedSizeListType; using BaseListScalar::BaseListScalar; - FixedSizeListScalar(const std::shared_ptr& value, - const std::shared_ptr& type); + FixedSizeListScalar(std::shared_ptr value, std::shared_ptr type); }; struct ARROW_EXPORT StructScalar : public Scalar { @@ -363,10 +357,10 @@ struct ARROW_EXPORT StructScalar : public Scalar { std::vector> value; - StructScalar(ValueType value, const std::shared_ptr& type) - : Scalar(type, true), value(std::move(value)) {} + StructScalar(ValueType value, std::shared_ptr type) + : Scalar(std::move(type), true), value(std::move(value)) {} - explicit StructScalar(const std::shared_ptr& type) : Scalar(type) {} + explicit StructScalar(std::shared_ptr type) : Scalar(std::move(type)) {} }; struct ARROW_EXPORT UnionScalar : public Scalar { @@ -375,8 +369,14 @@ struct ARROW_EXPORT UnionScalar : public Scalar { }; struct ARROW_EXPORT DictionaryScalar : public Scalar { - using Scalar::Scalar; using TypeClass = DictionaryType; + using ValueType = std::shared_ptr; + ValueType value; + + explicit DictionaryScalar(std::shared_ptr type); + + DictionaryScalar(ValueType value, std::shared_ptr type) + : Scalar(std::move(type), true), value(std::move(value)) {} }; struct ARROW_EXPORT ExtensionScalar : public Scalar { @@ -385,7 +385,7 @@ struct ARROW_EXPORT ExtensionScalar : public Scalar { }; ARROW_EXPORT -std::shared_ptr MakeNullScalar(const std::shared_ptr& type); +std::shared_ptr MakeNullScalar(std::shared_ptr type); namespace internal { @@ -412,15 +412,15 @@ struct is_simple_scalar< template struct MakeScalarImpl { - template < - typename T, typename ScalarType = typename TypeTraits::ScalarType, - typename ValueType = typename ScalarType::ValueType, - typename Enable = typename std::enable_if< - internal::is_simple_scalar::value && - std::is_same::type>::value>::type> + template ::ScalarType, + typename ValueType = typename ScalarType::ValueType, + typename Enable = typename std::enable_if< + internal::is_simple_scalar::value && + std::is_constructible::value>::type> Status Visit(const T& t) { ARROW_RETURN_NOT_OK(internal::CheckBufferLength(&t, &value_)); - *out_ = std::make_shared(ValueType(static_cast(value_)), type_); + out_ = std::make_shared(ValueType(static_cast(value_)), + std::move(type_)); return Status::OK(); } @@ -428,18 +428,20 @@ struct MakeScalarImpl { return Status::NotImplemented("constructing scalars of type ", t, " from ", value_); } - const std::shared_ptr& type_; + Result> Finish() && { + ARROW_RETURN_NOT_OK(VisitTypeInline(*type_, this)); + return std::move(out_); + } + + std::shared_ptr type_; ValueRef value_; - std::shared_ptr* out_; + std::shared_ptr out_; }; template -Result> MakeScalar(const std::shared_ptr& type, +Result> MakeScalar(std::shared_ptr type, Value&& value) { - std::shared_ptr out; - MakeScalarImpl impl = {type, std::forward(value), &out}; - ARROW_RETURN_NOT_OK(VisitTypeInline(*type, &impl)); - return out; + return MakeScalarImpl{type, std::forward(value), NULLPTR}.Finish(); } /// \brief type inferring scalar factory @@ -452,7 +454,7 @@ std::shared_ptr MakeScalar(Value value) { } inline std::shared_ptr MakeScalar(std::string value) { - return std::make_shared(value); + return std::make_shared(std::move(value)); } } // namespace arrow From ff57b987bf4ac315c4c10ce60be4123a2140e499 Mon Sep 17 00:00:00 2001 From: Benjamin Kietzman Date: Tue, 17 Mar 2020 10:18:59 -0400 Subject: [PATCH 2/3] msvc fix: MakeScalarImpl casting --- cpp/src/arrow/scalar.h | 23 +++++------------------ 1 file changed, 5 insertions(+), 18 deletions(-) diff --git a/cpp/src/arrow/scalar.h b/cpp/src/arrow/scalar.h index 7636ecbe79a..e3880f62ae9 100644 --- a/cpp/src/arrow/scalar.h +++ b/cpp/src/arrow/scalar.h @@ -394,20 +394,6 @@ inline Status CheckBufferLength(...) { return Status::OK(); } ARROW_EXPORT Status CheckBufferLength(const FixedSizeBinaryType* t, const std::shared_ptr* b); -template -struct is_simple_scalar : std::false_type {}; - -template -struct is_simple_scalar< - T, - typename std::enable_if< - // scalar has a single extra data member named "value" with type "ValueType" - std::is_same().value), typename T::ValueType>::value && - // scalar is constructible from (value, type) - std::is_constructible>::value>::type> : std::true_type { -}; - } // namespace internal template @@ -415,12 +401,13 @@ struct MakeScalarImpl { template ::ScalarType, typename ValueType = typename ScalarType::ValueType, typename Enable = typename std::enable_if< - internal::is_simple_scalar::value && - std::is_constructible::value>::type> + std::is_constructible>::value && + std::is_convertible::value>::type> Status Visit(const T& t) { ARROW_RETURN_NOT_OK(internal::CheckBufferLength(&t, &value_)); - out_ = std::make_shared(ValueType(static_cast(value_)), - std::move(type_)); + out_ = std::make_shared( + static_cast(static_cast(value_)), std::move(type_)); return Status::OK(); } From a3f3aa0581ab0c4994a524a20b1ec62c2dc78162 Mon Sep 17 00:00:00 2001 From: Benjamin Kietzman Date: Fri, 20 Mar 2020 11:12:24 -0400 Subject: [PATCH 3/3] extract to DictionaryArrayFromJSON --- cpp/src/arrow/dataset/dataset_test.cc | 4 +- cpp/src/arrow/ipc/json_simple.cc | 127 ++++---------------------- cpp/src/arrow/ipc/json_simple.h | 11 ++- cpp/src/arrow/ipc/json_simple_test.cc | 86 ++--------------- cpp/src/arrow/testing/gtest_util.cc | 9 ++ cpp/src/arrow/testing/gtest_util.h | 10 +- 6 files changed, 51 insertions(+), 196 deletions(-) diff --git a/cpp/src/arrow/dataset/dataset_test.cc b/cpp/src/arrow/dataset/dataset_test.cc index 77806b0e0d0..4cc748920c9 100644 --- a/cpp/src/arrow/dataset/dataset_test.cc +++ b/cpp/src/arrow/dataset/dataset_test.cc @@ -673,9 +673,7 @@ TEST(TestDictPartitionColumn, SelectPartitionColumnFilterPhysicalColumn) { ASSERT_OK_AND_ASSIGN(auto scanner, scan_builder->Finish()); ASSERT_OK_AND_ASSIGN(auto table, scanner->ToTable()); AssertArraysEqual(*table->column(0)->chunk(0), - *ArrayFromJSON(partition_field->type(), R"([ - [0, ["one"]] - ])")); + *DictArrayFromJSON(partition_field->type(), "[0]", "[\"one\"]")); } } // namespace dataset diff --git a/cpp/src/arrow/ipc/json_simple.cc b/cpp/src/arrow/ipc/json_simple.cc index 16c0c91fcac..c301417c43e 100644 --- a/cpp/src/arrow/ipc/json_simple.cc +++ b/cpp/src/arrow/ipc/json_simple.cc @@ -659,110 +659,6 @@ class StructConverter final : public ConcreteConverter { std::vector> child_converters_; }; -// ------------------------------------------------------------------------ -// Converter for dictionary arrays - -class DictionaryConverter final : public ConcreteConverter { - public: - explicit DictionaryConverter(const std::shared_ptr& type) { type_ = type; } - - struct FixedDictionaryBuilder : ArrayBuilder { - using ArrayBuilder::ArrayBuilder; - - Status AppendNull() override { return index_builder_->AppendNull(); } - - Status AppendNulls(int64_t length) override { - return index_builder_->AppendNulls(length); - } - - Status FinishInternal(std::shared_ptr* out) override { - if (dictionary_ == nullptr) { - return Status::Invalid("dictionary was never supplied"); - } - - if (max_index_ >= dictionary_->length()) { - return Status::IndexError("An index was out of bounds: ", max_index_, - " cannot refer to an element of ", - dictionary_->ToString()); - } - - RETURN_NOT_OK(index_builder_->FinishInternal(out)); - (*out)->type = type_; - (*out)->dictionary = dictionary_; - - Reset(); - return Status::OK(); - } - - void Reset() override { - index_builder_->Reset(); - dictionary_ = nullptr; - max_index_ = 0; - } - - std::shared_ptr type() const override { return type_; } - - void UpdateMax(const rj::Value& json_index) { - if (json_index.IsNull()) return; - - auto index = json_index.GetInt64(); - if (index > max_index_) max_index_ = index; - } - - int64_t max_index_ = 0; - ArrayBuilder* index_builder_; - std::shared_ptr type_; - std::shared_ptr dictionary_; - }; - - Status Init() override { - auto dictionary_type = checked_cast(type_.get()); - RETURN_NOT_OK(GetConverter(dictionary_type->index_type(), &index_converter_)); - RETURN_NOT_OK(GetConverter(dictionary_type->value_type(), &dictionary_converter_)); - - builder_.reset(new FixedDictionaryBuilder(default_memory_pool())); - builder_->index_builder_ = index_converter_->builder().get(); - builder_->type_ = type_; - - return Status::OK(); - } - - Status AppendNull() override { return builder_->AppendNull(); } - - Status AppendValue(const rj::Value& json_obj) override { - if (json_obj.IsNull()) { - return AppendNull(); - } - - if (!json_obj.IsArray()) { - return JSONTypeError("array", json_obj.GetType()); - } - - if (json_obj.Size() != 1 && json_obj.Size() != 2) { - return Status::Invalid( - "Expected [index] or [index, dictionary], got array of size ", json_obj.Size()); - } - - RETURN_NOT_OK(index_converter_->AppendValue(json_obj[0])); - builder_->UpdateMax(json_obj[0]); - - if (json_obj.Size() == 1) return Status::OK(); - - if (builder_->dictionary_ != nullptr) { - return Status::Invalid("dictionary was already specified"); - } - - RETURN_NOT_OK(dictionary_converter_->AppendValues(json_obj[1])); - return dictionary_converter_->Finish(&builder_->dictionary_); - } - - std::shared_ptr builder() override { return builder_; } - - private: - std::shared_ptr builder_; - std::shared_ptr index_converter_, dictionary_converter_; -}; - // ------------------------------------------------------------------------ // Converter for union arrays @@ -901,7 +797,6 @@ Status GetConverter(const std::shared_ptr& type, SIMPLE_CONVERTER_CASE(Type::FIXED_SIZE_BINARY, FixedSizeBinaryConverter) SIMPLE_CONVERTER_CASE(Type::DECIMAL, DecimalConverter) SIMPLE_CONVERTER_CASE(Type::UNION, UnionConverter) - SIMPLE_CONVERTER_CASE(Type::DICTIONARY, DictionaryConverter) case Type::INTERVAL: { switch (checked_cast(*type).interval_type()) { case IntervalType::MONTHS: @@ -926,8 +821,8 @@ Status GetConverter(const std::shared_ptr& type, return Status::OK(); } -Status ArrayFromJSON(const std::shared_ptr& type, - const util::string_view& json_string, std::shared_ptr* out) { +Status ArrayFromJSON(const std::shared_ptr& type, util::string_view json_string, + std::shared_ptr* out) { std::shared_ptr converter; RETURN_NOT_OK(GetConverter(type, &converter)); @@ -953,6 +848,24 @@ Status ArrayFromJSON(const std::shared_ptr& type, const char* json_str return ArrayFromJSON(type, util::string_view(json_string), out); } +Status DictArrayFromJSON(const std::shared_ptr& type, + util::string_view indices_json, + util::string_view dictionary_json, std::shared_ptr* out) { + if (type->id() != Type::DICTIONARY) { + return Status::TypeError("DictArrayFromJSON requires dictionary type, got ", *type); + } + + const auto& dictionary_type = checked_cast(*type); + + std::shared_ptr indices, dictionary; + RETURN_NOT_OK(ArrayFromJSON(dictionary_type.index_type(), indices_json, &indices)); + RETURN_NOT_OK( + ArrayFromJSON(dictionary_type.value_type(), dictionary_json, &dictionary)); + + return DictionaryArray::FromArrays(type, std::move(indices), std::move(dictionary), + out); +} + } // namespace json } // namespace internal } // namespace ipc diff --git a/cpp/src/arrow/ipc/json_simple.h b/cpp/src/arrow/ipc/json_simple.h index da6483ff155..8f6b57a4608 100644 --- a/cpp/src/arrow/ipc/json_simple.h +++ b/cpp/src/arrow/ipc/json_simple.h @@ -17,8 +17,7 @@ // Implement a simple JSON representation format for arrays -#ifndef ARROW_IPC_JSON_SIMPLE_H -#define ARROW_IPC_JSON_SIMPLE_H +#pragma once #include #include @@ -41,16 +40,18 @@ Status ArrayFromJSON(const std::shared_ptr&, const std::string& json, std::shared_ptr* out); ARROW_EXPORT -Status ArrayFromJSON(const std::shared_ptr&, const util::string_view& json, +Status ArrayFromJSON(const std::shared_ptr&, util::string_view json, std::shared_ptr* out); ARROW_EXPORT Status ArrayFromJSON(const std::shared_ptr&, const char* json, std::shared_ptr* out); +ARROW_EXPORT +Status DictArrayFromJSON(const std::shared_ptr&, util::string_view indices_json, + util::string_view dictionary_json, std::shared_ptr* out); + } // namespace json } // namespace internal } // namespace ipc } // namespace arrow - -#endif // ARROW_IPC_JSON_SIMPLE_H diff --git a/cpp/src/arrow/ipc/json_simple_test.cc b/cpp/src/arrow/ipc/json_simple_test.cc index 16e8b485da4..940f985f5d2 100644 --- a/cpp/src/arrow/ipc/json_simple_test.cc +++ b/cpp/src/arrow/ipc/json_simple_test.cc @@ -1207,96 +1207,24 @@ TEST(TestSparseUnion, Errors) { TEST(TestDictionary, Basics) { auto type = dictionary(int32(), utf8()); - auto array = ArrayFromJSON(type, R"([ - [null, ["whiskey", "tango", "foxtrot"]], - [2], - [1], - [0] - ])"); + auto array = + DictArrayFromJSON(type, "[null, 2, 1, 0]", R"(["whiskey", "tango", "foxtrot"])"); auto expected_indices = ArrayFromJSON(int32(), "[null, 2, 1, 0]"); auto expected_dictionary = ArrayFromJSON(utf8(), R"(["whiskey", "tango", "foxtrot"])"); - DictionaryArray expected(type, expected_indices, expected_dictionary); - - ASSERT_ARRAYS_EQUAL(expected, *array); -} - -TEST(TestDictionary, Nested) { - auto type = dictionary(int32(), utf8()); - auto array = ArrayFromJSON(struct_({field("dict", type)}), R"([ - {"dict": [null, ["whiskey", "tango", "foxtrot"]]}, - null, - {"dict": [2]}, - {"dict": [1]}, - null, - {"dict": [0]} - ])"); - - auto expected_indices = ArrayFromJSON(int32(), "[null, null, 2, 1, null, 0]"); - auto expected_dictionary = ArrayFromJSON(utf8(), R"(["whiskey", "tango", "foxtrot"])"); - auto expected_dict_array = - std::make_shared(type, expected_indices, expected_dictionary); - ASSERT_OK_AND_ASSIGN(auto expected_null_bitmap, - BitUtil::BytesToBits({1, 0, 1, 1, 0, 1})); - - ASSERT_OK_AND_ASSIGN(auto expected, StructArray::Make({expected_dict_array}, {"dict"}, - expected_null_bitmap)); - - ASSERT_ARRAYS_EQUAL(*expected, *array); -} - -TEST(TestDictionary, VeryNested) { - auto dict_type = dictionary(int32(), utf8()); - auto struct_type = struct_({field("dict", dict_type)}); - auto type = dictionary(int8(), struct_type); - - auto array = ArrayFromJSON(type, R"([ - [null, [ - {"dict": [null, ["whiskey", "tango", "foxtrot"]]}, - {"dict": [2]}, - {"dict": [1]}, - {"dict": [0]} - ]], - [0], - [1], - [2] - ])"); - - auto expected_dictionary = ArrayFromJSON(struct_type, R"([ - {"dict": [null, ["whiskey", "tango", "foxtrot"]]}, - {"dict": [2]}, - {"dict": [1]}, - {"dict": [0]} - ])"); - - auto expected_indices = ArrayFromJSON(int8(), "[null, 0, 1, 2]"); - auto expected_dict_array = - std::make_shared(type, expected_indices, expected_dictionary); - - DictionaryArray expected(type, expected_indices, expected_dictionary); - - ASSERT_ARRAYS_EQUAL(expected, *array); + ASSERT_ARRAYS_EQUAL(DictionaryArray(type, expected_indices, expected_dictionary), + *array); } TEST(TestDictionary, Errors) { auto type = dictionary(int32(), utf8()); std::shared_ptr array; - ASSERT_RAISES(Invalid, ArrayFromJSON(type, "[[\"not a valid index\"]]", &array)); ASSERT_RAISES(Invalid, - ArrayFromJSON(type, "[[0, \"not a valid dictionary\"]]", &array)); - ASSERT_RAISES( - Invalid, ArrayFromJSON(type, "[[0], [1], [2]]", &array)); // no dictionary supplied - ASSERT_RAISES(Invalid, ArrayFromJSON(type, "[[0, [\"a\"]], [0, [\"a\"]]]", - &array)); // multiple dictionaries supplied - ASSERT_RAISES(IndexError, ArrayFromJSON(type, "[[0, [\"a\"]], [2]]", - &array)); // index out of bounds - - ASSERT_RAISES(Invalid, - ArrayFromJSON(type, "[\"not an index or index,dict pair\"]", &array)); - ASSERT_RAISES(Invalid, ArrayFromJSON(type, "[[]]", &array)); - ASSERT_RAISES(Invalid, ArrayFromJSON(type, "[[0, [\"\"], false]]", &array)); + DictArrayFromJSON(type, "[\"not a valid index\"]", "[\"\"]", &array)); + ASSERT_RAISES(Invalid, DictArrayFromJSON(type, "[0, 1]", "[1]", + &array)); // dict value isn't string } } // namespace json diff --git a/cpp/src/arrow/testing/gtest_util.cc b/cpp/src/arrow/testing/gtest_util.cc index 009ee80b8a7..d0d864b3e74 100644 --- a/cpp/src/arrow/testing/gtest_util.cc +++ b/cpp/src/arrow/testing/gtest_util.cc @@ -218,6 +218,15 @@ std::shared_ptr ArrayFromJSON(const std::shared_ptr& type, return out; } +std::shared_ptr DictArrayFromJSON(const std::shared_ptr& type, + util::string_view indices_json, + util::string_view dictionary_json) { + std::shared_ptr out; + ABORT_NOT_OK( + ipc::internal::json::DictArrayFromJSON(type, indices_json, dictionary_json, &out)); + return out; +} + std::shared_ptr ChunkedArrayFromJSON(const std::shared_ptr& type, const std::vector& json) { ArrayVector out_chunks; diff --git a/cpp/src/arrow/testing/gtest_util.h b/cpp/src/arrow/testing/gtest_util.h index cd19628ad14..850a52ef814 100644 --- a/cpp/src/arrow/testing/gtest_util.h +++ b/cpp/src/arrow/testing/gtest_util.h @@ -256,8 +256,14 @@ ARROW_EXPORT std::shared_ptr ArrayFromJSON(const std::shared_ptr&, util::string_view json); -ARROW_EXPORT std::shared_ptr RecordBatchFromJSON( - const std::shared_ptr&, util::string_view); +ARROW_EXPORT +std::shared_ptr DictArrayFromJSON(const std::shared_ptr& type, + util::string_view indices_json, + util::string_view dictionary_json); + +ARROW_EXPORT +std::shared_ptr RecordBatchFromJSON(const std::shared_ptr&, + util::string_view); ARROW_EXPORT std::shared_ptr ChunkedArrayFromJSON(const std::shared_ptr&,