From c9ec74c481602b1a425ac414e2a4034426377627 Mon Sep 17 00:00:00 2001 From: David Li Date: Mon, 15 Mar 2021 17:45:06 -0400 Subject: [PATCH 01/11] ARROW-11745: [C++] Add helper to generate random record batches by schema --- cpp/src/arrow/testing/CMakeLists.txt | 4 + cpp/src/arrow/testing/random.cc | 289 ++++++++++++++++++++++++++- cpp/src/arrow/testing/random.h | 8 + cpp/src/arrow/testing/random_test.cc | 195 ++++++++++++++++++ 4 files changed, 486 insertions(+), 10 deletions(-) create mode 100644 cpp/src/arrow/testing/random_test.cc diff --git a/cpp/src/arrow/testing/CMakeLists.txt b/cpp/src/arrow/testing/CMakeLists.txt index 125b385ad9e..073224d519b 100644 --- a/cpp/src/arrow/testing/CMakeLists.txt +++ b/cpp/src/arrow/testing/CMakeLists.txt @@ -17,6 +17,10 @@ arrow_install_all_headers("arrow/testing") +if(ARROW_BUILD_TESTS) + add_arrow_test(random_test) +endif() + # json_integration_test is two things at the same time: # - an executable that can be called to answer integration test requests # - a self-(unit)test for the C++ side of integration testing diff --git a/cpp/src/arrow/testing/random.cc b/cpp/src/arrow/testing/random.cc index 7bf5dd22d43..cc4cba3abca 100644 --- a/cpp/src/arrow/testing/random.cc +++ b/cpp/src/arrow/testing/random.cc @@ -39,7 +39,9 @@ #include "arrow/util/bitmap_reader.h" #include "arrow/util/checked_cast.h" #include "arrow/util/decimal.h" +#include "arrow/util/key_value_metadata.h" #include "arrow/util/logging.h" +#include "arrow/util/value_parsing.h" namespace arrow { @@ -369,12 +371,16 @@ std::shared_ptr RandomArrayGenerator::FixedSizeBinary(int64_t size, std::move(null_bitmap), null_count); } -std::shared_ptr RandomArrayGenerator::Offsets(int64_t size, int32_t first_offset, - int32_t last_offset, - double null_probability, - bool force_empty_nulls) { - using GenOpt = GenerateOptions>; - GenOpt options(seed(), first_offset, last_offset, null_probability); +namespace { +template +std::shared_ptr GenerateOffsets(SeedType seed, int64_t size, + typename ArrayType::value_type first_offset, + typename ArrayType::value_type last_offset, + double null_probability, bool force_empty_nulls) { + using GenOpt = + GenerateOptions>; + GenOpt options(seed, first_offset, last_offset, null_probability); BufferVector buffers{2}; @@ -387,8 +393,9 @@ std::shared_ptr RandomArrayGenerator::Offsets(int64_t size, int32_t first arrow::BitUtil::SetBit(null_bitmap, 0); arrow::BitUtil::SetBit(null_bitmap, size - 1); - buffers[1] = *AllocateBuffer(sizeof(int32_t) * size); - auto data = reinterpret_cast(buffers[1]->mutable_data()); + buffers[1] = *AllocateBuffer(sizeof(typename ArrayType::value_type) * size); + auto data = + reinterpret_cast(buffers[1]->mutable_data()); options.GenerateTypedData(data, size); // Ensure offsets are in increasing order std::sort(data, data + size); @@ -410,8 +417,27 @@ std::shared_ptr RandomArrayGenerator::Offsets(int64_t size, int32_t first } } - auto array_data = ArrayData::Make(int32(), size, buffers, null_count); - return std::make_shared(array_data); + auto array_data = ArrayData::Make(std::make_shared(), + size, buffers, null_count); + return std::make_shared(array_data); +} +} // namespace + +std::shared_ptr RandomArrayGenerator::Offsets(int64_t size, int32_t first_offset, + int32_t last_offset, + double null_probability, + bool force_empty_nulls) { + return GenerateOffsets>(seed(), size, first_offset, last_offset, + null_probability, force_empty_nulls); +} + +std::shared_ptr RandomArrayGenerator::LargeOffsets(int64_t size, + int64_t first_offset, + 
int64_t last_offset, + double null_probability, + bool force_empty_nulls) { + return GenerateOffsets>(seed(), size, first_offset, last_offset, + null_probability, force_empty_nulls); } std::shared_ptr RandomArrayGenerator::List(const Array& values, int64_t size, @@ -558,5 +584,248 @@ std::shared_ptr RandomArrayGenerator::ArrayOf(std::shared_ptr t return RandomArrayGeneratorOfImpl{this, type, size, null_probability, nullptr}.Finish(); } +namespace { +template +typename T::c_type GetMetadata(const KeyValueMetadata* metadata, const std::string& key, + typename T::c_type default_value) { + if (!metadata) return default_value; + const auto index = metadata->FindKey(key); + if (index < 0) return default_value; + const auto& value = metadata->value(index); + typename T::c_type output{}; + auto type = checked_pointer_cast(TypeTraits::type_singleton()); + if (!internal::ParseValue(*type, value.data(), value.length(), &output)) { + ABORT_NOT_OK(Status::Invalid("Could not parse ", key, " = ", value)); + } + return output; +} + +Result> GenerateArray(const Field& field, int64_t length, + RandomArrayGenerator* generator) { +#define GENERATE_INTEGRAL_CASE_VIEW(BASE_TYPE, VIEW_TYPE) \ + case VIEW_TYPE::type_id: { \ + const BASE_TYPE::c_type min_value = GetMetadata( \ + field.metadata().get(), "min", std::numeric_limits::min()); \ + const BASE_TYPE::c_type max_value = GetMetadata( \ + field.metadata().get(), "max", std::numeric_limits::max()); \ + return generator->Numeric(length, min_value, max_value, null_probability) \ + ->View(field.type()); \ + } +#define GENERATE_INTEGRAL_CASE(ARROW_TYPE) \ + GENERATE_INTEGRAL_CASE_VIEW(ARROW_TYPE, ARROW_TYPE) +#define GENERATE_FLOATING_CASE(ARROW_TYPE, GENERATOR_FUNC) \ + case ARROW_TYPE::type_id: { \ + const ARROW_TYPE::c_type min_value = GetMetadata( \ + field.metadata().get(), "min", std::numeric_limits::min()); \ + const ARROW_TYPE::c_type max_value = GetMetadata( \ + field.metadata().get(), "max", std::numeric_limits::max()); \ + const double nan_probability = \ + GetMetadata(field.metadata().get(), "nan_probability", 10); \ + return generator->GENERATOR_FUNC(length, min_value, max_value, null_probability, \ + nan_probability); \ + } + + const double null_probability = + field.nullable() + ? 
GetMetadata(field.metadata().get(), "null_probability", 0.01) + : 0.0; + switch (field.type()->id()) { + case Type::type::NA: + return std::make_shared(length); + + case Type::type::BOOL: { + const double true_probability = + GetMetadata(field.metadata().get(), "true_probability", 0.5); + return generator->Boolean(length, true_probability, null_probability); + } + + GENERATE_INTEGRAL_CASE(UInt8Type); + GENERATE_INTEGRAL_CASE(Int8Type); + GENERATE_INTEGRAL_CASE(UInt16Type); + GENERATE_INTEGRAL_CASE(Int16Type); + GENERATE_INTEGRAL_CASE(UInt32Type); + GENERATE_INTEGRAL_CASE(Int32Type); + GENERATE_INTEGRAL_CASE(UInt64Type); + GENERATE_INTEGRAL_CASE(Int64Type); + GENERATE_INTEGRAL_CASE_VIEW(Int16Type, HalfFloatType); + GENERATE_FLOATING_CASE(FloatType, Float32); + GENERATE_FLOATING_CASE(DoubleType, Float64); + + case Type::type::STRING: + case Type::type::BINARY: { + const int32_t min_length = GetMetadata(field.metadata().get(), "min", 0); + const int32_t max_length = + GetMetadata(field.metadata().get(), "max", 1024); + const int32_t unique_values = + GetMetadata(field.metadata().get(), "unique", -1); + if (unique_values > 0) { + return generator + ->StringWithRepeats(length, unique_values, min_length, max_length, + null_probability) + ->View(field.type()); + } + return generator->String(length, min_length, max_length, null_probability) + ->View(field.type()); + } + + case Type::type::DECIMAL128: + case Type::type::DECIMAL256: + case Type::type::FIXED_SIZE_BINARY: { + auto byte_width = + internal::checked_pointer_cast(field.type())->byte_width(); + return generator->FixedSizeBinary(length, byte_width, null_probability) + ->View(field.type()); + } + + GENERATE_INTEGRAL_CASE_VIEW(Int32Type, Date32Type); + GENERATE_INTEGRAL_CASE_VIEW(Int64Type, Date64Type); + GENERATE_INTEGRAL_CASE_VIEW(Int64Type, TimestampType); + GENERATE_INTEGRAL_CASE_VIEW(Int32Type, Time32Type); + GENERATE_INTEGRAL_CASE_VIEW(Int64Type, Time64Type); + GENERATE_INTEGRAL_CASE_VIEW(Int32Type, MonthIntervalType); + + // This isn't as flexible as it could be, but the array-of-structs layout of this + // type means it's not a (useful) composition of other generators + GENERATE_INTEGRAL_CASE_VIEW(Int64Type, DayTimeIntervalType); + + case Type::type::LIST: { + const int32_t values_length = + GetMetadata(field.metadata().get(), "values", length); + const bool force_empty_nulls = + GetMetadata(field.metadata().get(), "force_empty_nulls", false); + auto values = GenerateArray( + *internal::checked_pointer_cast(field.type())->value_field(), + values_length, generator); + // need N + 1 offsets to have N values + auto offsets = generator->Offsets(length + 1, 0, values_length, null_probability, + force_empty_nulls); + return ListArray::FromArrays(*offsets, **values); + } + + case Type::type::STRUCT: { + ArrayVector child_arrays(field.type()->num_fields()); + std::vector field_names; + for (int i = 0; i < field.type()->num_fields(); i++) { + const auto& child_field = field.type()->field(i); + child_arrays[i] = *GenerateArray(*child_field, length, generator); + field_names.push_back(child_field->name()); + } + return StructArray::Make(child_arrays, field_names, + generator->NullBitmap(length, null_probability)); + } + + case Type::type::SPARSE_UNION: + case Type::type::DENSE_UNION: { + ArrayVector child_arrays(field.type()->num_fields()); + for (int i = 0; i < field.type()->num_fields(); i++) { + const auto& child_field = field.type()->field(i); + child_arrays[i] = *GenerateArray(*child_field, length, generator); + } + return 
field.type()->id() == Type::type::SPARSE_UNION + ? generator->SparseUnion(child_arrays, length) + : generator->DenseUnion(child_arrays, length); + } + + case Type::type::DICTIONARY: { + const int64_t values_length = + GetMetadata(field.metadata().get(), "values", 4); + auto dict_type = internal::checked_pointer_cast(field.type()); + // TODO: no way to control generation of dictionary + auto values = *GenerateArray( + *arrow::field("temporary", dict_type->value_type(), /*nullable=*/false), + values_length, generator); + auto merged = field.metadata() ? field.metadata() : key_value_metadata({}, {}); + merged = merged->Merge(*key_value_metadata( + {{"min", "0"}, {"max", std::to_string(values_length - 1)}})); + auto indices = *GenerateArray( + *arrow::field("temporary", dict_type->index_type(), field.nullable(), merged), + length, generator); + return DictionaryArray::FromArrays(field.type(), indices, values); + } + + case Type::type::MAP: { + const int32_t values_length = + GetMetadata(field.metadata().get(), "values", length); + const bool force_empty_nulls = + GetMetadata(field.metadata().get(), "force_empty_nulls", false); + auto map_type = internal::checked_pointer_cast(field.type()); + auto keys = *GenerateArray(*map_type->key_field(), values_length, generator); + auto items = *GenerateArray(*map_type->item_field(), values_length, generator); + // need N + 1 offsets to have N values + auto offsets = generator->Offsets(length + 1, 0, values_length, null_probability, + force_empty_nulls); + return MapArray::FromArrays(map_type, offsets, keys, items); + } + + case Type::type::EXTENSION: + // Could be supported by generating the storage type (though any extension + // invariants wouldn't be preserved) + break; + + case Type::type::FIXED_SIZE_LIST: { + auto list_type = internal::checked_pointer_cast(field.type()); + const int64_t values_length = list_type->list_size() * length; + auto values = *GenerateArray(*list_type->value_field(), values_length, generator); + auto null_bitmap = generator->NullBitmap(length, null_probability); + return std::make_shared(list_type, length, values, null_bitmap); + } + + GENERATE_INTEGRAL_CASE_VIEW(Int64Type, DurationType); + + case Type::type::LARGE_STRING: + case Type::type::LARGE_BINARY: { + const int32_t min_length = GetMetadata(field.metadata().get(), "min", 0); + const int32_t max_length = + GetMetadata(field.metadata().get(), "max", 1024); + const int32_t unique_values = + GetMetadata(field.metadata().get(), "unique", -1); + if (unique_values > 0) { + ABORT_NOT_OK( + Status::NotImplemented("Generating random array with repeated values for " + "large string/large binary types")); + } + return generator->LargeString(length, min_length, max_length, null_probability) + ->View(field.type()); + } + + case Type::type::LARGE_LIST: { + const int64_t values_length = + GetMetadata(field.metadata().get(), "values", length); + const bool force_empty_nulls = + GetMetadata(field.metadata().get(), "force_empty_nulls", false); + auto values = GenerateArray( + *internal::checked_pointer_cast(field.type())->value_field(), + values_length, generator); + // need N + 1 offsets to have N values + auto offsets = generator->LargeOffsets(length + 1, 0, values_length, + null_probability, force_empty_nulls); + return LargeListArray::FromArrays(*offsets, **values); + } + + default: + break; + } +#undef GENERATE_INTEGRAL_CASE_VIEW +#undef GENERATE_INTEGRAL_CASE +#undef GENERATE_FLOATING_CASE + + ABORT_NOT_OK( + Status::NotImplemented("Generating random array for field ", 
field.ToString())); + return nullptr; +} + +} // namespace + +std::shared_ptr Generate(const FieldVector& fields, int64_t length, + SeedType seed) { + std::vector> arrays(fields.size()); + RandomArrayGenerator generator(seed); + for (size_t i = 0; i < fields.size(); i++) { + const auto& field = fields[i]; + arrays[i] = *GenerateArray(*field, length, &generator); + } + return RecordBatch::Make(schema(fields), length, std::move(arrays)); +} + } // namespace random } // namespace arrow diff --git a/cpp/src/arrow/testing/random.h b/cpp/src/arrow/testing/random.h index 2358ab0911f..c57093da0d0 100644 --- a/cpp/src/arrow/testing/random.h +++ b/cpp/src/arrow/testing/random.h @@ -249,6 +249,10 @@ class ARROW_TESTING_EXPORT RandomArrayGenerator { double null_probability = 0, bool force_empty_nulls = false); + std::shared_ptr LargeOffsets(int64_t size, int64_t first_offset, + int64_t last_offset, double null_probability = 0, + bool force_empty_nulls = false); + /// \brief Generate a random StringArray /// /// \param[in] size the size of the array to generate @@ -358,6 +362,10 @@ class ARROW_TESTING_EXPORT RandomArrayGenerator { std::default_random_engine seed_rng_; }; +ARROW_TESTING_EXPORT +std::shared_ptr Generate(const FieldVector& fields, int64_t size, + SeedType seed); + } // namespace random // diff --git a/cpp/src/arrow/testing/random_test.cc b/cpp/src/arrow/testing/random_test.cc new file mode 100644 index 00000000000..6bf86fa6af8 --- /dev/null +++ b/cpp/src/arrow/testing/random_test.cc @@ -0,0 +1,195 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include + +#include "arrow/array.h" +#include "arrow/record_batch.h" +#include "arrow/testing/gtest_util.h" +#include "arrow/testing/random.h" +#include "arrow/type.h" +#include "arrow/util/key_value_metadata.h" + +namespace arrow { +namespace random { + +class RandomArrayTest : public ::testing::TestWithParam> { + protected: + std::shared_ptr GetField() { return GetParam(); } +}; + +template +class RandomNumericArrayTest : public ::testing::Test { + protected: + std::shared_ptr GetField() { return field("field0", std::make_shared()); } + + std::shared_ptr> Downcast(std::shared_ptr array) { + return internal::checked_pointer_cast>(array); + } +}; + +TEST_P(RandomArrayTest, GenerateArray) { + auto field = GetField(); + auto batch = Generate({field}, 128, 0xDEADBEEF); + AssertSchemaEqual(schema({field}), batch->schema()); + auto array = batch->column(0); + ASSERT_EQ(128, array->length()); + ASSERT_OK(array->ValidateFull()); +} + +TEST_P(RandomArrayTest, GenerateNonNullArray) { + auto field = + GetField()->WithMetadata(key_value_metadata({{"null_probability", "0.0"}})); + if (field->type()->id() == Type::type::NA) { + GTEST_SKIP() << "Cannot generate non-null null arrays"; + } + auto batch = Generate({field}, 128, 0xDEADBEEF); + AssertSchemaEqual(schema({field}), batch->schema()); + auto array = batch->column(0); + ASSERT_OK(array->ValidateFull()); + ASSERT_EQ(0, array->null_count()); +} + +TEST_P(RandomArrayTest, GenerateNonNullableArray) { + auto field = GetField()->WithNullable(false); + if (field->type()->id() == Type::type::NA) { + GTEST_SKIP() << "Cannot generate non-null null arrays"; + } + auto batch = Generate({field}, 128, 0xDEADBEEF); + AssertSchemaEqual(schema({field}), batch->schema()); + auto array = batch->column(0); + ASSERT_OK(array->ValidateFull()); + ASSERT_EQ(0, array->null_count()); +} + +struct FieldParamName { + template + std::string operator()(const ::testing::TestParamInfo& info) const { + return std::to_string(info.index) + info.param->name(); + } +}; + +auto values = ::testing::Values( + field("null", null()), field("bool", boolean()), field("uint8", uint8()), + field("int8", int8()), field("uint16", uint16()), field("int16", int16()), + field("uint32", uint32()), field("int32", int32()), field("uint64", uint64()), + field("int64", int64()), field("float16", float16()), field("float32", float32()), + field("float64", float64()), field("string", utf8()), field("binary", binary()), + field("fixed_size_binary", fixed_size_binary(8)), + field("decimal128", decimal128(8, 3)), field("decimal256", decimal256(16, 4)), + field("date32", date32()), field("date64", date64()), + field("timestampns", timestamp(TimeUnit::NANO)), + field("timestamps", timestamp(TimeUnit::SECOND, "America/Phoenix")), + field("time32ms", time32(TimeUnit::MILLI)), field("time64ns", time64(TimeUnit::NANO)), + field("time32s", time32(TimeUnit::SECOND)), + field("time64us", time64(TimeUnit::MICRO)), field("month_interval", month_interval()), + field("daytime_interval", day_time_interval()), field("listint8", list(int8())), + field("listlistint8", list(list(int8()))), + field("listint8emptynulls", list(int8()), true, + key_value_metadata({{"force_empty_nulls", "true"}})), + field("listint81024values", list(int8()), true, + key_value_metadata({{"values", "1024"}})), + field("structints", struct_({ + field("int8", int8()), + field("int16", int16()), + field("int32", int32()), + })), + field("structnested", struct_({ + field("string", utf8()), + field("list", list(int64())), + field("timestamp", 
timestamp(TimeUnit::MILLI)), + })), + field("sparseunion", sparse_union({ + field("int8", int8()), + field("int16", int16()), + field("int32", int32()), + })), + field("denseunion", dense_union({ + field("int8", int8()), + field("int16", int16()), + field("int32", int32()), + })), + field("dictionary", dictionary(int8(), utf8())), field("map", map(int8(), utf8())), + field("fixedsizelist", fixed_size_list(int8(), 4)), + field("durationns", duration(TimeUnit::NANO)), field("largestring", large_utf8()), + field("largebinary", large_binary()), + field("largelistlistint8", large_list(list(int8())))); + +INSTANTIATE_TEST_SUITE_P( + TestRandomArrayGeneration, RandomArrayTest, values, + [](const ::testing::TestParamInfo& info) { + return std::to_string(info.index) + info.param->name(); + }); + +using NumericTypes = + ::testing::Types; +TYPED_TEST_SUITE(RandomNumericArrayTest, NumericTypes); + +TYPED_TEST(RandomNumericArrayTest, GenerateMinMax) { + auto field = + this->GetField()->WithMetadata(key_value_metadata({{"min", "0"}, {"max", "127"}})); + auto batch = Generate({field}, 128, 0xDEADBEEF); + AssertSchemaEqual(schema({field}), batch->schema()); + auto array = this->Downcast(batch->column(0)); + auto it = array->begin(); + while (it != array->end()) { + if ((*it).has_value() && !std::isnan(**it)) { + ASSERT_GE(**it, 0); + ASSERT_LE(**it, 128); + } + it++; + } +} + +TEST(TypeSpecificTests, FloatNan) { + auto field = arrow::field("float32", float32()) + ->WithMetadata(key_value_metadata({{"nan_probability", "1.0"}})); + auto batch = Generate({field}, 128, 0xDEADBEEF); + AssertSchemaEqual(schema({field}), batch->schema()); + auto array = internal::checked_pointer_cast>(batch->column(0)); + auto it = array->begin(); + while (it != array->end()) { + if ((*it).has_value()) { + ASSERT_TRUE(std::isnan(**it)); + } + it++; + } +} + +TEST(TypeSpecificTests, RepeatedStrings) { + auto field = + arrow::field("string", utf8())->WithMetadata(key_value_metadata({{"unique", "1"}})); + auto batch = Generate({field}, 128, 0xDEADBEEF); + AssertSchemaEqual(schema({field}), batch->schema()); + auto array = internal::checked_pointer_cast(batch->column(0)); + auto it = array->begin(); + util::optional singular_value; + while (it != array->end()) { + if ((*it).has_value()) { + if (!singular_value.has_value()) { + singular_value = *it; + } else { + ASSERT_EQ(*singular_value, **it); + } + } + it++; + } +} + +} // namespace random +} // namespace arrow From 36385c9d67f1cce27fcb1a513f1cb2d2fd803386 Mon Sep 17 00:00:00 2001 From: David Li Date: Tue, 16 Mar 2021 09:21:58 -0400 Subject: [PATCH 02/11] ARROW-11745: [C++] Use random batch helper in S3FS benchmark --- cpp/src/arrow/filesystem/s3fs_benchmark.cc | 38 +++++++++------------- cpp/src/arrow/testing/random.cc | 10 +++--- cpp/src/arrow/testing/random_test.cc | 10 +++--- 3 files changed, 26 insertions(+), 32 deletions(-) diff --git a/cpp/src/arrow/filesystem/s3fs_benchmark.cc b/cpp/src/arrow/filesystem/s3fs_benchmark.cc index 88911fd2aa9..ccf78434ae9 100644 --- a/cpp/src/arrow/filesystem/s3fs_benchmark.cc +++ b/cpp/src/arrow/filesystem/s3fs_benchmark.cc @@ -36,6 +36,7 @@ #include "arrow/table.h" #include "arrow/testing/gtest_util.h" #include "arrow/testing/random.h" +#include "arrow/util/key_value_metadata.h" #include "arrow/util/range.h" #include "parquet/arrow/reader.h" @@ -146,32 +147,25 @@ class MinioFixture : public benchmark::Fixture { /// Appends integer columns to the beginning (to act as indices). 
Status MakeParquetObject(const std::string& path, int num_columns, int num_rows) { std::vector> columns; - std::vector> fields; - - { - arrow::random::RandomArrayGenerator generator(0); - std::shared_ptr values = generator.Int64(num_rows, 0, 1e10, 0); - columns.push_back(std::make_shared(values)); - fields.push_back(::arrow::field("timestamp", values->type())); - } - { - arrow::random::RandomArrayGenerator generator(1); - std::shared_ptr values = generator.Int32(num_rows, 0, 1e9, 0); - columns.push_back(std::make_shared(values)); - fields.push_back(::arrow::field("val", values->type())); - } - + FieldVector fields{::arrow::field("timestamp", int64(), /*nullable=*/true, + key_value_metadata({{"min", "0"}, + {"max", "10000000000"}, + {"null_probability", "0"}})), + ::arrow::field("val", int32(), /*nullable=*/true, + key_value_metadata({{"min", "0"}, + {"max", "1000000000"}, + {"null_probability", "0"}}))}; for (int i = 0; i < num_columns; i++) { - arrow::random::RandomArrayGenerator generator(i); - std::shared_ptr values = generator.Float64(num_rows, -1.e10, 1e10, 0); std::stringstream ss; ss << "col" << i; - columns.push_back(std::make_shared(values)); - fields.push_back(::arrow::field(ss.str(), values->type())); + fields.push_back(::arrow::field( + ss.str(), float64(), /*nullable=*/true, + key_value_metadata( + {{"min", "-1.e10"}, {"max", "1e10"}, {"null_probability", "0"}}))); } - auto schema = std::make_shared<::arrow::Schema>(fields); - - std::shared_ptr table = Table::Make(schema, columns); + auto batch = random::Generate(fields, num_rows, 0); + ARROW_ASSIGN_OR_RAISE(std::shared_ptr
table, + Table::FromRecordBatches({batch})); std::shared_ptr sink; ARROW_ASSIGN_OR_RAISE(sink, fs_->OpenOutputStream(path)); diff --git a/cpp/src/arrow/testing/random.cc b/cpp/src/arrow/testing/random.cc index cc4cba3abca..781f67abec9 100644 --- a/cpp/src/arrow/testing/random.cc +++ b/cpp/src/arrow/testing/random.cc @@ -620,7 +620,7 @@ Result> GenerateArray(const Field& field, int64_t length, const ARROW_TYPE::c_type max_value = GetMetadata( \ field.metadata().get(), "max", std::numeric_limits::max()); \ const double nan_probability = \ - GetMetadata(field.metadata().get(), "nan_probability", 10); \ + GetMetadata(field.metadata().get(), "nan_probability", 0); \ return generator->GENERATOR_FUNC(length, min_value, max_value, null_probability, \ nan_probability); \ } @@ -689,8 +689,8 @@ Result> GenerateArray(const Field& field, int64_t length, GENERATE_INTEGRAL_CASE_VIEW(Int64Type, DayTimeIntervalType); case Type::type::LIST: { - const int32_t values_length = - GetMetadata(field.metadata().get(), "values", length); + const int32_t values_length = GetMetadata( + field.metadata().get(), "values", static_cast(length)); const bool force_empty_nulls = GetMetadata(field.metadata().get(), "force_empty_nulls", false); auto values = GenerateArray( @@ -744,8 +744,8 @@ Result> GenerateArray(const Field& field, int64_t length, } case Type::type::MAP: { - const int32_t values_length = - GetMetadata(field.metadata().get(), "values", length); + const int32_t values_length = GetMetadata( + field.metadata().get(), "values", static_cast(length)); const bool force_empty_nulls = GetMetadata(field.metadata().get(), "force_empty_nulls", false); auto map_type = internal::checked_pointer_cast(field.type()); diff --git a/cpp/src/arrow/testing/random_test.cc b/cpp/src/arrow/testing/random_test.cc index 6bf86fa6af8..a3cfa408865 100644 --- a/cpp/src/arrow/testing/random_test.cc +++ b/cpp/src/arrow/testing/random_test.cc @@ -141,16 +141,16 @@ using NumericTypes = TYPED_TEST_SUITE(RandomNumericArrayTest, NumericTypes); TYPED_TEST(RandomNumericArrayTest, GenerateMinMax) { - auto field = - this->GetField()->WithMetadata(key_value_metadata({{"min", "0"}, {"max", "127"}})); + auto field = this->GetField()->WithMetadata( + key_value_metadata({{"min", "0"}, {"max", "127"}, {"nan_probability", "0.0"}})); auto batch = Generate({field}, 128, 0xDEADBEEF); AssertSchemaEqual(schema({field}), batch->schema()); auto array = this->Downcast(batch->column(0)); auto it = array->begin(); while (it != array->end()) { - if ((*it).has_value() && !std::isnan(**it)) { - ASSERT_GE(**it, 0); - ASSERT_LE(**it, 128); + if ((*it).has_value()) { + ASSERT_GE(**it, typename TypeParam::c_type(0)); + ASSERT_LE(**it, typename TypeParam::c_type(127)); } it++; } From 3aa89b26bb7cdb131cf4d50e6638fd37d4c29c5f Mon Sep 17 00:00:00 2001 From: David Li Date: Wed, 17 Mar 2021 10:49:19 -0400 Subject: [PATCH 03/11] Apply suggestions from code review Co-authored-by: Benjamin Kietzman --- cpp/src/arrow/filesystem/s3fs_benchmark.cc | 2 +- cpp/src/arrow/testing/random.cc | 12 ++++---- cpp/src/arrow/testing/random.h | 2 +- cpp/src/arrow/testing/random_test.cc | 33 ++++++---------------- 4 files changed, 16 insertions(+), 33 deletions(-) diff --git a/cpp/src/arrow/filesystem/s3fs_benchmark.cc b/cpp/src/arrow/filesystem/s3fs_benchmark.cc index ccf78434ae9..b732813aedc 100644 --- a/cpp/src/arrow/filesystem/s3fs_benchmark.cc +++ b/cpp/src/arrow/filesystem/s3fs_benchmark.cc @@ -147,7 +147,7 @@ class MinioFixture : public benchmark::Fixture { /// Appends integer columns to the 
beginning (to act as indices). Status MakeParquetObject(const std::string& path, int num_columns, int num_rows) { std::vector> columns; - FieldVector fields{::arrow::field("timestamp", int64(), /*nullable=*/true, + FieldVector fields{field("timestamp", int64(), /*nullable=*/true, key_value_metadata({{"min", "0"}, {"max", "10000000000"}, {"null_probability", "0"}})), diff --git a/cpp/src/arrow/testing/random.cc b/cpp/src/arrow/testing/random.cc index 781f67abec9..f0b90f8f1ff 100644 --- a/cpp/src/arrow/testing/random.cc +++ b/cpp/src/arrow/testing/random.cc @@ -372,7 +372,7 @@ std::shared_ptr RandomArrayGenerator::FixedSizeBinary(int64_t size, } namespace { -template +template std::shared_ptr GenerateOffsets(SeedType seed, int64_t size, typename ArrayType::value_type first_offset, typename ArrayType::value_type last_offset, @@ -585,16 +585,14 @@ std::shared_ptr RandomArrayGenerator::ArrayOf(std::shared_ptr t } namespace { -template -typename T::c_type GetMetadata(const KeyValueMetadata* metadata, const std::string& key, - typename T::c_type default_value) { +template ::ArrowType> +enable_if_parameter_free GetMetadata(const KeyValueMetadata* metadata, const std::string& key, T default_value) { if (!metadata) return default_value; const auto index = metadata->FindKey(key); if (index < 0) return default_value; const auto& value = metadata->value(index); - typename T::c_type output{}; - auto type = checked_pointer_cast(TypeTraits::type_singleton()); - if (!internal::ParseValue(*type, value.data(), value.length(), &output)) { + T output{}; + if (!internal::ParseValue(value.data(), value.length(), &output)) { ABORT_NOT_OK(Status::Invalid("Could not parse ", key, " = ", value)); } return output; diff --git a/cpp/src/arrow/testing/random.h b/cpp/src/arrow/testing/random.h index c57093da0d0..bce792c53b4 100644 --- a/cpp/src/arrow/testing/random.h +++ b/cpp/src/arrow/testing/random.h @@ -363,7 +363,7 @@ class ARROW_TESTING_EXPORT RandomArrayGenerator { }; ARROW_TESTING_EXPORT -std::shared_ptr Generate(const FieldVector& fields, int64_t size, +std::shared_ptr GenerateBatch(const FieldVector& fields, int64_t size, SeedType seed); } // namespace random diff --git a/cpp/src/arrow/testing/random_test.cc b/cpp/src/arrow/testing/random_test.cc index a3cfa408865..09742d64cc9 100644 --- a/cpp/src/arrow/testing/random_test.cc +++ b/cpp/src/arrow/testing/random_test.cc @@ -51,7 +51,7 @@ TEST_P(RandomArrayTest, GenerateArray) { ASSERT_OK(array->ValidateFull()); } -TEST_P(RandomArrayTest, GenerateNonNullArray) { +TEST_P(RandomArrayTest, GenerateArrayWithZeroNullProbability) { auto field = GetField()->WithMetadata(key_value_metadata({{"null_probability", "0.0"}})); if (field->type()->id() == Type::type::NA) { @@ -76,12 +76,6 @@ TEST_P(RandomArrayTest, GenerateNonNullableArray) { ASSERT_EQ(0, array->null_count()); } -struct FieldParamName { - template - std::string operator()(const ::testing::TestParamInfo& info) const { - return std::to_string(info.index) + info.param->name(); - } -}; auto values = ::testing::Values( field("null", null()), field("bool", boolean()), field("uint8", uint8()), @@ -146,13 +140,10 @@ TYPED_TEST(RandomNumericArrayTest, GenerateMinMax) { auto batch = Generate({field}, 128, 0xDEADBEEF); AssertSchemaEqual(schema({field}), batch->schema()); auto array = this->Downcast(batch->column(0)); - auto it = array->begin(); - while (it != array->end()) { - if ((*it).has_value()) { - ASSERT_GE(**it, typename TypeParam::c_type(0)); - ASSERT_LE(**it, typename TypeParam::c_type(127)); - } - it++; + for 
(auto slot : *array) { + if (!slot.has_value()) continue; + ASSERT_GE(slot, 0); + ASSERT_LE(slot, 127); } } @@ -178,16 +169,10 @@ TEST(TypeSpecificTests, RepeatedStrings) { AssertSchemaEqual(schema({field}), batch->schema()); auto array = internal::checked_pointer_cast(batch->column(0)); auto it = array->begin(); - util::optional singular_value; - while (it != array->end()) { - if ((*it).has_value()) { - if (!singular_value.has_value()) { - singular_value = *it; - } else { - ASSERT_EQ(*singular_value, **it); - } - } - it++; + util::string_view singular_value = array->GetView(0); + for (auto slot : *array) { + if (!slot.has_value()) continue; + ASSERT_EQ(slot, singular_value); } } From 4ffb420076280826785a536c195a8d179886a234 Mon Sep 17 00:00:00 2001 From: David Li Date: Wed, 17 Mar 2021 11:42:30 -0400 Subject: [PATCH 04/11] Fix up code review suggestions --- cpp/src/arrow/filesystem/s3fs_benchmark.cc | 16 +-- cpp/src/arrow/testing/random.cc | 120 ++++++++++++--------- cpp/src/arrow/testing/random.h | 27 ++++- cpp/src/arrow/testing/random_test.cc | 26 +++-- 4 files changed, 122 insertions(+), 67 deletions(-) diff --git a/cpp/src/arrow/filesystem/s3fs_benchmark.cc b/cpp/src/arrow/filesystem/s3fs_benchmark.cc index b732813aedc..2bf2b400e8d 100644 --- a/cpp/src/arrow/filesystem/s3fs_benchmark.cc +++ b/cpp/src/arrow/filesystem/s3fs_benchmark.cc @@ -147,14 +147,14 @@ class MinioFixture : public benchmark::Fixture { /// Appends integer columns to the beginning (to act as indices). Status MakeParquetObject(const std::string& path, int num_columns, int num_rows) { std::vector> columns; - FieldVector fields{field("timestamp", int64(), /*nullable=*/true, - key_value_metadata({{"min", "0"}, - {"max", "10000000000"}, - {"null_probability", "0"}})), - ::arrow::field("val", int32(), /*nullable=*/true, - key_value_metadata({{"min", "0"}, - {"max", "1000000000"}, - {"null_probability", "0"}}))}; + FieldVector fields{ + field("timestamp", int64(), /*nullable=*/true, + key_value_metadata( + {{"min", "0"}, {"max", "10000000000"}, {"null_probability", "0"}})), + ::arrow::field( + "val", int32(), /*nullable=*/true, + key_value_metadata( + {{"min", "0"}, {"max", "1000000000"}, {"null_probability", "0"}}))}; for (int i = 0; i < num_columns; i++) { std::stringstream ss; ss << "col" << i; diff --git a/cpp/src/arrow/testing/random.cc b/cpp/src/arrow/testing/random.cc index f0b90f8f1ff..e311512f13c 100644 --- a/cpp/src/arrow/testing/random.cc +++ b/cpp/src/arrow/testing/random.cc @@ -374,12 +374,12 @@ std::shared_ptr RandomArrayGenerator::FixedSizeBinary(int64_t size, namespace { template std::shared_ptr GenerateOffsets(SeedType seed, int64_t size, - typename ArrayType::value_type first_offset, - typename ArrayType::value_type last_offset, + typename OffsetArrayType::value_type first_offset, + typename OffsetArrayType::value_type last_offset, double null_probability, bool force_empty_nulls) { - using GenOpt = - GenerateOptions>; + using GenOpt = GenerateOptions< + typename OffsetArrayType::value_type, + std::uniform_int_distribution>; GenOpt options(seed, first_offset, last_offset, null_probability); BufferVector buffers{2}; @@ -393,9 +393,9 @@ std::shared_ptr GenerateOffsets(SeedType seed, int64_t size, arrow::BitUtil::SetBit(null_bitmap, 0); arrow::BitUtil::SetBit(null_bitmap, size - 1); - buffers[1] = *AllocateBuffer(sizeof(typename ArrayType::value_type) * size); + buffers[1] = *AllocateBuffer(sizeof(typename OffsetArrayType::value_type) * size); auto data = - reinterpret_cast(buffers[1]->mutable_data()); + 
reinterpret_cast(buffers[1]->mutable_data()); options.GenerateTypedData(data, size); // Ensure offsets are in increasing order std::sort(data, data + size); @@ -417,9 +417,9 @@ std::shared_ptr GenerateOffsets(SeedType seed, int64_t size, } } - auto array_data = ArrayData::Make(std::make_shared(), - size, buffers, null_count); - return std::make_shared(array_data); + auto array_data = ArrayData::Make( + std::make_shared(), size, buffers, null_count); + return std::make_shared(array_data); } } // namespace @@ -586,7 +586,9 @@ std::shared_ptr RandomArrayGenerator::ArrayOf(std::shared_ptr t namespace { template ::ArrowType> -enable_if_parameter_free GetMetadata(const KeyValueMetadata* metadata, const std::string& key, T default_value) { +enable_if_parameter_free GetMetadata(const KeyValueMetadata* metadata, + const std::string& key, + T default_value) { if (!metadata) return default_value; const auto index = metadata->FindKey(key); if (index < 0) return default_value; @@ -600,12 +602,25 @@ enable_if_parameter_free GetMetadata(const KeyValueMetadata* metad Result> GenerateArray(const Field& field, int64_t length, RandomArrayGenerator* generator) { + // TODO: check min <= max in tests +#define VALIDATE_RANGE(PARAM, MIN, MAX) \ + if (PARAM < MIN || PARAM > MAX) { \ + ABORT_NOT_OK(Status::Invalid(field.ToString(), ": ", ARROW_STRINGIFY(PARAM), \ + " must be in [", MIN, ", ", MAX, " ] but got ", \ + null_probability)); \ + } +#define VALIDATE_MIN_MAX(MIN, MAX) \ + if (MIN > MAX) { \ + ABORT_NOT_OK( \ + Status::Invalid(field.ToString(), ": min ", MIN, " must be <= max ", MAX)); \ + } #define GENERATE_INTEGRAL_CASE_VIEW(BASE_TYPE, VIEW_TYPE) \ case VIEW_TYPE::type_id: { \ - const BASE_TYPE::c_type min_value = GetMetadata( \ + const BASE_TYPE::c_type min_value = GetMetadata( \ field.metadata().get(), "min", std::numeric_limits::min()); \ - const BASE_TYPE::c_type max_value = GetMetadata( \ + const BASE_TYPE::c_type max_value = GetMetadata( \ field.metadata().get(), "max", std::numeric_limits::max()); \ + VALIDATE_MIN_MAX(min_value, max_value); \ return generator->Numeric(length, min_value, max_value, null_probability) \ ->View(field.type()); \ } @@ -613,27 +628,31 @@ Result> GenerateArray(const Field& field, int64_t length, GENERATE_INTEGRAL_CASE_VIEW(ARROW_TYPE, ARROW_TYPE) #define GENERATE_FLOATING_CASE(ARROW_TYPE, GENERATOR_FUNC) \ case ARROW_TYPE::type_id: { \ - const ARROW_TYPE::c_type min_value = GetMetadata( \ + const ARROW_TYPE::c_type min_value = GetMetadata( \ field.metadata().get(), "min", std::numeric_limits::min()); \ - const ARROW_TYPE::c_type max_value = GetMetadata( \ + const ARROW_TYPE::c_type max_value = GetMetadata( \ field.metadata().get(), "max", std::numeric_limits::max()); \ const double nan_probability = \ - GetMetadata(field.metadata().get(), "nan_probability", 0); \ + GetMetadata(field.metadata().get(), "nan_probability", 0); \ + VALIDATE_MIN_MAX(min_value, max_value); \ + VALIDATE_RANGE(nan_probability, 0.0, 1.0); \ return generator->GENERATOR_FUNC(length, min_value, max_value, null_probability, \ nan_probability); \ } const double null_probability = field.nullable() - ? GetMetadata(field.metadata().get(), "null_probability", 0.01) + ? 
GetMetadata(field.metadata().get(), "null_probability", 0.01) : 0.0; + VALIDATE_RANGE(null_probability, 0.0, 1.0); switch (field.type()->id()) { - case Type::type::NA: + case Type::type::NA: { return std::make_shared(length); + } case Type::type::BOOL: { const double true_probability = - GetMetadata(field.metadata().get(), "true_probability", 0.5); + GetMetadata(field.metadata().get(), "true_probability", 0.5); return generator->Boolean(length, true_probability, null_probability); } @@ -651,11 +670,10 @@ Result> GenerateArray(const Field& field, int64_t length, case Type::type::STRING: case Type::type::BINARY: { - const int32_t min_length = GetMetadata(field.metadata().get(), "min", 0); - const int32_t max_length = - GetMetadata(field.metadata().get(), "max", 1024); - const int32_t unique_values = - GetMetadata(field.metadata().get(), "unique", -1); + const auto min_length = GetMetadata(field.metadata().get(), "min", 0); + const auto max_length = GetMetadata(field.metadata().get(), "max", 1024); + const auto unique_values = + GetMetadata(field.metadata().get(), "unique", -1); if (unique_values > 0) { return generator ->StringWithRepeats(length, unique_values, min_length, max_length, @@ -687,10 +705,10 @@ Result> GenerateArray(const Field& field, int64_t length, GENERATE_INTEGRAL_CASE_VIEW(Int64Type, DayTimeIntervalType); case Type::type::LIST: { - const int32_t values_length = GetMetadata( - field.metadata().get(), "values", static_cast(length)); - const bool force_empty_nulls = - GetMetadata(field.metadata().get(), "force_empty_nulls", false); + const auto values_length = GetMetadata(field.metadata().get(), "values", + static_cast(length)); + const auto force_empty_nulls = + GetMetadata(field.metadata().get(), "force_empty_nulls", false); auto values = GenerateArray( *internal::checked_pointer_cast(field.type())->value_field(), values_length, generator); @@ -719,14 +737,15 @@ Result> GenerateArray(const Field& field, int64_t length, const auto& child_field = field.type()->field(i); child_arrays[i] = *GenerateArray(*child_field, length, generator); } - return field.type()->id() == Type::type::SPARSE_UNION - ? generator->SparseUnion(child_arrays, length) - : generator->DenseUnion(child_arrays, length); + auto array = field.type()->id() == Type::type::SPARSE_UNION + ? 
generator->SparseUnion(child_arrays, length) + : generator->DenseUnion(child_arrays, length); + return array->View(field.type()); } case Type::type::DICTIONARY: { - const int64_t values_length = - GetMetadata(field.metadata().get(), "values", 4); + const auto values_length = + GetMetadata(field.metadata().get(), "values", 4); auto dict_type = internal::checked_pointer_cast(field.type()); // TODO: no way to control generation of dictionary auto values = *GenerateArray( @@ -742,10 +761,10 @@ Result> GenerateArray(const Field& field, int64_t length, } case Type::type::MAP: { - const int32_t values_length = GetMetadata( - field.metadata().get(), "values", static_cast(length)); - const bool force_empty_nulls = - GetMetadata(field.metadata().get(), "force_empty_nulls", false); + const auto values_length = GetMetadata(field.metadata().get(), "values", + static_cast(length)); + const auto force_empty_nulls = + GetMetadata(field.metadata().get(), "force_empty_nulls", false); auto map_type = internal::checked_pointer_cast(field.type()); auto keys = *GenerateArray(*map_type->key_field(), values_length, generator); auto items = *GenerateArray(*map_type->item_field(), values_length, generator); @@ -772,11 +791,10 @@ Result> GenerateArray(const Field& field, int64_t length, case Type::type::LARGE_STRING: case Type::type::LARGE_BINARY: { - const int32_t min_length = GetMetadata(field.metadata().get(), "min", 0); - const int32_t max_length = - GetMetadata(field.metadata().get(), "max", 1024); - const int32_t unique_values = - GetMetadata(field.metadata().get(), "unique", -1); + const auto min_length = GetMetadata(field.metadata().get(), "min", 0); + const auto max_length = GetMetadata(field.metadata().get(), "max", 1024); + const auto unique_values = + GetMetadata(field.metadata().get(), "unique", -1); if (unique_values > 0) { ABORT_NOT_OK( Status::NotImplemented("Generating random array with repeated values for " @@ -787,10 +805,10 @@ Result> GenerateArray(const Field& field, int64_t length, } case Type::type::LARGE_LIST: { - const int64_t values_length = - GetMetadata(field.metadata().get(), "values", length); - const bool force_empty_nulls = - GetMetadata(field.metadata().get(), "force_empty_nulls", false); + const auto values_length = + GetMetadata(field.metadata().get(), "values", length); + const auto force_empty_nulls = + GetMetadata(field.metadata().get(), "force_empty_nulls", false); auto values = GenerateArray( *internal::checked_pointer_cast(field.type())->value_field(), values_length, generator); @@ -814,8 +832,14 @@ Result> GenerateArray(const Field& field, int64_t length, } // namespace -std::shared_ptr Generate(const FieldVector& fields, int64_t length, - SeedType seed) { +std::shared_ptr GenerateArray(const Field& field, int64_t length, + SeedType seed) { + RandomArrayGenerator generator(seed); + return *GenerateArray(field, length, &generator); +} + +std::shared_ptr GenerateBatch(const FieldVector& fields, + int64_t length, SeedType seed) { std::vector> arrays(fields.size()); RandomArrayGenerator generator(seed); for (size_t i = 0; i < fields.size(); i++) { diff --git a/cpp/src/arrow/testing/random.h b/cpp/src/arrow/testing/random.h index bce792c53b4..942af583ec9 100644 --- a/cpp/src/arrow/testing/random.h +++ b/cpp/src/arrow/testing/random.h @@ -362,9 +362,34 @@ class ARROW_TESTING_EXPORT RandomArrayGenerator { std::default_random_engine seed_rng_; }; +/// Generate a record batch with random data of the specified length. 
+/// +/// Generation options are read from key-value metadata for each field +/// (including nested fields). +/// +/// The following options are supported: +/// +/// For all types except NullType: +/// - null_probability (double): range [0.0, 1.0] the probability of a null value. +/// Default/value is 0.0 if the field is marked non-nullable, else it is 0.01 +/// +/// For all numeric types T: +/// - min (T::c_type): the minimum value to generate (inclusive), default +/// std::numeric_limits::min() +/// - max (T::c_type): the maximum value to generate (inclusive), default +/// std::numeric_limits::max() Note this means that, for example, min/max are +/// int16_t values for HalfFloatType. +/// +/// For floating point types T for which is_physical_floating_type: +/// - nan_probability (double): range [0.0, 1.0] ARROW_TESTING_EXPORT std::shared_ptr GenerateBatch(const FieldVector& fields, int64_t size, - SeedType seed); + SeedType seed); + +/// Generate an array with random data. See GenerateBatch for usage info. +ARROW_TESTING_EXPORT +std::shared_ptr GenerateArray(const Field& field, int64_t size, + SeedType seed); } // namespace random diff --git a/cpp/src/arrow/testing/random_test.cc b/cpp/src/arrow/testing/random_test.cc index 09742d64cc9..b193f3bd97e 100644 --- a/cpp/src/arrow/testing/random_test.cc +++ b/cpp/src/arrow/testing/random_test.cc @@ -44,7 +44,15 @@ class RandomNumericArrayTest : public ::testing::Test { TEST_P(RandomArrayTest, GenerateArray) { auto field = GetField(); - auto batch = Generate({field}, 128, 0xDEADBEEF); + auto array = GenerateArray(*field, 128, 0xDEADBEEF); + AssertTypeEqual(field->type(), array->type()); + ASSERT_EQ(128, array->length()); + ASSERT_OK(array->ValidateFull()); +} + +TEST_P(RandomArrayTest, GenerateBatch) { + auto field = GetField(); + auto batch = GenerateBatch({field}, 128, 0xDEADBEEF); AssertSchemaEqual(schema({field}), batch->schema()); auto array = batch->column(0); ASSERT_EQ(128, array->length()); @@ -57,7 +65,7 @@ TEST_P(RandomArrayTest, GenerateArrayWithZeroNullProbability) { if (field->type()->id() == Type::type::NA) { GTEST_SKIP() << "Cannot generate non-null null arrays"; } - auto batch = Generate({field}, 128, 0xDEADBEEF); + auto batch = GenerateBatch({field}, 128, 0xDEADBEEF); AssertSchemaEqual(schema({field}), batch->schema()); auto array = batch->column(0); ASSERT_OK(array->ValidateFull()); @@ -69,14 +77,13 @@ TEST_P(RandomArrayTest, GenerateNonNullableArray) { if (field->type()->id() == Type::type::NA) { GTEST_SKIP() << "Cannot generate non-null null arrays"; } - auto batch = Generate({field}, 128, 0xDEADBEEF); + auto batch = GenerateBatch({field}, 128, 0xDEADBEEF); AssertSchemaEqual(schema({field}), batch->schema()); auto array = batch->column(0); ASSERT_OK(array->ValidateFull()); ASSERT_EQ(0, array->null_count()); } - auto values = ::testing::Values( field("null", null()), field("bool", boolean()), field("uint8", uint8()), field("int8", int8()), field("uint16", uint16()), field("int16", int16()), @@ -137,20 +144,20 @@ TYPED_TEST_SUITE(RandomNumericArrayTest, NumericTypes); TYPED_TEST(RandomNumericArrayTest, GenerateMinMax) { auto field = this->GetField()->WithMetadata( key_value_metadata({{"min", "0"}, {"max", "127"}, {"nan_probability", "0.0"}})); - auto batch = Generate({field}, 128, 0xDEADBEEF); + auto batch = GenerateBatch({field}, 128, 0xDEADBEEF); AssertSchemaEqual(schema({field}), batch->schema()); auto array = this->Downcast(batch->column(0)); for (auto slot : *array) { if (!slot.has_value()) continue; - ASSERT_GE(slot, 0); 
- ASSERT_LE(slot, 127); + ASSERT_GE(slot, typename TypeParam::c_type(0)); + ASSERT_LE(slot, typename TypeParam::c_type(127)); } } TEST(TypeSpecificTests, FloatNan) { auto field = arrow::field("float32", float32()) ->WithMetadata(key_value_metadata({{"nan_probability", "1.0"}})); - auto batch = Generate({field}, 128, 0xDEADBEEF); + auto batch = GenerateBatch({field}, 128, 0xDEADBEEF); AssertSchemaEqual(schema({field}), batch->schema()); auto array = internal::checked_pointer_cast>(batch->column(0)); auto it = array->begin(); @@ -165,10 +172,9 @@ TEST(TypeSpecificTests, FloatNan) { TEST(TypeSpecificTests, RepeatedStrings) { auto field = arrow::field("string", utf8())->WithMetadata(key_value_metadata({{"unique", "1"}})); - auto batch = Generate({field}, 128, 0xDEADBEEF); + auto batch = GenerateBatch({field}, 128, 0xDEADBEEF); AssertSchemaEqual(schema({field}), batch->schema()); auto array = internal::checked_pointer_cast(batch->column(0)); - auto it = array->begin(); util::string_view singular_value = array->GetView(0); for (auto slot : *array) { if (!slot.has_value()) continue; From df6c12db6f783e4d5689248bfa1c1723aa5b98b8 Mon Sep 17 00:00:00 2001 From: David Li Date: Wed, 17 Mar 2021 15:37:01 -0400 Subject: [PATCH 05/11] Make list/string generators more consistent --- cpp/src/arrow/testing/random.cc | 104 ++++++++++++++++++++------- cpp/src/arrow/testing/random_test.cc | 22 +++--- 2 files changed, 90 insertions(+), 36 deletions(-) diff --git a/cpp/src/arrow/testing/random.cc b/cpp/src/arrow/testing/random.cc index e311512f13c..3ef451b55e0 100644 --- a/cpp/src/arrow/testing/random.cc +++ b/cpp/src/arrow/testing/random.cc @@ -31,6 +31,7 @@ #include "arrow/array/builder_decimal.h" #include "arrow/array/builder_primitive.h" #include "arrow/buffer.h" +#include "arrow/compute/api_aggregate.h" #include "arrow/testing/gtest_util.h" #include "arrow/type.h" #include "arrow/type_fwd.h" @@ -421,6 +422,57 @@ std::shared_ptr GenerateOffsets(SeedType seed, int64_t size, std::make_shared(), size, buffers, null_count); return std::make_shared(array_data); } + +template +std::shared_ptr OffsetsFromLengthsArray(OffsetArrayType* lengths, + bool force_empty_nulls) { + // TODO: length 0 arrays (need test case) + DCHECK(!lengths->IsNull(0)); + DCHECK(!lengths->IsNull(lengths->length() - 1)); + // Need N + 1 offsets for N items + int64_t size = lengths->length() + 1; + BufferVector buffers{2}; + + int64_t null_count = 0; + + buffers[0] = *AllocateEmptyBitmap(size); + uint8_t* null_bitmap = buffers[0]->mutable_data(); + // Make sure the first and last entry are non-null + arrow::BitUtil::SetBit(null_bitmap, 0); + arrow::BitUtil::SetBit(null_bitmap, size - 1); + + buffers[1] = *AllocateBuffer(sizeof(typename OffsetArrayType::value_type) * size); + auto data = + reinterpret_cast(buffers[1]->mutable_data()); + data[0] = 0; + int index = 1; + for (const auto& length : *lengths) { + if (length.has_value()) { + arrow::BitUtil::SetBit(null_bitmap, index); + data[index] = data[index - 1] + *length; + DCHECK_GE(*length, 0); + } else { + data[index] = data[index - 1]; + } + index++; + } + + if (force_empty_nulls) { + arrow::internal::BitmapReader reader(null_bitmap, 0, size); + for (int64_t i = 0; i < size; ++i) { + if (reader.IsNotSet()) { + // Ensure a null entry corresponds to a 0-sized list extent + // (note this can be neither the first nor the last list entry, see above) + data[i + 1] = data[i]; + } + reader.Next(); + } + } + + auto array_data = ArrayData::Make( + std::make_shared(), size, buffers, null_count); + 
return std::make_shared(array_data); +} } // namespace std::shared_ptr RandomArrayGenerator::Offsets(int64_t size, int32_t first_offset, @@ -639,6 +691,27 @@ Result> GenerateArray(const Field& field, int64_t length, return generator->GENERATOR_FUNC(length, min_value, max_value, null_probability, \ nan_probability); \ } +#define GENERATE_LIST_CASE(ARRAY_TYPE) \ + case ARRAY_TYPE::TypeClass::type_id: { \ + const auto min_length = GetMetadata( \ + field.metadata().get(), "min", 0); \ + const auto max_length = GetMetadata( \ + field.metadata().get(), "max", 1024); \ + const auto lengths = internal::checked_pointer_cast< \ + CTypeTraits::ArrayType>( \ + generator->Numeric::ArrowType>( \ + length, min_length, max_length, null_probability)); \ + ARROW_ASSIGN_OR_RAISE(const auto values_datum, compute::Sum(lengths)); \ + const auto values_length = values_datum.scalar_as().value; \ + const auto force_empty_nulls = \ + GetMetadata(field.metadata().get(), "force_empty_nulls", false); \ + const auto values = GenerateArray( \ + *internal::checked_pointer_cast(field.type()) \ + ->value_field(), \ + values_length, generator); \ + const auto offsets = OffsetsFromLengthsArray(lengths.get(), force_empty_nulls); \ + return ARRAY_TYPE::FromArrays(*offsets, **values); \ + } const double null_probability = field.nullable() @@ -704,19 +777,7 @@ Result> GenerateArray(const Field& field, int64_t length, // type means it's not a (useful) composition of other generators GENERATE_INTEGRAL_CASE_VIEW(Int64Type, DayTimeIntervalType); - case Type::type::LIST: { - const auto values_length = GetMetadata(field.metadata().get(), "values", - static_cast(length)); - const auto force_empty_nulls = - GetMetadata(field.metadata().get(), "force_empty_nulls", false); - auto values = GenerateArray( - *internal::checked_pointer_cast(field.type())->value_field(), - values_length, generator); - // need N + 1 offsets to have N values - auto offsets = generator->Offsets(length + 1, 0, values_length, null_probability, - force_empty_nulls); - return ListArray::FromArrays(*offsets, **values); - } + GENERATE_LIST_CASE(ListArray); case Type::type::STRUCT: { ArrayVector child_arrays(field.type()->num_fields()); @@ -804,19 +865,7 @@ Result> GenerateArray(const Field& field, int64_t length, ->View(field.type()); } - case Type::type::LARGE_LIST: { - const auto values_length = - GetMetadata(field.metadata().get(), "values", length); - const auto force_empty_nulls = - GetMetadata(field.metadata().get(), "force_empty_nulls", false); - auto values = GenerateArray( - *internal::checked_pointer_cast(field.type())->value_field(), - values_length, generator); - // need N + 1 offsets to have N values - auto offsets = generator->LargeOffsets(length + 1, 0, values_length, - null_probability, force_empty_nulls); - return LargeListArray::FromArrays(*offsets, **values); - } + GENERATE_LIST_CASE(LargeListArray); default: break; @@ -824,6 +873,9 @@ Result> GenerateArray(const Field& field, int64_t length, #undef GENERATE_INTEGRAL_CASE_VIEW #undef GENERATE_INTEGRAL_CASE #undef GENERATE_FLOATING_CASE +#undef GENERATE_LIST_CASE +#undef VALIDATE_RANGE +#undef VALIDATE_MIN_MAX ABORT_NOT_OK( Status::NotImplemented("Generating random array for field ", field.ToString())); diff --git a/cpp/src/arrow/testing/random_test.cc b/cpp/src/arrow/testing/random_test.cc index b193f3bd97e..3855ab7426f 100644 --- a/cpp/src/arrow/testing/random_test.cc +++ b/cpp/src/arrow/testing/random_test.cc @@ -14,7 +14,6 @@ // KIND, either express or implied. 
See the License for the // specific language governing permissions and limitations // under the License. - #include #include "arrow/array.h" @@ -27,6 +26,9 @@ namespace arrow { namespace random { +// Use short arrays since especially in debug mode, generating list(list()) is slow +constexpr int64_t kExpectedLength = 24; + class RandomArrayTest : public ::testing::TestWithParam> { protected: std::shared_ptr GetField() { return GetParam(); } @@ -44,18 +46,18 @@ class RandomNumericArrayTest : public ::testing::Test { TEST_P(RandomArrayTest, GenerateArray) { auto field = GetField(); - auto array = GenerateArray(*field, 128, 0xDEADBEEF); + auto array = GenerateArray(*field, kExpectedLength, 0xDEADBEEF); AssertTypeEqual(field->type(), array->type()); - ASSERT_EQ(128, array->length()); + ASSERT_EQ(kExpectedLength, array->length()); ASSERT_OK(array->ValidateFull()); } TEST_P(RandomArrayTest, GenerateBatch) { auto field = GetField(); - auto batch = GenerateBatch({field}, 128, 0xDEADBEEF); + auto batch = GenerateBatch({field}, kExpectedLength, 0xDEADBEEF); AssertSchemaEqual(schema({field}), batch->schema()); auto array = batch->column(0); - ASSERT_EQ(128, array->length()); + ASSERT_EQ(kExpectedLength, array->length()); ASSERT_OK(array->ValidateFull()); } @@ -65,7 +67,7 @@ TEST_P(RandomArrayTest, GenerateArrayWithZeroNullProbability) { if (field->type()->id() == Type::type::NA) { GTEST_SKIP() << "Cannot generate non-null null arrays"; } - auto batch = GenerateBatch({field}, 128, 0xDEADBEEF); + auto batch = GenerateBatch({field}, kExpectedLength, 0xDEADBEEF); AssertSchemaEqual(schema({field}), batch->schema()); auto array = batch->column(0); ASSERT_OK(array->ValidateFull()); @@ -77,7 +79,7 @@ TEST_P(RandomArrayTest, GenerateNonNullableArray) { if (field->type()->id() == Type::type::NA) { GTEST_SKIP() << "Cannot generate non-null null arrays"; } - auto batch = GenerateBatch({field}, 128, 0xDEADBEEF); + auto batch = GenerateBatch({field}, kExpectedLength, 0xDEADBEEF); AssertSchemaEqual(schema({field}), batch->schema()); auto array = batch->column(0); ASSERT_OK(array->ValidateFull()); @@ -144,7 +146,7 @@ TYPED_TEST_SUITE(RandomNumericArrayTest, NumericTypes); TYPED_TEST(RandomNumericArrayTest, GenerateMinMax) { auto field = this->GetField()->WithMetadata( key_value_metadata({{"min", "0"}, {"max", "127"}, {"nan_probability", "0.0"}})); - auto batch = GenerateBatch({field}, 128, 0xDEADBEEF); + auto batch = GenerateBatch({field}, kExpectedLength, 0xDEADBEEF); AssertSchemaEqual(schema({field}), batch->schema()); auto array = this->Downcast(batch->column(0)); for (auto slot : *array) { @@ -157,7 +159,7 @@ TYPED_TEST(RandomNumericArrayTest, GenerateMinMax) { TEST(TypeSpecificTests, FloatNan) { auto field = arrow::field("float32", float32()) ->WithMetadata(key_value_metadata({{"nan_probability", "1.0"}})); - auto batch = GenerateBatch({field}, 128, 0xDEADBEEF); + auto batch = GenerateBatch({field}, kExpectedLength, 0xDEADBEEF); AssertSchemaEqual(schema({field}), batch->schema()); auto array = internal::checked_pointer_cast>(batch->column(0)); auto it = array->begin(); @@ -172,7 +174,7 @@ TEST(TypeSpecificTests, FloatNan) { TEST(TypeSpecificTests, RepeatedStrings) { auto field = arrow::field("string", utf8())->WithMetadata(key_value_metadata({{"unique", "1"}})); - auto batch = GenerateBatch({field}, 128, 0xDEADBEEF); + auto batch = GenerateBatch({field}, kExpectedLength, 0xDEADBEEF); AssertSchemaEqual(schema({field}), batch->schema()); auto array = internal::checked_pointer_cast(batch->column(0)); 
util::string_view singular_value = array->GetView(0); From 1326337c08489a5100b7913a26b3c19f2876d7c2 Mon Sep 17 00:00:00 2001 From: David Li Date: Wed, 17 Mar 2021 16:37:11 -0400 Subject: [PATCH 06/11] Add docs, type-specific tests --- cpp/src/arrow/testing/random.cc | 25 ++-- cpp/src/arrow/testing/random.h | 36 +++++- cpp/src/arrow/testing/random_test.cc | 170 +++++++++++++++++++++++++-- 3 files changed, 204 insertions(+), 27 deletions(-) diff --git a/cpp/src/arrow/testing/random.cc b/cpp/src/arrow/testing/random.cc index 3ef451b55e0..90ddbcca491 100644 --- a/cpp/src/arrow/testing/random.cc +++ b/cpp/src/arrow/testing/random.cc @@ -426,9 +426,8 @@ std::shared_ptr GenerateOffsets(SeedType seed, int64_t size, template std::shared_ptr OffsetsFromLengthsArray(OffsetArrayType* lengths, bool force_empty_nulls) { - // TODO: length 0 arrays (need test case) - DCHECK(!lengths->IsNull(0)); - DCHECK(!lengths->IsNull(lengths->length() - 1)); + DCHECK(lengths->length() == 0 || !lengths->IsNull(0)); + DCHECK(lengths->length() == 0 || !lengths->IsNull(lengths->length() - 1)); // Need N + 1 offsets for N items int64_t size = lengths->length() + 1; BufferVector buffers{2}; @@ -694,9 +693,9 @@ Result> GenerateArray(const Field& field, int64_t length, #define GENERATE_LIST_CASE(ARRAY_TYPE) \ case ARRAY_TYPE::TypeClass::type_id: { \ const auto min_length = GetMetadata( \ - field.metadata().get(), "min", 0); \ + field.metadata().get(), "min_length", 0); \ const auto max_length = GetMetadata( \ - field.metadata().get(), "max", 1024); \ + field.metadata().get(), "max_length", 1024); \ const auto lengths = internal::checked_pointer_cast< \ CTypeTraits::ArrayType>( \ generator->Numeric::ArrowType>( \ @@ -743,8 +742,10 @@ Result> GenerateArray(const Field& field, int64_t length, case Type::type::STRING: case Type::type::BINARY: { - const auto min_length = GetMetadata(field.metadata().get(), "min", 0); - const auto max_length = GetMetadata(field.metadata().get(), "max", 1024); + const auto min_length = + GetMetadata(field.metadata().get(), "min_length", 0); + const auto max_length = + GetMetadata(field.metadata().get(), "max_length", 1024); const auto unique_values = GetMetadata(field.metadata().get(), "unique", -1); if (unique_values > 0) { @@ -813,6 +814,10 @@ Result> GenerateArray(const Field& field, int64_t length, *arrow::field("temporary", dict_type->value_type(), /*nullable=*/false), values_length, generator); auto merged = field.metadata() ? 
field.metadata() : key_value_metadata({}, {}); + if (merged->Contains("min")) + ABORT_NOT_OK(Status::Invalid(field.ToString(), ": cannot specify min")); + if (merged->Contains("max")) + ABORT_NOT_OK(Status::Invalid(field.ToString(), ": cannot specify max")); merged = merged->Merge(*key_value_metadata( {{"min", "0"}, {"max", std::to_string(values_length - 1)}})); auto indices = *GenerateArray( @@ -852,8 +857,10 @@ Result> GenerateArray(const Field& field, int64_t length, case Type::type::LARGE_STRING: case Type::type::LARGE_BINARY: { - const auto min_length = GetMetadata(field.metadata().get(), "min", 0); - const auto max_length = GetMetadata(field.metadata().get(), "max", 1024); + const auto min_length = + GetMetadata(field.metadata().get(), "min_length", 0); + const auto max_length = + GetMetadata(field.metadata().get(), "max_length", 1024); const auto unique_values = GetMetadata(field.metadata().get(), "unique", -1); if (unique_values > 0) { diff --git a/cpp/src/arrow/testing/random.h b/cpp/src/arrow/testing/random.h index 942af583ec9..7457cedcdec 100644 --- a/cpp/src/arrow/testing/random.h +++ b/cpp/src/arrow/testing/random.h @@ -364,8 +364,9 @@ class ARROW_TESTING_EXPORT RandomArrayGenerator { /// Generate a record batch with random data of the specified length. /// -/// Generation options are read from key-value metadata for each field -/// (including nested fields). +/// Generation options are read from key-value metadata for each field. Options +/// are applied recursively, e.g. for list(field(int8())), metadata of the child +/// field will be used when generating child values. /// /// The following options are supported: /// @@ -375,13 +376,36 @@ class ARROW_TESTING_EXPORT RandomArrayGenerator { /// /// For all numeric types T: /// - min (T::c_type): the minimum value to generate (inclusive), default -/// std::numeric_limits::min() +/// std::numeric_limits::min() /// - max (T::c_type): the maximum value to generate (inclusive), default -/// std::numeric_limits::max() Note this means that, for example, min/max are -/// int16_t values for HalfFloatType. +/// std::numeric_limits::max() +/// Note this means that, for example, min/max are int16_t values for HalfFloatType. /// /// For floating point types T for which is_physical_floating_type: -/// - nan_probability (double): range [0.0, 1.0] +/// - nan_probability (double): range [0.0, 1.0] the probability of a NaN value. +/// +/// For BooleanType: +/// - true_probability (double): range [0.0, 1.0] the probability of a true. +/// +/// For DictionaryType: +/// - values (int32_t): the size of the dictionary. +/// Other properties are passed to the generator for the dictionary indices. However, min +/// and max cannot be specified. Note it is not possible to otherwise customize the +/// generation of dictionary values. +/// +/// For list, string, and binary types T, including their large variants: +/// - min_length (T::offset_type): the minimum length of the child to generate, +/// default 0 +/// - max_length (T::offset_type): the minimum length of the child to generate, +/// default 1024 +/// +/// For string and binary types T (not including their large variants): +/// - unique (int32_t): if positive, this many distinct values will be generated +/// and all array values will be one of these values, default -1 +/// +/// For MapType: +/// - values (int32_t): the number of key-value pairs to generate, which will be +/// partitioned among the array values. 
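/// For illustration only, a minimal sketch of driving these options through field
/// metadata (the field names, metadata values, sizes, and seed below are assumptions
/// made for the example, not part of this patch):
///
///   // About 10% nulls, values restricted to [0, 100].
///   auto f0 = field("ints", int16(), /*nullable=*/true,
///                   key_value_metadata(
///                       {{"min", "0"}, {"max", "100"}, {"null_probability", "0.1"}}));
///   // Non-null strings of 1 to 16 bytes.
///   auto f1 = field("names", utf8(), /*nullable=*/false,
///                   key_value_metadata({{"min_length", "1"}, {"max_length", "16"}}));
///   auto batch = GenerateBatch({f0, f1}, /*size=*/64, /*seed=*/42);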
ARROW_TESTING_EXPORT std::shared_ptr GenerateBatch(const FieldVector& fields, int64_t size, SeedType seed); diff --git a/cpp/src/arrow/testing/random_test.cc b/cpp/src/arrow/testing/random_test.cc index 3855ab7426f..7ec3dee6e7e 100644 --- a/cpp/src/arrow/testing/random_test.cc +++ b/cpp/src/arrow/testing/random_test.cc @@ -61,6 +61,17 @@ TEST_P(RandomArrayTest, GenerateBatch) { ASSERT_OK(array->ValidateFull()); } +TEST_P(RandomArrayTest, GenerateZeroLengthArray) { + auto field = GetField(); + if (field->type()->id() == Type::type::DENSE_UNION) { + GTEST_SKIP() << "Cannot generate zero-length dense union arrays"; + } + auto array = GenerateArray(*field, 0, 0xDEADBEEF); + AssertTypeEqual(field->type(), array->type()); + ASSERT_EQ(0, array->length()); + ASSERT_OK(array->ValidateFull()); +} + TEST_P(RandomArrayTest, GenerateArrayWithZeroNullProbability) { auto field = GetField()->WithMetadata(key_value_metadata({{"null_probability", "0.0"}})); @@ -156,32 +167,167 @@ TYPED_TEST(RandomNumericArrayTest, GenerateMinMax) { } } -TEST(TypeSpecificTests, FloatNan) { +// Test all the supported options +TEST(TypeSpecificTests, BoolTrueProbability) { + auto field = arrow::field("bool", boolean()) + ->WithMetadata(key_value_metadata({{"true_probability", "1.0"}})); + auto base_array = GenerateArray(*field, kExpectedLength, 0xDEADBEEF); + AssertTypeEqual(field->type(), base_array->type()); + auto array = internal::checked_pointer_cast(base_array); + ASSERT_OK(array->ValidateFull()); + for (const auto& value : *array) { + ASSERT_TRUE(!value.has_value() || *value); + } +} + +TEST(TypeSpecificTests, DictionaryValues) { + auto field = arrow::field("dictionary", dictionary(int8(), utf8())) + ->WithMetadata(key_value_metadata({{"values", "16"}})); + auto base_array = GenerateArray(*field, kExpectedLength, 0xDEADBEEF); + AssertTypeEqual(field->type(), base_array->type()); + auto array = internal::checked_pointer_cast(base_array); + ASSERT_OK(array->ValidateFull()); + ASSERT_EQ(16, array->dictionary()->length()); +} + +TEST(TypeSpecificTests, Float32Nan) { auto field = arrow::field("float32", float32()) ->WithMetadata(key_value_metadata({{"nan_probability", "1.0"}})); - auto batch = GenerateBatch({field}, kExpectedLength, 0xDEADBEEF); - AssertSchemaEqual(schema({field}), batch->schema()); - auto array = internal::checked_pointer_cast>(batch->column(0)); - auto it = array->begin(); - while (it != array->end()) { - if ((*it).has_value()) { - ASSERT_TRUE(std::isnan(**it)); + auto base_array = GenerateArray(*field, kExpectedLength, 0xDEADBEEF); + AssertTypeEqual(field->type(), base_array->type()); + auto array = internal::checked_pointer_cast>(base_array); + ASSERT_OK(array->ValidateFull()); + for (const auto& value : *array) { + ASSERT_TRUE(!value.has_value() || std::isnan(*value)); + } +} + +TEST(TypeSpecificTests, Float64Nan) { + auto field = arrow::field("float64", float64()) + ->WithMetadata(key_value_metadata({{"nan_probability", "1.0"}})); + auto base_array = GenerateArray(*field, kExpectedLength, 0xDEADBEEF); + AssertTypeEqual(field->type(), base_array->type()); + auto array = internal::checked_pointer_cast>(base_array); + ASSERT_OK(array->ValidateFull()); + for (const auto& value : *array) { + ASSERT_TRUE(!value.has_value() || std::isnan(*value)); + } +} + +TEST(TypeSpecificTests, ListLengths) { + { + auto field = arrow::field("list", list(int8())) + ->WithMetadata( + key_value_metadata({{"min_length", "1"}, {"max_length", "1"}})); + auto base_array = GenerateArray(*field, kExpectedLength, 0xDEADBEEF); + 
AssertTypeEqual(field->type(), base_array->type()); + auto array = internal::checked_pointer_cast(base_array); + ASSERT_OK(array->ValidateFull()); + for (int i = 0; i < kExpectedLength; i++) { + if (!array->IsNull(i)) { + ASSERT_EQ(1, array->value_length(i)); + } + } + } + { + auto field = arrow::field("list", large_list(int8())) + ->WithMetadata(key_value_metadata( + {{"min_length", "10"}, {"max_length", "10"}})); + auto base_array = GenerateArray(*field, kExpectedLength, 0xDEADBEEF); + AssertTypeEqual(field->type(), base_array->type()); + auto array = internal::checked_pointer_cast(base_array); + ASSERT_OK(array->ValidateFull()); + for (int i = 0; i < kExpectedLength; i++) { + if (!array->IsNull(i)) { + ASSERT_EQ(10, array->value_length(i)); + } } - it++; } } +TEST(TypeSpecificTests, MapValues) { + auto field = arrow::field("map", map(int8(), int8())) + ->WithMetadata(key_value_metadata({{"values", "4"}})); + auto base_array = GenerateArray(*field, kExpectedLength, 0xDEADBEEF); + AssertTypeEqual(field->type(), base_array->type()); + auto array = internal::checked_pointer_cast(base_array); + ASSERT_OK(array->ValidateFull()); + ASSERT_EQ(4, array->keys()->length()); + ASSERT_EQ(4, array->items()->length()); +} + TEST(TypeSpecificTests, RepeatedStrings) { auto field = arrow::field("string", utf8())->WithMetadata(key_value_metadata({{"unique", "1"}})); - auto batch = GenerateBatch({field}, kExpectedLength, 0xDEADBEEF); - AssertSchemaEqual(schema({field}), batch->schema()); - auto array = internal::checked_pointer_cast(batch->column(0)); + auto base_array = GenerateArray(*field, kExpectedLength, 0xDEADBEEF); + AssertTypeEqual(field->type(), base_array->type()); + auto array = internal::checked_pointer_cast(base_array); + ASSERT_OK(array->ValidateFull()); util::string_view singular_value = array->GetView(0); for (auto slot : *array) { if (!slot.has_value()) continue; ASSERT_EQ(slot, singular_value); } + // N.B. 
LargeString does not support unique +} + +TEST(TypeSpecificTests, StringLengths) { + { + auto field = arrow::field("list", utf8()) + ->WithMetadata( + key_value_metadata({{"min_length", "1"}, {"max_length", "1"}})); + auto base_array = GenerateArray(*field, kExpectedLength, 0xDEADBEEF); + AssertTypeEqual(field->type(), base_array->type()); + auto array = internal::checked_pointer_cast(base_array); + ASSERT_OK(array->ValidateFull()); + for (int i = 0; i < kExpectedLength; i++) { + if (!array->IsNull(i)) { + ASSERT_EQ(1, array->value_length(i)); + } + } + } + { + auto field = arrow::field("list", binary()) + ->WithMetadata( + key_value_metadata({{"min_length", "1"}, {"max_length", "1"}})); + auto base_array = GenerateArray(*field, kExpectedLength, 0xDEADBEEF); + AssertTypeEqual(field->type(), base_array->type()); + auto array = internal::checked_pointer_cast(base_array); + ASSERT_OK(array->ValidateFull()); + for (int i = 0; i < kExpectedLength; i++) { + if (!array->IsNull(i)) { + ASSERT_EQ(1, array->value_length(i)); + } + } + } + { + auto field = arrow::field("list", large_utf8()) + ->WithMetadata(key_value_metadata( + {{"min_length", "10"}, {"max_length", "10"}})); + auto base_array = GenerateArray(*field, kExpectedLength, 0xDEADBEEF); + AssertTypeEqual(field->type(), base_array->type()); + auto array = internal::checked_pointer_cast(base_array); + ASSERT_OK(array->ValidateFull()); + for (int i = 0; i < kExpectedLength; i++) { + if (!array->IsNull(i)) { + ASSERT_EQ(10, array->value_length(i)); + } + } + } + { + auto field = arrow::field("list", large_binary()) + ->WithMetadata(key_value_metadata( + {{"min_length", "10"}, {"max_length", "10"}})); + auto base_array = GenerateArray(*field, kExpectedLength, 0xDEADBEEF); + AssertTypeEqual(field->type(), base_array->type()); + auto array = internal::checked_pointer_cast(base_array); + ASSERT_OK(array->ValidateFull()); + for (int i = 0; i < kExpectedLength; i++) { + if (!array->IsNull(i)) { + ASSERT_EQ(10, array->value_length(i)); + } + } + } } } // namespace random From e2e9a9c8d8e0103a338bd43d0613f9381b8dc587 Mon Sep 17 00:00:00 2001 From: David Li Date: Wed, 17 Mar 2021 16:42:39 -0400 Subject: [PATCH 07/11] Add field-with-metadata overload --- cpp/src/arrow/testing/random_test.cc | 57 +++++++++++++--------------- cpp/src/arrow/type.cc | 6 +++ cpp/src/arrow/type_fwd.h | 11 ++++++ 3 files changed, 44 insertions(+), 30 deletions(-) diff --git a/cpp/src/arrow/testing/random_test.cc b/cpp/src/arrow/testing/random_test.cc index 7ec3dee6e7e..42624eb3af8 100644 --- a/cpp/src/arrow/testing/random_test.cc +++ b/cpp/src/arrow/testing/random_test.cc @@ -169,8 +169,8 @@ TYPED_TEST(RandomNumericArrayTest, GenerateMinMax) { // Test all the supported options TEST(TypeSpecificTests, BoolTrueProbability) { - auto field = arrow::field("bool", boolean()) - ->WithMetadata(key_value_metadata({{"true_probability", "1.0"}})); + auto field = + arrow::field("bool", boolean(), key_value_metadata({{"true_probability", "1.0"}})); auto base_array = GenerateArray(*field, kExpectedLength, 0xDEADBEEF); AssertTypeEqual(field->type(), base_array->type()); auto array = internal::checked_pointer_cast(base_array); @@ -181,8 +181,8 @@ TEST(TypeSpecificTests, BoolTrueProbability) { } TEST(TypeSpecificTests, DictionaryValues) { - auto field = arrow::field("dictionary", dictionary(int8(), utf8())) - ->WithMetadata(key_value_metadata({{"values", "16"}})); + auto field = arrow::field("dictionary", dictionary(int8(), utf8()), + key_value_metadata({{"values", "16"}})); auto base_array = 
GenerateArray(*field, kExpectedLength, 0xDEADBEEF); AssertTypeEqual(field->type(), base_array->type()); auto array = internal::checked_pointer_cast(base_array); @@ -191,8 +191,8 @@ TEST(TypeSpecificTests, DictionaryValues) { } TEST(TypeSpecificTests, Float32Nan) { - auto field = arrow::field("float32", float32()) - ->WithMetadata(key_value_metadata({{"nan_probability", "1.0"}})); + auto field = arrow::field("float32", float32(), + key_value_metadata({{"nan_probability", "1.0"}})); auto base_array = GenerateArray(*field, kExpectedLength, 0xDEADBEEF); AssertTypeEqual(field->type(), base_array->type()); auto array = internal::checked_pointer_cast>(base_array); @@ -203,8 +203,8 @@ TEST(TypeSpecificTests, Float32Nan) { } TEST(TypeSpecificTests, Float64Nan) { - auto field = arrow::field("float64", float64()) - ->WithMetadata(key_value_metadata({{"nan_probability", "1.0"}})); + auto field = arrow::field("float64", float64(), + key_value_metadata({{"nan_probability", "1.0"}})); auto base_array = GenerateArray(*field, kExpectedLength, 0xDEADBEEF); AssertTypeEqual(field->type(), base_array->type()); auto array = internal::checked_pointer_cast>(base_array); @@ -216,9 +216,9 @@ TEST(TypeSpecificTests, Float64Nan) { TEST(TypeSpecificTests, ListLengths) { { - auto field = arrow::field("list", list(int8())) - ->WithMetadata( - key_value_metadata({{"min_length", "1"}, {"max_length", "1"}})); + auto field = + arrow::field("list", list(int8()), + key_value_metadata({{"min_length", "1"}, {"max_length", "1"}})); auto base_array = GenerateArray(*field, kExpectedLength, 0xDEADBEEF); AssertTypeEqual(field->type(), base_array->type()); auto array = internal::checked_pointer_cast(base_array); @@ -230,9 +230,9 @@ TEST(TypeSpecificTests, ListLengths) { } } { - auto field = arrow::field("list", large_list(int8())) - ->WithMetadata(key_value_metadata( - {{"min_length", "10"}, {"max_length", "10"}})); + auto field = + arrow::field("list", large_list(int8()), + key_value_metadata({{"min_length", "10"}, {"max_length", "10"}})); auto base_array = GenerateArray(*field, kExpectedLength, 0xDEADBEEF); AssertTypeEqual(field->type(), base_array->type()); auto array = internal::checked_pointer_cast(base_array); @@ -246,8 +246,8 @@ TEST(TypeSpecificTests, ListLengths) { } TEST(TypeSpecificTests, MapValues) { - auto field = arrow::field("map", map(int8(), int8())) - ->WithMetadata(key_value_metadata({{"values", "4"}})); + auto field = + arrow::field("map", map(int8(), int8()), key_value_metadata({{"values", "4"}})); auto base_array = GenerateArray(*field, kExpectedLength, 0xDEADBEEF); AssertTypeEqual(field->type(), base_array->type()); auto array = internal::checked_pointer_cast(base_array); @@ -257,8 +257,7 @@ TEST(TypeSpecificTests, MapValues) { } TEST(TypeSpecificTests, RepeatedStrings) { - auto field = - arrow::field("string", utf8())->WithMetadata(key_value_metadata({{"unique", "1"}})); + auto field = arrow::field("string", utf8(), key_value_metadata({{"unique", "1"}})); auto base_array = GenerateArray(*field, kExpectedLength, 0xDEADBEEF); AssertTypeEqual(field->type(), base_array->type()); auto array = internal::checked_pointer_cast(base_array); @@ -273,9 +272,8 @@ TEST(TypeSpecificTests, RepeatedStrings) { TEST(TypeSpecificTests, StringLengths) { { - auto field = arrow::field("list", utf8()) - ->WithMetadata( - key_value_metadata({{"min_length", "1"}, {"max_length", "1"}})); + auto field = arrow::field( + "list", utf8(), key_value_metadata({{"min_length", "1"}, {"max_length", "1"}})); auto base_array = GenerateArray(*field, 
kExpectedLength, 0xDEADBEEF); AssertTypeEqual(field->type(), base_array->type()); auto array = internal::checked_pointer_cast(base_array); @@ -287,9 +285,8 @@ TEST(TypeSpecificTests, StringLengths) { } } { - auto field = arrow::field("list", binary()) - ->WithMetadata( - key_value_metadata({{"min_length", "1"}, {"max_length", "1"}})); + auto field = arrow::field( + "list", binary(), key_value_metadata({{"min_length", "1"}, {"max_length", "1"}})); auto base_array = GenerateArray(*field, kExpectedLength, 0xDEADBEEF); AssertTypeEqual(field->type(), base_array->type()); auto array = internal::checked_pointer_cast(base_array); @@ -301,9 +298,9 @@ TEST(TypeSpecificTests, StringLengths) { } } { - auto field = arrow::field("list", large_utf8()) - ->WithMetadata(key_value_metadata( - {{"min_length", "10"}, {"max_length", "10"}})); + auto field = + arrow::field("list", large_utf8(), + key_value_metadata({{"min_length", "10"}, {"max_length", "10"}})); auto base_array = GenerateArray(*field, kExpectedLength, 0xDEADBEEF); AssertTypeEqual(field->type(), base_array->type()); auto array = internal::checked_pointer_cast(base_array); @@ -315,9 +312,9 @@ TEST(TypeSpecificTests, StringLengths) { } } { - auto field = arrow::field("list", large_binary()) - ->WithMetadata(key_value_metadata( - {{"min_length", "10"}, {"max_length", "10"}})); + auto field = + arrow::field("list", large_binary(), + key_value_metadata({{"min_length", "10"}, {"max_length", "10"}})); auto base_array = GenerateArray(*field, kExpectedLength, 0xDEADBEEF); AssertTypeEqual(field->type(), base_array->type()); auto array = internal::checked_pointer_cast(base_array); diff --git a/cpp/src/arrow/type.cc b/cpp/src/arrow/type.cc index 0a9f50529c2..344585446fc 100644 --- a/cpp/src/arrow/type.cc +++ b/cpp/src/arrow/type.cc @@ -2233,6 +2233,12 @@ std::shared_ptr field(std::string name, std::shared_ptr type, std::move(metadata)); } +std::shared_ptr field(std::string name, std::shared_ptr type, + std::shared_ptr metadata) { + return std::make_shared(std::move(name), std::move(type), /*nullable=*/true, + std::move(metadata)); +} + std::shared_ptr decimal(int32_t precision, int32_t scale) { return precision <= Decimal128Type::kMaxPrecision ? decimal128(precision, scale) : decimal256(precision, scale); diff --git a/cpp/src/arrow/type_fwd.h b/cpp/src/arrow/type_fwd.h index 230c1ff6cb6..168e172bc88 100644 --- a/cpp/src/arrow/type_fwd.h +++ b/cpp/src/arrow/type_fwd.h @@ -629,6 +629,17 @@ std::shared_ptr ARROW_EXPORT field(std::string name, std::shared_ptr type, bool nullable = true, std::shared_ptr metadata = NULLPTR); +/// \brief Create a Field instance with metadata +/// +/// The field will be assumed to be nullable. 
+/// +/// \param name the field name +/// \param type the field value type +/// \param metadata any custom key-value metadata +std::shared_ptr ARROW_EXPORT +field(std::string name, std::shared_ptr type, + std::shared_ptr metadata); + /// \brief Create a Schema instance /// /// \param fields the schema's fields From 76ce5ab6d5559824800d741d64e04ace7674c50f Mon Sep 17 00:00:00 2001 From: David Li Date: Wed, 17 Mar 2021 16:52:16 -0400 Subject: [PATCH 08/11] Don't assume we have access to arrow::compute --- cpp/src/arrow/testing/random.cc | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/cpp/src/arrow/testing/random.cc b/cpp/src/arrow/testing/random.cc index 90ddbcca491..ef7045d2f40 100644 --- a/cpp/src/arrow/testing/random.cc +++ b/cpp/src/arrow/testing/random.cc @@ -31,7 +31,6 @@ #include "arrow/array/builder_decimal.h" #include "arrow/array/builder_primitive.h" #include "arrow/buffer.h" -#include "arrow/compute/api_aggregate.h" #include "arrow/testing/gtest_util.h" #include "arrow/type.h" #include "arrow/type_fwd.h" @@ -690,6 +689,8 @@ Result> GenerateArray(const Field& field, int64_t length, return generator->GENERATOR_FUNC(length, min_value, max_value, null_probability, \ nan_probability); \ } + + // Don't use compute::Sum since that may not get built #define GENERATE_LIST_CASE(ARRAY_TYPE) \ case ARRAY_TYPE::TypeClass::type_id: { \ const auto min_length = GetMetadata( \ @@ -700,8 +701,10 @@ Result> GenerateArray(const Field& field, int64_t length, CTypeTraits::ArrayType>( \ generator->Numeric::ArrowType>( \ length, min_length, max_length, null_probability)); \ - ARROW_ASSIGN_OR_RAISE(const auto values_datum, compute::Sum(lengths)); \ - const auto values_length = values_datum.scalar_as().value; \ + int64_t values_length = 0; \ + for (const auto& length : *lengths) { \ + if (length.has_value()) values_length += *length; \ + } \ const auto force_empty_nulls = \ GetMetadata(field.metadata().get(), "force_empty_nulls", false); \ const auto values = GenerateArray( \ From b5ef9fa3cfdbddb1e93b36a0b7f1e30429c905bd Mon Sep 17 00:00:00 2001 From: David Li Date: Thu, 18 Mar 2021 09:03:27 -0400 Subject: [PATCH 09/11] Fix null_count --- cpp/src/arrow/filesystem/s3fs_benchmark.cc | 2 +- cpp/src/arrow/testing/random.cc | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/cpp/src/arrow/filesystem/s3fs_benchmark.cc b/cpp/src/arrow/filesystem/s3fs_benchmark.cc index 2bf2b400e8d..045fc8808de 100644 --- a/cpp/src/arrow/filesystem/s3fs_benchmark.cc +++ b/cpp/src/arrow/filesystem/s3fs_benchmark.cc @@ -163,7 +163,7 @@ class MinioFixture : public benchmark::Fixture { key_value_metadata( {{"min", "-1.e10"}, {"max", "1e10"}, {"null_probability", "0"}}))); } - auto batch = random::Generate(fields, num_rows, 0); + auto batch = random::GenerateBatch(fields, num_rows, 0); ARROW_ASSIGN_OR_RAISE(std::shared_ptr
table, Table::FromRecordBatches({batch})); diff --git a/cpp/src/arrow/testing/random.cc b/cpp/src/arrow/testing/random.cc index ef7045d2f40..5689eedbda6 100644 --- a/cpp/src/arrow/testing/random.cc +++ b/cpp/src/arrow/testing/random.cc @@ -451,6 +451,7 @@ std::shared_ptr OffsetsFromLengthsArray(OffsetArrayType* lengths, DCHECK_GE(*length, 0); } else { data[index] = data[index - 1]; + null_count++; } index++; } From cd55a41955ec6ae16865235ee04cad334effc0ad Mon Sep 17 00:00:00 2001 From: David Li Date: Thu, 18 Mar 2021 13:52:04 -0400 Subject: [PATCH 10/11] Apply suggestions from code review Co-authored-by: Benjamin Kietzman --- cpp/src/arrow/filesystem/s3fs_benchmark.cc | 15 +++++++-------- cpp/src/arrow/testing/random.h | 7 ++++--- 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/cpp/src/arrow/filesystem/s3fs_benchmark.cc b/cpp/src/arrow/filesystem/s3fs_benchmark.cc index 045fc8808de..36564a70d29 100644 --- a/cpp/src/arrow/filesystem/s3fs_benchmark.cc +++ b/cpp/src/arrow/filesystem/s3fs_benchmark.cc @@ -151,17 +151,16 @@ class MinioFixture : public benchmark::Fixture { field("timestamp", int64(), /*nullable=*/true, key_value_metadata( {{"min", "0"}, {"max", "10000000000"}, {"null_probability", "0"}})), - ::arrow::field( - "val", int32(), /*nullable=*/true, - key_value_metadata( - {{"min", "0"}, {"max", "1000000000"}, {"null_probability", "0"}}))}; + field("val", int32(), /*nullable=*/true, + key_value_metadata( + {{"min", "0"}, {"max", "1000000000"}, {"null_probability", "0"}}))}; for (int i = 0; i < num_columns; i++) { std::stringstream ss; ss << "col" << i; - fields.push_back(::arrow::field( - ss.str(), float64(), /*nullable=*/true, - key_value_metadata( - {{"min", "-1.e10"}, {"max", "1e10"}, {"null_probability", "0"}}))); + fields.push_back( + field(ss.str(), float64(), /*nullable=*/true, + key_value_metadata( + {{"min", "-1.e10"}, {"max", "1e10"}, {"null_probability", "0"}}))); } auto batch = random::GenerateBatch(fields, num_rows, 0); ARROW_ASSIGN_OR_RAISE(std::shared_ptr
table, diff --git a/cpp/src/arrow/testing/random.h b/cpp/src/arrow/testing/random.h index 7457cedcdec..af8066a9fc7 100644 --- a/cpp/src/arrow/testing/random.h +++ b/cpp/src/arrow/testing/random.h @@ -364,9 +364,10 @@ class ARROW_TESTING_EXPORT RandomArrayGenerator { /// Generate a record batch with random data of the specified length. /// -/// Generation options are read from key-value metadata for each field. Options -/// are applied recursively, e.g. for list(field(int8())), metadata of the child -/// field will be used when generating child values. +/// Generation options are read from key-value metadata for each field, and may be +/// specified at any nesting level. For example, generation options for the child values +/// of a list array can be specified by constructing the list type with +/// list(field("item", int8(), options_metadata)) /// /// The following options are supported: /// From 04de07986bd7c3a6e5786c09ea8e6ec6b590b95c Mon Sep 17 00:00:00 2001 From: David Li Date: Thu, 18 Mar 2021 14:06:44 -0400 Subject: [PATCH 11/11] ARROW-11745: [C++] Move random batch helper to method --- cpp/src/arrow/testing/random.cc | 152 ++++++++++++++++---------------- cpp/src/arrow/testing/random.h | 99 +++++++++++---------- 2 files changed, 128 insertions(+), 123 deletions(-) diff --git a/cpp/src/arrow/testing/random.cc b/cpp/src/arrow/testing/random.cc index 5689eedbda6..a29a464846a 100644 --- a/cpp/src/arrow/testing/random.cc +++ b/cpp/src/arrow/testing/random.cc @@ -650,10 +650,9 @@ enable_if_parameter_free GetMetadata(const KeyValueMetadata* metad } return output; } +} // namespace -Result> GenerateArray(const Field& field, int64_t length, - RandomArrayGenerator* generator) { - // TODO: check min <= max in tests +std::shared_ptr RandomArrayGenerator::ArrayOf(const Field& field, int64_t length) { #define VALIDATE_RANGE(PARAM, MIN, MAX) \ if (PARAM < MIN || PARAM > MAX) { \ ABORT_NOT_OK(Status::Invalid(field.ToString(), ": ", ARROW_STRINGIFY(PARAM), \ @@ -665,15 +664,15 @@ Result> GenerateArray(const Field& field, int64_t length, ABORT_NOT_OK( \ Status::Invalid(field.ToString(), ": min ", MIN, " must be <= max ", MAX)); \ } -#define GENERATE_INTEGRAL_CASE_VIEW(BASE_TYPE, VIEW_TYPE) \ - case VIEW_TYPE::type_id: { \ - const BASE_TYPE::c_type min_value = GetMetadata( \ - field.metadata().get(), "min", std::numeric_limits::min()); \ - const BASE_TYPE::c_type max_value = GetMetadata( \ - field.metadata().get(), "max", std::numeric_limits::max()); \ - VALIDATE_MIN_MAX(min_value, max_value); \ - return generator->Numeric(length, min_value, max_value, null_probability) \ - ->View(field.type()); \ +#define GENERATE_INTEGRAL_CASE_VIEW(BASE_TYPE, VIEW_TYPE) \ + case VIEW_TYPE::type_id: { \ + const BASE_TYPE::c_type min_value = GetMetadata( \ + field.metadata().get(), "min", std::numeric_limits::min()); \ + const BASE_TYPE::c_type max_value = GetMetadata( \ + field.metadata().get(), "max", std::numeric_limits::max()); \ + VALIDATE_MIN_MAX(min_value, max_value); \ + return *Numeric(length, min_value, max_value, null_probability) \ + ->View(field.type()); \ } #define GENERATE_INTEGRAL_CASE(ARROW_TYPE) \ GENERATE_INTEGRAL_CASE_VIEW(ARROW_TYPE, ARROW_TYPE) @@ -687,33 +686,33 @@ Result> GenerateArray(const Field& field, int64_t length, GetMetadata(field.metadata().get(), "nan_probability", 0); \ VALIDATE_MIN_MAX(min_value, max_value); \ VALIDATE_RANGE(nan_probability, 0.0, 1.0); \ - return generator->GENERATOR_FUNC(length, min_value, max_value, null_probability, \ - nan_probability); \ + return 
GENERATOR_FUNC(length, min_value, max_value, null_probability, \ + nan_probability); \ } // Don't use compute::Sum since that may not get built -#define GENERATE_LIST_CASE(ARRAY_TYPE) \ - case ARRAY_TYPE::TypeClass::type_id: { \ - const auto min_length = GetMetadata( \ - field.metadata().get(), "min_length", 0); \ - const auto max_length = GetMetadata( \ - field.metadata().get(), "max_length", 1024); \ - const auto lengths = internal::checked_pointer_cast< \ - CTypeTraits::ArrayType>( \ - generator->Numeric::ArrowType>( \ - length, min_length, max_length, null_probability)); \ - int64_t values_length = 0; \ - for (const auto& length : *lengths) { \ - if (length.has_value()) values_length += *length; \ - } \ - const auto force_empty_nulls = \ - GetMetadata(field.metadata().get(), "force_empty_nulls", false); \ - const auto values = GenerateArray( \ - *internal::checked_pointer_cast(field.type()) \ - ->value_field(), \ - values_length, generator); \ - const auto offsets = OffsetsFromLengthsArray(lengths.get(), force_empty_nulls); \ - return ARRAY_TYPE::FromArrays(*offsets, **values); \ +#define GENERATE_LIST_CASE(ARRAY_TYPE) \ + case ARRAY_TYPE::TypeClass::type_id: { \ + const auto min_length = GetMetadata( \ + field.metadata().get(), "min_length", 0); \ + const auto max_length = GetMetadata( \ + field.metadata().get(), "max_length", 1024); \ + const auto lengths = internal::checked_pointer_cast< \ + CTypeTraits::ArrayType>( \ + Numeric::ArrowType>( \ + length, min_length, max_length, null_probability)); \ + int64_t values_length = 0; \ + for (const auto& length : *lengths) { \ + if (length.has_value()) values_length += *length; \ + } \ + const auto force_empty_nulls = \ + GetMetadata(field.metadata().get(), "force_empty_nulls", false); \ + const auto values = \ + ArrayOf(*internal::checked_pointer_cast(field.type()) \ + ->value_field(), \ + values_length); \ + const auto offsets = OffsetsFromLengthsArray(lengths.get(), force_empty_nulls); \ + return *ARRAY_TYPE::FromArrays(*offsets, *values); \ } const double null_probability = @@ -729,7 +728,7 @@ Result> GenerateArray(const Field& field, int64_t length, case Type::type::BOOL: { const double true_probability = GetMetadata(field.metadata().get(), "true_probability", 0.5); - return generator->Boolean(length, true_probability, null_probability); + return Boolean(length, true_probability, null_probability); } GENERATE_INTEGRAL_CASE(UInt8Type); @@ -753,13 +752,12 @@ Result> GenerateArray(const Field& field, int64_t length, const auto unique_values = GetMetadata(field.metadata().get(), "unique", -1); if (unique_values > 0) { - return generator - ->StringWithRepeats(length, unique_values, min_length, max_length, - null_probability) - ->View(field.type()); + return *StringWithRepeats(length, unique_values, min_length, max_length, + null_probability) + ->View(field.type()); } - return generator->String(length, min_length, max_length, null_probability) - ->View(field.type()); + return *String(length, min_length, max_length, null_probability) + ->View(field.type()); } case Type::type::DECIMAL128: @@ -767,8 +765,7 @@ Result> GenerateArray(const Field& field, int64_t length, case Type::type::FIXED_SIZE_BINARY: { auto byte_width = internal::checked_pointer_cast(field.type())->byte_width(); - return generator->FixedSizeBinary(length, byte_width, null_probability) - ->View(field.type()); + return *FixedSizeBinary(length, byte_width, null_probability)->View(field.type()); } GENERATE_INTEGRAL_CASE_VIEW(Int32Type, Date32Type); @@ -789,11 +786,11 @@ Result> 
GenerateArray(const Field& field, int64_t length, std::vector field_names; for (int i = 0; i < field.type()->num_fields(); i++) { const auto& child_field = field.type()->field(i); - child_arrays[i] = *GenerateArray(*child_field, length, generator); + child_arrays[i] = ArrayOf(*child_field, length); field_names.push_back(child_field->name()); } - return StructArray::Make(child_arrays, field_names, - generator->NullBitmap(length, null_probability)); + return *StructArray::Make(child_arrays, field_names, + NullBitmap(length, null_probability)); } case Type::type::SPARSE_UNION: @@ -801,12 +798,12 @@ Result> GenerateArray(const Field& field, int64_t length, ArrayVector child_arrays(field.type()->num_fields()); for (int i = 0; i < field.type()->num_fields(); i++) { const auto& child_field = field.type()->field(i); - child_arrays[i] = *GenerateArray(*child_field, length, generator); + child_arrays[i] = ArrayOf(*child_field, length); } auto array = field.type()->id() == Type::type::SPARSE_UNION - ? generator->SparseUnion(child_arrays, length) - : generator->DenseUnion(child_arrays, length); - return array->View(field.type()); + ? SparseUnion(child_arrays, length) + : DenseUnion(child_arrays, length); + return *array->View(field.type()); } case Type::type::DICTIONARY: { @@ -814,9 +811,9 @@ Result> GenerateArray(const Field& field, int64_t length, GetMetadata(field.metadata().get(), "values", 4); auto dict_type = internal::checked_pointer_cast(field.type()); // TODO: no way to control generation of dictionary - auto values = *GenerateArray( - *arrow::field("temporary", dict_type->value_type(), /*nullable=*/false), - values_length, generator); + auto values = + ArrayOf(*arrow::field("temporary", dict_type->value_type(), /*nullable=*/false), + values_length); auto merged = field.metadata() ? 
field.metadata() : key_value_metadata({}, {}); if (merged->Contains("min")) ABORT_NOT_OK(Status::Invalid(field.ToString(), ": cannot specify min")); @@ -824,10 +821,10 @@ Result> GenerateArray(const Field& field, int64_t length, ABORT_NOT_OK(Status::Invalid(field.ToString(), ": cannot specify max")); merged = merged->Merge(*key_value_metadata( {{"min", "0"}, {"max", std::to_string(values_length - 1)}})); - auto indices = *GenerateArray( + auto indices = ArrayOf( *arrow::field("temporary", dict_type->index_type(), field.nullable(), merged), - length, generator); - return DictionaryArray::FromArrays(field.type(), indices, values); + length); + return *DictionaryArray::FromArrays(field.type(), indices, values); } case Type::type::MAP: { @@ -836,12 +833,12 @@ Result> GenerateArray(const Field& field, int64_t length, const auto force_empty_nulls = GetMetadata(field.metadata().get(), "force_empty_nulls", false); auto map_type = internal::checked_pointer_cast(field.type()); - auto keys = *GenerateArray(*map_type->key_field(), values_length, generator); - auto items = *GenerateArray(*map_type->item_field(), values_length, generator); + auto keys = ArrayOf(*map_type->key_field(), values_length); + auto items = ArrayOf(*map_type->item_field(), values_length); // need N + 1 offsets to have N values - auto offsets = generator->Offsets(length + 1, 0, values_length, null_probability, - force_empty_nulls); - return MapArray::FromArrays(map_type, offsets, keys, items); + auto offsets = + Offsets(length + 1, 0, values_length, null_probability, force_empty_nulls); + return *MapArray::FromArrays(map_type, offsets, keys, items); } case Type::type::EXTENSION: @@ -852,8 +849,8 @@ Result> GenerateArray(const Field& field, int64_t length, case Type::type::FIXED_SIZE_LIST: { auto list_type = internal::checked_pointer_cast(field.type()); const int64_t values_length = list_type->list_size() * length; - auto values = *GenerateArray(*list_type->value_field(), values_length, generator); - auto null_bitmap = generator->NullBitmap(length, null_probability); + auto values = ArrayOf(*list_type->value_field(), values_length); + auto null_bitmap = NullBitmap(length, null_probability); return std::make_shared(list_type, length, values, null_bitmap); } @@ -872,8 +869,8 @@ Result> GenerateArray(const Field& field, int64_t length, Status::NotImplemented("Generating random array with repeated values for " "large string/large binary types")); } - return generator->LargeString(length, min_length, max_length, null_probability) - ->View(field.type()); + return *LargeString(length, min_length, max_length, null_probability) + ->View(field.type()); } GENERATE_LIST_CASE(LargeListArray); @@ -893,23 +890,24 @@ Result> GenerateArray(const Field& field, int64_t length, return nullptr; } -} // namespace +std::shared_ptr RandomArrayGenerator::BatchOf( + const FieldVector& fields, int64_t length) { + std::vector> arrays(fields.size()); + for (size_t i = 0; i < fields.size(); i++) { + const auto& field = fields[i]; + arrays[i] = ArrayOf(*field, length); + } + return RecordBatch::Make(schema(fields), length, std::move(arrays)); +} std::shared_ptr GenerateArray(const Field& field, int64_t length, SeedType seed) { - RandomArrayGenerator generator(seed); - return *GenerateArray(field, length, &generator); + return RandomArrayGenerator(seed).ArrayOf(field, length); } std::shared_ptr GenerateBatch(const FieldVector& fields, int64_t length, SeedType seed) { - std::vector> arrays(fields.size()); - RandomArrayGenerator generator(seed); - for (size_t i = 
0; i < fields.size(); i++) { - const auto& field = fields[i]; - arrays[i] = *GenerateArray(*field, length, &generator); - } - return RecordBatch::Make(schema(fields), length, std::move(arrays)); + return RandomArrayGenerator(seed).BatchOf(fields, length); } } // namespace random diff --git a/cpp/src/arrow/testing/random.h b/cpp/src/arrow/testing/random.h index af8066a9fc7..5c6b0b4ae77 100644 --- a/cpp/src/arrow/testing/random.h +++ b/cpp/src/arrow/testing/random.h @@ -355,6 +355,57 @@ class ARROW_TESTING_EXPORT RandomArrayGenerator { std::shared_ptr ArrayOf(std::shared_ptr type, int64_t size, double null_probability); + /// \brief Generate an array with random data based on the given field. See BatchOf + /// for usage info. + std::shared_ptr ArrayOf(const Field& field, int64_t size); + + /// \brief Generate a record batch with random data of the specified length. + /// + /// Generation options are read from key-value metadata for each field, and may be + /// specified at any nesting level. For example, generation options for the child + /// values of a list array can be specified by constructing the list type with + /// list(field("item", int8(), options_metadata)) + /// + /// The following options are supported: + /// + /// For all types except NullType: + /// - null_probability (double): range [0.0, 1.0] the probability of a null value. + /// Default/value is 0.0 if the field is marked non-nullable, else it is 0.01 + /// + /// For all numeric types T: + /// - min (T::c_type): the minimum value to generate (inclusive), default + /// std::numeric_limits::min() + /// - max (T::c_type): the maximum value to generate (inclusive), default + /// std::numeric_limits::max() + /// Note this means that, for example, min/max are int16_t values for HalfFloatType. + /// + /// For floating point types T for which is_physical_floating_type: + /// - nan_probability (double): range [0.0, 1.0] the probability of a NaN value. + /// + /// For BooleanType: + /// - true_probability (double): range [0.0, 1.0] the probability of a true. + /// + /// For DictionaryType: + /// - values (int32_t): the size of the dictionary. + /// Other properties are passed to the generator for the dictionary indices. However, + /// min and max cannot be specified. Note it is not possible to otherwise customize + /// the generation of dictionary values. + /// + /// For list, string, and binary types T, including their large variants: + /// - min_length (T::offset_type): the minimum length of the child to generate, + /// default 0 + /// - max_length (T::offset_type): the minimum length of the child to generate, + /// default 1024 + /// + /// For string and binary types T (not including their large variants): + /// - unique (int32_t): if positive, this many distinct values will be generated + /// and all array values will be one of these values, default -1 + /// + /// For MapType: + /// - values (int32_t): the number of key-value pairs to generate, which will be + /// partitioned among the array values. + std::shared_ptr BatchOf(const FieldVector& fields, int64_t size); + SeedType seed() { return seed_distribution_(seed_rng_); } private: @@ -362,56 +413,12 @@ class ARROW_TESTING_EXPORT RandomArrayGenerator { std::default_random_engine seed_rng_; }; -/// Generate a record batch with random data of the specified length. -/// -/// Generation options are read from key-value metadata for each field, and may be -/// specified at any nesting level. 
For example, generation options for the child values -/// of a list array can be specified by constructing the list type with -/// list(field("item", int8(), options_metadata)) -/// -/// The following options are supported: -/// -/// For all types except NullType: -/// - null_probability (double): range [0.0, 1.0] the probability of a null value. -/// Default/value is 0.0 if the field is marked non-nullable, else it is 0.01 -/// -/// For all numeric types T: -/// - min (T::c_type): the minimum value to generate (inclusive), default -/// std::numeric_limits::min() -/// - max (T::c_type): the maximum value to generate (inclusive), default -/// std::numeric_limits::max() -/// Note this means that, for example, min/max are int16_t values for HalfFloatType. -/// -/// For floating point types T for which is_physical_floating_type: -/// - nan_probability (double): range [0.0, 1.0] the probability of a NaN value. -/// -/// For BooleanType: -/// - true_probability (double): range [0.0, 1.0] the probability of a true. -/// -/// For DictionaryType: -/// - values (int32_t): the size of the dictionary. -/// Other properties are passed to the generator for the dictionary indices. However, min -/// and max cannot be specified. Note it is not possible to otherwise customize the -/// generation of dictionary values. -/// -/// For list, string, and binary types T, including their large variants: -/// - min_length (T::offset_type): the minimum length of the child to generate, -/// default 0 -/// - max_length (T::offset_type): the minimum length of the child to generate, -/// default 1024 -/// -/// For string and binary types T (not including their large variants): -/// - unique (int32_t): if positive, this many distinct values will be generated -/// and all array values will be one of these values, default -1 -/// -/// For MapType: -/// - values (int32_t): the number of key-value pairs to generate, which will be -/// partitioned among the array values. +/// Generate a record batch with random data. See RandomArrayGenerator::BatchOf. ARROW_TESTING_EXPORT std::shared_ptr GenerateBatch(const FieldVector& fields, int64_t size, SeedType seed); -/// Generate an array with random data. See GenerateBatch for usage info. +/// Generate an array with random data. See RandomArrayGenerator::BatchOf. ARROW_TESTING_EXPORT std::shared_ptr GenerateArray(const Field& field, int64_t size, SeedType seed);
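For reference, a minimal usage sketch of the API as it stands at the end of this series. The helper name MakeExampleBatch, the field names, the metadata values, and the seed are illustrative assumptions, not taken from the patches; only the RandomArrayGenerator::ArrayOf/BatchOf methods and the metadata keys come from the changes above.

    #include "arrow/testing/random.h"

    using namespace arrow;

    std::shared_ptr<RecordBatch> MakeExampleBatch() {
      random::RandomArrayGenerator gen(/*seed=*/0xDEADBEEF);
      FieldVector fields = {
          field("x", int32(), /*nullable=*/true,
                key_value_metadata(
                    {{"min", "0"}, {"max", "1000"}, {"null_probability", "0.1"}})),
          field("tags", list(int8()), /*nullable=*/false,
                key_value_metadata({{"min_length", "0"}, {"max_length", "4"}}))};
      // A single column can be drawn from one field ...
      std::shared_ptr<Array> x_values = gen.ArrayOf(*fields[0], /*size=*/128);
      // ... or a whole record batch, driven by the per-field key-value metadata.
      return gen.BatchOf(fields, /*size=*/128);
    }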