From c9ec74c481602b1a425ac414e2a4034426377627 Mon Sep 17 00:00:00 2001
From: David Li
Date: Mon, 15 Mar 2021 17:45:06 -0400
Subject: [PATCH 01/11] ARROW-11745: [C++] Add helper to generate random record
batches by schema
---
cpp/src/arrow/testing/CMakeLists.txt | 4 +
cpp/src/arrow/testing/random.cc | 289 ++++++++++++++++++++++++++-
cpp/src/arrow/testing/random.h | 8 +
cpp/src/arrow/testing/random_test.cc | 195 ++++++++++++++++++
4 files changed, 486 insertions(+), 10 deletions(-)
create mode 100644 cpp/src/arrow/testing/random_test.cc
diff --git a/cpp/src/arrow/testing/CMakeLists.txt b/cpp/src/arrow/testing/CMakeLists.txt
index 125b385ad9e..073224d519b 100644
--- a/cpp/src/arrow/testing/CMakeLists.txt
+++ b/cpp/src/arrow/testing/CMakeLists.txt
@@ -17,6 +17,10 @@
arrow_install_all_headers("arrow/testing")
+if(ARROW_BUILD_TESTS)
+ add_arrow_test(random_test)
+endif()
+
# json_integration_test is two things at the same time:
# - an executable that can be called to answer integration test requests
# - a self-(unit)test for the C++ side of integration testing
diff --git a/cpp/src/arrow/testing/random.cc b/cpp/src/arrow/testing/random.cc
index 7bf5dd22d43..cc4cba3abca 100644
--- a/cpp/src/arrow/testing/random.cc
+++ b/cpp/src/arrow/testing/random.cc
@@ -39,7 +39,9 @@
#include "arrow/util/bitmap_reader.h"
#include "arrow/util/checked_cast.h"
#include "arrow/util/decimal.h"
+#include "arrow/util/key_value_metadata.h"
#include "arrow/util/logging.h"
+#include "arrow/util/value_parsing.h"
namespace arrow {
@@ -369,12 +371,16 @@ std::shared_ptr RandomArrayGenerator::FixedSizeBinary(int64_t size,
std::move(null_bitmap), null_count);
}
-std::shared_ptr RandomArrayGenerator::Offsets(int64_t size, int32_t first_offset,
- int32_t last_offset,
- double null_probability,
- bool force_empty_nulls) {
- using GenOpt = GenerateOptions>;
- GenOpt options(seed(), first_offset, last_offset, null_probability);
+namespace {
+template
+std::shared_ptr GenerateOffsets(SeedType seed, int64_t size,
+ typename ArrayType::value_type first_offset,
+ typename ArrayType::value_type last_offset,
+ double null_probability, bool force_empty_nulls) {
+ using GenOpt =
+ GenerateOptions>;
+ GenOpt options(seed, first_offset, last_offset, null_probability);
BufferVector buffers{2};
@@ -387,8 +393,9 @@ std::shared_ptr RandomArrayGenerator::Offsets(int64_t size, int32_t first
arrow::BitUtil::SetBit(null_bitmap, 0);
arrow::BitUtil::SetBit(null_bitmap, size - 1);
- buffers[1] = *AllocateBuffer(sizeof(int32_t) * size);
- auto data = reinterpret_cast(buffers[1]->mutable_data());
+ buffers[1] = *AllocateBuffer(sizeof(typename ArrayType::value_type) * size);
+ auto data =
+ reinterpret_cast(buffers[1]->mutable_data());
options.GenerateTypedData(data, size);
// Ensure offsets are in increasing order
std::sort(data, data + size);
@@ -410,8 +417,27 @@ std::shared_ptr RandomArrayGenerator::Offsets(int64_t size, int32_t first
}
}
- auto array_data = ArrayData::Make(int32(), size, buffers, null_count);
- return std::make_shared(array_data);
+ auto array_data = ArrayData::Make(std::make_shared(),
+ size, buffers, null_count);
+ return std::make_shared(array_data);
+}
+} // namespace
+
+std::shared_ptr RandomArrayGenerator::Offsets(int64_t size, int32_t first_offset,
+ int32_t last_offset,
+ double null_probability,
+ bool force_empty_nulls) {
+ return GenerateOffsets>(seed(), size, first_offset, last_offset,
+ null_probability, force_empty_nulls);
+}
+
+std::shared_ptr RandomArrayGenerator::LargeOffsets(int64_t size,
+ int64_t first_offset,
+ int64_t last_offset,
+ double null_probability,
+ bool force_empty_nulls) {
+ return GenerateOffsets>(seed(), size, first_offset, last_offset,
+ null_probability, force_empty_nulls);
}
std::shared_ptr RandomArrayGenerator::List(const Array& values, int64_t size,
@@ -558,5 +584,248 @@ std::shared_ptr RandomArrayGenerator::ArrayOf(std::shared_ptr t
return RandomArrayGeneratorOfImpl{this, type, size, null_probability, nullptr}.Finish();
}
+namespace {
+template
+typename T::c_type GetMetadata(const KeyValueMetadata* metadata, const std::string& key,
+ typename T::c_type default_value) {
+ if (!metadata) return default_value;
+ const auto index = metadata->FindKey(key);
+ if (index < 0) return default_value;
+ const auto& value = metadata->value(index);
+ typename T::c_type output{};
+ auto type = checked_pointer_cast(TypeTraits::type_singleton());
+ if (!internal::ParseValue(*type, value.data(), value.length(), &output)) {
+ ABORT_NOT_OK(Status::Invalid("Could not parse ", key, " = ", value));
+ }
+ return output;
+}
+
+Result> GenerateArray(const Field& field, int64_t length,
+ RandomArrayGenerator* generator) {
+#define GENERATE_INTEGRAL_CASE_VIEW(BASE_TYPE, VIEW_TYPE) \
+ case VIEW_TYPE::type_id: { \
+ const BASE_TYPE::c_type min_value = GetMetadata( \
+ field.metadata().get(), "min", std::numeric_limits::min()); \
+ const BASE_TYPE::c_type max_value = GetMetadata( \
+ field.metadata().get(), "max", std::numeric_limits::max()); \
+ return generator->Numeric(length, min_value, max_value, null_probability) \
+ ->View(field.type()); \
+ }
+#define GENERATE_INTEGRAL_CASE(ARROW_TYPE) \
+ GENERATE_INTEGRAL_CASE_VIEW(ARROW_TYPE, ARROW_TYPE)
+#define GENERATE_FLOATING_CASE(ARROW_TYPE, GENERATOR_FUNC) \
+ case ARROW_TYPE::type_id: { \
+ const ARROW_TYPE::c_type min_value = GetMetadata( \
+ field.metadata().get(), "min", std::numeric_limits::min()); \
+ const ARROW_TYPE::c_type max_value = GetMetadata( \
+ field.metadata().get(), "max", std::numeric_limits::max()); \
+ const double nan_probability = \
+ GetMetadata(field.metadata().get(), "nan_probability", 10); \
+ return generator->GENERATOR_FUNC(length, min_value, max_value, null_probability, \
+ nan_probability); \
+ }
+
+ const double null_probability =
+ field.nullable()
+ ? GetMetadata(field.metadata().get(), "null_probability", 0.01)
+ : 0.0;
+ switch (field.type()->id()) {
+ case Type::type::NA:
+ return std::make_shared(length);
+
+ case Type::type::BOOL: {
+ const double true_probability =
+ GetMetadata(field.metadata().get(), "true_probability", 0.5);
+ return generator->Boolean(length, true_probability, null_probability);
+ }
+
+ GENERATE_INTEGRAL_CASE(UInt8Type);
+ GENERATE_INTEGRAL_CASE(Int8Type);
+ GENERATE_INTEGRAL_CASE(UInt16Type);
+ GENERATE_INTEGRAL_CASE(Int16Type);
+ GENERATE_INTEGRAL_CASE(UInt32Type);
+ GENERATE_INTEGRAL_CASE(Int32Type);
+ GENERATE_INTEGRAL_CASE(UInt64Type);
+ GENERATE_INTEGRAL_CASE(Int64Type);
+ GENERATE_INTEGRAL_CASE_VIEW(Int16Type, HalfFloatType);
+ GENERATE_FLOATING_CASE(FloatType, Float32);
+ GENERATE_FLOATING_CASE(DoubleType, Float64);
+
+ case Type::type::STRING:
+ case Type::type::BINARY: {
+ const int32_t min_length = GetMetadata(field.metadata().get(), "min", 0);
+ const int32_t max_length =
+ GetMetadata(field.metadata().get(), "max", 1024);
+ const int32_t unique_values =
+ GetMetadata(field.metadata().get(), "unique", -1);
+ if (unique_values > 0) {
+ return generator
+ ->StringWithRepeats(length, unique_values, min_length, max_length,
+ null_probability)
+ ->View(field.type());
+ }
+ return generator->String(length, min_length, max_length, null_probability)
+ ->View(field.type());
+ }
+
+ case Type::type::DECIMAL128:
+ case Type::type::DECIMAL256:
+ case Type::type::FIXED_SIZE_BINARY: {
+ auto byte_width =
+ internal::checked_pointer_cast(field.type())->byte_width();
+ return generator->FixedSizeBinary(length, byte_width, null_probability)
+ ->View(field.type());
+ }
+
+ GENERATE_INTEGRAL_CASE_VIEW(Int32Type, Date32Type);
+ GENERATE_INTEGRAL_CASE_VIEW(Int64Type, Date64Type);
+ GENERATE_INTEGRAL_CASE_VIEW(Int64Type, TimestampType);
+ GENERATE_INTEGRAL_CASE_VIEW(Int32Type, Time32Type);
+ GENERATE_INTEGRAL_CASE_VIEW(Int64Type, Time64Type);
+ GENERATE_INTEGRAL_CASE_VIEW(Int32Type, MonthIntervalType);
+
+ // This isn't as flexible as it could be, but the array-of-structs layout of this
+ // type means it's not a (useful) composition of other generators
+ GENERATE_INTEGRAL_CASE_VIEW(Int64Type, DayTimeIntervalType);
+
+ case Type::type::LIST: {
+ const int32_t values_length =
+ GetMetadata(field.metadata().get(), "values", length);
+ const bool force_empty_nulls =
+ GetMetadata(field.metadata().get(), "force_empty_nulls", false);
+ auto values = GenerateArray(
+ *internal::checked_pointer_cast(field.type())->value_field(),
+ values_length, generator);
+ // N + 1 offsets are needed to delimit N list slots (here N = length)
+ auto offsets = generator->Offsets(length + 1, 0, values_length, null_probability,
+ force_empty_nulls);
+ return ListArray::FromArrays(*offsets, **values);
+ }
+
+ case Type::type::STRUCT: {
+ ArrayVector child_arrays(field.type()->num_fields());
+ std::vector field_names;
+ for (int i = 0; i < field.type()->num_fields(); i++) {
+ const auto& child_field = field.type()->field(i);
+ child_arrays[i] = *GenerateArray(*child_field, length, generator);
+ field_names.push_back(child_field->name());
+ }
+ return StructArray::Make(child_arrays, field_names,
+ generator->NullBitmap(length, null_probability));
+ }
+
+ case Type::type::SPARSE_UNION:
+ case Type::type::DENSE_UNION: {
+ ArrayVector child_arrays(field.type()->num_fields());
+ for (int i = 0; i < field.type()->num_fields(); i++) {
+ const auto& child_field = field.type()->field(i);
+ child_arrays[i] = *GenerateArray(*child_field, length, generator);
+ }
+ return field.type()->id() == Type::type::SPARSE_UNION
+ ? generator->SparseUnion(child_arrays, length)
+ : generator->DenseUnion(child_arrays, length);
+ }
+
+ case Type::type::DICTIONARY: {
+ const int64_t values_length =
+ GetMetadata(field.metadata().get(), "values", 4);
+ auto dict_type = internal::checked_pointer_cast(field.type());
+ // TODO: dictionary values always use default options; allow configuring them
+ auto values = *GenerateArray(
+ *arrow::field("temporary", dict_type->value_type(), /*nullable=*/false),
+ values_length, generator);
+ auto merged = field.metadata() ? field.metadata() : key_value_metadata({}, {});
+ merged = merged->Merge(*key_value_metadata(
+ {{"min", "0"}, {"max", std::to_string(values_length - 1)}}));
+ auto indices = *GenerateArray(
+ *arrow::field("temporary", dict_type->index_type(), field.nullable(), merged),
+ length, generator);
+ return DictionaryArray::FromArrays(field.type(), indices, values);
+ }
+
+ case Type::type::MAP: {
+ const int32_t values_length =
+ GetMetadata(field.metadata().get(), "values", length);
+ const bool force_empty_nulls =
+ GetMetadata(field.metadata().get(), "force_empty_nulls", false);
+ auto map_type = internal::checked_pointer_cast(field.type());
+ auto keys = *GenerateArray(*map_type->key_field(), values_length, generator);
+ auto items = *GenerateArray(*map_type->item_field(), values_length, generator);
+ // N + 1 offsets are needed to delimit N map slots (here N = length)
+ auto offsets = generator->Offsets(length + 1, 0, values_length, null_probability,
+ force_empty_nulls);
+ return MapArray::FromArrays(map_type, offsets, keys, items);
+ }
+
+ case Type::type::EXTENSION:
+ // Could be supported by generating the storage type (though any extension
+ // invariants wouldn't be preserved)
+ break;
+
+ case Type::type::FIXED_SIZE_LIST: {
+ auto list_type = internal::checked_pointer_cast(field.type());
+ const int64_t values_length = list_type->list_size() * length;
+ auto values = *GenerateArray(*list_type->value_field(), values_length, generator);
+ auto null_bitmap = generator->NullBitmap(length, null_probability);
+ return std::make_shared(list_type, length, values, null_bitmap);
+ }
+
+ GENERATE_INTEGRAL_CASE_VIEW(Int64Type, DurationType);
+
+ case Type::type::LARGE_STRING:
+ case Type::type::LARGE_BINARY: {
+ const int32_t min_length = GetMetadata(field.metadata().get(), "min", 0);
+ const int32_t max_length =
+ GetMetadata(field.metadata().get(), "max", 1024);
+ const int32_t unique_values =
+ GetMetadata(field.metadata().get(), "unique", -1);
+ if (unique_values > 0) {
+ ABORT_NOT_OK(
+ Status::NotImplemented("Generating random array with repeated values for "
+ "large string/large binary types"));
+ }
+ return generator->LargeString(length, min_length, max_length, null_probability)
+ ->View(field.type());
+ }
+
+ case Type::type::LARGE_LIST: {
+ const int64_t values_length =
+ GetMetadata(field.metadata().get(), "values", length);
+ const bool force_empty_nulls =
+ GetMetadata(field.metadata().get(), "force_empty_nulls", false);
+ auto values = GenerateArray(
+ *internal::checked_pointer_cast(field.type())->value_field(),
+ values_length, generator);
+ // N + 1 offsets are needed to delimit N list slots (here N = length)
+ auto offsets = generator->LargeOffsets(length + 1, 0, values_length,
+ null_probability, force_empty_nulls);
+ return LargeListArray::FromArrays(*offsets, **values);
+ }
+
+ default:
+ break;
+ }
+#undef GENERATE_INTEGRAL_CASE_VIEW
+#undef GENERATE_INTEGRAL_CASE
+#undef GENERATE_FLOATING_CASE
+
+ ABORT_NOT_OK(
+ Status::NotImplemented("Generating random array for field ", field.ToString()));
+ return nullptr;
+}
+
+} // namespace
+
+std::shared_ptr Generate(const FieldVector& fields, int64_t length,
+ SeedType seed) {
+ std::vector> arrays(fields.size());
+ RandomArrayGenerator generator(seed);
+ for (size_t i = 0; i < fields.size(); i++) {
+ const auto& field = fields[i];
+ arrays[i] = *GenerateArray(*field, length, &generator);
+ }
+ return RecordBatch::Make(schema(fields), length, std::move(arrays));
+}
+
} // namespace random
} // namespace arrow
diff --git a/cpp/src/arrow/testing/random.h b/cpp/src/arrow/testing/random.h
index 2358ab0911f..c57093da0d0 100644
--- a/cpp/src/arrow/testing/random.h
+++ b/cpp/src/arrow/testing/random.h
@@ -249,6 +249,10 @@ class ARROW_TESTING_EXPORT RandomArrayGenerator {
double null_probability = 0,
bool force_empty_nulls = false);
+ std::shared_ptr LargeOffsets(int64_t size, int64_t first_offset,
+ int64_t last_offset, double null_probability = 0,
+ bool force_empty_nulls = false);
+
/// \brief Generate a random StringArray
///
/// \param[in] size the size of the array to generate
@@ -358,6 +362,10 @@ class ARROW_TESTING_EXPORT RandomArrayGenerator {
std::default_random_engine seed_rng_;
};
+ARROW_TESTING_EXPORT
+std::shared_ptr Generate(const FieldVector& fields, int64_t size,
+ SeedType seed);
+
} // namespace random
//
diff --git a/cpp/src/arrow/testing/random_test.cc b/cpp/src/arrow/testing/random_test.cc
new file mode 100644
index 00000000000..6bf86fa6af8
--- /dev/null
+++ b/cpp/src/arrow/testing/random_test.cc
@@ -0,0 +1,195 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include
+
+#include "arrow/array.h"
+#include "arrow/record_batch.h"
+#include "arrow/testing/gtest_util.h"
+#include "arrow/testing/random.h"
+#include "arrow/type.h"
+#include "arrow/util/key_value_metadata.h"
+
+namespace arrow {
+namespace random {
+
+class RandomArrayTest : public ::testing::TestWithParam> {
+ protected:
+ std::shared_ptr GetField() { return GetParam(); }
+};
+
+template
+class RandomNumericArrayTest : public ::testing::Test {
+ protected:
+ std::shared_ptr GetField() { return field("field0", std::make_shared()); }
+
+ std::shared_ptr> Downcast(std::shared_ptr array) {
+ return internal::checked_pointer_cast>(array);
+ }
+};
+
+TEST_P(RandomArrayTest, GenerateArray) {
+ auto field = GetField();
+ auto batch = Generate({field}, 128, 0xDEADBEEF);
+ AssertSchemaEqual(schema({field}), batch->schema());
+ auto array = batch->column(0);
+ ASSERT_EQ(128, array->length());
+ ASSERT_OK(array->ValidateFull());
+}
+
+TEST_P(RandomArrayTest, GenerateNonNullArray) {
+ auto field =
+ GetField()->WithMetadata(key_value_metadata({{"null_probability", "0.0"}}));
+ if (field->type()->id() == Type::type::NA) {
+ GTEST_SKIP() << "Cannot generate non-null null arrays";
+ }
+ auto batch = Generate({field}, 128, 0xDEADBEEF);
+ AssertSchemaEqual(schema({field}), batch->schema());
+ auto array = batch->column(0);
+ ASSERT_OK(array->ValidateFull());
+ ASSERT_EQ(0, array->null_count());
+}
+
+TEST_P(RandomArrayTest, GenerateNonNullableArray) {
+ auto field = GetField()->WithNullable(false);
+ if (field->type()->id() == Type::type::NA) {
+ GTEST_SKIP() << "Cannot generate non-null null arrays";
+ }
+ auto batch = Generate({field}, 128, 0xDEADBEEF);
+ AssertSchemaEqual(schema({field}), batch->schema());
+ auto array = batch->column(0);
+ ASSERT_OK(array->ValidateFull());
+ ASSERT_EQ(0, array->null_count());
+}
+
+struct FieldParamName {
+ template
+ std::string operator()(const ::testing::TestParamInfo& info) const {
+ return std::to_string(info.index) + info.param->name();
+ }
+};
+
+auto values = ::testing::Values(
+ field("null", null()), field("bool", boolean()), field("uint8", uint8()),
+ field("int8", int8()), field("uint16", uint16()), field("int16", int16()),
+ field("uint32", uint32()), field("int32", int32()), field("uint64", uint64()),
+ field("int64", int64()), field("float16", float16()), field("float32", float32()),
+ field("float64", float64()), field("string", utf8()), field("binary", binary()),
+ field("fixed_size_binary", fixed_size_binary(8)),
+ field("decimal128", decimal128(8, 3)), field("decimal256", decimal256(16, 4)),
+ field("date32", date32()), field("date64", date64()),
+ field("timestampns", timestamp(TimeUnit::NANO)),
+ field("timestamps", timestamp(TimeUnit::SECOND, "America/Phoenix")),
+ field("time32ms", time32(TimeUnit::MILLI)), field("time64ns", time64(TimeUnit::NANO)),
+ field("time32s", time32(TimeUnit::SECOND)),
+ field("time64us", time64(TimeUnit::MICRO)), field("month_interval", month_interval()),
+ field("daytime_interval", day_time_interval()), field("listint8", list(int8())),
+ field("listlistint8", list(list(int8()))),
+ field("listint8emptynulls", list(int8()), true,
+ key_value_metadata({{"force_empty_nulls", "true"}})),
+ field("listint81024values", list(int8()), true,
+ key_value_metadata({{"values", "1024"}})),
+ field("structints", struct_({
+ field("int8", int8()),
+ field("int16", int16()),
+ field("int32", int32()),
+ })),
+ field("structnested", struct_({
+ field("string", utf8()),
+ field("list", list(int64())),
+ field("timestamp", timestamp(TimeUnit::MILLI)),
+ })),
+ field("sparseunion", sparse_union({
+ field("int8", int8()),
+ field("int16", int16()),
+ field("int32", int32()),
+ })),
+ field("denseunion", dense_union({
+ field("int8", int8()),
+ field("int16", int16()),
+ field("int32", int32()),
+ })),
+ field("dictionary", dictionary(int8(), utf8())), field("map", map(int8(), utf8())),
+ field("fixedsizelist", fixed_size_list(int8(), 4)),
+ field("durationns", duration(TimeUnit::NANO)), field("largestring", large_utf8()),
+ field("largebinary", large_binary()),
+ field("largelistlistint8", large_list(list(int8()))));
+
+INSTANTIATE_TEST_SUITE_P(
+ TestRandomArrayGeneration, RandomArrayTest, values,
+ [](const ::testing::TestParamInfo& info) {
+ return std::to_string(info.index) + info.param->name();
+ });
+
+using NumericTypes =
+ ::testing::Types;
+TYPED_TEST_SUITE(RandomNumericArrayTest, NumericTypes);
+
+TYPED_TEST(RandomNumericArrayTest, GenerateMinMax) {
+ auto field =
+ this->GetField()->WithMetadata(key_value_metadata({{"min", "0"}, {"max", "127"}}));
+ auto batch = Generate({field}, 128, 0xDEADBEEF);
+ AssertSchemaEqual(schema({field}), batch->schema());
+ auto array = this->Downcast(batch->column(0));
+ auto it = array->begin();
+ while (it != array->end()) {
+ if ((*it).has_value() && !std::isnan(**it)) {
+ ASSERT_GE(**it, 0);
+ ASSERT_LE(**it, 128);
+ }
+ it++;
+ }
+}
+
+TEST(TypeSpecificTests, FloatNan) {
+ auto field = arrow::field("float32", float32())
+ ->WithMetadata(key_value_metadata({{"nan_probability", "1.0"}}));
+ auto batch = Generate({field}, 128, 0xDEADBEEF);
+ AssertSchemaEqual(schema({field}), batch->schema());
+ auto array = internal::checked_pointer_cast>(batch->column(0));
+ auto it = array->begin();
+ while (it != array->end()) {
+ if ((*it).has_value()) {
+ ASSERT_TRUE(std::isnan(**it));
+ }
+ it++;
+ }
+}
+
+TEST(TypeSpecificTests, RepeatedStrings) {
+ auto field =
+ arrow::field("string", utf8())->WithMetadata(key_value_metadata({{"unique", "1"}}));
+ auto batch = Generate({field}, 128, 0xDEADBEEF);
+ AssertSchemaEqual(schema({field}), batch->schema());
+ auto array = internal::checked_pointer_cast(batch->column(0));
+ auto it = array->begin();
+ util::optional singular_value;
+ while (it != array->end()) {
+ if ((*it).has_value()) {
+ if (!singular_value.has_value()) {
+ singular_value = *it;
+ } else {
+ ASSERT_EQ(*singular_value, **it);
+ }
+ }
+ it++;
+ }
+}
+
+} // namespace random
+} // namespace arrow
From 36385c9d67f1cce27fcb1a513f1cb2d2fd803386 Mon Sep 17 00:00:00 2001
From: David Li
Date: Tue, 16 Mar 2021 09:21:58 -0400
Subject: [PATCH 02/11] ARROW-11745: [C++] Use random batch helper in S3FS
benchmark
---
cpp/src/arrow/filesystem/s3fs_benchmark.cc | 38 +++++++++-------------
cpp/src/arrow/testing/random.cc | 10 +++---
cpp/src/arrow/testing/random_test.cc | 10 +++---
3 files changed, 26 insertions(+), 32 deletions(-)
diff --git a/cpp/src/arrow/filesystem/s3fs_benchmark.cc b/cpp/src/arrow/filesystem/s3fs_benchmark.cc
index 88911fd2aa9..ccf78434ae9 100644
--- a/cpp/src/arrow/filesystem/s3fs_benchmark.cc
+++ b/cpp/src/arrow/filesystem/s3fs_benchmark.cc
@@ -36,6 +36,7 @@
#include "arrow/table.h"
#include "arrow/testing/gtest_util.h"
#include "arrow/testing/random.h"
+#include "arrow/util/key_value_metadata.h"
#include "arrow/util/range.h"
#include "parquet/arrow/reader.h"
@@ -146,32 +147,25 @@ class MinioFixture : public benchmark::Fixture {
/// Appends integer columns to the beginning (to act as indices).
Status MakeParquetObject(const std::string& path, int num_columns, int num_rows) {
std::vector> columns;
- std::vector> fields;
-
- {
- arrow::random::RandomArrayGenerator generator(0);
- std::shared_ptr values = generator.Int64(num_rows, 0, 1e10, 0);
- columns.push_back(std::make_shared(values));
- fields.push_back(::arrow::field("timestamp", values->type()));
- }
- {
- arrow::random::RandomArrayGenerator generator(1);
- std::shared_ptr values = generator.Int32(num_rows, 0, 1e9, 0);
- columns.push_back(std::make_shared(values));
- fields.push_back(::arrow::field("val", values->type()));
- }
-
+ FieldVector fields{::arrow::field("timestamp", int64(), /*nullable=*/true,
+ key_value_metadata({{"min", "0"},
+ {"max", "10000000000"},
+ {"null_probability", "0"}})),
+ ::arrow::field("val", int32(), /*nullable=*/true,
+ key_value_metadata({{"min", "0"},
+ {"max", "1000000000"},
+ {"null_probability", "0"}}))};
for (int i = 0; i < num_columns; i++) {
- arrow::random::RandomArrayGenerator generator(i);
- std::shared_ptr values = generator.Float64(num_rows, -1.e10, 1e10, 0);
std::stringstream ss;
ss << "col" << i;
- columns.push_back(std::make_shared(values));
- fields.push_back(::arrow::field(ss.str(), values->type()));
+ fields.push_back(::arrow::field(
+ ss.str(), float64(), /*nullable=*/true,
+ key_value_metadata(
+ {{"min", "-1.e10"}, {"max", "1e10"}, {"null_probability", "0"}})));
}
- auto schema = std::make_shared<::arrow::Schema>(fields);
-
- std::shared_ptr table = Table::Make(schema, columns);
+ auto batch = random::Generate(fields, num_rows, 0);
+ ARROW_ASSIGN_OR_RAISE(std::shared_ptr table,
+ Table::FromRecordBatches({batch}));
std::shared_ptr sink;
ARROW_ASSIGN_OR_RAISE(sink, fs_->OpenOutputStream(path));
diff --git a/cpp/src/arrow/testing/random.cc b/cpp/src/arrow/testing/random.cc
index cc4cba3abca..781f67abec9 100644
--- a/cpp/src/arrow/testing/random.cc
+++ b/cpp/src/arrow/testing/random.cc
@@ -620,7 +620,7 @@ Result> GenerateArray(const Field& field, int64_t length,
const ARROW_TYPE::c_type max_value = GetMetadata( \
field.metadata().get(), "max", std::numeric_limits::max()); \
const double nan_probability = \
- GetMetadata(field.metadata().get(), "nan_probability", 10); \
+ GetMetadata(field.metadata().get(), "nan_probability", 0); \
return generator->GENERATOR_FUNC(length, min_value, max_value, null_probability, \
nan_probability); \
}
@@ -689,8 +689,8 @@ Result> GenerateArray(const Field& field, int64_t length,
GENERATE_INTEGRAL_CASE_VIEW(Int64Type, DayTimeIntervalType);
case Type::type::LIST: {
- const int32_t values_length =
- GetMetadata(field.metadata().get(), "values", length);
+ const int32_t values_length = GetMetadata(
+ field.metadata().get(), "values", static_cast(length));
const bool force_empty_nulls =
GetMetadata(field.metadata().get(), "force_empty_nulls", false);
auto values = GenerateArray(
@@ -744,8 +744,8 @@ Result> GenerateArray(const Field& field, int64_t length,
}
case Type::type::MAP: {
- const int32_t values_length =
- GetMetadata(field.metadata().get(), "values", length);
+ const int32_t values_length = GetMetadata(
+ field.metadata().get(), "values", static_cast(length));
const bool force_empty_nulls =
GetMetadata(field.metadata().get(), "force_empty_nulls", false);
auto map_type = internal::checked_pointer_cast(field.type());
diff --git a/cpp/src/arrow/testing/random_test.cc b/cpp/src/arrow/testing/random_test.cc
index 6bf86fa6af8..a3cfa408865 100644
--- a/cpp/src/arrow/testing/random_test.cc
+++ b/cpp/src/arrow/testing/random_test.cc
@@ -141,16 +141,16 @@ using NumericTypes =
TYPED_TEST_SUITE(RandomNumericArrayTest, NumericTypes);
TYPED_TEST(RandomNumericArrayTest, GenerateMinMax) {
- auto field =
- this->GetField()->WithMetadata(key_value_metadata({{"min", "0"}, {"max", "127"}}));
+ auto field = this->GetField()->WithMetadata(
+ key_value_metadata({{"min", "0"}, {"max", "127"}, {"nan_probability", "0.0"}}));
auto batch = Generate({field}, 128, 0xDEADBEEF);
AssertSchemaEqual(schema({field}), batch->schema());
auto array = this->Downcast(batch->column(0));
auto it = array->begin();
while (it != array->end()) {
- if ((*it).has_value() && !std::isnan(**it)) {
- ASSERT_GE(**it, 0);
- ASSERT_LE(**it, 128);
+ if ((*it).has_value()) {
+ ASSERT_GE(**it, typename TypeParam::c_type(0));
+ ASSERT_LE(**it, typename TypeParam::c_type(127));
}
it++;
}
From 3aa89b26bb7cdb131cf4d50e6638fd37d4c29c5f Mon Sep 17 00:00:00 2001
From: David Li
Date: Wed, 17 Mar 2021 10:49:19 -0400
Subject: [PATCH 03/11] Apply suggestions from code review
Co-authored-by: Benjamin Kietzman
---
cpp/src/arrow/filesystem/s3fs_benchmark.cc | 2 +-
cpp/src/arrow/testing/random.cc | 12 ++++----
cpp/src/arrow/testing/random.h | 2 +-
cpp/src/arrow/testing/random_test.cc | 33 ++++++----------------
4 files changed, 16 insertions(+), 33 deletions(-)
diff --git a/cpp/src/arrow/filesystem/s3fs_benchmark.cc b/cpp/src/arrow/filesystem/s3fs_benchmark.cc
index ccf78434ae9..b732813aedc 100644
--- a/cpp/src/arrow/filesystem/s3fs_benchmark.cc
+++ b/cpp/src/arrow/filesystem/s3fs_benchmark.cc
@@ -147,7 +147,7 @@ class MinioFixture : public benchmark::Fixture {
/// Appends integer columns to the beginning (to act as indices).
Status MakeParquetObject(const std::string& path, int num_columns, int num_rows) {
std::vector> columns;
- FieldVector fields{::arrow::field("timestamp", int64(), /*nullable=*/true,
+ FieldVector fields{field("timestamp", int64(), /*nullable=*/true,
key_value_metadata({{"min", "0"},
{"max", "10000000000"},
{"null_probability", "0"}})),
diff --git a/cpp/src/arrow/testing/random.cc b/cpp/src/arrow/testing/random.cc
index 781f67abec9..f0b90f8f1ff 100644
--- a/cpp/src/arrow/testing/random.cc
+++ b/cpp/src/arrow/testing/random.cc
@@ -372,7 +372,7 @@ std::shared_ptr RandomArrayGenerator::FixedSizeBinary(int64_t size,
}
namespace {
-template
+template
std::shared_ptr GenerateOffsets(SeedType seed, int64_t size,
typename ArrayType::value_type first_offset,
typename ArrayType::value_type last_offset,
@@ -585,16 +585,14 @@ std::shared_ptr RandomArrayGenerator::ArrayOf(std::shared_ptr t
}
namespace {
-template
-typename T::c_type GetMetadata(const KeyValueMetadata* metadata, const std::string& key,
- typename T::c_type default_value) {
+template ::ArrowType>
+enable_if_parameter_free GetMetadata(const KeyValueMetadata* metadata, const std::string& key, T default_value) {
if (!metadata) return default_value;
const auto index = metadata->FindKey(key);
if (index < 0) return default_value;
const auto& value = metadata->value(index);
- typename T::c_type output{};
- auto type = checked_pointer_cast(TypeTraits::type_singleton());
- if (!internal::ParseValue(*type, value.data(), value.length(), &output)) {
+ T output{};
+ if (!internal::ParseValue(value.data(), value.length(), &output)) {
ABORT_NOT_OK(Status::Invalid("Could not parse ", key, " = ", value));
}
return output;
diff --git a/cpp/src/arrow/testing/random.h b/cpp/src/arrow/testing/random.h
index c57093da0d0..bce792c53b4 100644
--- a/cpp/src/arrow/testing/random.h
+++ b/cpp/src/arrow/testing/random.h
@@ -363,7 +363,7 @@ class ARROW_TESTING_EXPORT RandomArrayGenerator {
};
ARROW_TESTING_EXPORT
-std::shared_ptr Generate(const FieldVector& fields, int64_t size,
+std::shared_ptr GenerateBatch(const FieldVector& fields, int64_t size,
SeedType seed);
} // namespace random
diff --git a/cpp/src/arrow/testing/random_test.cc b/cpp/src/arrow/testing/random_test.cc
index a3cfa408865..09742d64cc9 100644
--- a/cpp/src/arrow/testing/random_test.cc
+++ b/cpp/src/arrow/testing/random_test.cc
@@ -51,7 +51,7 @@ TEST_P(RandomArrayTest, GenerateArray) {
ASSERT_OK(array->ValidateFull());
}
-TEST_P(RandomArrayTest, GenerateNonNullArray) {
+TEST_P(RandomArrayTest, GenerateArrayWithZeroNullProbability) {
auto field =
GetField()->WithMetadata(key_value_metadata({{"null_probability", "0.0"}}));
if (field->type()->id() == Type::type::NA) {
@@ -76,12 +76,6 @@ TEST_P(RandomArrayTest, GenerateNonNullableArray) {
ASSERT_EQ(0, array->null_count());
}
-struct FieldParamName {
- template
- std::string operator()(const ::testing::TestParamInfo& info) const {
- return std::to_string(info.index) + info.param->name();
- }
-};
auto values = ::testing::Values(
field("null", null()), field("bool", boolean()), field("uint8", uint8()),
@@ -146,13 +140,10 @@ TYPED_TEST(RandomNumericArrayTest, GenerateMinMax) {
auto batch = Generate({field}, 128, 0xDEADBEEF);
AssertSchemaEqual(schema({field}), batch->schema());
auto array = this->Downcast(batch->column(0));
- auto it = array->begin();
- while (it != array->end()) {
- if ((*it).has_value()) {
- ASSERT_GE(**it, typename TypeParam::c_type(0));
- ASSERT_LE(**it, typename TypeParam::c_type(127));
- }
- it++;
+ for (auto slot : *array) {
+ if (!slot.has_value()) continue;
+ ASSERT_GE(slot, 0);
+ ASSERT_LE(slot, 127);
}
}
@@ -178,16 +169,10 @@ TEST(TypeSpecificTests, RepeatedStrings) {
AssertSchemaEqual(schema({field}), batch->schema());
auto array = internal::checked_pointer_cast(batch->column(0));
auto it = array->begin();
- util::optional singular_value;
- while (it != array->end()) {
- if ((*it).has_value()) {
- if (!singular_value.has_value()) {
- singular_value = *it;
- } else {
- ASSERT_EQ(*singular_value, **it);
- }
- }
- it++;
+ util::string_view singular_value = array->GetView(0);
+ for (auto slot : *array) {
+ if (!slot.has_value()) continue;
+ ASSERT_EQ(slot, singular_value);
}
}
From 4ffb420076280826785a536c195a8d179886a234 Mon Sep 17 00:00:00 2001
From: David Li
Date: Wed, 17 Mar 2021 11:42:30 -0400
Subject: [PATCH 04/11] Fix up code review suggestions
---
cpp/src/arrow/filesystem/s3fs_benchmark.cc | 16 +--
cpp/src/arrow/testing/random.cc | 120 ++++++++++++---------
cpp/src/arrow/testing/random.h | 27 ++++-
cpp/src/arrow/testing/random_test.cc | 26 +++--
4 files changed, 122 insertions(+), 67 deletions(-)
diff --git a/cpp/src/arrow/filesystem/s3fs_benchmark.cc b/cpp/src/arrow/filesystem/s3fs_benchmark.cc
index b732813aedc..2bf2b400e8d 100644
--- a/cpp/src/arrow/filesystem/s3fs_benchmark.cc
+++ b/cpp/src/arrow/filesystem/s3fs_benchmark.cc
@@ -147,14 +147,14 @@ class MinioFixture : public benchmark::Fixture {
/// Appends integer columns to the beginning (to act as indices).
Status MakeParquetObject(const std::string& path, int num_columns, int num_rows) {
std::vector> columns;
- FieldVector fields{field("timestamp", int64(), /*nullable=*/true,
- key_value_metadata({{"min", "0"},
- {"max", "10000000000"},
- {"null_probability", "0"}})),
- ::arrow::field("val", int32(), /*nullable=*/true,
- key_value_metadata({{"min", "0"},
- {"max", "1000000000"},
- {"null_probability", "0"}}))};
+ FieldVector fields{
+ field("timestamp", int64(), /*nullable=*/true,
+ key_value_metadata(
+ {{"min", "0"}, {"max", "10000000000"}, {"null_probability", "0"}})),
+ ::arrow::field(
+ "val", int32(), /*nullable=*/true,
+ key_value_metadata(
+ {{"min", "0"}, {"max", "1000000000"}, {"null_probability", "0"}}))};
for (int i = 0; i < num_columns; i++) {
std::stringstream ss;
ss << "col" << i;
diff --git a/cpp/src/arrow/testing/random.cc b/cpp/src/arrow/testing/random.cc
index f0b90f8f1ff..e311512f13c 100644
--- a/cpp/src/arrow/testing/random.cc
+++ b/cpp/src/arrow/testing/random.cc
@@ -374,12 +374,12 @@ std::shared_ptr RandomArrayGenerator::FixedSizeBinary(int64_t size,
namespace {
template
std::shared_ptr GenerateOffsets(SeedType seed, int64_t size,
- typename ArrayType::value_type first_offset,
- typename ArrayType::value_type last_offset,
+ typename OffsetArrayType::value_type first_offset,
+ typename OffsetArrayType::value_type last_offset,
double null_probability, bool force_empty_nulls) {
- using GenOpt =
- GenerateOptions>;
+ using GenOpt = GenerateOptions<
+ typename OffsetArrayType::value_type,
+ std::uniform_int_distribution>;
GenOpt options(seed, first_offset, last_offset, null_probability);
BufferVector buffers{2};
@@ -393,9 +393,9 @@ std::shared_ptr GenerateOffsets(SeedType seed, int64_t size,
arrow::BitUtil::SetBit(null_bitmap, 0);
arrow::BitUtil::SetBit(null_bitmap, size - 1);
- buffers[1] = *AllocateBuffer(sizeof(typename ArrayType::value_type) * size);
+ buffers[1] = *AllocateBuffer(sizeof(typename OffsetArrayType::value_type) * size);
auto data =
- reinterpret_cast(buffers[1]->mutable_data());
+ reinterpret_cast(buffers[1]->mutable_data());
options.GenerateTypedData(data, size);
// Ensure offsets are in increasing order
std::sort(data, data + size);
@@ -417,9 +417,9 @@ std::shared_ptr GenerateOffsets(SeedType seed, int64_t size,
}
}
- auto array_data = ArrayData::Make(std::make_shared(),
- size, buffers, null_count);
- return std::make_shared(array_data);
+ auto array_data = ArrayData::Make(
+ std::make_shared(), size, buffers, null_count);
+ return std::make_shared(array_data);
}
} // namespace
@@ -586,7 +586,9 @@ std::shared_ptr RandomArrayGenerator::ArrayOf(std::shared_ptr t
namespace {
template ::ArrowType>
-enable_if_parameter_free GetMetadata(const KeyValueMetadata* metadata, const std::string& key, T default_value) {
+enable_if_parameter_free GetMetadata(const KeyValueMetadata* metadata,
+ const std::string& key,
+ T default_value) {
if (!metadata) return default_value;
const auto index = metadata->FindKey(key);
if (index < 0) return default_value;
@@ -600,12 +602,25 @@ enable_if_parameter_free GetMetadata(const KeyValueMetadata* metad
Result> GenerateArray(const Field& field, int64_t length,
RandomArrayGenerator* generator) {
+ // TODO: check min <= max in tests
+#define VALIDATE_RANGE(PARAM, MIN, MAX)                                          \
+  if (PARAM < MIN || PARAM > MAX) {                                              \
+    ABORT_NOT_OK(Status::Invalid(field.ToString(), ": ", ARROW_STRINGIFY(PARAM), \
+                                 " must be in [", MIN, ", ", MAX, " ] but got ", \
+                                 PARAM));                                        \
+  }  // Aborts when PARAM lies outside [MIN, MAX]; the message reports PARAM itself
+#define VALIDATE_MIN_MAX(MIN, MAX) \
+ if (MIN > MAX) { \
+ ABORT_NOT_OK( \
+ Status::Invalid(field.ToString(), ": min ", MIN, " must be <= max ", MAX)); \
+ }
#define GENERATE_INTEGRAL_CASE_VIEW(BASE_TYPE, VIEW_TYPE) \
case VIEW_TYPE::type_id: { \
- const BASE_TYPE::c_type min_value = GetMetadata( \
+ const BASE_TYPE::c_type min_value = GetMetadata( \
field.metadata().get(), "min", std::numeric_limits::min()); \
- const BASE_TYPE::c_type max_value = GetMetadata( \
+ const BASE_TYPE::c_type max_value = GetMetadata( \
field.metadata().get(), "max", std::numeric_limits::max()); \
+ VALIDATE_MIN_MAX(min_value, max_value); \
return generator->Numeric(length, min_value, max_value, null_probability) \
->View(field.type()); \
}
@@ -613,27 +628,31 @@ Result> GenerateArray(const Field& field, int64_t length,
GENERATE_INTEGRAL_CASE_VIEW(ARROW_TYPE, ARROW_TYPE)
#define GENERATE_FLOATING_CASE(ARROW_TYPE, GENERATOR_FUNC) \
case ARROW_TYPE::type_id: { \
- const ARROW_TYPE::c_type min_value = GetMetadata( \
+ const ARROW_TYPE::c_type min_value = GetMetadata( \
field.metadata().get(), "min", std::numeric_limits::min()); \
- const ARROW_TYPE::c_type max_value = GetMetadata( \
+ const ARROW_TYPE::c_type max_value = GetMetadata( \
field.metadata().get(), "max", std::numeric_limits::max()); \
const double nan_probability = \
- GetMetadata(field.metadata().get(), "nan_probability", 0); \
+ GetMetadata(field.metadata().get(), "nan_probability", 0); \
+ VALIDATE_MIN_MAX(min_value, max_value); \
+ VALIDATE_RANGE(nan_probability, 0.0, 1.0); \
return generator->GENERATOR_FUNC(length, min_value, max_value, null_probability, \
nan_probability); \
}
const double null_probability =
field.nullable()
- ? GetMetadata(field.metadata().get(), "null_probability", 0.01)
+ ? GetMetadata(field.metadata().get(), "null_probability", 0.01)
: 0.0;
+ VALIDATE_RANGE(null_probability, 0.0, 1.0);
switch (field.type()->id()) {
- case Type::type::NA:
+ case Type::type::NA: {
return std::make_shared(length);
+ }
case Type::type::BOOL: {
const double true_probability =
- GetMetadata(field.metadata().get(), "true_probability", 0.5);
+ GetMetadata(field.metadata().get(), "true_probability", 0.5);
return generator->Boolean(length, true_probability, null_probability);
}
@@ -651,11 +670,10 @@ Result> GenerateArray(const Field& field, int64_t length,
case Type::type::STRING:
case Type::type::BINARY: {
- const int32_t min_length = GetMetadata(field.metadata().get(), "min", 0);
- const int32_t max_length =
- GetMetadata(field.metadata().get(), "max", 1024);
- const int32_t unique_values =
- GetMetadata(field.metadata().get(), "unique", -1);
+ const auto min_length = GetMetadata(field.metadata().get(), "min", 0);
+ const auto max_length = GetMetadata(field.metadata().get(), "max", 1024);
+ const auto unique_values =
+ GetMetadata(field.metadata().get(), "unique", -1);
if (unique_values > 0) {
return generator
->StringWithRepeats(length, unique_values, min_length, max_length,
@@ -687,10 +705,10 @@ Result> GenerateArray(const Field& field, int64_t length,
GENERATE_INTEGRAL_CASE_VIEW(Int64Type, DayTimeIntervalType);
case Type::type::LIST: {
- const int32_t values_length = GetMetadata(
- field.metadata().get(), "values", static_cast(length));
- const bool force_empty_nulls =
- GetMetadata(field.metadata().get(), "force_empty_nulls", false);
+ const auto values_length = GetMetadata(field.metadata().get(), "values",
+ static_cast(length));
+ const auto force_empty_nulls =
+ GetMetadata(field.metadata().get(), "force_empty_nulls", false);
auto values = GenerateArray(
*internal::checked_pointer_cast(field.type())->value_field(),
values_length, generator);
@@ -719,14 +737,15 @@ Result> GenerateArray(const Field& field, int64_t length,
const auto& child_field = field.type()->field(i);
child_arrays[i] = *GenerateArray(*child_field, length, generator);
}
- return field.type()->id() == Type::type::SPARSE_UNION
- ? generator->SparseUnion(child_arrays, length)
- : generator->DenseUnion(child_arrays, length);
+ auto array = field.type()->id() == Type::type::SPARSE_UNION
+ ? generator->SparseUnion(child_arrays, length)
+ : generator->DenseUnion(child_arrays, length);
+ return array->View(field.type());
}
case Type::type::DICTIONARY: {
- const int64_t values_length =
- GetMetadata(field.metadata().get(), "values", 4);
+ const auto values_length =
+ GetMetadata(field.metadata().get(), "values", 4);
auto dict_type = internal::checked_pointer_cast(field.type());
// TODO: no way to control generation of dictionary
auto values = *GenerateArray(
@@ -742,10 +761,10 @@ Result> GenerateArray(const Field& field, int64_t length,
}
case Type::type::MAP: {
- const int32_t values_length = GetMetadata(
- field.metadata().get(), "values", static_cast(length));
- const bool force_empty_nulls =
- GetMetadata(field.metadata().get(), "force_empty_nulls", false);
+ const auto values_length = GetMetadata(field.metadata().get(), "values",
+ static_cast(length));
+ const auto force_empty_nulls =
+ GetMetadata(field.metadata().get(), "force_empty_nulls", false);
auto map_type = internal::checked_pointer_cast(field.type());
auto keys = *GenerateArray(*map_type->key_field(), values_length, generator);
auto items = *GenerateArray(*map_type->item_field(), values_length, generator);
@@ -772,11 +791,10 @@ Result> GenerateArray(const Field& field, int64_t length,
case Type::type::LARGE_STRING:
case Type::type::LARGE_BINARY: {
- const int32_t min_length = GetMetadata(field.metadata().get(), "min", 0);
- const int32_t max_length =
- GetMetadata(field.metadata().get(), "max", 1024);
- const int32_t unique_values =
- GetMetadata(field.metadata().get(), "unique", -1);
+ const auto min_length = GetMetadata(field.metadata().get(), "min", 0);
+ const auto max_length = GetMetadata(field.metadata().get(), "max", 1024);
+ const auto unique_values =
+ GetMetadata(field.metadata().get(), "unique", -1);
if (unique_values > 0) {
ABORT_NOT_OK(
Status::NotImplemented("Generating random array with repeated values for "
@@ -787,10 +805,10 @@ Result> GenerateArray(const Field& field, int64_t length,
}
case Type::type::LARGE_LIST: {
- const int64_t values_length =
- GetMetadata(field.metadata().get(), "values", length);
- const bool force_empty_nulls =
- GetMetadata(field.metadata().get(), "force_empty_nulls", false);
+ const auto values_length =
+ GetMetadata(field.metadata().get(), "values", length);
+ const auto force_empty_nulls =
+ GetMetadata(field.metadata().get(), "force_empty_nulls", false);
auto values = GenerateArray(
*internal::checked_pointer_cast(field.type())->value_field(),
values_length, generator);
@@ -814,8 +832,14 @@ Result> GenerateArray(const Field& field, int64_t length,
} // namespace
-std::shared_ptr Generate(const FieldVector& fields, int64_t length,
- SeedType seed) {
+std::shared_ptr GenerateArray(const Field& field, int64_t length,
+ SeedType seed) {
+ RandomArrayGenerator generator(seed);
+ return *GenerateArray(field, length, &generator);
+}
+
+std::shared_ptr GenerateBatch(const FieldVector& fields,
+ int64_t length, SeedType seed) {
std::vector> arrays(fields.size());
RandomArrayGenerator generator(seed);
for (size_t i = 0; i < fields.size(); i++) {
diff --git a/cpp/src/arrow/testing/random.h b/cpp/src/arrow/testing/random.h
index bce792c53b4..942af583ec9 100644
--- a/cpp/src/arrow/testing/random.h
+++ b/cpp/src/arrow/testing/random.h
@@ -362,9 +362,34 @@ class ARROW_TESTING_EXPORT RandomArrayGenerator {
std::default_random_engine seed_rng_;
};
+/// Generate a record batch with random data of the specified length.
+///
+/// Generation options are read from key-value metadata for each field
+/// (including nested fields).
+///
+/// The following options are supported:
+///
+/// For all types except NullType:
+/// - null_probability (double): range [0.0, 1.0] the probability of a null value.
+///   Always 0.0 if the field is marked non-nullable; otherwise the default is 0.01
+///
+/// For all numeric types T:
+/// - min (T::c_type): the minimum value to generate (inclusive), default
+///   std::numeric_limits<T::c_type>::min()
+/// - max (T::c_type): the maximum value to generate (inclusive), default
+///   std::numeric_limits<T::c_type>::max(). Note this means that, for example, min/max
+///   are int16_t values for HalfFloatType.
+///
+/// For floating point types T for which is_physical_floating_type<T>:
+/// - nan_probability (double): range [0.0, 1.0] the probability of a NaN value.
ARROW_TESTING_EXPORT
std::shared_ptr GenerateBatch(const FieldVector& fields, int64_t size,
- SeedType seed);
+ SeedType seed);
+
+/// Generate an array with random data. See GenerateBatch for usage info.
+ARROW_TESTING_EXPORT
+std::shared_ptr GenerateArray(const Field& field, int64_t size,
+ SeedType seed);
} // namespace random
diff --git a/cpp/src/arrow/testing/random_test.cc b/cpp/src/arrow/testing/random_test.cc
index 09742d64cc9..b193f3bd97e 100644
--- a/cpp/src/arrow/testing/random_test.cc
+++ b/cpp/src/arrow/testing/random_test.cc
@@ -44,7 +44,15 @@ class RandomNumericArrayTest : public ::testing::Test {
TEST_P(RandomArrayTest, GenerateArray) {
auto field = GetField();
- auto batch = Generate({field}, 128, 0xDEADBEEF);
+ auto array = GenerateArray(*field, 128, 0xDEADBEEF);
+ AssertTypeEqual(field->type(), array->type());
+ ASSERT_EQ(128, array->length());
+ ASSERT_OK(array->ValidateFull());
+}
+
+TEST_P(RandomArrayTest, GenerateBatch) {
+ auto field = GetField();
+ auto batch = GenerateBatch({field}, 128, 0xDEADBEEF);
AssertSchemaEqual(schema({field}), batch->schema());
auto array = batch->column(0);
ASSERT_EQ(128, array->length());
@@ -57,7 +65,7 @@ TEST_P(RandomArrayTest, GenerateArrayWithZeroNullProbability) {
if (field->type()->id() == Type::type::NA) {
GTEST_SKIP() << "Cannot generate non-null null arrays";
}
- auto batch = Generate({field}, 128, 0xDEADBEEF);
+ auto batch = GenerateBatch({field}, 128, 0xDEADBEEF);
AssertSchemaEqual(schema({field}), batch->schema());
auto array = batch->column(0);
ASSERT_OK(array->ValidateFull());
@@ -69,14 +77,13 @@ TEST_P(RandomArrayTest, GenerateNonNullableArray) {
if (field->type()->id() == Type::type::NA) {
GTEST_SKIP() << "Cannot generate non-null null arrays";
}
- auto batch = Generate({field}, 128, 0xDEADBEEF);
+ auto batch = GenerateBatch({field}, 128, 0xDEADBEEF);
AssertSchemaEqual(schema({field}), batch->schema());
auto array = batch->column(0);
ASSERT_OK(array->ValidateFull());
ASSERT_EQ(0, array->null_count());
}
-
auto values = ::testing::Values(
field("null", null()), field("bool", boolean()), field("uint8", uint8()),
field("int8", int8()), field("uint16", uint16()), field("int16", int16()),
@@ -137,20 +144,20 @@ TYPED_TEST_SUITE(RandomNumericArrayTest, NumericTypes);
TYPED_TEST(RandomNumericArrayTest, GenerateMinMax) {
auto field = this->GetField()->WithMetadata(
key_value_metadata({{"min", "0"}, {"max", "127"}, {"nan_probability", "0.0"}}));
- auto batch = Generate({field}, 128, 0xDEADBEEF);
+ auto batch = GenerateBatch({field}, 128, 0xDEADBEEF);
AssertSchemaEqual(schema({field}), batch->schema());
auto array = this->Downcast(batch->column(0));
for (auto slot : *array) {
if (!slot.has_value()) continue;
- ASSERT_GE(slot, 0);
- ASSERT_LE(slot, 127);
+ ASSERT_GE(slot, typename TypeParam::c_type(0));
+ ASSERT_LE(slot, typename TypeParam::c_type(127));
}
}
TEST(TypeSpecificTests, FloatNan) {
auto field = arrow::field("float32", float32())
->WithMetadata(key_value_metadata({{"nan_probability", "1.0"}}));
- auto batch = Generate({field}, 128, 0xDEADBEEF);
+ auto batch = GenerateBatch({field}, 128, 0xDEADBEEF);
AssertSchemaEqual(schema({field}), batch->schema());
auto array = internal::checked_pointer_cast>(batch->column(0));
auto it = array->begin();
@@ -165,10 +172,9 @@ TEST(TypeSpecificTests, FloatNan) {
TEST(TypeSpecificTests, RepeatedStrings) {
auto field =
arrow::field("string", utf8())->WithMetadata(key_value_metadata({{"unique", "1"}}));
- auto batch = Generate({field}, 128, 0xDEADBEEF);
+ auto batch = GenerateBatch({field}, 128, 0xDEADBEEF);
AssertSchemaEqual(schema({field}), batch->schema());
auto array = internal::checked_pointer_cast(batch->column(0));
- auto it = array->begin();
util::string_view singular_value = array->GetView(0);
for (auto slot : *array) {
if (!slot.has_value()) continue;
From df6c12db6f783e4d5689248bfa1c1723aa5b98b8 Mon Sep 17 00:00:00 2001
From: David Li
Date: Wed, 17 Mar 2021 15:37:01 -0400
Subject: [PATCH 05/11] Make list/string generators more consistent
---
cpp/src/arrow/testing/random.cc | 104 ++++++++++++++++++++-------
cpp/src/arrow/testing/random_test.cc | 22 +++---
2 files changed, 90 insertions(+), 36 deletions(-)
diff --git a/cpp/src/arrow/testing/random.cc b/cpp/src/arrow/testing/random.cc
index e311512f13c..3ef451b55e0 100644
--- a/cpp/src/arrow/testing/random.cc
+++ b/cpp/src/arrow/testing/random.cc
@@ -31,6 +31,7 @@
#include "arrow/array/builder_decimal.h"
#include "arrow/array/builder_primitive.h"
#include "arrow/buffer.h"
+#include "arrow/compute/api_aggregate.h"
#include "arrow/testing/gtest_util.h"
#include "arrow/type.h"
#include "arrow/type_fwd.h"
@@ -421,6 +422,57 @@ std::shared_ptr GenerateOffsets(SeedType seed, int64_t size,
std::make_shared(), size, buffers, null_count);
return std::make_shared(array_data);
}
+
+// Convert list lengths into N + 1 cumulative offsets. A null length contributes a
+// zero-sized extent and leaves the offset written for it marked null.
+// TODO: length 0 arrays (need test case); the first and last lengths must be non-null.
+template <typename OffsetArrayType>
+std::shared_ptr<Array> OffsetsFromLengthsArray(OffsetArrayType* lengths,
+                                               bool force_empty_nulls) {
+  DCHECK(!lengths->IsNull(0));
+  DCHECK(!lengths->IsNull(lengths->length() - 1));
+  // Need N + 1 offsets for N items
+  int64_t size = lengths->length() + 1;
+  BufferVector buffers{2};
+  int64_t null_count = 0;
+  buffers[0] = *AllocateEmptyBitmap(size);
+  uint8_t* null_bitmap = buffers[0]->mutable_data();
+  // Make sure the first and last entry are non-null
+  arrow::BitUtil::SetBit(null_bitmap, 0);
+  arrow::BitUtil::SetBit(null_bitmap, size - 1);
+
+  buffers[1] = *AllocateBuffer(sizeof(typename OffsetArrayType::value_type) * size);
+  auto data =
+      reinterpret_cast<typename OffsetArrayType::value_type*>(buffers[1]->mutable_data());
+  data[0] = 0;
+  int64_t index = 1;
+  for (const auto& length : *lengths) {
+    if (length.has_value()) {
+      arrow::BitUtil::SetBit(null_bitmap, index);
+      data[index] = data[index - 1] + *length;
+      DCHECK_GE(*length, 0);
+    } else {
+      data[index] = data[index - 1];
+      ++null_count;  // keep the reported null count in sync with the validity bitmap
+    }
+    index++;
+  }
+
+  if (force_empty_nulls) {
+    arrow::internal::BitmapReader reader(null_bitmap, 0, size);
+    for (int64_t i = 0; i < size; ++i) {
+      if (reader.IsNotSet()) {
+        // Ensure a null entry corresponds to a 0-sized list extent
+        // (note this can be neither the first nor the last list entry, see above)
+        data[i + 1] = data[i];
+      }
+      reader.Next();
+    }
+  }
+  auto array_data = ArrayData::Make(
+      std::make_shared<typename OffsetArrayType::TypeClass>(), size, buffers, null_count);
+  return std::make_shared<OffsetArrayType>(array_data);
+}
} // namespace
std::shared_ptr RandomArrayGenerator::Offsets(int64_t size, int32_t first_offset,
@@ -639,6 +691,27 @@ Result> GenerateArray(const Field& field, int64_t length,
return generator->GENERATOR_FUNC(length, min_value, max_value, null_probability, \
nan_probability); \
}
+#define GENERATE_LIST_CASE(ARRAY_TYPE) \
+ case ARRAY_TYPE::TypeClass::type_id: { \
+ const auto min_length = GetMetadata( \
+ field.metadata().get(), "min", 0); \
+ const auto max_length = GetMetadata( \
+ field.metadata().get(), "max", 1024); \
+ const auto lengths = internal::checked_pointer_cast< \
+ CTypeTraits::ArrayType>( \
+ generator->Numeric::ArrowType>( \
+ length, min_length, max_length, null_probability)); \
+ ARROW_ASSIGN_OR_RAISE(const auto values_datum, compute::Sum(lengths)); \
+ const auto values_length = values_datum.scalar_as().value; \
+ const auto force_empty_nulls = \
+ GetMetadata(field.metadata().get(), "force_empty_nulls", false); \
+ const auto values = GenerateArray( \
+ *internal::checked_pointer_cast(field.type()) \
+ ->value_field(), \
+ values_length, generator); \
+ const auto offsets = OffsetsFromLengthsArray(lengths.get(), force_empty_nulls); \
+ return ARRAY_TYPE::FromArrays(*offsets, **values); \
+ }
const double null_probability =
field.nullable()
@@ -704,19 +777,7 @@ Result> GenerateArray(const Field& field, int64_t length,
// type means it's not a (useful) composition of other generators
GENERATE_INTEGRAL_CASE_VIEW(Int64Type, DayTimeIntervalType);
- case Type::type::LIST: {
- const auto values_length = GetMetadata(field.metadata().get(), "values",
- static_cast(length));
- const auto force_empty_nulls =
- GetMetadata(field.metadata().get(), "force_empty_nulls", false);
- auto values = GenerateArray(
- *internal::checked_pointer_cast(field.type())->value_field(),
- values_length, generator);
- // need N + 1 offsets to have N values
- auto offsets = generator->Offsets(length + 1, 0, values_length, null_probability,
- force_empty_nulls);
- return ListArray::FromArrays(*offsets, **values);
- }
+ GENERATE_LIST_CASE(ListArray);
case Type::type::STRUCT: {
ArrayVector child_arrays(field.type()->num_fields());
@@ -804,19 +865,7 @@ Result> GenerateArray(const Field& field, int64_t length,
->View(field.type());
}
- case Type::type::LARGE_LIST: {
- const auto values_length =
- GetMetadata(field.metadata().get(), "values", length);
- const auto force_empty_nulls =
- GetMetadata(field.metadata().get(), "force_empty_nulls", false);
- auto values = GenerateArray(
- *internal::checked_pointer_cast(field.type())->value_field(),
- values_length, generator);
- // need N + 1 offsets to have N values
- auto offsets = generator->LargeOffsets(length + 1, 0, values_length,
- null_probability, force_empty_nulls);
- return LargeListArray::FromArrays(*offsets, **values);
- }
+ GENERATE_LIST_CASE(LargeListArray);
default:
break;
@@ -824,6 +873,9 @@ Result> GenerateArray(const Field& field, int64_t length,
#undef GENERATE_INTEGRAL_CASE_VIEW
#undef GENERATE_INTEGRAL_CASE
#undef GENERATE_FLOATING_CASE
+#undef GENERATE_LIST_CASE
+#undef VALIDATE_RANGE
+#undef VALIDATE_MIN_MAX
ABORT_NOT_OK(
Status::NotImplemented("Generating random array for field ", field.ToString()));
diff --git a/cpp/src/arrow/testing/random_test.cc b/cpp/src/arrow/testing/random_test.cc
index b193f3bd97e..3855ab7426f 100644
--- a/cpp/src/arrow/testing/random_test.cc
+++ b/cpp/src/arrow/testing/random_test.cc
@@ -14,7 +14,6 @@
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
-
#include
#include "arrow/array.h"
@@ -27,6 +26,9 @@
namespace arrow {
namespace random {
+// Use short arrays since especially in debug mode, generating list(list()) is slow
+constexpr int64_t kExpectedLength = 24;
+
class RandomArrayTest : public ::testing::TestWithParam> {
protected:
std::shared_ptr