diff --git a/cpp/src/arrow/filesystem/s3fs_benchmark.cc b/cpp/src/arrow/filesystem/s3fs_benchmark.cc
index 88911fd2aa9..36564a70d29 100644
--- a/cpp/src/arrow/filesystem/s3fs_benchmark.cc
+++ b/cpp/src/arrow/filesystem/s3fs_benchmark.cc
@@ -36,6 +36,7 @@
 #include "arrow/table.h"
 #include "arrow/testing/gtest_util.h"
 #include "arrow/testing/random.h"
+#include "arrow/util/key_value_metadata.h"
 #include "arrow/util/range.h"
 
 #include "parquet/arrow/reader.h"
@@ -146,32 +147,24 @@ class MinioFixture : public benchmark::Fixture {
   /// Appends integer columns to the beginning (to act as indices).
   Status MakeParquetObject(const std::string& path, int num_columns, int num_rows) {
     std::vector<std::shared_ptr<ChunkedArray>> columns;
-    std::vector<std::shared_ptr<Field>> fields;
-
-    {
-      arrow::random::RandomArrayGenerator generator(0);
-      std::shared_ptr<Array> values = generator.Int64(num_rows, 0, 1e10, 0);
-      columns.push_back(std::make_shared<ChunkedArray>(values));
-      fields.push_back(::arrow::field("timestamp", values->type()));
-    }
-    {
-      arrow::random::RandomArrayGenerator generator(1);
-      std::shared_ptr<Array> values = generator.Int32(num_rows, 0, 1e9, 0);
-      columns.push_back(std::make_shared<ChunkedArray>(values));
-      fields.push_back(::arrow::field("val", values->type()));
-    }
-
+    FieldVector fields{
+        field("timestamp", int64(), /*nullable=*/true,
+              key_value_metadata(
+                  {{"min", "0"}, {"max", "10000000000"}, {"null_probability", "0"}})),
+        field("val", int32(), /*nullable=*/true,
+              key_value_metadata(
+                  {{"min", "0"}, {"max", "1000000000"}, {"null_probability", "0"}}))};
     for (int i = 0; i < num_columns; i++) {
-      arrow::random::RandomArrayGenerator generator(i);
-      std::shared_ptr<Array> values = generator.Float64(num_rows, -1.e10, 1e10, 0);
       std::stringstream ss;
       ss << "col" << i;
-      columns.push_back(std::make_shared<ChunkedArray>(values));
-      fields.push_back(::arrow::field(ss.str(), values->type()));
+      fields.push_back(
+          field(ss.str(), float64(), /*nullable=*/true,
+                key_value_metadata(
+                    {{"min", "-1.e10"}, {"max", "1e10"}, {"null_probability", "0"}})));
     }
-    auto schema = std::make_shared<::arrow::Schema>(fields);
-
-    std::shared_ptr<Table> table = Table::Make(schema, columns);
+    auto batch = random::GenerateBatch(fields, num_rows, 0);
+    ARROW_ASSIGN_OR_RAISE(std::shared_ptr<Table> table,
+                          Table::FromRecordBatches({batch}));
 
     std::shared_ptr<io::OutputStream> sink;
     ARROW_ASSIGN_OR_RAISE(sink, fs_->OpenOutputStream(path));
diff --git a/cpp/src/arrow/testing/CMakeLists.txt b/cpp/src/arrow/testing/CMakeLists.txt
index 125b385ad9e..073224d519b 100644
--- a/cpp/src/arrow/testing/CMakeLists.txt
+++ b/cpp/src/arrow/testing/CMakeLists.txt
@@ -17,6 +17,10 @@
 
 arrow_install_all_headers("arrow/testing")
 
+if(ARROW_BUILD_TESTS)
+  add_arrow_test(random_test)
+endif()
+
 # json_integration_test is two things at the same time:
 # - an executable that can be called to answer integration test requests
 # - a self-(unit)test for the C++ side of integration testing
diff --git a/cpp/src/arrow/testing/random.cc b/cpp/src/arrow/testing/random.cc
index 7bf5dd22d43..a29a464846a 100644
--- a/cpp/src/arrow/testing/random.cc
+++ b/cpp/src/arrow/testing/random.cc
@@ -39,7 +39,9 @@
 #include "arrow/util/bitmap_reader.h"
 #include "arrow/util/checked_cast.h"
 #include "arrow/util/decimal.h"
+#include "arrow/util/key_value_metadata.h"
 #include "arrow/util/logging.h"
+#include "arrow/util/value_parsing.h"
 
 namespace arrow {
 
@@ -369,12 +371,16 @@ std::shared_ptr<Array> RandomArrayGenerator::FixedSizeBinary(int64_t size,
                                                std::move(null_bitmap), null_count);
 }
 
-std::shared_ptr<Array> RandomArrayGenerator::Offsets(int64_t size, int32_t first_offset,
-                                                     int32_t last_offset,
-                                                     double null_probability,
-                                                     bool force_empty_nulls) {
-  using GenOpt = GenerateOptions<int32_t, std::uniform_int_distribution<int32_t>>;
-  GenOpt options(seed(), first_offset, last_offset, null_probability);
+namespace {
+template <typename OffsetArrayType>
+std::shared_ptr<Array> GenerateOffsets(SeedType seed, int64_t size,
+                                       typename OffsetArrayType::value_type first_offset,
+                                       typename OffsetArrayType::value_type last_offset,
+                                       double null_probability, bool force_empty_nulls) {
+  using GenOpt = GenerateOptions<
+      typename OffsetArrayType::value_type,
+      std::uniform_int_distribution<typename OffsetArrayType::value_type>>;
+  GenOpt options(seed, first_offset, last_offset, null_probability);
 
   BufferVector buffers{2};
 
@@ -387,8 +393,9 @@ std::shared_ptr<Array> RandomArrayGenerator::Offsets(int64_t size, int32_t first
   arrow::BitUtil::SetBit(null_bitmap, 0);
   arrow::BitUtil::SetBit(null_bitmap, size - 1);
 
-  buffers[1] = *AllocateBuffer(sizeof(int32_t) * size);
-  auto data = reinterpret_cast<int32_t*>(buffers[1]->mutable_data());
+  buffers[1] = *AllocateBuffer(sizeof(typename OffsetArrayType::value_type) * size);
+  auto data =
+      reinterpret_cast<typename OffsetArrayType::value_type*>(buffers[1]->mutable_data());
   options.GenerateTypedData(data, size);
   // Ensure offsets are in increasing order
   std::sort(data, data + size);
@@ -410,8 +417,78 @@ std::shared_ptr<Array> RandomArrayGenerator::Offsets(int64_t size, int32_t first
     }
   }
 
-  auto array_data = ArrayData::Make(int32(), size, buffers, null_count);
-  return std::make_shared<Int32Array>(array_data);
+  auto array_data = ArrayData::Make(
+      std::make_shared<typename OffsetArrayType::TypeClass>(), size, buffers, null_count);
+  return std::make_shared<OffsetArrayType>(array_data);
+}
+
+template <typename OffsetArrayType>
+std::shared_ptr<Array> OffsetsFromLengthsArray(OffsetArrayType* lengths,
+                                               bool force_empty_nulls) {
+  DCHECK(lengths->length() == 0 || !lengths->IsNull(0));
+  DCHECK(lengths->length() == 0 || !lengths->IsNull(lengths->length() - 1));
+  // Need N + 1 offsets for N items
+  int64_t size = lengths->length() + 1;
+  BufferVector buffers{2};
+
+  int64_t null_count = 0;
+
+  buffers[0] = *AllocateEmptyBitmap(size);
+  uint8_t* null_bitmap = buffers[0]->mutable_data();
+  // Make sure the first and last entry are non-null
+  arrow::BitUtil::SetBit(null_bitmap, 0);
+  arrow::BitUtil::SetBit(null_bitmap, size - 1);
+
+  buffers[1] = *AllocateBuffer(sizeof(typename OffsetArrayType::value_type) * size);
+  auto data
= + reinterpret_cast(buffers[1]->mutable_data()); + data[0] = 0; + int index = 1; + for (const auto& length : *lengths) { + if (length.has_value()) { + arrow::BitUtil::SetBit(null_bitmap, index); + data[index] = data[index - 1] + *length; + DCHECK_GE(*length, 0); + } else { + data[index] = data[index - 1]; + null_count++; + } + index++; + } + + if (force_empty_nulls) { + arrow::internal::BitmapReader reader(null_bitmap, 0, size); + for (int64_t i = 0; i < size; ++i) { + if (reader.IsNotSet()) { + // Ensure a null entry corresponds to a 0-sized list extent + // (note this can be neither the first nor the last list entry, see above) + data[i + 1] = data[i]; + } + reader.Next(); + } + } + + auto array_data = ArrayData::Make( + std::make_shared(), size, buffers, null_count); + return std::make_shared(array_data); +} +} // namespace + +std::shared_ptr RandomArrayGenerator::Offsets(int64_t size, int32_t first_offset, + int32_t last_offset, + double null_probability, + bool force_empty_nulls) { + return GenerateOffsets>(seed(), size, first_offset, last_offset, + null_probability, force_empty_nulls); +} + +std::shared_ptr RandomArrayGenerator::LargeOffsets(int64_t size, + int64_t first_offset, + int64_t last_offset, + double null_probability, + bool force_empty_nulls) { + return GenerateOffsets>(seed(), size, first_offset, last_offset, + null_probability, force_empty_nulls); } std::shared_ptr RandomArrayGenerator::List(const Array& values, int64_t size, @@ -558,5 +635,280 @@ std::shared_ptr RandomArrayGenerator::ArrayOf(std::shared_ptr t return RandomArrayGeneratorOfImpl{this, type, size, null_probability, nullptr}.Finish(); } +namespace { +template ::ArrowType> +enable_if_parameter_free GetMetadata(const KeyValueMetadata* metadata, + const std::string& key, + T default_value) { + if (!metadata) return default_value; + const auto index = metadata->FindKey(key); + if (index < 0) return default_value; + const auto& value = metadata->value(index); + T output{}; + if (!internal::ParseValue(value.data(), value.length(), &output)) { + ABORT_NOT_OK(Status::Invalid("Could not parse ", key, " = ", value)); + } + return output; +} +} // namespace + +std::shared_ptr RandomArrayGenerator::ArrayOf(const Field& field, int64_t length) { +#define VALIDATE_RANGE(PARAM, MIN, MAX) \ + if (PARAM < MIN || PARAM > MAX) { \ + ABORT_NOT_OK(Status::Invalid(field.ToString(), ": ", ARROW_STRINGIFY(PARAM), \ + " must be in [", MIN, ", ", MAX, " ] but got ", \ + null_probability)); \ + } +#define VALIDATE_MIN_MAX(MIN, MAX) \ + if (MIN > MAX) { \ + ABORT_NOT_OK( \ + Status::Invalid(field.ToString(), ": min ", MIN, " must be <= max ", MAX)); \ + } +#define GENERATE_INTEGRAL_CASE_VIEW(BASE_TYPE, VIEW_TYPE) \ + case VIEW_TYPE::type_id: { \ + const BASE_TYPE::c_type min_value = GetMetadata( \ + field.metadata().get(), "min", std::numeric_limits::min()); \ + const BASE_TYPE::c_type max_value = GetMetadata( \ + field.metadata().get(), "max", std::numeric_limits::max()); \ + VALIDATE_MIN_MAX(min_value, max_value); \ + return *Numeric(length, min_value, max_value, null_probability) \ + ->View(field.type()); \ + } +#define GENERATE_INTEGRAL_CASE(ARROW_TYPE) \ + GENERATE_INTEGRAL_CASE_VIEW(ARROW_TYPE, ARROW_TYPE) +#define GENERATE_FLOATING_CASE(ARROW_TYPE, GENERATOR_FUNC) \ + case ARROW_TYPE::type_id: { \ + const ARROW_TYPE::c_type min_value = GetMetadata( \ + field.metadata().get(), "min", std::numeric_limits::min()); \ + const ARROW_TYPE::c_type max_value = GetMetadata( \ + field.metadata().get(), "max", std::numeric_limits::max()); \ 
+ const double nan_probability = \ + GetMetadata(field.metadata().get(), "nan_probability", 0); \ + VALIDATE_MIN_MAX(min_value, max_value); \ + VALIDATE_RANGE(nan_probability, 0.0, 1.0); \ + return GENERATOR_FUNC(length, min_value, max_value, null_probability, \ + nan_probability); \ + } + + // Don't use compute::Sum since that may not get built +#define GENERATE_LIST_CASE(ARRAY_TYPE) \ + case ARRAY_TYPE::TypeClass::type_id: { \ + const auto min_length = GetMetadata( \ + field.metadata().get(), "min_length", 0); \ + const auto max_length = GetMetadata( \ + field.metadata().get(), "max_length", 1024); \ + const auto lengths = internal::checked_pointer_cast< \ + CTypeTraits::ArrayType>( \ + Numeric::ArrowType>( \ + length, min_length, max_length, null_probability)); \ + int64_t values_length = 0; \ + for (const auto& length : *lengths) { \ + if (length.has_value()) values_length += *length; \ + } \ + const auto force_empty_nulls = \ + GetMetadata(field.metadata().get(), "force_empty_nulls", false); \ + const auto values = \ + ArrayOf(*internal::checked_pointer_cast(field.type()) \ + ->value_field(), \ + values_length); \ + const auto offsets = OffsetsFromLengthsArray(lengths.get(), force_empty_nulls); \ + return *ARRAY_TYPE::FromArrays(*offsets, *values); \ + } + + const double null_probability = + field.nullable() + ? GetMetadata(field.metadata().get(), "null_probability", 0.01) + : 0.0; + VALIDATE_RANGE(null_probability, 0.0, 1.0); + switch (field.type()->id()) { + case Type::type::NA: { + return std::make_shared(length); + } + + case Type::type::BOOL: { + const double true_probability = + GetMetadata(field.metadata().get(), "true_probability", 0.5); + return Boolean(length, true_probability, null_probability); + } + + GENERATE_INTEGRAL_CASE(UInt8Type); + GENERATE_INTEGRAL_CASE(Int8Type); + GENERATE_INTEGRAL_CASE(UInt16Type); + GENERATE_INTEGRAL_CASE(Int16Type); + GENERATE_INTEGRAL_CASE(UInt32Type); + GENERATE_INTEGRAL_CASE(Int32Type); + GENERATE_INTEGRAL_CASE(UInt64Type); + GENERATE_INTEGRAL_CASE(Int64Type); + GENERATE_INTEGRAL_CASE_VIEW(Int16Type, HalfFloatType); + GENERATE_FLOATING_CASE(FloatType, Float32); + GENERATE_FLOATING_CASE(DoubleType, Float64); + + case Type::type::STRING: + case Type::type::BINARY: { + const auto min_length = + GetMetadata(field.metadata().get(), "min_length", 0); + const auto max_length = + GetMetadata(field.metadata().get(), "max_length", 1024); + const auto unique_values = + GetMetadata(field.metadata().get(), "unique", -1); + if (unique_values > 0) { + return *StringWithRepeats(length, unique_values, min_length, max_length, + null_probability) + ->View(field.type()); + } + return *String(length, min_length, max_length, null_probability) + ->View(field.type()); + } + + case Type::type::DECIMAL128: + case Type::type::DECIMAL256: + case Type::type::FIXED_SIZE_BINARY: { + auto byte_width = + internal::checked_pointer_cast(field.type())->byte_width(); + return *FixedSizeBinary(length, byte_width, null_probability)->View(field.type()); + } + + GENERATE_INTEGRAL_CASE_VIEW(Int32Type, Date32Type); + GENERATE_INTEGRAL_CASE_VIEW(Int64Type, Date64Type); + GENERATE_INTEGRAL_CASE_VIEW(Int64Type, TimestampType); + GENERATE_INTEGRAL_CASE_VIEW(Int32Type, Time32Type); + GENERATE_INTEGRAL_CASE_VIEW(Int64Type, Time64Type); + GENERATE_INTEGRAL_CASE_VIEW(Int32Type, MonthIntervalType); + + // This isn't as flexible as it could be, but the array-of-structs layout of this + // type means it's not a (useful) composition of other generators + GENERATE_INTEGRAL_CASE_VIEW(Int64Type, 
DayTimeIntervalType); + + GENERATE_LIST_CASE(ListArray); + + case Type::type::STRUCT: { + ArrayVector child_arrays(field.type()->num_fields()); + std::vector field_names; + for (int i = 0; i < field.type()->num_fields(); i++) { + const auto& child_field = field.type()->field(i); + child_arrays[i] = ArrayOf(*child_field, length); + field_names.push_back(child_field->name()); + } + return *StructArray::Make(child_arrays, field_names, + NullBitmap(length, null_probability)); + } + + case Type::type::SPARSE_UNION: + case Type::type::DENSE_UNION: { + ArrayVector child_arrays(field.type()->num_fields()); + for (int i = 0; i < field.type()->num_fields(); i++) { + const auto& child_field = field.type()->field(i); + child_arrays[i] = ArrayOf(*child_field, length); + } + auto array = field.type()->id() == Type::type::SPARSE_UNION + ? SparseUnion(child_arrays, length) + : DenseUnion(child_arrays, length); + return *array->View(field.type()); + } + + case Type::type::DICTIONARY: { + const auto values_length = + GetMetadata(field.metadata().get(), "values", 4); + auto dict_type = internal::checked_pointer_cast(field.type()); + // TODO: no way to control generation of dictionary + auto values = + ArrayOf(*arrow::field("temporary", dict_type->value_type(), /*nullable=*/false), + values_length); + auto merged = field.metadata() ? field.metadata() : key_value_metadata({}, {}); + if (merged->Contains("min")) + ABORT_NOT_OK(Status::Invalid(field.ToString(), ": cannot specify min")); + if (merged->Contains("max")) + ABORT_NOT_OK(Status::Invalid(field.ToString(), ": cannot specify max")); + merged = merged->Merge(*key_value_metadata( + {{"min", "0"}, {"max", std::to_string(values_length - 1)}})); + auto indices = ArrayOf( + *arrow::field("temporary", dict_type->index_type(), field.nullable(), merged), + length); + return *DictionaryArray::FromArrays(field.type(), indices, values); + } + + case Type::type::MAP: { + const auto values_length = GetMetadata(field.metadata().get(), "values", + static_cast(length)); + const auto force_empty_nulls = + GetMetadata(field.metadata().get(), "force_empty_nulls", false); + auto map_type = internal::checked_pointer_cast(field.type()); + auto keys = ArrayOf(*map_type->key_field(), values_length); + auto items = ArrayOf(*map_type->item_field(), values_length); + // need N + 1 offsets to have N values + auto offsets = + Offsets(length + 1, 0, values_length, null_probability, force_empty_nulls); + return *MapArray::FromArrays(map_type, offsets, keys, items); + } + + case Type::type::EXTENSION: + // Could be supported by generating the storage type (though any extension + // invariants wouldn't be preserved) + break; + + case Type::type::FIXED_SIZE_LIST: { + auto list_type = internal::checked_pointer_cast(field.type()); + const int64_t values_length = list_type->list_size() * length; + auto values = ArrayOf(*list_type->value_field(), values_length); + auto null_bitmap = NullBitmap(length, null_probability); + return std::make_shared(list_type, length, values, null_bitmap); + } + + GENERATE_INTEGRAL_CASE_VIEW(Int64Type, DurationType); + + case Type::type::LARGE_STRING: + case Type::type::LARGE_BINARY: { + const auto min_length = + GetMetadata(field.metadata().get(), "min_length", 0); + const auto max_length = + GetMetadata(field.metadata().get(), "max_length", 1024); + const auto unique_values = + GetMetadata(field.metadata().get(), "unique", -1); + if (unique_values > 0) { + ABORT_NOT_OK( + Status::NotImplemented("Generating random array with repeated values for " + "large 
string/large binary types")); + } + return *LargeString(length, min_length, max_length, null_probability) + ->View(field.type()); + } + + GENERATE_LIST_CASE(LargeListArray); + + default: + break; + } +#undef GENERATE_INTEGRAL_CASE_VIEW +#undef GENERATE_INTEGRAL_CASE +#undef GENERATE_FLOATING_CASE +#undef GENERATE_LIST_CASE +#undef VALIDATE_RANGE +#undef VALIDATE_MIN_MAX + + ABORT_NOT_OK( + Status::NotImplemented("Generating random array for field ", field.ToString())); + return nullptr; +} + +std::shared_ptr RandomArrayGenerator::BatchOf( + const FieldVector& fields, int64_t length) { + std::vector> arrays(fields.size()); + for (size_t i = 0; i < fields.size(); i++) { + const auto& field = fields[i]; + arrays[i] = ArrayOf(*field, length); + } + return RecordBatch::Make(schema(fields), length, std::move(arrays)); +} + +std::shared_ptr GenerateArray(const Field& field, int64_t length, + SeedType seed) { + return RandomArrayGenerator(seed).ArrayOf(field, length); +} + +std::shared_ptr GenerateBatch(const FieldVector& fields, + int64_t length, SeedType seed) { + return RandomArrayGenerator(seed).BatchOf(fields, length); +} + } // namespace random } // namespace arrow diff --git a/cpp/src/arrow/testing/random.h b/cpp/src/arrow/testing/random.h index 2358ab0911f..5c6b0b4ae77 100644 --- a/cpp/src/arrow/testing/random.h +++ b/cpp/src/arrow/testing/random.h @@ -249,6 +249,10 @@ class ARROW_TESTING_EXPORT RandomArrayGenerator { double null_probability = 0, bool force_empty_nulls = false); + std::shared_ptr LargeOffsets(int64_t size, int64_t first_offset, + int64_t last_offset, double null_probability = 0, + bool force_empty_nulls = false); + /// \brief Generate a random StringArray /// /// \param[in] size the size of the array to generate @@ -351,6 +355,57 @@ class ARROW_TESTING_EXPORT RandomArrayGenerator { std::shared_ptr ArrayOf(std::shared_ptr type, int64_t size, double null_probability); + /// \brief Generate an array with random data based on the given field. See BatchOf + /// for usage info. + std::shared_ptr ArrayOf(const Field& field, int64_t size); + + /// \brief Generate a record batch with random data of the specified length. + /// + /// Generation options are read from key-value metadata for each field, and may be + /// specified at any nesting level. For example, generation options for the child + /// values of a list array can be specified by constructing the list type with + /// list(field("item", int8(), options_metadata)) + /// + /// The following options are supported: + /// + /// For all types except NullType: + /// - null_probability (double): range [0.0, 1.0] the probability of a null value. + /// Default/value is 0.0 if the field is marked non-nullable, else it is 0.01 + /// + /// For all numeric types T: + /// - min (T::c_type): the minimum value to generate (inclusive), default + /// std::numeric_limits::min() + /// - max (T::c_type): the maximum value to generate (inclusive), default + /// std::numeric_limits::max() + /// Note this means that, for example, min/max are int16_t values for HalfFloatType. + /// + /// For floating point types T for which is_physical_floating_type: + /// - nan_probability (double): range [0.0, 1.0] the probability of a NaN value. + /// + /// For BooleanType: + /// - true_probability (double): range [0.0, 1.0] the probability of a true. + /// + /// For DictionaryType: + /// - values (int32_t): the size of the dictionary. + /// Other properties are passed to the generator for the dictionary indices. However, + /// min and max cannot be specified. 
Note it is not possible to otherwise customize
+  ///   the generation of dictionary values.
+  ///
+  /// For list, string, and binary types T, including their large variants:
+  /// - min_length (T::offset_type): the minimum length of the child to generate,
+  ///   default 0
+  /// - max_length (T::offset_type): the maximum length of the child to generate,
+  ///   default 1024
+  ///
+  /// For string and binary types T (not including their large variants):
+  /// - unique (int32_t): if positive, this many distinct values will be generated
+  ///   and all array values will be one of these values, default -1
+  ///
+  /// For MapType:
+  /// - values (int32_t): the number of key-value pairs to generate, which will be
+  ///   partitioned among the array values.
+  std::shared_ptr<RecordBatch> BatchOf(const FieldVector& fields, int64_t size);
+
   SeedType seed() { return seed_distribution_(seed_rng_); }
 
  private:
@@ -358,6 +413,16 @@ class ARROW_TESTING_EXPORT RandomArrayGenerator {
   std::default_random_engine seed_rng_;
 };
 
+/// Generate a record batch with random data. See RandomArrayGenerator::BatchOf.
+ARROW_TESTING_EXPORT
+std::shared_ptr<RecordBatch> GenerateBatch(const FieldVector& fields, int64_t size,
+                                           SeedType seed);
+
+/// Generate an array with random data. See RandomArrayGenerator::BatchOf.
+ARROW_TESTING_EXPORT
+std::shared_ptr<Array> GenerateArray(const Field& field, int64_t size,
+                                     SeedType seed);
+
 }  // namespace random
 
 //
diff --git a/cpp/src/arrow/testing/random_test.cc b/cpp/src/arrow/testing/random_test.cc
new file mode 100644
index 00000000000..42624eb3af8
--- /dev/null
+++ b/cpp/src/arrow/testing/random_test.cc
@@ -0,0 +1,331 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+#include + +#include "arrow/array.h" +#include "arrow/record_batch.h" +#include "arrow/testing/gtest_util.h" +#include "arrow/testing/random.h" +#include "arrow/type.h" +#include "arrow/util/key_value_metadata.h" + +namespace arrow { +namespace random { + +// Use short arrays since especially in debug mode, generating list(list()) is slow +constexpr int64_t kExpectedLength = 24; + +class RandomArrayTest : public ::testing::TestWithParam> { + protected: + std::shared_ptr GetField() { return GetParam(); } +}; + +template +class RandomNumericArrayTest : public ::testing::Test { + protected: + std::shared_ptr GetField() { return field("field0", std::make_shared()); } + + std::shared_ptr> Downcast(std::shared_ptr array) { + return internal::checked_pointer_cast>(array); + } +}; + +TEST_P(RandomArrayTest, GenerateArray) { + auto field = GetField(); + auto array = GenerateArray(*field, kExpectedLength, 0xDEADBEEF); + AssertTypeEqual(field->type(), array->type()); + ASSERT_EQ(kExpectedLength, array->length()); + ASSERT_OK(array->ValidateFull()); +} + +TEST_P(RandomArrayTest, GenerateBatch) { + auto field = GetField(); + auto batch = GenerateBatch({field}, kExpectedLength, 0xDEADBEEF); + AssertSchemaEqual(schema({field}), batch->schema()); + auto array = batch->column(0); + ASSERT_EQ(kExpectedLength, array->length()); + ASSERT_OK(array->ValidateFull()); +} + +TEST_P(RandomArrayTest, GenerateZeroLengthArray) { + auto field = GetField(); + if (field->type()->id() == Type::type::DENSE_UNION) { + GTEST_SKIP() << "Cannot generate zero-length dense union arrays"; + } + auto array = GenerateArray(*field, 0, 0xDEADBEEF); + AssertTypeEqual(field->type(), array->type()); + ASSERT_EQ(0, array->length()); + ASSERT_OK(array->ValidateFull()); +} + +TEST_P(RandomArrayTest, GenerateArrayWithZeroNullProbability) { + auto field = + GetField()->WithMetadata(key_value_metadata({{"null_probability", "0.0"}})); + if (field->type()->id() == Type::type::NA) { + GTEST_SKIP() << "Cannot generate non-null null arrays"; + } + auto batch = GenerateBatch({field}, kExpectedLength, 0xDEADBEEF); + AssertSchemaEqual(schema({field}), batch->schema()); + auto array = batch->column(0); + ASSERT_OK(array->ValidateFull()); + ASSERT_EQ(0, array->null_count()); +} + +TEST_P(RandomArrayTest, GenerateNonNullableArray) { + auto field = GetField()->WithNullable(false); + if (field->type()->id() == Type::type::NA) { + GTEST_SKIP() << "Cannot generate non-null null arrays"; + } + auto batch = GenerateBatch({field}, kExpectedLength, 0xDEADBEEF); + AssertSchemaEqual(schema({field}), batch->schema()); + auto array = batch->column(0); + ASSERT_OK(array->ValidateFull()); + ASSERT_EQ(0, array->null_count()); +} + +auto values = ::testing::Values( + field("null", null()), field("bool", boolean()), field("uint8", uint8()), + field("int8", int8()), field("uint16", uint16()), field("int16", int16()), + field("uint32", uint32()), field("int32", int32()), field("uint64", uint64()), + field("int64", int64()), field("float16", float16()), field("float32", float32()), + field("float64", float64()), field("string", utf8()), field("binary", binary()), + field("fixed_size_binary", fixed_size_binary(8)), + field("decimal128", decimal128(8, 3)), field("decimal256", decimal256(16, 4)), + field("date32", date32()), field("date64", date64()), + field("timestampns", timestamp(TimeUnit::NANO)), + field("timestamps", timestamp(TimeUnit::SECOND, "America/Phoenix")), + field("time32ms", time32(TimeUnit::MILLI)), field("time64ns", time64(TimeUnit::NANO)), + 
field("time32s", time32(TimeUnit::SECOND)), + field("time64us", time64(TimeUnit::MICRO)), field("month_interval", month_interval()), + field("daytime_interval", day_time_interval()), field("listint8", list(int8())), + field("listlistint8", list(list(int8()))), + field("listint8emptynulls", list(int8()), true, + key_value_metadata({{"force_empty_nulls", "true"}})), + field("listint81024values", list(int8()), true, + key_value_metadata({{"values", "1024"}})), + field("structints", struct_({ + field("int8", int8()), + field("int16", int16()), + field("int32", int32()), + })), + field("structnested", struct_({ + field("string", utf8()), + field("list", list(int64())), + field("timestamp", timestamp(TimeUnit::MILLI)), + })), + field("sparseunion", sparse_union({ + field("int8", int8()), + field("int16", int16()), + field("int32", int32()), + })), + field("denseunion", dense_union({ + field("int8", int8()), + field("int16", int16()), + field("int32", int32()), + })), + field("dictionary", dictionary(int8(), utf8())), field("map", map(int8(), utf8())), + field("fixedsizelist", fixed_size_list(int8(), 4)), + field("durationns", duration(TimeUnit::NANO)), field("largestring", large_utf8()), + field("largebinary", large_binary()), + field("largelistlistint8", large_list(list(int8())))); + +INSTANTIATE_TEST_SUITE_P( + TestRandomArrayGeneration, RandomArrayTest, values, + [](const ::testing::TestParamInfo& info) { + return std::to_string(info.index) + info.param->name(); + }); + +using NumericTypes = + ::testing::Types; +TYPED_TEST_SUITE(RandomNumericArrayTest, NumericTypes); + +TYPED_TEST(RandomNumericArrayTest, GenerateMinMax) { + auto field = this->GetField()->WithMetadata( + key_value_metadata({{"min", "0"}, {"max", "127"}, {"nan_probability", "0.0"}})); + auto batch = GenerateBatch({field}, kExpectedLength, 0xDEADBEEF); + AssertSchemaEqual(schema({field}), batch->schema()); + auto array = this->Downcast(batch->column(0)); + for (auto slot : *array) { + if (!slot.has_value()) continue; + ASSERT_GE(slot, typename TypeParam::c_type(0)); + ASSERT_LE(slot, typename TypeParam::c_type(127)); + } +} + +// Test all the supported options +TEST(TypeSpecificTests, BoolTrueProbability) { + auto field = + arrow::field("bool", boolean(), key_value_metadata({{"true_probability", "1.0"}})); + auto base_array = GenerateArray(*field, kExpectedLength, 0xDEADBEEF); + AssertTypeEqual(field->type(), base_array->type()); + auto array = internal::checked_pointer_cast(base_array); + ASSERT_OK(array->ValidateFull()); + for (const auto& value : *array) { + ASSERT_TRUE(!value.has_value() || *value); + } +} + +TEST(TypeSpecificTests, DictionaryValues) { + auto field = arrow::field("dictionary", dictionary(int8(), utf8()), + key_value_metadata({{"values", "16"}})); + auto base_array = GenerateArray(*field, kExpectedLength, 0xDEADBEEF); + AssertTypeEqual(field->type(), base_array->type()); + auto array = internal::checked_pointer_cast(base_array); + ASSERT_OK(array->ValidateFull()); + ASSERT_EQ(16, array->dictionary()->length()); +} + +TEST(TypeSpecificTests, Float32Nan) { + auto field = arrow::field("float32", float32(), + key_value_metadata({{"nan_probability", "1.0"}})); + auto base_array = GenerateArray(*field, kExpectedLength, 0xDEADBEEF); + AssertTypeEqual(field->type(), base_array->type()); + auto array = internal::checked_pointer_cast>(base_array); + ASSERT_OK(array->ValidateFull()); + for (const auto& value : *array) { + ASSERT_TRUE(!value.has_value() || std::isnan(*value)); + } +} + +TEST(TypeSpecificTests, 
Float64Nan) { + auto field = arrow::field("float64", float64(), + key_value_metadata({{"nan_probability", "1.0"}})); + auto base_array = GenerateArray(*field, kExpectedLength, 0xDEADBEEF); + AssertTypeEqual(field->type(), base_array->type()); + auto array = internal::checked_pointer_cast>(base_array); + ASSERT_OK(array->ValidateFull()); + for (const auto& value : *array) { + ASSERT_TRUE(!value.has_value() || std::isnan(*value)); + } +} + +TEST(TypeSpecificTests, ListLengths) { + { + auto field = + arrow::field("list", list(int8()), + key_value_metadata({{"min_length", "1"}, {"max_length", "1"}})); + auto base_array = GenerateArray(*field, kExpectedLength, 0xDEADBEEF); + AssertTypeEqual(field->type(), base_array->type()); + auto array = internal::checked_pointer_cast(base_array); + ASSERT_OK(array->ValidateFull()); + for (int i = 0; i < kExpectedLength; i++) { + if (!array->IsNull(i)) { + ASSERT_EQ(1, array->value_length(i)); + } + } + } + { + auto field = + arrow::field("list", large_list(int8()), + key_value_metadata({{"min_length", "10"}, {"max_length", "10"}})); + auto base_array = GenerateArray(*field, kExpectedLength, 0xDEADBEEF); + AssertTypeEqual(field->type(), base_array->type()); + auto array = internal::checked_pointer_cast(base_array); + ASSERT_OK(array->ValidateFull()); + for (int i = 0; i < kExpectedLength; i++) { + if (!array->IsNull(i)) { + ASSERT_EQ(10, array->value_length(i)); + } + } + } +} + +TEST(TypeSpecificTests, MapValues) { + auto field = + arrow::field("map", map(int8(), int8()), key_value_metadata({{"values", "4"}})); + auto base_array = GenerateArray(*field, kExpectedLength, 0xDEADBEEF); + AssertTypeEqual(field->type(), base_array->type()); + auto array = internal::checked_pointer_cast(base_array); + ASSERT_OK(array->ValidateFull()); + ASSERT_EQ(4, array->keys()->length()); + ASSERT_EQ(4, array->items()->length()); +} + +TEST(TypeSpecificTests, RepeatedStrings) { + auto field = arrow::field("string", utf8(), key_value_metadata({{"unique", "1"}})); + auto base_array = GenerateArray(*field, kExpectedLength, 0xDEADBEEF); + AssertTypeEqual(field->type(), base_array->type()); + auto array = internal::checked_pointer_cast(base_array); + ASSERT_OK(array->ValidateFull()); + util::string_view singular_value = array->GetView(0); + for (auto slot : *array) { + if (!slot.has_value()) continue; + ASSERT_EQ(slot, singular_value); + } + // N.B. 
LargeString does not support unique
+}
+
+TEST(TypeSpecificTests, StringLengths) {
+  {
+    auto field = arrow::field(
+        "list", utf8(), key_value_metadata({{"min_length", "1"}, {"max_length", "1"}}));
+    auto base_array = GenerateArray(*field, kExpectedLength, 0xDEADBEEF);
+    AssertTypeEqual(field->type(), base_array->type());
+    auto array = internal::checked_pointer_cast<StringArray>(base_array);
+    ASSERT_OK(array->ValidateFull());
+    for (int i = 0; i < kExpectedLength; i++) {
+      if (!array->IsNull(i)) {
+        ASSERT_EQ(1, array->value_length(i));
+      }
+    }
+  }
+  {
+    auto field = arrow::field(
+        "list", binary(), key_value_metadata({{"min_length", "1"}, {"max_length", "1"}}));
+    auto base_array = GenerateArray(*field, kExpectedLength, 0xDEADBEEF);
+    AssertTypeEqual(field->type(), base_array->type());
+    auto array = internal::checked_pointer_cast<BinaryArray>(base_array);
+    ASSERT_OK(array->ValidateFull());
+    for (int i = 0; i < kExpectedLength; i++) {
+      if (!array->IsNull(i)) {
+        ASSERT_EQ(1, array->value_length(i));
+      }
+    }
+  }
+  {
+    auto field =
+        arrow::field("list", large_utf8(),
+                     key_value_metadata({{"min_length", "10"}, {"max_length", "10"}}));
+    auto base_array = GenerateArray(*field, kExpectedLength, 0xDEADBEEF);
+    AssertTypeEqual(field->type(), base_array->type());
+    auto array = internal::checked_pointer_cast<LargeStringArray>(base_array);
+    ASSERT_OK(array->ValidateFull());
+    for (int i = 0; i < kExpectedLength; i++) {
+      if (!array->IsNull(i)) {
+        ASSERT_EQ(10, array->value_length(i));
+      }
+    }
+  }
+  {
+    auto field =
+        arrow::field("list", large_binary(),
+                     key_value_metadata({{"min_length", "10"}, {"max_length", "10"}}));
+    auto base_array = GenerateArray(*field, kExpectedLength, 0xDEADBEEF);
+    AssertTypeEqual(field->type(), base_array->type());
+    auto array = internal::checked_pointer_cast<LargeBinaryArray>(base_array);
+    ASSERT_OK(array->ValidateFull());
+    for (int i = 0; i < kExpectedLength; i++) {
+      if (!array->IsNull(i)) {
+        ASSERT_EQ(10, array->value_length(i));
+      }
+    }
+  }
+}
+
+}  // namespace random
+}  // namespace arrow
diff --git a/cpp/src/arrow/type.cc b/cpp/src/arrow/type.cc
index 0a9f50529c2..344585446fc 100644
--- a/cpp/src/arrow/type.cc
+++ b/cpp/src/arrow/type.cc
@@ -2233,6 +2233,12 @@ std::shared_ptr<Field> field(std::string name, std::shared_ptr<DataType> type,
                                  std::move(metadata));
 }
 
+std::shared_ptr<Field> field(std::string name, std::shared_ptr<DataType> type,
+                             std::shared_ptr<const KeyValueMetadata> metadata) {
+  return std::make_shared<Field>(std::move(name), std::move(type), /*nullable=*/true,
+                                 std::move(metadata));
+}
+
 std::shared_ptr<DataType> decimal(int32_t precision, int32_t scale) {
   return precision <= Decimal128Type::kMaxPrecision ? decimal128(precision, scale)
                                                     : decimal256(precision, scale);
diff --git a/cpp/src/arrow/type_fwd.h b/cpp/src/arrow/type_fwd.h
index 230c1ff6cb6..168e172bc88 100644
--- a/cpp/src/arrow/type_fwd.h
+++ b/cpp/src/arrow/type_fwd.h
@@ -629,6 +629,17 @@ std::shared_ptr<Field> ARROW_EXPORT
 field(std::string name, std::shared_ptr<DataType> type, bool nullable = true,
       std::shared_ptr<const KeyValueMetadata> metadata = NULLPTR);
 
+/// \brief Create a Field instance with metadata
+///
+/// The field will be assumed to be nullable.
+///
+/// \param name the field name
+/// \param type the field value type
+/// \param metadata any custom key-value metadata
+std::shared_ptr<Field> ARROW_EXPORT
+field(std::string name, std::shared_ptr<DataType> type,
+      std::shared_ptr<const KeyValueMetadata> metadata);
+
 /// \brief Create a Schema instance
 ///
 /// \param fields the schema's fields
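Reviewer note, not part of the patch: the sketch below shows how the metadata-driven generators added in arrow/testing/random.h are intended to be called. The field names, option values, and the helper MakeExampleBatch are illustrative assumptions; only GenerateBatch and the documented metadata keys come from this diff.

```cpp
// Minimal usage sketch of the new metadata-driven random data generation
// (assumed example names; not code from the patch).
#include <memory>

#include "arrow/record_batch.h"
#include "arrow/testing/random.h"
#include "arrow/type.h"
#include "arrow/util/key_value_metadata.h"

std::shared_ptr<arrow::RecordBatch> MakeExampleBatch() {
  using arrow::field;
  using arrow::key_value_metadata;
  // Generation options ride along as key-value metadata on each field.
  arrow::FieldVector fields{
      field("id", arrow::int64(), /*nullable=*/false,
            key_value_metadata({{"min", "0"}, {"max", "1000"}})),
      field("score", arrow::float64(), /*nullable=*/true,
            key_value_metadata({{"null_probability", "0.1"}, {"nan_probability", "0"}})),
      field("tags", arrow::list(arrow::utf8()), /*nullable=*/true,
            key_value_metadata({{"min_length", "0"}, {"max_length", "4"}}))};
  // A fixed seed makes the generated batch reproducible across runs.
  return arrow::random::GenerateBatch(fields, /*size=*/1024, /*seed=*/42);
}
```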