Skip to content
Closed
37 changes: 15 additions & 22 deletions cpp/src/arrow/filesystem/s3fs_benchmark.cc
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@
#include "arrow/table.h"
#include "arrow/testing/gtest_util.h"
#include "arrow/testing/random.h"
#include "arrow/util/key_value_metadata.h"
#include "arrow/util/range.h"

#include "parquet/arrow/reader.h"
Expand Down Expand Up @@ -146,32 +147,24 @@ class MinioFixture : public benchmark::Fixture {
/// Appends integer columns to the beginning (to act as indices).
Status MakeParquetObject(const std::string& path, int num_columns, int num_rows) {
std::vector<std::shared_ptr<ChunkedArray>> columns;
std::vector<std::shared_ptr<Field>> fields;

{
arrow::random::RandomArrayGenerator generator(0);
std::shared_ptr<Array> values = generator.Int64(num_rows, 0, 1e10, 0);
columns.push_back(std::make_shared<ChunkedArray>(values));
fields.push_back(::arrow::field("timestamp", values->type()));
}
{
arrow::random::RandomArrayGenerator generator(1);
std::shared_ptr<Array> values = generator.Int32(num_rows, 0, 1e9, 0);
columns.push_back(std::make_shared<ChunkedArray>(values));
fields.push_back(::arrow::field("val", values->type()));
}

FieldVector fields{
field("timestamp", int64(), /*nullable=*/true,
key_value_metadata(
{{"min", "0"}, {"max", "10000000000"}, {"null_probability", "0"}})),
field("val", int32(), /*nullable=*/true,
key_value_metadata(
{{"min", "0"}, {"max", "1000000000"}, {"null_probability", "0"}}))};
for (int i = 0; i < num_columns; i++) {
arrow::random::RandomArrayGenerator generator(i);
std::shared_ptr<Array> values = generator.Float64(num_rows, -1.e10, 1e10, 0);
std::stringstream ss;
ss << "col" << i;
columns.push_back(std::make_shared<ChunkedArray>(values));
fields.push_back(::arrow::field(ss.str(), values->type()));
fields.push_back(
field(ss.str(), float64(), /*nullable=*/true,
key_value_metadata(
{{"min", "-1.e10"}, {"max", "1e10"}, {"null_probability", "0"}})));
}
auto schema = std::make_shared<::arrow::Schema>(fields);

std::shared_ptr<Table> table = Table::Make(schema, columns);
auto batch = random::GenerateBatch(fields, num_rows, 0);
ARROW_ASSIGN_OR_RAISE(std::shared_ptr<Table> table,
Table::FromRecordBatches({batch}));

std::shared_ptr<io::OutputStream> sink;
ARROW_ASSIGN_OR_RAISE(sink, fs_->OpenOutputStream(path));
Expand Down
4 changes: 4 additions & 0 deletions cpp/src/arrow/testing/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,10 @@

arrow_install_all_headers("arrow/testing")

# Register the random_test unit test; only done when ARROW_BUILD_TESTS is enabled.
if(ARROW_BUILD_TESTS)
add_arrow_test(random_test)
endif()

# json_integration_test is two things at the same time:
# - an executable that can be called to answer integration test requests
# - a self-(unit)test for the C++ side of integration testing
Expand Down
372 changes: 362 additions & 10 deletions cpp/src/arrow/testing/random.cc

Large diffs are not rendered by default.

65 changes: 65 additions & 0 deletions cpp/src/arrow/testing/random.h
Original file line number Diff line number Diff line change
Expand Up @@ -249,6 +249,10 @@ class ARROW_TESTING_EXPORT RandomArrayGenerator {
double null_probability = 0,
bool force_empty_nulls = false);

/// NOTE(review): undocumented here. Judging by the name and the int64_t
/// first_offset/last_offset parameters, this is presumably the 64-bit offset
/// counterpart of the offsets generator declared just above -- confirm against the
/// implementation in random.cc before relying on parameter semantics.
std::shared_ptr<Array> LargeOffsets(int64_t size, int64_t first_offset,
int64_t last_offset, double null_probability = 0,
bool force_empty_nulls = false);

/// \brief Generate a random StringArray
///
/// \param[in] size the size of the array to generate
Expand Down Expand Up @@ -351,13 +355,74 @@ class ARROW_TESTING_EXPORT RandomArrayGenerator {
std::shared_ptr<Array> ArrayOf(std::shared_ptr<DataType> type, int64_t size,
double null_probability);

/// \brief Generate an array with random data based on the given field.
///
/// See BatchOf for usage info, including the generation options that are read
/// from the field's key-value metadata.
///
/// \param[in] field the field describing the array to generate
/// \param[in] size the length of the array to generate
std::shared_ptr<Array> ArrayOf(const Field& field, int64_t size);

/// \brief Generate a record batch with random data of the specified length.
///
/// Generation options are read from key-value metadata for each field, and may be
/// specified at any nesting level. For example, generation options for the child
/// values of a list array can be specified by constructing the list type with
/// list(field("item", int8(), options_metadata))
///
/// The following options are supported:
///
/// For all types except NullType:
/// - null_probability (double): range [0.0, 1.0] the probability of a null value.
/// Default value is 0.0 if the field is marked non-nullable, else it is 0.01
///
/// For all numeric types T:
/// - min (T::c_type): the minimum value to generate (inclusive), default
/// std::numeric_limits<T::c_type>::min()
/// - max (T::c_type): the maximum value to generate (inclusive), default
/// std::numeric_limits<T::c_type>::max()
/// Note this means that, for example, min/max are int16_t values for HalfFloatType.
///
/// For floating point types T for which is_physical_floating_type<T>:
/// - nan_probability (double): range [0.0, 1.0] the probability of a NaN value.
///
/// For BooleanType:
/// - true_probability (double): range [0.0, 1.0] the probability of a true value.
///
/// For DictionaryType:
/// - values (int32_t): the size of the dictionary.
/// Other properties are passed to the generator for the dictionary indices. However,
/// min and max cannot be specified. Note it is not possible to otherwise customize
/// the generation of dictionary values.
///
/// For list, string, and binary types T, including their large variants:
/// - min_length (T::offset_type): the minimum length of the child to generate,
/// default 0
/// - max_length (T::offset_type): the maximum length of the child to generate,
/// default 1024
///
/// For string and binary types T (not including their large variants):
/// - unique (int32_t): if positive, this many distinct values will be generated
/// and all array values will be one of these values, default -1
///
/// For MapType:
/// - values (int32_t): the number of key-value pairs to generate, which will be
/// partitioned among the array values.
///
/// \param[in] fields the fields (one per column) describing the batch to generate
/// \param[in] size the number of rows in the generated batch
std::shared_ptr<arrow::RecordBatch> BatchOf(const FieldVector& fields, int64_t size);

/// \brief Draw the next seed value from this generator's seed distribution.
///
/// Each call samples seed_distribution_ using seed_rng_ as the random source.
SeedType seed() {
  const SeedType next_seed = seed_distribution_(seed_rng_);
  return next_seed;
}

private:
std::uniform_int_distribution<SeedType> seed_distribution_;
std::default_random_engine seed_rng_;
};

/// Generate a record batch with random data. See RandomArrayGenerator::BatchOf
/// for the generation options that can be set through field metadata.
ARROW_TESTING_EXPORT
std::shared_ptr<arrow::RecordBatch> GenerateBatch(const FieldVector& fields, int64_t size,
SeedType seed);

/// Generate an array with random data for the given field. See
/// RandomArrayGenerator::BatchOf for the generation options that can be set
/// through field metadata.
ARROW_TESTING_EXPORT
std::shared_ptr<arrow::Array> GenerateArray(const Field& field, int64_t size,
SeedType seed);

} // namespace random

//
Expand Down
Loading