Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 6 additions & 1 deletion cpp/src/arrow/compute/api_vector.h
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,12 @@ Result<Datum> Filter(const Datum& values, const Datum& filter,
ExecContext* ctx = NULLPTR);

/// \brief Options controlling a Take operation
struct ARROW_EXPORT TakeOptions : public FunctionOptions {
  /// \param[in] boundscheck value stored in the `boundscheck` member
  /// (true when omitted)
  explicit TakeOptions(bool boundscheck = true) : boundscheck(boundscheck) {}

  /// Bounds-checking flag carried by these options.
  /// NOTE(review): how the kernels consume this flag is not visible here.
  bool boundscheck = true;

  /// \brief Options with `boundscheck` set to true
  static TakeOptions BoundsCheck() { return TakeOptions(true); }
  /// \brief Options with `boundscheck` set to false
  static TakeOptions NoBoundsCheck() { return TakeOptions(false); }
  /// \brief Default options; identical to BoundsCheck()
  static TakeOptions Defaults() { return BoundsCheck(); }
};

/// \brief Take from an array of values at indices in another array
Expand Down
28 changes: 20 additions & 8 deletions cpp/src/arrow/compute/benchmark_util.h
Original file line number Diff line number Diff line change
Expand Up @@ -56,9 +56,10 @@ void BenchmarkSetArgsWithSizes(benchmark::internal::Benchmark* bench,
const std::vector<int64_t>& sizes = kMemorySizes) {
bench->Unit(benchmark::kMicrosecond);

// 0 is treated as "no nulls"
for (const auto size : sizes) {
for (const auto inverse_null_proportion :
std::vector<ArgsType>({10000, 100, 10, 2, 1})) {
std::vector<ArgsType>({10000, 100, 10, 2, 1, 0})) {
bench->Args({static_cast<ArgsType>(size), inverse_null_proportion});
}
}
Expand All @@ -80,21 +81,32 @@ struct RegressionArgs {
const int64_t size;

// proportion of nulls in generated arrays
const double null_proportion;

explicit RegressionArgs(benchmark::State& state)
: size(state.range(0)),
null_proportion(std::min(1., 1. / static_cast<double>(state.range(1)))),
state_(state) {}
double null_proportion;

// If size_is_bytes is true, then it's a number of bytes, otherwise it's the
// number of items processed (for reporting)
explicit RegressionArgs(benchmark::State& state, bool size_is_bytes = true)
: size(state.range(0)), state_(state), size_is_bytes_(size_is_bytes) {
if (state.range(1) == 0) {
this->null_proportion = 0.0;
} else {
this->null_proportion = std::min(1., 1. / static_cast<double>(state.range(1)));
}
}

// Publish the "size" and "null_percent" counters and report throughput on the
// benchmark state. `size` is reported as bytes when size_is_bytes_ is set and
// as items otherwise; exactly one of the two is reported.
~RegressionArgs() {
  state_.counters["size"] = static_cast<double>(size);
  state_.counters["null_percent"] = null_proportion * 100;
  if (size_is_bytes_) {
    state_.SetBytesProcessed(state_.iterations() * size);
  } else {
    state_.SetItemsProcessed(state_.iterations() * size);
  }
}

private:
benchmark::State& state_;
bool size_is_bytes_;
};

} // namespace compute
Expand Down
82 changes: 81 additions & 1 deletion cpp/src/arrow/compute/kernel.cc
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@
#include "arrow/compute/exec.h"
#include "arrow/compute/util_internal.h"
#include "arrow/result.h"
#include "arrow/type_traits.h"
#include "arrow/util/bit_util.h"
#include "arrow/util/checked_cast.h"
#include "arrow/util/hash_util.h"
Expand Down Expand Up @@ -96,7 +97,6 @@ class SameTypeIdMatcher : public TypeMatcher {
if (this == &other) {
return true;
}

auto casted = dynamic_cast<const SameTypeIdMatcher*>(&other);
if (casted == nullptr) {
return false;
Expand Down Expand Up @@ -150,6 +150,86 @@ std::shared_ptr<TypeMatcher> TimestampUnit(TimeUnit::type unit) {
return std::make_shared<TimestampUnitMatcher>(unit);
}

// Matcher accepting every type whose id satisfies is_integer().
class IntegerMatcher : public TypeMatcher {
 public:
  IntegerMatcher() = default;

  bool Matches(const DataType& type) const override {
    const auto type_id = type.id();
    return is_integer(type_id);
  }

  // The matcher carries no state, so any other IntegerMatcher compares equal.
  bool Equals(const TypeMatcher& other) const override {
    return this == &other || dynamic_cast<const IntegerMatcher*>(&other) != nullptr;
  }

  std::string ToString() const override { return "integer"; }
};

// Factory returning a freshly allocated IntegerMatcher.
std::shared_ptr<TypeMatcher> Integer() {
  std::shared_ptr<TypeMatcher> matcher = std::make_shared<IntegerMatcher>();
  return matcher;
}

// Matcher accepting every type whose id satisfies is_primitive().
class PrimitiveMatcher : public TypeMatcher {
 public:
  PrimitiveMatcher() = default;

  bool Matches(const DataType& type) const override {
    const auto type_id = type.id();
    return is_primitive(type_id);
  }

  // The matcher carries no state, so any other PrimitiveMatcher compares equal.
  bool Equals(const TypeMatcher& other) const override {
    return this == &other || dynamic_cast<const PrimitiveMatcher*>(&other) != nullptr;
  }

  std::string ToString() const override { return "primitive"; }
};

// Factory returning a freshly allocated PrimitiveMatcher.
std::shared_ptr<TypeMatcher> Primitive() {
  std::shared_ptr<TypeMatcher> matcher = std::make_shared<PrimitiveMatcher>();
  return matcher;
}

// Matcher accepting every type whose id satisfies is_binary_like().
class BinaryLikeMatcher : public TypeMatcher {
 public:
  BinaryLikeMatcher() = default;

  bool Matches(const DataType& type) const override {
    const auto type_id = type.id();
    return is_binary_like(type_id);
  }

  // The matcher carries no state, so any other BinaryLikeMatcher compares equal.
  bool Equals(const TypeMatcher& other) const override {
    return this == &other ||
           dynamic_cast<const BinaryLikeMatcher*>(&other) != nullptr;
  }

  std::string ToString() const override { return "binary-like"; }
};

// Factory returning a freshly allocated BinaryLikeMatcher.
std::shared_ptr<TypeMatcher> BinaryLike() {
  std::shared_ptr<TypeMatcher> matcher = std::make_shared<BinaryLikeMatcher>();
  return matcher;
}

// Matcher accepting every type whose id satisfies is_large_binary_like().
class LargeBinaryLikeMatcher : public TypeMatcher {
 public:
  LargeBinaryLikeMatcher() = default;

  bool Matches(const DataType& type) const override {
    const auto type_id = type.id();
    return is_large_binary_like(type_id);
  }

  // The matcher carries no state, so any other LargeBinaryLikeMatcher compares
  // equal.
  bool Equals(const TypeMatcher& other) const override {
    return this == &other ||
           dynamic_cast<const LargeBinaryLikeMatcher*>(&other) != nullptr;
  }

  std::string ToString() const override { return "large-binary-like"; }
};

// Factory returning a freshly allocated LargeBinaryLikeMatcher.
std::shared_ptr<TypeMatcher> LargeBinaryLike() {
  std::shared_ptr<TypeMatcher> matcher = std::make_shared<LargeBinaryLikeMatcher>();
  return matcher;
}

} // namespace match

// ----------------------------------------------------------------------
Expand Down
13 changes: 13 additions & 0 deletions cpp/src/arrow/compute/kernel.h
Original file line number Diff line number Diff line change
Expand Up @@ -150,6 +150,19 @@ ARROW_EXPORT std::shared_ptr<TypeMatcher> SameTypeId(Type::type type_id);
/// zones can be different.
ARROW_EXPORT std::shared_ptr<TypeMatcher> TimestampUnit(TimeUnit::type unit);

/// \brief Match any integer type
ARROW_EXPORT std::shared_ptr<TypeMatcher> Integer();

/// \brief Match types using 32-bit varbinary representation
ARROW_EXPORT std::shared_ptr<TypeMatcher> BinaryLike();

/// \brief Match types using 64-bit varbinary representation
ARROW_EXPORT std::shared_ptr<TypeMatcher> LargeBinaryLike();

/// \brief Match any primitive type (boolean or any type representable as a C
/// Type)
ARROW_EXPORT std::shared_ptr<TypeMatcher> Primitive();

} // namespace match

/// \brief An object used for type- and shape-checking arguments to be passed
Expand Down
36 changes: 13 additions & 23 deletions cpp/src/arrow/compute/kernels/vector_selection_benchmark.cc
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ struct FilterParams {
const double filter_null_proportion;
};

std::vector<int64_t> g_data_sizes = {kL1Size, 1 << 20};
std::vector<int64_t> g_data_sizes = {kL2Size};

// The benchmark state parameter references this vector of cases. Test high and
// low selectivity filters.
Expand Down Expand Up @@ -87,46 +87,36 @@ struct TakeBenchmark {
// Set up a Take benchmark run.
//
// \param state benchmark state; also used to build `args`, whose size is
//   treated as an item count (size_is_bytes=false)
// \param indices_have_nulls whether generated indices use args.null_proportion
//   (otherwise they are generated with no nulls)
// \param monotonic_indices whether the generated indices are sorted before
//   the benchmark loop
TakeBenchmark(benchmark::State& state, bool indices_have_nulls,
              bool monotonic_indices = false)
    : state(state),
      args(state, /*size_is_bytes=*/false),
      rand(kSeed),
      indices_have_nulls(indices_have_nulls),
      // Forward the parameter; hard-coding false here silently disabled the
      // monotonic variants.
      monotonic_indices(monotonic_indices) {}

// Benchmark Take on a random Int64 array of args.size elements with values in
// [-100, 100] and the configured null proportion.
void Int64() {
  auto values = rand.Int64(args.size, -100, 100, args.null_proportion);
  Bench(values);
}

// Benchmark Take on a fixed-size-list array (list size 1) wrapping a random
// Int64 array of args.size elements. The list array reuses the child's null
// bitmap and null count.
void FSLInt64() {
  auto int_array = rand.Int64(args.size, -100, 100, args.null_proportion);
  auto values = std::make_shared<FixedSizeListArray>(
      fixed_size_list(int64(), 1), args.size, int_array, int_array->null_bitmap(),
      int_array->null_count());
  Bench(values);
}

// Benchmark Take on a random string array of args.size elements whose lengths
// lie between 0 and 32, with the configured null proportion.
void String() {
  const int32_t string_min_length = 0;
  const int32_t string_max_length = 32;
  auto values = std::static_pointer_cast<StringArray>(rand.String(
      args.size, string_min_length, string_max_length, args.null_proportion));
  Bench(values);
}

void Bench(const std::shared_ptr<Array>& values) {
bool indices_null_proportion = indices_have_nulls ? args.null_proportion : 0;
double indices_null_proportion = indices_have_nulls ? args.null_proportion : 0;
auto indices =
rand.Int32(static_cast<int32_t>(values->length()), 0,
static_cast<int32_t>(values->length() - 1), indices_null_proportion);
rand.Int32(values->length(), 0, static_cast<int32_t>(values->length() - 1),
indices_null_proportion);

if (monotonic_indices) {
auto arg_sorter = *SortToIndices(*indices);
Expand Down Expand Up @@ -269,20 +259,20 @@ BENCHMARK(FilterStringFilterWithNulls)->Apply(FilterSetArgs);

// Register one argument pair {size, nulls} for every size in g_data_sizes and
// every entry of the `nulls` list below.
// NOTE(review): `nulls` presumably is the inverse null proportion with 0
// meaning "no nulls", matching RegressionArgs — confirm.
void TakeSetArgs(benchmark::internal::Benchmark* bench) {
  for (int64_t size : g_data_sizes) {
    for (auto nulls : std::vector<ArgsType>({1000, 10, 2, 1, 0})) {
      bench->Args({static_cast<ArgsType>(size), nulls});
    }
  }
}

// Take benchmarks, grouped by value type; each is registered exactly once.
BENCHMARK(TakeInt64RandomIndicesNoNulls)->Apply(TakeSetArgs);
BENCHMARK(TakeInt64RandomIndicesWithNulls)->Apply(TakeSetArgs);
BENCHMARK(TakeInt64MonotonicIndices)->Apply(TakeSetArgs);
BENCHMARK(TakeFSLInt64RandomIndicesNoNulls)->Apply(TakeSetArgs);
BENCHMARK(TakeFSLInt64RandomIndicesWithNulls)->Apply(TakeSetArgs);
BENCHMARK(TakeFSLInt64MonotonicIndices)->Apply(TakeSetArgs);
BENCHMARK(TakeStringRandomIndicesNoNulls)->Apply(TakeSetArgs);
BENCHMARK(TakeStringRandomIndicesWithNulls)->Apply(TakeSetArgs);
BENCHMARK(TakeStringMonotonicIndices)->Apply(TakeSetArgs);

} // namespace compute
Expand Down
Loading