Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions cpp/src/arrow/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -340,6 +340,7 @@ if(ARROW_COMPUTE)
compute/kernels/scalar_cast_temporal.cc
compute/kernels/scalar_compare.cc
compute/kernels/scalar_set_lookup.cc
compute/kernels/scalar_string_ascii.cc
compute/kernels/vector_filter.cc
compute/kernels/vector_hash.cc
compute/kernels/vector_sort.cc
Expand Down
2 changes: 1 addition & 1 deletion cpp/src/arrow/compare.cc
Original file line number Diff line number Diff line change
Expand Up @@ -850,7 +850,7 @@ class ScalarEqualsVisitor {
// Equality visitor overload for binary-like scalars: enabled for every scalar
// class T that derives from BaseBinaryScalar. Stores the comparison outcome in
// result_ and returns Status::OK().
template <typename T>
typename std::enable_if<std::is_base_of<BaseBinaryScalar, T>::value, Status>::type
Visit(const T& left) {
  // Cast to the common base rather than to BinaryScalar: this overload accepts
  // any T derived from BaseBinaryScalar, and only the base's `value` member is
  // needed for the comparison below.
  const auto& right = checked_cast<const BaseBinaryScalar&>(right_);
  result_ = internal::SharedPtrEquals(left.value, right.value);
  return Status::OK();
}
Expand Down
3 changes: 2 additions & 1 deletion cpp/src/arrow/compute/kernels/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,8 @@ add_arrow_compute_test(scalar_test
scalar_boolean_test.cc
scalar_cast_test.cc
scalar_compare_test.cc
scalar_set_lookup_test.cc)
scalar_set_lookup_test.cc
scalar_string_test.cc)

add_arrow_benchmark(scalar_compare_benchmark PREFIX "arrow-compute")

Expand Down
5 changes: 5 additions & 0 deletions cpp/src/arrow/compute/kernels/codegen_internal.cc
Original file line number Diff line number Diff line change
Expand Up @@ -102,6 +102,11 @@ const std::vector<std::shared_ptr<DataType>>& BaseBinaryTypes() {
return g_base_binary_types;
}

const std::vector<std::shared_ptr<DataType>>& StringTypes() {
static DataTypeVector types = {utf8(), large_utf8()};
return types;
}

const std::vector<std::shared_ptr<DataType>>& SignedIntTypes() {
std::call_once(codegen_static_initialized, InitStaticData);
return g_signed_int_types;
Expand Down
98 changes: 79 additions & 19 deletions cpp/src/arrow/compute/kernels/codegen_internal.h
Original file line number Diff line number Diff line change
Expand Up @@ -122,20 +122,55 @@ struct UnboxScalar<Type, enable_if_base_binary<Type>> {
};

template <typename Type, typename Enable = void>
struct GetValueType;
struct GetViewType;

template <typename Type>
struct GetValueType<Type, enable_if_has_c_type<Type>> {
struct GetViewType<Type, enable_if_has_c_type<Type>> {
using T = typename Type::c_type;
};

template <typename Type>
struct GetValueType<
struct GetViewType<
Type, enable_if_t<is_base_binary_type<Type>::value || is_decimal_type<Type>::value ||
is_fixed_size_binary_type<Type>::value>> {
using T = util::string_view;
};

// Maps an Arrow type to the C++ type a kernel operator produces for a single
// output value. The nested alias `T` is:
//   - Type::c_type for types that have a C representation,
//   - std::string (an owning value) for string-like types.
// NOTE(review): no specialization is provided here for non-string binary
// types, while BoxScalar's base-binary specialization looks up
// GetOutputType<Type>::T -- confirm that combination is never instantiated.
template <typename Type, typename Enable = void>
struct GetOutputType;

template <typename Type>
struct GetOutputType<Type, enable_if_has_c_type<Type>> {
  using T = typename Type::c_type;
};

template <typename Type>
struct GetOutputType<Type, enable_if_t<is_string_like_type<Type>::value>> {
  using T = std::string;
};

// Wraps a single C++ output value (of type GetOutputType<Type>::T) into a
// heap-allocated Scalar of the matching ScalarType.
template <typename Type, typename Enable = void>
struct BoxScalar;

// Types with a C representation: the scalar is constructed from the value and
// the supplied DataType.
template <typename Type>
struct BoxScalar<Type, enable_if_has_c_type<Type>> {
  using T = typename GetOutputType<Type>::T;
  using ScalarType = typename TypeTraits<Type>::ScalarType;
  static std::shared_ptr<Scalar> Box(T val, const std::shared_ptr<DataType>& type) {
    return std::make_shared<ScalarType>(val, type);
  }
};

// Base-binary types: the scalar is constructed from the value alone; the
// DataType argument is accepted for a uniform signature but not used.
template <typename Type>
struct BoxScalar<Type, enable_if_base_binary<Type>> {
  using T = typename GetOutputType<Type>::T;
  using ScalarType = typename TypeTraits<Type>::ScalarType;
  static std::shared_ptr<Scalar> Box(T val, const std::shared_ptr<DataType>&) {
    // `val` is received by value, so hand it over instead of copying it again.
    return std::make_shared<ScalarType>(std::move(val));
  }
};

// ----------------------------------------------------------------------
// Reusable type resolvers

Expand All @@ -154,6 +189,7 @@ void BinaryExecFlipped(KernelContext* ctx, ArrayKernelExec exec,
// functions

const std::vector<std::shared_ptr<DataType>>& BaseBinaryTypes();
const std::vector<std::shared_ptr<DataType>>& StringTypes();
const std::vector<std::shared_ptr<DataType>>& SignedIntTypes();
const std::vector<std::shared_ptr<DataType>>& UnsignedIntTypes();
const std::vector<std::shared_ptr<DataType>>& IntTypes();
Expand Down Expand Up @@ -327,10 +363,8 @@ struct OutputAdapter<Type, enable_if_base_binary<Type>> {
// };
template <typename OutType, typename Arg0Type, typename Op>
struct ScalarUnary {
using OutScalar = typename TypeTraits<OutType>::ScalarType;

using OUT = typename GetValueType<OutType>::T;
using ARG0 = typename GetValueType<Arg0Type>::T;
using OUT = typename GetOutputType<OutType>::T;
using ARG0 = typename GetViewType<Arg0Type>::T;

static void Array(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
ArrayIterator<Arg0Type> arg0(*batch[0].array());
Expand All @@ -342,8 +376,9 @@ struct ScalarUnary {
static void Scalar(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
if (batch[0].scalar()->is_valid) {
ARG0 arg0 = UnboxScalar<Arg0Type>::Unbox(batch[0]);
out->value = std::make_shared<OutScalar>(Op::template Call<OUT, ARG0>(ctx, arg0),
out->type());
out->value = BoxScalar<OutType>::Box(
Op::template Call<OUT, ARG0>(ctx, arg0),
out->type());
} else {
out->value = MakeNullScalar(batch[0].type());
}
Expand All @@ -363,9 +398,8 @@ struct ScalarUnary {
template <typename OutType, typename Arg0Type, typename Op>
struct ScalarUnaryNotNullStateful {
using ThisType = ScalarUnaryNotNullStateful<OutType, Arg0Type, Op>;
using OutScalar = typename TypeTraits<OutType>::ScalarType;
using OUT = typename GetValueType<OutType>::T;
using ARG0 = typename GetValueType<Arg0Type>::T;
using OUT = typename GetOutputType<OutType>::T;
using ARG0 = typename GetViewType<Arg0Type>::T;

Op op;
ScalarUnaryNotNullStateful(Op op) : op(std::move(op)) {}
Expand Down Expand Up @@ -394,6 +428,30 @@ struct ScalarUnaryNotNullStateful {
}
};

// Array execution path used when the output type is string-like.
//
// Visits every slot of the first argument array: a non-null value is passed to
// functor.op.Call and the result appended to a builder, a null slot appends a
// null. Any failure is reported through ctx->SetStatus. The finished ArrayData
// is stored in out->value only when finishing the builder succeeds, so a
// failed finish never replaces the output with an empty pointer.
template <typename Type>
struct ArrayExec<Type, enable_if_string_like<Type>> {
  static void Exec(const ThisType& functor, KernelContext* ctx, const ExecBatch& batch,
                   Datum* out) {
    typename TypeTraits<Type>::BuilderType builder;
    const Status visit_status = VisitArrayDataInline<Arg0Type>(
        *batch[0].array(), [&](util::optional<ARG0> v) -> Status {
          if (v.has_value()) {
            return builder.Append(functor.op.Call(ctx, *v));
          }
          return builder.AppendNull();
        });
    if (!visit_status.ok()) {
      ctx->SetStatus(visit_status);
      return;
    }

    std::shared_ptr<ArrayData> result;
    const Status finish_status = builder.FinishInternal(&result);
    // Report the finish status exactly as before, whether or not it is OK.
    ctx->SetStatus(finish_status);
    if (finish_status.ok()) {
      out->value = std::move(result);
    }
  }
};

template <typename Type>
struct ArrayExec<Type, enable_if_t<is_boolean_type<Type>::value>> {
static void Exec(const ThisType& functor, KernelContext* ctx, const ExecBatch& batch,
Expand All @@ -416,7 +474,7 @@ struct ScalarUnaryNotNullStateful {
void Scalar(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
if (batch[0].scalar()->is_valid) {
ARG0 arg0 = UnboxScalar<Arg0Type>::Unbox(batch[0]);
out->value = std::make_shared<OutScalar>(
out->value = BoxScalar<OutType>::Box(
this->op.template Call<OUT, ARG0>(ctx, arg0),
out->type());
} else {
Expand All @@ -438,6 +496,9 @@ struct ScalarUnaryNotNullStateful {
// operator requires some initialization use ScalarUnaryNotNullStateful
template <typename OutType, typename Arg0Type, typename Op>
struct ScalarUnaryNotNull {
using OUT = typename GetOutputType<OutType>::T;
using ARG0 = typename GetViewType<Arg0Type>::T;

static void Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
// Seed kernel with dummy state
ScalarUnaryNotNullStateful<OutType, Arg0Type, Op> kernel({});
Expand All @@ -464,11 +525,9 @@ struct ScalarUnaryNotNull {
template <typename OutType, typename Arg0Type, typename Arg1Type, typename Op,
typename FlippedOp = Op>
struct ScalarBinary {
using OutScalarType = typename TypeTraits<OutType>::ScalarType;

using OUT = typename GetValueType<OutType>::T;
using ARG0 = typename GetValueType<Arg0Type>::T;
using ARG1 = typename GetValueType<Arg1Type>::T;
using OUT = typename GetOutputType<OutType>::T;
using ARG0 = typename GetViewType<Arg0Type>::T;
using ARG1 = typename GetViewType<Arg1Type>::T;

template <typename ChosenOp>
static void ArrayArray(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
Expand All @@ -492,7 +551,8 @@ struct ScalarBinary {
static void ScalarScalar(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
auto arg0 = UnboxScalar<Arg0Type>::Unbox(batch[0]);
auto arg1 = UnboxScalar<Arg1Type>::Unbox(batch[1]);
out->value = std::make_shared<OutScalarType>(ChosenOp::template Call(ctx, arg0, arg1));
out->value = BoxScalar<OutType>::Box(ChosenOp::template Call(ctx, arg0, arg1),
out->type());
}

static void Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
Expand Down
6 changes: 3 additions & 3 deletions cpp/src/arrow/compute/kernels/scalar_set_lookup.cc
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ struct SetLookupState : public KernelState {
: lookup_table(pool, 0), lookup_null_count(0) {}

Status Init(const SetLookupOptions& options) {
using T = typename GetValueType<Type>::T;
using T = typename GetViewType<Type>::T;
auto insert_value = [&](util::optional<T> v) {
if (v.has_value()) {
int32_t unused_memo_index;
Expand Down Expand Up @@ -147,7 +147,7 @@ struct MatchVisitor {

template <typename Type>
enable_if_supports_set_lookup<Type, Status> Visit(const Type&) {
using T = typename GetValueType<Type>::T;
using T = typename GetViewType<Type>::T;

const auto& state = checked_cast<const SetLookupState<Type>&>(*ctx->state());

Expand Down Expand Up @@ -222,7 +222,7 @@ struct IsInVisitor {

template <typename Type>
enable_if_supports_set_lookup<Type, Status> Visit(const Type&) {
using T = typename GetValueType<Type>::T;
using T = typename GetViewType<Type>::T;
const auto& state = checked_cast<const SetLookupState<Type>&>(*ctx->state());
ArrayData* output = out->mutable_array();

Expand Down
67 changes: 67 additions & 0 deletions cpp/src/arrow/compute/kernels/scalar_string_ascii.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

#include <algorithm>
#include <cctype>
#include <string>

#include "arrow/compute/kernels/common.h"
#include "arrow/compute/kernels/scalar_string_internal.h"

namespace arrow {
namespace compute {
namespace internal {

// TODO: optional ascii validation

// Kernel operator returning the length of its argument, i.e. val.size()
// converted to the requested output type OUT. The KernelContext is unused.
struct AsciiLength {
  template <typename OUT, typename ARG0 = util::string_view>
  static OUT Call(KernelContext*, ARG0 val) {
    const auto length = val.size();
    return static_cast<OUT>(length);
  }
};

// Kernel operator returning a copy of its argument with the ASCII letters
// 'a'..'z' replaced by 'A'..'Z'. Every other byte is copied unchanged.
struct AsciiUpper {
  // XXX: the Scalar codegen path passes template arguments that are unused
  template <typename... Ignored>
  static std::string Call(KernelContext*, const util::string_view& val) {
    std::string result = val.to_string();
    // Map the ASCII range explicitly instead of calling std::toupper, whose
    // result depends on the current C locale and may therefore rewrite bytes
    // outside the ASCII range.
    std::transform(result.begin(), result.end(), result.begin(), [](char c) {
      return (c >= 'a' && c <= 'z') ? static_cast<char>(c - ('a' - 'A')) : c;
    });
    return result;
  }
};

// Builds the unary "ascii_length" scalar function and adds it to `registry`.
// Two kernels are registered: utf8 -> int32 and large_utf8 -> int64, both
// implemented by AsciiLength through ScalarUnaryNotNull.
void AddAsciiLength(FunctionRegistry* registry) {
  using Length32 = codegen::ScalarUnaryNotNull<Int32Type, StringType, AsciiLength>;
  using Length64 = codegen::ScalarUnaryNotNull<Int64Type, LargeStringType, AsciiLength>;

  auto length_func = std::make_shared<ScalarFunction>("ascii_length", Arity::Unary());

  ArrayKernelExec utf8_exec = Length32::Exec;
  ArrayKernelExec large_utf8_exec = Length64::Exec;

  DCHECK_OK(length_func->AddKernel({utf8()}, int32(), utf8_exec));
  DCHECK_OK(length_func->AddKernel({large_utf8()}, int64(), large_utf8_exec));
  DCHECK_OK(registry->AddFunction(std::move(length_func)));
}

// Registers the ASCII string kernels defined in this file with `registry`:
// "ascii_upper" (through MakeUnaryStringToString<AsciiUpper>) and
// "ascii_length" (through AddAsciiLength).
void RegisterScalarStringAscii(FunctionRegistry* registry) {
MakeUnaryStringToString<AsciiUpper>("ascii_upper", registry);
AddAsciiLength(registry);
}

} // namespace internal
} // namespace compute
} // namespace arrow
44 changes: 44 additions & 0 deletions cpp/src/arrow/compute/kernels/scalar_string_internal.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

#include <memory>
#include <string>
#include <utility>

#include "arrow/array/builder_binary.h"
#include "arrow/compute/kernels/common.h"

namespace arrow {
namespace compute {
namespace internal {

// Registers a unary scalar function called `name` that applies Op to each
// string and yields the same string type as its input.
//
// Two kernels are added: utf8 -> utf8 and large_utf8 -> large_utf8, both built
// on ScalarUnaryNotNull. The finished function is then added to `registry`.
template <typename Op>
void MakeUnaryStringToString(std::string name, FunctionRegistry* registry) {
  // `name` is received by value and not used again, so move it into the
  // function object instead of copying it.
  auto func = std::make_shared<ScalarFunction>(std::move(name), Arity::Unary());
  ArrayKernelExec exec_offset_32 =
      codegen::ScalarUnaryNotNull<StringType, StringType, Op>::Exec;
  ArrayKernelExec exec_offset_64 =
      codegen::ScalarUnaryNotNull<LargeStringType, LargeStringType, Op>::Exec;
  DCHECK_OK(func->AddKernel({utf8()}, utf8(), exec_offset_32));
  DCHECK_OK(func->AddKernel({large_utf8()}, large_utf8(), exec_offset_64));
  DCHECK_OK(registry->AddFunction(std::move(func)));
}

} // namespace internal
} // namespace compute
} // namespace arrow
Loading