Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
168 changes: 122 additions & 46 deletions cpp/src/arrow/compute/kernels/scalar_cast_string.cc
Original file line number Diff line number Diff line change
Expand Up @@ -15,14 +15,15 @@
// specific language governing permissions and limitations
// under the License.

// Implementation of casting to string and binary types
#include <limits>

#include "arrow/array/array_base.h"
#include "arrow/array/builder_binary.h"
#include "arrow/compute/kernels/common.h"
#include "arrow/compute/kernels/scalar_cast_internal.h"
#include "arrow/result.h"
#include "arrow/util/formatting.h"
#include "arrow/util/int_util.h"
#include "arrow/util/optional.h"
#include "arrow/util/utf8.h"
#include "arrow/visitor_inline.h"
Expand All @@ -36,13 +37,13 @@ using util::ValidateUTF8;
namespace compute {
namespace internal {

namespace {

// ----------------------------------------------------------------------
// Number / Boolean to String

template <typename I, typename O>
struct CastFunctor<O, I,
enable_if_t<is_string_like_type<O>::value &&
(is_number_type<I>::value || is_boolean_type<I>::value)>> {
template <typename O, typename I>
struct NumericToStringCastFunctor {
using value_type = typename TypeTraits<I>::CType;
using BuilderType = typename TypeTraits<O>::BuilderType;
using FormatterType = StringFormatter<I>;
Expand Down Expand Up @@ -71,7 +72,7 @@ struct CastFunctor<O, I,
};

// ----------------------------------------------------------------------
// Binary to String
// Binary-like to binary-like
//

#if defined(_MSC_VER)
Expand All @@ -92,12 +93,78 @@ struct Utf8Validator {
};

template <typename I, typename O>
struct BinaryToStringSameWidthCastFunctor {
struct CastBinaryToBinaryOffsets;

// Cast same-width offsets (no-op)
// 32-bit -> 32-bit offsets: source and destination offset widths are the
// same, so this specialization deliberately does nothing.
template <>
struct CastBinaryToBinaryOffsets<int32_t, int32_t> {
  static void CastOffsets(KernelContext* /*ctx*/, const ArrayData& /*input*/,
                          ArrayData* /*output*/) {
    // Intentionally empty.
  }
};
// 64-bit -> 64-bit offsets: source and destination offset widths are the
// same, so this specialization deliberately does nothing.
template <>
struct CastBinaryToBinaryOffsets<int64_t, int64_t> {
  static void CastOffsets(KernelContext* /*ctx*/, const ArrayData& /*input*/,
                          ArrayData* /*output*/) {
    // Intentionally empty.
  }
};

// Upcast offsets
// Widens 32-bit offsets into a newly allocated 64-bit offsets buffer.
//
// A buffer with room for (output->length + output->offset + 1) 64-bit entries
// is requested from `ctx` and placed in output->buffers[1]. Its first
// output->offset entries are zero-filled, and output->length + 1 offsets are
// then converted from `input` into `output`.
//
// NOTE(review): KERNEL_ASSIGN_OR_RAISE presumably records the failure on `ctx`
// and returns early when the allocation fails - confirm against its definition.
template <>
struct CastBinaryToBinaryOffsets<int32_t, int64_t> {
  static void CastOffsets(KernelContext* ctx, const ArrayData& input, ArrayData* output) {
    using SrcOffset = int32_t;
    using DstOffset = int64_t;

    // Leading entries skipped by the output's offset, plus length + 1 offsets.
    const auto num_entries = output->length + output->offset + 1;
    KERNEL_ASSIGN_OR_RAISE(output->buffers[1], ctx,
                           ctx->Allocate(num_entries * sizeof(DstOffset)));

    // Zero the first output->offset entries of the new buffer.
    auto* raw_offsets = output->buffers[1]->mutable_data();
    memset(raw_offsets, 0, output->offset * sizeof(DstOffset));

    const SrcOffset* src = input.GetValues<SrcOffset>(1);
    DstOffset* dst = output->GetMutableValues<DstOffset>(1);
    ::arrow::internal::CastInts(src, dst, output->length + 1);
  }
};

// Downcast offsets
// Narrows 64-bit offsets into a newly allocated 32-bit offsets buffer.
//
// If the final input offset does not fit in int32_t, an Invalid status is set
// on `ctx` and nothing is allocated. Otherwise a buffer with room for
// (output->length + output->offset + 1) 32-bit entries is requested from `ctx`
// and placed in output->buffers[1], its first output->offset entries are
// zero-filled, and output->length + 1 offsets are converted into it.
//
// NOTE(review): KERNEL_ASSIGN_OR_RAISE presumably records the failure on `ctx`
// and returns early when the allocation fails - confirm against its definition.
template <>
struct CastBinaryToBinaryOffsets<int64_t, int32_t> {
  static void CastOffsets(KernelContext* ctx, const ArrayData& input, ArrayData* output) {
    using SrcOffset = int64_t;
    using DstOffset = int32_t;

    constexpr SrcOffset kLargestDstOffset = std::numeric_limits<DstOffset>::max();
    const SrcOffset* src = input.GetValues<SrcOffset>(1);

    // Binary offsets are ascending, so only the final one needs an overflow check.
    if (src[input.length] > kLargestDstOffset) {
      ctx->SetStatus(Status::Invalid("Failed casting from ", input.type->ToString(),
                                     " to ", output->type->ToString(),
                                     ": input array too large"));
      return;
    }

    // Leading entries skipped by the output's offset, plus length + 1 offsets.
    const auto num_entries = output->length + output->offset + 1;
    KERNEL_ASSIGN_OR_RAISE(output->buffers[1], ctx,
                           ctx->Allocate(num_entries * sizeof(DstOffset)));

    // Zero the first output->offset entries of the new buffer.
    auto* raw_offsets = output->buffers[1]->mutable_data();
    memset(raw_offsets, 0, output->offset * sizeof(DstOffset));

    DstOffset* dst = output->GetMutableValues<DstOffset>(1);
    ::arrow::internal::CastInts(src, dst, output->length + 1);
  }
};

template <typename O, typename I>
struct BinaryToBinaryCastFunctor {
using input_offset_type = typename I::offset_type;
using output_offset_type = typename O::offset_type;

static void Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
const CastOptions& options = checked_cast<const CastState&>(*ctx->state()).options;
if (!options.allow_invalid_utf8) {
const ArrayData& input = *batch[0].array();

if (!I::is_utf8 && O::is_utf8 && !options.allow_invalid_utf8) {
InitializeUTF8();
const ArrayData& input = *batch[0].array();

ArrayDataVisitor<I> visitor;
Utf8Validator validator;
Expand All @@ -107,75 +174,84 @@ struct BinaryToStringSameWidthCastFunctor {
return;
}
}
// It's OK to call this because base binary types do not preallocate
// anything

// Start with a zero-copy cast, but change indices to expected size
ZeroCopyCastExec(ctx, batch, out);
CastBinaryToBinaryOffsets<input_offset_type, output_offset_type>::CastOffsets(
ctx, input, out->mutable_array());
}
};

template <>
struct CastFunctor<StringType, BinaryType>
: public BinaryToStringSameWidthCastFunctor<StringType, BinaryType> {};

template <>
struct CastFunctor<LargeStringType, LargeBinaryType>
: public BinaryToStringSameWidthCastFunctor<LargeStringType, LargeBinaryType> {};

#if defined(_MSC_VER)
#pragma warning(pop)
#endif

// String casts available
//
// * Numbers and boolean to String / LargeString
// * Binary / LargeBinary to String / LargeString with UTF8 validation
// ----------------------------------------------------------------------
// Cast functions registration

template <typename OutType>
void AddNumberToStringCasts(std::shared_ptr<DataType> out_ty, CastFunction* func) {
void AddNumberToStringCasts(CastFunction* func) {
auto out_ty = TypeTraits<OutType>::type_singleton();

DCHECK_OK(func->AddKernel(Type::BOOL, {boolean()}, out_ty,
CastFunctor<OutType, BooleanType>::Exec,
NumericToStringCastFunctor<OutType, BooleanType>::Exec,
NullHandling::COMPUTED_NO_PREALLOCATE));

for (const std::shared_ptr<DataType>& in_ty : NumericTypes()) {
DCHECK_OK(func->AddKernel(in_ty->id(), {in_ty}, out_ty,
GenerateNumeric<CastFunctor, OutType>(*in_ty),
NullHandling::COMPUTED_NO_PREALLOCATE));
DCHECK_OK(
func->AddKernel(in_ty->id(), {in_ty}, out_ty,
GenerateNumeric<NumericToStringCastFunctor, OutType>(*in_ty),
NullHandling::COMPUTED_NO_PREALLOCATE));
}
}

// Registers on `func` the kernel that casts the binary-like type `InType` to
// the binary-like type `OutType`, using BinaryToBinaryCastFunctor for the
// execution.
//
// Template parameters:
//   OutType - destination Arrow type (the type `func` casts to)
//   InType  - source Arrow type accepted by the registered kernel
template <typename OutType, typename InType>
void AddBinaryToBinaryCast(CastFunction* func) {
  auto in_ty = TypeTraits<InType>::type_singleton();
  auto out_ty = TypeTraits<OutType>::type_singleton();

  // The first argument identifies the kernel's *input* type, exactly like the
  // other registrations in this file (e.g. Type::BOOL paired with {boolean()}),
  // so it must be InType's id rather than OutType's.
  DCHECK_OK(func->AddKernel(InType::type_id, {in_ty}, out_ty,
                            BinaryToBinaryCastFunctor<OutType, InType>::Exec,
                            NullHandling::COMPUTED_NO_PREALLOCATE));
}

} // namespace

std::vector<std::shared_ptr<CastFunction>> GetBinaryLikeCasts() {
auto cast_binary = std::make_shared<CastFunction>("cast_binary", Type::BINARY);
AddCommonCasts(Type::BINARY, binary(), cast_binary.get());
AddZeroCopyCast(Type::STRING, {utf8()}, binary(), cast_binary.get());
AddBinaryToBinaryCast<BinaryType, StringType>(cast_binary.get());
AddBinaryToBinaryCast<BinaryType, LargeBinaryType>(cast_binary.get());
AddBinaryToBinaryCast<BinaryType, LargeStringType>(cast_binary.get());

auto cast_large_binary =
std::make_shared<CastFunction>("cast_large_binary", Type::LARGE_BINARY);
AddCommonCasts(Type::LARGE_BINARY, large_binary(), cast_large_binary.get());
AddZeroCopyCast(Type::LARGE_STRING, {large_utf8()}, large_binary(),
cast_large_binary.get());

auto cast_fsb =
std::make_shared<CastFunction>("cast_fixed_size_binary", Type::FIXED_SIZE_BINARY);
AddCommonCasts(Type::FIXED_SIZE_BINARY, OutputType(ResolveOutputFromOptions),
cast_fsb.get());
AddBinaryToBinaryCast<LargeBinaryType, BinaryType>(cast_large_binary.get());
AddBinaryToBinaryCast<LargeBinaryType, StringType>(cast_large_binary.get());
AddBinaryToBinaryCast<LargeBinaryType, LargeStringType>(cast_large_binary.get());

auto cast_string = std::make_shared<CastFunction>("cast_string", Type::STRING);
AddCommonCasts(Type::STRING, utf8(), cast_string.get());
AddNumberToStringCasts<StringType>(utf8(), cast_string.get());
DCHECK_OK(cast_string->AddKernel(Type::BINARY, {binary()}, utf8(),
CastFunctor<StringType, BinaryType>::Exec,
NullHandling::COMPUTED_NO_PREALLOCATE));
AddNumberToStringCasts<StringType>(cast_string.get());
AddBinaryToBinaryCast<StringType, BinaryType>(cast_string.get());
AddBinaryToBinaryCast<StringType, LargeBinaryType>(cast_string.get());
AddBinaryToBinaryCast<StringType, LargeStringType>(cast_string.get());

auto cast_large_string =
std::make_shared<CastFunction>("cast_large_string", Type::LARGE_STRING);
AddCommonCasts(Type::LARGE_STRING, large_utf8(), cast_large_string.get());
AddNumberToStringCasts<LargeStringType>(large_utf8(), cast_large_string.get());
DCHECK_OK(
cast_large_string->AddKernel(Type::LARGE_BINARY, {large_binary()}, large_utf8(),
CastFunctor<LargeStringType, LargeBinaryType>::Exec,
NullHandling::COMPUTED_NO_PREALLOCATE));
AddNumberToStringCasts<LargeStringType>(cast_large_string.get());
AddBinaryToBinaryCast<LargeStringType, BinaryType>(cast_large_string.get());
AddBinaryToBinaryCast<LargeStringType, StringType>(cast_large_string.get());
AddBinaryToBinaryCast<LargeStringType, LargeBinaryType>(cast_large_string.get());

auto cast_fsb =
std::make_shared<CastFunction>("cast_fixed_size_binary", Type::FIXED_SIZE_BINARY);
AddCommonCasts(Type::FIXED_SIZE_BINARY, OutputType(ResolveOutputFromOptions),
cast_fsb.get());

return {cast_binary, cast_fsb, cast_large_binary, cast_string, cast_large_string};
return {cast_binary, cast_large_binary, cast_string, cast_large_string, cast_fsb};
}

} // namespace internal
Expand Down
95 changes: 57 additions & 38 deletions cpp/src/arrow/compute/kernels/scalar_cast_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -223,7 +223,7 @@ class TestCast : public TestBase {
}

template <typename SourceType, typename DestType>
void TestCastBinaryToString() {
void TestCastBinaryToBinary() {
CastOptions options;
auto src_type = TypeTraits<SourceType>::type_singleton();
auto dest_type = TypeTraits<DestType>::type_singleton();
Expand All @@ -233,41 +233,28 @@ class TestCast : public TestBase {
std::vector<bool> valid = {1, 1, 1, 1, 0};
std::vector<std::string> strings = {"Hi", "olá mundo", "你好世界", "", kInvalidUtf8};

std::shared_ptr<Array> array;

// Should accept when invalid but null.
ArrayFromVector<SourceType, std::string>(src_type, valid, strings, &array);
CheckZeroCopy(*array, dest_type);

// Should refuse due to invalid utf8 payload
CheckFails<SourceType>(strings, all, dest_type, options,
/*check_scalar=*/false);

// Should accept due to option override
options.allow_invalid_utf8 = true;
CheckCase<SourceType, DestType>(strings, all, strings, options,
/*check_scalar=*/false, /*validate_full=*/false);
}

template <typename SourceType, typename DestType>
void TestCastStringToBinary() {
CastOptions options;
auto src_type = TypeTraits<SourceType>::type_singleton();
auto dest_type = TypeTraits<DestType>::type_singleton();

// All valid except the last one
std::vector<bool> all = {1, 1, 1, 1, 1};
std::vector<bool> valid = {1, 1, 1, 1, 0};
std::vector<std::string> strings = {"Hi", "olá mundo", "你好世界", "", kInvalidUtf8};

std::shared_ptr<Array> array;

// Should accept when invalid but null.
ArrayFromVector<SourceType, std::string>(src_type, valid, strings, &array);
CheckZeroCopy(*array, dest_type);

CheckCase<SourceType, DestType>(src_type, strings, all, dest_type, strings, options,
CheckCase<SourceType, DestType>(strings, valid, strings, options,
/*check_scalar=*/false);

// Should accept empty array
CheckCaseJSON(src_type, dest_type, "[]", "[]", /*check_scalar=*/false);

if (!SourceType::is_utf8 && DestType::is_utf8) {
// Should refuse due to invalid utf8 payload
CheckFails<SourceType>(strings, all, dest_type, options,
/*check_scalar=*/false);
// Should accept due to option override
options.allow_invalid_utf8 = true;
CheckCase<SourceType, DestType>(strings, all, strings, options,
/*check_scalar=*/false, /*validate_full=*/false);
} else {
// Destination type allows non-utf8 data,
// or source type also enforces utf8 data.
const bool validate_full = !DestType::is_utf8;
CheckCase<SourceType, DestType>(strings, all, strings, options,
/*check_scalar=*/false, validate_full);
}
}

template <typename DestType>
Expand Down Expand Up @@ -1577,16 +1564,48 @@ TEST_F(TestCast, StringToTimestampErrors) {
}
}

TEST_F(TestCast, BinaryToString) { TestCastBinaryToString<BinaryType, StringType>(); }
TEST_F(TestCast, BinaryToString) { TestCastBinaryToBinary<BinaryType, StringType>(); }

// BinaryType -> LargeBinaryType, exercised through the shared
// TestCastBinaryToBinary helper.
TEST_F(TestCast, BinaryToLargeBinary) {
TestCastBinaryToBinary<BinaryType, LargeBinaryType>();
}

// BinaryType -> LargeStringType, exercised through the shared
// TestCastBinaryToBinary helper.
TEST_F(TestCast, BinaryToLargeString) {
TestCastBinaryToBinary<BinaryType, LargeStringType>();
}

// LargeBinaryType -> BinaryType, exercised through the shared
// TestCastBinaryToBinary helper.
TEST_F(TestCast, LargeBinaryToBinary) {
TestCastBinaryToBinary<LargeBinaryType, BinaryType>();
}

// LargeBinaryType -> StringType, exercised through the shared
// TestCastBinaryToBinary helper.
TEST_F(TestCast, LargeBinaryToString) {
TestCastBinaryToBinary<LargeBinaryType, StringType>();
}

TEST_F(TestCast, LargeBinaryToLargeString) {
TestCastBinaryToString<LargeBinaryType, LargeStringType>();
TestCastBinaryToBinary<LargeBinaryType, LargeStringType>();
}

TEST_F(TestCast, StringToBinary) { TestCastStringToBinary<StringType, BinaryType>(); }
TEST_F(TestCast, StringToBinary) { TestCastBinaryToBinary<StringType, BinaryType>(); }

// StringType -> LargeBinaryType, exercised through the shared
// TestCastBinaryToBinary helper.
TEST_F(TestCast, StringToLargeBinary) {
TestCastBinaryToBinary<StringType, LargeBinaryType>();
}

// StringType -> LargeStringType, exercised through the shared
// TestCastBinaryToBinary helper.
TEST_F(TestCast, StringToLargeString) {
TestCastBinaryToBinary<StringType, LargeStringType>();
}

// LargeStringType -> BinaryType, exercised through the shared
// TestCastBinaryToBinary helper.
TEST_F(TestCast, LargeStringToBinary) {
TestCastBinaryToBinary<LargeStringType, BinaryType>();
}

// LargeStringType -> StringType, exercised through the shared
// TestCastBinaryToBinary helper.
TEST_F(TestCast, LargeStringToString) {
TestCastBinaryToBinary<LargeStringType, StringType>();
}

TEST_F(TestCast, LargeStringToLargeBinary) {
TestCastStringToBinary<LargeStringType, LargeBinaryType>();
TestCastBinaryToBinary<LargeStringType, LargeBinaryType>();
}

TEST_F(TestCast, NumberToString) { TestCastNumberToString<StringType>(); }
Expand Down
Loading