Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 11 additions & 5 deletions cpp/src/arrow/array/builder_binary.h
Original file line number Diff line number Diff line change
Expand Up @@ -294,6 +294,11 @@ class BaseBinaryBuilder : public ArrayBuilder {
return util::string_view(reinterpret_cast<const char*>(value_data), value_length);
}

// Cannot make this a static attribute because of linking issues
static constexpr int64_t memory_limit() {
return std::numeric_limits<offset_type>::max() - 1;
}

protected:
TypedBufferBuilder<offset_type> offsets_builder_;
TypedBufferBuilder<uint8_t> value_data_builder_;
Expand All @@ -315,11 +320,6 @@ class BaseBinaryBuilder : public ArrayBuilder {
const int64_t num_bytes = value_data_builder_.length();
offsets_builder_.UnsafeAppend(static_cast<offset_type>(num_bytes));
}

// Cannot make this a static attribute because of linking issues
static constexpr int64_t memory_limit() {
return std::numeric_limits<offset_type>::max() - 1;
}
};

/// \class BinaryBuilder
Expand Down Expand Up @@ -387,6 +387,8 @@ class ARROW_EXPORT LargeStringBuilder : public LargeBinaryBuilder {

class ARROW_EXPORT FixedSizeBinaryBuilder : public ArrayBuilder {
public:
using TypeClass = FixedSizeBinaryType;

FixedSizeBinaryBuilder(const std::shared_ptr<DataType>& type,
MemoryPool* pool ARROW_MEMORY_POOL_DEFAULT);

Expand Down Expand Up @@ -471,6 +473,10 @@ class ARROW_EXPORT FixedSizeBinaryBuilder : public ArrayBuilder {
/// This view becomes invalid on the next modifying operation.
util::string_view GetView(int64_t i) const;

static constexpr int64_t memory_limit() {
return std::numeric_limits<int64_t>::max() - 1;
}

protected:
int32_t byte_width_;
BufferBuilder byte_builder_;
Expand Down
2 changes: 2 additions & 0 deletions cpp/src/arrow/array/builder_decimal.h
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,8 @@ class Decimal128;

class ARROW_EXPORT Decimal128Builder : public FixedSizeBinaryBuilder {
public:
using TypeClass = Decimal128Type;

explicit Decimal128Builder(const std::shared_ptr<DataType>& type,
MemoryPool* pool ARROW_MEMORY_POOL_DEFAULT);

Expand Down
3 changes: 3 additions & 0 deletions cpp/src/arrow/array/builder_primitive.h
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,7 @@ class ARROW_EXPORT NullBuilder : public ArrayBuilder {
template <typename T>
class NumericBuilder : public ArrayBuilder {
public:
using TypeClass = T;
using value_type = typename T::c_type;
using ArrayType = typename TypeTraits<T>::ArrayType;
using ArrayBuilder::ArrayBuilder;
Expand Down Expand Up @@ -265,7 +266,9 @@ using DoubleBuilder = NumericBuilder<DoubleType>;

class ARROW_EXPORT BooleanBuilder : public ArrayBuilder {
public:
using TypeClass = BooleanType;
using value_type = bool;

explicit BooleanBuilder(MemoryPool* pool ARROW_MEMORY_POOL_DEFAULT);

explicit BooleanBuilder(const std::shared_ptr<DataType>& type, MemoryPool* pool);
Expand Down
1 change: 1 addition & 0 deletions cpp/src/arrow/array/builder_time.h
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@ namespace arrow {

class ARROW_EXPORT DayTimeIntervalBuilder : public ArrayBuilder {
public:
using TypeClass = DayTimeIntervalType;
using DayMilliseconds = DayTimeIntervalType::DayMilliseconds;

explicit DayTimeIntervalBuilder(MemoryPool* pool ARROW_MEMORY_POOL_DEFAULT)
Expand Down
22 changes: 16 additions & 6 deletions cpp/src/arrow/compute/kernels/filter-test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -285,18 +285,26 @@ TYPED_TEST(TestFilterKernelWithNumeric, ScalarInRangeAndFilterRandomNumeric) {
}
}

class TestFilterKernelWithString : public TestFilterKernel<StringType> {
using StringTypes =
::testing::Types<BinaryType, StringType, LargeBinaryType, LargeStringType>;

template <typename TypeClass>
class TestFilterKernelWithString : public TestFilterKernel<TypeClass> {
protected:
std::shared_ptr<DataType> value_type() {
return TypeTraits<TypeClass>::type_singleton();
}

void AssertFilter(const std::string& values, const std::string& filter,
const std::string& expected) {
TestFilterKernel<StringType>::AssertFilter(utf8(), values, filter, expected);
TestFilterKernel<TypeClass>::AssertFilter(value_type(), values, filter, expected);
}
void AssertFilterDictionary(const std::string& dictionary_values,
const std::string& dictionary_filter,
const std::string& filter,
const std::string& expected_filter) {
auto dict = ArrayFromJSON(utf8(), dictionary_values);
auto type = dictionary(int8(), utf8());
auto dict = ArrayFromJSON(value_type(), dictionary_values);
auto type = dictionary(int8(), value_type());
std::shared_ptr<Array> values, actual, expected;
ASSERT_OK(DictionaryArray::FromArrays(type, ArrayFromJSON(int8(), dictionary_filter),
dict, &values));
Expand All @@ -307,13 +315,15 @@ class TestFilterKernelWithString : public TestFilterKernel<StringType> {
}
};

TEST_F(TestFilterKernelWithString, FilterString) {
TYPED_TEST_CASE(TestFilterKernelWithString, StringTypes);

TYPED_TEST(TestFilterKernelWithString, FilterString) {
this->AssertFilter(R"(["a", "b", "c"])", "[0, 1, 0]", R"(["b"])");
this->AssertFilter(R"([null, "b", "c"])", "[0, 1, 0]", R"(["b"])");
this->AssertFilter(R"(["a", "b", "c"])", "[null, 1, 0]", R"([null, "b"])");
}

TEST_F(TestFilterKernelWithString, FilterDictionary) {
TYPED_TEST(TestFilterKernelWithString, FilterDictionary) {
auto dict = R"(["a", "b", "c", "d", "e"])";
this->AssertFilterDictionary(dict, "[3, 4, 2]", "[0, 1, 0]", "[4]");
this->AssertFilterDictionary(dict, "[null, 4, 2]", "[0, 1, 0]", "[4]");
Expand Down
20 changes: 10 additions & 10 deletions cpp/src/arrow/compute/kernels/take-internal.h
Original file line number Diff line number Diff line change
Expand Up @@ -20,11 +20,13 @@
#include <algorithm>
#include <limits>
#include <memory>
#include <type_traits>
#include <utility>
#include <vector>

#include "arrow/builder.h"
#include "arrow/compute/context.h"
#include "arrow/type_traits.h"
#include "arrow/util/bit-util.h"
#include "arrow/util/checked_cast.h"
#include "arrow/util/logging.h"
Expand All @@ -37,21 +39,19 @@ namespace compute {
using internal::checked_cast;
using internal::checked_pointer_cast;

// For non-binary builders, use regular value append
template <typename Builder, typename Scalar>
static Status UnsafeAppend(Builder* builder, Scalar&& value) {
static typename std::enable_if<
!std::is_base_of<BaseBinaryType, typename Builder::TypeClass>::value, Status>::type
UnsafeAppend(Builder* builder, Scalar&& value) {
builder->UnsafeAppend(std::forward<Scalar>(value));
return Status::OK();
}

// Use BinaryBuilder::UnsafeAppend, but reserve byte storage first
static Status UnsafeAppend(BinaryBuilder* builder, util::string_view value) {
RETURN_NOT_OK(builder->ReserveData(static_cast<int64_t>(value.size())));
builder->UnsafeAppend(value);
return Status::OK();
}

// Use StringBuilder::UnsafeAppend, but reserve character storage first
static Status UnsafeAppend(StringBuilder* builder, util::string_view value) {
// For binary builders, need to reserve byte storage first
template <typename Builder>
static enable_if_base_binary<typename Builder::TypeClass, Status> UnsafeAppend(
Builder* builder, util::string_view value) {
RETURN_NOT_OK(builder->ReserveData(static_cast<int64_t>(value.size())));
builder->UnsafeAppend(value);
return Status::OK();
Expand Down
31 changes: 21 additions & 10 deletions cpp/src/arrow/compute/kernels/take-test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -179,18 +179,26 @@ TYPED_TEST(TestTakeKernelWithNumeric, TakeRandomNumeric) {
}
}

class TestTakeKernelWithString : public TestTakeKernel<StringType> {
protected:
using StringTypes =
::testing::Types<BinaryType, StringType, LargeBinaryType, LargeStringType>;

template <typename TypeClass>
class TestTakeKernelWithString : public TestTakeKernel<TypeClass> {
public:
std::shared_ptr<DataType> value_type() {
return TypeTraits<TypeClass>::type_singleton();
}

void AssertTake(const std::string& values, const std::string& indices,
const std::string& expected) {
TestTakeKernel<StringType>::AssertTake(utf8(), values, indices, expected);
TestTakeKernel<TypeClass>::AssertTake(value_type(), values, indices, expected);
}
void AssertTakeDictionary(const std::string& dictionary_values,
const std::string& dictionary_indices,
const std::string& indices,
const std::string& expected_indices) {
auto dict = ArrayFromJSON(utf8(), dictionary_values);
auto type = dictionary(int8(), utf8());
auto dict = ArrayFromJSON(value_type(), dictionary_values);
auto type = dictionary(int8(), value_type());
std::shared_ptr<Array> values, actual, expected;
ASSERT_OK(DictionaryArray::FromArrays(type, ArrayFromJSON(int8(), dictionary_indices),
dict, &values));
Expand All @@ -201,19 +209,22 @@ class TestTakeKernelWithString : public TestTakeKernel<StringType> {
}
};

TEST_F(TestTakeKernelWithString, TakeString) {
TYPED_TEST_CASE(TestTakeKernelWithString, StringTypes);

TYPED_TEST(TestTakeKernelWithString, TakeString) {
this->AssertTake(R"(["a", "b", "c"])", "[0, 1, 0]", R"(["a", "b", "a"])");
this->AssertTake(R"([null, "b", "c"])", "[0, 1, 0]", "[null, \"b\", null]");
this->AssertTake(R"(["a", "b", "c"])", "[null, 1, 0]", R"([null, "b", "a"])");

std::shared_ptr<DataType> type = this->value_type();
std::shared_ptr<Array> arr;
ASSERT_RAISES(IndexError,
this->Take(utf8(), R"(["a", "b", "c"])", int8(), "[0, 9, 0]", &arr));
ASSERT_RAISES(IndexError, this->Take(utf8(), R"(["a", "b", null, "ddd", "ee"])",
int64(), "[2, 5]", &arr));
this->Take(type, R"(["a", "b", "c"])", int8(), "[0, 9, 0]", &arr));
ASSERT_RAISES(IndexError, this->Take(type, R"(["a", "b", null, "ddd", "ee"])", int64(),
"[2, 5]", &arr));
}

TEST_F(TestTakeKernelWithString, TakeDictionary) {
TYPED_TEST(TestTakeKernelWithString, TakeDictionary) {
auto dict = R"(["a", "b", "c", "d", "e"])";
this->AssertTakeDictionary(dict, "[3, 4, 2]", "[0, 1, 0]", "[3, 4, 3]");
this->AssertTakeDictionary(dict, "[null, 4, 2]", "[0, 1, 0]", "[null, 4, null]");
Expand Down
2 changes: 2 additions & 0 deletions cpp/src/arrow/python/helpers.cc
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,8 @@ std::shared_ptr<DataType> GetPrimitiveType(Type::type type) {
GET_PRIMITIVE_TYPE(DOUBLE, float64);
GET_PRIMITIVE_TYPE(BINARY, binary);
GET_PRIMITIVE_TYPE(STRING, utf8);
GET_PRIMITIVE_TYPE(LARGE_BINARY, large_binary);
GET_PRIMITIVE_TYPE(LARGE_STRING, large_utf8);
default:
return nullptr;
}
Expand Down
6 changes: 6 additions & 0 deletions cpp/src/arrow/python/helpers.h
Original file line number Diff line number Diff line change
Expand Up @@ -119,6 +119,12 @@ inline Status CastSize(Py_ssize_t size, int32_t* out,
return Status::OK();
}

inline Status CastSize(Py_ssize_t size, int64_t* out, const char* error_msg = NULLPTR) {
// size is assumed to be positive
*out = static_cast<int64_t>(size);
return Status::OK();
}

// \brief Print the Python object's __str__ form along with the passed error
// message
ARROW_PYTHON_EXPORT
Expand Down
66 changes: 41 additions & 25 deletions cpp/src/arrow/python/python_to_arrow.cc
Original file line number Diff line number Diff line change
Expand Up @@ -429,29 +429,34 @@ class TimestampConverter : public TypedConverter<TimestampType, TimestampConvert

namespace detail {

template <typename BuilderType, typename AppendFunc>
inline Status AppendPyString(BuilderType* builder, const PyBytesView& view, bool* is_full,
AppendFunc&& append_func) {
int32_t length = -1;
RETURN_NOT_OK(internal::CastSize(view.size, &length));
DCHECK_GE(length, 0);
template <typename BuilderType>
inline Status AppendPyString(BuilderType* builder, const PyBytesView& view,
bool* is_full) {
if (view.size > BuilderType::memory_limit()) {
return Status::Invalid("string too large for datatype");
}
DCHECK_GE(view.size, 0);
// Did we reach the builder size limit?
if (ARROW_PREDICT_FALSE(builder->value_data_length() + length > kBinaryMemoryLimit)) {
if (ARROW_PREDICT_FALSE(builder->value_data_length() + view.size >
BuilderType::memory_limit())) {
*is_full = true;
return Status::OK();
}
RETURN_NOT_OK(append_func(view.bytes, length));
RETURN_NOT_OK(builder->Append(::arrow::util::string_view(view.bytes, view.size)));
*is_full = false;
return Status::OK();
}

inline Status BuilderAppend(BinaryBuilder* builder, PyObject* obj, bool* is_full) {
PyBytesView view;
RETURN_NOT_OK(view.FromString(obj));
return AppendPyString(builder, view, is_full,
[&builder](const char* bytes, int32_t length) {
return builder->Append(bytes, length);
});
return AppendPyString(builder, view, is_full);
}

inline Status BuilderAppend(LargeBinaryBuilder* builder, PyObject* obj, bool* is_full) {
PyBytesView view;
RETURN_NOT_OK(view.FromString(obj));
return AppendPyString(builder, view, is_full);
}

inline Status BuilderAppend(FixedSizeBinaryBuilder* builder, PyObject* obj,
Expand All @@ -466,9 +471,7 @@ inline Status BuilderAppend(FixedSizeBinaryBuilder* builder, PyObject* obj,
return internal::InvalidValue(obj, ss.str());
}

return AppendPyString(
builder, view, is_full,
[&builder](const char* bytes, int32_t length) { return builder->Append(bytes); });
return AppendPyString(builder, view, is_full);
}

} // namespace detail
Expand Down Expand Up @@ -496,12 +499,15 @@ class BinaryLikeConverter : public TypedConverter<Type, BinaryLikeConverter<Type

class BytesConverter : public BinaryLikeConverter<BinaryType> {};

class LargeBytesConverter : public BinaryLikeConverter<LargeBinaryType> {};

class FixedWidthBytesConverter : public BinaryLikeConverter<FixedSizeBinaryType> {};

// For String/UTF8, if strict_conversions enabled, we reject any non-UTF8,
// otherwise we allow but return results as BinaryArray
template <bool STRICT>
class StringConverter : public TypedConverter<StringType, StringConverter<STRICT>> {
template <typename TypeClass, bool STRICT>
class StringConverter
: public TypedConverter<TypeClass, StringConverter<TypeClass, STRICT>> {
public:
StringConverter() : binary_count_(0) {}

Expand All @@ -526,10 +532,7 @@ class StringConverter : public TypedConverter<StringType, StringConverter<STRICT
}
}

return detail::AppendPyString(this->typed_builder_, string_view_, is_full,
[this](const char* bytes, int32_t length) {
return this->typed_builder_->Append(bytes, length);
});
return detail::AppendPyString(this->typed_builder_, string_view_, is_full);
}

Status AppendItem(PyObject* obj) {
Expand All @@ -556,10 +559,13 @@ class StringConverter : public TypedConverter<StringType, StringConverter<STRICT
// We should have bailed out earlier
DCHECK(!STRICT);

using EquivalentBinaryType = typename TypeClass::EquivalentBinaryType;
using EquivalentBinaryArray = typename TypeTraits<EquivalentBinaryType>::ArrayType;

for (size_t i = 0; i < out->size(); ++i) {
auto binary_data = (*out)[i]->data()->Copy();
binary_data->type = ::arrow::binary();
(*out)[i] = std::make_shared<BinaryArray>(binary_data);
binary_data->type = TypeTraits<EquivalentBinaryType>::type_singleton();
(*out)[i] = std::make_shared<EquivalentBinaryArray>(binary_data);
}
}
return Status::OK();
Expand Down Expand Up @@ -871,14 +877,24 @@ Status GetConverter(const std::shared_ptr<DataType>& type, bool from_pandas,
NUMERIC_CONVERTER(DOUBLE, DoubleType);
SIMPLE_CONVERTER_CASE(DECIMAL, DecimalConverter);
SIMPLE_CONVERTER_CASE(BINARY, BytesConverter);
SIMPLE_CONVERTER_CASE(LARGE_BINARY, LargeBytesConverter);
SIMPLE_CONVERTER_CASE(FIXED_SIZE_BINARY, FixedWidthBytesConverter);
SIMPLE_CONVERTER_CASE(DATE32, Date32Converter);
SIMPLE_CONVERTER_CASE(DATE64, Date64Converter);
case Type::STRING:
if (strict_conversions) {
*out = std::unique_ptr<SeqConverter>(new StringConverter<true>());
*out = std::unique_ptr<SeqConverter>(new StringConverter<StringType, true>());
} else {
*out = std::unique_ptr<SeqConverter>(new StringConverter<StringType, false>());
}
break;
case Type::LARGE_STRING:
if (strict_conversions) {
*out =
std::unique_ptr<SeqConverter>(new StringConverter<LargeStringType, true>());
} else {
*out = std::unique_ptr<SeqConverter>(new StringConverter<false>());
*out =
std::unique_ptr<SeqConverter>(new StringConverter<LargeStringType, false>());
}
break;
case Type::TIME32: {
Expand Down
Loading