From 9672ca4c3c74e070b6894bd76f22104e0d421084 Mon Sep 17 00:00:00 2001 From: Antoine Pitrou Date: Tue, 30 Jul 2019 10:44:27 +0200 Subject: [PATCH 1/2] ARROW-6000: [Python] Add support for LargeString and LargeBinary types --- cpp/src/arrow/array/builder_binary.h | 14 ++-- cpp/src/arrow/python/helpers.cc | 2 + cpp/src/arrow/python/helpers.h | 6 ++ cpp/src/arrow/python/python_to_arrow.cc | 66 ++++++++++++------- cpp/src/arrow/type.h | 2 + docs/source/python/api/arrays.rst | 4 ++ docs/source/python/api/datatypes.rst | 6 ++ python/pyarrow/__init__.py | 9 ++- python/pyarrow/array.pxi | 40 ++++++++++++ python/pyarrow/includes/libarrow.pxd | 20 +++++- python/pyarrow/lib.pyx | 2 + python/pyarrow/scalar.pxi | 69 ++++++++++++++++++++ python/pyarrow/tests/strategies.py | 8 ++- python/pyarrow/tests/test_convert_builtin.py | 58 +++++++++++++--- python/pyarrow/tests/test_scalars.py | 37 +++++++++++ python/pyarrow/tests/test_types.py | 16 +++++ python/pyarrow/types.pxi | 32 +++++++++ python/pyarrow/types.py | 22 +++++++ 18 files changed, 369 insertions(+), 44 deletions(-) diff --git a/cpp/src/arrow/array/builder_binary.h b/cpp/src/arrow/array/builder_binary.h index 7ae4d311de5..dce5dda5945 100644 --- a/cpp/src/arrow/array/builder_binary.h +++ b/cpp/src/arrow/array/builder_binary.h @@ -294,6 +294,11 @@ class BaseBinaryBuilder : public ArrayBuilder { return util::string_view(reinterpret_cast(value_data), value_length); } + // Cannot make this a static attribute because of linking issues + static constexpr int64_t memory_limit() { + return std::numeric_limits::max() - 1; + } + protected: TypedBufferBuilder offsets_builder_; TypedBufferBuilder value_data_builder_; @@ -315,11 +320,6 @@ class BaseBinaryBuilder : public ArrayBuilder { const int64_t num_bytes = value_data_builder_.length(); offsets_builder_.UnsafeAppend(static_cast(num_bytes)); } - - // Cannot make this a static attribute because of linking issues - static constexpr int64_t memory_limit() { - return std::numeric_limits::max() - 1; - } }; /// \class BinaryBuilder @@ -471,6 +471,10 @@ class ARROW_EXPORT FixedSizeBinaryBuilder : public ArrayBuilder { /// This view becomes invalid on the next modifying operation. 
util::string_view GetView(int64_t i) const; + static constexpr int64_t memory_limit() { + return std::numeric_limits::max() - 1; + } + protected: int32_t byte_width_; BufferBuilder byte_builder_; diff --git a/cpp/src/arrow/python/helpers.cc b/cpp/src/arrow/python/helpers.cc index e44878fb507..7bbac819709 100644 --- a/cpp/src/arrow/python/helpers.cc +++ b/cpp/src/arrow/python/helpers.cc @@ -62,6 +62,8 @@ std::shared_ptr GetPrimitiveType(Type::type type) { GET_PRIMITIVE_TYPE(DOUBLE, float64); GET_PRIMITIVE_TYPE(BINARY, binary); GET_PRIMITIVE_TYPE(STRING, utf8); + GET_PRIMITIVE_TYPE(LARGE_BINARY, large_binary); + GET_PRIMITIVE_TYPE(LARGE_STRING, large_utf8); default: return nullptr; } diff --git a/cpp/src/arrow/python/helpers.h b/cpp/src/arrow/python/helpers.h index 4917f99d2ea..8661ee5dda7 100644 --- a/cpp/src/arrow/python/helpers.h +++ b/cpp/src/arrow/python/helpers.h @@ -119,6 +119,12 @@ inline Status CastSize(Py_ssize_t size, int32_t* out, return Status::OK(); } +inline Status CastSize(Py_ssize_t size, int64_t* out, const char* error_msg = NULLPTR) { + // size is assumed to be positive + *out = static_cast(size); + return Status::OK(); +} + // \brief Print the Python object's __str__ form along with the passed error // message ARROW_PYTHON_EXPORT diff --git a/cpp/src/arrow/python/python_to_arrow.cc b/cpp/src/arrow/python/python_to_arrow.cc index 28d8c13e4a1..424e3097b96 100644 --- a/cpp/src/arrow/python/python_to_arrow.cc +++ b/cpp/src/arrow/python/python_to_arrow.cc @@ -429,18 +429,20 @@ class TimestampConverter : public TypedConverter -inline Status AppendPyString(BuilderType* builder, const PyBytesView& view, bool* is_full, - AppendFunc&& append_func) { - int32_t length = -1; - RETURN_NOT_OK(internal::CastSize(view.size, &length)); - DCHECK_GE(length, 0); +template +inline Status AppendPyString(BuilderType* builder, const PyBytesView& view, + bool* is_full) { + if (view.size > BuilderType::memory_limit()) { + return Status::Invalid("string too large for datatype"); + } + DCHECK_GE(view.size, 0); // Did we reach the builder size limit? 
- if (ARROW_PREDICT_FALSE(builder->value_data_length() + length > kBinaryMemoryLimit)) { + if (ARROW_PREDICT_FALSE(builder->value_data_length() + view.size > + BuilderType::memory_limit())) { *is_full = true; return Status::OK(); } - RETURN_NOT_OK(append_func(view.bytes, length)); + RETURN_NOT_OK(builder->Append(::arrow::util::string_view(view.bytes, view.size))); *is_full = false; return Status::OK(); } @@ -448,10 +450,13 @@ inline Status AppendPyString(BuilderType* builder, const PyBytesView& view, bool inline Status BuilderAppend(BinaryBuilder* builder, PyObject* obj, bool* is_full) { PyBytesView view; RETURN_NOT_OK(view.FromString(obj)); - return AppendPyString(builder, view, is_full, - [&builder](const char* bytes, int32_t length) { - return builder->Append(bytes, length); - }); + return AppendPyString(builder, view, is_full); +} + +inline Status BuilderAppend(LargeBinaryBuilder* builder, PyObject* obj, bool* is_full) { + PyBytesView view; + RETURN_NOT_OK(view.FromString(obj)); + return AppendPyString(builder, view, is_full); } inline Status BuilderAppend(FixedSizeBinaryBuilder* builder, PyObject* obj, @@ -466,9 +471,7 @@ inline Status BuilderAppend(FixedSizeBinaryBuilder* builder, PyObject* obj, return internal::InvalidValue(obj, ss.str()); } - return AppendPyString( - builder, view, is_full, - [&builder](const char* bytes, int32_t length) { return builder->Append(bytes); }); + return AppendPyString(builder, view, is_full); } } // namespace detail @@ -496,12 +499,15 @@ class BinaryLikeConverter : public TypedConverter {}; +class LargeBytesConverter : public BinaryLikeConverter {}; + class FixedWidthBytesConverter : public BinaryLikeConverter {}; // For String/UTF8, if strict_conversions enabled, we reject any non-UTF8, // otherwise we allow but return results as BinaryArray -template -class StringConverter : public TypedConverter> { +template +class StringConverter + : public TypedConverter> { public: StringConverter() : binary_count_(0) {} @@ -526,10 +532,7 @@ class StringConverter : public TypedConvertertyped_builder_, string_view_, is_full, - [this](const char* bytes, int32_t length) { - return this->typed_builder_->Append(bytes, length); - }); + return detail::AppendPyString(this->typed_builder_, string_view_, is_full); } Status AppendItem(PyObject* obj) { @@ -556,10 +559,13 @@ class StringConverter : public TypedConverter::ArrayType; + for (size_t i = 0; i < out->size(); ++i) { auto binary_data = (*out)[i]->data()->Copy(); - binary_data->type = ::arrow::binary(); - (*out)[i] = std::make_shared(binary_data); + binary_data->type = TypeTraits::type_singleton(); + (*out)[i] = std::make_shared(binary_data); } } return Status::OK(); @@ -871,14 +877,24 @@ Status GetConverter(const std::shared_ptr& type, bool from_pandas, NUMERIC_CONVERTER(DOUBLE, DoubleType); SIMPLE_CONVERTER_CASE(DECIMAL, DecimalConverter); SIMPLE_CONVERTER_CASE(BINARY, BytesConverter); + SIMPLE_CONVERTER_CASE(LARGE_BINARY, LargeBytesConverter); SIMPLE_CONVERTER_CASE(FIXED_SIZE_BINARY, FixedWidthBytesConverter); SIMPLE_CONVERTER_CASE(DATE32, Date32Converter); SIMPLE_CONVERTER_CASE(DATE64, Date64Converter); case Type::STRING: if (strict_conversions) { - *out = std::unique_ptr(new StringConverter()); + *out = std::unique_ptr(new StringConverter()); + } else { + *out = std::unique_ptr(new StringConverter()); + } + break; + case Type::LARGE_STRING: + if (strict_conversions) { + *out = + std::unique_ptr(new StringConverter()); } else { - *out = std::unique_ptr(new StringConverter()); + *out = + std::unique_ptr(new 
StringConverter()); } break; case Type::TIME32: { diff --git a/cpp/src/arrow/type.h b/cpp/src/arrow/type.h index 572b888df11..80327e89fae 100644 --- a/cpp/src/arrow/type.h +++ b/cpp/src/arrow/type.h @@ -612,6 +612,7 @@ class ARROW_EXPORT StringType : public BinaryType { public: static constexpr Type::type type_id = Type::STRING; static constexpr bool is_utf8 = true; + using EquivalentBinaryType = BinaryType; StringType() : BinaryType(Type::STRING) {} @@ -624,6 +625,7 @@ class ARROW_EXPORT LargeStringType : public LargeBinaryType { public: static constexpr Type::type type_id = Type::LARGE_STRING; static constexpr bool is_utf8 = true; + using EquivalentBinaryType = LargeBinaryType; LargeStringType() : LargeBinaryType(Type::LARGE_STRING) {} diff --git a/docs/source/python/api/arrays.rst b/docs/source/python/api/arrays.rst index db45eeff0ca..e10b5afef43 100644 --- a/docs/source/python/api/arrays.rst +++ b/docs/source/python/api/arrays.rst @@ -57,6 +57,8 @@ may expose data type-specific methods or properties. BinaryArray StringArray FixedSizeBinaryArray + LargeBinaryArray + LargeStringArray Time32Array Time64Array Date32Array @@ -97,6 +99,8 @@ any of those classes directly. BinaryValue StringValue FixedSizeBinaryValue + LargeBinaryValue + LargeStringValue Time32Value Time64Value Date32Value diff --git a/docs/source/python/api/datatypes.rst b/docs/source/python/api/datatypes.rst index 5ad02049663..327bcf63ac9 100644 --- a/docs/source/python/api/datatypes.rst +++ b/docs/source/python/api/datatypes.rst @@ -50,6 +50,9 @@ These should be used to create Arrow data types and schemas. binary string utf8 + large_binary + large_string + large_utf8 decimal128 list_ struct @@ -129,6 +132,9 @@ represents a given data type (such as ``int32``) or general category is_binary is_unicode is_string + is_large_binary + is_large_unicode + is_large_string is_fixed_size_binary is_map is_dictionary diff --git a/python/pyarrow/__init__.py b/python/pyarrow/__init__.py index afe06368104..51afa0f1c48 100644 --- a/python/pyarrow/__init__.py +++ b/python/pyarrow/__init__.py @@ -52,7 +52,9 @@ def parse_git(root, **kwargs): uint8, uint16, uint32, uint64, time32, time64, timestamp, date32, date64, float16, float32, float64, - binary, string, utf8, decimal128, + binary, string, utf8, + large_binary, large_string, large_utf8, + decimal128, list_, struct, union, dictionary, field, type_for_alias, DataType, DictionaryType, ListType, StructType, @@ -77,6 +79,7 @@ def parse_git(root, **kwargs): Int64Array, UInt64Array, ListArray, UnionArray, BinaryArray, StringArray, + LargeBinaryArray, LargeStringArray, FixedSizeBinaryArray, DictionaryArray, Date32Array, Date64Array, @@ -87,7 +90,9 @@ def parse_git(root, **kwargs): Int8Value, Int16Value, Int32Value, Int64Value, UInt8Value, UInt16Value, UInt32Value, UInt64Value, HalfFloatValue, FloatValue, DoubleValue, ListValue, - BinaryValue, StringValue, FixedSizeBinaryValue, + BinaryValue, StringValue, + LargeBinaryValue, LargeStringValue, + FixedSizeBinaryValue, DecimalValue, UnionValue, StructValue, DictionaryValue, Date32Value, Date64Value, Time32Value, Time64Value, diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi index 15905a18507..4341c414a22 100644 --- a/python/pyarrow/array.pxi +++ b/python/pyarrow/array.pxi @@ -1178,12 +1178,50 @@ cdef class StringArray(Array): null_count, offset) +cdef class LargeStringArray(Array): + """ + Concrete class for Arrow arrays of large string (or utf8) data type. 
+ """ + + @staticmethod + def from_buffers(int length, Buffer value_offsets, Buffer data, + Buffer null_bitmap=None, int null_count=-1, + int offset=0): + """ + Construct a LargeStringArray from value_offsets and data buffers. + If there are nulls in the data, also a null_bitmap and the matching + null_count must be passed. + + Parameters + ---------- + length : int + value_offsets : Buffer + data : Buffer + null_bitmap : Buffer, optional + null_count : int, default 0 + offset : int, default 0 + + Returns + ------- + string_array : StringArray + """ + return Array.from_buffers(large_utf8(), length, + [null_bitmap, value_offsets, data], + null_count, offset) + + cdef class BinaryArray(Array): """ Concrete class for Arrow arrays of variable-sized binary data type. """ +cdef class LargeBinaryArray(Array): + """ + Concrete class for Arrow arrays of large variable-sized binary data type. + """ + + cdef class DictionaryArray(Array): """ Concrete class for dictionary-encoded Arrow arrays. @@ -1449,6 +1487,8 @@ cdef dict _array_classes = { _Type_UNION: UnionArray, _Type_BINARY: BinaryArray, _Type_STRING: StringArray, + _Type_LARGE_BINARY: LargeBinaryArray, + _Type_LARGE_STRING: LargeStringArray, _Type_DICTIONARY: DictionaryArray, _Type_FIXED_SIZE_BINARY: FixedSizeBinaryArray, _Type_DECIMAL: Decimal128Array, diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd index 0f80ad7e75d..bfda15d4a98 100644 --- a/python/pyarrow/includes/libarrow.pxd +++ b/python/pyarrow/includes/libarrow.pxd @@ -65,6 +65,8 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil: _Type_TIME64" arrow::Type::TIME64" _Type_BINARY" arrow::Type::BINARY" _Type_STRING" arrow::Type::STRING" + _Type_LARGE_BINARY" arrow::Type::LARGE_BINARY" + _Type_LARGE_STRING" arrow::Type::LARGE_STRING" _Type_FIXED_SIZE_BINARY" arrow::Type::FIXED_SIZE_BINARY" _Type_LIST" arrow::Type::LIST" @@ -437,12 +439,18 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil: const CArray* UnsafeChild(int pos) UnionMode mode() - cdef cppclass CBinaryArray" arrow::BinaryArray"(CListArray): + cdef cppclass CBinaryArray" arrow::BinaryArray"(CArray): const uint8_t* GetValue(int i, int32_t* length) shared_ptr[CBuffer] value_data() int32_t value_offset(int64_t i) int32_t value_length(int64_t i) + cdef cppclass CLargeBinaryArray" arrow::LargeBinaryArray"(CArray): + const uint8_t* GetValue(int i, int64_t* length) + shared_ptr[CBuffer] value_data() + int64_t value_offset(int64_t i) + int64_t value_length(int64_t i) + cdef cppclass CStringArray" arrow::StringArray"(CBinaryArray): CStringArray(int64_t length, shared_ptr[CBuffer] value_offsets, shared_ptr[CBuffer] data, @@ -452,6 +460,16 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil: c_string GetString(int i) + cdef cppclass CLargeStringArray" arrow::LargeStringArray" \ + (CLargeBinaryArray): + CLargeStringArray(int64_t length, shared_ptr[CBuffer] value_offsets, + shared_ptr[CBuffer] data, + shared_ptr[CBuffer] null_bitmap, + int64_t null_count, + int64_t offset) + + c_string GetString(int i) + cdef cppclass CStructArray" arrow::StructArray"(CArray): CStructArray(shared_ptr[CDataType] type, int64_t length, vector[shared_ptr[CArray]] children, diff --git a/python/pyarrow/lib.pyx b/python/pyarrow/lib.pyx index 2da5a8301bc..0b33f3906c4 100644 --- a/python/pyarrow/lib.pyx +++ b/python/pyarrow/lib.pyx @@ -87,6 +87,8 @@ Type_TIME32 = _Type_TIME32 Type_TIME64 = _Type_TIME64 Type_BINARY = _Type_BINARY Type_STRING = _Type_STRING +Type_LARGE_BINARY = _Type_LARGE_BINARY 
+Type_LARGE_STRING = _Type_LARGE_STRING Type_FIXED_SIZE_BINARY = _Type_FIXED_SIZE_BINARY Type_LIST = _Type_LIST Type_STRUCT = _Type_STRUCT diff --git a/python/pyarrow/scalar.pxi b/python/pyarrow/scalar.pxi index 326aefa4985..0ead3e57552 100644 --- a/python/pyarrow/scalar.pxi +++ b/python/pyarrow/scalar.pxi @@ -437,6 +437,43 @@ cdef class StringValue(ArrayValue): cdef CStringArray* ap = self.sp_array.get() return ap.GetString(self.index).decode('utf-8') + def as_buffer(self): + """ + Return a view over this value as a Buffer object. + """ + cdef: + CStringArray* ap = self.sp_array.get() + shared_ptr[CBuffer] buf + + buf = SliceBuffer(ap.value_data(), ap.value_offset(self.index), + ap.value_length(self.index)) + return pyarrow_wrap_buffer(buf) + + +cdef class LargeStringValue(ArrayValue): + """ + Concrete class for large string (utf8) array elements. + """ + + def as_py(self): + """ + Return this value as a Python unicode string. + """ + cdef CLargeStringArray* ap = self.sp_array.get() + return ap.GetString(self.index).decode('utf-8') + + def as_buffer(self): + """ + Return a view over this value as a Buffer object. + """ + cdef: + CLargeStringArray* ap = self.sp_array.get() + shared_ptr[CBuffer] buf + + buf = SliceBuffer(ap.value_data(), ap.value_offset(self.index), + ap.value_length(self.index)) + return pyarrow_wrap_buffer(buf) + cdef class BinaryValue(ArrayValue): """ @@ -468,6 +505,36 @@ cdef class BinaryValue(ArrayValue): return pyarrow_wrap_buffer(buf) +cdef class LargeBinaryValue(ArrayValue): + """ + Concrete class for large variable-sized binary array elements. + """ + + def as_py(self): + """ + Return this value as a Python bytes object. + """ + cdef: + const uint8_t* ptr + int64_t length + CLargeBinaryArray* ap = self.sp_array.get() + + ptr = ap.GetValue(self.index, &length) + return cp.PyBytes_FromStringAndSize((ptr), length) + + def as_buffer(self): + """ + Return a view over this value as a Buffer object. + """ + cdef: + CLargeBinaryArray* ap = self.sp_array.get() + shared_ptr[CBuffer] buf + + buf = SliceBuffer(ap.value_data(), ap.value_offset(self.index), + ap.value_length(self.index)) + return pyarrow_wrap_buffer(buf) + + cdef class ListValue(ArrayValue): """ Concrete class for list array elements. 
@@ -665,6 +732,8 @@ cdef dict _array_value_classes = { _Type_UNION: UnionValue, _Type_BINARY: BinaryValue, _Type_STRING: StringValue, + _Type_LARGE_BINARY: LargeBinaryValue, + _Type_LARGE_STRING: LargeStringValue, _Type_FIXED_SIZE_BINARY: FixedSizeBinaryValue, _Type_DECIMAL: DecimalValue, _Type_STRUCT: StructValue, diff --git a/python/pyarrow/tests/strategies.py b/python/pyarrow/tests/strategies.py index a2828643cc3..7312ce33de3 100644 --- a/python/pyarrow/tests/strategies.py +++ b/python/pyarrow/tests/strategies.py @@ -38,6 +38,8 @@ binary_type = st.just(pa.binary()) string_type = st.just(pa.string()) +large_binary_type = st.just(pa.large_binary()) +large_string_type = st.just(pa.large_string()) signed_integer_types = st.sampled_from([ pa.int8(), @@ -87,6 +89,8 @@ bool_type, binary_type, string_type, + large_binary_type, + large_string_type, numeric_types, temporal_types ) @@ -190,9 +194,9 @@ def arrays(draw, type, size=None): elif pa.types.is_timestamp(type): tz = pytz.timezone(type.tz) if type.tz is not None else None value = st.datetimes(timezones=st.just(tz)) - elif pa.types.is_binary(type): + elif pa.types.is_binary(type) or pa.types.is_large_binary(type): value = st.binary() - elif pa.types.is_string(type): + elif pa.types.is_string(type) or pa.types.is_large_string(type): value = st.text() elif pa.types.is_decimal(type): # TODO(kszucs): properly limit the precision diff --git a/python/pyarrow/tests/test_convert_builtin.py b/python/pyarrow/tests/test_convert_builtin.py index 81d5952b4b1..f39706e3cbc 100644 --- a/python/pyarrow/tests/test_convert_builtin.py +++ b/python/pyarrow/tests/test_convert_builtin.py @@ -26,6 +26,7 @@ import datetime import decimal import itertools +import math import traceback import sys @@ -605,7 +606,7 @@ def test_sequence_unicode(): assert arr.to_pylist() == data -def test_array_mixed_unicode_bytes(): +def check_array_mixed_unicode_bytes(binary_type, string_type): values = [u'qux', b'foo', bytearray(b'barz')] b_values = [b'qux', b'foo', b'barz'] u_values = [u'qux', u'foo', u'barz'] @@ -615,36 +616,75 @@ def test_array_mixed_unicode_bytes(): assert arr.type == pa.binary() assert arr.equals(expected) - arr = pa.array(values, type=pa.string()) - expected = pa.array(u_values, type=pa.string()) - assert arr.type == pa.string() + arr = pa.array(values, type=binary_type) + expected = pa.array(b_values, type=binary_type) + assert arr.type == binary_type + assert arr.equals(expected) + + arr = pa.array(values, type=string_type) + expected = pa.array(u_values, type=string_type) + assert arr.type == string_type assert arr.equals(expected) +def test_array_mixed_unicode_bytes(): + check_array_mixed_unicode_bytes(pa.binary(), pa.string()) + check_array_mixed_unicode_bytes(pa.large_binary(), pa.large_string()) + + +@pytest.mark.large_memory +@pytest.mark.parametrize("ty", [pa.large_binary(), pa.large_string()]) +def test_large_binary_array(ty): + # Construct a large binary array with more than 4GB of data + s = b"0123456789abcdefghijklmnopqrstuvwxyz" * 10 + nrepeats = math.ceil((2**32 + 5) / len(s)) + data = [s] * nrepeats + arr = pa.array(data, type=ty) + assert isinstance(arr, pa.Array) + assert arr.type == ty + assert len(arr) == nrepeats + + +@pytest.mark.large_memory +@pytest.mark.parametrize("ty", [pa.large_binary(), pa.large_string()]) +def test_large_binary_value(ty): + # Construct a large binary array with a single value larger than 4GB + s = b"0123456789abcdefghijklmnopqrstuvwxyz" + nrepeats = math.ceil((2**32 + 5) / len(s)) + arr = pa.array([b"foo", s * 
nrepeats, None, b"bar"], type=ty) + assert isinstance(arr, pa.Array) + assert arr.type == ty + assert len(arr) == 4 + buf = arr[1].as_buffer() + assert len(buf) == len(s) * nrepeats + + def test_sequence_bytes(): u1 = b'ma\xc3\xb1ana' data = [b'foo', u1.decode('utf-8'), # unicode gets encoded, bytearray(b'bar'), None] - for ty in [None, pa.binary()]: + for ty in [None, pa.binary(), pa.large_binary()]: arr = pa.array(data, type=ty) assert len(arr) == 4 assert arr.null_count == 1 - assert arr.type == pa.binary() + assert arr.type == ty or pa.binary() assert arr.to_pylist() == [b'foo', u1, b'bar', None] -def test_sequence_utf8_to_unicode(): +@pytest.mark.parametrize("ty", [pa.string(), pa.large_string()]) +def test_sequence_utf8_to_unicode(ty): # ARROW-1225 data = [b'foo', None, b'bar'] - arr = pa.array(data, type=pa.string()) + arr = pa.array(data, type=ty) + assert arr.type == ty assert arr[0].as_py() == u'foo' # test a non-utf8 unicode string val = (u'mañana').encode('utf-16-le') with pytest.raises(pa.ArrowInvalid): - pa.array([val], type=pa.string()) + pa.array([val], type=ty) def test_sequence_fixed_size_bytes(): diff --git a/python/pyarrow/tests/test_scalars.py b/python/pyarrow/tests/test_scalars.py index 406f4ee697c..ca7a10e0046 100644 --- a/python/pyarrow/tests/test_scalars.py +++ b/python/pyarrow/tests/test_scalars.py @@ -104,6 +104,25 @@ def test_string_unicode(self): assert v == u'mañana' assert isinstance(v, unicode_type) + def test_large_string_unicode(self): + arr = pa.array([u'foo', None, u'mañana'], type=pa.large_string()) + + v = arr[0] + assert isinstance(v, pa.LargeStringValue) + assert v.as_py() == u'foo' + assert repr(v) == repr(u"foo") + assert str(v) == str(u"foo") + assert v == u'foo' + # Assert that newly created values are equal to the previously created + # one. 
+ assert v == arr[0] + + assert arr[1] is pa.NA + + v = arr[2].as_py() + assert v == u'mañana' + assert isinstance(v, unicode_type) + def test_bytes(self): arr = pa.array([b'foo', None, u('bar')]) @@ -122,6 +141,24 @@ def check_value(v, expected): assert arr[1] is pa.NA check_value(arr[2], b'bar') + def test_large_bytes(self): + arr = pa.array([b'foo', None, u('bar')], type=pa.large_binary()) + + def check_value(v, expected): + assert isinstance(v, pa.LargeBinaryValue) + assert v.as_py() == expected + assert str(v) == str(expected) + assert repr(v) == repr(expected) + assert v == expected + assert v != b'xxxxx' + buf = v.as_buffer() + assert isinstance(buf, pa.Buffer) + assert buf.to_pybytes() == expected + + check_value(arr[0], b'foo') + assert arr[1] is pa.NA + check_value(arr[2], b'bar') + def test_fixed_size_bytes(self): data = [b'foof', None, b'barb'] arr = pa.array(data, type=pa.binary(4)) diff --git a/python/pyarrow/tests/test_types.py b/python/pyarrow/tests/test_types.py index 7d9abf7956b..de532e6cb98 100644 --- a/python/pyarrow/tests/test_types.py +++ b/python/pyarrow/tests/test_types.py @@ -49,6 +49,8 @@ def get_many_types(): pa.string(), pa.binary(), pa.binary(10), + pa.large_string(), + pa.large_binary(), pa.list_(pa.int32()), pa.struct([pa.field('a', pa.int32()), pa.field('b', pa.int8()), @@ -145,10 +147,24 @@ def test_is_union(): def test_is_binary_string(): assert types.is_binary(pa.binary()) assert not types.is_binary(pa.string()) + assert not types.is_binary(pa.large_binary()) + assert not types.is_binary(pa.large_string()) assert types.is_string(pa.string()) assert types.is_unicode(pa.string()) assert not types.is_string(pa.binary()) + assert not types.is_string(pa.large_string()) + assert not types.is_string(pa.large_binary()) + + assert types.is_large_binary(pa.large_binary()) + assert not types.is_large_binary(pa.large_string()) + assert not types.is_large_binary(pa.binary()) + assert not types.is_large_binary(pa.string()) + + assert types.is_large_string(pa.large_string()) + assert not types.is_large_string(pa.large_binary()) + assert not types.is_large_string(pa.string()) + assert not types.is_large_string(pa.binary()) assert types.is_fixed_size_binary(pa.binary(5)) assert not types.is_fixed_size_binary(pa.binary()) diff --git a/python/pyarrow/types.pxi b/python/pyarrow/types.pxi index 657df04f965..0db15d583f3 100644 --- a/python/pyarrow/types.pxi +++ b/python/pyarrow/types.pxi @@ -106,6 +106,7 @@ cdef class DataType: "instead.".format(self.__class__.__name__)) cdef void init(self, const shared_ptr[CDataType]& type) except *: + assert type != nullptr self.sp_type = type self.type = type.get() self.pep3118_format = _datatype_to_pep3118(self.type) @@ -1531,6 +1532,33 @@ def binary(int length=-1): return pyarrow_wrap_data_type(fixed_size_binary_type) +def large_binary(): + """ + Create large variable-length binary type + + This data type may not be supported by all Arrow implementations. Unless + you need to represent data larger than 2GB, you should prefer binary(). + """ + return primitive_type(_Type_LARGE_BINARY) + + +def large_string(): + """ + Create large UTF8 variable-length string type + + This data type may not be supported by all Arrow implementations. Unless + you need to represent data larger than 2GB, you should prefer string(). 
+ """ + return primitive_type(_Type_LARGE_STRING) + + +def large_utf8(): + """ + Alias for large_string() + """ + return large_string() + + cpdef ListType list_(value_type): """ Create ListType instance from child data type or field @@ -1714,6 +1742,10 @@ cdef dict _type_aliases = { 'str': string, 'utf8': string, 'binary': binary, + 'large_string': large_string, + 'large_str': large_string, + 'large_utf8': large_string, + 'large_binary': large_binary, 'date32': date32, 'date64': date64, 'date32[day]': date32, diff --git a/python/pyarrow/types.py b/python/pyarrow/types.py index def1dde214e..dc314e8a043 100644 --- a/python/pyarrow/types.py +++ b/python/pyarrow/types.py @@ -228,6 +228,14 @@ def is_binary(t): return t.id == lib.Type_BINARY +def is_large_binary(t): + """ + Return True if value is an instance of a large variable-length + binary type + """ + return t.id == lib.Type_LARGE_BINARY + + def is_unicode(t): """ Alias for is_string @@ -242,6 +250,20 @@ def is_string(t): return t.id == lib.Type_STRING +def is_large_unicode(t): + """ + Alias for is_large_string + """ + return is_large_string(t) + + +def is_large_string(t): + """ + Return True if value is an instance of large string (utf8 unicode) type + """ + return t.id == lib.Type_LARGE_STRING + + def is_fixed_size_binary(t): """ Return True if value is an instance of a fixed size binary type From 0bb1b7f89359d2de816bde529aa5cb09ce2454db Mon Sep 17 00:00:00 2001 From: Antoine Pitrou Date: Wed, 31 Jul 2019 15:38:46 +0200 Subject: [PATCH 2/2] Fix Take on LargeBinary / LargeString data --- cpp/src/arrow/array/builder_binary.h | 2 ++ cpp/src/arrow/array/builder_decimal.h | 2 ++ cpp/src/arrow/array/builder_primitive.h | 3 ++ cpp/src/arrow/array/builder_time.h | 1 + cpp/src/arrow/compute/kernels/filter-test.cc | 22 +++++++++---- cpp/src/arrow/compute/kernels/take-internal.h | 20 ++++++------ cpp/src/arrow/compute/kernels/take-test.cc | 31 +++++++++++++------ 7 files changed, 55 insertions(+), 26 deletions(-) diff --git a/cpp/src/arrow/array/builder_binary.h b/cpp/src/arrow/array/builder_binary.h index dce5dda5945..869e61ff990 100644 --- a/cpp/src/arrow/array/builder_binary.h +++ b/cpp/src/arrow/array/builder_binary.h @@ -387,6 +387,8 @@ class ARROW_EXPORT LargeStringBuilder : public LargeBinaryBuilder { class ARROW_EXPORT FixedSizeBinaryBuilder : public ArrayBuilder { public: + using TypeClass = FixedSizeBinaryType; + FixedSizeBinaryBuilder(const std::shared_ptr& type, MemoryPool* pool ARROW_MEMORY_POOL_DEFAULT); diff --git a/cpp/src/arrow/array/builder_decimal.h b/cpp/src/arrow/array/builder_decimal.h index e64d16528da..1b8d3b4951e 100644 --- a/cpp/src/arrow/array/builder_decimal.h +++ b/cpp/src/arrow/array/builder_decimal.h @@ -28,6 +28,8 @@ class Decimal128; class ARROW_EXPORT Decimal128Builder : public FixedSizeBinaryBuilder { public: + using TypeClass = Decimal128Type; + explicit Decimal128Builder(const std::shared_ptr& type, MemoryPool* pool ARROW_MEMORY_POOL_DEFAULT); diff --git a/cpp/src/arrow/array/builder_primitive.h b/cpp/src/arrow/array/builder_primitive.h index 8abbe029e13..fecb8ca2246 100644 --- a/cpp/src/arrow/array/builder_primitive.h +++ b/cpp/src/arrow/array/builder_primitive.h @@ -58,6 +58,7 @@ class ARROW_EXPORT NullBuilder : public ArrayBuilder { template class NumericBuilder : public ArrayBuilder { public: + using TypeClass = T; using value_type = typename T::c_type; using ArrayType = typename TypeTraits::ArrayType; using ArrayBuilder::ArrayBuilder; @@ -265,7 +266,9 @@ using DoubleBuilder = NumericBuilder; class 
ARROW_EXPORT BooleanBuilder : public ArrayBuilder { public: + using TypeClass = BooleanType; using value_type = bool; + explicit BooleanBuilder(MemoryPool* pool ARROW_MEMORY_POOL_DEFAULT); explicit BooleanBuilder(const std::shared_ptr& type, MemoryPool* pool); diff --git a/cpp/src/arrow/array/builder_time.h b/cpp/src/arrow/array/builder_time.h index 3ff783b1b1c..e2641761971 100644 --- a/cpp/src/arrow/array/builder_time.h +++ b/cpp/src/arrow/array/builder_time.h @@ -34,6 +34,7 @@ namespace arrow { class ARROW_EXPORT DayTimeIntervalBuilder : public ArrayBuilder { public: + using TypeClass = DayTimeIntervalType; using DayMilliseconds = DayTimeIntervalType::DayMilliseconds; explicit DayTimeIntervalBuilder(MemoryPool* pool ARROW_MEMORY_POOL_DEFAULT) diff --git a/cpp/src/arrow/compute/kernels/filter-test.cc b/cpp/src/arrow/compute/kernels/filter-test.cc index 253093e3799..6a7385568a5 100644 --- a/cpp/src/arrow/compute/kernels/filter-test.cc +++ b/cpp/src/arrow/compute/kernels/filter-test.cc @@ -285,18 +285,26 @@ TYPED_TEST(TestFilterKernelWithNumeric, ScalarInRangeAndFilterRandomNumeric) { } } -class TestFilterKernelWithString : public TestFilterKernel { +using StringTypes = + ::testing::Types; + +template +class TestFilterKernelWithString : public TestFilterKernel { protected: + std::shared_ptr value_type() { + return TypeTraits::type_singleton(); + } + void AssertFilter(const std::string& values, const std::string& filter, const std::string& expected) { - TestFilterKernel::AssertFilter(utf8(), values, filter, expected); + TestFilterKernel::AssertFilter(value_type(), values, filter, expected); } void AssertFilterDictionary(const std::string& dictionary_values, const std::string& dictionary_filter, const std::string& filter, const std::string& expected_filter) { - auto dict = ArrayFromJSON(utf8(), dictionary_values); - auto type = dictionary(int8(), utf8()); + auto dict = ArrayFromJSON(value_type(), dictionary_values); + auto type = dictionary(int8(), value_type()); std::shared_ptr values, actual, expected; ASSERT_OK(DictionaryArray::FromArrays(type, ArrayFromJSON(int8(), dictionary_filter), dict, &values)); @@ -307,13 +315,15 @@ class TestFilterKernelWithString : public TestFilterKernel { } }; -TEST_F(TestFilterKernelWithString, FilterString) { +TYPED_TEST_CASE(TestFilterKernelWithString, StringTypes); + +TYPED_TEST(TestFilterKernelWithString, FilterString) { this->AssertFilter(R"(["a", "b", "c"])", "[0, 1, 0]", R"(["b"])"); this->AssertFilter(R"([null, "b", "c"])", "[0, 1, 0]", R"(["b"])"); this->AssertFilter(R"(["a", "b", "c"])", "[null, 1, 0]", R"([null, "b"])"); } -TEST_F(TestFilterKernelWithString, FilterDictionary) { +TYPED_TEST(TestFilterKernelWithString, FilterDictionary) { auto dict = R"(["a", "b", "c", "d", "e"])"; this->AssertFilterDictionary(dict, "[3, 4, 2]", "[0, 1, 0]", "[4]"); this->AssertFilterDictionary(dict, "[null, 4, 2]", "[0, 1, 0]", "[4]"); diff --git a/cpp/src/arrow/compute/kernels/take-internal.h b/cpp/src/arrow/compute/kernels/take-internal.h index 96519a9869a..e358994d70e 100644 --- a/cpp/src/arrow/compute/kernels/take-internal.h +++ b/cpp/src/arrow/compute/kernels/take-internal.h @@ -20,11 +20,13 @@ #include #include #include +#include #include #include #include "arrow/builder.h" #include "arrow/compute/context.h" +#include "arrow/type_traits.h" #include "arrow/util/bit-util.h" #include "arrow/util/checked_cast.h" #include "arrow/util/logging.h" @@ -37,21 +39,19 @@ namespace compute { using internal::checked_cast; using internal::checked_pointer_cast; +// For 
non-binary builders, use regular value append template -static Status UnsafeAppend(Builder* builder, Scalar&& value) { +static typename std::enable_if< + !std::is_base_of::value, Status>::type +UnsafeAppend(Builder* builder, Scalar&& value) { builder->UnsafeAppend(std::forward(value)); return Status::OK(); } -// Use BinaryBuilder::UnsafeAppend, but reserve byte storage first -static Status UnsafeAppend(BinaryBuilder* builder, util::string_view value) { - RETURN_NOT_OK(builder->ReserveData(static_cast(value.size()))); - builder->UnsafeAppend(value); - return Status::OK(); -} - -// Use StringBuilder::UnsafeAppend, but reserve character storage first -static Status UnsafeAppend(StringBuilder* builder, util::string_view value) { +// For binary builders, need to reserve byte storage first +template +static enable_if_base_binary UnsafeAppend( + Builder* builder, util::string_view value) { RETURN_NOT_OK(builder->ReserveData(static_cast(value.size()))); builder->UnsafeAppend(value); return Status::OK(); diff --git a/cpp/src/arrow/compute/kernels/take-test.cc b/cpp/src/arrow/compute/kernels/take-test.cc index 7ae9321bb46..de5b2e2d794 100644 --- a/cpp/src/arrow/compute/kernels/take-test.cc +++ b/cpp/src/arrow/compute/kernels/take-test.cc @@ -179,18 +179,26 @@ TYPED_TEST(TestTakeKernelWithNumeric, TakeRandomNumeric) { } } -class TestTakeKernelWithString : public TestTakeKernel { - protected: +using StringTypes = + ::testing::Types; + +template +class TestTakeKernelWithString : public TestTakeKernel { + public: + std::shared_ptr value_type() { + return TypeTraits::type_singleton(); + } + void AssertTake(const std::string& values, const std::string& indices, const std::string& expected) { - TestTakeKernel::AssertTake(utf8(), values, indices, expected); + TestTakeKernel::AssertTake(value_type(), values, indices, expected); } void AssertTakeDictionary(const std::string& dictionary_values, const std::string& dictionary_indices, const std::string& indices, const std::string& expected_indices) { - auto dict = ArrayFromJSON(utf8(), dictionary_values); - auto type = dictionary(int8(), utf8()); + auto dict = ArrayFromJSON(value_type(), dictionary_values); + auto type = dictionary(int8(), value_type()); std::shared_ptr values, actual, expected; ASSERT_OK(DictionaryArray::FromArrays(type, ArrayFromJSON(int8(), dictionary_indices), dict, &values)); @@ -201,19 +209,22 @@ class TestTakeKernelWithString : public TestTakeKernel { } }; -TEST_F(TestTakeKernelWithString, TakeString) { +TYPED_TEST_CASE(TestTakeKernelWithString, StringTypes); + +TYPED_TEST(TestTakeKernelWithString, TakeString) { this->AssertTake(R"(["a", "b", "c"])", "[0, 1, 0]", R"(["a", "b", "a"])"); this->AssertTake(R"([null, "b", "c"])", "[0, 1, 0]", "[null, \"b\", null]"); this->AssertTake(R"(["a", "b", "c"])", "[null, 1, 0]", R"([null, "b", "a"])"); + std::shared_ptr type = this->value_type(); std::shared_ptr arr; ASSERT_RAISES(IndexError, - this->Take(utf8(), R"(["a", "b", "c"])", int8(), "[0, 9, 0]", &arr)); - ASSERT_RAISES(IndexError, this->Take(utf8(), R"(["a", "b", null, "ddd", "ee"])", - int64(), "[2, 5]", &arr)); + this->Take(type, R"(["a", "b", "c"])", int8(), "[0, 9, 0]", &arr)); + ASSERT_RAISES(IndexError, this->Take(type, R"(["a", "b", null, "ddd", "ee"])", int64(), + "[2, 5]", &arr)); } -TEST_F(TestTakeKernelWithString, TakeDictionary) { +TYPED_TEST(TestTakeKernelWithString, TakeDictionary) { auto dict = R"(["a", "b", "c", "d", "e"])"; this->AssertTakeDictionary(dict, "[3, 4, 2]", "[0, 1, 0]", "[3, 4, 3]"); 
this->AssertTakeDictionary(dict, "[null, 4, 2]", "[0, 1, 0]", "[null, 4, null]");
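
A minimal usage sketch of the Python API introduced by this patch (not itself part of the diff), assuming a pyarrow build that includes these changes; variable names and values are illustrative only. The new types use 64-bit value offsets, so a single array is no longer bounded by the roughly 2 GiB of value data implied by the 32-bit offsets of binary()/string().

    import pyarrow as pa
    from pyarrow import types

    # Construction from Python sequences, mirroring test_convert_builtin.py.
    bins = pa.array([b'foo', None, b'bar'], type=pa.large_binary())
    assert bins.type == pa.large_binary()
    assert bins.null_count == 1

    strs = pa.array([u'foo', None, u'bar'], type=pa.large_string())
    assert isinstance(strs[0], pa.LargeStringValue)
    assert strs[0].as_py() == u'foo'
    assert isinstance(strs[0].as_buffer(), pa.Buffer)

    # The new predicates distinguish the 64-bit-offset variants from the
    # regular binary/string types.
    assert types.is_large_binary(bins.type)
    assert types.is_large_string(strs.type)
    assert not types.is_binary(bins.type)
    assert not types.is_string(strs.type)

    # 'large_utf8', 'large_str' and friends are registered as type aliases.
    assert pa.type_for_alias('large_utf8') == pa.large_string()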