diff --git a/cpp/src/arrow/array.cc b/cpp/src/arrow/array.cc index 4b1fabfdcd4..0b235cc199c 100644 --- a/cpp/src/arrow/array.cc +++ b/cpp/src/arrow/array.cc @@ -103,7 +103,7 @@ static inline std::shared_ptr SliceData(const ArrayData& data, int64_ length = std::min(data.length - offset, length); offset += data.offset; - auto new_data = data.ShallowCopy(); + auto new_data = data.Copy(); new_data->length = length; new_data->offset = offset; new_data->null_count = data.null_count != 0 ? kUnknownNullCount : 0; @@ -482,14 +482,14 @@ DictionaryArray::DictionaryArray(const std::shared_ptr& type, : dict_type_(static_cast(type.get())) { DCHECK_EQ(type->id(), Type::DICTIONARY); DCHECK_EQ(indices->type_id(), dict_type_->index_type()->id()); - auto data = indices->data()->ShallowCopy(); + auto data = indices->data()->Copy(); data->type = type; SetData(data); } void DictionaryArray::SetData(const std::shared_ptr& data) { this->Array::SetData(data); - auto indices_data = data_->ShallowCopy(); + auto indices_data = data_->Copy(); indices_data->type = dict_type_->index_type(); std::shared_ptr result; indices_ = MakeArray(indices_data); diff --git a/cpp/src/arrow/array.h b/cpp/src/arrow/array.h index ec5381d6e28..ebe54adcb9e 100644 --- a/cpp/src/arrow/array.h +++ b/cpp/src/arrow/array.h @@ -143,9 +143,14 @@ struct ARROW_EXPORT ArrayData { return *this; } - std::shared_ptr ShallowCopy() const { - return std::make_shared(*this); - } + std::shared_ptr Copy() const { return std::make_shared(*this); } + +#ifndef ARROW_NO_DEPRECATED_API + + // Deprecated since 0.8.0 + std::shared_ptr ShallowCopy() const { return Copy(); } + +#endif std::shared_ptr type; int64_t length; diff --git a/cpp/src/arrow/compute/compute-test.cc b/cpp/src/arrow/compute/compute-test.cc index d5158978c8d..c73bfa309fd 100644 --- a/cpp/src/arrow/compute/compute-test.cc +++ b/cpp/src/arrow/compute/compute-test.cc @@ -709,6 +709,66 @@ TEST_F(TestCast, PreallocatedMemory) { ASSERT_ARRAYS_EQUAL(*expected, *result); } +template +void CheckOffsetOutputCase(FunctionContext* ctx, const std::shared_ptr& in_type, + const vector& in_values, + const std::shared_ptr& out_type, + const vector& out_values) { + using OutTraits = TypeTraits; + + CastOptions options; + + const int64_t length = static_cast(in_values.size()); + + shared_ptr arr, expected; + ArrayFromVector(in_type, in_values, &arr); + ArrayFromVector(out_type, out_values, &expected); + + shared_ptr out_buffer; + ASSERT_OK(ctx->Allocate(OutTraits::bytes_required(length), &out_buffer)); + + std::unique_ptr kernel; + ASSERT_OK(GetCastFunction(*in_type, out_type, options, &kernel)); + + const int64_t first_half = length / 2; + + auto out_data = ArrayData::Make(out_type, length, {nullptr, out_buffer}); + auto out_second_data = out_data->Copy(); + out_second_data->offset = first_half; + + Datum out_first(out_data); + Datum out_second(out_second_data); + + // Cast each bit + ASSERT_OK(kernel->Call(ctx, Datum(arr->Slice(0, first_half)), &out_first)); + ASSERT_OK(kernel->Call(ctx, Datum(arr->Slice(first_half)), &out_second)); + + shared_ptr result = MakeArray(out_data); + + ASSERT_ARRAYS_EQUAL(*expected, *result); +} + +TEST_F(TestCast, OffsetOutputBuffer) { + // ARROW-1735 + vector v1 = {0, 10000, 2000, 1000, 0}; + vector e1 = {0, 10000, 2000, 1000, 0}; + + auto in_type = int32(); + auto out_type = int64(); + CheckOffsetOutputCase(&this->ctx_, in_type, v1, + out_type, e1); + + vector e2 = {false, true, true, true, false}; + + out_type = boolean(); + CheckOffsetOutputCase(&this->ctx_, in_type, v1, + boolean(), e2); + + vector e3 = {0, 10000, 2000, 1000, 0}; + CheckOffsetOutputCase(&this->ctx_, in_type, v1, + int16(), e3); +} + template class TestDictionaryCast : public TestCast {}; diff --git a/cpp/src/arrow/compute/kernels/cast.cc b/cpp/src/arrow/compute/kernels/cast.cc index d48d669922a..465be958cfa 100644 --- a/cpp/src/arrow/compute/kernels/cast.cc +++ b/cpp/src/arrow/compute/kernels/cast.cc @@ -124,12 +124,7 @@ template struct CastFunctor::value>::type> { void operator()(FunctionContext* ctx, const CastOptions& options, - const ArrayData& input, ArrayData* output) { - // Simply initialize data to 0 - auto buf = output->buffers[1]; - DCHECK_EQ(output->offset, 0); - memset(buf->mutable_data(), 0, buf->size()); - } + const ArrayData& input, ArrayData* output) {} }; template <> @@ -199,14 +194,19 @@ struct CastFunctor::v !std::is_same::value>::type> { void operator()(FunctionContext* ctx, const CastOptions& options, const ArrayData& input, ArrayData* output) { - using in_type = typename I::c_type; - DCHECK_EQ(output->offset, 0); + auto in_data = GetValues(input, 1); + internal::BitmapWriter writer(output->buffers[1]->mutable_data(), output->offset, + input.length); - const in_type* in_data = GetValues(input, 1); - uint8_t* out_data = GetMutableValues(output, 1); for (int64_t i = 0; i < input.length; ++i) { - BitUtil::SetBitTo(out_data, i, (*in_data++) != 0); + if (*in_data++ != 0) { + writer.Set(); + } else { + writer.Clear(); + } + writer.Next(); } + writer.Finish(); } }; @@ -217,7 +217,6 @@ struct CastFunctoroffset, 0); auto in_offset = input.offset; @@ -475,9 +474,10 @@ void UnpackFixedSizeBinaryDictionary(FunctionContext* ctx, const Array& indices, const index_c_type* in = GetValues(*indices.data(), 1); - uint8_t* out = output->buffers[1]->mutable_data(); int32_t byte_width = static_cast(*output->type).byte_width(); + + uint8_t* out = output->buffers[1]->mutable_data() + byte_width * output->offset; for (int64_t i = 0; i < indices.length(); ++i) { if (valid_bits_reader.IsSet()) { const uint8_t* value = dictionary.Value(in[i]); @@ -493,7 +493,7 @@ struct CastFunctor< typename std::enable_if::value>::type> { void operator()(FunctionContext* ctx, const CastOptions& options, const ArrayData& input, ArrayData* output) { - DictionaryArray dict_array(input.ShallowCopy()); + DictionaryArray dict_array(input.Copy()); const DictionaryType& type = static_cast(*input.type); const DataType& values_type = *type.dictionary()->type(); @@ -565,7 +565,7 @@ struct CastFunctor::value>::type> { void operator()(FunctionContext* ctx, const CastOptions& options, const ArrayData& input, ArrayData* output) { - DictionaryArray dict_array(input.ShallowCopy()); + DictionaryArray dict_array(input.Copy()); const DictionaryType& type = static_cast(*input.type); const DataType& values_type = *type.dictionary()->type(); @@ -605,12 +605,10 @@ struct CastFunctor void UnpackPrimitiveDictionary(const Array& indices, const c_type* dictionary, c_type* out) { - using index_c_type = typename IndexType::c_type; - internal::BitmapReader valid_bits_reader(indices.null_bitmap_data(), indices.offset(), indices.length()); - const index_c_type* in = GetValues(*indices.data(), 1); + auto in = GetValues(*indices.data(), 1); for (int64_t i = 0; i < indices.length(); ++i) { if (valid_bits_reader.IsSet()) { out[i] = dictionary[in[i]]; @@ -627,7 +625,7 @@ struct CastFunctor(*input.type); const DataType& values_type = *type.dictionary()->type(); @@ -638,7 +636,7 @@ struct CastFunctor(*type.dictionary()->data(), 1); - auto out = reinterpret_cast(output->buffers[1]->mutable_data()); + auto out = GetMutableValues(output, 1); const Array& indices = *dict_array.indices(); switch (indices.type()->id()) { case Type::INT8: diff --git a/cpp/src/arrow/python/arrow_to_python.h b/cpp/src/arrow/python/arrow_to_python.h index 9440ffb32ab..02a22f07d7e 100644 --- a/cpp/src/arrow/python/arrow_to_python.h +++ b/cpp/src/arrow/python/arrow_to_python.h @@ -51,8 +51,8 @@ Status ReadSerializedObject(io::RandomAccessFile* src, SerializedPyObject* out); /// \brief Reconstruct SerializedPyObject from representation produced by /// SerializedPyObject::GetComponents. /// -/// \param[in] num_tensors -/// \param[in] num_buffers +/// \param[in] num_tensors number of tensors in the object +/// \param[in] num_buffers number of buffers in the object /// \param[in] data a list containing pyarrow.Buffer instances. Must be 1 + /// num_tensors * 2 + num_buffers in length /// \param[out] out the reconstructed object diff --git a/cpp/src/arrow/python/numpy_to_arrow.cc b/cpp/src/arrow/python/numpy_to_arrow.cc index 0c0d1a9b3fd..0b1124d303d 100644 --- a/cpp/src/arrow/python/numpy_to_arrow.cc +++ b/cpp/src/arrow/python/numpy_to_arrow.cc @@ -765,7 +765,7 @@ Status NumPyConverter::ConvertObjectStrings() { // If we saw PyBytes, convert everything to BinaryArray if (global_have_bytes) { for (size_t i = 0; i < out_arrays_.size(); ++i) { - auto binary_data = out_arrays_[i]->data()->ShallowCopy(); + auto binary_data = out_arrays_[i]->data()->Copy(); binary_data->type = ::arrow::binary(); out_arrays_[i] = std::make_shared(binary_data); }