diff --git a/c_glib/test/test-array.rb b/c_glib/test/test-array.rb index 43181742cc1..ee176b9949e 100644 --- a/c_glib/test/test-array.rb +++ b/c_glib/test/test-array.rb @@ -154,7 +154,7 @@ def test_diff def test_different_type array = build_string_array(["Start", "Shutdown", "Reboot"]) other_array = build_int8_array([2, 3, 6, 10]) - assert_equal("# Array types differed: string vs int8", + assert_equal("# Array types differed: string vs int8\n", array.diff_unified(other_array)) end end diff --git a/ci/conda_env_python.yml b/ci/conda_env_python.yml index c76c370e210..480c1d59799 100644 --- a/ci/conda_env_python.yml +++ b/ci/conda_env_python.yml @@ -16,6 +16,7 @@ # under the License. # don't add pandas here, because it is not a mandatory test dependency +cffi cython cloudpickle hypothesis diff --git a/ci/conda_env_r.yml b/ci/conda_env_r.yml index f7f76126cdb..8cecbe5752b 100644 --- a/ci/conda_env_r.yml +++ b/ci/conda_env_r.yml @@ -32,6 +32,7 @@ r-covr r-hms r-lubridate r-rcmdcheck +r-reticulate r-rmarkdown r-testthat r-tibble diff --git a/ci/docker/conda-r.dockerfile b/ci/docker/conda-r.dockerfile index b3749b912a0..ad8e59fb2a8 100644 --- a/ci/docker/conda-r.dockerfile +++ b/ci/docker/conda-r.dockerfile @@ -48,5 +48,6 @@ ENV ARROW_BUILD_STATIC=OFF \ ARROW_ORC=OFF \ ARROW_PARQUET=ON \ ARROW_PLASMA=OFF \ + ARROW_USE_CCACHE=ON \ ARROW_USE_GLOG=OFF \ LC_ALL=en_US.UTF-8 diff --git a/ci/docker/linux-apt-r.dockerfile b/ci/docker/linux-apt-r.dockerfile index 100a1f4c018..3e1e13732b1 100644 --- a/ci/docker/linux-apt-r.dockerfile +++ b/ci/docker/linux-apt-r.dockerfile @@ -47,7 +47,11 @@ RUN apt-get update -y && \ # R CMD CHECK --as-cran needs pdflatex to build the package manual texlive-latex-base \ # Need locales so we can set UTF-8 - locales && \ + locales \ + # Need Python to check py-to-r bridge + python3 \ + python3-pip \ + python3-dev && \ locale-gen en_US.UTF-8 && \ apt-get clean && \ rm -rf /var/lib/apt/lists/* @@ -63,6 +67,18 @@ COPY ci/scripts/r_deps.sh 
/arrow/ci/scripts/ COPY r/DESCRIPTION /arrow/r/ RUN /arrow/ci/scripts/r_deps.sh /arrow +# Set up Python 3 and its dependencies +RUN ln -s /usr/bin/python3 /usr/local/bin/python && \ + ln -s /usr/bin/pip3 /usr/local/bin/pip + +COPY python/requirements.txt \ + python/requirements-test.txt \ + /arrow/python/ + +RUN pip install \ + -r arrow/python/requirements.txt \ + cython setuptools + ENV \ ARROW_BUILD_STATIC=OFF \ ARROW_BUILD_TESTS=OFF \ @@ -74,5 +90,7 @@ ENV \ ARROW_ORC=OFF \ ARROW_PARQUET=ON \ ARROW_PLASMA=OFF \ + ARROW_PYTHON=ON \ + ARROW_USE_CCACHE=ON \ ARROW_USE_GLOG=OFF \ LC_ALL=en_US.UTF-8 diff --git a/cpp/src/arrow/CMakeLists.txt b/cpp/src/arrow/CMakeLists.txt index 43dbddb1f6f..30d1510ae8a 100644 --- a/cpp/src/arrow/CMakeLists.txt +++ b/cpp/src/arrow/CMakeLists.txt @@ -139,6 +139,7 @@ set(ARROW_SRCS tensor.cc type.cc visitor.cc + c/bridge.cc io/buffered.cc io/compressed.cc io/file.cc @@ -278,6 +279,7 @@ add_subdirectory(testing) # add_subdirectory(array) +add_subdirectory(c) add_subdirectory(io) add_subdirectory(util) add_subdirectory(vendored) diff --git a/cpp/src/arrow/array.cc b/cpp/src/arrow/array.cc index 54736fa54ac..da22cf14d86 100644 --- a/cpp/src/arrow/array.cc +++ b/cpp/src/arrow/array.cc @@ -716,6 +716,12 @@ Result> StructArray::Make( if (offset > length) { return Status::IndexError("Offset greater than length of child arrays"); } + if (null_bitmap == nullptr) { + if (null_count > 0) { + return Status::Invalid("null_count = ", null_count, " but no null bitmap given"); + } + null_count = 0; + } return std::make_shared(struct_(fields), length - offset, children, null_bitmap, null_count, offset); } diff --git a/cpp/src/arrow/array/builder_primitive.h b/cpp/src/arrow/array/builder_primitive.h index 78d211b954b..f1d4fb369d4 100644 --- a/cpp/src/arrow/array/builder_primitive.h +++ b/cpp/src/arrow/array/builder_primitive.h @@ -83,7 +83,7 @@ class NumericBuilder : public ArrayBuilder { /// uninitialized memory access Status AppendNulls(int64_t length) 
final { ARROW_RETURN_NOT_OK(Reserve(length)); - data_builder_.UnsafeAppend(length, static_cast(0)); + data_builder_.UnsafeAppend(length, value_type{}); // zero UnsafeSetNull(length); return Status::OK(); } @@ -91,7 +91,7 @@ class NumericBuilder : public ArrayBuilder { /// \brief Append a single null element Status AppendNull() final { ARROW_RETURN_NOT_OK(Reserve(1)); - data_builder_.UnsafeAppend(static_cast(0)); + data_builder_.UnsafeAppend(value_type{}); // zero UnsafeAppendToBitmap(false); return Status::OK(); } @@ -243,7 +243,7 @@ class NumericBuilder : public ArrayBuilder { void UnsafeAppendNull() { ArrayBuilder::UnsafeAppendToBitmap(false); - data_builder_.UnsafeAppend(0); + data_builder_.UnsafeAppend(value_type{}); // zero } std::shared_ptr type() const override { return type_; } diff --git a/cpp/src/arrow/array/builder_time.h b/cpp/src/arrow/array/builder_time.h index 83597336f35..d1d5f967961 100644 --- a/cpp/src/arrow/array/builder_time.h +++ b/cpp/src/arrow/array/builder_time.h @@ -21,52 +21,23 @@ #include -#include "arrow/array.h" #include "arrow/array/builder_base.h" -#include "arrow/array/builder_binary.h" #include "arrow/array/builder_primitive.h" -#include "arrow/buffer_builder.h" -#include "arrow/status.h" -#include "arrow/type_traits.h" -#include "arrow/util/macros.h" namespace arrow { -class ARROW_EXPORT DayTimeIntervalBuilder : public ArrayBuilder { +// TODO this class is untested + +class ARROW_EXPORT DayTimeIntervalBuilder : public NumericBuilder { public: - using TypeClass = DayTimeIntervalType; using DayMilliseconds = DayTimeIntervalType::DayMilliseconds; explicit DayTimeIntervalBuilder(MemoryPool* pool ARROW_MEMORY_POOL_DEFAULT) : DayTimeIntervalBuilder(day_time_interval(), pool) {} - DayTimeIntervalBuilder(std::shared_ptr type, - MemoryPool* pool ARROW_MEMORY_POOL_DEFAULT) - : ArrayBuilder(pool), builder_(fixed_size_binary(sizeof(DayMilliseconds)), pool) {} - - void Reset() override { builder_.Reset(); } - Status Resize(int64_t capacity) 
override { return builder_.Resize(capacity); } - Status Append(DayMilliseconds day_millis) { - return builder_.Append(reinterpret_cast(&day_millis)); - } - void UnsafeAppend(DayMilliseconds day_millis) { - builder_.UnsafeAppend(reinterpret_cast(&day_millis)); - } - using ArrayBuilder::UnsafeAppendNull; - Status AppendNull() override { return builder_.AppendNull(); } - Status AppendNulls(int64_t length) override { return builder_.AppendNulls(length); } - Status FinishInternal(std::shared_ptr* out) override { - auto result = builder_.FinishInternal(out); - if (*out != NULLPTR) { - (*out)->type = type(); - } - return result; - } - - std::shared_ptr type() const override { return day_time_interval(); } - - private: - FixedSizeBinaryBuilder builder_; + explicit DayTimeIntervalBuilder(std::shared_ptr type, + MemoryPool* pool ARROW_MEMORY_POOL_DEFAULT) + : NumericBuilder(type, pool) {} }; } // namespace arrow diff --git a/cpp/src/arrow/array/diff_test.cc b/cpp/src/arrow/array/diff_test.cc index 0ad321158ee..b76d74fcdad 100644 --- a/cpp/src/arrow/array/diff_test.cc +++ b/cpp/src/arrow/array/diff_test.cc @@ -152,7 +152,7 @@ TEST_F(DiffTest, Errors) { ASSERT_RAISES(TypeError, Diff(*base_, *target_, default_memory_pool())); ASSERT_FALSE(base_->Equals(*target_, EqualOptions().diff_sink(&formatted))); - ASSERT_EQ(formatted.str(), R"(# Array types differed: int32 vs string)"); + ASSERT_EQ(formatted.str(), "# Array types differed: int32 vs string\n"); } template diff --git a/cpp/src/arrow/array/validate.cc b/cpp/src/arrow/array/validate.cc index 5766c1fea68..064dcdd3e29 100644 --- a/cpp/src/arrow/array/validate.cc +++ b/cpp/src/arrow/array/validate.cc @@ -18,6 +18,7 @@ #include "arrow/array/validate.h" #include "arrow/array.h" +#include "arrow/util/bit_util.h" #include "arrow/util/int_util.h" #include "arrow/util/logging.h" #include "arrow/visitor_inline.h" @@ -41,11 +42,13 @@ struct ValidateArrayVisitor { ARROW_RETURN_IF(array.data()->buffers.size() != 2, Status::Invalid("number 
of buffers is != 2")); - if (array.length() > 0 && array.data()->buffers[1] == nullptr) { - return Status::Invalid("values buffer is null"); - } - if (array.length() > 0 && array.values() == nullptr) { - return Status::Invalid("values is null"); + if (array.length() > 0) { + if (array.data()->buffers[1] == nullptr) { + return Status::Invalid("values buffer is null"); + } + if (array.values() == nullptr) { + return Status::Invalid("values is null"); + } } return Status::OK(); } @@ -265,7 +268,8 @@ struct ValidateArrayVisitor { auto value_offsets = array.value_offsets(); if (value_offsets == nullptr) { - if (array.length() != 0) { + // For length 0, an empty offsets array seems accepted as a special case (ARROW-544) + if (array.length() > 0) { return Status::Invalid("non-empty array but value_offsets_ is null"); } return Status::OK(); diff --git a/cpp/src/arrow/c/CMakeLists.txt b/cpp/src/arrow/c/CMakeLists.txt new file mode 100644 index 00000000000..3765477ba09 --- /dev/null +++ b/cpp/src/arrow/c/CMakeLists.txt @@ -0,0 +1,22 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +add_arrow_test(bridge_test PREFIX "arrow-c") + +add_arrow_benchmark(bridge_benchmark) + +arrow_install_all_headers("arrow/c") diff --git a/cpp/src/arrow/c/abi.h b/cpp/src/arrow/c/abi.h new file mode 100644 index 00000000000..821bc961281 --- /dev/null +++ b/cpp/src/arrow/c/abi.h @@ -0,0 +1,65 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#pragma once + +#include + +#ifdef __cplusplus +extern "C" { +#endif + +#define ARROW_FLAG_DICTIONARY_ORDERED 1 +#define ARROW_FLAG_NULLABLE 2 +#define ARROW_FLAG_MAP_KEYS_SORTED 4 + +struct ArrowSchema { + // Array type description + const char* format; + const char* name; + const char* metadata; + int64_t flags; + int64_t n_children; + struct ArrowSchema** children; + struct ArrowSchema* dictionary; + + // Release callback + void (*release)(struct ArrowSchema*); + // Opaque producer-specific data + void* private_data; +}; + +struct ArrowArray { + // Array data description + int64_t length; + int64_t null_count; + int64_t offset; + int64_t n_buffers; + int64_t n_children; + const void** buffers; + struct ArrowArray** children; + struct ArrowArray* dictionary; + + // Release callback + void (*release)(struct ArrowArray*); + // Opaque producer-specific data + void* private_data; +}; + +#ifdef __cplusplus +} +#endif diff --git a/cpp/src/arrow/c/bridge.cc b/cpp/src/arrow/c/bridge.cc new file mode 100644 index 00000000000..3c14571cf4b --- /dev/null +++ b/cpp/src/arrow/c/bridge.cc @@ -0,0 +1,1606 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include "arrow/c/bridge.h" + +#include +#include +#include +#include +#include + +#include "arrow/array.h" +#include "arrow/buffer.h" +#include "arrow/c/helpers.h" +#include "arrow/c/util_internal.h" +#include "arrow/memory_pool.h" +#include "arrow/record_batch.h" +#include "arrow/result.h" +#include "arrow/stl_allocator.h" +#include "arrow/type_traits.h" +#include "arrow/util/bit_util.h" +#include "arrow/util/checked_cast.h" +#include "arrow/util/key_value_metadata.h" +#include "arrow/util/logging.h" +#include "arrow/util/macros.h" +#include "arrow/util/parsing.h" +#include "arrow/util/string_view.h" +#include "arrow/visitor_inline.h" + +namespace arrow { + +using internal::checked_cast; +using internal::checked_pointer_cast; + +using internal::ArrayExportGuard; +using internal::ArrayExportTraits; +using internal::SchemaExportGuard; +using internal::SchemaExportTraits; + +// TODO export / import Extension types and arrays + +namespace { + +Status ExportingNotImplemented(const DataType& type) { + return Status::NotImplemented("Exporting ", type.ToString(), " array not supported"); +} + +// Allocate exported private data using MemoryPool, +// to allow accounting memory and checking for memory leaks. + +// XXX use Gandiva's SimpleArena? 
+ +template +using PoolVector = std::vector>; + +template +struct PoolAllocationMixin { + static void* operator new(size_t size) { + DCHECK_EQ(size, sizeof(Derived)); + uint8_t* data; + ARROW_CHECK_OK(default_memory_pool()->Allocate(static_cast(size), &data)); + return data; + } + + static void operator delete(void* ptr) { + default_memory_pool()->Free(reinterpret_cast(ptr), sizeof(Derived)); + } +}; + +////////////////////////////////////////////////////////////////////////// +// C schema export + +struct ExportedSchemaPrivateData : PoolAllocationMixin { + std::string format_; + std::string name_; + std::string metadata_; + struct ArrowSchema dictionary_; + PoolVector children_; + PoolVector child_pointers_; + + ExportedSchemaPrivateData() = default; + ARROW_DEFAULT_MOVE_AND_ASSIGN(ExportedSchemaPrivateData); + ARROW_DISALLOW_COPY_AND_ASSIGN(ExportedSchemaPrivateData); +}; + +void ReleaseExportedSchema(struct ArrowSchema* schema) { + if (ArrowSchemaIsReleased(schema)) { + return; + } + for (int64_t i = 0; i < schema->n_children; ++i) { + struct ArrowSchema* child = schema->children[i]; + ArrowSchemaRelease(child); + DCHECK(ArrowSchemaIsReleased(child)) + << "Child release callback should have marked it released"; + } + struct ArrowSchema* dict = schema->dictionary; + if (dict != nullptr) { + ArrowSchemaRelease(dict); + DCHECK(ArrowSchemaIsReleased(dict)) + << "Dictionary release callback should have marked it released"; + } + DCHECK_NE(schema->private_data, nullptr); + delete reinterpret_cast(schema->private_data); + + ArrowSchemaMarkReleased(schema); +} + +template +Result DowncastMetadataSize(SizeType size) { + auto res = static_cast(size); + if (res < 0 || static_cast(res) != size) { + return Status::Invalid("Metadata too large (more than 2**31 items or bytes)"); + } + return res; +} + +Result EncodeMetadata(const KeyValueMetadata& metadata) { + ARROW_ASSIGN_OR_RAISE(auto npairs, DowncastMetadataSize(metadata.size())); + std::string exported; + + // Pre-compute 
total string size + size_t total_size = 4; + for (int32_t i = 0; i < npairs; ++i) { + total_size += 8 + metadata.key(i).length() + metadata.value(i).length(); + } + exported.resize(total_size); + + char* data_start = &exported[0]; + char* data = data_start; + auto write_int32 = [&](int32_t v) -> void { + memcpy(data, &v, 4); + data += 4; + }; + auto write_string = [&](const std::string& s) -> Status { + ARROW_ASSIGN_OR_RAISE(auto len, DowncastMetadataSize(s.length())); + write_int32(len); + if (len > 0) { + memcpy(data, s.data(), len); + data += len; + } + return Status::OK(); + }; + + write_int32(npairs); + for (int32_t i = 0; i < npairs; ++i) { + RETURN_NOT_OK(write_string(metadata.key(i))); + RETURN_NOT_OK(write_string(metadata.value(i))); + } + DCHECK_EQ(static_cast(data - data_start), total_size); + return exported; +} + +struct SchemaExporter { + Status ExportField(const Field& field) { + export_.name_ = field.name(); + flags_ = field.nullable() ? ARROW_FLAG_NULLABLE : 0; + + const DataType& type = *field.type(); + RETURN_NOT_OK(ExportFormat(type)); + RETURN_NOT_OK(ExportChildren(type.children())); + RETURN_NOT_OK(ExportMetadata(field.metadata().get())); + return Status::OK(); + } + + Status ExportType(const DataType& type) { + flags_ = ARROW_FLAG_NULLABLE; + + RETURN_NOT_OK(ExportFormat(type)); + RETURN_NOT_OK(ExportChildren(type.children())); + return Status::OK(); + } + + Status ExportSchema(const Schema& schema) { + static StructType dummy_struct_type({}); + flags_ = 0; + + RETURN_NOT_OK(ExportFormat(dummy_struct_type)); + RETURN_NOT_OK(ExportChildren(schema.fields())); + RETURN_NOT_OK(ExportMetadata(schema.metadata().get())); + return Status::OK(); + } + + // Finalize exporting by setting C struct fields and allocating + // autonomous private data for each schema node. + // + // This function can't fail, as properly reclaiming memory in case of error + // would be too fragile. 
After this function returns, memory is reclaimed + // by calling the release() pointer in the top level ArrowSchema struct. + void Finish(struct ArrowSchema* c_struct) { + // First, create permanent ExportedSchemaPrivateData + auto pdata = new ExportedSchemaPrivateData(std::move(export_)); + + // Second, finish dictionary and children. + if (dict_exporter_) { + dict_exporter_->Finish(&pdata->dictionary_); + } + pdata->child_pointers_.resize(child_exporters_.size(), nullptr); + for (size_t i = 0; i < child_exporters_.size(); ++i) { + auto ptr = pdata->child_pointers_[i] = &pdata->children_[i]; + child_exporters_[i].Finish(ptr); + } + + // Third, fill C struct. + DCHECK_NE(c_struct, nullptr); + memset(c_struct, 0, sizeof(*c_struct)); + + c_struct->format = pdata->format_.c_str(); + c_struct->name = pdata->name_.c_str(); + c_struct->metadata = pdata->metadata_.empty() ? nullptr : pdata->metadata_.c_str(); + c_struct->flags = flags_; + + c_struct->n_children = static_cast(child_exporters_.size()); + c_struct->children = pdata->child_pointers_.data(); + c_struct->dictionary = dict_exporter_ ? &pdata->dictionary_ : nullptr; + c_struct->private_data = pdata; + c_struct->release = ReleaseExportedSchema; + } + + Status ExportFormat(const DataType& type) { + if (type.id() == Type::DICTIONARY) { + const auto& dict_type = checked_cast(type); + if (dict_type.ordered()) { + flags_ |= ARROW_FLAG_DICTIONARY_ORDERED; + } + // Dictionary type: parent struct describes index type, + // child dictionary struct describes value type. 
+ RETURN_NOT_OK(VisitTypeInline(*dict_type.index_type(), this)); + dict_exporter_.reset(new SchemaExporter()); + RETURN_NOT_OK(dict_exporter_->ExportType(*dict_type.value_type())); + } else { + RETURN_NOT_OK(VisitTypeInline(type, this)); + } + DCHECK(!export_.format_.empty()); + return Status::OK(); + } + + Status ExportChildren(const std::vector>& fields) { + export_.children_.resize(fields.size()); + child_exporters_.resize(fields.size()); + for (size_t i = 0; i < fields.size(); ++i) { + RETURN_NOT_OK(child_exporters_[i].ExportField(*fields[i])); + } + return Status::OK(); + } + + Status ExportMetadata(const KeyValueMetadata* metadata) { + if (metadata != nullptr && metadata->size() >= 0) { + ARROW_ASSIGN_OR_RAISE(export_.metadata_, EncodeMetadata(*metadata)); + } + return Status::OK(); + } + + // Type-specific visitors + + Status Visit(const DataType& type) { return ExportingNotImplemented(type); } + + Status Visit(const NullType& type) { + export_.format_ = "n"; + return Status::OK(); + } + + Status Visit(const BooleanType& type) { + export_.format_ = "b"; + return Status::OK(); + } + + Status Visit(const Int8Type& type) { + export_.format_ = "c"; + return Status::OK(); + } + + Status Visit(const UInt8Type& type) { + export_.format_ = "C"; + return Status::OK(); + } + + Status Visit(const Int16Type& type) { + export_.format_ = "s"; + return Status::OK(); + } + + Status Visit(const UInt16Type& type) { + export_.format_ = "S"; + return Status::OK(); + } + + Status Visit(const Int32Type& type) { + export_.format_ = "i"; + return Status::OK(); + } + + Status Visit(const UInt32Type& type) { + export_.format_ = "I"; + return Status::OK(); + } + + Status Visit(const Int64Type& type) { + export_.format_ = "l"; + return Status::OK(); + } + + Status Visit(const UInt64Type& type) { + export_.format_ = "L"; + return Status::OK(); + } + + Status Visit(const HalfFloatType& type) { + export_.format_ = "e"; + return Status::OK(); + } + + Status Visit(const FloatType& type) { + 
export_.format_ = "f"; + return Status::OK(); + } + + Status Visit(const DoubleType& type) { + export_.format_ = "g"; + return Status::OK(); + } + + Status Visit(const FixedSizeBinaryType& type) { + export_.format_ = "w:" + std::to_string(type.byte_width()); + return Status::OK(); + } + + Status Visit(const Decimal128Type& type) { + export_.format_ = + "d:" + std::to_string(type.precision()) + "," + std::to_string(type.scale()); + return Status::OK(); + } + + Status Visit(const BinaryType& type) { + export_.format_ = "z"; + return Status::OK(); + } + + Status Visit(const LargeBinaryType& type) { + export_.format_ = "Z"; + return Status::OK(); + } + + Status Visit(const StringType& type) { + export_.format_ = "u"; + return Status::OK(); + } + + Status Visit(const LargeStringType& type) { + export_.format_ = "U"; + return Status::OK(); + } + + Status Visit(const Date32Type& type) { + export_.format_ = "tdD"; + return Status::OK(); + } + + Status Visit(const Date64Type& type) { + export_.format_ = "tdm"; + return Status::OK(); + } + + Status Visit(const Time32Type& type) { + switch (type.unit()) { + case TimeUnit::SECOND: + export_.format_ = "tts"; + break; + case TimeUnit::MILLI: + export_.format_ = "ttm"; + break; + default: + return Status::Invalid("Invalid time unit for Time32: ", type.unit()); + } + return Status::OK(); + } + + Status Visit(const Time64Type& type) { + switch (type.unit()) { + case TimeUnit::MICRO: + export_.format_ = "ttu"; + break; + case TimeUnit::NANO: + export_.format_ = "ttn"; + break; + default: + return Status::Invalid("Invalid time unit for Time64: ", type.unit()); + } + return Status::OK(); + } + + Status Visit(const TimestampType& type) { + switch (type.unit()) { + case TimeUnit::SECOND: + export_.format_ = "tss:"; + break; + case TimeUnit::MILLI: + export_.format_ = "tsm:"; + break; + case TimeUnit::MICRO: + export_.format_ = "tsu:"; + break; + case TimeUnit::NANO: + export_.format_ = "tsn:"; + break; + default: + return 
Status::Invalid("Invalid time unit for Timestamp: ", type.unit()); + } + export_.format_ += type.timezone(); + return Status::OK(); + } + + Status Visit(const DurationType& type) { + switch (type.unit()) { + case TimeUnit::SECOND: + export_.format_ = "tDs"; + break; + case TimeUnit::MILLI: + export_.format_ = "tDm"; + break; + case TimeUnit::MICRO: + export_.format_ = "tDu"; + break; + case TimeUnit::NANO: + export_.format_ = "tDn"; + break; + default: + return Status::Invalid("Invalid time unit for Duration: ", type.unit()); + } + return Status::OK(); + } + + Status Visit(const MonthIntervalType& type) { + export_.format_ = "tiM"; + return Status::OK(); + } + + Status Visit(const DayTimeIntervalType& type) { + export_.format_ = "tiD"; + return Status::OK(); + } + + Status Visit(const ListType& type) { + export_.format_ = "+l"; + return Status::OK(); + } + + Status Visit(const LargeListType& type) { + export_.format_ = "+L"; + return Status::OK(); + } + + Status Visit(const FixedSizeListType& type) { + export_.format_ = "+w:" + std::to_string(type.list_size()); + return Status::OK(); + } + + Status Visit(const StructType& type) { + export_.format_ = "+s"; + return Status::OK(); + } + + Status Visit(const MapType& type) { + export_.format_ = "+m"; + if (type.keys_sorted()) { + flags_ |= ARROW_FLAG_MAP_KEYS_SORTED; + } + return Status::OK(); + } + + Status Visit(const UnionType& type) { + std::string& s = export_.format_; + s = "+u"; + if (type.mode() == UnionMode::DENSE) { + s += "d:"; + } else { + DCHECK_EQ(type.mode(), UnionMode::SPARSE); + s += "s:"; + } + bool first = true; + for (const auto code : type.type_codes()) { + if (!first) { + s += ","; + } + s += std::to_string(code); + first = false; + } + return Status::OK(); + } + + ExportedSchemaPrivateData export_; + int64_t flags_ = 0; + std::unique_ptr dict_exporter_; + std::vector child_exporters_; +}; + +} // namespace + +Status ExportType(const DataType& type, struct ArrowSchema* out) { + SchemaExporter 
exporter; + RETURN_NOT_OK(exporter.ExportType(type)); + exporter.Finish(out); + return Status::OK(); +} + +Status ExportField(const Field& field, struct ArrowSchema* out) { + SchemaExporter exporter; + RETURN_NOT_OK(exporter.ExportField(field)); + exporter.Finish(out); + return Status::OK(); +} + +Status ExportSchema(const Schema& schema, struct ArrowSchema* out) { + SchemaExporter exporter; + RETURN_NOT_OK(exporter.ExportSchema(schema)); + exporter.Finish(out); + return Status::OK(); +} + +////////////////////////////////////////////////////////////////////////// +// C data export + +namespace { + +struct ExportedArrayPrivateData : PoolAllocationMixin { + // The buffers are owned by the ArrayData member + PoolVector buffers_; + struct ArrowArray dictionary_; + PoolVector children_; + PoolVector child_pointers_; + + std::shared_ptr data_; + + ExportedArrayPrivateData() = default; + ARROW_DEFAULT_MOVE_AND_ASSIGN(ExportedArrayPrivateData); + ARROW_DISALLOW_COPY_AND_ASSIGN(ExportedArrayPrivateData); +}; + +void ReleaseExportedArray(struct ArrowArray* array) { + if (ArrowArrayIsReleased(array)) { + return; + } + for (int64_t i = 0; i < array->n_children; ++i) { + struct ArrowArray* child = array->children[i]; + ArrowArrayRelease(child); + DCHECK(ArrowArrayIsReleased(child)) + << "Child release callback should have marked it released"; + } + struct ArrowArray* dict = array->dictionary; + if (dict != nullptr) { + ArrowArrayRelease(dict); + DCHECK(ArrowArrayIsReleased(dict)) + << "Dictionary release callback should have marked it released"; + } + DCHECK_NE(array->private_data, nullptr); + delete reinterpret_cast(array->private_data); + + ArrowArrayMarkReleased(array); +} + +struct ArrayExporter { + Status Export(const std::shared_ptr& data) { + // Store buffer pointers + export_.buffers_.resize(data->buffers.size()); + std::transform(data->buffers.begin(), data->buffers.end(), export_.buffers_.begin(), + [](const std::shared_ptr& buffer) -> const void* { + return buffer ? 
buffer->data() : nullptr; + }); + + // Export dictionary + if (data->dictionary != nullptr) { + dict_exporter_.reset(new ArrayExporter()); + RETURN_NOT_OK(dict_exporter_->Export(data->dictionary->data())); + } + + // Export children + export_.children_.resize(data->child_data.size()); + child_exporters_.resize(data->child_data.size()); + for (size_t i = 0; i < data->child_data.size(); ++i) { + RETURN_NOT_OK(child_exporters_[i].Export(data->child_data[i])); + } + + // Store owning pointer to ArrayData + export_.data_ = data; + + return Status::OK(); + } + + // Finalize exporting by setting C struct fields and allocating + // autonomous private data for each array node. + // + // This function can't fail, as properly reclaiming memory in case of error + // would be too fragile. After this function returns, memory is reclaimed + // by calling the release() pointer in the top level ArrowArray struct. + void Finish(struct ArrowArray* c_struct_) { + // First, create permanent ExportedArrayPrivateData, to make sure that + // child ArrayData pointers don't get invalidated. + auto pdata = new ExportedArrayPrivateData(std::move(export_)); + const ArrayData& data = *pdata->data_; + + // Second, finish dictionary and children. + if (dict_exporter_) { + dict_exporter_->Finish(&pdata->dictionary_); + } + pdata->child_pointers_.resize(data.child_data.size(), nullptr); + for (size_t i = 0; i < data.child_data.size(); ++i) { + auto ptr = pdata->child_pointers_[i] = &pdata->children_[i]; + child_exporters_[i].Finish(ptr); + } + + // Third, fill C struct. 
+ DCHECK_NE(c_struct_, nullptr); + memset(c_struct_, 0, sizeof(*c_struct_)); + + c_struct_->length = data.length; + c_struct_->null_count = data.null_count; + c_struct_->offset = data.offset; + c_struct_->n_buffers = static_cast(pdata->buffers_.size()); + c_struct_->n_children = static_cast(pdata->child_pointers_.size()); + c_struct_->buffers = pdata->buffers_.data(); + c_struct_->children = pdata->child_pointers_.data(); + c_struct_->dictionary = dict_exporter_ ? &pdata->dictionary_ : nullptr; + c_struct_->private_data = pdata; + c_struct_->release = ReleaseExportedArray; + } + + ExportedArrayPrivateData export_; + std::unique_ptr dict_exporter_; + std::vector child_exporters_; +}; + +} // namespace + +Status ExportArray(const Array& array, struct ArrowArray* out, + struct ArrowSchema* out_schema) { + SchemaExportGuard guard(out_schema); + if (out_schema != nullptr) { + RETURN_NOT_OK(ExportType(*array.type(), out_schema)); + } + ArrayExporter exporter; + RETURN_NOT_OK(exporter.Export(array.data())); + exporter.Finish(out); + guard.Detach(); + return Status::OK(); +} + +Status ExportRecordBatch(const RecordBatch& batch, struct ArrowArray* out, + struct ArrowSchema* out_schema) { + std::shared_ptr array; + // XXX perhaps bypass ToStructArray() for speed? 
+ RETURN_NOT_OK(batch.ToStructArray(&array)); + + SchemaExportGuard guard(out_schema); + if (out_schema != nullptr) { + // Export the schema, not the struct type, so as not to lose top-level metadata + RETURN_NOT_OK(ExportSchema(*batch.schema(), out_schema)); + } + ArrayExporter exporter; + RETURN_NOT_OK(exporter.Export(array->data())); + exporter.Finish(out); + guard.Detach(); + return Status::OK(); +} + +////////////////////////////////////////////////////////////////////////// +// C schema import + +namespace { + +static constexpr int64_t kMaxImportRecursionLevel = 64; + +Status InvalidFormatString(util::string_view v) { + return Status::Invalid("Invalid or unsupported format string: '", v, "'"); +} + +class FormatStringParser { + public: + FormatStringParser() {} + + explicit FormatStringParser(util::string_view v) : view_(v), index_(0) {} + + bool AtEnd() const { return index_ >= view_.length(); } + + char Next() { return view_[index_++]; } + + util::string_view Rest() { return view_.substr(index_); } + + Status CheckNext(char c) { + if (AtEnd() || Next() != c) { + return Invalid(); + } + return Status::OK(); + } + + Status CheckHasNext() { + if (AtEnd()) { + return Invalid(); + } + return Status::OK(); + } + + Status CheckAtEnd() { + if (!AtEnd()) { + return Invalid(); + } + return Status::OK(); + } + + template + Status ParseInt(util::string_view v, IntType* out) { + using ArrowIntType = typename CTypeTraits::ArrowType; + internal::StringConverter converter; + if (!converter(v.data(), v.size(), out)) { + return Invalid(); + } + return Status::OK(); + } + + Status ParseTimeUnit(TimeUnit::type* out) { + RETURN_NOT_OK(CheckHasNext()); + switch (Next()) { + case 's': + *out = TimeUnit::SECOND; + break; + case 'm': + *out = TimeUnit::MILLI; + break; + case 'u': + *out = TimeUnit::MICRO; + break; + case 'n': + *out = TimeUnit::NANO; + break; + default: + return Invalid(); + } + return Status::OK(); + } + + std::vector Split(util::string_view v, char delim = ',') { 
+ std::vector parts; + size_t start = 0, end; + while (true) { + end = v.find_first_of(delim, start); + parts.push_back(v.substr(start, end - start)); + if (end == util::string_view::npos) { + break; + } + start = end + 1; + } + return parts; + } + + template + Status ParseInts(util::string_view v, std::vector* out) { + auto parts = Split(v); + std::vector result; + result.reserve(parts.size()); + for (const auto& p : parts) { + IntType i; + RETURN_NOT_OK(ParseInt(p, &i)); + result.push_back(i); + } + *out = std::move(result); + return Status::OK(); + } + + Status Invalid() { return InvalidFormatString(view_); } + + protected: + util::string_view view_; + size_t index_; +}; + +Result> DecodeMetadata(const char* metadata) { + auto read_int32 = [&](int32_t* out) -> Status { + int32_t v; + memcpy(&v, metadata, 4); + metadata += 4; + *out = BitUtil::FromLittleEndian(v); + if (*out < 0) { + return Status::Invalid("Invalid encoded metadata string"); + } + return Status::OK(); + }; + + auto read_string = [&](std::string* out) -> Status { + int32_t len; + RETURN_NOT_OK(read_int32(&len)); + out->resize(len); + if (len > 0) { + memcpy(&(*out)[0], metadata, len); + metadata += len; + } + return Status::OK(); + }; + + if (metadata == nullptr) { + return nullptr; + } + int32_t npairs; + RETURN_NOT_OK(read_int32(&npairs)); + if (npairs == 0) { + return nullptr; + } + std::vector keys(npairs); + std::vector values(npairs); + for (int32_t i = 0; i < npairs; ++i) { + RETURN_NOT_OK(read_string(&keys[i])); + RETURN_NOT_OK(read_string(&values[i])); + } + return key_value_metadata(std::move(keys), std::move(values)); +} + +struct SchemaImporter { + SchemaImporter() : c_struct_(nullptr), guard_(nullptr) {} + + Status Import(struct ArrowSchema* src) { + if (ArrowSchemaIsReleased(src)) { + return Status::Invalid("Cannot import released ArrowSchema"); + } + guard_.Reset(src); + recursion_level_ = 0; + c_struct_ = src; + return DoImport(); + } + + Result> MakeField() const { + 
ARROW_ASSIGN_OR_RAISE(auto metadata, DecodeMetadata(c_struct_->metadata)); + const char* name = c_struct_->name ? c_struct_->name : ""; + bool nullable = (c_struct_->flags & ARROW_FLAG_NULLABLE) != 0; + return field(name, type_, nullable, metadata); + } + + Result> MakeSchema() const { + if (type_->id() != Type::STRUCT) { + return Status::Invalid( + "Cannot import schema: ArrowSchema describes non-struct type ", + type_->ToString()); + } + ARROW_ASSIGN_OR_RAISE(auto metadata, DecodeMetadata(c_struct_->metadata)); + return schema(type_->children(), std::move(metadata)); + } + + Result> MakeType() const { return type_; } + + protected: + Status ImportChild(const SchemaImporter* parent, struct ArrowSchema* src) { + if (ArrowSchemaIsReleased(src)) { + return Status::Invalid("Cannot import released ArrowSchema"); + } + recursion_level_ = parent->recursion_level_ + 1; + if (recursion_level_ >= kMaxImportRecursionLevel) { + return Status::Invalid("Recursion level in ArrowSchema struct exceeded"); + } + // The ArrowSchema is owned by its parent, so don't release it ourselves + c_struct_ = src; + return DoImport(); + } + + Status ImportDict(const SchemaImporter* parent, struct ArrowSchema* src) { + return ImportChild(parent, src); + } + + Status DoImport() { + // First import children (required for reconstituting parent type) + child_importers_.resize(c_struct_->n_children); + for (int64_t i = 0; i < c_struct_->n_children; ++i) { + DCHECK_NE(c_struct_->children[i], nullptr); + RETURN_NOT_OK(child_importers_[i].ImportChild(this, c_struct_->children[i])); + } + + // Import main type + RETURN_NOT_OK(ProcessFormat()); + DCHECK_NE(type_, nullptr); + + // Import dictionary type + if (c_struct_->dictionary != nullptr) { + // Check this index type + bool indices_ok = false; + if (is_integer(type_->id())) { + indices_ok = checked_cast(*type_).is_signed(); + } + if (!indices_ok) { + return Status::Invalid( + "ArrowSchema struct has a dictionary but is not a signed integer type: ", + 
type_->ToString()); + } + SchemaImporter dict_importer; + RETURN_NOT_OK(dict_importer.ImportDict(this, c_struct_->dictionary)); + bool ordered = (c_struct_->flags & ARROW_FLAG_DICTIONARY_ORDERED) != 0; + type_ = dictionary(type_, dict_importer.type_, ordered); + } + return Status::OK(); + } + + Status ProcessFormat() { + f_parser_ = FormatStringParser(c_struct_->format); + RETURN_NOT_OK(f_parser_.CheckHasNext()); + switch (f_parser_.Next()) { + case 'n': + return ProcessPrimitive(null()); + case 'b': + return ProcessPrimitive(boolean()); + case 'c': + return ProcessPrimitive(int8()); + case 'C': + return ProcessPrimitive(uint8()); + case 's': + return ProcessPrimitive(int16()); + case 'S': + return ProcessPrimitive(uint16()); + case 'i': + return ProcessPrimitive(int32()); + case 'I': + return ProcessPrimitive(uint32()); + case 'l': + return ProcessPrimitive(int64()); + case 'L': + return ProcessPrimitive(uint64()); + case 'e': + return ProcessPrimitive(float16()); + case 'f': + return ProcessPrimitive(float32()); + case 'g': + return ProcessPrimitive(float64()); + case 'u': + return ProcessPrimitive(utf8()); + case 'U': + return ProcessPrimitive(large_utf8()); + case 'z': + return ProcessPrimitive(binary()); + case 'Z': + return ProcessPrimitive(large_binary()); + case 'w': + return ProcessFixedSizeBinary(); + case 'd': + return ProcessDecimal(); + case 't': + return ProcessTemporal(); + case '+': + return ProcessNested(); + } + return f_parser_.Invalid(); + } + + Status ProcessTemporal() { + RETURN_NOT_OK(f_parser_.CheckHasNext()); + switch (f_parser_.Next()) { + case 'd': + return ProcessDate(); + case 't': + return ProcessTime(); + case 'D': + return ProcessDuration(); + case 'i': + return ProcessInterval(); + case 's': + return ProcessTimestamp(); + } + return f_parser_.Invalid(); + } + + Status ProcessNested() { + RETURN_NOT_OK(f_parser_.CheckHasNext()); + switch (f_parser_.Next()) { + case 'l': + return ProcessListLike(); + case 'L': + return 
ProcessListLike(); + case 'w': + return ProcessFixedSizeList(); + case 's': + return ProcessStruct(); + case 'm': + return ProcessMap(); + case 'u': + return ProcessUnion(); + } + return f_parser_.Invalid(); + } + + Status ProcessDate() { + RETURN_NOT_OK(f_parser_.CheckHasNext()); + switch (f_parser_.Next()) { + case 'D': + return ProcessPrimitive(date32()); + case 'm': + return ProcessPrimitive(date64()); + } + return f_parser_.Invalid(); + } + + Status ProcessInterval() { + RETURN_NOT_OK(f_parser_.CheckHasNext()); + switch (f_parser_.Next()) { + case 'D': + return ProcessPrimitive(day_time_interval()); + case 'M': + return ProcessPrimitive(month_interval()); + } + return f_parser_.Invalid(); + } + + Status ProcessTime() { + TimeUnit::type unit; + RETURN_NOT_OK(f_parser_.ParseTimeUnit(&unit)); + if (unit == TimeUnit::SECOND || unit == TimeUnit::MILLI) { + return ProcessPrimitive(time32(unit)); + } else { + return ProcessPrimitive(time64(unit)); + } + } + + Status ProcessDuration() { + TimeUnit::type unit; + RETURN_NOT_OK(f_parser_.ParseTimeUnit(&unit)); + return ProcessPrimitive(duration(unit)); + } + + Status ProcessTimestamp() { + TimeUnit::type unit; + RETURN_NOT_OK(f_parser_.ParseTimeUnit(&unit)); + RETURN_NOT_OK(f_parser_.CheckNext(':')); + type_ = timestamp(unit, std::string(f_parser_.Rest())); + return Status::OK(); + } + + Status ProcessFixedSizeBinary() { + RETURN_NOT_OK(f_parser_.CheckNext(':')); + int32_t byte_width = -1; + RETURN_NOT_OK(f_parser_.ParseInt(f_parser_.Rest(), &byte_width)); + if (byte_width < 0) { + return f_parser_.Invalid(); + } + type_ = fixed_size_binary(byte_width); + return Status::OK(); + } + + Status ProcessDecimal() { + RETURN_NOT_OK(f_parser_.CheckNext(':')); + std::vector prec_scale; + RETURN_NOT_OK(f_parser_.ParseInts(f_parser_.Rest(), &prec_scale)); + if (prec_scale.size() != 2) { + return f_parser_.Invalid(); + } + if (prec_scale[0] <= 0 || prec_scale[1] <= 0) { + return f_parser_.Invalid(); + } + type_ = 
decimal(prec_scale[0], prec_scale[1]); + return Status::OK(); + } + + Status ProcessPrimitive(const std::shared_ptr& type) { + RETURN_NOT_OK(f_parser_.CheckAtEnd()); + type_ = type; + return CheckNoChildren(type); + } + + template + Status ProcessListLike() { + RETURN_NOT_OK(f_parser_.CheckAtEnd()); + RETURN_NOT_OK(CheckNumChildren(1)); + ARROW_ASSIGN_OR_RAISE(auto field, MakeChildField(0)); + type_ = std::make_shared(field); + return Status::OK(); + } + + Status ProcessMap() { + RETURN_NOT_OK(f_parser_.CheckAtEnd()); + RETURN_NOT_OK(CheckNumChildren(1)); + ARROW_ASSIGN_OR_RAISE(auto field, MakeChildField(0)); + const auto& value_type = field->type(); + if (value_type->id() != Type::STRUCT) { + return Status::Invalid("Imported map array has unexpected child field type: ", + field->ToString()); + } + if (value_type->num_children() != 2) { + return Status::Invalid("Imported map array has unexpected child field type: ", + field->ToString()); + } + + bool keys_sorted = (c_struct_->flags & ARROW_FLAG_MAP_KEYS_SORTED); + type_ = map(value_type->child(0)->type(), value_type->child(1)->type(), keys_sorted); + return Status::OK(); + } + + Status ProcessFixedSizeList() { + RETURN_NOT_OK(f_parser_.CheckNext(':')); + int32_t list_size = -1; + RETURN_NOT_OK(f_parser_.ParseInt(f_parser_.Rest(), &list_size)); + if (list_size < 0) { + return f_parser_.Invalid(); + } + RETURN_NOT_OK(CheckNumChildren(1)); + ARROW_ASSIGN_OR_RAISE(auto field, MakeChildField(0)); + type_ = fixed_size_list(field, list_size); + return Status::OK(); + } + + Status ProcessStruct() { + RETURN_NOT_OK(f_parser_.CheckAtEnd()); + ARROW_ASSIGN_OR_RAISE(auto fields, MakeChildFields()); + type_ = struct_(std::move(fields)); + return Status::OK(); + } + + Status ProcessUnion() { + RETURN_NOT_OK(f_parser_.CheckHasNext()); + UnionMode::type mode; + switch (f_parser_.Next()) { + case 'd': + mode = UnionMode::DENSE; + break; + case 's': + mode = UnionMode::SPARSE; + break; + default: + return f_parser_.Invalid(); + } + 
RETURN_NOT_OK(f_parser_.CheckNext(':')); + std::vector type_codes; + RETURN_NOT_OK(f_parser_.ParseInts(f_parser_.Rest(), &type_codes)); + ARROW_ASSIGN_OR_RAISE(auto fields, MakeChildFields()); + if (fields.size() != type_codes.size()) { + return Status::Invalid( + "ArrowArray struct number of children incompatible with format string '", + c_struct_->format, "'"); + } + for (const auto code : type_codes) { + if (code < 0) { + return Status::Invalid("Negative type code in union: format string '", + c_struct_->format, "'"); + } + } + type_ = union_(std::move(fields), std::move(type_codes), mode); + return Status::OK(); + } + + Result> MakeChildField(int64_t child_id) { + const auto& child = child_importers_[child_id]; + if (child.c_struct_->name == nullptr) { + return Status::Invalid("Expected non-null name in imported array child"); + } + return child.MakeField(); + } + + Result>> MakeChildFields() { + std::vector> fields(child_importers_.size()); + for (int64_t i = 0; i < static_cast(child_importers_.size()); ++i) { + ARROW_ASSIGN_OR_RAISE(fields[i], MakeChildField(i)); + } + return fields; + } + + Status CheckNoChildren(const std::shared_ptr& type) { + return CheckNumChildren(type, 0); + } + + Status CheckNumChildren(const std::shared_ptr& type, int64_t n_children) { + if (c_struct_->n_children != n_children) { + return Status::Invalid("Expected ", n_children, " children for imported type ", + *type, ", ArrowArray struct has ", c_struct_->n_children); + } + return Status::OK(); + } + + Status CheckNumChildren(int64_t n_children) { + if (c_struct_->n_children != n_children) { + return Status::Invalid("Expected ", n_children, " children for imported format '", + c_struct_->format, "', ArrowArray struct has ", + c_struct_->n_children); + } + return Status::OK(); + } + + struct ArrowSchema* c_struct_; + SchemaExportGuard guard_; + FormatStringParser f_parser_; + int64_t recursion_level_; + std::vector child_importers_; + std::shared_ptr type_; +}; + +} // namespace + 
+// Import an ArrowSchema struct as a DataType.
+Result<std::shared_ptr<DataType>> ImportType(struct ArrowSchema* schema) {
+  SchemaImporter schema_importer;
+  const Status st = schema_importer.Import(schema);
+  if (!st.ok()) {
+    return st;
+  }
+  return schema_importer.MakeType();
+}
+
+// Import an ArrowSchema struct as a Field (type, name, nullability, metadata).
+Result<std::shared_ptr<Field>> ImportField(struct ArrowSchema* schema) {
+  SchemaImporter schema_importer;
+  const Status st = schema_importer.Import(schema);
+  if (!st.ok()) {
+    return st;
+  }
+  return schema_importer.MakeField();
+}
+
+// Import an ArrowSchema struct as a Schema.
+Result<std::shared_ptr<Schema>> ImportSchema(struct ArrowSchema* schema) {
+  SchemaImporter schema_importer;
+  const Status st = schema_importer.Import(schema);
+  if (!st.ok()) {
+    return st;
+  }
+  return schema_importer.MakeSchema();
+}
+
+//////////////////////////////////////////////////////////////////////////
+// C data import
+
+namespace {
+
+// Owning holder for an imported C ArrowArray struct.
+// The struct starts out marked as released; whatever is later moved into
+// `array_` is released when this holder is destroyed.
+struct ImportedArrayData {
+  struct ArrowArray array_;
+
+  ImportedArrayData() {
+    // Nothing has been moved in yet
+    ArrowArrayMarkReleased(&array_);
+  }
+
+  ~ImportedArrayData() { Release(); }
+
+  // Release the held ArrowArray; does nothing if it is already released.
+  void Release() {
+    if (ArrowArrayIsReleased(&array_)) {
+      return;
+    }
+    ArrowArrayRelease(&array_);
+    DCHECK(ArrowArrayIsReleased(&array_));
+  }
+
+ private:
+  ARROW_DISALLOW_COPY_AND_ASSIGN(ImportedArrayData);
+};
+
+// A buffer wrapping an imported piece of data.
+class ImportedBuffer : public Buffer { + public: + ImportedBuffer(const uint8_t* data, int64_t size, + std::shared_ptr import) + : Buffer(data, size), import_(std::move(import)) {} + + ~ImportedBuffer() override {} + + protected: + std::shared_ptr import_; +}; + +struct ArrayImporter { + explicit ArrayImporter(const std::shared_ptr& type) : type_(type) {} + + Status Import(struct ArrowArray* src) { + if (ArrowArrayIsReleased(src)) { + return Status::Invalid("Cannot import released ArrowArray"); + } + recursion_level_ = 0; + import_ = std::make_shared(); + c_struct_ = &import_->array_; + ArrowArrayMove(src, c_struct_); + return DoImport(); + } + + Result> MakeArray() { + DCHECK_NE(data_, nullptr); + return ::arrow::MakeArray(data_); + } + + Result> MakeRecordBatch(std::shared_ptr schema) { + DCHECK_NE(data_, nullptr); + if (data_->null_count != 0) { + return Status::Invalid( + "ArrowArray struct has non-zero null count, " + "cannot be imported as RecordBatch"); + } + if (data_->offset != 0) { + return Status::Invalid( + "ArrowArray struct has non-zero offset, " + "cannot be imported as RecordBatch"); + } + return RecordBatch::Make(std::move(schema), data_->length, + std::move(data_->child_data)); + } + + Status ImportChild(const ArrayImporter* parent, struct ArrowArray* src) { + if (ArrowArrayIsReleased(src)) { + return Status::Invalid("Cannot import released ArrowArray"); + } + recursion_level_ = parent->recursion_level_ + 1; + if (recursion_level_ >= kMaxImportRecursionLevel) { + return Status::Invalid("Recursion level in ArrowArray struct exceeded"); + } + // Child buffers will keep the entire parent import alive. + // Perhaps we can move the child structs to an owned area + // when the parent ImportedArrayData::Release() gets called, + // but that is another level of complication. 
+ import_ = parent->import_; + // The ArrowArray shouldn't be moved, it's owned by its parent + c_struct_ = src; + return DoImport(); + } + + Status ImportDict(const ArrayImporter* parent, struct ArrowArray* src) { + return ImportChild(parent, src); + } + + Status DoImport() { + // First import children (required for reconstituting parent array data) + const auto& fields = type_->children(); + if (c_struct_->n_children != static_cast(fields.size())) { + return Status::Invalid("ArrowArray struct has ", c_struct_->n_children, + " children, expected ", fields.size(), " for type ", + type_->ToString()); + } + child_importers_.reserve(fields.size()); + for (int64_t i = 0; i < c_struct_->n_children; ++i) { + DCHECK_NE(c_struct_->children[i], nullptr); + child_importers_.emplace_back(fields[i]->type()); + RETURN_NOT_OK(child_importers_.back().ImportChild(this, c_struct_->children[i])); + } + + // Import main data + RETURN_NOT_OK(ImportMainData()); + + bool is_dict_type = (type_->id() == Type::DICTIONARY); + if (c_struct_->dictionary != nullptr) { + if (!is_dict_type) { + return Status::Invalid("Import type is ", type_->ToString(), + " but dictionary field in ArrowArray struct is not null"); + } + const auto& dict_type = checked_cast(*type_); + // Import dictionary values + ArrayImporter dict_importer(dict_type.value_type()); + RETURN_NOT_OK(dict_importer.ImportDict(this, c_struct_->dictionary)); + ARROW_ASSIGN_OR_RAISE(data_->dictionary, dict_importer.MakeArray()); + } else { + if (is_dict_type) { + return Status::Invalid("Import type is ", type_->ToString(), + " but dictionary field in ArrowArray struct is null"); + } + } + return Status::OK(); + } + + Status ImportMainData() { return VisitTypeInline(*type_, this); } + + Status Visit(const DataType& type) { + return Status::NotImplemented("Cannot import array of type ", type_->ToString()); + } + + Status Visit(const FixedWidthType& type) { return ImportFixedSizePrimitive(); } + + Status Visit(const NullType& type) { + 
RETURN_NOT_OK(CheckNoChildren()); + // XXX should we be lenient on the number of buffers? + RETURN_NOT_OK(CheckNumBuffers(1)); + RETURN_NOT_OK(AllocateArrayData()); + RETURN_NOT_OK(ImportBitsBuffer(0)); + return Status::OK(); + } + + Status Visit(const StringType& type) { return ImportStringLike(type); } + + Status Visit(const BinaryType& type) { return ImportStringLike(type); } + + Status Visit(const LargeStringType& type) { return ImportStringLike(type); } + + Status Visit(const LargeBinaryType& type) { return ImportStringLike(type); } + + Status Visit(const ListType& type) { return ImportListLike(type); } + + Status Visit(const LargeListType& type) { return ImportListLike(type); } + + Status Visit(const FixedSizeListType& type) { + RETURN_NOT_OK(CheckNumChildren(1)); + RETURN_NOT_OK(CheckNumBuffers(1)); + RETURN_NOT_OK(AllocateArrayData()); + RETURN_NOT_OK(ImportNullBitmap()); + return Status::OK(); + } + + Status Visit(const StructType& type) { + RETURN_NOT_OK(CheckNumBuffers(1)); + RETURN_NOT_OK(AllocateArrayData()); + RETURN_NOT_OK(ImportNullBitmap()); + return Status::OK(); + } + + Status Visit(const UnionType& type) { + auto mode = type.mode(); + RETURN_NOT_OK(CheckNumBuffers(3)); + RETURN_NOT_OK(AllocateArrayData()); + RETURN_NOT_OK(ImportNullBitmap()); + RETURN_NOT_OK(ImportFixedSizeBuffer(1, sizeof(int8_t))); + if (mode == UnionMode::DENSE) { + RETURN_NOT_OK(ImportFixedSizeBuffer(2, sizeof(int32_t))); + } else { + RETURN_NOT_OK(ImportUnusedBuffer(2)); + } + return Status::OK(); + } + +#if 0 + Status ProcessMap() { + RETURN_NOT_OK(f_parser_.CheckAtEnd()); + RETURN_NOT_OK(CheckNumChildren(1)); + ARROW_ASSIGN_OR_RAISE(auto field, MakeChildField(0)); + const auto& value_type = field->type(); + if (value_type->id() != Type::STRUCT) { + return Status::Invalid("Imported map array has unexpected child field type: ", + field->ToString()); + } + if (value_type->num_children() != 2) { + return Status::Invalid("Imported map array has unexpected child field type: ", 
+ field->ToString()); + } + + bool keys_sorted = (c_struct_->flags & ARROW_FLAG_MAP_KEYS_SORTED); + auto type = + map(value_type->child(0)->type(), value_type->child(1)->type(), keys_sorted); + // Process buffers as for ListType + RETURN_NOT_OK(CheckNumBuffers(type, 2)); + RETURN_NOT_OK(AllocateArrayData(type)); + RETURN_NOT_OK(ImportNullBitmap()); + RETURN_NOT_OK(ImportOffsetsBuffer(1)); + return Status::OK(); + } +#endif + + Status ImportFixedSizePrimitive() { + const auto& fw_type = checked_cast(*type_); + RETURN_NOT_OK(CheckNoChildren()); + RETURN_NOT_OK(CheckNumBuffers(2)); + RETURN_NOT_OK(AllocateArrayData()); + RETURN_NOT_OK(ImportNullBitmap()); + if (BitUtil::IsMultipleOf8(fw_type.bit_width())) { + RETURN_NOT_OK(ImportFixedSizeBuffer(1, fw_type.bit_width() / 8)); + } else { + DCHECK_EQ(fw_type.bit_width(), 1); + RETURN_NOT_OK(ImportBitsBuffer(1)); + } + return Status::OK(); + } + + template + Status ImportStringLike(const StringType& type) { + RETURN_NOT_OK(CheckNoChildren()); + RETURN_NOT_OK(CheckNumBuffers(3)); + RETURN_NOT_OK(AllocateArrayData()); + RETURN_NOT_OK(ImportNullBitmap()); + RETURN_NOT_OK(ImportOffsetsBuffer(1)); + RETURN_NOT_OK(ImportStringValuesBuffer(1, 2)); + return Status::OK(); + } + + template + Status ImportListLike(const ListType& type) { + RETURN_NOT_OK(CheckNumChildren(1)); + RETURN_NOT_OK(CheckNumBuffers(2)); + RETURN_NOT_OK(AllocateArrayData()); + RETURN_NOT_OK(ImportNullBitmap()); + RETURN_NOT_OK(ImportOffsetsBuffer(1)); + return Status::OK(); + } + + Status CheckNoChildren() { return CheckNumChildren(0); } + + Status CheckNumChildren(int64_t n_children) { + if (c_struct_->n_children != n_children) { + return Status::Invalid("Expected ", n_children, " children for imported type ", + type_->ToString(), ", ArrowArray struct has ", + c_struct_->n_children); + } + return Status::OK(); + } + + Status CheckNumBuffers(int64_t n_buffers) { + if (n_buffers != c_struct_->n_buffers) { + return Status::Invalid("Expected ", n_buffers, " 
buffers for imported type ", + type_->ToString(), ", ArrowArray struct has ", + c_struct_->n_buffers); + } + return Status::OK(); + } + + Status AllocateArrayData() { + DCHECK_EQ(data_, nullptr); + data_ = std::make_shared(type_, c_struct_->length, c_struct_->null_count, + c_struct_->offset); + data_->buffers.resize(static_cast(c_struct_->n_buffers)); + data_->child_data.resize(static_cast(c_struct_->n_children)); + DCHECK_EQ(child_importers_.size(), data_->child_data.size()); + std::transform(child_importers_.begin(), child_importers_.end(), + data_->child_data.begin(), + [](ArrayImporter& child) { return child.data_; }); + return Status::OK(); + } + + Status ImportNullBitmap(int32_t buffer_id = 0) { + RETURN_NOT_OK(ImportBitsBuffer(buffer_id)); + if (data_->null_count != 0 && data_->buffers[buffer_id] == nullptr) { + return Status::Invalid( + "ArrowArray struct has null bitmap buffer but non-zero null_count ", + data_->null_count); + } + return Status::OK(); + } + + Status ImportBitsBuffer(int32_t buffer_id) { + // Compute visible size of buffer + int64_t buffer_size = + BitUtil::RoundUpToMultipleOf8(c_struct_->length + c_struct_->offset) / 8; + return ImportBuffer(buffer_id, buffer_size); + } + + Status ImportUnusedBuffer(int32_t buffer_id) { return ImportBuffer(buffer_id, 0); } + + Status ImportFixedSizeBuffer(int32_t buffer_id, int64_t byte_width) { + // Compute visible size of buffer + int64_t buffer_size = byte_width * (c_struct_->length + c_struct_->offset); + return ImportBuffer(buffer_id, buffer_size); + } + + template + Status ImportOffsetsBuffer(int32_t buffer_id) { + // Compute visible size of buffer + int64_t buffer_size = + sizeof(OffsetType) * (c_struct_->length + c_struct_->offset + 1); + return ImportBuffer(buffer_id, buffer_size); + } + + template + Status ImportStringValuesBuffer(int32_t offsets_buffer_id, int32_t buffer_id, + int64_t byte_width = 1) { + auto offsets = data_->GetValues(offsets_buffer_id); + // Compute visible size of buffer + 
int64_t buffer_size = byte_width * offsets[c_struct_->length]; + return ImportBuffer(buffer_id, buffer_size); + } + + Status ImportBuffer(int32_t buffer_id, int64_t buffer_size) { + std::shared_ptr* out = &data_->buffers[buffer_id]; + auto data = reinterpret_cast(c_struct_->buffers[buffer_id]); + if (data != nullptr) { + *out = std::make_shared(data, buffer_size, import_); + } else { + out->reset(); + } + return Status::OK(); + } + + struct ArrowArray* c_struct_; + int64_t recursion_level_; + const std::shared_ptr& type_; + + std::shared_ptr import_; + std::shared_ptr data_; + std::vector child_importers_; +}; + +} // namespace + +Result> ImportArray(struct ArrowArray* array, + std::shared_ptr type) { + ArrayImporter importer(type); + RETURN_NOT_OK(importer.Import(array)); + return importer.MakeArray(); +} + +Result> ImportArray(struct ArrowArray* array, + struct ArrowSchema* type) { + auto maybe_type = ImportType(type); + if (!maybe_type.ok()) { + ArrowArrayRelease(array); + return maybe_type.status(); + } + return ImportArray(array, *maybe_type); +} + +Result> ImportRecordBatch(struct ArrowArray* array, + std::shared_ptr schema) { + auto type = struct_(schema->fields()); + ArrayImporter importer(type); + RETURN_NOT_OK(importer.Import(array)); + return importer.MakeRecordBatch(std::move(schema)); +} + +Result> ImportRecordBatch(struct ArrowArray* array, + struct ArrowSchema* schema) { + auto maybe_schema = ImportSchema(schema); + if (!maybe_schema.ok()) { + ArrowArrayRelease(array); + return maybe_schema.status(); + } + return ImportRecordBatch(array, *maybe_schema); +} + +} // namespace arrow diff --git a/cpp/src/arrow/c/bridge.h b/cpp/src/arrow/c/bridge.h new file mode 100644 index 00000000000..8efb5d98bed --- /dev/null +++ b/cpp/src/arrow/c/bridge.h @@ -0,0 +1,163 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. 
See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include +#include + +#include "arrow/c/abi.h" +#include "arrow/result.h" +#include "arrow/status.h" +#include "arrow/type_fwd.h" +#include "arrow/util/macros.h" +#include "arrow/util/visibility.h" + +namespace arrow { + +/// \brief Export C++ DataType using the C data interface format. +/// +/// The root type is considered to have empty name and metadata. +/// If you want the root type to have a name and/or metadata, pass +/// a Field instead. +/// +/// \param[in] type DataType object to export +/// \param[out] out C struct where to export the datatype +ARROW_EXPORT +Status ExportType(const DataType& type, struct ArrowSchema* out); + +/// \brief Export C++ Field using the C data interface format. +/// +/// \param[in] field Field object to export +/// \param[out] out C struct where to export the field +ARROW_EXPORT +Status ExportField(const Field& field, struct ArrowSchema* out); + +/// \brief Export C++ Schema using the C data interface format. +/// +/// \param[in] schema Schema object to export +/// \param[out] out C struct where to export the schema +ARROW_EXPORT +Status ExportSchema(const Schema& schema, struct ArrowSchema* out); + +/// \brief Export C++ Array using the C data interface format. 
+/// +/// The resulting ArrowArray struct keeps the array data and buffers alive +/// until its release callback is called by the consumer. +/// +/// \param[in] array Array object to export +/// \param[out] out C struct where to export the array +/// \param[out] out_schema optional C struct where to export the array type +ARROW_EXPORT +Status ExportArray(const Array& array, struct ArrowArray* out, + struct ArrowSchema* out_schema = NULLPTR); + +/// \brief Export C++ RecordBatch using the C data interface format. +/// +/// The record batch is exported as if it were a struct array. +/// The resulting ArrowArray struct keeps the record batch data and buffers alive +/// until its release callback is called by the consumer. +/// +/// \param[in] batch Record batch to export +/// \param[out] out C struct where to export the record batch +/// \param[out] out_schema optional C struct where to export the record batch schema +ARROW_EXPORT +Status ExportRecordBatch(const RecordBatch& batch, struct ArrowArray* out, + struct ArrowSchema* out_schema = NULLPTR); + +/// \brief Import C++ DataType from the C data interface. +/// +/// The given ArrowSchema struct is released (as per the C data interface +/// specification), even if this function fails. +/// +/// \param[in,out] schema C data interface struct representing the data type +/// \return Imported type object +ARROW_EXPORT +Result> ImportType(struct ArrowSchema* schema); + +/// \brief Import C++ Field from the C data interface. +/// +/// The given ArrowSchema struct is released (as per the C data interface +/// specification), even if this function fails. +/// +/// \param[in,out] schema C data interface struct representing the field +/// \return Imported field object +ARROW_EXPORT +Result> ImportField(struct ArrowSchema* schema); + +/// \brief Import C++ Schema from the C data interface. +/// +/// The given ArrowSchema struct is released (as per the C data interface +/// specification), even if this function fails. 
+/// +/// \param[in,out] schema C data interface struct representing the schema +/// \return Imported schema object +ARROW_EXPORT +Result> ImportSchema(struct ArrowSchema* schema); + +/// \brief Import C++ array from the C data interface. +/// +/// The ArrowArray struct has its contents moved (as per the C data interface +/// specification) to a private object held alive by the resulting array. +/// +/// \param[in,out] array C data interface struct holding the array data +/// \param[in] type type of the imported array +/// \return Imported array object +ARROW_EXPORT +Result> ImportArray(struct ArrowArray* array, + std::shared_ptr type); + +/// \brief Import C++ array and its type from the C data interface. +/// +/// The ArrowArray struct has its contents moved (as per the C data interface +/// specification) to a private object held alive by the resulting array. +/// The ArrowSchema struct is released, even if this function fails. +/// +/// \param[in,out] array C data interface struct holding the array data +/// \param[in,out] type C data interface struct holding the array type +/// \return Imported array object +ARROW_EXPORT +Result> ImportArray(struct ArrowArray* array, + struct ArrowSchema* type); + +/// \brief Import C++ record batch from the C data interface. +/// +/// The ArrowArray struct has its contents moved (as per the C data interface +/// specification) to a private object held alive by the resulting record batch. +/// +/// \param[in,out] array C data interface struct holding the record batch data +/// \param[in] schema schema of the imported record batch +/// \return Imported record batch object +ARROW_EXPORT +Result> ImportRecordBatch(struct ArrowArray* array, + std::shared_ptr schema); + +/// \brief Import C++ record batch and its schema from the C data interface. +/// +/// The type represented by the ArrowSchema struct must be a struct type array. 
+/// The ArrowArray struct has its contents moved (as per the C data interface +/// specification) to a private object held alive by the resulting record batch. +/// The ArrowSchema struct is released, even if this function fails. +/// +/// \param[in,out] array C data interface struct holding the record batch data +/// \param[in,out] schema C data interface struct holding the record batch schema +/// \return Imported record batch object +ARROW_EXPORT +Result> ImportRecordBatch(struct ArrowArray* array, + struct ArrowSchema* schema); + +} // namespace arrow diff --git a/cpp/src/arrow/c/bridge_benchmark.cc b/cpp/src/arrow/c/bridge_benchmark.cc new file mode 100644 index 00000000000..94d5e060540 --- /dev/null +++ b/cpp/src/arrow/c/bridge_benchmark.cc @@ -0,0 +1,160 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include + +#include "benchmark/benchmark.h" + +#include "arrow/array.h" +#include "arrow/c/bridge.h" +#include "arrow/c/helpers.h" +#include "arrow/ipc/json_simple.h" +#include "arrow/record_batch.h" +#include "arrow/testing/gtest_util.h" +#include "arrow/type.h" +#include "arrow/util/key_value_metadata.h" + +namespace arrow { + +std::shared_ptr ExampleSchema() { + auto f0 = field("f0", utf8()); + auto f1 = field("f1", timestamp(TimeUnit::MICRO, "UTC")); + auto f2 = field("f2", int64()); + auto f3 = field("f3", int16()); + auto f4 = field("f4", int16()); + auto f5 = field("f5", float32()); + auto f6 = field("f6", float32()); + auto f7 = field("f7", float32()); + auto f8 = field("f8", decimal(19, 10)); + return schema({f0, f1, f2, f3, f4, f5, f6, f7, f8}); +} + +std::shared_ptr ExampleRecordBatch() { + // We don't care about the actual data, since it's exported as raw buffer pointers + auto schema = ExampleSchema(); + int64_t length = 1000; + std::vector> columns; + for (const auto& field : schema->fields()) { + std::shared_ptr array; + ABORT_NOT_OK(MakeArrayOfNull(field->type(), length, &array)); + columns.push_back(array); + } + return RecordBatch::Make(schema, length, columns); +} + +static void ExportType(benchmark::State& state) { // NOLINT non-const reference + struct ArrowSchema c_export; + auto type = utf8(); + + for (auto _ : state) { + ABORT_NOT_OK(ExportType(*type, &c_export)); + ArrowSchemaRelease(&c_export); + } + state.SetItemsProcessed(state.iterations()); +} + +static void ExportSchema(benchmark::State& state) { // NOLINT non-const reference + struct ArrowSchema c_export; + auto schema = ExampleSchema(); + + for (auto _ : state) { + ABORT_NOT_OK(ExportSchema(*schema, &c_export)); + ArrowSchemaRelease(&c_export); + } + state.SetItemsProcessed(state.iterations()); +} + +static void ExportArray(benchmark::State& state) { // NOLINT non-const reference + struct ArrowArray c_export; + auto array = ArrayFromJSON(utf8(), R"(["foo", "bar", null])"); + + 
for (auto _ : state) { + ABORT_NOT_OK(ExportArray(*array, &c_export)); + ArrowArrayRelease(&c_export); + } + state.SetItemsProcessed(state.iterations()); +} + +static void ExportRecordBatch(benchmark::State& state) { // NOLINT non-const reference + struct ArrowArray c_export; + auto batch = ExampleRecordBatch(); + + for (auto _ : state) { + ABORT_NOT_OK(ExportRecordBatch(*batch, &c_export)); + ArrowArrayRelease(&c_export); + } + state.SetItemsProcessed(state.iterations()); +} + +static void ExportImportType(benchmark::State& state) { // NOLINT non-const reference + struct ArrowSchema c_export; + auto type = utf8(); + + for (auto _ : state) { + ABORT_NOT_OK(ExportType(*type, &c_export)); + ImportType(&c_export).ValueOrDie(); + } + state.SetItemsProcessed(state.iterations()); +} + +static void ExportImportSchema(benchmark::State& state) { // NOLINT non-const reference + struct ArrowSchema c_export; + auto schema = ExampleSchema(); + + for (auto _ : state) { + ABORT_NOT_OK(ExportSchema(*schema, &c_export)); + ImportSchema(&c_export).ValueOrDie(); + } + state.SetItemsProcessed(state.iterations()); +} + +static void ExportImportArray(benchmark::State& state) { // NOLINT non-const reference + struct ArrowArray c_export; + auto array = ArrayFromJSON(utf8(), R"(["foo", "bar", null])"); + auto type = array->type(); + + for (auto _ : state) { + ABORT_NOT_OK(ExportArray(*array, &c_export)); + ImportArray(&c_export, type).ValueOrDie(); + } + state.SetItemsProcessed(state.iterations()); +} + +static void ExportImportRecordBatch( + benchmark::State& state) { // NOLINT non-const reference + struct ArrowArray c_export; + auto batch = ExampleRecordBatch(); + auto schema = batch->schema(); + + for (auto _ : state) { + ABORT_NOT_OK(ExportRecordBatch(*batch, &c_export)); + ImportRecordBatch(&c_export, schema).ValueOrDie(); + } + state.SetItemsProcessed(state.iterations()); +} + +BENCHMARK(ExportType); +BENCHMARK(ExportSchema); +BENCHMARK(ExportArray); +BENCHMARK(ExportRecordBatch); + 
+BENCHMARK(ExportImportType); +BENCHMARK(ExportImportSchema); +BENCHMARK(ExportImportArray); +BENCHMARK(ExportImportRecordBatch); + +} // namespace arrow diff --git a/cpp/src/arrow/c/bridge_test.cc b/cpp/src/arrow/c/bridge_test.cc new file mode 100644 index 00000000000..4ac204e0679 --- /dev/null +++ b/cpp/src/arrow/c/bridge_test.cc @@ -0,0 +1,2544 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include +#include +#include +#include +#include + +#include + +#include "arrow/c/bridge.h" +#include "arrow/c/helpers.h" +#include "arrow/c/util_internal.h" +#include "arrow/ipc/json_simple.h" +#include "arrow/memory_pool.h" +#include "arrow/testing/gtest_util.h" +#include "arrow/testing/util.h" +#include "arrow/util/key_value_metadata.h" +#include "arrow/util/macros.h" +#include "arrow/util/string_view.h" + +namespace arrow { + +using internal::ArrayExportGuard; +using internal::ArrayExportTraits; +using internal::SchemaExportGuard; +using internal::SchemaExportTraits; + +template +struct ExportTraits {}; + +template <> +struct ExportTraits { + static constexpr auto ExportFunc = ExportType; +}; + +template <> +struct ExportTraits { + static constexpr auto ExportFunc = ExportField; +}; + +template <> +struct ExportTraits { + static constexpr auto ExportFunc = ExportSchema; +}; + +// An interceptor that checks whether a release callback was called. +// (for import tests) +template +class ReleaseCallback { + public: + using CType = typename Traits::CType; + + explicit ReleaseCallback(CType* c_struct) : called_(false) { + orig_release_ = c_struct->release; + orig_private_data_ = c_struct->private_data; + c_struct->release = ReleaseUnbound; + c_struct->private_data = this; + } + + static void ReleaseUnbound(CType* c_struct) { + reinterpret_cast(c_struct->private_data)->Release(c_struct); + } + + void Release(CType* c_struct) { + ASSERT_FALSE(called_) << "ReleaseCallback called twice"; + called_ = true; + ASSERT_FALSE(Traits::IsReleasedFunc(c_struct)) + << "ReleaseCallback called with released ArrowSchema"; + // Call original release callback + c_struct->release = orig_release_; + c_struct->private_data = orig_private_data_; + Traits::ReleaseFunc(c_struct); + ASSERT_TRUE(Traits::IsReleasedFunc(c_struct)) + << "ReleaseCallback did not release ArrowSchema"; + } + + void AssertCalled() { ASSERT_TRUE(called_) << "ReleaseCallback was not called"; } + + void 
AssertNotCalled() { ASSERT_FALSE(called_) << "ReleaseCallback was called"; } + + private: + ARROW_DISALLOW_COPY_AND_ASSIGN(ReleaseCallback); + + bool called_; + void (*orig_release_)(CType*); + void* orig_private_data_; +}; + +using SchemaReleaseCallback = ReleaseCallback; +using ArrayReleaseCallback = ReleaseCallback; + +static const std::vector kMetadataKeys1{"key1", "key2"}; +static const std::vector kMetadataValues1{"", "bar"}; +// clang-format off +static const std::string kEncodedMetadata1{ // NOLINT: runtime/string + 2, 0, 0, 0, + 4, 0, 0, 0, 'k', 'e', 'y', '1', 0, 0, 0, 0, + 4, 0, 0, 0, 'k', 'e', 'y', '2', 3, 0, 0, 0, 'b', 'a', 'r'}; +// clang-format off + +static const std::vector kMetadataKeys2{"key"}; +static const std::vector kMetadataValues2{"abcde"}; +// clang-format off +static const std::string kEncodedMetadata2{ // NOLINT: runtime/string + 1, 0, 0, 0, + 3, 0, 0, 0, 'k', 'e', 'y', 5, 0, 0, 0, 'a', 'b', 'c', 'd', 'e'}; +// clang-format off + +static constexpr int64_t kDefaultFlags = ARROW_FLAG_NULLABLE; + +//////////////////////////////////////////////////////////////////////////// +// Schema export tests + +struct SchemaExportChecker { + SchemaExportChecker(std::vector flattened_formats, + std::vector flattened_names, + std::vector flattened_flags = {}, + std::vector flattened_metadata = {}) + : flattened_formats_(std::move(flattened_formats)), + flattened_names_(std::move(flattened_names)), + flattened_flags_( + flattened_flags.empty() + ? 
std::vector(flattened_formats_.size(), kDefaultFlags) + : std::move(flattened_flags)), + flattened_metadata_(std::move(flattened_metadata)), + flattened_index_(0) {} + + void operator()(struct ArrowSchema* c_export, bool inner = false) { + ASSERT_LT(flattened_index_, flattened_formats_.size()); + ASSERT_LT(flattened_index_, flattened_names_.size()); + ASSERT_LT(flattened_index_, flattened_flags_.size()); + ASSERT_EQ(std::string(c_export->format), flattened_formats_[flattened_index_]); + ASSERT_EQ(std::string(c_export->name), flattened_names_[flattened_index_]); + std::string expected_md; + if (!flattened_metadata_.empty()) { + expected_md = flattened_metadata_[flattened_index_]; + } + if (!expected_md.empty()) { + ASSERT_NE(c_export->metadata, nullptr); + ASSERT_EQ(std::string(c_export->metadata, expected_md.size()), expected_md); + } else { + ASSERT_EQ(c_export->metadata, nullptr); + } + ASSERT_EQ(c_export->flags, flattened_flags_[flattened_index_]); + ++flattened_index_; + + if (c_export->dictionary != nullptr) { + // Recurse into dictionary + operator()(c_export->dictionary, true); + } + + if (c_export->n_children > 0) { + ASSERT_NE(c_export->children, nullptr); + // Recurse into children + for (int64_t i = 0; i < c_export->n_children; ++i) { + ASSERT_NE(c_export->children[i], nullptr); + operator()(c_export->children[i], true); + } + } else { + ASSERT_EQ(c_export->children, nullptr); + } + + if (!inner) { + // Caller gave the right number of names and format strings + ASSERT_EQ(flattened_index_, flattened_formats_.size()); + ASSERT_EQ(flattened_index_, flattened_names_.size()); + ASSERT_EQ(flattened_index_, flattened_flags_.size()); + } + } + + const std::vector flattened_formats_; + const std::vector flattened_names_; + std::vector flattened_flags_; + const std::vector flattened_metadata_; + size_t flattened_index_; +}; + +class TestSchemaExport : public ::testing::Test { + public: + void SetUp() override { pool_ = default_memory_pool(); } + + template + void 
TestNested(const std::shared_ptr& schema_like, + std::vector flattened_formats, + std::vector flattened_names, + std::vector flattened_flags = {}, + std::vector flattened_metadata = {}) { + SchemaExportChecker checker(std::move(flattened_formats), std::move(flattened_names), + std::move(flattened_flags), std::move(flattened_metadata)); + + auto orig_bytes = pool_->bytes_allocated(); + + struct ArrowSchema c_export; + ASSERT_OK(ExportTraits::ExportFunc(*schema_like, &c_export)); + + SchemaExportGuard guard(&c_export); + auto new_bytes = pool_->bytes_allocated(); + ASSERT_GT(new_bytes, orig_bytes); + + checker(&c_export); + + // Release the ArrowSchema, underlying data should be destroyed + guard.Release(); + ASSERT_EQ(pool_->bytes_allocated(), orig_bytes); + } + + template + void TestPrimitive(const std::shared_ptr& schema_like, + const char* format, const std::string& name = "", + int64_t flags = kDefaultFlags, const std::string& metadata = "") { + TestNested(schema_like, {format}, {name}, {flags}, {metadata}); + } + + protected: + MemoryPool* pool_; +}; + +TEST_F(TestSchemaExport, Primitive) { + TestPrimitive(int8(), "c"); + TestPrimitive(int16(), "s"); + TestPrimitive(int32(), "i"); + TestPrimitive(int64(), "l"); + TestPrimitive(uint8(), "C"); + TestPrimitive(uint16(), "S"); + TestPrimitive(uint32(), "I"); + TestPrimitive(uint64(), "L"); + + TestPrimitive(boolean(), "b"); + TestPrimitive(null(), "n"); + + TestPrimitive(float16(), "e"); + TestPrimitive(float32(), "f"); + TestPrimitive(float64(), "g"); + + TestPrimitive(fixed_size_binary(3), "w:3"); + TestPrimitive(binary(), "z"); + TestPrimitive(large_binary(), "Z"); + TestPrimitive(utf8(), "u"); + TestPrimitive(large_utf8(), "U"); + + TestPrimitive(decimal(16, 4), "d:16,4"); +} + +TEST_F(TestSchemaExport, Temporal) { + TestPrimitive(date32(), "tdD"); + TestPrimitive(date64(), "tdm"); + TestPrimitive(time32(TimeUnit::SECOND), "tts"); + TestPrimitive(time32(TimeUnit::MILLI), "ttm"); + 
TestPrimitive(time64(TimeUnit::MICRO), "ttu"); + TestPrimitive(time64(TimeUnit::NANO), "ttn"); + TestPrimitive(duration(TimeUnit::SECOND), "tDs"); + TestPrimitive(duration(TimeUnit::MILLI), "tDm"); + TestPrimitive(duration(TimeUnit::MICRO), "tDu"); + TestPrimitive(duration(TimeUnit::NANO), "tDn"); + TestPrimitive(month_interval(), "tiM"); + + TestPrimitive(day_time_interval(), "tiD"); + + TestPrimitive(timestamp(TimeUnit::SECOND), "tss:"); + TestPrimitive(timestamp(TimeUnit::SECOND, "Europe/Paris"), "tss:Europe/Paris"); + TestPrimitive(timestamp(TimeUnit::MILLI), "tsm:"); + TestPrimitive(timestamp(TimeUnit::MILLI, "Europe/Paris"), "tsm:Europe/Paris"); + TestPrimitive(timestamp(TimeUnit::MICRO), "tsu:"); + TestPrimitive(timestamp(TimeUnit::MICRO, "Europe/Paris"), "tsu:Europe/Paris"); + TestPrimitive(timestamp(TimeUnit::NANO), "tsn:"); + TestPrimitive(timestamp(TimeUnit::NANO, "Europe/Paris"), "tsn:Europe/Paris"); +} + +TEST_F(TestSchemaExport, List) { + TestNested(list(int8()), {"+l", "c"}, {"", "item"}); + TestNested(large_list(uint16()), {"+L", "S"}, + {"", "item"}); + TestNested(fixed_size_list(int64(), 2), {"+w:2", "l"}, + {"", "item"}); + + TestNested(list(large_list(int32())), {"+l", "+L", "i"}, + {"", "item", "item"}); +} + +TEST_F(TestSchemaExport, Struct) { + auto type = struct_({field("a", int8()), field("b", utf8())}); + TestNested(type, {"+s", "c", "u"}, {"", "a", "b"}, + {ARROW_FLAG_NULLABLE, ARROW_FLAG_NULLABLE, ARROW_FLAG_NULLABLE}); + + // With nullable = false + type = struct_({field("a", int8(), /*nullable=*/false), field("b", utf8())}); + TestNested(type, {"+s", "c", "u"}, {"", "a", "b"}, + {ARROW_FLAG_NULLABLE, 0, ARROW_FLAG_NULLABLE}); + + // With metadata + auto f0 = type->child(0); + auto f1 = type->child(1)->WithMetadata( + key_value_metadata(kMetadataKeys1, kMetadataValues1)); + type = struct_({f0, f1}); + TestNested(type, {"+s", "c", "u"}, {"", "a", "b"}, + {ARROW_FLAG_NULLABLE, 0, ARROW_FLAG_NULLABLE}, + {"", "", kEncodedMetadata1}); +} + 
+TEST_F(TestSchemaExport, Map) { + TestNested(map(int8(), utf8()), + {"+m", "+s", "c", "u"}, {"", "entries", "key", "value"}, + {ARROW_FLAG_NULLABLE, 0, 0, ARROW_FLAG_NULLABLE}); + TestNested(map(int8(), utf8(), /*keys_sorted=*/ true), + {"+m", "+s", "c", "u"}, {"", "entries", "key", "value"}, + {ARROW_FLAG_NULLABLE | ARROW_FLAG_MAP_KEYS_SORTED, 0, 0, + ARROW_FLAG_NULLABLE}); +} + +TEST_F(TestSchemaExport, Union) { + // Dense + auto field_a = field("a", int8()); + auto field_b = field("b", boolean(), /*nullable=*/false); + auto type = union_({field_a, field_b}, {42, 43}, UnionMode::DENSE); + TestNested(type, {"+ud:42,43", "c", "b"}, {"", "a", "b"}, + {ARROW_FLAG_NULLABLE, ARROW_FLAG_NULLABLE, 0}); + // Sparse + field_a = field("a", int8(), /*nullable=*/false); + field_b = field("b", boolean()); + type = union_({field_a, field_b}, {42, 43}, UnionMode::SPARSE); + TestNested(type, {"+us:42,43", "c", "b"}, {"", "a", "b"}, + {ARROW_FLAG_NULLABLE, 0, ARROW_FLAG_NULLABLE}); +} + +TEST_F(TestSchemaExport, Dictionary) { + TestNested(dictionary(int32(), utf8()), {"i", "u"}, {"", ""}); + TestNested(dictionary(int32(), list(utf8()), /*ordered=*/true), + {"i", "+l", "u"}, {"", "", "item"}, + {ARROW_FLAG_NULLABLE | ARROW_FLAG_DICTIONARY_ORDERED, + ARROW_FLAG_NULLABLE, ARROW_FLAG_NULLABLE}); + TestNested(large_list(dictionary(int32(), list(utf8()))), + {"+L", "i", "+l", "u"}, {"", "item", "", "item"}); +} + +TEST_F(TestSchemaExport, ExportField) { + TestPrimitive(field("thing", null()), "n", "thing", ARROW_FLAG_NULLABLE); + // With nullable = false + TestPrimitive(field("thing", null(), /*nullable=*/false), "n", "thing", 0); + // With metadata + auto f = field("thing", null(), /*nullable=*/false); + f = f->WithMetadata(key_value_metadata(kMetadataKeys1, kMetadataValues1)); + TestPrimitive(f, "n", "thing", 0, kEncodedMetadata1); +} + +TEST_F(TestSchemaExport, ExportSchema) { + // A schema is exported as an equivalent struct type (+ top-level metadata) + auto f1 = field("nulls", 
null(), /*nullable=*/false); + auto f2 = field("lists", list(int64())); + auto schema = ::arrow::schema({f1, f2}); + TestNested(schema, {"+s", "n", "+l", "l"}, {"", "nulls", "lists", "item"}, + {0, 0, ARROW_FLAG_NULLABLE, ARROW_FLAG_NULLABLE}); + + // With field metadata + f2 = f2->WithMetadata(key_value_metadata(kMetadataKeys1, kMetadataValues1)); + schema = ::arrow::schema({f1, f2}); + TestNested(schema, {"+s", "n", "+l", "l"}, {"", "nulls", "lists", "item"}, + {0, 0, ARROW_FLAG_NULLABLE, ARROW_FLAG_NULLABLE}, + {"", "", kEncodedMetadata1, ""}); + + // With field metadata and schema metadata + schema = schema->WithMetadata(key_value_metadata(kMetadataKeys2, kMetadataValues2)); + TestNested(schema, {"+s", "n", "+l", "l"}, {"", "nulls", "lists", "item"}, + {0, 0, ARROW_FLAG_NULLABLE, ARROW_FLAG_NULLABLE}, + {kEncodedMetadata2, "", kEncodedMetadata1, ""}); +} + +//////////////////////////////////////////////////////////////////////////// +// Array export tests + +struct ArrayExportChecker { + void operator()(struct ArrowArray* c_export, const ArrayData& expected_data) { + ASSERT_EQ(c_export->length, expected_data.length); + ASSERT_EQ(c_export->null_count, expected_data.null_count); + ASSERT_EQ(c_export->offset, expected_data.offset); + + ASSERT_EQ(c_export->n_buffers, static_cast(expected_data.buffers.size())); + ASSERT_EQ(c_export->n_children, + static_cast(expected_data.child_data.size())); + ASSERT_NE(c_export->buffers, nullptr); + for (int64_t i = 0; i < c_export->n_buffers; ++i) { + auto expected_ptr = + expected_data.buffers[i] ? 
expected_data.buffers[i]->data() : nullptr; + ASSERT_EQ(c_export->buffers[i], expected_ptr); + } + + if (expected_data.dictionary != nullptr) { + // Recurse into dictionary + ASSERT_NE(c_export->dictionary, nullptr); + operator()(c_export->dictionary, *expected_data.dictionary->data()); + } else { + ASSERT_EQ(c_export->dictionary, nullptr); + } + + if (c_export->n_children > 0) { + ASSERT_NE(c_export->children, nullptr); + // Recurse into children + for (int64_t i = 0; i < c_export->n_children; ++i) { + ASSERT_NE(c_export->children[i], nullptr); + operator()(c_export->children[i], *expected_data.child_data[i]); + } + } else { + ASSERT_EQ(c_export->children, nullptr); + } + } +}; + +struct RecordBatchExportChecker { + void operator()(struct ArrowArray* c_export, const RecordBatch& expected_batch) { + ASSERT_EQ(c_export->length, expected_batch.num_rows()); + ASSERT_EQ(c_export->null_count, 0); + ASSERT_EQ(c_export->offset, 0); + + ASSERT_EQ(c_export->n_buffers, 1); // Like a struct array + ASSERT_NE(c_export->buffers, nullptr); + ASSERT_EQ(c_export->buffers[0], nullptr); // No null bitmap + ASSERT_EQ(c_export->dictionary, nullptr); + + ASSERT_EQ(c_export->n_children, expected_batch.num_columns()); + if (c_export->n_children > 0) { + ArrayExportChecker array_checker{}; + + ASSERT_NE(c_export->children, nullptr); + // Recurse into children + for (int i = 0; i < expected_batch.num_columns(); ++i) { + ASSERT_NE(c_export->children[i], nullptr); + array_checker(c_export->children[i], + *expected_batch.column(i)->data()); + } + } else { + ASSERT_EQ(c_export->children, nullptr); + } + } +}; + +class TestArrayExport : public ::testing::Test { + public: + void SetUp() override { pool_ = default_memory_pool(); } + + static std::function*)> JSONArrayFactory( + std::shared_ptr type, const char* json) { + return [=](std::shared_ptr* out) -> Status { + return ::arrow::ipc::internal::json::ArrayFromJSON(type, json, out); + }; + } + + template + void 
TestWithArrayFactory(ArrayFactory&& factory, ExportCheckFunc&& check_func) { + auto orig_bytes = pool_->bytes_allocated(); + + std::shared_ptr arr; + ASSERT_OK(factory(&arr)); + const ArrayData& data = *arr->data(); // non-owning reference + struct ArrowArray c_export; + ASSERT_OK(ExportArray(*arr, &c_export)); + + ArrayExportGuard guard(&c_export); + auto new_bytes = pool_->bytes_allocated(); + ASSERT_GT(new_bytes, orig_bytes); + + // Release the shared_ptr, underlying data should be held alive + arr.reset(); + ASSERT_EQ(pool_->bytes_allocated(), new_bytes); + check_func(&c_export, data); + + // Release the ArrowArray, underlying data should be destroyed + guard.Release(); + ASSERT_EQ(pool_->bytes_allocated(), orig_bytes); + } + + template + void TestNested(ArrayFactory&& factory) { + ArrayExportChecker checker; + TestWithArrayFactory(std::forward(factory), checker); + } + + void TestNested(const std::shared_ptr& type, const char* json) { + TestNested(JSONArrayFactory(type, json)); + } + + template + void TestPrimitive(ArrayFactory&& factory) { + TestNested(std::forward(factory)); + } + + void TestPrimitive(const std::shared_ptr& type, const char* json) { + TestNested(type, json); + } + + template + void TestMoveWithArrayFactory(ArrayFactory&& factory, ExportCheckFunc&& check_func) { + auto orig_bytes = pool_->bytes_allocated(); + + std::shared_ptr arr; + ASSERT_OK(factory(&arr)); + const ArrayData& data = *arr->data(); // non-owning reference + struct ArrowArray c_export_temp, c_export_final; + ASSERT_OK(ExportArray(*arr, &c_export_temp)); + + // Move the ArrowArray to its final location + ArrowArrayMove(&c_export_temp, &c_export_final); + ASSERT_TRUE(ArrowArrayIsReleased(&c_export_temp)); + + ArrayExportGuard guard(&c_export_final); + auto new_bytes = pool_->bytes_allocated(); + ASSERT_GT(new_bytes, orig_bytes); + check_func(&c_export_final, data); + + // Release the shared_ptr, underlying data should be held alive + arr.reset(); + 
ASSERT_EQ(pool_->bytes_allocated(), new_bytes); + check_func(&c_export_final, data); + + // Release the ArrowArray, underlying data should be destroyed + guard.Release(); + ASSERT_EQ(pool_->bytes_allocated(), orig_bytes); + } + + template + void TestMoveNested(ArrayFactory&& factory) { + ArrayExportChecker checker; + + TestMoveWithArrayFactory(std::forward(factory), checker); + } + + void TestMoveNested(const std::shared_ptr& type, const char* json) { + TestMoveNested(JSONArrayFactory(type, json)); + } + + void TestMovePrimitive(const std::shared_ptr& type, const char* json) { + TestMoveNested(type, json); + } + + template + void TestMoveChildWithArrayFactory(ArrayFactory&& factory, int64_t child_id, + ExportCheckFunc&& check_func) { + auto orig_bytes = pool_->bytes_allocated(); + + std::shared_ptr arr; + ASSERT_OK(factory(&arr)); + struct ArrowArray c_export_parent, c_export_child; + ASSERT_OK(ExportArray(*arr, &c_export_parent)); + + auto bytes_with_parent = pool_->bytes_allocated(); + ASSERT_GT(bytes_with_parent, orig_bytes); + + // Move the child ArrowArray to its final location + { + ArrayExportGuard parent_guard(&c_export_parent); + ASSERT_LT(child_id, c_export_parent.n_children); + ArrowArrayMove(c_export_parent.children[child_id], &c_export_child); + } + ArrayExportGuard child_guard(&c_export_child); + + // Now parent is released + ASSERT_TRUE(ArrowArrayIsReleased(&c_export_parent)); + auto bytes_with_child = pool_->bytes_allocated(); + ASSERT_LT(bytes_with_child, bytes_with_parent); + ASSERT_GT(bytes_with_child, orig_bytes); + + const ArrayData& data = *arr->data()->child_data[child_id]; // non-owning reference + check_func(&c_export_child, data); + + // Release the shared_ptr, some underlying data should be held alive + arr.reset(); + ASSERT_LT(pool_->bytes_allocated(), bytes_with_child); + ASSERT_GT(pool_->bytes_allocated(), orig_bytes); + check_func(&c_export_child, data); + + // Release the ArrowArray, underlying data should be destroyed + 
child_guard.Release(); + ASSERT_EQ(pool_->bytes_allocated(), orig_bytes); + } + + template + void TestMoveChild(ArrayFactory&& factory, int64_t child_id) { + ArrayExportChecker checker; + + TestMoveChildWithArrayFactory(std::forward(factory), child_id, checker); + } + + void TestMoveChild(const std::shared_ptr& type, const char* json, + int64_t child_id) { + TestMoveChild(JSONArrayFactory(type, json), child_id); + } + + protected: + MemoryPool* pool_; +}; + +TEST_F(TestArrayExport, Primitive) { + TestPrimitive(int8(), "[1, 2, null, -3]"); + TestPrimitive(int16(), "[1, 2, -3]"); + TestPrimitive(int32(), "[1, 2, null, -3]"); + TestPrimitive(int64(), "[1, 2, -3]"); + TestPrimitive(uint8(), "[1, 2, 3]"); + TestPrimitive(uint16(), "[1, 2, null, 3]"); + TestPrimitive(uint32(), "[1, 2, 3]"); + TestPrimitive(uint64(), "[1, 2, null, 3]"); + + TestPrimitive(boolean(), "[true, false, null]"); + TestPrimitive(null(), "[null, null]"); + + TestPrimitive(float32(), "[1.5, null]"); + TestPrimitive(float64(), "[1.5, null]"); + + TestPrimitive(fixed_size_binary(3), R"(["foo", "bar", null])"); + TestPrimitive(binary(), R"(["foo", "bar", null])"); + TestPrimitive(large_binary(), R"(["foo", "bar", null])"); + TestPrimitive(utf8(), R"(["foo", "bar", null])"); + TestPrimitive(large_utf8(), R"(["foo", "bar", null])"); + + TestPrimitive(decimal(16, 4), R"(["1234.5670", null])"); +} + +TEST_F(TestArrayExport, PrimitiveSliced) { + auto factory = [](std::shared_ptr* out) -> Status { + *out = ArrayFromJSON(int16(), "[1, 2, null, -3]")->Slice(1, 2); + return Status::OK(); + }; + + TestPrimitive(factory); +} + +TEST_F(TestArrayExport, Null) { + TestPrimitive(null(), "[null, null, null]"); + TestPrimitive(null(), "[]"); +} + +TEST_F(TestArrayExport, Temporal) { + const char* json = "[1, 2, null, 42]"; + TestPrimitive(date32(), json); + TestPrimitive(date64(), json); + TestPrimitive(time32(TimeUnit::SECOND), json); + TestPrimitive(time32(TimeUnit::MILLI), json); + 
TestPrimitive(time64(TimeUnit::MICRO), json); + TestPrimitive(time64(TimeUnit::NANO), json); + TestPrimitive(duration(TimeUnit::SECOND), json); + TestPrimitive(duration(TimeUnit::MILLI), json); + TestPrimitive(duration(TimeUnit::MICRO), json); + TestPrimitive(duration(TimeUnit::NANO), json); + TestPrimitive(month_interval(), json); + + TestPrimitive(day_time_interval(), "[[7, 600], null]"); + + json = R"(["1970-01-01","2000-02-29","1900-02-28"])"; + TestPrimitive(timestamp(TimeUnit::SECOND), json); + TestPrimitive(timestamp(TimeUnit::SECOND, "Europe/Paris"), json); + TestPrimitive(timestamp(TimeUnit::MILLI), json); + TestPrimitive(timestamp(TimeUnit::MILLI, "Europe/Paris"), json); + TestPrimitive(timestamp(TimeUnit::MICRO), json); + TestPrimitive(timestamp(TimeUnit::MICRO, "Europe/Paris"), json); + TestPrimitive(timestamp(TimeUnit::NANO), json); + TestPrimitive(timestamp(TimeUnit::NANO, "Europe/Paris"), json); +} + +TEST_F(TestArrayExport, List) { + TestNested(list(int8()), "[[1, 2], [3, null], null]"); + TestNested(large_list(uint16()), "[[1, 2], [3, null], null]"); + TestNested(fixed_size_list(int64(), 2), "[[1, 2], [3, null], null]"); + + TestNested(list(large_list(int32())), "[[[1, 2], [3], null], null]"); +} + +TEST_F(TestArrayExport, ListSliced) { + { + auto factory = [](std::shared_ptr* out) -> Status { + *out = ArrayFromJSON(list(int8()), "[[1, 2], [3, null], [4, 5, 6], null]") + ->Slice(1, 2); + return Status::OK(); + }; + TestNested(factory); + } + { + auto factory = [](std::shared_ptr* out) -> Status { + auto values = ArrayFromJSON(int16(), "[1, 2, 3, 4, null, 5, 6, 7, 8]")->Slice(1, 6); + auto offsets = ArrayFromJSON(int32(), "[0, 2, 3, 5, 6]")->Slice(2, 4); + return ListArray::FromArrays(*offsets, *values, default_memory_pool(), out); + }; + TestNested(factory); + } +} + +TEST_F(TestArrayExport, Struct) { + const char* data = R"([[1, "foo"], [2, null]])"; + auto type = struct_({field("a", int8()), field("b", utf8())}); + TestNested(type, data); +} + 
+TEST_F(TestArrayExport, Map) { + const char* json = R"([[[1, "foo"], [2, null]], [[3, "bar"]]])"; + TestNested(map(int8(), utf8()), json); + TestNested(map(int8(), utf8(), /*keys_sorted=*/ true), json); +} + +TEST_F(TestArrayExport, Union) { + const char* data = "[null, [42, 1], [43, true], [42, null], [42, 2]]"; + // Dense + auto field_a = field("a", int8()); + auto field_b = field("b", boolean(), /*nullable=*/false); + auto type = union_({field_a, field_b}, {42, 43}, UnionMode::DENSE); + TestNested(type, data); + // Sparse + field_a = field("a", int8(), /*nullable=*/false); + field_b = field("b", boolean()); + type = union_({field_a, field_b}, {42, 43}, UnionMode::SPARSE); + TestNested(type, data); +} + +TEST_F(TestArrayExport, Dictionary) { + { + auto factory = [](std::shared_ptr* out) -> Status { + auto values = ArrayFromJSON(utf8(), R"(["foo", "bar", "quux"])"); + auto indices = ArrayFromJSON(int32(), "[0, 2, 1, null, 1]"); + return DictionaryArray::FromArrays(dictionary(indices->type(), values->type()), + indices, values, out); + }; + TestNested(factory); + } + { + auto factory = [](std::shared_ptr* out) -> Status { + auto values = ArrayFromJSON(list(utf8()), R"([["abc", "def"], ["efg"], []])"); + auto indices = ArrayFromJSON(int32(), "[0, 2, 1, null, 1]"); + return DictionaryArray::FromArrays( + dictionary(indices->type(), values->type(), /*ordered=*/true), indices, values, + out); + }; + TestNested(factory); + } + { + auto factory = [](std::shared_ptr* out) -> Status { + auto values = ArrayFromJSON(list(utf8()), R"([["abc", "def"], ["efg"], []])"); + auto indices = ArrayFromJSON(int32(), "[0, 2, 1, null, 1]"); + std::shared_ptr dict_array; + RETURN_NOT_OK(DictionaryArray::FromArrays( + dictionary(indices->type(), values->type()), indices, values, &dict_array)); + auto offsets = ArrayFromJSON(int64(), "[0, 2, 5]"); + RETURN_NOT_OK( + LargeListArray::FromArrays(*offsets, *dict_array, default_memory_pool(), out)); + return (*out)->ValidateFull(); + }; + 
TestNested(factory); + } +} + +TEST_F(TestArrayExport, MovePrimitive) { + TestMovePrimitive(int8(), "[1, 2, null, -3]"); + TestMovePrimitive(fixed_size_binary(3), R"(["foo", "bar", null])"); + TestMovePrimitive(binary(), R"(["foo", "bar", null])"); +} + +TEST_F(TestArrayExport, MoveNested) { + TestMoveNested(list(int8()), "[[1, 2], [3, null], null]"); + TestMoveNested(list(large_list(int32())), "[[[1, 2], [3], null], null]"); + TestMoveNested(struct_({field("a", int8()), field("b", utf8())}), + R"([[1, "foo"], [2, null]])"); +} + +TEST_F(TestArrayExport, MoveDictionary) { + { + auto factory = [](std::shared_ptr* out) -> Status { + auto values = ArrayFromJSON(utf8(), R"(["foo", "bar", "quux"])"); + auto indices = ArrayFromJSON(int32(), "[0, 2, 1, null, 1]"); + return DictionaryArray::FromArrays(dictionary(indices->type(), values->type()), + indices, values, out); + }; + TestMoveNested(factory); + } + { + auto factory = [](std::shared_ptr* out) -> Status { + auto values = ArrayFromJSON(list(utf8()), R"([["abc", "def"], ["efg"], []])"); + auto indices = ArrayFromJSON(int32(), "[0, 2, 1, null, 1]"); + std::shared_ptr dict_array; + RETURN_NOT_OK(DictionaryArray::FromArrays( + dictionary(indices->type(), values->type()), indices, values, &dict_array)); + auto offsets = ArrayFromJSON(int64(), "[0, 2, 5]"); + RETURN_NOT_OK( + LargeListArray::FromArrays(*offsets, *dict_array, default_memory_pool(), out)); + return (*out)->ValidateFull(); + }; + TestMoveNested(factory); + } +} + +TEST_F(TestArrayExport, MoveChild) { + TestMoveChild(list(int8()), "[[1, 2], [3, null], null]", /*child_id=*/0); + TestMoveChild(list(large_list(int32())), "[[[1, 2], [3], null], null]", + /*child_id=*/0); + TestMoveChild(struct_({field("ints", int8()), field("strs", utf8())}), + R"([[1, "foo"], [2, null]])", + /*child_id=*/0); + TestMoveChild(struct_({field("ints", int8()), field("strs", utf8())}), + R"([[1, "foo"], [2, null]])", + /*child_id=*/1); + { + auto factory = [](std::shared_ptr* out) -> 
Status { + auto values = ArrayFromJSON(list(utf8()), R"([["abc", "def"], ["efg"], []])"); + auto indices = ArrayFromJSON(int32(), "[0, 2, 1, null, 1]"); + std::shared_ptr dict_array; + RETURN_NOT_OK(DictionaryArray::FromArrays( + dictionary(indices->type(), values->type()), indices, values, &dict_array)); + auto offsets = ArrayFromJSON(int64(), "[0, 2, 5]"); + RETURN_NOT_OK( + LargeListArray::FromArrays(*offsets, *dict_array, default_memory_pool(), out)); + return (*out)->ValidateFull(); + }; + TestMoveChild(factory, /*child_id=*/0); + } +} + +TEST_F(TestArrayExport, ExportArrayAndType) { + struct ArrowSchema c_schema{}; + struct ArrowArray c_array{}; + SchemaExportGuard schema_guard(&c_schema); + ArrayExportGuard array_guard(&c_array); + + auto array = ArrayFromJSON(int8(), "[1, 2, 3]"); + ASSERT_OK(ExportArray(*array, &c_array, &c_schema)); + const ArrayData& data = *array->data(); + array.reset(); + ASSERT_FALSE(ArrowSchemaIsReleased(&c_schema)); + ASSERT_FALSE(ArrowArrayIsReleased(&c_array)); + ASSERT_EQ(c_schema.format, std::string("c")); + ASSERT_EQ(c_schema.n_children, 0); + ArrayExportChecker checker{}; + checker(&c_array, data); +} + +TEST_F(TestArrayExport, ExportRecordBatch) { + struct ArrowSchema c_schema{}; + struct ArrowArray c_array{}; + + auto schema = ::arrow::schema( + {field("ints", int16()), field("bools", boolean(), /*nullable=*/false)}); + schema = schema->WithMetadata(key_value_metadata(kMetadataKeys2, kMetadataValues2)); + auto arr0 = ArrayFromJSON(int16(), "[1, 2, null]"); + auto arr1 = ArrayFromJSON(boolean(), "[false, true, false]"); + + auto batch_factory = [&]() { + return RecordBatch::Make(schema, 3, {arr0, arr1}); + }; + + { + auto batch = batch_factory(); + + ASSERT_OK(ExportRecordBatch(*batch, &c_array)); + ArrayExportGuard array_guard(&c_array); + RecordBatchExportChecker checker{}; + checker(&c_array, *batch); + + // Create batch anew, with the same buffer pointers + batch = batch_factory(); + checker(&c_array, *batch); + } + { + 
// Check one can export both schema and record batch at once + auto batch = batch_factory(); + + ASSERT_OK(ExportRecordBatch(*batch, &c_array, &c_schema)); + SchemaExportGuard schema_guard(&c_schema); + ArrayExportGuard array_guard(&c_array); + ASSERT_EQ(c_schema.format, std::string("+s")); + ASSERT_EQ(c_schema.n_children, 2); + ASSERT_NE(c_schema.metadata, nullptr); + ASSERT_EQ(kEncodedMetadata2, + std::string(c_schema.metadata, kEncodedMetadata2.size())); + RecordBatchExportChecker checker{}; + checker(&c_array, *batch); + + // Create batch anew, with the same buffer pointers + batch = batch_factory(); + checker(&c_array, *batch); + } +} + +//////////////////////////////////////////////////////////////////////////// +// Schema import tests + +void NoOpSchemaRelease(struct ArrowSchema* schema) { + ArrowSchemaMarkReleased(schema); +} + +class SchemaStructBuilder { + public: + SchemaStructBuilder() { Reset(); } + + void Reset() { + memset(&c_struct_, 0, sizeof(c_struct_)); + c_struct_.release = NoOpSchemaRelease; + nested_structs_.clear(); + children_arrays_.clear(); + } + + // Create a new ArrowSchema struct with a stable C pointer + struct ArrowSchema* AddChild() { + nested_structs_.emplace_back(); + struct ArrowSchema* result = &nested_structs_.back(); + memset(result, 0, sizeof(*result)); + result->release = NoOpSchemaRelease; + return result; + } + + // Create a stable C pointer to the N last structs in nested_structs_ + struct ArrowSchema** NLastChildren(int64_t n_children, struct ArrowSchema* parent) { + children_arrays_.emplace_back(n_children); + struct ArrowSchema** children = children_arrays_.back().data(); + int64_t nested_offset; + // If parent is itself at the end of nested_structs_, skip it + if (parent != nullptr && &nested_structs_.back() == parent) { + nested_offset = static_cast(nested_structs_.size()) - n_children - 1; + } else { + nested_offset = static_cast(nested_structs_.size()) - n_children; + } + for (int64_t i = 0; i < n_children; ++i) { + 
children[i] = &nested_structs_[nested_offset + i]; + } + return children; + } + + struct ArrowSchema* LastChild(struct ArrowSchema* parent = nullptr) { + return *NLastChildren(1, parent); + } + + void FillPrimitive(struct ArrowSchema* c, const char* format, + const char* name = nullptr, int64_t flags = kDefaultFlags) { + c->flags = flags; + c->format = format; + c->name = name; + } + + void FillDictionary(struct ArrowSchema* c) { c->dictionary = LastChild(c); } + + void FillListLike(struct ArrowSchema* c, const char* format, + const char* name = nullptr, int64_t flags = kDefaultFlags) { + c->flags = flags; + c->format = format; + c->name = name; + c->n_children = 1; + c->children = NLastChildren(1, c); + c->children[0]->name = "item"; + } + + void FillStructLike(struct ArrowSchema* c, const char* format, + int64_t n_children, const char* name = nullptr, + int64_t flags = kDefaultFlags) { + c->flags = flags; + c->format = format; + c->name = name; + c->n_children = n_children; + c->children = NLastChildren(c->n_children, c); + } + + void FillPrimitive(const char* format, const char* name = nullptr, + int64_t flags = kDefaultFlags) { + FillPrimitive(&c_struct_, format, name, flags); + } + + void FillDictionary() { FillDictionary(&c_struct_); } + + void FillListLike(const char* format, const char* name = nullptr, + int64_t flags = kDefaultFlags) { + FillListLike(&c_struct_, format, name, flags); + } + + void FillStructLike(const char* format, int64_t n_children, + const char* name = nullptr, int64_t flags = kDefaultFlags) { + FillStructLike(&c_struct_, format, n_children, name, flags); + } + + struct ArrowSchema c_struct_; + // Deque elements don't move when the deque is appended to, which allows taking + // stable C pointers to them. 
+ std::deque nested_structs_; + std::deque> children_arrays_; +}; + +class TestSchemaImport : public ::testing::Test, public SchemaStructBuilder { + public: + void SetUp() override { + Reset(); + } + + void CheckImport(const std::shared_ptr& expected) { + SchemaReleaseCallback cb(&c_struct_); + + ASSERT_OK_AND_ASSIGN(auto type, ImportType(&c_struct_)); + ASSERT_TRUE(ArrowSchemaIsReleased(&c_struct_)); + Reset(); // for further tests + cb.AssertCalled(); // was released + AssertTypeEqual(*expected, *type); + } + + void CheckImport(const std::shared_ptr& expected) { + SchemaReleaseCallback cb(&c_struct_); + + ASSERT_OK_AND_ASSIGN(auto field, ImportField(&c_struct_)); + ASSERT_TRUE(ArrowSchemaIsReleased(&c_struct_)); + Reset(); // for further tests + cb.AssertCalled(); // was released + AssertFieldEqual(*expected, *field); + } + + void CheckImport(const std::shared_ptr& expected) { + SchemaReleaseCallback cb(&c_struct_); + + ASSERT_OK_AND_ASSIGN(auto schema, ImportSchema(&c_struct_)); + ASSERT_TRUE(ArrowSchemaIsReleased(&c_struct_)); + Reset(); // for further tests + cb.AssertCalled(); // was released + AssertSchemaEqual(*expected, *schema); + } + + void CheckImportError() { + SchemaReleaseCallback cb(&c_struct_); + + ASSERT_RAISES(Invalid, ImportField(&c_struct_)); + ASSERT_TRUE(ArrowSchemaIsReleased(&c_struct_)); + cb.AssertCalled(); // was released + } + + void CheckSchemaImportError() { + SchemaReleaseCallback cb(&c_struct_); + + ASSERT_RAISES(Invalid, ImportSchema(&c_struct_)); + ASSERT_TRUE(ArrowSchemaIsReleased(&c_struct_)); + cb.AssertCalled(); // was released + } +}; + +TEST_F(TestSchemaImport, Primitive) { + FillPrimitive("c"); + CheckImport(int8()); + FillPrimitive("c"); + CheckImport(field("", int8())); + FillPrimitive("C"); + CheckImport(field("", uint8())); + FillPrimitive("s"); + CheckImport(field("", int16())); + FillPrimitive("S"); + CheckImport(field("", uint16())); + FillPrimitive("i"); + CheckImport(field("", int32())); + FillPrimitive("I"); + 
CheckImport(field("", uint32())); + FillPrimitive("l"); + CheckImport(field("", int64())); + FillPrimitive("L"); + CheckImport(field("", uint64())); + + FillPrimitive("b"); + CheckImport(field("", boolean())); + FillPrimitive("e"); + CheckImport(field("", float16())); + FillPrimitive("f"); + CheckImport(field("", float32())); + FillPrimitive("g"); + CheckImport(field("", float64())); +} + +TEST_F(TestSchemaImport, Temporal) { + FillPrimitive("tdD"); + CheckImport(date32()); + FillPrimitive("tdm"); + CheckImport(date64()); + + FillPrimitive("tts"); + CheckImport(time32(TimeUnit::SECOND)); + FillPrimitive("ttm"); + CheckImport(time32(TimeUnit::MILLI)); + FillPrimitive("ttu"); + CheckImport(time64(TimeUnit::MICRO)); + FillPrimitive("ttn"); + CheckImport(time64(TimeUnit::NANO)); + + FillPrimitive("tDs"); + CheckImport(duration(TimeUnit::SECOND)); + FillPrimitive("tDm"); + CheckImport(duration(TimeUnit::MILLI)); + FillPrimitive("tDu"); + CheckImport(duration(TimeUnit::MICRO)); + FillPrimitive("tDn"); + CheckImport(duration(TimeUnit::NANO)); + + FillPrimitive("tiM"); + CheckImport(month_interval()); + FillPrimitive("tiD"); + CheckImport(day_time_interval()); + + FillPrimitive("tss:"); + CheckImport(timestamp(TimeUnit::SECOND)); + FillPrimitive("tsm:"); + CheckImport(timestamp(TimeUnit::MILLI)); + FillPrimitive("tsu:"); + CheckImport(timestamp(TimeUnit::MICRO)); + FillPrimitive("tsn:"); + CheckImport(timestamp(TimeUnit::NANO)); + + FillPrimitive("tss:Europe/Paris"); + CheckImport(timestamp(TimeUnit::SECOND, "Europe/Paris")); + FillPrimitive("tsm:Europe/Paris"); + CheckImport(timestamp(TimeUnit::MILLI, "Europe/Paris")); + FillPrimitive("tsu:Europe/Paris"); + CheckImport(timestamp(TimeUnit::MICRO, "Europe/Paris")); + FillPrimitive("tsn:Europe/Paris"); + CheckImport(timestamp(TimeUnit::NANO, "Europe/Paris")); +} + +TEST_F(TestSchemaImport, String) { + FillPrimitive("u"); + CheckImport(utf8()); + FillPrimitive("z"); + CheckImport(binary()); + FillPrimitive("U"); + 
CheckImport(large_utf8()); + FillPrimitive("Z"); + CheckImport(large_binary()); + + FillPrimitive("w:3"); + CheckImport(fixed_size_binary(3)); + FillPrimitive("d:15,4"); + CheckImport(decimal(15, 4)); +} + +TEST_F(TestSchemaImport, List) { + FillPrimitive(AddChild(), "c"); + FillListLike("+l"); + CheckImport(list(int8())); + + FillPrimitive(AddChild(), "s", "item", 0); + FillListLike("+l"); + CheckImport(list(field("item", int16(), /*nullable=*/ false))); + + // Large list + FillPrimitive(AddChild(), "s"); + FillListLike("+L"); + CheckImport(large_list(int16())); + + // Fixed-size list + FillPrimitive(AddChild(), "c"); + FillListLike("+w:3"); + CheckImport(fixed_size_list(int8(), 3)); +} + +TEST_F(TestSchemaImport, NestedList) { + FillPrimitive(AddChild(), "c"); + FillListLike(AddChild(), "+l"); + FillListLike("+L"); + CheckImport(large_list(list(int8()))); + + FillPrimitive(AddChild(), "c"); + FillListLike(AddChild(), "+w:3"); + FillListLike("+l"); + CheckImport(list(fixed_size_list(int8(), 3))); +} + +TEST_F(TestSchemaImport, Struct) { + FillPrimitive(AddChild(), "u", "strs"); + FillPrimitive(AddChild(), "S", "ints"); + FillStructLike("+s", 2); + auto expected = struct_({field("strs", utf8()), field("ints", uint16())}); + CheckImport(expected); + + FillPrimitive(AddChild(), "u", "strs", 0); + FillPrimitive(AddChild(), "S", "ints", kDefaultFlags); + FillStructLike("+s", 2); + expected = struct_({field("strs", utf8(), /*nullable=*/false), + field("ints", uint16())}); + CheckImport(expected); + + // With metadata + auto c = AddChild(); + FillPrimitive(c, "u", "strs", 0); + c->metadata = kEncodedMetadata2.c_str(); + FillPrimitive(AddChild(), "S", "ints", kDefaultFlags); + FillStructLike("+s", 2); + expected = struct_( + {field("strs", utf8(), /*nullable=*/false, + key_value_metadata(kMetadataKeys2, kMetadataValues2)), + field("ints", uint16())}); + CheckImport(expected); +} + +TEST_F(TestSchemaImport, Union) { + // Sparse + FillPrimitive(AddChild(), "u", "strs"); + 
FillPrimitive(AddChild(), "c", "ints"); + FillStructLike("+us:43,42", 2); + auto expected = + union_({field("strs", utf8()), field("ints", int8())}, {43, 42}, UnionMode::SPARSE); + CheckImport(expected); + + // Dense + FillPrimitive(AddChild(), "u", "strs"); + FillPrimitive(AddChild(), "c", "ints"); + FillStructLike("+ud:43,42", 2); + expected = + union_({field("strs", utf8()), field("ints", int8())}, {43, 42}, UnionMode::DENSE); + CheckImport(expected); +} + +TEST_F(TestSchemaImport, Map) { + FillPrimitive(AddChild(), "u", "key"); + FillPrimitive(AddChild(), "i", "value"); + FillStructLike(AddChild(), "+s", 2, "entries"); + FillListLike("+m"); + auto expected = map(utf8(), int32()); + CheckImport(expected); + + FillPrimitive(AddChild(), "u", "key"); + FillPrimitive(AddChild(), "i", "value"); + FillStructLike(AddChild(), "+s", 2, "entries"); + FillListLike("+m", "", ARROW_FLAG_MAP_KEYS_SORTED); + expected = map(utf8(), int32(), /*keys_sorted=*/ true); + CheckImport(expected); +} + +TEST_F(TestSchemaImport, Dictionary) { + FillPrimitive(AddChild(), "u"); + FillPrimitive("c"); + FillDictionary(); + auto expected = dictionary(int8(), utf8()); + CheckImport(expected); + + FillPrimitive(AddChild(), "u"); + FillPrimitive("c", "", ARROW_FLAG_NULLABLE | ARROW_FLAG_DICTIONARY_ORDERED); + FillDictionary(); + expected = dictionary(int8(), utf8(), /*ordered=*/true); + CheckImport(expected); + + FillPrimitive(AddChild(), "u"); + FillListLike(AddChild(), "+L"); + FillPrimitive("c"); + FillDictionary(); + expected = dictionary(int8(), large_list(utf8())); + CheckImport(expected); + + FillPrimitive(AddChild(), "u"); + FillPrimitive(AddChild(), "c"); + FillDictionary(LastChild()); + FillListLike("+l"); + expected = list(dictionary(int8(), utf8())); + CheckImport(expected); +} + +TEST_F(TestSchemaImport, FormatStringError) { + FillPrimitive(""); + CheckImportError(); + FillPrimitive("cc"); + CheckImportError(); + FillPrimitive("w3"); + CheckImportError(); + FillPrimitive("w:three"); 
+ CheckImportError(); + FillPrimitive("w:3,5"); + CheckImportError(); + FillPrimitive("d:15"); + CheckImportError(); + FillPrimitive("d:15.4"); + CheckImportError(); + FillPrimitive("t"); + CheckImportError(); + FillPrimitive("td"); + CheckImportError(); + FillPrimitive("tz"); + CheckImportError(); + FillPrimitive("tdd"); + CheckImportError(); + FillPrimitive("tdDd"); + CheckImportError(); + FillPrimitive("tss"); + CheckImportError(); + FillPrimitive("tss;UTC"); + CheckImportError(); + FillPrimitive("+"); + CheckImportError(); + FillPrimitive("+mm"); + CheckImportError(); + FillPrimitive("+u"); + CheckImportError(); +} + +TEST_F(TestSchemaImport, UnionError) { + FillPrimitive(AddChild(), "u", "strs"); + FillStructLike("+uz", 1); + CheckImportError(); + + FillPrimitive(AddChild(), "u", "strs"); + FillStructLike("+uz:", 1); + CheckImportError(); + + FillPrimitive(AddChild(), "u", "strs"); + FillStructLike("+uz:1", 1); + CheckImportError(); + + FillPrimitive(AddChild(), "u", "strs"); + FillStructLike("+us:1.2", 1); + CheckImportError(); + + FillPrimitive(AddChild(), "u", "strs"); + FillStructLike("+ud:-1", 1); + CheckImportError(); + + FillPrimitive(AddChild(), "u", "strs"); + FillStructLike("+ud:1,2", 1); + CheckImportError(); +} + +TEST_F(TestSchemaImport, DictionaryError) { + // Bad index type + FillPrimitive(AddChild(), "c"); + FillPrimitive("u"); + FillDictionary(); + CheckImportError(); + + // Nested dictionary + FillPrimitive(AddChild(), "c"); + FillPrimitive(AddChild(), "u"); + FillDictionary(LastChild()); + FillPrimitive("u"); + FillDictionary(); + CheckImportError(); +} + +TEST_F(TestSchemaImport, RecursionError) { + FillPrimitive(AddChild(), "c", "unused"); + auto c = AddChild(); + FillStructLike(c, "+s", 1, "child"); + FillStructLike("+s", 1, "parent"); + c->children[0] = &c_struct_; + CheckImportError(); +} + +TEST_F(TestSchemaImport, ImportField) { + FillPrimitive("c", "thing", kDefaultFlags); + CheckImport(field("thing", int8())); + FillPrimitive("c", 
"thing", 0); + CheckImport(field("thing", int8(), /*nullable=*/ false)); + // With metadata + FillPrimitive("c", "thing", kDefaultFlags); + c_struct_.metadata = kEncodedMetadata1.c_str(); + CheckImport(field("thing", int8(), /*nullable=*/ true, + key_value_metadata(kMetadataKeys1, kMetadataValues1))); +} + +TEST_F(TestSchemaImport, ImportSchema) { + FillPrimitive(AddChild(), "l"); + FillListLike(AddChild(), "+l", "int_lists"); + FillPrimitive(AddChild(), "u", "strs"); + FillStructLike("+s", 2); + auto f1 = field("int_lists", list(int64())); + auto f2 = field("strs", utf8()); + auto expected = schema({f1, f2}); + CheckImport(expected); + + // With metadata + FillPrimitive(AddChild(), "l"); + FillListLike(AddChild(), "+l", "int_lists"); + LastChild()->metadata = kEncodedMetadata2.c_str(); + FillPrimitive(AddChild(), "u", "strs"); + FillStructLike("+s", 2); + c_struct_.metadata = kEncodedMetadata1.c_str(); + f1 = f1->WithMetadata(key_value_metadata(kMetadataKeys2, kMetadataValues2)); + expected = schema({f1, f2}, key_value_metadata(kMetadataKeys1, kMetadataValues1)); + CheckImport(expected); +} + +TEST_F(TestSchemaImport, ImportSchemaError) { + // Not a struct type + FillPrimitive("n"); + CheckSchemaImportError(); + + FillPrimitive(AddChild(), "l", "ints"); + FillPrimitive(AddChild(), "u", "strs"); + FillStructLike("+us:43,42", 2); + CheckSchemaImportError(); +} + +//////////////////////////////////////////////////////////////////////////// +// Data import tests + +// [true, false, true, true, false, true, true, true] * 2 +static const uint8_t bits_buffer1[] = {0xed, 0xed}; + +static const void* buffers_no_nulls_no_data[1] = {nullptr}; +static const void* buffers_nulls_no_data1[1] = {bits_buffer1}; + +static const uint8_t data_buffer1[] = {1, 2, 3, 4, 5, 6, 7, 8, + 9, 10, 11, 12, 13, 14, 15, 16}; +static const uint8_t data_buffer2[] = "abcdefghijklmnopqrstuvwxyz"; +static const uint64_t data_buffer3[] = {123456789, 0, 987654321, 0}; +static const uint8_t 
data_buffer4[] = {1, 2, 0, 1, 3, 0}; +static const float data_buffer5[] = {0.0f, 1.5f, -2.0f, 3.0f, 4.0f, 5.0f}; +static const double data_buffer6[] = {0.0, 1.5, -2.0, 3.0, 4.0, 5.0}; +static const int32_t data_buffer7[] = {1234, 5678, 9012, 3456}; +static const int64_t data_buffer8[] = {123456789, 987654321, -123456789, -987654321}; +static const void* primitive_buffers_no_nulls1[2] = {nullptr, data_buffer1}; +static const void* primitive_buffers_nulls1[2] = {bits_buffer1, data_buffer1}; +static const void* primitive_buffers_no_nulls2[2] = {nullptr, data_buffer2}; +static const void* primitive_buffers_no_nulls3[2] = {nullptr, data_buffer3}; +static const void* primitive_buffers_no_nulls4[2] = {nullptr, data_buffer4}; +static const void* primitive_buffers_no_nulls5[2] = {nullptr, data_buffer5}; +static const void* primitive_buffers_no_nulls6[2] = {nullptr, data_buffer6}; +static const void* primitive_buffers_no_nulls7[2] = {nullptr, data_buffer7}; +static const void* primitive_buffers_nulls7[2] = {bits_buffer1, data_buffer7}; +static const void* primitive_buffers_no_nulls8[2] = {nullptr, data_buffer8}; +static const void* primitive_buffers_nulls8[2] = {bits_buffer1, data_buffer8}; + +static const int64_t timestamp_data_buffer1[] = {0, 951782400, -2203977600LL}; +static const int64_t timestamp_data_buffer2[] = {0, 951782400000LL, -2203977600000LL}; +static const int64_t timestamp_data_buffer3[] = {0, 951782400000000LL, + -2203977600000000LL}; +static const int64_t timestamp_data_buffer4[] = {0, 951782400000000000LL, + -2203977600000000000LL}; +static const void* timestamp_buffers_no_nulls1[2] = {nullptr, timestamp_data_buffer1}; +static const void* timestamp_buffers_nulls1[2] = {bits_buffer1, timestamp_data_buffer1}; +static const void* timestamp_buffers_no_nulls2[2] = {nullptr, timestamp_data_buffer2}; +static const void* timestamp_buffers_no_nulls3[2] = {nullptr, timestamp_data_buffer3}; +static const void* timestamp_buffers_no_nulls4[2] = {nullptr, 
timestamp_data_buffer4}; + +static const uint8_t string_data_buffer1[] = "foobarquuxxyzzy"; + +static const int32_t string_offsets_buffer1[] = {0, 3, 3, 6, 10, 15}; +static const void* string_buffers_no_nulls1[3] = {nullptr, string_offsets_buffer1, + string_data_buffer1}; + +static const int64_t large_string_offsets_buffer1[] = {0, 3, 3, 6, 10}; +static const void* large_string_buffers_no_nulls1[3] = { + nullptr, large_string_offsets_buffer1, string_data_buffer1}; + +static const int32_t list_offsets_buffer1[] = {0, 2, 2, 5, 6, 8}; +static const void* list_buffers_no_nulls1[2] = {nullptr, list_offsets_buffer1}; +static const void* list_buffers_nulls1[2] = {bits_buffer1, list_offsets_buffer1}; + +static const int64_t large_list_offsets_buffer1[] = {0, 2, 2, 5, 6, 8}; +static const void* large_list_buffers_no_nulls1[2] = {nullptr, + large_list_offsets_buffer1}; + +static const int8_t type_codes_buffer1[] = {42, 42, 43, 43, 42}; +static const int32_t union_offsets_buffer1[] = {0, 1, 0, 1, 2}; +static const void* sparse_union_buffers_no_nulls1[3] = {nullptr, type_codes_buffer1, + nullptr}; +static const void* dense_union_buffers_no_nulls1[3] = {nullptr, type_codes_buffer1, + union_offsets_buffer1}; + +void NoOpArrayRelease(struct ArrowArray* schema) { + ArrowArrayMarkReleased(schema); +} + +class TestArrayImport : public ::testing::Test { + public: + void SetUp() override { + Reset(); + } + + void Reset() { + memset(&c_struct_, 0, sizeof(c_struct_)); + c_struct_.release = NoOpArrayRelease; + nested_structs_.clear(); + children_arrays_.clear(); + } + + // Create a new ArrowArray struct with a stable C pointer + struct ArrowArray* AddChild() { + nested_structs_.emplace_back(); + struct ArrowArray* result = &nested_structs_.back(); + memset(result, 0, sizeof(*result)); + result->release = NoOpArrayRelease; + return result; + } + + // Create a stable C pointer to the N last structs in nested_structs_ + struct ArrowArray** NLastChildren(int64_t n_children, struct 
ArrowArray* parent) { + children_arrays_.emplace_back(n_children); + struct ArrowArray** children = children_arrays_.back().data(); + int64_t nested_offset; + // If parent is itself at the end of nested_structs_, skip it + if (parent != nullptr && &nested_structs_.back() == parent) { + nested_offset = static_cast(nested_structs_.size()) - n_children - 1; + } else { + nested_offset = static_cast(nested_structs_.size()) - n_children; + } + for (int64_t i = 0; i < n_children; ++i) { + children[i] = &nested_structs_[nested_offset + i]; + } + return children; + } + + struct ArrowArray* LastChild(struct ArrowArray* parent = nullptr) { + return *NLastChildren(1, parent); + } + + void FillPrimitive(struct ArrowArray* c, int64_t length, + int64_t null_count, int64_t offset, const void** buffers) { + c->length = length; + c->null_count = null_count; + c->offset = offset; + c->n_buffers = 2; + c->buffers = buffers; + } + + void FillDictionary(struct ArrowArray* c) { c->dictionary = LastChild(c); } + + void FillStringLike(struct ArrowArray* c, int64_t length, + int64_t null_count, int64_t offset, const void** buffers) { + c->length = length; + c->null_count = null_count; + c->offset = offset; + c->n_buffers = 3; + c->buffers = buffers; + } + + void FillListLike(struct ArrowArray* c, int64_t length, + int64_t null_count, int64_t offset, const void** buffers) { + c->length = length; + c->null_count = null_count; + c->offset = offset; + c->n_buffers = 2; + c->buffers = buffers; + c->n_children = 1; + c->children = NLastChildren(1, c); + } + + void FillFixedSizeListLike(struct ArrowArray* c, int64_t length, + int64_t null_count, int64_t offset, const void** buffers) { + c->length = length; + c->null_count = null_count; + c->offset = offset; + c->n_buffers = 1; + c->buffers = buffers; + c->n_children = 1; + c->children = NLastChildren(1, c); + } + + void FillStructLike(struct ArrowArray* c, int64_t length, + int64_t null_count, int64_t offset, + int64_t n_children, const void** 
buffers) { + c->length = length; + c->null_count = null_count; + c->offset = offset; + c->n_buffers = 1; + c->buffers = buffers; + c->n_children = n_children; + c->children = NLastChildren(c->n_children, c); + } + + void FillUnionLike(struct ArrowArray* c, int64_t length, + int64_t null_count, int64_t offset, + int64_t n_children, const void** buffers) { + c->length = length; + c->null_count = null_count; + c->offset = offset; + c->n_buffers = 3; + c->buffers = buffers; + c->n_children = n_children; + c->children = NLastChildren(c->n_children, c); + } + + void FillPrimitive(int64_t length, int64_t null_count, + int64_t offset, const void** buffers) { + FillPrimitive(&c_struct_, length, null_count, offset, buffers); + } + + void FillDictionary() { FillDictionary(&c_struct_); } + + void FillStringLike(int64_t length, int64_t null_count, + int64_t offset, const void** buffers) { + FillStringLike(&c_struct_, length, null_count, offset, buffers); + } + + void FillListLike(int64_t length, int64_t null_count, + int64_t offset, const void** buffers) { + FillListLike(&c_struct_, length, null_count, offset, buffers); + } + + void FillFixedSizeListLike(int64_t length, int64_t null_count, + int64_t offset, const void** buffers) { + FillFixedSizeListLike(&c_struct_, length, null_count, offset, buffers); + } + + void FillStructLike(int64_t length, int64_t null_count, + int64_t offset, int64_t n_children, + const void** buffers) { + FillStructLike(&c_struct_, length, null_count, offset, n_children, buffers); + } + + void FillUnionLike(int64_t length, int64_t null_count, + int64_t offset, int64_t n_children, + const void** buffers) { + FillUnionLike(&c_struct_, length, null_count, offset, n_children, buffers); + } + + void CheckImport(const std::shared_ptr& expected) { + ArrayReleaseCallback cb(&c_struct_); + + auto type = expected->type(); + ASSERT_OK_AND_ASSIGN(auto array, ImportArray(&c_struct_, type)); + ASSERT_TRUE(ArrowArrayIsReleased(&c_struct_)); // was moved + Reset(); // 
for further tests + + ASSERT_OK(array->ValidateFull()); + // Special case: Null array doesn't have any data, so it needn't + // keep the ArrowArray struct alive. + if (type->id() != Type::NA) { + cb.AssertNotCalled(); + } + AssertArraysEqual(*expected, *array, true); + array.reset(); + cb.AssertCalled(); + } + + void CheckImport(const std::shared_ptr& expected) { + ArrayReleaseCallback cb(&c_struct_); + + auto schema = expected->schema(); + ASSERT_OK_AND_ASSIGN(auto batch, ImportRecordBatch(&c_struct_, schema)); + ASSERT_TRUE(ArrowArrayIsReleased(&c_struct_)); // was moved + Reset(); // for further tests + + ASSERT_OK(batch->ValidateFull()); + AssertBatchesEqual(*expected, *batch); + cb.AssertNotCalled(); + batch.reset(); + cb.AssertCalled(); + } + + void CheckImportError(const std::shared_ptr& type) { + ArrayReleaseCallback cb(&c_struct_); + + ASSERT_RAISES(Invalid, ImportArray(&c_struct_, type)); + ASSERT_TRUE(ArrowArrayIsReleased(&c_struct_)); + Reset(); // for further tests + cb.AssertCalled(); // was released + } + + void CheckImportError(const std::shared_ptr& schema) { + ArrayReleaseCallback cb(&c_struct_); + + ASSERT_RAISES(Invalid, ImportRecordBatch(&c_struct_, schema)); + ASSERT_TRUE(ArrowArrayIsReleased(&c_struct_)); + Reset(); // for further tests + cb.AssertCalled(); // was released + } + + protected: + struct ArrowArray c_struct_; + // Deque elements don't move when the deque is appended to, which allows taking + // stable C pointers to them. 
+ std::deque nested_structs_; + std::deque> children_arrays_; +}; + +TEST_F(TestArrayImport, Primitive) { + FillPrimitive(3, 0, 0, primitive_buffers_no_nulls1); + CheckImport(ArrayFromJSON(int8(), "[1, 2, 3]")); + FillPrimitive(5, 0, 0, primitive_buffers_no_nulls1); + CheckImport(ArrayFromJSON(uint8(), "[1, 2, 3, 4, 5]")); + FillPrimitive(3, 0, 0, primitive_buffers_no_nulls1); + CheckImport(ArrayFromJSON(int16(), "[513, 1027, 1541]")); + FillPrimitive(3, 0, 0, primitive_buffers_no_nulls1); + CheckImport(ArrayFromJSON(uint16(), "[513, 1027, 1541]")); + FillPrimitive(2, 0, 0, primitive_buffers_no_nulls1); + CheckImport(ArrayFromJSON(int32(), "[67305985, 134678021]")); + FillPrimitive(2, 0, 0, primitive_buffers_no_nulls1); + CheckImport(ArrayFromJSON(uint32(), "[67305985, 134678021]")); + FillPrimitive(2, 0, 0, primitive_buffers_no_nulls1); + CheckImport(ArrayFromJSON(int64(), "[578437695752307201, 1157159078456920585]")); + FillPrimitive(2, 0, 0, primitive_buffers_no_nulls1); + CheckImport(ArrayFromJSON(uint64(), "[578437695752307201, 1157159078456920585]")); + + FillPrimitive(3, 0, 0, primitive_buffers_no_nulls1); + CheckImport(ArrayFromJSON(boolean(), "[true, false, false]")); + FillPrimitive(6, 0, 0, primitive_buffers_no_nulls5); + CheckImport(ArrayFromJSON(float32(), "[0.0, 1.5, -2.0, 3.0, 4.0, 5.0]")); + FillPrimitive(6, 0, 0, primitive_buffers_no_nulls6); + CheckImport(ArrayFromJSON(float64(), "[0.0, 1.5, -2.0, 3.0, 4.0, 5.0]")); + + // With nulls + FillPrimitive(9, -1, 0, primitive_buffers_nulls1); + CheckImport(ArrayFromJSON(int8(), "[1, null, 3, 4, null, 6, 7, 8, 9]")); + FillPrimitive(9, 2, 0, primitive_buffers_nulls1); + CheckImport(ArrayFromJSON(int8(), "[1, null, 3, 4, null, 6, 7, 8, 9]")); + FillPrimitive(3, -1, 0, primitive_buffers_nulls1); + CheckImport(ArrayFromJSON(int16(), "[513, null, 1541]")); + FillPrimitive(3, 1, 0, primitive_buffers_nulls1); + CheckImport(ArrayFromJSON(int16(), "[513, null, 1541]")); + FillPrimitive(3, -1, 0, 
primitive_buffers_nulls1); + CheckImport(ArrayFromJSON(boolean(), "[true, null, false]")); + FillPrimitive(3, 1, 0, primitive_buffers_nulls1); + CheckImport(ArrayFromJSON(boolean(), "[true, null, false]")); +} + +TEST_F(TestArrayImport, Temporal) { + FillPrimitive(3, 0, 0, primitive_buffers_no_nulls7); + CheckImport(ArrayFromJSON(date32(), "[1234, 5678, 9012]")); + FillPrimitive(3, 0, 0, primitive_buffers_no_nulls8); + CheckImport(ArrayFromJSON(date64(), "[123456789, 987654321, -123456789]")); + + FillPrimitive(2, 0, 0, primitive_buffers_no_nulls7); + CheckImport(ArrayFromJSON(time32(TimeUnit::SECOND), "[1234, 5678]")); + FillPrimitive(2, 0, 0, primitive_buffers_no_nulls7); + CheckImport(ArrayFromJSON(time32(TimeUnit::MILLI), "[1234, 5678]")); + FillPrimitive(2, 0, 0, primitive_buffers_no_nulls8); + CheckImport(ArrayFromJSON(time64(TimeUnit::MICRO), "[123456789, 987654321]")); + FillPrimitive(2, 0, 0, primitive_buffers_no_nulls8); + CheckImport(ArrayFromJSON(time64(TimeUnit::NANO), "[123456789, 987654321]")); + + FillPrimitive(2, 0, 0, primitive_buffers_no_nulls8); + CheckImport(ArrayFromJSON(duration(TimeUnit::SECOND), "[123456789, 987654321]")); + FillPrimitive(2, 0, 0, primitive_buffers_no_nulls8); + CheckImport(ArrayFromJSON(duration(TimeUnit::MILLI), "[123456789, 987654321]")); + FillPrimitive(2, 0, 0, primitive_buffers_no_nulls8); + CheckImport(ArrayFromJSON(duration(TimeUnit::MICRO), "[123456789, 987654321]")); + FillPrimitive(2, 0, 0, primitive_buffers_no_nulls8); + CheckImport(ArrayFromJSON(duration(TimeUnit::NANO), "[123456789, 987654321]")); + + FillPrimitive(3, 0, 0, primitive_buffers_no_nulls7); + CheckImport(ArrayFromJSON(month_interval(), "[1234, 5678, 9012]")); + FillPrimitive(2, 0, 0, primitive_buffers_no_nulls7); + CheckImport(ArrayFromJSON(day_time_interval(), "[[1234, 5678], [9012, 3456]]")); + + const char* json = R"(["1970-01-01","2000-02-29","1900-02-28"])"; + FillPrimitive(3, 0, 0, timestamp_buffers_no_nulls1); + 
CheckImport(ArrayFromJSON(timestamp(TimeUnit::SECOND), json)); + FillPrimitive(3, 0, 0, timestamp_buffers_no_nulls2); + CheckImport(ArrayFromJSON(timestamp(TimeUnit::MILLI), json)); + FillPrimitive(3, 0, 0, timestamp_buffers_no_nulls3); + CheckImport(ArrayFromJSON(timestamp(TimeUnit::MICRO), json)); + FillPrimitive(3, 0, 0, timestamp_buffers_no_nulls4); + CheckImport(ArrayFromJSON(timestamp(TimeUnit::NANO), json)); + + // With nulls + FillPrimitive(3, -1, 0, primitive_buffers_nulls7); + CheckImport(ArrayFromJSON(date32(), "[1234, null, 9012]")); + FillPrimitive(3, -1, 0, primitive_buffers_nulls8); + CheckImport(ArrayFromJSON(date64(), "[123456789, null, -123456789]")); + FillPrimitive(2, -1, 0, primitive_buffers_nulls8); + CheckImport(ArrayFromJSON(time64(TimeUnit::NANO), "[123456789, null]")); + FillPrimitive(2, -1, 0, primitive_buffers_nulls8); + CheckImport(ArrayFromJSON(duration(TimeUnit::NANO), "[123456789, null]")); + FillPrimitive(3, -1, 0, primitive_buffers_nulls7); + CheckImport(ArrayFromJSON(month_interval(), "[1234, null, 9012]")); + FillPrimitive(2, -1, 0, primitive_buffers_nulls7); + CheckImport(ArrayFromJSON(day_time_interval(), "[[1234, 5678], null]")); + FillPrimitive(3, -1, 0, timestamp_buffers_nulls1); + CheckImport(ArrayFromJSON(timestamp(TimeUnit::SECOND, "UTC+2"), + R"(["1970-01-01",null,"1900-02-28"])")); +} + +TEST_F(TestArrayImport, Null) { + const void* buffers[] = {nullptr}; + c_struct_.length = 3; + c_struct_.null_count = 3; + c_struct_.offset = 0; + c_struct_.n_buffers = 1; + c_struct_.buffers = buffers; + CheckImport(ArrayFromJSON(null(), "[null, null, null]")); +} + +TEST_F(TestArrayImport, PrimitiveWithOffset) { + FillPrimitive(3, 0, 2, primitive_buffers_no_nulls1); + CheckImport(ArrayFromJSON(int8(), "[3, 4, 5]")); + FillPrimitive(3, 0, 1, primitive_buffers_no_nulls1); + CheckImport(ArrayFromJSON(uint16(), "[1027, 1541, 2055]")); + + FillPrimitive(4, 0, 7, primitive_buffers_no_nulls1); + CheckImport(ArrayFromJSON(boolean(), "[false, 
false, true, false]")); +} + +TEST_F(TestArrayImport, NullWithOffset) { + const void* buffers[] = {nullptr}; + c_struct_.length = 3; + c_struct_.null_count = 3; + c_struct_.offset = 5; + c_struct_.n_buffers = 1; + c_struct_.buffers = buffers; + CheckImport(ArrayFromJSON(null(), "[null, null, null]")); +} + +TEST_F(TestArrayImport, String) { + FillStringLike(4, 0, 0, string_buffers_no_nulls1); + CheckImport(ArrayFromJSON(utf8(), R"(["foo", "", "bar", "quux"])")); + FillStringLike(4, 0, 0, string_buffers_no_nulls1); + CheckImport(ArrayFromJSON(binary(), R"(["foo", "", "bar", "quux"])")); + FillStringLike(4, 0, 0, large_string_buffers_no_nulls1); + CheckImport(ArrayFromJSON(large_utf8(), R"(["foo", "", "bar", "quux"])")); + FillStringLike(4, 0, 0, large_string_buffers_no_nulls1); + CheckImport(ArrayFromJSON(large_binary(), R"(["foo", "", "bar", "quux"])")); + + FillPrimitive(2, 0, 0, primitive_buffers_no_nulls2); + CheckImport(ArrayFromJSON(fixed_size_binary(3), R"(["abc", "def"])")); + FillPrimitive(2, 0, 0, primitive_buffers_no_nulls3); + CheckImport(ArrayFromJSON(decimal(15, 4), R"(["12345.6789", "98765.4321"])")); +} + +TEST_F(TestArrayImport, List) { + FillPrimitive(AddChild(), 8, 0, 0, primitive_buffers_no_nulls1); + FillListLike(5, 0, 0, list_buffers_no_nulls1); + CheckImport(ArrayFromJSON(list(int8()), "[[1, 2], [], [3, 4, 5], [6], [7, 8]]")); + FillPrimitive(AddChild(), 5, 0, 0, primitive_buffers_no_nulls1); + FillListLike(3, 1, 0, list_buffers_nulls1); + CheckImport(ArrayFromJSON(list(int16()), "[[513, 1027], null, [1541, 2055, 2569]]")); + + // Large list + FillPrimitive(AddChild(), 5, 0, 0, primitive_buffers_no_nulls1); + FillListLike(3, 0, 0, large_list_buffers_no_nulls1); + CheckImport( + ArrayFromJSON(large_list(int16()), "[[513, 1027], [], [1541, 2055, 2569]]")); + + // Fixed-size list + FillPrimitive(AddChild(), 9, 0, 0, primitive_buffers_no_nulls1); + FillFixedSizeListLike(3, 0, 0, buffers_no_nulls_no_data); + CheckImport( + 
ArrayFromJSON(fixed_size_list(int8(), 3), "[[1, 2, 3], [4, 5, 6], [7, 8, 9]]")); +} + +TEST_F(TestArrayImport, NestedList) { + FillPrimitive(AddChild(), 8, 0, 0, primitive_buffers_no_nulls1); + FillListLike(AddChild(), 5, 0, 0, list_buffers_no_nulls1); + FillListLike(3, 0, 0, large_list_buffers_no_nulls1); + CheckImport(ArrayFromJSON(large_list(list(int8())), + "[[[1, 2], []], [], [[3, 4, 5], [6], [7, 8]]]")); + + FillPrimitive(AddChild(), 6, 0, 0, primitive_buffers_no_nulls1); + FillFixedSizeListLike(AddChild(), 2, 0, 0, buffers_no_nulls_no_data); + FillListLike(2, 0, 0, list_buffers_no_nulls1); + CheckImport( + ArrayFromJSON(list(fixed_size_list(int8(), 3)), "[[[1, 2, 3], [4, 5, 6]], []]")); +} + +TEST_F(TestArrayImport, ListWithOffset) { + // Offset in child + FillPrimitive(AddChild(), 8, 0, 1, primitive_buffers_no_nulls1); + FillListLike(5, 0, 0, list_buffers_no_nulls1); + CheckImport(ArrayFromJSON(list(int8()), "[[2, 3], [], [4, 5, 6], [7], [8, 9]]")); + + FillPrimitive(AddChild(), 9, 0, 1, primitive_buffers_no_nulls1); + FillFixedSizeListLike(3, 0, 0, buffers_no_nulls_no_data); + CheckImport( + ArrayFromJSON(fixed_size_list(int8(), 3), "[[2, 3, 4], [5, 6, 7], [8, 9, 10]]")); + + // Offset in parent + FillPrimitive(AddChild(), 8, 0, 0, primitive_buffers_no_nulls1); + FillListLike(4, 0, 1, list_buffers_no_nulls1); + CheckImport(ArrayFromJSON(list(int8()), "[[], [3, 4, 5], [6], [7, 8]]")); + + FillPrimitive(AddChild(), 9, 0, 0, primitive_buffers_no_nulls1); + FillFixedSizeListLike(3, 0, 1, buffers_no_nulls_no_data); + CheckImport( + ArrayFromJSON(fixed_size_list(int8(), 3), "[[4, 5, 6], [7, 8, 9], [10, 11, 12]]")); + + // Both + FillPrimitive(AddChild(), 8, 0, 2, primitive_buffers_no_nulls1); + FillListLike(4, 0, 1, list_buffers_no_nulls1); + CheckImport(ArrayFromJSON(list(int8()), "[[], [5, 6, 7], [8], [9, 10]]")); + + FillPrimitive(AddChild(), 9, 0, 2, primitive_buffers_no_nulls1); + FillFixedSizeListLike(3, 0, 1, buffers_no_nulls_no_data); + 
CheckImport(ArrayFromJSON(fixed_size_list(int8(), 3), + "[[6, 7, 8], [9, 10, 11], [12, 13, 14]]")); +} + +TEST_F(TestArrayImport, Struct) { + FillStringLike(AddChild(), 3, 0, 0, string_buffers_no_nulls1); + FillPrimitive(AddChild(), 3, -1, 0, primitive_buffers_nulls1); + FillStructLike(3, 0, 0, 2, buffers_no_nulls_no_data); + auto expected = ArrayFromJSON(struct_({field("strs", utf8()), field("ints", uint16())}), + R"([["foo", 513], ["", null], ["bar", 1541]])"); + CheckImport(expected); + + FillStringLike(AddChild(), 3, 0, 0, string_buffers_no_nulls1); + FillPrimitive(AddChild(), 3, 0, 0, primitive_buffers_no_nulls1); + FillStructLike(3, -1, 0, 2, buffers_nulls_no_data1); + expected = ArrayFromJSON(struct_({field("strs", utf8()), field("ints", uint16())}), + R"([["foo", 513], null, ["bar", 1541]])"); + CheckImport(expected); + + FillStringLike(AddChild(), 3, 0, 0, string_buffers_no_nulls1); + FillPrimitive(AddChild(), 3, 0, 0, primitive_buffers_no_nulls1); + FillStructLike(3, -1, 0, 2, buffers_nulls_no_data1); + expected = ArrayFromJSON( + struct_({field("strs", utf8(), /*nullable=*/false), field("ints", uint16())}), + R"([["foo", 513], null, ["bar", 1541]])"); + CheckImport(expected); +} + +TEST_F(TestArrayImport, Union) { + // Sparse + FillStringLike(AddChild(), 4, 0, 0, string_buffers_no_nulls1); + FillPrimitive(AddChild(), 4, -1, 0, primitive_buffers_nulls1); + FillUnionLike(4, 0, 0, 2, sparse_union_buffers_no_nulls1); + auto type = + union_({field("strs", utf8()), field("ints", int8())}, {43, 42}, UnionMode::SPARSE); + auto expected = + ArrayFromJSON(type, R"([[42, 1], [42, null], [43, "bar"], [43, "quux"]])"); + CheckImport(expected); + + // Dense + FillStringLike(AddChild(), 2, 0, 0, string_buffers_no_nulls1); + FillPrimitive(AddChild(), 3, -1, 0, primitive_buffers_nulls1); + FillUnionLike(5, 0, 0, 2, dense_union_buffers_no_nulls1); + type = + union_({field("strs", utf8()), field("ints", int8())}, {43, 42}, UnionMode::DENSE); + expected = + 
ArrayFromJSON(type, R"([[42, 1], [42, null], [43, "foo"], [43, ""], [42, 3]])"); + CheckImport(expected); +} + +TEST_F(TestArrayImport, StructWithOffset) { + // Child + FillStringLike(AddChild(), 3, 0, 1, string_buffers_no_nulls1); + FillPrimitive(AddChild(), 3, 0, 2, primitive_buffers_no_nulls1); + FillStructLike(3, 0, 0, 2, buffers_no_nulls_no_data); + auto expected = ArrayFromJSON(struct_({field("strs", utf8()), field("ints", int8())}), + R"([["", 3], ["bar", 4], ["quux", 5]])"); + CheckImport(expected); + + // Parent and child + FillStringLike(AddChild(), 4, 0, 0, string_buffers_no_nulls1); + FillPrimitive(AddChild(), 4, 0, 2, primitive_buffers_no_nulls1); + FillStructLike(3, 0, 1, 2, buffers_no_nulls_no_data); + expected = ArrayFromJSON(struct_({field("strs", utf8()), field("ints", int8())}), + R"([["", 4], ["bar", 5], ["quux", 6]])"); + CheckImport(expected); +} + +TEST_F(TestArrayImport, Map) { + FillStringLike(AddChild(), 5, 0, 0, string_buffers_no_nulls1); + FillPrimitive(AddChild(), 5, 0, 0, primitive_buffers_no_nulls1); + FillStructLike(AddChild(), 5, 0, 0, 2, buffers_no_nulls_no_data); + FillListLike(3, 1, 0, list_buffers_nulls1); + auto expected = ArrayFromJSON( + map(utf8(), uint8()), + R"([[["foo", 1], ["", 2]], null, [["bar", 3], ["quux", 4], ["xyzzy", 5]]])"); + CheckImport(expected); +} + +TEST_F(TestArrayImport, Dictionary) { + FillStringLike(AddChild(), 4, 0, 0, string_buffers_no_nulls1); + FillPrimitive(6, 0, 0, primitive_buffers_no_nulls4); + FillDictionary(); + + auto dict_values = ArrayFromJSON(utf8(), R"(["foo", "", "bar", "quux"])"); + auto indices = ArrayFromJSON(int8(), "[1, 2, 0, 1, 3, 0]"); + std::shared_ptr expected; + ASSERT_OK(DictionaryArray::FromArrays(dictionary(int8(), utf8()), indices, dict_values, + &expected)); + CheckImport(expected); + + FillStringLike(AddChild(), 4, 0, 0, string_buffers_no_nulls1); + FillPrimitive(6, 0, 0, primitive_buffers_no_nulls4); + FillDictionary(); + + 
ASSERT_OK(DictionaryArray::FromArrays(dictionary(int8(), utf8(), /*ordered=*/true), + indices, dict_values, &expected)); + CheckImport(expected); +} + +TEST_F(TestArrayImport, NestedDictionary) { + FillPrimitive(AddChild(), 6, 0, 0, primitive_buffers_no_nulls1); + FillListLike(AddChild(), 4, 0, 0, list_buffers_no_nulls1); + FillPrimitive(6, 0, 0, primitive_buffers_no_nulls4); + FillDictionary(); + + auto dict_values = ArrayFromJSON(list(int8()), "[[1, 2], [], [3, 4, 5], [6]]"); + auto indices = ArrayFromJSON(int8(), "[1, 2, 0, 1, 3, 0]"); + std::shared_ptr expected; + ASSERT_OK(DictionaryArray::FromArrays(dictionary(int8(), list(int8())), + indices, dict_values, &expected)); + CheckImport(expected); + + FillStringLike(AddChild(), 4, 0, 0, string_buffers_no_nulls1); + FillPrimitive(AddChild(), 6, 0, 0, primitive_buffers_no_nulls4); + FillDictionary(LastChild()); + FillListLike(3, 0, 0, list_buffers_no_nulls1); + + dict_values = ArrayFromJSON(utf8(), R"(["foo", "", "bar", "quux"])"); + indices = ArrayFromJSON(int8(), "[1, 2, 0, 1, 3, 0]"); + std::shared_ptr dict_array; + ASSERT_OK(DictionaryArray::FromArrays(dictionary(int8(), utf8()), indices, dict_values, + &dict_array)); + auto offsets = ArrayFromJSON(int32(), "[0, 2, 2, 5]"); + ASSERT_OK(ListArray::FromArrays(*offsets, *dict_array, default_memory_pool(), + &expected)); + CheckImport(expected); +} + +TEST_F(TestArrayImport, DictionaryWithOffset) { + FillStringLike(AddChild(), 3, 0, 1, string_buffers_no_nulls1); + FillPrimitive(3, 0, 0, primitive_buffers_no_nulls4); + FillDictionary(); + + auto dict_values = ArrayFromJSON(utf8(), R"(["", "bar", "quux"])"); + auto indices = ArrayFromJSON(int8(), "[1, 2, 0]"); + std::shared_ptr expected; + ASSERT_OK(DictionaryArray::FromArrays(dictionary(int8(), utf8()), indices, dict_values, + &expected)); + CheckImport(expected); + + FillStringLike(AddChild(), 4, 0, 0, string_buffers_no_nulls1); + FillPrimitive(4, 0, 2, primitive_buffers_no_nulls4); + FillDictionary(); + + 
dict_values = ArrayFromJSON(utf8(), R"(["foo", "", "bar", "quux"])"); + indices = ArrayFromJSON(int8(), "[0, 1, 3, 0]"); + ASSERT_OK(DictionaryArray::FromArrays(dictionary(int8(), utf8()), indices, dict_values, + &expected)); + CheckImport(expected); +} + +TEST_F(TestArrayImport, PrimitiveError) { + // Bad number of buffers + FillPrimitive(3, 0, 0, primitive_buffers_no_nulls1); + c_struct_.n_buffers = 1; + CheckImportError(int8()); + + // Zero null bitmap but non-zero null_count + FillPrimitive(3, 1, 0, primitive_buffers_no_nulls1); + CheckImportError(int8()); +} + +TEST_F(TestArrayImport, StructError) { + // Bad number of children + FillStringLike(AddChild(), 3, 0, 0, string_buffers_no_nulls1); + FillPrimitive(AddChild(), 3, -1, 0, primitive_buffers_nulls1); + FillStructLike(3, 0, 0, 2, buffers_no_nulls_no_data); + CheckImportError(struct_({field("strs", utf8())})); +} + +TEST_F(TestArrayImport, MapError) { + // Bad number of (struct) children in map child + FillStringLike(AddChild(), 5, 0, 0, string_buffers_no_nulls1); + FillStructLike(AddChild(), 5, 0, 0, 1, buffers_no_nulls_no_data); + FillListLike(3, 1, 0, list_buffers_nulls1); + CheckImportError(map(utf8(), uint8())); +} + +TEST_F(TestArrayImport, DictionaryError) { + // Missing dictionary field + FillPrimitive(3, 0, 0, primitive_buffers_no_nulls4); + CheckImportError(dictionary(int8(), utf8())); + + // Unexpected dictionary field + FillStringLike(AddChild(), 4, 0, 0, string_buffers_no_nulls1); + FillPrimitive(6, 0, 0, primitive_buffers_no_nulls4); + FillDictionary(); + CheckImportError(int8()); +} + +TEST_F(TestArrayImport, RecursionError) { + // Infinite loop through children + FillStringLike(AddChild(), 3, 0, 0, string_buffers_no_nulls1); + FillStructLike(AddChild(), 3, 0, 0, 1, buffers_no_nulls_no_data); + FillStructLike(3, 0, 0, 1, buffers_no_nulls_no_data); + c_struct_.children[0] = &c_struct_; + CheckImportError(struct_({field("ints", struct_({field("ints", int8())}))})); +} + +TEST_F(TestArrayImport, 
ImportRecordBatch) { + auto schema = ::arrow::schema( + {field("strs", utf8(), /*nullable=*/false), field("ints", uint16())}); + auto expected_strs = ArrayFromJSON(utf8(), R"(["", "bar", "quux"])"); + auto expected_ints = ArrayFromJSON(uint16(), "[513, null, 1541]"); + + FillStringLike(AddChild(), 3, 0, 1, string_buffers_no_nulls1); + FillPrimitive(AddChild(), 3, -1, 0, primitive_buffers_nulls1); + FillStructLike(3, 0, 0, 2, buffers_no_nulls_no_data); + + auto expected = RecordBatch::Make(schema, 3, {expected_strs, expected_ints}); + CheckImport(expected); +} + +TEST_F(TestArrayImport, ImportRecordBatchError) { + // Struct with non-zero parent offset + FillStringLike(AddChild(), 4, 0, 0, string_buffers_no_nulls1); + FillPrimitive(AddChild(), 4, 0, 0, primitive_buffers_no_nulls1); + FillStructLike(3, 0, 1, 2, buffers_no_nulls_no_data); + auto schema = ::arrow::schema({field("strs", utf8()), field("ints", uint16())}); + CheckImportError(schema); + + // Struct with nulls in parent + FillStringLike(AddChild(), 3, 0, 0, string_buffers_no_nulls1); + FillPrimitive(AddChild(), 3, 0, 0, primitive_buffers_no_nulls1); + FillStructLike(3, 1, 0, 2, buffers_nulls_no_data1); + CheckImportError(schema); +} + +TEST_F(TestArrayImport, ImportArrayAndType) { + // Test importing both array and its type at the same time + SchemaStructBuilder schema_builder; + schema_builder.FillPrimitive("c"); + SchemaReleaseCallback schema_cb(&schema_builder.c_struct_); + + FillPrimitive(3, 0, 0, primitive_buffers_no_nulls1); + ArrayReleaseCallback array_cb(&c_struct_); + + ASSERT_OK_AND_ASSIGN(auto array, ImportArray(&c_struct_, &schema_builder.c_struct_)); + AssertArraysEqual(*array, *ArrayFromJSON(int8(), "[1, 2, 3]")); + schema_cb.AssertCalled(); // was released + array_cb.AssertNotCalled(); + ASSERT_TRUE(ArrowArrayIsReleased(&c_struct_)); // was moved + array.reset(); + array_cb.AssertCalled(); +} + +TEST_F(TestArrayImport, ImportArrayAndTypeError) { + // On error, both structs are released + 
SchemaStructBuilder schema_builder; + schema_builder.FillPrimitive("cc"); + SchemaReleaseCallback schema_cb(&schema_builder.c_struct_); + + FillPrimitive(3, 0, 0, primitive_buffers_no_nulls1); + ArrayReleaseCallback array_cb(&c_struct_); + + ASSERT_RAISES(Invalid, ImportArray(&c_struct_, &schema_builder.c_struct_)); + schema_cb.AssertCalled(); + array_cb.AssertCalled(); +} + +TEST_F(TestArrayImport, ImportRecordBatchAndSchema) { + // Test importing both record batch and its schema at the same time + auto schema = ::arrow::schema({field("strs", utf8()), field("ints", uint16())}); + auto expected_strs = ArrayFromJSON(utf8(), R"(["", "bar", "quux"])"); + auto expected_ints = ArrayFromJSON(uint16(), "[513, null, 1541]"); + + SchemaStructBuilder schema_builder; + schema_builder.FillPrimitive(schema_builder.AddChild(), "u", "strs"); + schema_builder.FillPrimitive(schema_builder.AddChild(), "S", "ints"); + schema_builder.FillStructLike("+s", 2); + SchemaReleaseCallback schema_cb(&schema_builder.c_struct_); + + FillStringLike(AddChild(), 3, 0, 1, string_buffers_no_nulls1); + FillPrimitive(AddChild(), 3, -1, 0, primitive_buffers_nulls1); + FillStructLike(3, 0, 0, 2, buffers_no_nulls_no_data); + ArrayReleaseCallback array_cb(&c_struct_); + + ASSERT_OK_AND_ASSIGN(auto batch, + ImportRecordBatch(&c_struct_, &schema_builder.c_struct_)); + auto expected = RecordBatch::Make(schema, 3, {expected_strs, expected_ints}); + AssertBatchesEqual(*batch, *expected); + schema_cb.AssertCalled(); // was released + array_cb.AssertNotCalled(); + ASSERT_TRUE(ArrowArrayIsReleased(&c_struct_)); // was moved + batch.reset(); + array_cb.AssertCalled(); +} + +TEST_F(TestArrayImport, ImportRecordBatchAndSchemaError) { + // On error, both structs are released + SchemaStructBuilder schema_builder; + schema_builder.FillPrimitive("cc"); + SchemaReleaseCallback schema_cb(&schema_builder.c_struct_); + + FillStringLike(AddChild(), 3, 0, 1, string_buffers_no_nulls1); + FillPrimitive(AddChild(), 3, -1, 0, 
primitive_buffers_nulls1); + FillStructLike(3, 0, 0, 2, buffers_no_nulls_no_data); + ArrayReleaseCallback array_cb(&c_struct_); + + ASSERT_RAISES(Invalid, ImportRecordBatch(&c_struct_, &schema_builder.c_struct_)); + schema_cb.AssertCalled(); + array_cb.AssertCalled(); +} + +//////////////////////////////////////////////////////////////////////////// +// C++ -> C -> C++ schema roundtripping tests + +class TestSchemaRoundtrip : public ::testing::Test { + public: + void SetUp() override { pool_ = default_memory_pool(); } + + template + void TestWithTypeFactory(TypeFactory&& factory) { + std::shared_ptr type, actual; + struct ArrowSchema c_schema{}; // zeroed + SchemaExportGuard schema_guard(&c_schema); + + auto orig_bytes = pool_->bytes_allocated(); + + type = factory(); + auto type_use_count = type.use_count(); + ASSERT_OK(ExportType(*type, &c_schema)); + ASSERT_GT(pool_->bytes_allocated(), orig_bytes); + // Export stores no reference to the type + ASSERT_EQ(type_use_count, type.use_count()); + type.reset(); + + // Recreate the type + ASSERT_OK_AND_ASSIGN(actual, ImportType(&c_schema)); + type = factory(); + AssertTypeEqual(*type, *actual); + type.reset(); + actual.reset(); + + ASSERT_EQ(pool_->bytes_allocated(), orig_bytes); + } + + template + void TestWithSchemaFactory(SchemaFactory&& factory) { + std::shared_ptr schema, actual; + struct ArrowSchema c_schema{}; // zeroed + SchemaExportGuard schema_guard(&c_schema); + + auto orig_bytes = pool_->bytes_allocated(); + + schema = factory(); + auto schema_use_count = schema.use_count(); + ASSERT_OK(ExportSchema(*schema, &c_schema)); + ASSERT_GT(pool_->bytes_allocated(), orig_bytes); + // Export stores no reference to the schema + ASSERT_EQ(schema_use_count, schema.use_count()); + schema.reset(); + + // Recreate the schema + ASSERT_OK_AND_ASSIGN(actual, ImportSchema(&c_schema)); + schema = factory(); + AssertSchemaEqual(*schema, *actual); + schema.reset(); + actual.reset(); + + ASSERT_EQ(pool_->bytes_allocated(), 
orig_bytes); + } + + protected: + MemoryPool* pool_; +}; + +TEST_F(TestSchemaRoundtrip, Null) { + TestWithTypeFactory(null); +} + +TEST_F(TestSchemaRoundtrip, Primitive) { + TestWithTypeFactory(int32); + TestWithTypeFactory(boolean); + TestWithTypeFactory(float16); + + TestWithTypeFactory(std::bind(decimal, 19, 4)); + TestWithTypeFactory(std::bind(fixed_size_binary, 3)); + TestWithTypeFactory(binary); + TestWithTypeFactory(large_utf8); +} + +TEST_F(TestSchemaRoundtrip, Temporal) { + TestWithTypeFactory(date32); + TestWithTypeFactory(day_time_interval); + TestWithTypeFactory(month_interval); + TestWithTypeFactory(std::bind(time64, TimeUnit::NANO)); + TestWithTypeFactory(std::bind(duration, TimeUnit::MICRO)); + TestWithTypeFactory([]() { return arrow::timestamp(TimeUnit::MICRO, "Europe/Paris"); }); +} + +TEST_F(TestSchemaRoundtrip, List) { + TestWithTypeFactory([]() { return list(utf8()); }); + TestWithTypeFactory([]() { return large_list(list(utf8())); }); + TestWithTypeFactory([]() { return fixed_size_list(utf8(), 5); }); + TestWithTypeFactory([]() { return list(fixed_size_list(utf8(), 5)); }); +} + +TEST_F(TestSchemaRoundtrip, Struct) { + auto f1 = field("f1", utf8(), /*nullable=*/ false); + auto f2 = field("f2", list(decimal(19, 4))); + + TestWithTypeFactory([&]() { return struct_({f1, f2}); }); + f2 = f2->WithMetadata(key_value_metadata(kMetadataKeys2, kMetadataValues2)); + TestWithTypeFactory([&]() { return struct_({f1, f2}); }); +} + +TEST_F(TestSchemaRoundtrip, Union) { + auto f1 = field("f1", utf8(), /*nullable=*/ false); + auto f2 = field("f2", list(decimal(19, 4))); + auto type_codes = std::vector{42, 43}; + + TestWithTypeFactory([&]() { return union_({f1, f2}, type_codes, UnionMode::SPARSE); }); + f2 = f2->WithMetadata(key_value_metadata(kMetadataKeys2, kMetadataValues2)); + TestWithTypeFactory([&]() { return union_({f1, f2}, type_codes, UnionMode::DENSE); }); +} + +TEST_F(TestSchemaRoundtrip, Dictionary) { + TestWithTypeFactory([&]() { return 
dictionary(int32(), utf8()); }); + TestWithTypeFactory([&]() { return dictionary(int32(), utf8(), /*ordered=*/ true); }); + + TestWithTypeFactory([&]() { return dictionary(int32(), list(utf8())); }); + TestWithTypeFactory([&]() { return list(dictionary(int32(), list(utf8()))); }); +} + +TEST_F(TestSchemaRoundtrip, Map) { + TestWithTypeFactory([&]() { return map(utf8(), int32()); }); + TestWithTypeFactory([&]() { return map(list(utf8()), int32()); }); + TestWithTypeFactory([&]() { return list(map(list(utf8()), int32())); }); +} + +TEST_F(TestSchemaRoundtrip, Schema) { + auto f1 = field("f1", utf8(), /*nullable=*/ false); + auto f2 = field("f2", list(decimal(19, 4))); + auto md1 = key_value_metadata(kMetadataKeys1, kMetadataValues1); + auto md2 = key_value_metadata(kMetadataKeys2, kMetadataValues2); + + TestWithSchemaFactory([&]() { return schema({f1, f2}); }); + f2 = f2->WithMetadata(md2); + TestWithSchemaFactory([&]() { return schema({f1, f2}); }); + TestWithSchemaFactory([&]() { return schema({f1, f2}, md1); }); +} + +//////////////////////////////////////////////////////////////////////////// +// C++ -> C -> C++ data roundtripping tests + +class TestArrayRoundtrip : public ::testing::Test { + public: + using ArrayFactory = std::function*)>; + + void SetUp() override { pool_ = default_memory_pool(); } + + static ArrayFactory JSONArrayFactory(std::shared_ptr type, const char* json) { + return [=](std::shared_ptr* out) -> Status { + return ::arrow::ipc::internal::json::ArrayFromJSON(type, json, out); + }; + } + + static ArrayFactory SlicedArrayFactory(ArrayFactory factory) { + return [=](std::shared_ptr* out) -> Status { + std::shared_ptr arr; + RETURN_NOT_OK(factory(&arr)); + DCHECK_GE(arr->length(), 2); + *out = arr->Slice(1, arr->length() - 2); + return Status::OK(); + }; + } + + template + void TestWithArrayFactory(ArrayFactory&& factory) { + std::shared_ptr array; + struct ArrowArray c_array{}; + struct ArrowSchema c_schema{}; + ArrayExportGuard 
array_guard(&c_array); + SchemaExportGuard schema_guard(&c_schema); + + auto orig_bytes = pool_->bytes_allocated(); + + ASSERT_OK(factory(&array)); + ASSERT_OK(ExportType(*array->type(), &c_schema)); + ASSERT_OK(ExportArray(*array, &c_array)); + + auto new_bytes = pool_->bytes_allocated(); + if (array->type_id() != Type::NA) { + ASSERT_GT(new_bytes, orig_bytes); + } + + array.reset(); + ASSERT_EQ(pool_->bytes_allocated(), new_bytes); + ASSERT_OK_AND_ASSIGN(array, ImportArray(&c_array, &c_schema)); + ASSERT_OK(array->ValidateFull()); + ASSERT_TRUE(ArrowSchemaIsReleased(&c_schema)); + ASSERT_TRUE(ArrowArrayIsReleased(&c_array)); + + // Re-export and re-import, now both at once + ASSERT_OK(ExportArray(*array, &c_array, &c_schema)); + array.reset(); + ASSERT_OK_AND_ASSIGN(array, ImportArray(&c_array, &c_schema)); + ASSERT_OK(array->ValidateFull()); + ASSERT_TRUE(ArrowSchemaIsReleased(&c_schema)); + ASSERT_TRUE(ArrowArrayIsReleased(&c_array)); + + // Check value of imported array + { + std::shared_ptr expected; + ASSERT_OK(factory(&expected)); + AssertTypeEqual(*expected->type(), *array->type()); + AssertArraysEqual(*expected, *array, true); + } + array.reset(); + ASSERT_EQ(pool_->bytes_allocated(), orig_bytes); + } + + template + void TestWithBatchFactory(BatchFactory&& factory) { + std::shared_ptr batch; + struct ArrowArray c_array{}; + struct ArrowSchema c_schema{}; + ArrayExportGuard array_guard(&c_array); + SchemaExportGuard schema_guard(&c_schema); + + auto orig_bytes = pool_->bytes_allocated(); + ASSERT_OK(factory(&batch)); + ASSERT_OK(ExportSchema(*batch->schema(), &c_schema)); + ASSERT_OK(ExportRecordBatch(*batch, &c_array)); + + auto new_bytes = pool_->bytes_allocated(); + batch.reset(); + ASSERT_EQ(pool_->bytes_allocated(), new_bytes); + ASSERT_OK_AND_ASSIGN(batch, ImportRecordBatch(&c_array, &c_schema)); + ASSERT_OK(batch->ValidateFull()); + ASSERT_TRUE(ArrowSchemaIsReleased(&c_schema)); + ASSERT_TRUE(ArrowArrayIsReleased(&c_array)); + + // Re-export and 
re-import, now both at once + ASSERT_OK(ExportRecordBatch(*batch, &c_array, &c_schema)); + batch.reset(); + ASSERT_OK_AND_ASSIGN(batch, ImportRecordBatch(&c_array, &c_schema)); + ASSERT_OK(batch->ValidateFull()); + ASSERT_TRUE(ArrowSchemaIsReleased(&c_schema)); + ASSERT_TRUE(ArrowArrayIsReleased(&c_array)); + + // Check value of imported record batch + { + std::shared_ptr expected; + ASSERT_OK(factory(&expected)); + AssertSchemaEqual(*expected->schema(), *batch->schema()); + AssertBatchesEqual(*expected, *batch); + } + batch.reset(); + ASSERT_EQ(pool_->bytes_allocated(), orig_bytes); + } + + void TestWithJSON(std::shared_ptr type, const char* json) { + TestWithArrayFactory(JSONArrayFactory(type, json)); + } + + void TestWithJSONSliced(std::shared_ptr type, const char* json) { + TestWithArrayFactory(SlicedArrayFactory(JSONArrayFactory(type, json))); + } + + protected: + MemoryPool* pool_; +}; + +TEST_F(TestArrayRoundtrip, Null) { + TestWithJSON(null(), "[]"); + TestWithJSON(null(), "[null, null]"); + + TestWithJSONSliced(null(), "[null, null]"); + TestWithJSONSliced(null(), "[null, null, null]"); +} + +TEST_F(TestArrayRoundtrip, Primitive) { + TestWithJSON(int32(), "[]"); + TestWithJSON(int32(), "[4, 5, null]"); + + TestWithJSONSliced(int32(), "[4, 5]"); + TestWithJSONSliced(int32(), "[4, 5, 6, null]"); +} + +TEST_F(TestArrayRoundtrip, Nested) { + TestWithJSON(list(int32()), "[]"); + TestWithJSON(list(int32()), "[[4, 5], [6, null], null]"); + + TestWithJSONSliced(list(int32()), "[[4, 5], [6, null], null]"); + + auto type = struct_({field("ints", int16()), field("bools", boolean())}); + TestWithJSON(type, "[]"); + TestWithJSON(type, "[[4, true], [5, false]]"); + TestWithJSON(type, "[[4, null], null, [5, false]]"); + + TestWithJSONSliced(type, "[[4, null], null, [5, false]]"); + + // With nullable = false and metadata + auto f0 = field("ints", int16(), /*nullable=*/false); + auto f1 = field("bools", boolean(), /*nullable=*/true, + key_value_metadata(kMetadataKeys1, 
kMetadataValues1)); + type = struct_({f0, f1}); + TestWithJSON(type, "[]"); + TestWithJSON(type, "[[4, true], [5, null]]"); + + TestWithJSONSliced(type, "[[4, true], [5, null], [6, false]]"); + + // Map type + type = map(utf8(), int32()); + const char* json = R"([[["foo", 123], ["bar", -456]], null, + [["foo", null]], []])"; + TestWithJSON(type, json); + TestWithJSONSliced(type, json); + + type = map(utf8(), int32(), /*keys_sorted=*/ true); + TestWithJSON(type, json); + TestWithJSONSliced(type, json); +} + +TEST_F(TestArrayRoundtrip, Dictionary) { + { + auto factory = [](std::shared_ptr* out) -> Status { + auto values = ArrayFromJSON(utf8(), R"(["foo", "bar", "quux"])"); + auto indices = ArrayFromJSON(int32(), "[0, 2, 1, null, 1]"); + return DictionaryArray::FromArrays(dictionary(indices->type(), values->type()), + indices, values, out); + }; + TestWithArrayFactory(factory); + TestWithArrayFactory(SlicedArrayFactory(factory)); + } + { + auto factory = [](std::shared_ptr* out) -> Status { + auto values = ArrayFromJSON(list(utf8()), R"([["abc", "def"], ["efg"], []])"); + auto indices = ArrayFromJSON(int32(), "[0, 2, 1, null, 1]"); + return DictionaryArray::FromArrays( + dictionary(indices->type(), values->type(), /*ordered=*/true), indices, values, + out); + }; + TestWithArrayFactory(factory); + TestWithArrayFactory(SlicedArrayFactory(factory)); + } +} + +TEST_F(TestArrayRoundtrip, RecordBatch) { + auto schema = ::arrow::schema( + {field("ints", int16()), field("bools", boolean(), /*nullable=*/false)}); + auto arr0 = ArrayFromJSON(int16(), "[1, 2, null]"); + auto arr1 = ArrayFromJSON(boolean(), "[false, true, false]"); + + { + auto factory = [&](std::shared_ptr* out) -> Status { + *out = RecordBatch::Make(schema, 3, {arr0, arr1}); + return Status::OK(); + }; + TestWithBatchFactory(factory); + } + { + // With schema and field metadata + auto factory = [&](std::shared_ptr* out) -> Status { + auto f0 = schema->field(0); + auto f1 = schema->field(1); + f1 = 
f1->WithMetadata(key_value_metadata(kMetadataKeys1, kMetadataValues1)); + auto schema_with_md = ::arrow::schema({f0, f1}, + key_value_metadata(kMetadataKeys2, kMetadataValues2)); + *out = RecordBatch::Make(schema_with_md, 3, {arr0, arr1}); + return Status::OK(); + }; + TestWithBatchFactory(factory); + } +} + +// TODO C -> C++ -> C roundtripping tests? + +} // namespace arrow diff --git a/cpp/src/arrow/c/helpers.h b/cpp/src/arrow/c/helpers.h new file mode 100644 index 00000000000..714a7f95858 --- /dev/null +++ b/cpp/src/arrow/c/helpers.h @@ -0,0 +1,73 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#pragma once + +#include +#include + +#include "arrow/c/abi.h" + +#ifdef __cplusplus +extern "C" { +#endif + +inline int ArrowSchemaIsReleased(const struct ArrowSchema* schema) { + return schema->release == NULL; +} + +inline void ArrowSchemaMarkReleased(struct ArrowSchema* schema) { + schema->release = NULL; +} + +inline void ArrowSchemaMove(struct ArrowSchema* src, struct ArrowSchema* dest) { + assert(dest != src); + assert(!ArrowSchemaIsReleased(src)); + memcpy(dest, src, sizeof(struct ArrowSchema)); + ArrowSchemaMarkReleased(src); +} + +inline void ArrowSchemaRelease(struct ArrowSchema* schema) { + if (!ArrowSchemaIsReleased(schema)) { + schema->release(schema); + assert(ArrowSchemaIsReleased(schema)); + } +} + +inline int ArrowArrayIsReleased(const struct ArrowArray* array) { + return array->release == NULL; +} + +inline void ArrowArrayMarkReleased(struct ArrowArray* array) { array->release = NULL; } + +inline void ArrowArrayMove(struct ArrowArray* src, struct ArrowArray* dest) { + assert(dest != src); + assert(!ArrowArrayIsReleased(src)); + memcpy(dest, src, sizeof(struct ArrowArray)); + ArrowArrayMarkReleased(src); +} + +inline void ArrowArrayRelease(struct ArrowArray* array) { + if (!ArrowArrayIsReleased(array)) { + array->release(array); + assert(ArrowArrayIsReleased(array)); + } +} + +#ifdef __cplusplus +} +#endif diff --git a/cpp/src/arrow/c/util_internal.h b/cpp/src/arrow/c/util_internal.h new file mode 100644 index 00000000000..796f71d8b89 --- /dev/null +++ b/cpp/src/arrow/c/util_internal.h @@ -0,0 +1,68 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +namespace arrow { +namespace internal { + +struct SchemaExportTraits { + typedef struct ArrowSchema CType; + static constexpr auto IsReleasedFunc = &ArrowSchemaIsReleased; + static constexpr auto ReleaseFunc = &ArrowSchemaRelease; +}; + +struct ArrayExportTraits { + typedef struct ArrowArray CType; + static constexpr auto IsReleasedFunc = &ArrowArrayIsReleased; + static constexpr auto ReleaseFunc = &ArrowArrayRelease; +}; + +// A RAII-style object to release a C Array / Schema struct at block scope exit. +template +class ExportGuard { + public: + using CType = typename Traits::CType; + + explicit ExportGuard(CType* c_export) : c_export_(c_export) {} + + ARROW_DEFAULT_MOVE_AND_ASSIGN(ExportGuard); + + ~ExportGuard() { Release(); } + + void Detach() { c_export_ = nullptr; } + + void Reset(CType* c_export) { c_export_ = c_export; } + + void Release() { + if (c_export_) { + Traits::ReleaseFunc(c_export_); + c_export_ = nullptr; + } + } + + private: + ARROW_DISALLOW_COPY_AND_ASSIGN(ExportGuard); + + CType* c_export_; +}; + +using SchemaExportGuard = ExportGuard; +using ArrayExportGuard = ExportGuard; + +} // namespace internal +} // namespace arrow diff --git a/cpp/src/arrow/compare.cc b/cpp/src/arrow/compare.cc index d2322009ea8..52dd64f57af 100644 --- a/cpp/src/arrow/compare.cc +++ b/cpp/src/arrow/compare.cc @@ -924,7 +924,8 @@ Status PrintDiff(const Array& left, const Array& right, std::ostream* os) { } if (!left.type()->Equals(right.type())) { - *os << "# Array types differed: " << *left.type() << " vs " << *right.type(); + *os << 
"# Array types differed: " << *left.type() << " vs " << *right.type() + << std::endl; return Status::OK(); } diff --git a/cpp/src/arrow/ipc/json_simple.cc b/cpp/src/arrow/ipc/json_simple.cc index 16f34575523..06320b844d2 100644 --- a/cpp/src/arrow/ipc/json_simple.cc +++ b/cpp/src/arrow/ipc/json_simple.cc @@ -39,6 +39,7 @@ namespace internal { namespace json { using ::arrow::internal::checked_cast; +using ::arrow::internal::checked_pointer_cast; static constexpr auto kParseFlags = rj::kParseFullPrecisionFlag | rj::kParseNanAndInfFlag; @@ -155,6 +156,7 @@ class BooleanConverter final : public ConcreteConverter { // Convert single signed integer value (also {Date,Time}{32,64} and Timestamp) template enable_if_physical_signed_integer ConvertNumber(const rj::Value& json_obj, + const DataType& type, typename T::c_type* out) { if (json_obj.IsInt64()) { int64_t v64 = json_obj.GetInt64(); @@ -162,8 +164,7 @@ enable_if_physical_signed_integer ConvertNumber(const rj::Value& json if (*out == v64) { return Status::OK(); } else { - return Status::Invalid("Value ", v64, " out of bounds for ", - TypeTraits::type_singleton()); + return Status::Invalid("Value ", v64, " out of bounds for ", type); } } else { *out = static_cast(0); @@ -174,6 +175,7 @@ enable_if_physical_signed_integer ConvertNumber(const rj::Value& json // Convert single unsigned integer value template enable_if_physical_unsigned_integer ConvertNumber(const rj::Value& json_obj, + const DataType& type, typename T::c_type* out) { if (json_obj.IsUint64()) { uint64_t v64 = json_obj.GetUint64(); @@ -181,8 +183,7 @@ enable_if_physical_unsigned_integer ConvertNumber(const rj::Value& js if (*out == v64) { return Status::OK(); } else { - return Status::Invalid("Value ", v64, " out of bounds for ", - TypeTraits::type_singleton()); + return Status::Invalid("Value ", v64, " out of bounds for ", type); } } else { *out = static_cast(0); @@ -193,6 +194,7 @@ enable_if_physical_unsigned_integer ConvertNumber(const rj::Value& js // 
Convert single floating point value template enable_if_physical_floating_point ConvertNumber(const rj::Value& json_obj, + const DataType& type, typename T::c_type* out) { if (json_obj.IsNumber()) { *out = static_cast(json_obj.GetDouble()); @@ -212,9 +214,13 @@ class IntegerConverter final : public ConcreteConverter> static constexpr auto is_signed = std::is_signed::value; public: - explicit IntegerConverter(const std::shared_ptr& type) { - this->type_ = type; - builder_ = std::make_shared>(); + explicit IntegerConverter(const std::shared_ptr& type) { this->type_ = type; } + + Status Init() override { + std::unique_ptr builder; + RETURN_NOT_OK(MakeBuilder(default_memory_pool(), this->type_, &builder)); + builder_ = checked_pointer_cast>(std::move(builder)); + return Status::OK(); } Status AppendNull() override { return builder_->AppendNull(); } @@ -224,7 +230,7 @@ class IntegerConverter final : public ConcreteConverter> return AppendNull(); } c_type value; - RETURN_NOT_OK(ConvertNumber(json_obj, &value)); + RETURN_NOT_OK(ConvertNumber(json_obj, *this->type_, &value)); return builder_->Append(value); } @@ -254,7 +260,7 @@ class FloatConverter final : public ConcreteConverter> { return AppendNull(); } c_type value; - RETURN_NOT_OK(ConvertNumber(json_obj, &value)); + RETURN_NOT_OK(ConvertNumber(json_obj, *this->type_, &value)); return builder_->Append(value); } @@ -321,7 +327,7 @@ class TimestampConverter final : public ConcreteConverter { } int64_t value; if (json_obj.IsNumber()) { - RETURN_NOT_OK(ConvertNumber(json_obj, &value)); + RETURN_NOT_OK(ConvertNumber(json_obj, *this->type_, &value)); } else if (json_obj.IsString()) { auto view = util::string_view(json_obj.GetString(), json_obj.GetStringLength()); if (!from_string_(view.data(), view.size(), &value)) { @@ -340,6 +346,43 @@ class TimestampConverter final : public ConcreteConverter { std::shared_ptr builder_; }; +// ------------------------------------------------------------------------ +// Converter for 
day-time interval arrays + +class DayTimeIntervalConverter final + : public ConcreteConverter { + public: + explicit DayTimeIntervalConverter(const std::shared_ptr& type) { + this->type_ = type; + builder_ = std::make_shared(default_memory_pool()); + } + + Status AppendNull() override { return builder_->AppendNull(); } + + Status AppendValue(const rj::Value& json_obj) override { + if (json_obj.IsNull()) { + return AppendNull(); + } + DayTimeIntervalType::DayMilliseconds value; + if (!json_obj.IsArray()) { + return JSONTypeError("array", json_obj.GetType()); + } + if (json_obj.Size() != 2) { + return Status::Invalid( + "day time interval pair must have exactly two elements, had ", json_obj.Size()); + } + RETURN_NOT_OK(ConvertNumber(json_obj[0], *this->type_, &value.days)); + RETURN_NOT_OK( + ConvertNumber(json_obj[1], *this->type_, &value.milliseconds)); + return builder_->Append(value); + } + + std::shared_ptr builder() override { return builder_; } + + private: + std::shared_ptr builder_; +}; + // ------------------------------------------------------------------------ // Converter for binary and string arrays @@ -713,6 +756,11 @@ Status GetConverter(const std::shared_ptr& type, std::shared_ptr* out) { std::shared_ptr res; + auto not_implemented = [&]() -> Status { + return Status::NotImplemented("JSON conversion to ", type->ToString(), + " not implemented"); + }; + #define SIMPLE_CONVERTER_CASE(ID, CLASS) \ case ID: \ res = std::make_shared(type); \ @@ -722,16 +770,17 @@ Status GetConverter(const std::shared_ptr& type, SIMPLE_CONVERTER_CASE(Type::INT8, IntegerConverter) SIMPLE_CONVERTER_CASE(Type::INT16, IntegerConverter) SIMPLE_CONVERTER_CASE(Type::INT32, IntegerConverter) - SIMPLE_CONVERTER_CASE(Type::TIME32, IntegerConverter) - SIMPLE_CONVERTER_CASE(Type::DATE32, IntegerConverter) SIMPLE_CONVERTER_CASE(Type::INT64, IntegerConverter) - SIMPLE_CONVERTER_CASE(Type::TIME64, IntegerConverter) - SIMPLE_CONVERTER_CASE(Type::TIMESTAMP, TimestampConverter) - 
SIMPLE_CONVERTER_CASE(Type::DATE64, IntegerConverter) SIMPLE_CONVERTER_CASE(Type::UINT8, IntegerConverter) SIMPLE_CONVERTER_CASE(Type::UINT16, IntegerConverter) SIMPLE_CONVERTER_CASE(Type::UINT32, IntegerConverter) SIMPLE_CONVERTER_CASE(Type::UINT64, IntegerConverter) + SIMPLE_CONVERTER_CASE(Type::TIMESTAMP, TimestampConverter) + SIMPLE_CONVERTER_CASE(Type::DATE32, IntegerConverter) + SIMPLE_CONVERTER_CASE(Type::DATE64, IntegerConverter) + SIMPLE_CONVERTER_CASE(Type::TIME32, IntegerConverter) + SIMPLE_CONVERTER_CASE(Type::TIME64, IntegerConverter) + SIMPLE_CONVERTER_CASE(Type::DURATION, IntegerConverter) SIMPLE_CONVERTER_CASE(Type::NA, NullConverter) SIMPLE_CONVERTER_CASE(Type::BOOL, BooleanConverter) SIMPLE_CONVERTER_CASE(Type::HALF_FLOAT, IntegerConverter) @@ -749,10 +798,21 @@ Status GetConverter(const std::shared_ptr& type, SIMPLE_CONVERTER_CASE(Type::FIXED_SIZE_BINARY, FixedSizeBinaryConverter) SIMPLE_CONVERTER_CASE(Type::DECIMAL, DecimalConverter) SIMPLE_CONVERTER_CASE(Type::UNION, UnionConverter) - default: { - return Status::NotImplemented("JSON conversion to ", type->ToString(), - " not implemented"); + case Type::INTERVAL: { + switch (checked_cast(*type).interval_type()) { + case IntervalType::MONTHS: + res = std::make_shared>(type); + break; + case IntervalType::DAY_TIME: + res = std::make_shared(type); + break; + default: + return not_implemented(); + } + break; } + default: + return not_implemented(); } #undef SIMPLE_CONVERTER_CASE diff --git a/cpp/src/arrow/ipc/json_simple_test.cc b/cpp/src/arrow/ipc/json_simple_test.cc index b83c87d8c56..af75e1ce644 100644 --- a/cpp/src/arrow/ipc/json_simple_test.cc +++ b/cpp/src/arrow/ipc/json_simple_test.cc @@ -352,6 +352,59 @@ TEST(TestTimestamp, Basics) { {0, 951782400000000000LL, -2203977600000000000LL}); } +TEST(TestDate, Basics) { + auto type = date32(); + AssertJSONArray(type, R"([5, null, 42])", {true, false, true}, {5, 0, 42}); + type = date64(); + AssertJSONArray(type, R"([1, null, 9999999999999])", {true, 
false, true}, + {1, 0, 9999999999999LL}); +} + +TEST(TestTime, Basics) { + auto type = time32(TimeUnit::SECOND); + AssertJSONArray(type, R"([5, null, 42])", {true, false, true}, {5, 0, 42}); + type = time32(TimeUnit::MILLI); + AssertJSONArray(type, R"([5, null, 42])", {true, false, true}, {5, 0, 42}); + + type = time64(TimeUnit::MICRO); + AssertJSONArray(type, R"([1, null, 9999999999999])", {true, false, true}, + {1, 0, 9999999999999LL}); + type = time64(TimeUnit::NANO); + AssertJSONArray(type, R"([1, null, 9999999999999])", {true, false, true}, + {1, 0, 9999999999999LL}); +} + +TEST(TestDuration, Basics) { + auto type = duration(TimeUnit::SECOND); + AssertJSONArray(type, R"([null, -7777777777777, 9999999999999])", + {false, true, true}, + {0, -7777777777777LL, 9999999999999LL}); + type = duration(TimeUnit::MILLI); + AssertJSONArray(type, R"([null, -7777777777777, 9999999999999])", + {false, true, true}, + {0, -7777777777777LL, 9999999999999LL}); + type = duration(TimeUnit::MICRO); + AssertJSONArray(type, R"([null, -7777777777777, 9999999999999])", + {false, true, true}, + {0, -7777777777777LL, 9999999999999LL}); + type = duration(TimeUnit::NANO); + AssertJSONArray(type, R"([null, -7777777777777, 9999999999999])", + {false, true, true}, + {0, -7777777777777LL, 9999999999999LL}); +} + +TEST(TestMonthInterval, Basics) { + auto type = month_interval(); + AssertJSONArray(type, R"([123, -456, null])", {true, true, false}, + {123, -456, 0}); +} + +TEST(TestDayTimeInterval, Basics) { + auto type = day_time_interval(); + AssertJSONArray(type, R"([[1, -600], null])", {true, false}, + {{1, -600}, {}}); +} + TEST(TestString, Errors) { std::shared_ptr type = utf8(); std::shared_ptr array; diff --git a/cpp/src/arrow/record_batch.cc b/cpp/src/arrow/record_batch.cc index 1f2bef9da07..5f36f7137dc 100644 --- a/cpp/src/arrow/record_batch.cc +++ b/cpp/src/arrow/record_batch.cc @@ -54,38 +54,31 @@ int RecordBatch::num_columns() const { return schema_->num_fields(); } /// \brief A 
basic, non-lazy in-memory record batch class SimpleRecordBatch : public RecordBatch { public: - SimpleRecordBatch(const std::shared_ptr& schema, int64_t num_rows, + SimpleRecordBatch(std::shared_ptr schema, int64_t num_rows, const std::vector>& columns) - : RecordBatch(schema, num_rows) { + : RecordBatch(std::move(schema), num_rows) { columns_.resize(columns.size()); - boxed_columns_.resize(schema->num_fields()); - for (size_t i = 0; i < columns.size(); ++i) { - columns_[i] = columns[i]->data(); + boxed_columns_ = columns; + for (size_t i = 0; i < columns_.size(); ++i) { + columns_[i] = boxed_columns_[i]->data(); } } - SimpleRecordBatch(const std::shared_ptr& schema, int64_t num_rows, + SimpleRecordBatch(std::shared_ptr schema, int64_t num_rows, std::vector>&& columns) - : RecordBatch(schema, num_rows) { + : RecordBatch(std::move(schema), num_rows) { columns_.resize(columns.size()); - boxed_columns_.resize(schema->num_fields()); - for (size_t i = 0; i < columns.size(); ++i) { - columns_[i] = columns[i]->data(); + boxed_columns_ = std::move(columns); + for (size_t i = 0; i < columns_.size(); ++i) { + columns_[i] = boxed_columns_[i]->data(); } } SimpleRecordBatch(const std::shared_ptr& schema, int64_t num_rows, - std::vector>&& columns) - : RecordBatch(schema, num_rows) { + std::vector> columns) + : RecordBatch(std::move(schema), num_rows) { columns_ = std::move(columns); - boxed_columns_.resize(schema->num_fields()); - } - - SimpleRecordBatch(const std::shared_ptr& schema, int64_t num_rows, - const std::vector>& columns) - : RecordBatch(schema, num_rows) { - columns_ = columns; - boxed_columns_.resize(schema->num_fields()); + boxed_columns_.resize(schema_->num_fields()); } std::shared_ptr column(int i) const override { @@ -173,35 +166,31 @@ RecordBatch::RecordBatch(const std::shared_ptr& schema, int64_t num_rows : schema_(schema), num_rows_(num_rows) {} std::shared_ptr RecordBatch::Make( - const std::shared_ptr& schema, int64_t num_rows, + std::shared_ptr schema, 
int64_t num_rows, const std::vector>& columns) { DCHECK_EQ(schema->num_fields(), static_cast(columns.size())); - return std::make_shared(schema, num_rows, columns); + return std::make_shared(std::move(schema), num_rows, columns); } std::shared_ptr RecordBatch::Make( - const std::shared_ptr& schema, int64_t num_rows, + std::shared_ptr schema, int64_t num_rows, std::vector>&& columns) { DCHECK_EQ(schema->num_fields(), static_cast(columns.size())); - return std::make_shared(schema, num_rows, std::move(columns)); -} - -std::shared_ptr RecordBatch::Make( - const std::shared_ptr& schema, int64_t num_rows, - std::vector>&& columns) { - DCHECK_EQ(schema->num_fields(), static_cast(columns.size())); - return std::make_shared(schema, num_rows, std::move(columns)); + return std::make_shared(std::move(schema), num_rows, + std::move(columns)); } std::shared_ptr RecordBatch::Make( - const std::shared_ptr& schema, int64_t num_rows, - const std::vector>& columns) { + std::shared_ptr schema, int64_t num_rows, + std::vector> columns) { DCHECK_EQ(schema->num_fields(), static_cast(columns.size())); - return std::make_shared(schema, num_rows, columns); + return std::make_shared(std::move(schema), num_rows, + std::move(columns)); } Status RecordBatch::FromStructArray(const std::shared_ptr& array, std::shared_ptr* out) { + // TODO fail if null_count != 0? 
if (array->type_id() != Type::STRUCT) { return Status::Invalid("Cannot construct record batch from array of type ", *array->type()); @@ -211,6 +200,19 @@ Status RecordBatch::FromStructArray(const std::shared_ptr& array, return Status::OK(); } +Status RecordBatch::ToStructArray(std::shared_ptr* out) const { + ARROW_ASSIGN_OR_RAISE(*out, StructArray::Make(columns(), schema()->fields())); + return Status::OK(); +} + +std::vector> RecordBatch::columns() const { + std::vector> children(num_columns()); + for (int i = 0; i < num_columns(); ++i) { + children[i] = column(i); + } + return children; +} + const std::string& RecordBatch::column_name(int i) const { return schema_->field(i)->name(); } diff --git a/cpp/src/arrow/record_batch.h b/cpp/src/arrow/record_batch.h index 0eb5c135ebb..3b5f727a1e6 100644 --- a/cpp/src/arrow/record_batch.h +++ b/cpp/src/arrow/record_batch.h @@ -44,34 +44,42 @@ class ARROW_EXPORT RecordBatch { /// should have the same length as num_rows /// \param[in] columns the record batch fields as vector of arrays static std::shared_ptr Make( - const std::shared_ptr& schema, int64_t num_rows, + std::shared_ptr schema, int64_t num_rows, const std::vector>& columns); - /// \brief Move-based constructor for a vector of Array instances - static std::shared_ptr Make(const std::shared_ptr& schema, + /// \param[in] schema The record batch schema + /// \param[in] num_rows length of fields in the record batch. Each array + /// should have the same length as num_rows + /// \param[in] columns the record batch fields as vector of arrays + static std::shared_ptr Make(std::shared_ptr schema, int64_t num_rows, std::vector>&& columns); /// \brief Construct record batch from vector of internal data structures /// \since 0.5.0 /// - /// This class is only provided with an rvalue-reference for the input data, - /// and is intended for internal use, or advanced users. + /// This class is intended for internal use, or advanced users. 
/// /// \param schema the record batch schema /// \param num_rows the number of semantic rows in the record batch. This /// should be equal to the length of each field /// \param columns the data for the batch's columns static std::shared_ptr Make( - const std::shared_ptr& schema, int64_t num_rows, - std::vector>&& columns); + std::shared_ptr schema, int64_t num_rows, + std::vector> columns); - /// \brief Construct record batch by copying vector of array data - /// \since 0.5.0 - static std::shared_ptr Make( - const std::shared_ptr& schema, int64_t num_rows, - const std::vector>& columns); + /// \brief Convert record batch to struct array + /// + /// Create a struct array whose child arrays are the record batch's columns. + /// Note that the record batch's top-level field metadata cannot be reflected + /// in the resulting struct array. + Status ToStructArray(std::shared_ptr* out) const; + /// \brief Construct record batch from struct array + /// + /// This constructs a record batch using the child arrays of the given + /// array, which must be a struct array. Note that the struct array's own + /// null bitmap is not reflected in the resulting record batch. static Status FromStructArray(const std::shared_ptr& array, std::shared_ptr* out); @@ -86,6 +94,9 @@ class ARROW_EXPORT RecordBatch { /// \return true if batches are equal std::shared_ptr schema() const { return schema_; } + /// \brief Retrieve all columns at once + std::vector> columns() const; + /// \brief Retrieve an array from the record batch /// \param[in] i field index, does not boundscheck /// \return an Array object diff --git a/cpp/src/arrow/stl.h b/cpp/src/arrow/stl.h index 68bf04f1e3a..8e3d637a7ec 100644 --- a/cpp/src/arrow/stl.h +++ b/cpp/src/arrow/stl.h @@ -88,7 +88,8 @@ using CBuilderType = /// contiguous ranges while appending. This default implementation will call /// ConversionTraits::AppendRow() for each value in the range. 
template -Status AppendListValues(CBuilderType& value_builder, Range&& cell_range) { +inline Status AppendListValues(CBuilderType& value_builder, + Range&& cell_range) { for (auto const& value : cell_range) { ARROW_RETURN_NOT_OK(ConversionTraits::AppendRow(value_builder, value)); } @@ -109,7 +110,7 @@ Status AppendListValues(CBuilderType& value_builder, Range&& cell_ra }; \ \ template <> \ - Status AppendListValues&>( \ + inline Status AppendListValues&>( \ typename TypeTraits::BuilderType & value_builder, \ const std::vector& cell_range) { \ return value_builder.AppendValues(cell_range); \ @@ -460,127 +461,7 @@ Status TupleRangeFromTable(const Table& table, const compute::CastOptions& cast_ return Status::OK(); } -/// \brief A STL allocator delegating allocations to a Arrow MemoryPool -template -class allocator { - public: - using value_type = T; - using pointer = T*; - using const_pointer = const T*; - using reference = T&; - using const_reference = const T&; - using size_type = std::size_t; - using difference_type = std::ptrdiff_t; - - template - struct rebind { - using other = allocator; - }; - - /// \brief Construct an allocator from the default MemoryPool - allocator() noexcept : pool_(default_memory_pool()) {} - /// \brief Construct an allocator from the given MemoryPool - explicit allocator(MemoryPool* pool) noexcept : pool_(pool) {} - - template - allocator(const allocator& rhs) noexcept : pool_(rhs.pool()) {} - - ~allocator() { pool_ = NULLPTR; } - - pointer address(reference r) const noexcept { return std::addressof(r); } - - const_pointer address(const_reference r) const noexcept { return std::addressof(r); } - - pointer allocate(size_type n, const void* /*hint*/ = NULLPTR) { - uint8_t* data; - Status s = pool_->Allocate(n * sizeof(T), &data); - if (!s.ok()) throw std::bad_alloc(); - return reinterpret_cast(data); - } - - void deallocate(pointer p, size_type n) { - pool_->Free(reinterpret_cast(p), n * sizeof(T)); - } - - size_type size_max() const 
noexcept { return size_type(-1) / sizeof(T); } - - template - void construct(U* p, Args&&... args) { - new (reinterpret_cast(p)) U(std::forward(args)...); - } - - template - void destroy(U* p) { - p->~U(); - } - - MemoryPool* pool() const noexcept { return pool_; } - - private: - MemoryPool* pool_; -}; - -/// \brief A MemoryPool implementation delegating allocations to a STL allocator -/// -/// Note that STL allocators don't provide a resizing operation, and therefore -/// any buffer resizes will do a full reallocation and copy. -template > -class STLMemoryPool : public MemoryPool { - public: - /// \brief Construct a memory pool from the given allocator - explicit STLMemoryPool(const Allocator& alloc) : alloc_(alloc) {} - - Status Allocate(int64_t size, uint8_t** out) override { - try { - *out = alloc_.allocate(size); - } catch (std::bad_alloc& e) { - return Status::OutOfMemory(e.what()); - } - stats_.UpdateAllocatedBytes(size); - return Status::OK(); - } - - Status Reallocate(int64_t old_size, int64_t new_size, uint8_t** ptr) override { - uint8_t* old_ptr = *ptr; - try { - *ptr = alloc_.allocate(new_size); - } catch (std::bad_alloc& e) { - return Status::OutOfMemory(e.what()); - } - memcpy(*ptr, old_ptr, std::min(old_size, new_size)); - alloc_.deallocate(old_ptr, old_size); - stats_.UpdateAllocatedBytes(new_size - old_size); - return Status::OK(); - } - - void Free(uint8_t* buffer, int64_t size) override { - alloc_.deallocate(buffer, size); - stats_.UpdateAllocatedBytes(-size); - } - - int64_t bytes_allocated() const override { return stats_.bytes_allocated(); } - - int64_t max_memory() const override { return stats_.max_memory(); } - - std::string backend_name() const override { return "stl"; } - - private: - Allocator alloc_; - arrow::internal::MemoryPoolStats stats_; -}; - -template -bool operator==(const allocator& lhs, const allocator& rhs) noexcept { - return lhs.pool() == rhs.pool(); -} - -template -bool operator!=(const allocator& lhs, const allocator& 
rhs) noexcept { - return !(lhs == rhs); -} - } // namespace stl - } // namespace arrow #endif // ARROW_STL_H diff --git a/cpp/src/arrow/stl_allocator.h b/cpp/src/arrow/stl_allocator.h new file mode 100644 index 00000000000..b5ad2b53460 --- /dev/null +++ b/cpp/src/arrow/stl_allocator.h @@ -0,0 +1,153 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#pragma once + +#include +#include +#include +#include +#include + +#include "arrow/memory_pool.h" +#include "arrow/type_fwd.h" +#include "arrow/util/macros.h" + +namespace arrow { +namespace stl { + +/// \brief A STL allocator delegating allocations to a Arrow MemoryPool +template +class allocator { + public: + using value_type = T; + using pointer = T*; + using const_pointer = const T*; + using reference = T&; + using const_reference = const T&; + using size_type = std::size_t; + using difference_type = std::ptrdiff_t; + + template + struct rebind { + using other = allocator; + }; + + /// \brief Construct an allocator from the default MemoryPool + allocator() noexcept : pool_(default_memory_pool()) {} + /// \brief Construct an allocator from the given MemoryPool + explicit allocator(MemoryPool* pool) noexcept : pool_(pool) {} + + template + allocator(const allocator& rhs) noexcept : pool_(rhs.pool()) {} + + ~allocator() { pool_ = NULLPTR; } + + pointer address(reference r) const noexcept { return std::addressof(r); } + + const_pointer address(const_reference r) const noexcept { return std::addressof(r); } + + pointer allocate(size_type n, const void* /*hint*/ = NULLPTR) { + uint8_t* data; + Status s = pool_->Allocate(n * sizeof(T), &data); + if (!s.ok()) throw std::bad_alloc(); + return reinterpret_cast(data); + } + + void deallocate(pointer p, size_type n) { + pool_->Free(reinterpret_cast(p), n * sizeof(T)); + } + + size_type size_max() const noexcept { return size_type(-1) / sizeof(T); } + + template + void construct(U* p, Args&&... 
args) { + new (reinterpret_cast(p)) U(std::forward(args)...); + } + + template + void destroy(U* p) { + p->~U(); + } + + MemoryPool* pool() const noexcept { return pool_; } + + private: + MemoryPool* pool_; +}; + +/// \brief A MemoryPool implementation delegating allocations to a STL allocator +/// +/// Note that STL allocators don't provide a resizing operation, and therefore +/// any buffer resizes will do a full reallocation and copy. +template > +class STLMemoryPool : public MemoryPool { + public: + /// \brief Construct a memory pool from the given allocator + explicit STLMemoryPool(const Allocator& alloc) : alloc_(alloc) {} + + Status Allocate(int64_t size, uint8_t** out) override { + try { + *out = alloc_.allocate(size); + } catch (std::bad_alloc& e) { + return Status::OutOfMemory(e.what()); + } + stats_.UpdateAllocatedBytes(size); + return Status::OK(); + } + + Status Reallocate(int64_t old_size, int64_t new_size, uint8_t** ptr) override { + uint8_t* old_ptr = *ptr; + try { + *ptr = alloc_.allocate(new_size); + } catch (std::bad_alloc& e) { + return Status::OutOfMemory(e.what()); + } + memcpy(*ptr, old_ptr, std::min(old_size, new_size)); + alloc_.deallocate(old_ptr, old_size); + stats_.UpdateAllocatedBytes(new_size - old_size); + return Status::OK(); + } + + void Free(uint8_t* buffer, int64_t size) override { + alloc_.deallocate(buffer, size); + stats_.UpdateAllocatedBytes(-size); + } + + int64_t bytes_allocated() const override { return stats_.bytes_allocated(); } + + int64_t max_memory() const override { return stats_.max_memory(); } + + std::string backend_name() const override { return "stl"; } + + private: + Allocator alloc_; + arrow::internal::MemoryPoolStats stats_; +}; + +template +bool operator==(const allocator& lhs, const allocator& rhs) noexcept { + return lhs.pool() == rhs.pool(); +} + +template +bool operator!=(const allocator& lhs, const allocator& rhs) noexcept { + return !(lhs == rhs); +} + +} // namespace stl +} // namespace arrow diff 
--git a/cpp/src/arrow/stl_test.cc b/cpp/src/arrow/stl_test.cc index b0cef528245..47b60067d99 100644 --- a/cpp/src/arrow/stl_test.cc +++ b/cpp/src/arrow/stl_test.cc @@ -29,6 +29,7 @@ #include "arrow/memory_pool.h" #include "arrow/stl.h" +#include "arrow/stl_allocator.h" #include "arrow/table.h" #include "arrow/testing/gtest_util.h" #include "arrow/type.h" diff --git a/cpp/src/arrow/testing/gtest_util.cc b/cpp/src/arrow/testing/gtest_util.cc index d1811dc5624..1c5a5dfa9e9 100644 --- a/cpp/src/arrow/testing/gtest_util.cc +++ b/cpp/src/arrow/testing/gtest_util.cc @@ -215,7 +215,7 @@ ASSERT_EQUAL_IMPL(Schema, Schema, "schemas") #undef ASSERT_EQUAL_IMPL void AssertDatumsEqual(const Datum& expected, const Datum& actual) { - // TODO: Implements better print. + // TODO: Implement better print ASSERT_TRUE(actual.Equals(expected)); } diff --git a/cpp/src/arrow/type.cc b/cpp/src/arrow/type.cc index 86ab0beb66d..a946aeb0387 100644 --- a/cpp/src/arrow/type.cc +++ b/cpp/src/arrow/type.cc @@ -614,13 +614,10 @@ class Schema::Impl { std::shared_ptr metadata_; }; -Schema::Schema(const std::vector>& fields, - const std::shared_ptr& metadata) - : detail::Fingerprintable(), impl_(new Impl(fields, metadata)) {} - -Schema::Schema(std::vector>&& fields, - const std::shared_ptr& metadata) - : detail::Fingerprintable(), impl_(new Impl(std::move(fields), metadata)) {} +Schema::Schema(const std::vector> fields, + const std::shared_ptr metadata) + : detail::Fingerprintable(), + impl_(new Impl(std::move(fields), std::move(metadata))) {} Schema::Schema(const Schema& schema) : detail::Fingerprintable(), impl_(new Impl(*schema.impl_)) {} @@ -968,14 +965,9 @@ Status SchemaBuilder::AreCompatible(const std::vector>& return Merge(schemas, policy).status(); } -std::shared_ptr schema(const std::vector>& fields, - const std::shared_ptr& metadata) { - return std::make_shared(fields, metadata); -} - -std::shared_ptr schema(std::vector>&& fields, - const std::shared_ptr& metadata) { - return 
std::make_shared(std::move(fields), metadata); +std::shared_ptr schema(std::vector> fields, + std::shared_ptr metadata) { + return std::make_shared(std::move(fields), std::move(metadata)); } Result> UnifySchemas( @@ -1241,6 +1233,17 @@ std::string MapType::ComputeFingerprint() const { return ""; } +std::string FixedSizeListType::ComputeFingerprint() const { + const auto& child_fingerprint = children_[0]->fingerprint(); + if (!child_fingerprint.empty()) { + std::stringstream ss; + ss << TypeIdFingerprint(*this) << "[" << list_size_ << "]" + << "{" << child_fingerprint << "}"; + return ss.str(); + } + return ""; +} + std::string FixedSizeBinaryType::ComputeFingerprint() const { std::stringstream ss; ss << TypeIdFingerprint(*this) << "[" << byte_width_ << "]"; @@ -1470,10 +1473,11 @@ std::shared_ptr dictionary(const std::shared_ptr& index_type return std::make_shared(index_type, dict_type, ordered); } -std::shared_ptr field(const std::string& name, - const std::shared_ptr& type, bool nullable, - const std::shared_ptr& metadata) { - return std::make_shared(name, type, nullable, metadata); +std::shared_ptr field(std::string name, std::shared_ptr type, + bool nullable, + std::shared_ptr metadata) { + return std::make_shared(std::move(name), std::move(type), nullable, + std::move(metadata)); } std::shared_ptr decimal(int32_t precision, int32_t scale) { diff --git a/cpp/src/arrow/type.h b/cpp/src/arrow/type.h index 4bcd9f55b01..bcd3a18b7fe 100644 --- a/cpp/src/arrow/type.h +++ b/cpp/src/arrow/type.h @@ -339,14 +339,13 @@ class ARROW_EXPORT NestedType : public DataType, public ParametricType { /// which holds arbitrary key-value pairs. 
class ARROW_EXPORT Field : public detail::Fingerprintable { public: - Field(const std::string& name, const std::shared_ptr& type, - bool nullable = true, - const std::shared_ptr& metadata = NULLPTR) + Field(std::string name, std::shared_ptr type, bool nullable = true, + std::shared_ptr metadata = NULLPTR) : detail::Fingerprintable(), - name_(name), - type_(type), + name_(std::move(name)), + type_(std::move(type)), nullable_(nullable), - metadata_(metadata) {} + metadata_(std::move(metadata)) {} ~Field() override; @@ -430,7 +429,7 @@ class ARROW_EXPORT Field : public detail::Fingerprintable { /// \brief Return the field name const std::string& name() const { return name_; } /// \brief Return the field data type - std::shared_ptr type() const { return type_; } + const std::shared_ptr& type() const { return type_; } /// \brief Return whether the field is nullable bool nullable() const { return nullable_; } @@ -777,6 +776,8 @@ class ARROW_EXPORT FixedSizeListType : public NestedType { int32_t list_size() const { return list_size_; } protected: + std::string ComputeFingerprint() const override; + int32_t list_size_; }; @@ -1229,7 +1230,7 @@ class ARROW_EXPORT TimestampType : public TemporalType, public ParametricType { std::string timezone_; }; -// Base class for the different kinds of intervals. +// Base class for the different kinds of calendar intervals. class ARROW_EXPORT IntervalType : public TemporalType, public ParametricType { public: enum type { MONTHS, DAY_TIME }; @@ -1241,10 +1242,10 @@ class ARROW_EXPORT IntervalType : public TemporalType, public ParametricType { std::string ComputeFingerprint() const override; }; -/// \brief Represents a some number of months. +/// \brief Represents a number of months. /// /// Type representing a number of months. Corresponds to YearMonth type -/// in Schema.fbs (Years are defined as 12 months). +/// in Schema.fbs (years are defined as 12 months). 
class ARROW_EXPORT MonthIntervalType : public IntervalType { public: static constexpr Type::type type_id = Type::INTERVAL; @@ -1293,8 +1294,7 @@ class ARROW_EXPORT DayTimeIntervalType : public IntervalType { std::string name() const override { return "day_time_interval"; } }; -// \brief Represents an amount of elapsed time without any relation to a calendar -// artifact. +/// \brief Represents an elapsed time without any relation to a calendar artifact. class ARROW_EXPORT DurationType : public TemporalType, public ParametricType { public: using Unit = TimeUnit; @@ -1348,8 +1348,8 @@ class ARROW_EXPORT DictionaryType : public FixedWidthType { DataTypeLayout layout() const override; - std::shared_ptr index_type() const { return index_type_; } - std::shared_ptr value_type() const { return value_type_; } + const std::shared_ptr& index_type() const { return index_type_; } + const std::shared_ptr& value_type() const { return value_type_; } bool ordered() const { return ordered_; } @@ -1406,11 +1406,8 @@ class ARROW_EXPORT Schema : public detail::Fingerprintable, public util::EqualityComparable, public util::ToStringOstreamable { public: - explicit Schema(const std::vector>& fields, - const std::shared_ptr& metadata = NULLPTR); - - explicit Schema(std::vector>&& fields, - const std::shared_ptr& metadata = NULLPTR); + explicit Schema(std::vector> fields, + std::shared_ptr metadata = NULLPTR); Schema(const Schema&); @@ -1626,9 +1623,9 @@ std::shared_ptr dictionary(const std::shared_ptr& index_type /// \param type the field value type /// \param nullable whether the values are nullable, default true /// \param metadata any custom key-value metadata, default null -std::shared_ptr ARROW_EXPORT field( - const std::string& name, const std::shared_ptr& type, bool nullable = true, - const std::shared_ptr& metadata = NULLPTR); +std::shared_ptr ARROW_EXPORT +field(std::string name, std::shared_ptr type, bool nullable = true, + std::shared_ptr metadata = NULLPTR); /// \brief Create a 
Schema instance /// @@ -1637,18 +1634,8 @@ std::shared_ptr ARROW_EXPORT field( /// \return schema shared_ptr to Schema ARROW_EXPORT std::shared_ptr schema( - const std::vector>& fields, - const std::shared_ptr& metadata = NULLPTR); - -/// \brief Create a Schema instance -/// -/// \param fields the schema's fields (rvalue reference) -/// \param metadata any custom key-value metadata, default null -/// \return schema shared_ptr to Schema -ARROW_EXPORT -std::shared_ptr schema( - std::vector>&& fields, - const std::shared_ptr& metadata = NULLPTR); + std::vector> fields, + std::shared_ptr metadata = NULLPTR); /// @} diff --git a/cpp/src/arrow/type_test.cc b/cpp/src/arrow/type_test.cc index aa50fc562fb..245e05fe4ab 100644 --- a/cpp/src/arrow/type_test.cc +++ b/cpp/src/arrow/type_test.cc @@ -979,6 +979,23 @@ TEST(TestFixedSizeListType, Basics) { ASSERT_EQ("fixed_size_list[3]>[7]", lt2.ToString()); } +TEST(TestFixedSizeListType, Equals) { + auto t1 = fixed_size_list(int8(), 3); + auto t2 = fixed_size_list(int8(), 3); + auto t3 = fixed_size_list(int8(), 4); + auto t4 = fixed_size_list(int16(), 4); + auto t5 = fixed_size_list(list(int16()), 4); + auto t6 = fixed_size_list(list(int16()), 4); + auto t7 = fixed_size_list(list(int32()), 4); + + AssertTypeEqual(t1, t2); + AssertTypeNotEqual(t2, t3); + AssertTypeNotEqual(t3, t4); + AssertTypeNotEqual(t4, t5); + AssertTypeEqual(t5, t6); + AssertTypeNotEqual(t6, t7); +} + TEST(TestDateTypes, Attrs) { auto t1 = date32(); auto t2 = date64(); diff --git a/cpp/src/arrow/util/key_value_metadata.cc b/cpp/src/arrow/util/key_value_metadata.cc index 8c6bd6aa5ca..c645546a676 100644 --- a/cpp/src/arrow/util/key_value_metadata.cc +++ b/cpp/src/arrow/util/key_value_metadata.cc @@ -61,9 +61,9 @@ KeyValueMetadata::KeyValueMetadata( ARROW_CHECK_EQ(keys_.size(), values_.size()); } -KeyValueMetadata::KeyValueMetadata(const std::vector& keys, - const std::vector& values) - : keys_(keys), values_(values) { 
+KeyValueMetadata::KeyValueMetadata(std::vector keys, + std::vector values) + : keys_(std::move(keys)), values_(std::move(values)) { - ARROW_CHECK_EQ(keys.size(), values.size()); + ARROW_CHECK_EQ(keys_.size(), values_.size()); /* the parameters were moved from above; validate the members */ } @@ -164,9 +164,9 @@ std::shared_ptr key_value_metadata( return std::make_shared(pairs); } -std::shared_ptr key_value_metadata( - const std::vector& keys, const std::vector& values) { - return std::make_shared(keys, values); +std::shared_ptr key_value_metadata(std::vector keys, + std::vector values) { + return std::make_shared(std::move(keys), std::move(values)); } } // namespace arrow diff --git a/cpp/src/arrow/util/key_value_metadata.h b/cpp/src/arrow/util/key_value_metadata.h index d84e060822d..7152624ce5c 100644 --- a/cpp/src/arrow/util/key_value_metadata.h +++ b/cpp/src/arrow/util/key_value_metadata.h @@ -34,8 +34,7 @@ namespace arrow { class ARROW_EXPORT KeyValueMetadata { public: KeyValueMetadata(); - KeyValueMetadata(const std::vector& keys, - const std::vector& values); + KeyValueMetadata(std::vector keys, std::vector values); explicit KeyValueMetadata(const std::unordered_map& map); virtual ~KeyValueMetadata() = default; @@ -75,8 +74,8 @@ key_value_metadata(const std::unordered_map& pairs); /// /// \param keys sequence of metadata keys /// \param values sequence of corresponding metadata values -std::shared_ptr ARROW_EXPORT key_value_metadata( - const std::vector& keys, const std::vector& values); +std::shared_ptr ARROW_EXPORT +key_value_metadata(std::vector keys, std::vector values); } // namespace arrow diff --git a/cpp/src/parquet/arrow/arrow_reader_writer_test.cc b/cpp/src/parquet/arrow/arrow_reader_writer_test.cc index 892700a20d1..212ff08c13e 100644 --- a/cpp/src/parquet/arrow/arrow_reader_writer_test.cc +++ b/cpp/src/parquet/arrow/arrow_reader_writer_test.cc @@ -1196,8 +1196,8 @@ TEST_F(TestNullParquetIO, NullDictionaryColumn) { ASSERT_OK(::arrow::AllocateEmptyBitmap(::arrow::default_memory_pool(), SMALL_SIZE, &null_bitmap)); - std::shared_ptr indices = -
std::make_shared<::arrow::Int8Array>(SMALL_SIZE, nullptr, null_bitmap, SMALL_SIZE); + std::shared_ptr indices; + ASSERT_OK(MakeArrayOfNull(::arrow::int8(), SMALL_SIZE, &indices)); std::shared_ptr<::arrow::DictionaryType> dict_type = std::make_shared<::arrow::DictionaryType>(::arrow::int8(), ::arrow::null()); diff --git a/cpp/src/parquet/encoding.cc b/cpp/src/parquet/encoding.cc index 150fc18693c..6f6677d650d 100644 --- a/cpp/src/parquet/encoding.cc +++ b/cpp/src/parquet/encoding.cc @@ -27,7 +27,7 @@ #include "arrow/array.h" #include "arrow/builder.h" -#include "arrow/stl.h" +#include "arrow/stl_allocator.h" #include "arrow/util/bit_stream_utils.h" #include "arrow/util/checked_cast.h" #include "arrow/util/hashing.h" diff --git a/docker-compose.yml b/docker-compose.yml index 4b0c3391d83..9c2455c8bb4 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -736,6 +736,7 @@ services: command: > /bin/bash -c " /arrow/ci/scripts/cpp_build.sh /arrow /build && + /arrow/ci/scripts/python_build.sh /arrow /build && /arrow/ci/scripts/r_test.sh /arrow" r: diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi index 20f0ae1be94..2a1e80e8078 100644 --- a/python/pyarrow/array.pxi +++ b/python/pyarrow/array.pxi @@ -1113,6 +1113,61 @@ cdef class Array(_PandasConvertible): _append_array_buffers(self.sp_array.get().data().get(), res) return res + def _export_to_c(self, uintptr_t out_ptr, uintptr_t out_schema_ptr=0): + """ + Export to a C ArrowArray struct, given its pointer. + + If a C ArrowSchema struct pointer is also given, the array type + is exported to it at the same time. + + Parameters + ---------- + out_ptr: int + The raw pointer to a C ArrowArray struct. + out_schema_ptr: int (optional) + The raw pointer to a C ArrowSchema struct. + + Be careful: if you don't pass the ArrowArray struct to a consumer, + array memory will leak. This is a low-level function intended for + expert users. 
+ """ + with nogil: + check_status(ExportArray(deref(self.sp_array), + out_ptr, + out_schema_ptr)) + + @staticmethod + def _import_from_c(uintptr_t in_ptr, type): + """ + Import Array from a C ArrowArray struct, given its pointer + and the imported array type. + + Parameters + ---------- + in_ptr: int + The raw pointer to a C ArrowArray struct. + type: DataType or int + Either a DataType object, or the raw pointer to a C ArrowSchema + struct. + + This is a low-level function intended for expert users. + """ + cdef: + shared_ptr[CArray] c_array + + c_type = pyarrow_unwrap_data_type(type) + if c_type == nullptr: + # Not a DataType object, perhaps a raw ArrowSchema pointer + type_ptr = type + with nogil: + c_array = GetResultValue(ImportArray( in_ptr, + type_ptr)) + else: + with nogil: + c_array = GetResultValue(ImportArray( in_ptr, + c_type)) + return pyarrow_wrap_array(c_array) + cdef _array_like_to_pandas(obj, options): cdef: diff --git a/python/pyarrow/cffi.py b/python/pyarrow/cffi.py new file mode 100644 index 00000000000..8880c25a035 --- /dev/null +++ b/python/pyarrow/cffi.py @@ -0,0 +1,59 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +from __future__ import absolute_import + +import cffi + +c_source = """ + struct ArrowSchema { + // Array type description + const char* format; + const char* name; + const char* metadata; + int64_t flags; + int64_t n_children; + struct ArrowSchema** children; + struct ArrowSchema* dictionary; + + // Release callback + void (*release)(struct ArrowSchema*); + // Opaque producer-specific data + void* private_data; + }; + + struct ArrowArray { + // Array data description + int64_t length; + int64_t null_count; + int64_t offset; + int64_t n_buffers; + int64_t n_children; + const void** buffers; + struct ArrowArray** children; + struct ArrowArray* dictionary; + + // Release callback + void (*release)(struct ArrowArray*); + // Opaque producer-specific data + void* private_data; + }; + """ + +# TODO use out-of-line mode for faster import and avoid C parsing +ffi = cffi.FFI() +ffi.cdef(c_source) diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd index c76d5498d7f..e80b0ebc6d6 100644 --- a/python/pyarrow/includes/libarrow.pxd +++ b/python/pyarrow/includes/libarrow.pxd @@ -1727,3 +1727,31 @@ cdef extern from 'arrow/util/thread_pool.h' namespace 'arrow' nogil: cdef extern from 'arrow/array/concatenate.h' namespace 'arrow' nogil: CStatus Concatenate(const vector[shared_ptr[CArray]]& arrays, CMemoryPool* pool, shared_ptr[CArray]* result) + +cdef extern from 'arrow/c/abi.h': + cdef struct ArrowSchema: + pass + + cdef struct ArrowArray: + pass + +cdef extern from 'arrow/c/bridge.h' namespace 'arrow' nogil: + CStatus ExportType(CDataType&, ArrowSchema* out) + CResult[shared_ptr[CDataType]] ImportType(ArrowSchema*) + + CStatus ExportSchema(CSchema&, ArrowSchema* out) + CResult[shared_ptr[CSchema]] ImportSchema(ArrowSchema*) + + CStatus ExportArray(CArray&, ArrowArray* out) + CStatus ExportArray(CArray&, ArrowArray* out, ArrowSchema* out_schema) + CResult[shared_ptr[CArray]] ImportArray(ArrowArray*, + shared_ptr[CDataType]) + 
CResult[shared_ptr[CArray]] ImportArray(ArrowArray*, ArrowSchema*) + + CStatus ExportRecordBatch(CRecordBatch&, ArrowArray* out) + CStatus ExportRecordBatch(CRecordBatch&, ArrowArray* out, + ArrowSchema* out_schema) + CResult[shared_ptr[CRecordBatch]] ImportRecordBatch(ArrowArray*, + shared_ptr[CSchema]) + CResult[shared_ptr[CRecordBatch]] ImportRecordBatch(ArrowArray*, + ArrowSchema*) diff --git a/python/pyarrow/table.pxi b/python/pyarrow/table.pxi index 5bffcafcef7..a7867ba41a3 100644 --- a/python/pyarrow/table.pxi +++ b/python/pyarrow/table.pxi @@ -846,6 +846,61 @@ cdef class RecordBatch(_PandasConvertible): &c_record_batch)) return pyarrow_wrap_batch(c_record_batch) + def _export_to_c(self, uintptr_t out_ptr, uintptr_t out_schema_ptr=0): + """ + Export to a C ArrowArray struct, given its pointer. + + If a C ArrowSchema struct pointer is also given, the record batch + schema is exported to it at the same time. + + Parameters + ---------- + out_ptr: int + The raw pointer to a C ArrowArray struct. + out_schema_ptr: int (optional) + The raw pointer to a C ArrowSchema struct. + + Be careful: if you don't pass the ArrowArray struct to a consumer, + array memory will leak. This is a low-level function intended for + expert users. + """ + with nogil: + check_status(ExportRecordBatch(deref(self.sp_batch), + out_ptr, + out_schema_ptr)) + + @staticmethod + def _import_from_c(uintptr_t in_ptr, schema): + """ + Import RecordBatch from a C ArrowArray struct, given its pointer + and the imported schema. + + Parameters + ---------- + in_ptr: int + The raw pointer to a C ArrowArray struct. + schema: Schema or int + Either a Schema object, or the raw pointer to a C ArrowSchema + struct. + + This is a low-level function intended for expert users.
+ """ + cdef: + shared_ptr[CRecordBatch] c_batch + + c_schema = pyarrow_unwrap_schema(schema) + if c_schema == nullptr: + # Not a Schema object, perhaps a raw ArrowSchema pointer + schema_ptr = schema + with nogil: + c_batch = GetResultValue(ImportRecordBatch( + in_ptr, schema_ptr)) + else: + with nogil: + c_batch = GetResultValue(ImportRecordBatch( + in_ptr, c_schema)) + return pyarrow_wrap_batch(c_batch) + def _reconstruct_record_batch(columns, schema): """ diff --git a/python/pyarrow/tests/test_array.py b/python/pyarrow/tests/test_array.py index b975eb33003..44e0369aa95 100644 --- a/python/pyarrow/tests/test_array.py +++ b/python/pyarrow/tests/test_array.py @@ -326,10 +326,10 @@ def test_array_diff(): +"bar" +null ''' - assert arr1.diff(arr3) == '# Array types differed: string vs int64' - assert arr1.diff(arr3) == '# Array types differed: string vs int64' - assert arr1.diff(arr4) == ('# Array types differed: string vs ' - 'list') + assert arr1.diff(arr3).strip() == '# Array types differed: string vs int64' + assert arr1.diff(arr3).strip() == '# Array types differed: string vs int64' + assert arr1.diff(arr4).strip() == ('# Array types differed: string vs ' + 'list') def test_array_iter(): diff --git a/python/pyarrow/tests/test_cffi.py b/python/pyarrow/tests/test_cffi.py new file mode 100644 index 00000000000..465ab7432ea --- /dev/null +++ b/python/pyarrow/tests/test_cffi.py @@ -0,0 +1,207 @@ +# -*- coding: utf-8 -*- +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +import pyarrow as pa +try: + from pyarrow.cffi import ffi +except ImportError: + ffi = None + +import pytest + +needs_cffi = pytest.mark.skipif(ffi is None, + reason="test needs cffi package installed") + + +assert_schema_released = pytest.raises( + ValueError, match="Cannot import released ArrowSchema") + +assert_array_released = pytest.raises( + ValueError, match="Cannot import released ArrowArray") + + +@needs_cffi +def test_export_import_type(): + c_schema = ffi.new("struct ArrowSchema*") + ptr_schema = int(ffi.cast("uintptr_t", c_schema)) + + old_allocated = pa.total_allocated_bytes() + + typ = pa.list_(pa.int32()) + typ._export_to_c(ptr_schema) + assert pa.total_allocated_bytes() > old_allocated + # Delete and recreate C++ object from exported pointer + del typ + assert pa.total_allocated_bytes() > old_allocated + typ_new = pa.DataType._import_from_c(ptr_schema) + assert typ_new == pa.list_(pa.int32()) + assert pa.total_allocated_bytes() == old_allocated + # Now released + with assert_schema_released: + pa.DataType._import_from_c(ptr_schema) + + # Invalid format string + pa.int32()._export_to_c(ptr_schema) + bad_format = ffi.new("char[]", b"zzz") + c_schema.format = bad_format + with pytest.raises(ValueError, + match="Invalid or unsupported format string"): + pa.DataType._import_from_c(ptr_schema) + # Now released + with assert_schema_released: + pa.DataType._import_from_c(ptr_schema) + + +@needs_cffi +def test_export_import_array(): + c_schema = ffi.new("struct ArrowSchema*") + ptr_schema = int(ffi.cast("uintptr_t", c_schema)) + c_array = 
ffi.new("struct ArrowArray*") + ptr_array = int(ffi.cast("uintptr_t", c_array)) + + old_allocated = pa.total_allocated_bytes() + + # Type is known up front + typ = pa.list_(pa.int32()) + arr = pa.array([[1], [2, 42]], type=typ) + py_value = arr.to_pylist() + arr._export_to_c(ptr_array) + assert pa.total_allocated_bytes() > old_allocated + # Delete recreate C++ object from exported pointer + del arr + arr_new = pa.Array._import_from_c(ptr_array, typ) + assert arr_new.to_pylist() == py_value + assert arr_new.type == pa.list_(pa.int32()) + assert pa.total_allocated_bytes() > old_allocated + del arr_new, typ + assert pa.total_allocated_bytes() == old_allocated + # Now released + with assert_array_released: + pa.Array._import_from_c(ptr_array, pa.list_(pa.int32())) + + # Type is exported and imported at the same time + arr = pa.array([[1], [2, 42]], type=pa.list_(pa.int32())) + py_value = arr.to_pylist() + arr._export_to_c(ptr_array, ptr_schema) + # Delete and recreate C++ objects from exported pointers + del arr + arr_new = pa.Array._import_from_c(ptr_array, ptr_schema) + assert arr_new.to_pylist() == py_value + assert arr_new.type == pa.list_(pa.int32()) + assert pa.total_allocated_bytes() > old_allocated + del arr_new + assert pa.total_allocated_bytes() == old_allocated + # Now released + with assert_schema_released: + pa.Array._import_from_c(ptr_array, ptr_schema) + + +@needs_cffi +def test_export_import_schema(): + c_schema = ffi.new("struct ArrowSchema*") + ptr_schema = int(ffi.cast("uintptr_t", c_schema)) + + def make_schema(): + return pa.schema([('ints', pa.list_(pa.int32()))], + metadata={b'key1': b'value1'}) + + old_allocated = pa.total_allocated_bytes() + + make_schema()._export_to_c(ptr_schema) + assert pa.total_allocated_bytes() > old_allocated + # Delete and recreate C++ object from exported pointer + schema_new = pa.Schema._import_from_c(ptr_schema) + assert schema_new == make_schema() + assert pa.total_allocated_bytes() == old_allocated + del schema_new 
+ assert pa.total_allocated_bytes() == old_allocated + # Now released + with assert_schema_released: + pa.Schema._import_from_c(ptr_schema) + + # Not a struct type + pa.int32()._export_to_c(ptr_schema) + with pytest.raises(ValueError, + match="ArrowSchema describes non-struct type"): + pa.Schema._import_from_c(ptr_schema) + # Now released + with assert_schema_released: + pa.Schema._import_from_c(ptr_schema) + + +@needs_cffi +def test_export_import_batch(): + c_schema = ffi.new("struct ArrowSchema*") + ptr_schema = int(ffi.cast("uintptr_t", c_schema)) + c_array = ffi.new("struct ArrowArray*") + ptr_array = int(ffi.cast("uintptr_t", c_array)) + + def make_schema(): + return pa.schema([('ints', pa.list_(pa.int32()))], + metadata={b'key1': b'value1'}) + + def make_batch(): + return pa.record_batch([[[1], [2, 42]]], make_schema()) + + old_allocated = pa.total_allocated_bytes() + + # Schema is known up front + schema = make_schema() + batch = make_batch() + py_value = batch.to_pydict() + batch._export_to_c(ptr_array) + assert pa.total_allocated_bytes() > old_allocated + # Delete recreate C++ object from exported pointer + del batch + batch_new = pa.RecordBatch._import_from_c(ptr_array, schema) + assert batch_new.to_pydict() == py_value + assert batch_new.schema == schema + assert pa.total_allocated_bytes() > old_allocated + del batch_new, schema + assert pa.total_allocated_bytes() == old_allocated + # Now released + with assert_array_released: + pa.RecordBatch._import_from_c(ptr_array, make_schema()) + + # Type is exported and imported at the same time + batch = make_batch() + py_value = batch.to_pydict() + batch._export_to_c(ptr_array, ptr_schema) + # Delete and recreate C++ objects from exported pointers + del batch + batch_new = pa.RecordBatch._import_from_c(ptr_array, ptr_schema) + assert batch_new.to_pydict() == py_value + print(batch_new.schema) + print(make_schema()) + assert batch_new.schema == make_schema() + assert pa.total_allocated_bytes() > old_allocated + 
del batch_new + assert pa.total_allocated_bytes() == old_allocated + # Now released + with assert_schema_released: + pa.RecordBatch._import_from_c(ptr_array, ptr_schema) + + # Not a struct type + pa.int32()._export_to_c(ptr_schema) + make_batch()._export_to_c(ptr_array) + with pytest.raises(ValueError, + match="ArrowSchema describes non-struct type"): + pa.RecordBatch._import_from_c(ptr_array, ptr_schema) + # Now released + with assert_schema_released: + pa.RecordBatch._import_from_c(ptr_array, ptr_schema) diff --git a/python/pyarrow/types.pxi b/python/pyarrow/types.pxi index c92f3e45c15..ca1c4e9bc2f 100644 --- a/python/pyarrow/types.pxi +++ b/python/pyarrow/types.pxi @@ -189,6 +189,26 @@ cdef class DataType: else: raise NotImplementedError(str(self)) + def _export_to_c(self, uintptr_t out_ptr): + """ + Export to a C ArrowSchema struct, given its pointer. + + Be careful: if you don't pass the ArrowSchema struct to a consumer, + its memory will leak. This is a low-level function intended for + expert users. + """ + check_status(ExportType(deref(self.type), out_ptr)) + + @staticmethod + def _import_from_c(uintptr_t in_ptr): + """ + Import DataType from a C ArrowSchema struct, given its pointer. + + This is a low-level function intended for expert users. + """ + result = GetResultValue(ImportType( in_ptr)) + return pyarrow_wrap_data_type(result) + cdef class DictionaryMemo: """ @@ -1300,6 +1320,27 @@ cdef class Schema: new_schema = self.schema.RemoveMetadata() return pyarrow_wrap_schema(new_schema) + def _export_to_c(self, uintptr_t out_ptr): + """ + Export to a C ArrowSchema struct, given its pointer. + + Be careful: if you don't pass the ArrowSchema struct to a consumer, + its memory will leak. This is a low-level function intended for + expert users. + """ + check_status(ExportSchema(deref(self.schema), out_ptr)) + + @staticmethod + def _import_from_c(uintptr_t in_ptr): + """ + Import Schema from a C ArrowSchema struct, given its pointer. 
+ + This is a low-level function intended for expert users. + """ + with nogil: + result = GetResultValue(ImportSchema( in_ptr)) + return pyarrow_wrap_schema(result) + def __str__(self): cdef: c_string result diff --git a/python/requirements-test.txt b/python/requirements-test.txt index 4894d9c379a..a4f35f7f2d9 100644 --- a/python/requirements-test.txt +++ b/python/requirements-test.txt @@ -1,3 +1,4 @@ +cffi cython hypothesis pandas diff --git a/r/DESCRIPTION b/r/DESCRIPTION index 8aeba49a4b1..6de7ea82212 100644 --- a/r/DESCRIPTION +++ b/r/DESCRIPTION @@ -45,6 +45,7 @@ Suggests: knitr, lubridate, pkgload, + reticulate, rmarkdown, testthat, tibble @@ -76,6 +77,7 @@ Collate: 'memory-pool.R' 'message.R' 'parquet.R' + 'py-to-r.R' 'read-record-batch.R' 'read-table.R' 'record-batch-reader.R' diff --git a/r/R/arrow-package.R b/r/R/arrow-package.R index ffa79820861..6e3fae35127 100644 --- a/r/R/arrow-package.R +++ b/r/R/arrow-package.R @@ -39,6 +39,10 @@ s3_register(m, cl) } } + s3_register("reticulate::py_to_r", "pyarrow.lib.Array") + s3_register("reticulate::py_to_r", "pyarrow.lib.RecordBatch") + s3_register("reticulate::r_to_py", "Array") + s3_register("reticulate::r_to_py", "RecordBatch") invisible() } diff --git a/r/R/arrowExports.R b/r/R/arrowExports.R index 68cb3218c03..1e3d041fbc1 100644 --- a/r/R/arrowExports.R +++ b/r/R/arrowExports.R @@ -1216,6 +1216,46 @@ parquet___arrow___FileReader__GetSchema <- function(reader){ .Call(`_arrow_parquet___arrow___FileReader__GetSchema` , reader) } +ImportArray <- function(array, schema){ + .Call(`_arrow_ImportArray` , array, schema) +} + +ImportRecordBatch <- function(array, schema){ + .Call(`_arrow_ImportRecordBatch` , array, schema) +} + +allocate_arrow_schema <- function(){ + .Call(`_arrow_allocate_arrow_schema` ) +} + +delete_arrow_schema <- function(ptr){ + invisible(.Call(`_arrow_delete_arrow_schema` , ptr)) +} + +allocate_arrow_array <- function(){ + .Call(`_arrow_allocate_arrow_array` ) +} + +delete_arrow_array <- 
function(ptr){ + invisible(.Call(`_arrow_delete_arrow_array` , ptr)) +} + +ExportType <- function(type, ptr){ + invisible(.Call(`_arrow_ExportType` , type, ptr)) +} + +ExportSchema <- function(schema, ptr){ + invisible(.Call(`_arrow_ExportSchema` , schema, ptr)) +} + +ExportArray <- function(array, ptr, schema_ptr){ + invisible(.Call(`_arrow_ExportArray` , array, ptr, schema_ptr)) +} + +ExportRecordBatch <- function(batch, ptr, schema_ptr){ + invisible(.Call(`_arrow_ExportRecordBatch` , batch, ptr, schema_ptr)) +} + RecordBatch__num_columns <- function(x){ .Call(`_arrow_RecordBatch__num_columns` , x) } diff --git a/r/R/py-to-r.R b/r/R/py-to-r.R new file mode 100644 index 00000000000..74efedf53d4 --- /dev/null +++ b/r/R/py-to-r.R @@ -0,0 +1,58 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +py_to_r.pyarrow.lib.Array <- function(x, ...) 
{ + schema_ptr <- allocate_arrow_schema() + on.exit(delete_arrow_schema(schema_ptr)) + array_ptr <- allocate_arrow_array() + on.exit(delete_arrow_array(array_ptr), add = TRUE) # add = TRUE: append, so the schema cleanup above still runs + + x$`_export_to_c`(array_ptr, schema_ptr) + Array$create(ImportArray(array_ptr, schema_ptr)) +} + +r_to_py.Array <- function(x, convert = FALSE) { + schema_ptr <- allocate_arrow_schema() + on.exit(delete_arrow_schema(schema_ptr)) + array_ptr <- allocate_arrow_array() + on.exit(delete_arrow_array(array_ptr), add = TRUE) + + pa <- reticulate::import("pyarrow", convert = convert) + ExportArray(x, array_ptr, schema_ptr) + pa$Array$`_import_from_c`(array_ptr, schema_ptr) +} + +py_to_r.pyarrow.lib.RecordBatch <- function(x, ...) { + schema_ptr <- allocate_arrow_schema() + on.exit(delete_arrow_schema(schema_ptr)) + array_ptr <- allocate_arrow_array() + on.exit(delete_arrow_array(array_ptr), add = TRUE) + + x$`_export_to_c`(array_ptr, schema_ptr) + shared_ptr(RecordBatch, ImportRecordBatch(array_ptr, schema_ptr)) +} + +r_to_py.RecordBatch <- function(x, convert = FALSE) { + schema_ptr <- allocate_arrow_schema() + on.exit(delete_arrow_schema(schema_ptr)) + array_ptr <- allocate_arrow_array() + on.exit(delete_arrow_array(array_ptr), add = TRUE) + + pa <- reticulate::import("pyarrow", convert = convert) + ExportRecordBatch(x, array_ptr, schema_ptr) + pa$RecordBatch$`_import_from_c`(array_ptr, schema_ptr) +} diff --git a/r/src/arrowExports.cpp b/r/src/arrowExports.cpp index 66a85dc0375..9a8b52d822e 100644 --- a/r/src/arrowExports.cpp +++ b/r/src/arrowExports.cpp @@ -4753,6 +4753,168 @@ RcppExport SEXP _arrow_parquet___arrow___FileReader__GetSchema(SEXP reader_sexp) } #endif +// py-to-r.cpp +#if defined(ARROW_R_WITH_ARROW) +std::shared_ptr ImportArray(uintptr_t array, uintptr_t schema); +RcppExport SEXP _arrow_ImportArray(SEXP array_sexp, SEXP schema_sexp){ +BEGIN_RCPP + Rcpp::traits::input_parameter::type array(array_sexp); + Rcpp::traits::input_parameter::type schema(schema_sexp); + return Rcpp::wrap(ImportArray(array, schema)); +END_RCPP +} +#else
+RcppExport SEXP _arrow_ImportArray(SEXP array_sexp, SEXP schema_sexp){  // stub variant: its body only raises an R error
+  Rf_error("Cannot call ImportArray(). Please use arrow::install_arrow() to install required runtime libraries. ");
+}
+#endif
+// Glue for ImportRecordBatch(): unpack both SEXP arguments, call it, wrap the result for R.
+// py-to-r.cpp
+#if defined(ARROW_R_WITH_ARROW)
+std::shared_ptr ImportRecordBatch(uintptr_t array, uintptr_t schema);
+RcppExport SEXP _arrow_ImportRecordBatch(SEXP array_sexp, SEXP schema_sexp){
+BEGIN_RCPP  // Rcpp macro pair with END_RCPP (presumably maps C++ exceptions to R errors; confirm in Rcpp docs)
+  Rcpp::traits::input_parameter::type array(array_sexp);
+  Rcpp::traits::input_parameter::type schema(schema_sexp);
+  return Rcpp::wrap(ImportRecordBatch(array, schema));
+END_RCPP
+}
+#else  // ARROW_R_WITH_ARROW not defined: same symbol is still provided, but it only raises an R error
+RcppExport SEXP _arrow_ImportRecordBatch(SEXP array_sexp, SEXP schema_sexp){
+  Rf_error("Cannot call ImportRecordBatch(). Please use arrow::install_arrow() to install required runtime libraries. ");
+}
+#endif
+// Glue for allocate_arrow_schema(): takes no arguments; wraps the returned uintptr_t for R.
+// py-to-r.cpp
+#if defined(ARROW_R_WITH_ARROW)
+uintptr_t allocate_arrow_schema();
+RcppExport SEXP _arrow_allocate_arrow_schema(){
+BEGIN_RCPP
+  return Rcpp::wrap(allocate_arrow_schema());
+END_RCPP
+}
+#else
+RcppExport SEXP _arrow_allocate_arrow_schema(){
+  Rf_error("Cannot call allocate_arrow_schema(). Please use arrow::install_arrow() to install required runtime libraries. ");
+}
+#endif
+// Glue for delete_arrow_schema(): unpack the pointer argument, call it, return R_NilValue.
+// py-to-r.cpp
+#if defined(ARROW_R_WITH_ARROW)
+void delete_arrow_schema(uintptr_t ptr);
+RcppExport SEXP _arrow_delete_arrow_schema(SEXP ptr_sexp){
+BEGIN_RCPP
+  Rcpp::traits::input_parameter::type ptr(ptr_sexp);
+  delete_arrow_schema(ptr);
+  return R_NilValue;  // the wrapped function is void, so R receives NULL
+END_RCPP
+}
+#else
+RcppExport SEXP _arrow_delete_arrow_schema(SEXP ptr_sexp){
+  Rf_error("Cannot call delete_arrow_schema(). Please use arrow::install_arrow() to install required runtime libraries. ");
+}
+#endif
+// Glue for allocate_arrow_array(): takes no arguments; wraps the returned uintptr_t for R.
+// py-to-r.cpp
+#if defined(ARROW_R_WITH_ARROW)
+uintptr_t allocate_arrow_array();
+RcppExport SEXP _arrow_allocate_arrow_array(){
+BEGIN_RCPP
+  return Rcpp::wrap(allocate_arrow_array());
+END_RCPP
+}
+#else
+RcppExport SEXP _arrow_allocate_arrow_array(){
+  Rf_error("Cannot call allocate_arrow_array(). Please use arrow::install_arrow() to install required runtime libraries. ");
+}
+#endif
+// Glue for delete_arrow_array(): unpack the pointer argument, call it, return R_NilValue.
+// py-to-r.cpp
+#if defined(ARROW_R_WITH_ARROW)
+void delete_arrow_array(uintptr_t ptr);
+RcppExport SEXP _arrow_delete_arrow_array(SEXP ptr_sexp){
+BEGIN_RCPP
+  Rcpp::traits::input_parameter::type ptr(ptr_sexp);
+  delete_arrow_array(ptr);
+  return R_NilValue;
+END_RCPP
+}
+#else
+RcppExport SEXP _arrow_delete_arrow_array(SEXP ptr_sexp){
+  Rf_error("Cannot call delete_arrow_array(). Please use arrow::install_arrow() to install required runtime libraries. ");
+}
+#endif
+// Glue for ExportType(): unpack the type and the pointer argument, call it, return R_NilValue.
+// py-to-r.cpp
+#if defined(ARROW_R_WITH_ARROW)
+void ExportType(const std::shared_ptr& type, uintptr_t ptr);
+RcppExport SEXP _arrow_ExportType(SEXP type_sexp, SEXP ptr_sexp){
+BEGIN_RCPP
+  Rcpp::traits::input_parameter&>::type type(type_sexp);
+  Rcpp::traits::input_parameter::type ptr(ptr_sexp);
+  ExportType(type, ptr);
+  return R_NilValue;
+END_RCPP
+}
+#else
+RcppExport SEXP _arrow_ExportType(SEXP type_sexp, SEXP ptr_sexp){
+  Rf_error("Cannot call ExportType(). Please use arrow::install_arrow() to install required runtime libraries. ");
+}
+#endif
+// Glue for ExportSchema(): unpack the schema and the pointer argument, call it, return R_NilValue.
+// py-to-r.cpp
+#if defined(ARROW_R_WITH_ARROW)
+void ExportSchema(const std::shared_ptr& schema, uintptr_t ptr);
+RcppExport SEXP _arrow_ExportSchema(SEXP schema_sexp, SEXP ptr_sexp){
+BEGIN_RCPP
+  Rcpp::traits::input_parameter&>::type schema(schema_sexp);
+  Rcpp::traits::input_parameter::type ptr(ptr_sexp);
+  ExportSchema(schema, ptr);
+  return R_NilValue;
+END_RCPP
+}
+#else
+RcppExport SEXP _arrow_ExportSchema(SEXP schema_sexp, SEXP ptr_sexp){
+  Rf_error("Cannot call ExportSchema(). Please use arrow::install_arrow() to install required runtime libraries. ");
+}
+#endif
+// Glue for ExportArray(): unpack the array plus both pointer arguments, call it, return R_NilValue.
+// py-to-r.cpp
+#if defined(ARROW_R_WITH_ARROW)
+void ExportArray(const std::shared_ptr& array, uintptr_t ptr, uintptr_t schema_ptr);
+RcppExport SEXP _arrow_ExportArray(SEXP array_sexp, SEXP ptr_sexp, SEXP schema_ptr_sexp){
+BEGIN_RCPP
+  Rcpp::traits::input_parameter&>::type array(array_sexp);
+  Rcpp::traits::input_parameter::type ptr(ptr_sexp);
+  Rcpp::traits::input_parameter::type schema_ptr(schema_ptr_sexp);
+  ExportArray(array, ptr, schema_ptr);
+  return R_NilValue;
+END_RCPP
+}
+#else
+RcppExport SEXP _arrow_ExportArray(SEXP array_sexp, SEXP ptr_sexp, SEXP schema_ptr_sexp){
+  Rf_error("Cannot call ExportArray(). Please use arrow::install_arrow() to install required runtime libraries. ");
+}
+#endif
+// Glue for ExportRecordBatch(): unpack the batch plus both pointer arguments, call it, return R_NilValue.
+// py-to-r.cpp
+#if defined(ARROW_R_WITH_ARROW)
+void ExportRecordBatch(const std::shared_ptr& batch, uintptr_t ptr, uintptr_t schema_ptr);
+RcppExport SEXP _arrow_ExportRecordBatch(SEXP batch_sexp, SEXP ptr_sexp, SEXP schema_ptr_sexp){
+BEGIN_RCPP
+  Rcpp::traits::input_parameter&>::type batch(batch_sexp);
+  Rcpp::traits::input_parameter::type ptr(ptr_sexp);
+  Rcpp::traits::input_parameter::type schema_ptr(schema_ptr_sexp);
+  ExportRecordBatch(batch, ptr, schema_ptr);
+  return R_NilValue;
+END_RCPP
+}
+#else
+RcppExport SEXP _arrow_ExportRecordBatch(SEXP batch_sexp, SEXP ptr_sexp, SEXP schema_ptr_sexp){
+  Rf_error("Cannot call ExportRecordBatch(). Please use arrow::install_arrow() to install required runtime libraries. ");
+}
+#endif
+
 // recordbatch.cpp
 #if defined(ARROW_R_WITH_ARROW)
 int RecordBatch__num_columns(const std::shared_ptr& x);
@@ -6005,6 +6167,16 @@ static const R_CallMethodDef CallEntries[] = {
     { "_arrow_parquet___arrow___FileWriter__Close", (DL_FUNC) &_arrow_parquet___arrow___FileWriter__Close, 1},
     { "_arrow_parquet___arrow___WriteTable", (DL_FUNC) &_arrow_parquet___arrow___WriteTable, 4},
     { "_arrow_parquet___arrow___FileReader__GetSchema", (DL_FUNC) &_arrow_parquet___arrow___FileReader__GetSchema, 1},
+    { "_arrow_ImportArray", (DL_FUNC) &_arrow_ImportArray, 2},  // last field equals the wrapper's SEXP parameter count
+    { "_arrow_ImportRecordBatch", (DL_FUNC) &_arrow_ImportRecordBatch, 2},
+    { "_arrow_allocate_arrow_schema", (DL_FUNC) &_arrow_allocate_arrow_schema, 0},
+    { "_arrow_delete_arrow_schema", (DL_FUNC) &_arrow_delete_arrow_schema, 1},
+    { "_arrow_allocate_arrow_array", (DL_FUNC) &_arrow_allocate_arrow_array, 0},
+    { "_arrow_delete_arrow_array", (DL_FUNC) &_arrow_delete_arrow_array, 1},
+    { "_arrow_ExportType", (DL_FUNC) &_arrow_ExportType, 2},
+    { "_arrow_ExportSchema", (DL_FUNC) &_arrow_ExportSchema, 2},
+    { "_arrow_ExportArray", (DL_FUNC) &_arrow_ExportArray, 3},
+    { "_arrow_ExportRecordBatch", (DL_FUNC) &_arrow_ExportRecordBatch, 3},
     { "_arrow_RecordBatch__num_columns", (DL_FUNC) &_arrow_RecordBatch__num_columns, 1},
     { "_arrow_RecordBatch__num_rows", (DL_FUNC) &_arrow_RecordBatch__num_rows, 1},
     { "_arrow_RecordBatch__schema", (DL_FUNC) &_arrow_RecordBatch__schema, 1},
diff --git a/r/src/arrow_types.h b/r/src/arrow_types.h
index 5facffa10d5..3286619e12c 100644
--- a/r/src/arrow_types.h
+++ b/r/src/arrow_types.h
@@ -196,6 +196,7 @@ inline std::shared_ptr extract(SEXP x) {
 #if defined(ARROW_R_WITH_ARROW)
 #include
+#include
 #include
 #include
 #include
diff --git a/r/src/py-to-r.cpp b/r/src/py-to-r.cpp
new file mode 100644
index 00000000000..491ee5730ea
--- /dev/null
+++ b/r/src/py-to-r.cpp
@@ -0,0 +1,77 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.
See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "./arrow_types.h"
+// Exported helpers that move Arrow objects in and out of ArrowArray / ArrowSchema structs addressed by integer.
+#if defined(ARROW_R_WITH_ARROW)
+// Hand the addresses `array` and `schema` (cast to pointers) to arrow::ImportArray; VALUE_OR_STOP unwraps the result.
+// [[arrow::export]]
+std::shared_ptr ImportArray(uintptr_t array, uintptr_t schema) {
+  return VALUE_OR_STOP(arrow::ImportArray(reinterpret_cast(array),
+                                          reinterpret_cast(schema)));
+}
+// Same as ImportArray above, but goes through arrow::ImportRecordBatch.
+// [[arrow::export]]
+std::shared_ptr ImportRecordBatch(uintptr_t array, uintptr_t schema) {
+  return VALUE_OR_STOP(
+      arrow::ImportRecordBatch(reinterpret_cast(array),
+                               reinterpret_cast(schema)));
+}
+// Heap-allocate an ArrowSchema and return its address as an integer (presumably freed via delete_arrow_schema()).
+// [[arrow::export]]
+uintptr_t allocate_arrow_schema() { return reinterpret_cast(new ArrowSchema); }  // NOTE(review): no initializer; confirm callers fill the struct before it is read
+// Delete the object at address `ptr`; presumably one obtained from allocate_arrow_schema() (confirm).
+// [[arrow::export]]
+void delete_arrow_schema(uintptr_t ptr) {
+  delete reinterpret_cast(ptr);
+}
+// Heap-allocate an ArrowArray and return its address as an integer (presumably freed via delete_arrow_array()).
+// [[arrow::export]]
+uintptr_t allocate_arrow_array() { return reinterpret_cast(new ArrowArray); }  // NOTE(review): no initializer; confirm callers fill the struct before it is read
+// Delete the object at address `ptr`; presumably one obtained from allocate_arrow_array() (confirm).
+// [[arrow::export]]
+void delete_arrow_array(uintptr_t ptr) {
+  delete reinterpret_cast(ptr);
+}
+// Pass `*type` and the pointer form of `ptr` to arrow::ExportType; the returned status goes to STOP_IF_NOT_OK.
+// [[arrow::export]]
+void ExportType(const std::shared_ptr& type, uintptr_t ptr) {
+  STOP_IF_NOT_OK(arrow::ExportType(*type, reinterpret_cast(ptr)));
+}
+// Pass `*schema` and the pointer form of `ptr` to arrow::ExportSchema; the returned status goes to STOP_IF_NOT_OK.
+// [[arrow::export]]
+void ExportSchema(const std::shared_ptr& schema, uintptr_t ptr) {
+  STOP_IF_NOT_OK(
+      arrow::ExportSchema(*schema, reinterpret_cast(ptr)));
+}
+// Pass `*array` plus the pointer forms of `ptr` and `schema_ptr` to arrow::ExportArray; status goes to STOP_IF_NOT_OK.
+// [[arrow::export]]
+void ExportArray(const std::shared_ptr& array, uintptr_t ptr,
+                 uintptr_t schema_ptr) {
+  STOP_IF_NOT_OK(arrow::ExportArray(*array, reinterpret_cast(ptr),
+                                    reinterpret_cast(schema_ptr)));
+}
+// Pass `*batch` plus the pointer forms of `ptr` and `schema_ptr` to arrow::ExportRecordBatch; status goes to STOP_IF_NOT_OK.
+// [[arrow::export]]
+void ExportRecordBatch(const std::shared_ptr& batch, uintptr_t ptr,
+                       uintptr_t schema_ptr) {
+  STOP_IF_NOT_OK(
+      arrow::ExportRecordBatch(*batch, reinterpret_cast(ptr),
+                               reinterpret_cast(schema_ptr)));
+}
+
+#endif  // ARROW_R_WITH_ARROW
diff --git a/r/tests/testthat/helper-skip.R b/r/tests/testthat/helper-skip.R
index b87c5d6ef45..d6489fb0093 100644
--- a/r/tests/testthat/helper-skip.R
+++ b/r/tests/testthat/helper-skip.R
@@ -21,3 +21,10 @@ skip_if_not_available <- function(feature) {
     skip(paste("Arrow C++ not built with support for", feature))
   }
 }
+# Skip the calling test unless reticulate is installed and the "pyarrow" Python module is available.
+skip_if_no_pyarrow <- function() {
+  skip_if_not_installed("reticulate")  # checked first, so the reticulate:: call below is only reached when it is installed
+  if (!reticulate::py_module_available("pyarrow")) {
+    skip("pyarrow not available for testing")
+  }
+}
diff --git a/r/tests/testthat/test-python.R b/r/tests/testthat/test-python.R
new file mode 100644
index 00000000000..d78ce2afc5b
--- /dev/null
+++ b/r/tests/testthat/test-python.R
@@ -0,0 +1,43 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+context("To/from Python")
+# An array built through pyarrow should compare equal to the same values built with Array$create().
+test_that("Array from Python", {
+  skip_if_no_pyarrow()
+  pa <- reticulate::import("pyarrow")  # no convert= argument here, unlike the two tests below
+  py <- pa$array(c(1, 2, 3))  # presumably already converted to an R Array by reticulate's default conversion (confirm)
+  expect_equal(py, Array$create(c(1, 2, 3)))
+})
+# An R Array handed to a pyarrow function should come back as a pyarrow.lib.Array that py_to_r() restores.
+test_that("Array to Python", {
+  skip_if_no_pyarrow()
+  pa <- reticulate::import("pyarrow", convert=FALSE)  # convert=FALSE: results stay Python objects, as expect_is() checks
+  r <- Array$create(c(1, 2, 3))
+  py <- pa$concat_arrays(list(r))  # passing `r` to a Python function exercises the R-to-Python conversion
+  expect_is(py, "pyarrow.lib.Array")
+  expect_equal(reticulate::py_to_r(py), r)  # round trip back to R must reproduce the original Array
+})
+# A RecordBatch converted with r_to_py() should be a pyarrow.lib.RecordBatch that py_to_r() restores.
+test_that("RecordBatch to/from Python", {
+  skip_if_no_pyarrow()
+  pa <- reticulate::import("pyarrow", convert=FALSE)  # NOTE(review): `pa` is not referenced again in this test
+  batch <- record_batch(col1=c(1, 2, 3), col2=letters[1:3])
+  py <- reticulate::r_to_py(batch)
+  expect_is(py, "pyarrow.lib.RecordBatch")
+  expect_equal(reticulate::py_to_r(py), batch)
+})