Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion c_glib/test/test-array.rb
Original file line number Diff line number Diff line change
Expand Up @@ -154,7 +154,7 @@ def test_diff
def test_different_type
array = build_string_array(["Start", "Shutdown", "Reboot"])
other_array = build_int8_array([2, 3, 6, 10])
assert_equal("# Array types differed: string vs int8",
assert_equal("# Array types differed: string vs int8\n",
array.diff_unified(other_array))
end
end
Expand Down
1 change: 1 addition & 0 deletions ci/conda_env_python.yml
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
# under the License.

# don't add pandas here, because it is not a mandatory test dependency
cffi
cython
cloudpickle
hypothesis
Expand Down
1 change: 1 addition & 0 deletions ci/conda_env_r.yml
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ r-covr
r-hms
r-lubridate
r-rcmdcheck
r-reticulate
r-rmarkdown
r-testthat
r-tibble
1 change: 1 addition & 0 deletions ci/docker/conda-r.dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -48,5 +48,6 @@ ENV ARROW_BUILD_STATIC=OFF \
ARROW_ORC=OFF \
ARROW_PARQUET=ON \
ARROW_PLASMA=OFF \
ARROW_USE_CCACHE=ON \
ARROW_USE_GLOG=OFF \
LC_ALL=en_US.UTF-8
20 changes: 19 additions & 1 deletion ci/docker/linux-apt-r.dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,11 @@ RUN apt-get update -y && \
# R CMD CHECK --as-cran needs pdflatex to build the package manual
texlive-latex-base \
# Need locales so we can set UTF-8
locales && \
locales \
# Need Python to check py-to-r bridge
python3 \
python3-pip \
python3-dev && \
locale-gen en_US.UTF-8 && \
apt-get clean && \
rm -rf /var/lib/apt/lists/*
Expand All @@ -63,6 +67,18 @@ COPY ci/scripts/r_deps.sh /arrow/ci/scripts/
COPY r/DESCRIPTION /arrow/r/
RUN /arrow/ci/scripts/r_deps.sh /arrow

# Set up Python 3 and its dependencies
RUN ln -s /usr/bin/python3 /usr/local/bin/python && \
ln -s /usr/bin/pip3 /usr/local/bin/pip

COPY python/requirements.txt \
python/requirements-test.txt \
/arrow/python/

RUN pip install \
-r arrow/python/requirements.txt \
cython setuptools

ENV \
ARROW_BUILD_STATIC=OFF \
ARROW_BUILD_TESTS=OFF \
Expand All @@ -74,5 +90,7 @@ ENV \
ARROW_ORC=OFF \
ARROW_PARQUET=ON \
ARROW_PLASMA=OFF \
ARROW_PYTHON=ON \
ARROW_USE_CCACHE=ON \
ARROW_USE_GLOG=OFF \
LC_ALL=en_US.UTF-8
2 changes: 2 additions & 0 deletions cpp/src/arrow/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -139,6 +139,7 @@ set(ARROW_SRCS
tensor.cc
type.cc
visitor.cc
c/bridge.cc
io/buffered.cc
io/compressed.cc
io/file.cc
Expand Down Expand Up @@ -278,6 +279,7 @@ add_subdirectory(testing)
#

add_subdirectory(array)
add_subdirectory(c)
add_subdirectory(io)
add_subdirectory(util)
add_subdirectory(vendored)
Expand Down
6 changes: 6 additions & 0 deletions cpp/src/arrow/array.cc
Original file line number Diff line number Diff line change
Expand Up @@ -716,6 +716,12 @@ Result<std::shared_ptr<StructArray>> StructArray::Make(
if (offset > length) {
return Status::IndexError("Offset greater than length of child arrays");
}
if (null_bitmap == nullptr) {
if (null_count > 0) {
return Status::Invalid("null_count = ", null_count, " but no null bitmap given");
}
null_count = 0;
}
return std::make_shared<StructArray>(struct_(fields), length - offset, children,
null_bitmap, null_count, offset);
}
Expand Down
6 changes: 3 additions & 3 deletions cpp/src/arrow/array/builder_primitive.h
Original file line number Diff line number Diff line change
Expand Up @@ -83,15 +83,15 @@ class NumericBuilder : public ArrayBuilder {
/// uninitialized memory access
Status AppendNulls(int64_t length) final {
ARROW_RETURN_NOT_OK(Reserve(length));
data_builder_.UnsafeAppend(length, static_cast<value_type>(0));
data_builder_.UnsafeAppend(length, value_type{}); // zero
UnsafeSetNull(length);
return Status::OK();
}

/// \brief Append a single null element
Status AppendNull() final {
ARROW_RETURN_NOT_OK(Reserve(1));
data_builder_.UnsafeAppend(static_cast<value_type>(0));
data_builder_.UnsafeAppend(value_type{}); // zero
UnsafeAppendToBitmap(false);
return Status::OK();
}
Expand Down Expand Up @@ -243,7 +243,7 @@ class NumericBuilder : public ArrayBuilder {

void UnsafeAppendNull() {
ArrayBuilder::UnsafeAppendToBitmap(false);
data_builder_.UnsafeAppend(0);
data_builder_.UnsafeAppend(value_type{}); // zero
}

std::shared_ptr<DataType> type() const override { return type_; }
Expand Down
41 changes: 6 additions & 35 deletions cpp/src/arrow/array/builder_time.h
Original file line number Diff line number Diff line change
Expand Up @@ -21,52 +21,23 @@

#include <memory>

#include "arrow/array.h"
#include "arrow/array/builder_base.h"
#include "arrow/array/builder_binary.h"
#include "arrow/array/builder_primitive.h"
#include "arrow/buffer_builder.h"
#include "arrow/status.h"
#include "arrow/type_traits.h"
#include "arrow/util/macros.h"

namespace arrow {

class ARROW_EXPORT DayTimeIntervalBuilder : public ArrayBuilder {
// TODO this class is untested

class ARROW_EXPORT DayTimeIntervalBuilder : public NumericBuilder<DayTimeIntervalType> {
public:
using TypeClass = DayTimeIntervalType;
using DayMilliseconds = DayTimeIntervalType::DayMilliseconds;

explicit DayTimeIntervalBuilder(MemoryPool* pool ARROW_MEMORY_POOL_DEFAULT)
: DayTimeIntervalBuilder(day_time_interval(), pool) {}

DayTimeIntervalBuilder(std::shared_ptr<DataType> type,
MemoryPool* pool ARROW_MEMORY_POOL_DEFAULT)
: ArrayBuilder(pool), builder_(fixed_size_binary(sizeof(DayMilliseconds)), pool) {}

void Reset() override { builder_.Reset(); }
Status Resize(int64_t capacity) override { return builder_.Resize(capacity); }
Status Append(DayMilliseconds day_millis) {
return builder_.Append(reinterpret_cast<uint8_t*>(&day_millis));
}
void UnsafeAppend(DayMilliseconds day_millis) {
builder_.UnsafeAppend(reinterpret_cast<uint8_t*>(&day_millis));
}
using ArrayBuilder::UnsafeAppendNull;
Status AppendNull() override { return builder_.AppendNull(); }
Status AppendNulls(int64_t length) override { return builder_.AppendNulls(length); }
Status FinishInternal(std::shared_ptr<ArrayData>* out) override {
auto result = builder_.FinishInternal(out);
if (*out != NULLPTR) {
(*out)->type = type();
}
return result;
}

std::shared_ptr<DataType> type() const override { return day_time_interval(); }

private:
FixedSizeBinaryBuilder builder_;
explicit DayTimeIntervalBuilder(std::shared_ptr<DataType> type,
MemoryPool* pool ARROW_MEMORY_POOL_DEFAULT)
: NumericBuilder<DayTimeIntervalType>(type, pool) {}
};

} // namespace arrow
2 changes: 1 addition & 1 deletion cpp/src/arrow/array/diff_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -152,7 +152,7 @@ TEST_F(DiffTest, Errors) {
ASSERT_RAISES(TypeError, Diff(*base_, *target_, default_memory_pool()));

ASSERT_FALSE(base_->Equals(*target_, EqualOptions().diff_sink(&formatted)));
ASSERT_EQ(formatted.str(), R"(# Array types differed: int32 vs string)");
ASSERT_EQ(formatted.str(), "# Array types differed: int32 vs string\n");
}

template <typename ArrowType>
Expand Down
16 changes: 10 additions & 6 deletions cpp/src/arrow/array/validate.cc
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
#include "arrow/array/validate.h"

#include "arrow/array.h"
#include "arrow/util/bit_util.h"
#include "arrow/util/int_util.h"
#include "arrow/util/logging.h"
#include "arrow/visitor_inline.h"
Expand All @@ -41,11 +42,13 @@ struct ValidateArrayVisitor {
ARROW_RETURN_IF(array.data()->buffers.size() != 2,
Status::Invalid("number of buffers is != 2"));

if (array.length() > 0 && array.data()->buffers[1] == nullptr) {
return Status::Invalid("values buffer is null");
}
if (array.length() > 0 && array.values() == nullptr) {
return Status::Invalid("values is null");
if (array.length() > 0) {
if (array.data()->buffers[1] == nullptr) {
return Status::Invalid("values buffer is null");
}
if (array.values() == nullptr) {
return Status::Invalid("values is null");
}
}
return Status::OK();
}
Expand Down Expand Up @@ -265,7 +268,8 @@ struct ValidateArrayVisitor {

auto value_offsets = array.value_offsets();
if (value_offsets == nullptr) {
if (array.length() != 0) {
// For length 0, an empty offsets array seems accepted as a special case (ARROW-544)
if (array.length() > 0) {
return Status::Invalid("non-empty array but value_offsets_ is null");
}
return Status::OK();
Expand Down
22 changes: 22 additions & 0 deletions cpp/src/arrow/c/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.

add_arrow_test(bridge_test PREFIX "arrow-c")

add_arrow_benchmark(bridge_benchmark)

arrow_install_all_headers("arrow/c")
65 changes: 65 additions & 0 deletions cpp/src/arrow/c/abi.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

#pragma once

#include <stdint.h>

#ifdef __cplusplus
extern "C" {
#endif

#define ARROW_FLAG_DICTIONARY_ORDERED 1
#define ARROW_FLAG_NULLABLE 2
#define ARROW_FLAG_MAP_KEYS_SORTED 4

struct ArrowSchema {
// Array type description
const char* format;
const char* name;
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I noticed elsewhere that this is declared optional. Why would we make name optional? Isn't that an anti-pattern?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In C++ land, a top-level Arrow type doesn't have a name. There's a distinction between "field" and "type": a field is a type + a name; but a Arrow array only has a type, not a field.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Maybe state that in the definition, that only a top-level name should generally be empty. I don't think all libraries can handle having null names at non schema roots. The language here suggests that it doesn't really matter when most of the time, it should be defined.

Copy link
Member Author

@pitrou pitrou Dec 22, 2019

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Since a null name can easily be interpreted as an empty name, is it a problem (e.g. in Java) if child nodes have empty names?

const char* metadata;
int64_t flags;
int64_t n_children;
struct ArrowSchema** children;
struct ArrowSchema* dictionary;

// Release callback
void (*release)(struct ArrowSchema*);
// Opaque producer-specific data
void* private_data;
};

struct ArrowArray {
// Array data description
int64_t length;
int64_t null_count;
int64_t offset;
Copy link
Contributor

@jacques-n jacques-n Dec 16, 2019

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

offset and n_buffers seem out of place here. What is the rationale for including these? n_buffers is known by definition so why add it here? offset seems like an optimization that could solved by shifting the pointers rather than providing an offset.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Note, I understand that offset can allow one to avoid bit-shifting but that seems like something we shouldn't support or tell people to do.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The cpp code supports slices of arrays/record-batches without doing a copy, by just tracking the offset (even for the validity buffers). shifting the pointers doesn't work for validity bitmaps.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah, without the offset member, zero-copy slicing is simply not possible. This would be a serious deficiency

Copy link
Contributor

@jacques-n jacques-n Dec 16, 2019

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is consciously not supported at the arrow format level (unless my memory fails me). I don't see why we should support it here. This addition would make this coupled to the cpp implementation (and incompatible with other implementations), which it shouldn't be.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It seems clear that we have different opinions about whether this pattern should be supported. I'm strongly against it for both technical reasons and because it isn't supported by the Arrow specification. I'm sure we came to our conclusions through different experiences. Let me provide a little background on my perspective.

The java codebase actually did have this concept at one point (well before it was part of the Arrow project) and we chose to eliminate it as we ultimately came to the conclusion that it made the wrong tradeoffs between efficient calculations and efficient slicing. What we discovered was that any time you were trying to do zero-copy slicing, doing so on an eight value boundary was a much better design choice than have the additional complexity to deal with bit-shifting on the computation side.

I see excluding this attribute/option as a prudent design choice that kernel developers will benefit from. In general, I expect there to be far more computation algorithms and kernels being written than slicing logic. The number of times that computational code becomes more complex on the processing side with the offset field seems to far outweigh any benefits of byte-splitting on the slicing side.

It would be helpful to better understand why you feel this is so crippling? Which specific algorithms do you see impacted by disallowing byte-splitting? How common are these and what do you think the cost is of not supporting this. How does that compare with your sense of the cost of dealing with this in every computational algorithm?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If this data interface were restricted to the domain of computational kernel evaluation, then I would agree with you, but there is a wider diversity of use cases that the project supports. Note that in Dremio the slice-and-copy pattern is less onerous because record batches are generally limited in size to 64K elements and less, so the calculus is different when record batches can be arbitrarily large in general.

So, first we must consider it already as a given that most implementations are going to provide zero-copy slicing as a feature for in-memory data. The question then is how to handle slices with offset that is not a multiple of 8. There are 3 options:

  • Disallow such slicing at the C interface boundary, forcing copy-materialization for all use cases
  • Allow slicing, but regularize sliced bitmaps (and the data buffer for sliced BooleanArray) for all computational kernels, i.e. assume an offset of 0 for all kernels
  • Allow slicing and implement kernels for non-zero offsets

The 2nd option seems like the most practical -- the interface is always zero copy and kernel developers don't bother trying to optimize for non-zero offsets. It does mean that some functions that are unable to act on sliced data must check whether the offset is 0 and if not, convert the input to be offset-0 based (allocating a new bitmap and copying over the bits if necessary)

I'm not going to block up the work over this but it would feel like a shame to me to have a C data interface which is not zero copy all of the time.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't think this would be tenable in C++

Why not? Where is the critical need for allowing non-regularized slicing? Requiring that anything that is slicing the data does so div-8 boundary seems like a pretty low-burden constraint. Are you seeing lots of algorithms where this is high burden?

I'm not arguing here for copy materialization, I'm saying we should make non-div-8 slices invalid and have the ability for zero copy slices and computational simplicity.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Requiring that anything that is slicing the data does so div-8 boundary seems like a pretty low-burden constraint

I think the use-case is a user chooses an arbitrary slice. E.g. I have datatable, and I choose to slice rows 1 to N. the library shouldn't throw an exception here, so it is up to implementors to either do the copy or keep the slice index. But perhaps there is a third option?

Copy link
Member

@wesm wesm Jan 13, 2020

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Where is the critical need for allowing non-regularized slicing?

As background, I started my career in financial analytics where the world is more oriented around time series data where slicing and analytics on large memory-mapped data sets is the general expectation. Think application of rolling time series windows functions. Consider as an example such system oriented around this kind of data https://en.wikipedia.org/wiki/Kdb%2B

This kind of analytics is very different from batch processing SQL based workloads. In particular, with time series data it is not possible to constrain slices to multiples of 8.

Apache Arrow already has a lot of users from the financial analytics world and many of them are going to be using C++. If you start requiring memory copying every time you request an arbitrary window of data (big or small, a window could be 1KB or 1GB) from a large time series you cripple this use case.

int64_t n_buffers;
int64_t n_children;
const void** buffers;
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I can't remember. Do we have required alignment here? Anything we should note in the header?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Alignment and padding specification is simply part of the host system's C ABI. By making this a C struct (not an explicitly defined binary encoding), we follow the C ABI.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I wasn't talking about the alignment of the information in the C struct, I was talking about the alignment of the buffers themselves. Don't we declare elsewhere that buffers should be 8 or 64 byte aligned (?).

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In the columnar format, in-memory alignment is a recommendation but not a requirement, so I think the same would hold here. As my intuition, when referencing "other people's memory" in process it would be onerous to require a memory copy to satisfy a pointer alignment requirement.

The IPC message protocol does enforce alignment and padding at the serialization boundary, however.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

should that be a flag then? The memory is guaranteed to be aligned. Do all the libraries handle non-aligned memory?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Having a flag seems like a good idea. I believe all the libraries do handle non-aligned memory

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Memory alignment nowadays is more an optimization than a hardware requirement, on common architectures at least. x86 happily accepts unaligned memory accesses, and so does ARM64. Legacy architectures like Sparc may have stricter requirements. See also https://news.ycombinator.com/item?id=14769797

I think a flag may just add boilerplate to the implementation (you have to check all buffers are suitably aligned) without any practical value, because consumers are unlikely to do anything with it (why error out if a buffer is unaligned? your CPU / OS combination is still able to handle it fine).

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think that is true for individual values e.g. int32 but not for wider simd instructions. I also think that this influences the compiler's ability to do certain vectorized optimizations. The flag would be optional so if you are using a library or use case that guarantees this, you'd set it. Otherwise, you wouldn't (even if your buffers could be aligned). I don't think anyone would actually scan buffers in case they are aligned (which is what it seems like you describe).

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

but not for wider simd instructions

That does not seem to be the case for modern instruction sets and implementations. See x86 and ARM. I may be missing some important data point, though.

In any case, which kind of alignment would this flag enforce? 64-bytes alignment?
(for reference, the C++ implementation probably wouldn't set this flag because, while it makes an effort to allocate aligned buffers, it also accepts slice buffers or buffers allocated by foreign runtimes e.g. CPython)

struct ArrowArray** children;
struct ArrowArray* dictionary;

// Release callback
void (*release)(struct ArrowArray*);
// Opaque producer-specific data
void* private_data;
};

#ifdef __cplusplus
}
#endif
Loading