Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions cpp/src/arrow/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -356,10 +356,10 @@ if(ARROW_COMPUTE)
compute/kernels/scalar_set_lookup.cc
compute/kernels/scalar_string.cc
compute/kernels/scalar_validity.cc
compute/kernels/vector_filter.cc
compute/kernels/util_internal.cc
compute/kernels/vector_hash.cc
compute/kernels/vector_sort.cc
compute/kernels/vector_take.cc)
compute/kernels/vector_selection.cc
compute/kernels/vector_sort.cc)
endif()

if(ARROW_FILESYSTEM)
Expand Down
2 changes: 2 additions & 0 deletions cpp/src/arrow/array/array_binary.h
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,8 @@ class BaseBinaryArray : public FlatArray {
return raw_value_offsets_ + data_->offset;
}

const uint8_t* raw_data() const { return raw_data_; }

/// \brief Return the data buffer absolute offset of the data for the value
/// at the passed index.
///
Expand Down
5 changes: 4 additions & 1 deletion cpp/src/arrow/compute/api_vector.cc
Original file line number Diff line number Diff line change
Expand Up @@ -21,8 +21,8 @@
#include <utility>
#include <vector>

#include "arrow/array/builder_primitive.h"
#include "arrow/compute/exec.h"
#include "arrow/compute/kernels/vector_selection_internal.h"
#include "arrow/datum.h"
#include "arrow/record_batch.h"
#include "arrow/result.h"
Expand Down Expand Up @@ -65,6 +65,9 @@ Result<std::shared_ptr<Array>> ValueCounts(const Datum& value, ExecContext* ctx)
return result.make_array();
}

// ----------------------------------------------------------------------
// Filter- and take-related selection functions

Result<Datum> Filter(const Datum& values, const Datum& filter,
const FilterOptions& options, ExecContext* ctx) {
// Invoke metafunction which deals with Datum kinds other than just Array,
Expand Down
23 changes: 22 additions & 1 deletion cpp/src/arrow/compute/api_vector.h
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,10 @@ struct FilterOptions : public FunctionOptions {
EMIT_NULL,
};

static FilterOptions Defaults() { return FilterOptions{}; }
explicit FilterOptions(NullSelectionBehavior null_selection = DROP)
: null_selection_behavior(null_selection) {}

static FilterOptions Defaults() { return FilterOptions(); }

NullSelectionBehavior null_selection_behavior = DROP;
};
Expand All @@ -64,6 +67,24 @@ Result<Datum> Filter(const Datum& values, const Datum& filter,
const FilterOptions& options = FilterOptions::Defaults(),
ExecContext* ctx = NULLPTR);

namespace internal {

// These internal functions are implemented in kernels/vector_selection.cc

/// \brief Return the number of selected indices in the boolean filter
ARROW_EXPORT
int64_t GetFilterOutputSize(const ArrayData& filter,
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This should probably be extracted as a ScalarFunction named popcount or so (follow up)

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

OK

FilterOptions::NullSelectionBehavior null_selection);

/// \brief Compute uint64 selection indices for use with Take given a boolean
/// filter
ARROW_EXPORT
Result<std::shared_ptr<ArrayData>> GetTakeIndices(
const ArrayData& filter, FilterOptions::NullSelectionBehavior null_selection,
MemoryPool* memory_pool = default_memory_pool());

} // namespace internal

struct ARROW_EXPORT TakeOptions : public FunctionOptions {
explicit TakeOptions(bool boundscheck = true) : boundscheck(boundscheck) {}

Expand Down
4 changes: 3 additions & 1 deletion cpp/src/arrow/compute/benchmark_util.h
Original file line number Diff line number Diff line change
Expand Up @@ -24,9 +24,11 @@
#include "arrow/util/cpu_info.h"

namespace arrow {
namespace compute {

using internal::CpuInfo;

namespace compute {

static CpuInfo* cpu_info = CpuInfo::GetInstance();

static const int64_t kL1Size = cpu_info->CacheSize(CpuInfo::L1_CACHE);
Expand Down
3 changes: 1 addition & 2 deletions cpp/src/arrow/compute/kernels/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -38,9 +38,8 @@ add_arrow_benchmark(scalar_string_benchmark PREFIX "arrow-compute")

add_arrow_compute_test(vector_test
SOURCES
vector_filter_test.cc
vector_hash_test.cc
vector_take_test.cc
vector_selection_test.cc
vector_sort_test.cc
test_util.cc)

Expand Down
2 changes: 1 addition & 1 deletion cpp/src/arrow/compute/kernels/scalar_arithmetic_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -97,7 +97,7 @@ std::string MakeArray(Elements... elements) {
std::copy(elements_as_strings.begin(), elements_as_strings.end(),
elements_as_views.begin());

return "[" + internal::JoinStrings(elements_as_views, ",") + "]";
return "[" + ::arrow::internal::JoinStrings(elements_as_views, ",") + "]";
}

template <typename T>
Expand Down
3 changes: 2 additions & 1 deletion cpp/src/arrow/compute/kernels/scalar_cast_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -46,10 +46,11 @@
#include "arrow/compute/kernels/test_util.h"

namespace arrow {
namespace compute {

using internal::checked_cast;

namespace compute {

static constexpr const char* kInvalidUtf8 = "\xa0\xa1";

static std::vector<std::shared_ptr<DataType>> kNumericTypes = {
Expand Down
11 changes: 7 additions & 4 deletions cpp/src/arrow/compute/kernels/scalar_compare_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,9 @@
#include "arrow/util/checked_cast.h"

namespace arrow {

using internal::BitmapReader;

namespace compute {

using util::string_view;
Expand Down Expand Up @@ -115,8 +118,8 @@ Datum SimpleScalarArrayCompare(CompareOptions options, const Datum& lhs,
ArrayFromVector<BooleanType>(bitmap, &result);
} else {
std::vector<bool> null_bitmap(array->length());
auto reader = internal::BitmapReader(array->null_bitmap_data(), array->offset(),
array->length());
auto reader =
BitmapReader(array->null_bitmap_data(), array->offset(), array->length());
for (int64_t i = 0; i < array->length(); i++, reader.Next()) {
null_bitmap[i] = reader.IsSet();
}
Expand Down Expand Up @@ -146,8 +149,8 @@ Datum SimpleScalarArrayCompare<StringType>(CompareOptions options, const Datum&
ArrayFromVector<BooleanType>(bitmap, &result);
} else {
std::vector<bool> null_bitmap(array->length());
auto reader = internal::BitmapReader(array->null_bitmap_data(), array->offset(),
array->length());
auto reader =
BitmapReader(array->null_bitmap_data(), array->offset(), array->length());
for (int64_t i = 0; i < array->length(); i++, reader.Next()) {
null_bitmap[i] = reader.IsSet();
}
Expand Down
11 changes: 7 additions & 4 deletions cpp/src/arrow/compute/kernels/test_util.h
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,9 @@
// IWYU pragma: end_exports

namespace arrow {

using internal::checked_cast;

namespace compute {

template <typename Type, typename T>
Expand All @@ -65,8 +68,8 @@ struct DatumEqual<Type, enable_if_floating_point<Type>> {
static void EnsureEqual(const Datum& lhs, const Datum& rhs) {
ASSERT_EQ(lhs.kind(), rhs.kind());
if (lhs.kind() == Datum::SCALAR) {
auto left = internal::checked_cast<const ScalarType*>(lhs.scalar().get());
auto right = internal::checked_cast<const ScalarType*>(rhs.scalar().get());
auto left = checked_cast<const ScalarType*>(lhs.scalar().get());
auto right = checked_cast<const ScalarType*>(rhs.scalar().get());
ASSERT_EQ(left->is_valid, right->is_valid);
ASSERT_EQ(left->type->id(), right->type->id());
ASSERT_NEAR(left->value, right->value, kArbitraryDoubleErrorBound);
Expand All @@ -80,8 +83,8 @@ struct DatumEqual<Type, enable_if_integer<Type>> {
static void EnsureEqual(const Datum& lhs, const Datum& rhs) {
ASSERT_EQ(lhs.kind(), rhs.kind());
if (lhs.kind() == Datum::SCALAR) {
auto left = internal::checked_cast<const ScalarType*>(lhs.scalar().get());
auto right = internal::checked_cast<const ScalarType*>(rhs.scalar().get());
auto left = checked_cast<const ScalarType*>(lhs.scalar().get());
auto right = checked_cast<const ScalarType*>(rhs.scalar().get());
ASSERT_EQ(*left, *right);
}
}
Expand Down
62 changes: 62 additions & 0 deletions cpp/src/arrow/compute/kernels/util_internal.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

#include "arrow/compute/kernels/util_internal.h"

#include <cstdint>

#include "arrow/array/data.h"
#include "arrow/type.h"
#include "arrow/util/checked_cast.h"

namespace arrow {

using internal::checked_cast;

namespace compute {
namespace internal {

// Return a pointer to the validity bitmap bytes held in data.buffers[0], or
// nullptr when the array has no validity buffer. The pointer is not adjusted
// by data.offset.
const uint8_t* GetValidityBitmap(const ArrayData& data) {
  const auto& validity_buffer = data.buffers[0];
  if (!validity_buffer) {
    return nullptr;
  }
  return validity_buffer->data();
}

// Return the width in bits of a single value of `type`. The type is
// checked_cast to FixedWidthType, so it is expected to be a fixed-width type.
int GetBitWidth(const DataType& type) {
  const auto& fixed_width_type = checked_cast<const FixedWidthType&>(type);
  return fixed_width_type.bit_width();
}

// Unbox the parts of a primitive array that kernels need (validity bitmap,
// values pointer, bit width, offset, length and null count) into a single
// PrimitiveArg.
PrimitiveArg GetPrimitiveArg(const ArrayData& arr) {
  PrimitiveArg unpacked;
  unpacked.is_valid = GetValidityBitmap(arr);

  const uint8_t* values = arr.buffers[1]->data();
  const int width = GetBitWidth(*arr.type);
  if (width > 1) {
    // Values wider than one bit: advance the pointer past the array offset so
    // that element 0 is the first logical value. Bit-packed data (width == 1)
    // is left unshifted and must be indexed using `offset`.
    values += arr.offset * width / 8;
  }

  unpacked.data = values;
  unpacked.bit_width = width;
  unpacked.offset = arr.offset;
  unpacked.length = arr.length;
  // The stored null count is copied as-is and may be kUnknownNullCount.
  unpacked.null_count = arr.null_count.load();
  return unpacked;
}

} // namespace internal
} // namespace compute
} // namespace arrow
55 changes: 55 additions & 0 deletions cpp/src/arrow/compute/kernels/util_internal.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

#pragma once

#include <cstdint>

#include "arrow/buffer.h"

namespace arrow {
namespace compute {
namespace internal {

// An internal data structure for unpacking a primitive argument to pass to a
// kernel implementation. It holds raw, non-owning pointers into the buffers
// of the ArrayData it was built from (see GetPrimitiveArg).
struct PrimitiveArg {
// Validity bitmap data, or nullptr if there is no validity buffer. As filled
// in by GetPrimitiveArg this is the start of the buffer (not shifted by
// offset).
const uint8_t* is_valid;
// If the bit_width is a multiple of 8 (i.e. not boolean), then "data" should
// be shifted by offset * (bit_width / 8). For bit-packed data, the offset
// must be used when indexing.
const uint8_t* data;
// Width in bits of a single value, as reported by the array's type.
int bit_width;
// Length of the source array (copied from ArrayData::length).
int64_t length;
// Offset of the source array (copied from ArrayData::offset).
int64_t offset;
// This may be kUnknownNullCount if the null_count has not yet been computed,
// so use null_count != 0 to determine "may have nulls".
int64_t null_count;
};

// Get validity bitmap data (the bytes of data.buffers[0]) or return nullptr if
// there is no validity buffer. The returned pointer is not adjusted by
// data.offset.
const uint8_t* GetValidityBitmap(const ArrayData& data);

// Return the width in bits of a single value of `type`. The type is
// checked_cast to FixedWidthType, so it is expected to be a fixed-width type.
int GetBitWidth(const DataType& type);

// Reduce code size by dealing with the unboxing of the kernel inputs once
// rather than duplicating compiled code to do all these in each kernel.
//
// Fills a PrimitiveArg from the validity bitmap, values buffer (buffers[1]),
// bit width, offset, length and null count of `arr`. buffers[1] is
// dereferenced unconditionally. The returned null_count may be
// kUnknownNullCount.
PrimitiveArg GetPrimitiveArg(const ArrayData& arr);

} // namespace internal
} // namespace compute
} // namespace arrow
Loading