Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
27 commits
Select commit Hold shift + click to select a range
66e1c1f
ComputeNullValues
R-JunmingChen Nov 12, 2023
f6e8958
add test
R-JunmingChen Nov 12, 2023
5faa77c
change name
R-JunmingChen Nov 12, 2023
4a9621f
lint
R-JunmingChen Nov 12, 2023
0adeb6b
Update cpp/src/arrow/array/array_dict_test.cc
R-JunmingChen Nov 13, 2023
7015905
Update cpp/src/arrow/array/array_dict_test.cc
R-JunmingChen Nov 13, 2023
00198a3
Update cpp/src/arrow/array/array_dict.cc
R-JunmingChen Nov 13, 2023
3050800
Update cpp/src/arrow/array/array_dict.cc
R-JunmingChen Nov 13, 2023
8b1bd8e
fix comment
R-JunmingChen Nov 13, 2023
4219044
init dict_util
R-JunmingChen Nov 14, 2023
4e3c7fe
indices_null_bit_map
R-JunmingChen Nov 14, 2023
60bcd37
sequence
R-JunmingChen Nov 14, 2023
186b047
add test
R-JunmingChen Nov 15, 2023
8663dbc
Revert "fix comment"
R-JunmingChen Nov 15, 2023
b5553a9
Revert "Update cpp/src/arrow/array/array_dict.cc"
R-JunmingChen Nov 15, 2023
5072e98
Revert "Update cpp/src/arrow/array/array_dict.cc"
R-JunmingChen Nov 15, 2023
dd95225
Revert "Update cpp/src/arrow/array/array_dict_test.cc"
R-JunmingChen Nov 15, 2023
5f75ef4
Revert "Update cpp/src/arrow/array/array_dict_test.cc"
R-JunmingChen Nov 15, 2023
fcc1505
Revert "lint"
R-JunmingChen Nov 15, 2023
34d8856
Revert "change name"
R-JunmingChen Nov 15, 2023
fecda42
Revert "add test"
R-JunmingChen Nov 15, 2023
1cb3381
Revert "ComputeNullValues"
R-JunmingChen Nov 15, 2023
686670a
lint
R-JunmingChen Nov 15, 2023
e6d83c4
Merge branch 'main' into ARROW-38457
R-JunmingChen Nov 15, 2023
3c0b315
Update cpp/src/arrow/array/array_test.cc
R-JunmingChen Nov 16, 2023
ed46f28
slice
R-JunmingChen Nov 16, 2023
18a7cd9
Merge branch 'ARROW-38457' of https://github.com/R-JunmingChen/arrow …
R-JunmingChen Nov 16, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions cpp/src/arrow/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -223,6 +223,7 @@ set(ARROW_SRCS
util/debug.cc
util/decimal.cc
util/delimiting.cc
util/dict_util.cc
util/float16.cc
util/formatting.cc
util/future.cc
Expand Down
57 changes: 57 additions & 0 deletions cpp/src/arrow/array/array_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,22 @@ class TestArray : public ::testing::Test {
MemoryPool* pool_;
};

// Builds a dictionary array of type `dict_type` from the given JSON indices
// and JSON dictionary values, then verifies its physical null count, its
// logical null count, and the MayHaveNulls() / MayHaveLogicalNulls() flags
// reported by the underlying ArrayData.
void CheckDictionaryNullCount(const std::shared_ptr<DataType>& dict_type,
                              const std::string& input_dictionary_json,
                              const std::string& input_index_json,
                              const int64_t& expected_null_count,
                              const int64_t& expected_logical_null_count,
                              bool expected_may_have_nulls,
                              bool expected_may_have_logical_nulls) {
  // Note the argument order of the factory: indices first, dictionary second.
  const std::shared_ptr<arrow::Array> dict_array =
      DictArrayFromJSON(dict_type, input_index_json, input_dictionary_json);

  // Counts first ...
  ASSERT_EQ(dict_array->null_count(), expected_null_count);
  ASSERT_EQ(dict_array->ComputeLogicalNullCount(), expected_logical_null_count);

  // ... then the cheap "may have" predicates exposed by ArrayData.
  const auto& array_data = dict_array->data();
  ASSERT_EQ(array_data->MayHaveNulls(), expected_may_have_nulls);
  ASSERT_EQ(array_data->MayHaveLogicalNulls(), expected_may_have_logical_nulls);
}

TEST_F(TestArray, TestNullCount) {
// These are placeholders
auto data = std::make_shared<Buffer>(nullptr, 0);
Expand Down Expand Up @@ -127,6 +143,37 @@ TEST_F(TestArray, TestNullCount) {
ASSERT_EQ(0, ree_no_nulls->ComputeLogicalNullCount());
ASSERT_FALSE(ree_no_nulls->data()->MayHaveNulls());
ASSERT_FALSE(ree_no_nulls->data()->MayHaveLogicalNulls());

// Dictionary type
std::shared_ptr<arrow::DataType> type;
std::shared_ptr<arrow::DataType> dict_type;

for (const auto& index_type : all_dictionary_index_types()) {
ARROW_SCOPED_TRACE("index_type = ", index_type->ToString());

type = boolean();
dict_type = dictionary(index_type, type);
// no null value
CheckDictionaryNullCount(dict_type, "[]", "[]", 0, 0, false, false);
CheckDictionaryNullCount(dict_type, "[true, false]", "[0, 1, 0]", 0, 0, false, false);

// only indices contain null value
CheckDictionaryNullCount(dict_type, "[true, false]", "[null, 0, 1]", 1, 1, true,
true);
CheckDictionaryNullCount(dict_type, "[true, false]", "[null, null]", 2, 2, true,
true);

// only dictionary contains null value
CheckDictionaryNullCount(dict_type, "[null, true]", "[]", 0, 0, false, true);
CheckDictionaryNullCount(dict_type, "[null, true, false]", "[0, 1, 0]", 0, 2, false,
true);

// both indices and dictionary contain null value
CheckDictionaryNullCount(dict_type, "[null, true, false]", "[0, 1, 0, null]", 1, 3,
true, true);
CheckDictionaryNullCount(dict_type, "[null, true, null, false]", "[null, 1, 0, 2, 3]",
1, 3, true, true);
}
}

TEST_F(TestArray, TestSlicePreservesAllNullCount) {
Expand All @@ -137,6 +184,16 @@ TEST_F(TestArray, TestSlicePreservesAllNullCount) {
Int32Array arr(/*length=*/100, data, null_bitmap,
/*null_count*/ 100);
EXPECT_EQ(arr.Slice(1, 99)->data()->null_count, arr.Slice(1, 99)->length());

// Dictionary type
std::shared_ptr<arrow::DataType> dict_type = dictionary(int64(), boolean());
std::shared_ptr<arrow::Array> dict_arr =
DictArrayFromJSON(dict_type, /*indices=*/"[null, 0, 0, 0, 0, 0, 1, 2, 0, 0]",
/*dictionary=*/"[null, true, false]");
ASSERT_EQ(dict_arr->null_count(), 1);
ASSERT_EQ(dict_arr->ComputeLogicalNullCount(), 8);
ASSERT_EQ(dict_arr->Slice(2, 8)->null_count(), 0);
ASSERT_EQ(dict_arr->Slice(2, 8)->ComputeLogicalNullCount(), 6);
}

TEST_F(TestArray, TestLength) {
Expand Down
14 changes: 13 additions & 1 deletion cpp/src/arrow/array/data.cc
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@
#include "arrow/type_traits.h"
#include "arrow/util/binary_view_util.h"
#include "arrow/util/bitmap_ops.h"
#include "arrow/util/dict_util.h"
#include "arrow/util/logging.h"
#include "arrow/util/macros.h"
#include "arrow/util/ree_util.h"
Expand Down Expand Up @@ -93,6 +94,10 @@ bool RunEndEncodedMayHaveLogicalNulls(const ArrayData& data) {
return ArraySpan(data).MayHaveLogicalNulls();
}

bool DictionaryMayHaveLogicalNulls(const ArrayData& data) {
return ArraySpan(data).MayHaveLogicalNulls();
}

BufferSpan PackVariadicBuffers(util::span<const std::shared_ptr<Buffer>> buffers) {
return {const_cast<uint8_t*>(reinterpret_cast<const uint8_t*>(buffers.data())),
static_cast<int64_t>(buffers.size() * sizeof(std::shared_ptr<Buffer>))};
Expand Down Expand Up @@ -174,7 +179,7 @@ int64_t ArrayData::GetNullCount() const {
}

// Returns the logical null count of this ArrayData.
//
// When a validity bitmap (buffers[0]) is present, the physical null count
// from GetNullCount() is returned directly. Otherwise the computation is
// delegated to ArraySpan::ComputeLogicalNullCount().
int64_t ArrayData::ComputeLogicalNullCount() const {
if (this->buffers[0]) {
// Dictionary arrays are excluded from the bitmap shortcut so that they always
// go through the ArraySpan path below.
if (this->buffers[0] && this->type->id() != Type::DICTIONARY) {
return GetNullCount();
}
return ArraySpan(*this).ComputeLogicalNullCount();
}
if (t == Type::RUN_END_ENCODED) {
return ree_util::LogicalNullCount(*this);
}
if (t == Type::DICTIONARY) {
return dict_util::LogicalNullCount(*this);
}
return GetNullCount();
}

Expand Down Expand Up @@ -617,6 +625,10 @@ bool ArraySpan::RunEndEncodedMayHaveLogicalNulls() const {
return ree_util::ValuesArray(*this).MayHaveLogicalNulls();
}

// A dictionary span may have logical nulls when either the span itself or its
// dictionary reports a non-zero null count. The dictionary is only consulted
// when the span's own null count is zero.
bool ArraySpan::DictionaryMayHaveLogicalNulls() const {
  if (this->GetNullCount() != 0) {
    return true;
  }
  return this->dictionary().GetNullCount() != 0;
}

// ----------------------------------------------------------------------
// Implement internal::GetArrayView

Expand Down
14 changes: 11 additions & 3 deletions cpp/src/arrow/array/data.h
Original file line number Diff line number Diff line change
Expand Up @@ -38,14 +38,15 @@ struct ArrayData;

namespace internal {
// ----------------------------------------------------------------------
// Null handling for types without a validity bitmap
// Null handling for types without a validity bitmap and the dictionary type

ARROW_EXPORT bool IsNullSparseUnion(const ArrayData& data, int64_t i);
ARROW_EXPORT bool IsNullDenseUnion(const ArrayData& data, int64_t i);
ARROW_EXPORT bool IsNullRunEndEncoded(const ArrayData& data, int64_t i);

ARROW_EXPORT bool UnionMayHaveLogicalNulls(const ArrayData& data);
ARROW_EXPORT bool RunEndEncodedMayHaveLogicalNulls(const ArrayData& data);
// Called by ArrayData::MayHaveLogicalNulls() when the type id is
// Type::DICTIONARY.
ARROW_EXPORT bool DictionaryMayHaveLogicalNulls(const ArrayData& data);
} // namespace internal

// When slicing, we do not know the null count of the sliced range without
Expand Down Expand Up @@ -280,7 +281,7 @@ struct ARROW_EXPORT ArrayData {

/// \brief Return true if the validity bitmap may have 0's in it, or if the
/// child arrays (in the case of types without a validity bitmap) may have
/// nulls
/// nulls, or if the dictionary of a dictionary array may have nulls.
///
/// This is not a drop-in replacement for MayHaveNulls, as historically
/// MayHaveNulls() has been used to check for the presence of a validity
Expand Down Expand Up @@ -325,6 +326,9 @@ struct ARROW_EXPORT ArrayData {
if (t == Type::RUN_END_ENCODED) {
return internal::RunEndEncodedMayHaveLogicalNulls(*this);
}
if (t == Type::DICTIONARY) {
return internal::DictionaryMayHaveLogicalNulls(*this);
}
return null_count.load() != 0;
}

Expand Down Expand Up @@ -505,7 +509,7 @@ struct ARROW_EXPORT ArraySpan {

/// \brief Return true if the validity bitmap may have 0's in it, or if the
/// child arrays (in the case of types without a validity bitmap) may have
/// nulls
/// nulls, or if the dictionary of a dictionary array may have nulls.
///
/// \see ArrayData::MayHaveLogicalNulls
bool MayHaveLogicalNulls() const {
Expand All @@ -519,6 +523,9 @@ struct ARROW_EXPORT ArraySpan {
if (t == Type::RUN_END_ENCODED) {
return RunEndEncodedMayHaveLogicalNulls();
}
if (t == Type::DICTIONARY) {
return DictionaryMayHaveLogicalNulls();
}
return null_count != 0;
}

Expand Down Expand Up @@ -560,6 +567,7 @@ struct ARROW_EXPORT ArraySpan {

bool UnionMayHaveLogicalNulls() const;
bool RunEndEncodedMayHaveLogicalNulls() const;
// Backs MayHaveLogicalNulls() when the type id is Type::DICTIONARY.
bool DictionaryMayHaveLogicalNulls() const;
};

namespace internal {
Expand Down
81 changes: 81 additions & 0 deletions cpp/src/arrow/util/dict_util.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,81 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

#include "arrow/util/dict_util.h"
#include "arrow/array/array_dict.h"
#include "arrow/util/bit_util.h"
#include "arrow/util/checked_cast.h"

namespace arrow {
namespace dict_util {

namespace {

template <typename IndexArrowType>
int64_t LogicalNullCount(const ArraySpan& span) {
const auto* indices_null_bit_map = span.buffers[0].data;
const auto& dictionary_span = span.dictionary();
const auto* dictionary_null_bit_map = dictionary_span.buffers[0].data;

using CType = typename IndexArrowType::c_type;
const CType* indices_data = span.GetValues<CType>(1);
int64_t null_count = 0;
for (int64_t i = 0; i < span.length; i++) {
if (indices_null_bit_map != nullptr &&
!bit_util::GetBit(indices_null_bit_map, i + span.offset)) {
null_count++;
continue;
}

CType current_index = indices_data[i];
if (!bit_util::GetBit(dictionary_null_bit_map,
current_index + dictionary_span.offset)) {
null_count++;
}
}
return null_count;
}

} // namespace

/// \brief Compute the logical null count of a dictionary-encoded span.
///
/// If the dictionary has no nulls, or the span is empty, the result is the
/// span's own null count. Otherwise the work is dispatched on the id of the
/// dictionary's index type; any id not listed explicitly is handled as Int64.
int64_t LogicalNullCount(const ArraySpan& span) {
  // Fast path 1: a dictionary without nulls cannot add logical nulls.
  if (span.dictionary().GetNullCount() == 0) {
    return span.GetNullCount();
  }
  // Fast path 2: nothing to scan.
  if (span.length == 0) {
    return span.GetNullCount();
  }

  const auto& dictionary_type =
      internal::checked_cast<const DictionaryType&>(*span.type);
  const auto index_type_id = dictionary_type.index_type()->id();

  switch (index_type_id) {
    // Signed index types
    case Type::INT8:
      return LogicalNullCount<Int8Type>(span);
    case Type::INT16:
      return LogicalNullCount<Int16Type>(span);
    case Type::INT32:
      return LogicalNullCount<Int32Type>(span);
    // Unsigned index types
    case Type::UINT8:
      return LogicalNullCount<UInt8Type>(span);
    case Type::UINT16:
      return LogicalNullCount<UInt16Type>(span);
    case Type::UINT32:
      return LogicalNullCount<UInt32Type>(span);
    case Type::UINT64:
      return LogicalNullCount<UInt64Type>(span);
    // Everything else is treated as 64-bit signed indices.
    default:
      return LogicalNullCount<Int64Type>(span);
  }
}
} // namespace dict_util
} // namespace arrow
28 changes: 28 additions & 0 deletions cpp/src/arrow/util/dict_util.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

#pragma once

#include "arrow/array/data.h"

namespace arrow {
namespace dict_util {

/// \brief Compute the logical null count of a dictionary-encoded span
///
/// A position is logically null when its index is null or when its index
/// refers to a null value in the dictionary.
///
/// \param span a span whose type is a dictionary type
/// \return the number of logically null positions in the span
int64_t LogicalNullCount(const ArraySpan& span);

} // namespace dict_util
} // namespace arrow