Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
21 commits
Select commit Hold shift + click to select a range
06b428d
ARROW-13573: [C++] Add DictScalarFromJSON
lidavidm Aug 26, 2021
2ec1132
ARROW-13573: [C++] Check that dictionary array has dictionary
lidavidm Aug 26, 2021
e3b7f93
ARROW-13573: [C++] Handle simple dictionary cases
lidavidm Aug 26, 2021
7a57c91
ARROW-13573: [C++] Transpose dictionaries in case_when
lidavidm Aug 27, 2021
17230ee
ARROW-13573: [C++] Handle nested dictionaries
lidavidm Aug 30, 2021
16fe210
ARROW-13691: [C++] Rebase
lidavidm Aug 31, 2021
8e0e333
ARROW-13573: [C++] Always unify dictionaries
lidavidm Sep 7, 2021
60ffb02
ARROW-13573: [C++] Handle nulls before unifying, refactor
lidavidm Sep 7, 2021
a10888c
ARROW-13573: [C++] Test dictionaries with nulls
lidavidm Sep 7, 2021
5cbe6d5
ARROW-13573: [C++] Address feedback
lidavidm Sep 7, 2021
345388f
ARROW-13573: [C++] Add a direct test of dispatch
lidavidm Sep 7, 2021
8abb93f
ARROW-13573: [C++] Fix mistakes
lidavidm Sep 7, 2021
45563d1
ARROW-13573: [C++] Fix undefined behavior
lidavidm Sep 7, 2021
5fc1a1f
ARROW-13573: [C++] See if turning off unity builds fixes R CI
lidavidm Sep 10, 2021
f2a0a9e
ARROW-13573: [C++] Try bumping timeout
lidavidm Sep 13, 2021
a5b6078
ARROW-13573: [C++] Should fix MinGW32
lidavidm Sep 13, 2021
29a2f87
ARROW-13573: [C++] Make CMAKE_UNITY_BUILD depend on the rtools version
lidavidm Sep 14, 2021
d81773d
ARROW-13573: [C++] RTools40 build is very slow without unity build
lidavidm Sep 14, 2021
15a64a2
ARROW-13573: [C++] Add clarifying comments
lidavidm Sep 14, 2021
26230a4
ARROW-13573: [C++] Address feedback
lidavidm Sep 20, 2021
ba39d83
ARROW-13573: [C++] Explicitly indicate when we expect dictionary-enco…
lidavidm Sep 20, 2021
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/cpp.yml
Original file line number Diff line number Diff line change
Expand Up @@ -238,7 +238,7 @@ jobs:
name: AMD64 Windows MinGW ${{ matrix.mingw-n-bits }} C++
runs-on: windows-latest
if: ${{ !contains(github.event.pull_request.title, 'WIP') }}
timeout-minutes: 45
timeout-minutes: 60
strategy:
fail-fast: false
matrix:
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/r.yml
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@ jobs:
name: AMD64 Ubuntu ${{ matrix.ubuntu }} R ${{ matrix.r }}
runs-on: ubuntu-latest
if: ${{ !contains(github.event.pull_request.title, 'WIP') }}
timeout-minutes: 60
timeout-minutes: 75
strategy:
fail-fast: false
matrix:
Expand Down
6 changes: 5 additions & 1 deletion ci/scripts/PKGBUILD
Original file line number Diff line number Diff line change
Expand Up @@ -80,9 +80,13 @@ build() {
export LIBS="-L${MINGW_PREFIX}/libs"
export ARROW_S3=OFF
export ARROW_WITH_RE2=OFF
# Without this, some dataset functionality segfaults
export CMAKE_UNITY_BUILD=ON
else
export ARROW_S3=ON
export ARROW_WITH_RE2=ON
# Without this, some compute functionality segfaults in tests
export CMAKE_UNITY_BUILD=OFF
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You mean it segfaults during compilation?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It segfaults in the tests. I wasn't really able to debug this on Windows; it disappears once you build with debuginfo.

fi

MSYS2_ARG_CONV_EXCL="-DCMAKE_INSTALL_PREFIX=" \
Expand Down Expand Up @@ -115,7 +119,7 @@ build() {
-DARROW_CXXFLAGS="${CPPFLAGS}" \
-DCMAKE_BUILD_TYPE="release" \
-DCMAKE_INSTALL_PREFIX=${MINGW_PREFIX} \
-DCMAKE_UNITY_BUILD=ON \
-DCMAKE_UNITY_BUILD=${CMAKE_UNITY_BUILD} \
-DCMAKE_VERBOSE_MAKEFILE=ON

make -j3
Expand Down
12 changes: 5 additions & 7 deletions cpp/src/arrow/array/array_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -456,7 +456,7 @@ TEST_F(TestArray, TestValidateNullCount) {
void AssertAppendScalar(MemoryPool* pool, const std::shared_ptr<Scalar>& scalar) {
std::unique_ptr<arrow::ArrayBuilder> builder;
auto null_scalar = MakeNullScalar(scalar->type);
ASSERT_OK(MakeBuilder(pool, scalar->type, &builder));
ASSERT_OK(MakeBuilderExactIndex(pool, scalar->type, &builder));
ASSERT_OK(builder->AppendScalar(*scalar));
ASSERT_OK(builder->AppendScalar(*scalar));
ASSERT_OK(builder->AppendScalar(*null_scalar));
Expand All @@ -471,15 +471,18 @@ void AssertAppendScalar(MemoryPool* pool, const std::shared_ptr<Scalar>& scalar)
ASSERT_EQ(out->length(), 9);

const bool can_check_nulls = internal::HasValidityBitmap(out->type()->id());
// For a dictionary builder, the output dictionary won't necessarily be the same
const bool can_check_values = !is_dictionary(out->type()->id());

if (can_check_nulls) {
ASSERT_EQ(out->null_count(), 4);
}

for (const auto index : {0, 1, 3, 5, 6}) {
ASSERT_FALSE(out->IsNull(index));
ASSERT_OK_AND_ASSIGN(auto scalar_i, out->GetScalar(index));
ASSERT_OK(scalar_i->ValidateFull());
AssertScalarsEqual(*scalar, *scalar_i, /*verbose=*/true);
if (can_check_values) AssertScalarsEqual(*scalar, *scalar_i, /*verbose=*/true);
}
for (const auto index : {2, 4, 7, 8}) {
ASSERT_EQ(out->IsNull(index), can_check_nulls);
Expand Down Expand Up @@ -575,8 +578,6 @@ TEST_F(TestArray, TestMakeArrayFromScalar) {
}

for (auto scalar : scalars) {
// TODO(ARROW-13197): appending dictionary scalars not implemented
if (is_dictionary(scalar->type->id())) continue;
AssertAppendScalar(pool_, scalar);
}
}
Expand Down Expand Up @@ -634,9 +635,6 @@ TEST_F(TestArray, TestMakeArrayFromMapScalar) {
TEST_F(TestArray, TestAppendArraySlice) {
auto scalars = GetScalars();
for (const auto& scalar : scalars) {
// TODO(ARROW-13573): appending dictionary arrays not implemented
if (is_dictionary(scalar->type->id())) continue;

ARROW_SCOPED_TRACE(*scalar->type);
ASSERT_OK_AND_ASSIGN(auto array, MakeArrayFromScalar(*scalar, 16));
ASSERT_OK_AND_ASSIGN(auto nulls, MakeArrayOfNull(scalar->type, 16));
Expand Down
10 changes: 1 addition & 9 deletions cpp/src/arrow/array/builder_base.cc
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
#include <vector>

#include "arrow/array/array_base.h"
#include "arrow/array/builder_dict.h"
#include "arrow/array/data.h"
#include "arrow/array/util.h"
#include "arrow/buffer.h"
Expand Down Expand Up @@ -268,15 +269,6 @@ struct AppendScalarImpl {

} // namespace

Status ArrayBuilder::AppendScalar(const Scalar& scalar) {
if (!scalar.type->Equals(type())) {
return Status::Invalid("Cannot append scalar of type ", scalar.type->ToString(),
" to builder for type ", type()->ToString());
}
std::shared_ptr<Scalar> shared{const_cast<Scalar*>(&scalar), [](Scalar*) {}};
return AppendScalarImpl{&shared, &shared + 1, /*n_repeats=*/1, this}.Convert();
}

Status ArrayBuilder::AppendScalar(const Scalar& scalar, int64_t n_repeats) {
if (!scalar.type->Equals(type())) {
return Status::Invalid("Cannot append scalar of type ", scalar.type->ToString(),
Expand Down
13 changes: 10 additions & 3 deletions cpp/src/arrow/array/builder_base.h
Original file line number Diff line number Diff line change
Expand Up @@ -119,9 +119,9 @@ class ARROW_EXPORT ArrayBuilder {
virtual Status AppendEmptyValues(int64_t length) = 0;

/// \brief Append a value from a scalar
Status AppendScalar(const Scalar& scalar);
Status AppendScalar(const Scalar& scalar, int64_t n_repeats);
Status AppendScalars(const ScalarVector& scalars);
Status AppendScalar(const Scalar& scalar) { return AppendScalar(scalar, 1); }
virtual Status AppendScalar(const Scalar& scalar, int64_t n_repeats);
virtual Status AppendScalars(const ScalarVector& scalars);

/// \brief Append a range of values from an array.
///
Expand Down Expand Up @@ -282,6 +282,13 @@ ARROW_EXPORT
Status MakeBuilder(MemoryPool* pool, const std::shared_ptr<DataType>& type,
std::unique_ptr<ArrayBuilder>* out);

/// \brief Construct an empty ArrayBuilder corresponding to the data
/// type, where any top-level or nested dictionary builders return the
/// exact index type specified by the type.
ARROW_EXPORT
Status MakeBuilderExactIndex(MemoryPool* pool, const std::shared_ptr<DataType>& type,
std::unique_ptr<ArrayBuilder>* out);

/// \brief Construct an empty DictionaryBuilder initialized optionally
/// with a pre-existing dictionary
/// \param[in] pool the MemoryPool to use for allocations
Expand Down
39 changes: 24 additions & 15 deletions cpp/src/arrow/array/builder_dict.cc
Original file line number Diff line number Diff line change
Expand Up @@ -159,23 +159,32 @@ DictionaryMemoTable::DictionaryMemoTable(MemoryPool* pool,

DictionaryMemoTable::~DictionaryMemoTable() = default;

#define GET_OR_INSERT(C_TYPE) \
Status DictionaryMemoTable::GetOrInsert( \
const typename CTypeTraits<C_TYPE>::ArrowType*, C_TYPE value, int32_t* out) { \
return impl_->GetOrInsert<typename CTypeTraits<C_TYPE>::ArrowType>(value, out); \
#define GET_OR_INSERT(ARROW_TYPE) \
Status DictionaryMemoTable::GetOrInsert( \
const ARROW_TYPE*, typename ARROW_TYPE::c_type value, int32_t* out) { \
return impl_->GetOrInsert<ARROW_TYPE>(value, out); \
}

GET_OR_INSERT(bool)
GET_OR_INSERT(int8_t)
GET_OR_INSERT(int16_t)
GET_OR_INSERT(int32_t)
GET_OR_INSERT(int64_t)
GET_OR_INSERT(uint8_t)
GET_OR_INSERT(uint16_t)
GET_OR_INSERT(uint32_t)
GET_OR_INSERT(uint64_t)
GET_OR_INSERT(float)
GET_OR_INSERT(double)
GET_OR_INSERT(BooleanType)
GET_OR_INSERT(Int8Type)
GET_OR_INSERT(Int16Type)
GET_OR_INSERT(Int32Type)
GET_OR_INSERT(Int64Type)
GET_OR_INSERT(UInt8Type)
GET_OR_INSERT(UInt16Type)
GET_OR_INSERT(UInt32Type)
GET_OR_INSERT(UInt64Type)
GET_OR_INSERT(FloatType)
GET_OR_INSERT(DoubleType)
GET_OR_INSERT(DurationType);
GET_OR_INSERT(TimestampType);
GET_OR_INSERT(Date32Type);
GET_OR_INSERT(Date64Type);
GET_OR_INSERT(Time32Type);
GET_OR_INSERT(Time64Type);
GET_OR_INSERT(MonthDayNanoIntervalType);
GET_OR_INSERT(DayTimeIntervalType);
GET_OR_INSERT(MonthIntervalType);

#undef GET_OR_INSERT

Expand Down
110 changes: 110 additions & 0 deletions cpp/src/arrow/array/builder_dict.h
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@
#include "arrow/util/decimal.h"
#include "arrow/util/macros.h"
#include "arrow/util/visibility.h"
#include "arrow/visitor_inline.h"

namespace arrow {

Expand Down Expand Up @@ -97,6 +98,17 @@ class ARROW_EXPORT DictionaryMemoTable {
Status GetOrInsert(const UInt16Type*, uint16_t value, int32_t* out);
Status GetOrInsert(const UInt32Type*, uint32_t value, int32_t* out);
Status GetOrInsert(const UInt64Type*, uint64_t value, int32_t* out);
Status GetOrInsert(const DurationType*, int64_t value, int32_t* out);
Status GetOrInsert(const TimestampType*, int64_t value, int32_t* out);
Status GetOrInsert(const Date32Type*, int32_t value, int32_t* out);
Status GetOrInsert(const Date64Type*, int64_t value, int32_t* out);
Status GetOrInsert(const Time32Type*, int32_t value, int32_t* out);
Status GetOrInsert(const Time64Type*, int64_t value, int32_t* out);
Status GetOrInsert(const MonthDayNanoIntervalType*,
MonthDayNanoIntervalType::MonthDayNanos value, int32_t* out);
Status GetOrInsert(const DayTimeIntervalType*,
DayTimeIntervalType::DayMilliseconds value, int32_t* out);
Status GetOrInsert(const MonthIntervalType*, int32_t value, int32_t* out);
Status GetOrInsert(const FloatType*, float value, int32_t* out);
Status GetOrInsert(const DoubleType*, double value, int32_t* out);

Expand Down Expand Up @@ -282,6 +294,73 @@ class DictionaryBuilderBase : public ArrayBuilder {
return indices_builder_.AppendEmptyValues(length);
}

Status AppendScalar(const Scalar& scalar, int64_t n_repeats) override {
if (!scalar.is_valid) return AppendNulls(n_repeats);

const auto& dict_ty = internal::checked_cast<const DictionaryType&>(*scalar.type);
const DictionaryScalar& dict_scalar =
internal::checked_cast<const DictionaryScalar&>(scalar);
const auto& dict = internal::checked_cast<const typename TypeTraits<T>::ArrayType&>(
*dict_scalar.value.dictionary);
ARROW_RETURN_NOT_OK(Reserve(n_repeats));
switch (dict_ty.index_type()->id()) {
case Type::UINT8:
return AppendScalarImpl<UInt8Type>(dict, *dict_scalar.value.index, n_repeats);
case Type::INT8:
return AppendScalarImpl<Int8Type>(dict, *dict_scalar.value.index, n_repeats);
case Type::UINT16:
return AppendScalarImpl<UInt16Type>(dict, *dict_scalar.value.index, n_repeats);
case Type::INT16:
return AppendScalarImpl<Int16Type>(dict, *dict_scalar.value.index, n_repeats);
case Type::UINT32:
return AppendScalarImpl<UInt32Type>(dict, *dict_scalar.value.index, n_repeats);
case Type::INT32:
return AppendScalarImpl<Int32Type>(dict, *dict_scalar.value.index, n_repeats);
case Type::UINT64:
return AppendScalarImpl<UInt64Type>(dict, *dict_scalar.value.index, n_repeats);
case Type::INT64:
return AppendScalarImpl<Int64Type>(dict, *dict_scalar.value.index, n_repeats);
default:
return Status::TypeError("Invalid index type: ", dict_ty);
}
return Status::OK();
}

Status AppendScalars(const ScalarVector& scalars) override {
for (const auto& scalar : scalars) {
ARROW_RETURN_NOT_OK(AppendScalar(*scalar, /*n_repeats=*/1));
}
return Status::OK();
}

Status AppendArraySlice(const ArrayData& array, int64_t offset, int64_t length) final {
// Visit the indices and insert the unpacked values.
const auto& dict_ty = internal::checked_cast<const DictionaryType&>(*array.type);
const typename TypeTraits<T>::ArrayType dict(array.dictionary);
ARROW_RETURN_NOT_OK(Reserve(length));
switch (dict_ty.index_type()->id()) {
case Type::UINT8:
return AppendArraySliceImpl<uint8_t>(dict, array, offset, length);
case Type::INT8:
return AppendArraySliceImpl<int8_t>(dict, array, offset, length);
case Type::UINT16:
return AppendArraySliceImpl<uint16_t>(dict, array, offset, length);
case Type::INT16:
return AppendArraySliceImpl<int16_t>(dict, array, offset, length);
case Type::UINT32:
return AppendArraySliceImpl<uint32_t>(dict, array, offset, length);
case Type::INT32:
return AppendArraySliceImpl<int32_t>(dict, array, offset, length);
case Type::UINT64:
return AppendArraySliceImpl<uint64_t>(dict, array, offset, length);
case Type::INT64:
return AppendArraySliceImpl<int64_t>(dict, array, offset, length);
default:
return Status::TypeError("Invalid index type: ", dict_ty);
}
return Status::OK();
}

/// \brief Insert values into the dictionary's memo, but do not append any
/// indices. Can be used to initialize a new builder with known dictionary
/// values
Expand Down Expand Up @@ -376,6 +455,37 @@ class DictionaryBuilderBase : public ArrayBuilder {
}

protected:
template <typename c_type>
Status AppendArraySliceImpl(const typename TypeTraits<T>::ArrayType& dict,
const ArrayData& array, int64_t offset, int64_t length) {
const c_type* values = array.GetValues<c_type>(1) + offset;
return VisitBitBlocks(
array.buffers[0], array.offset + offset, length,
[&](const int64_t position) {
const int64_t index = static_cast<int64_t>(values[position]);
if (dict.IsValid(index)) {
return Append(dict.GetView(index));
}
return AppendNull();
},
[&]() { return AppendNull(); });
}

template <typename IndexType>
Status AppendScalarImpl(const typename TypeTraits<T>::ArrayType& dict,
const Scalar& index_scalar, int64_t n_repeats) {
using ScalarType = typename TypeTraits<IndexType>::ScalarType;
const auto index = internal::checked_cast<const ScalarType&>(index_scalar).value;
if (index_scalar.is_valid && dict.IsValid(index)) {
const auto& value = dict.GetView(index);
for (int64_t i = 0; i < n_repeats; i++) {
ARROW_RETURN_NOT_OK(Append(value));
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Not for this PR, but it sounds like offering a two-step API on DictionaryBuilder would allow for performance improvements:

/// Ensure `value` is in the dict, and return its index, but doesn't append it
Result<int64_t> Encode(c_type value);
/// Append the given dictionary index
Status AppendIndex(int64_t index);
Status AppendIndices(int64_t index, int64_t nrepeats);

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I filed ARROW-14042.

}
return Status::OK();
}
return AppendNulls(n_repeats);
}

Status FinishInternal(std::shared_ptr<ArrayData>* out) override {
std::shared_ptr<ArrayData> dictionary;
ARROW_RETURN_NOT_OK(FinishWithDictOffset(/*offset=*/0, out, &dictionary));
Expand Down
Loading