From ef48a54cd8119a92476c60a71912ce7247f5b664 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Sun, 31 May 2020 19:41:22 -0500 Subject: [PATCH 1/5] Do not instantiate diff implementation for each type --- cpp/src/arrow/array/diff.cc | 272 +++++++++++++++--------------------- 1 file changed, 113 insertions(+), 159 deletions(-) diff --git a/cpp/src/arrow/array/diff.cc b/cpp/src/arrow/array/diff.cc index dc6bee95f60..fd421c080e7 100644 --- a/cpp/src/arrow/array/diff.cc +++ b/cpp/src/arrow/array/diff.cc @@ -35,7 +35,6 @@ #include "arrow/util/logging.h" #include "arrow/util/range.h" #include "arrow/util/string.h" -#include "arrow/util/variant.h" #include "arrow/util/visibility.h" #include "arrow/vendored/datetime.h" #include "arrow/visitor_inline.h" @@ -89,81 +88,63 @@ static UnitSlice GetView(const UnionArray& array, int64_t index) { return UnitSlice{&array, index}; } -struct NullTag { - constexpr bool operator==(const NullTag& other) const { return true; } - constexpr bool operator!=(const NullTag& other) const { return false; } +struct ValueComparator { + virtual ~ValueComparator() = default; + virtual bool operator()(int64_t base_index, int64_t target_index) = 0; }; -template -class NullOr { - public: - using VariantType = util::variant; +template +struct ValueComparatorImpl : public ValueComparator { + using ArrayType = typename TypeTraits::ArrayType; - NullOr() : variant_(NullTag{}) {} - explicit NullOr(T t) : variant_(std::move(t)) {} + ValueComparatorImpl(const Array& base, const Array& target) + : base(MakeArray(base.data())), target(MakeArray(target.data())) {} - template - NullOr(const ArrayType& array, int64_t index) { - if (array.IsNull(index)) { - variant_.emplace(NullTag{}); - } else { - variant_.emplace(GetView(array, index)); + bool operator()(int64_t base_index, int64_t target_index) override { + bool base_null = base->IsNull(base_index); + bool target_null = target->IsNull(target_index); + if (base_null && target_null) { + return true; + } else if (base_null || target_null) { + return false; } + return (GetView(checked_cast(*base), base_index) == + GetView(checked_cast(*target), target_index)); } - bool operator==(const NullOr& other) const { return variant_ == other.variant_; } - bool operator!=(const NullOr& other) const { return variant_ != other.variant_; } - - private: - VariantType variant_; + std::shared_ptr base; + std::shared_ptr target; }; -template -using ViewType = decltype(GetView(std::declval(), 0)); +struct ValueComparatorVisitor { + ValueComparatorVisitor(const Array& base, const Array& target) + : base(base), target(target) {} -template -class ViewGenerator { - public: - using View = ViewType; - - explicit ViewGenerator(const Array& array) - : array_(checked_cast(array)) { - DCHECK_EQ(array.null_count(), 0); + template + Status Visit(const T&) { + out.reset(new ValueComparatorImpl(base, target)); + return Status::OK(); } - View operator()(int64_t index) const { return GetView(array_, index); } + Status Visit(const NullType&) { return Status::NotImplemented("extension type"); } - private: - const ArrayType& array_; -}; + Status Visit(const ExtensionType&) { return Status::NotImplemented("extension type"); } -template -internal::LazyRange> MakeViewRange(const Array& array) { - using Generator = ViewGenerator; - return internal::LazyRange(Generator(array), array.length()); -} + Status Visit(const DictionaryType&) { return Status::NotImplemented("extension type"); } -template -class NullOrViewGenerator { - public: - using View = ViewType; - - explicit NullOrViewGenerator(const Array& array) - : array_(checked_cast(array)) {} - - NullOr operator()(int64_t index) const { - return array_.IsNull(index) ? NullOr() : NullOr(GetView(array_, index)); + std::unique_ptr Create() { + DCHECK_OK(VisitTypeInline(*base.type(), this)); + return std::move(out); } - private: - const ArrayType& array_; + const Array& base; + const Array& target; + std::unique_ptr out; }; -template -internal::LazyRange> MakeNullOrViewRange( - const Array& array) { - using Generator = NullOrViewGenerator; - return internal::LazyRange(Generator(array), array.length()); +std::unique_ptr GetValueComparator(const Array& base, + const Array& target) { + return ValueComparatorVisitor(base, target).Create(); } /// A generic sequence difference algorithm, based on @@ -181,18 +162,38 @@ internal::LazyRange> MakeNullOrViewRange( /// representation is minimal in the common case where the sequences differ only slightly, /// since most of the elements are shared between base and target and are represented /// implicitly. -template class QuadraticSpaceMyersDiff { public: // represents an intermediate state in the comparison of two arrays struct EditPoint { - Iterator base, target; - + int64_t base, target; bool operator==(EditPoint other) const { return base == other.base && target == other.target; } }; + QuadraticSpaceMyersDiff(const Array& base, const Array& target, MemoryPool* pool) + : base_(base), + target_(target), + pool_(pool), + value_comparator_(GetValueComparator(base, target)), + base_begin_(0), + base_end_(base.length()), + target_begin_(0), + target_end_(target.length()), + endpoint_base_({ExtendFrom({base_begin_, target_begin_}).base}), + insert_({true}) { + if ((base_end_ - base_begin_ == target_end_ - target_begin_) && + endpoint_base_[0] == base_end_) { + // trivial case: base == target + finish_index_ = 0; + } + } + + bool ValuesEqual(int64_t left, int64_t right) const { + return (*value_comparator_)(left, right); + } + // increment the position within base (the element pointed to was deleted) // then extend maximally EditPoint DeleteOne(EditPoint p) const { @@ -215,29 +216,13 @@ class QuadraticSpaceMyersDiff { // present in both sequences) EditPoint ExtendFrom(EditPoint p) const { for (; p.base != base_end_ && p.target != target_end_; ++p.base, ++p.target) { - if (*p.base != *p.target) { + if (!ValuesEqual(p.base, p.target)) { break; } } return p; } - QuadraticSpaceMyersDiff(Iterator base_begin, Iterator base_end, Iterator target_begin, - Iterator target_end) - : base_begin_(base_begin), - base_end_(base_end), - target_begin_(target_begin), - target_end_(target_end), - endpoint_base_({ExtendFrom({base_begin_, target_begin_}).base}), - insert_({true}) { - if (std::distance(base_begin_, base_end_) == - std::distance(target_begin_, target_end_) && - endpoint_base_[0] == base_end_) { - // trivial case: base == target - finish_index_ = 0; - } - } - // beginning of a range for storing per-edit state in endpoint_base_ and insert_ int64_t StorageOffset(int64_t edit_count) const { return edit_count * (edit_count + 1) / 2; @@ -342,106 +327,75 @@ class QuadraticSpaceMyersDiff { {field("insert", boolean()), field("run_length", int64())}); } - private: - int64_t finish_index_ = -1; - int64_t edit_count_ = 0; - Iterator base_begin_, base_end_; - Iterator target_begin_, target_end_; - // each element of endpoint_base_ is the furthest position in base reachable given an - // edit_count and (# insertions) - (# deletions). Each bit of insert_ records whether - // the corresponding furthest position was reached via an insertion or a deletion - // (followed by a run of shared elements). See StorageOffset for the - // layout of these vectors - std::vector endpoint_base_; - std::vector insert_; -}; - -struct DiffImpl { - Status Visit(const NullType&) { - bool insert = base_.length() < target_.length(); - auto run_length = std::min(base_.length(), target_.length()); - auto edit_count = std::max(base_.length(), target_.length()) - run_length; - - TypedBufferBuilder insert_builder(pool_); - RETURN_NOT_OK(insert_builder.Resize(edit_count + 1)); - insert_builder.UnsafeAppend(false); - TypedBufferBuilder run_length_builder(pool_); - RETURN_NOT_OK(run_length_builder.Resize(edit_count + 1)); - run_length_builder.UnsafeAppend(run_length); - if (edit_count > 0) { - insert_builder.UnsafeAppend(edit_count, insert); - run_length_builder.UnsafeAppend(edit_count, 0); - } - - std::shared_ptr insert_buf, run_length_buf; - RETURN_NOT_OK(insert_builder.Finish(&insert_buf)); - RETURN_NOT_OK(run_length_builder.Finish(&run_length_buf)); - - ARROW_ASSIGN_OR_RAISE( - out_, - StructArray::Make({std::make_shared(edit_count + 1, insert_buf), - std::make_shared(edit_count + 1, run_length_buf)}, - {field("insert", boolean()), field("run_length", int64())})); - return Status::OK(); - } - - template - Status Visit(const T&) { - using ArrayType = typename TypeTraits::ArrayType; - if (base_.null_count() == 0 && target_.null_count() == 0) { - auto base = MakeViewRange(base_); - auto target = MakeViewRange(target_); - ARROW_ASSIGN_OR_RAISE(out_, - Diff(base.begin(), base.end(), target.begin(), target.end())); - } else { - auto base = MakeNullOrViewRange(base_); - auto target = MakeNullOrViewRange(target_); - ARROW_ASSIGN_OR_RAISE(out_, - Diff(base.begin(), base.end(), target.begin(), target.end())); - } - return Status::OK(); - } - - Status Visit(const ExtensionType&) { - auto base = checked_cast(base_).storage(); - auto target = checked_cast(target_).storage(); - ARROW_ASSIGN_OR_RAISE(out_, arrow::Diff(*base, *target, pool_)); - return Status::OK(); - } - - Status Visit(const DictionaryType& t) { - return Status::NotImplemented("diffing arrays of type ", t); - } - Result> Diff() { - RETURN_NOT_OK(VisitTypeInline(*base_.type(), this)); - return out_; - } - - template - Result> Diff(Iterator base_begin, Iterator base_end, - Iterator target_begin, Iterator target_end) { - QuadraticSpaceMyersDiff impl(base_begin, base_end, target_begin, - target_end); + QuadraticSpaceMyersDiff impl(base_, target_, pool_); while (!impl.Done()) { impl.Next(); } return impl.GetEdits(pool_); } + private: const Array& base_; const Array& target_; MemoryPool* pool_; - std::shared_ptr out_; + std::unique_ptr value_comparator_; + int64_t finish_index_ = -1; + int64_t edit_count_ = 0; + int64_t base_begin_, base_end_; + int64_t target_begin_, target_end_; + // each element of endpoint_base_ is the furthest position in base reachable given an + // edit_count and (# insertions) - (# deletions). Each bit of insert_ records whether + // the corresponding furthest position was reached via an insertion or a deletion + // (followed by a run of shared elements). See StorageOffset for the + // layout of these vectors + std::vector endpoint_base_; + std::vector insert_; }; +Result> NullDiff(const Array& base, const Array& target, + MemoryPool* pool) { + bool insert = base.length() < target.length(); + auto run_length = std::min(base.length(), target.length()); + auto edit_count = std::max(base.length(), target.length()) - run_length; + + TypedBufferBuilder insert_builder(pool); + RETURN_NOT_OK(insert_builder.Resize(edit_count + 1)); + insert_builder.UnsafeAppend(false); + TypedBufferBuilder run_length_builder(pool); + RETURN_NOT_OK(run_length_builder.Resize(edit_count + 1)); + run_length_builder.UnsafeAppend(run_length); + if (edit_count > 0) { + insert_builder.UnsafeAppend(edit_count, insert); + run_length_builder.UnsafeAppend(edit_count, 0); + } + + std::shared_ptr insert_buf, run_length_buf; + RETURN_NOT_OK(insert_builder.Finish(&insert_buf)); + RETURN_NOT_OK(run_length_builder.Finish(&run_length_buf)); + + return StructArray::Make({std::make_shared(edit_count + 1, insert_buf), + std::make_shared(edit_count + 1, run_length_buf)}, + {field("insert", boolean()), field("run_length", int64())}); +} + Result> Diff(const Array& base, const Array& target, MemoryPool* pool) { if (!base.type()->Equals(target.type())) { return Status::TypeError("only taking the diff of like-typed arrays is supported."); } - return DiffImpl{base, target, pool, nullptr}.Diff(); + if (base.type()->id() == Type::NA) { + return NullDiff(base, target, pool); + } else if (base.type()->id() == Type::EXTENSION) { + auto base_storage = checked_cast(base).storage(); + auto target_storage = checked_cast(target).storage(); + return QuadraticSpaceMyersDiff(*base_storage, *target_storage, pool).Diff(); + } else if (base.type()->id() == Type::EXTENSION) { + return Status::NotImplemented("diffing arrays of type ", *base.type()); + } else { + return QuadraticSpaceMyersDiff(base, target, pool).Diff(); + } } using Formatter = std::function; From 152abbf94f20a829fcde69def97783e4da2b8462 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Sun, 31 May 2020 19:49:03 -0500 Subject: [PATCH 2/5] fix error message --- cpp/src/arrow/array/diff.cc | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/cpp/src/arrow/array/diff.cc b/cpp/src/arrow/array/diff.cc index fd421c080e7..6a47af8e47c 100644 --- a/cpp/src/arrow/array/diff.cc +++ b/cpp/src/arrow/array/diff.cc @@ -126,11 +126,13 @@ struct ValueComparatorVisitor { return Status::OK(); } - Status Visit(const NullType&) { return Status::NotImplemented("extension type"); } + Status Visit(const NullType&) { return Status::NotImplemented("null type"); } Status Visit(const ExtensionType&) { return Status::NotImplemented("extension type"); } - Status Visit(const DictionaryType&) { return Status::NotImplemented("extension type"); } + Status Visit(const DictionaryType&) { + return Status::NotImplemented("dictionary type"); + } std::unique_ptr Create() { DCHECK_OK(VisitTypeInline(*base.type(), this)); From f4a4f437d89424d6689cca24e701c25269a473ae Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Sun, 31 May 2020 20:01:36 -0500 Subject: [PATCH 3/5] Simplify further by using lambda --- cpp/src/arrow/array/diff.cc | 127 ++++++++++++++++-------------------- 1 file changed, 56 insertions(+), 71 deletions(-) diff --git a/cpp/src/arrow/array/diff.cc b/cpp/src/arrow/array/diff.cc index 6a47af8e47c..8ddb99fda0d 100644 --- a/cpp/src/arrow/array/diff.cc +++ b/cpp/src/arrow/array/diff.cc @@ -88,41 +88,17 @@ static UnitSlice GetView(const UnionArray& array, int64_t index) { return UnitSlice{&array, index}; } -struct ValueComparator { - virtual ~ValueComparator() = default; - virtual bool operator()(int64_t base_index, int64_t target_index) = 0; -}; - -template -struct ValueComparatorImpl : public ValueComparator { - using ArrayType = typename TypeTraits::ArrayType; - - ValueComparatorImpl(const Array& base, const Array& target) - : base(MakeArray(base.data())), target(MakeArray(target.data())) {} - - bool operator()(int64_t base_index, int64_t target_index) override { - bool base_null = base->IsNull(base_index); - bool target_null = target->IsNull(target_index); - if (base_null && target_null) { - return true; - } else if (base_null || target_null) { - return false; - } - return (GetView(checked_cast(*base), base_index) == - GetView(checked_cast(*target), target_index)); - } - - std::shared_ptr base; - std::shared_ptr target; -}; +using ValueComparator = std::function; struct ValueComparatorVisitor { - ValueComparatorVisitor(const Array& base, const Array& target) - : base(base), target(target) {} - template Status Visit(const T&) { - out.reset(new ValueComparatorImpl(base, target)); + using ArrayType = typename TypeTraits::ArrayType; + out = [](const Array& base, int64_t base_index, const Array& target, + int64_t target_index) { + return (GetView(checked_cast(base), base_index) == + GetView(checked_cast(target), target_index)); + }; return Status::OK(); } @@ -134,21 +110,27 @@ struct ValueComparatorVisitor { return Status::NotImplemented("dictionary type"); } - std::unique_ptr Create() { - DCHECK_OK(VisitTypeInline(*base.type(), this)); - return std::move(out); + ValueComparator Create(const DataType& type) { + DCHECK_OK(VisitTypeInline(type, this)); + return out; } - const Array& base; - const Array& target; - std::unique_ptr out; + ValueComparator out; }; -std::unique_ptr GetValueComparator(const Array& base, - const Array& target) { - return ValueComparatorVisitor(base, target).Create(); +ValueComparator GetValueComparator(const DataType& type) { + ValueComparatorVisitor type_visitor; + return type_visitor.Create(type); } +// represents an intermediate state in the comparison of two arrays +struct EditPoint { + int64_t base, target; + bool operator==(EditPoint other) const { + return base == other.base && target == other.target; + } +}; + /// A generic sequence difference algorithm, based on /// /// E. W. Myers, "An O(ND) difference algorithm and its variations," @@ -166,19 +148,11 @@ std::unique_ptr GetValueComparator(const Array& base, /// implicitly. class QuadraticSpaceMyersDiff { public: - // represents an intermediate state in the comparison of two arrays - struct EditPoint { - int64_t base, target; - bool operator==(EditPoint other) const { - return base == other.base && target == other.target; - } - }; - QuadraticSpaceMyersDiff(const Array& base, const Array& target, MemoryPool* pool) : base_(base), target_(target), pool_(pool), - value_comparator_(GetValueComparator(base, target)), + value_comparator_(GetValueComparator(*base.type())), base_begin_(0), base_end_(base.length()), target_begin_(0), @@ -192,8 +166,16 @@ class QuadraticSpaceMyersDiff { } } - bool ValuesEqual(int64_t left, int64_t right) const { - return (*value_comparator_)(left, right); + bool ValuesEqual(int64_t base_index, int64_t target_index) const { + bool base_null = base_.IsNull(base_index); + bool target_null = target_.IsNull(target_index); + if (base_null && target_null) { + return true; + } else if (base_null || target_null) { + return false; + } else { + return value_comparator_(base_, base_index, target_, target_index); + } } // increment the position within base (the element pointed to was deleted) @@ -216,14 +198,7 @@ class QuadraticSpaceMyersDiff { // increment the position within base and target (the elements skipped in this way were // present in both sequences) - EditPoint ExtendFrom(EditPoint p) const { - for (; p.base != base_end_ && p.target != target_end_; ++p.base, ++p.target) { - if (!ValuesEqual(p.base, p.target)) { - break; - } - } - return p; - } + EditPoint ExtendFrom(EditPoint p) const; // beginning of a range for storing per-edit state in endpoint_base_ and insert_ int64_t StorageOffset(int64_t edit_count) const { @@ -232,17 +207,7 @@ class QuadraticSpaceMyersDiff { // given edit_count and index, augment endpoint_base_[index] with the corresponding // position in target (which is only implicitly represented in edit_count, index) - EditPoint GetEditPoint(int64_t edit_count, int64_t index) const { - DCHECK_GE(index, StorageOffset(edit_count)); - DCHECK_LT(index, StorageOffset(edit_count + 1)); - auto insertions_minus_deletions = - 2 * (index - StorageOffset(edit_count)) - edit_count; - auto maximal_base = endpoint_base_[index]; - auto maximal_target = std::min( - target_begin_ + ((maximal_base - base_begin_) + insertions_minus_deletions), - target_end_); - return {maximal_base, maximal_target}; - } + EditPoint GetEditPoint(int64_t edit_count, int64_t index) const; void Next() { ++edit_count_; @@ -341,7 +306,7 @@ class QuadraticSpaceMyersDiff { const Array& base_; const Array& target_; MemoryPool* pool_; - std::unique_ptr value_comparator_; + ValueComparator value_comparator_; int64_t finish_index_ = -1; int64_t edit_count_ = 0; int64_t base_begin_, base_end_; @@ -355,6 +320,26 @@ class QuadraticSpaceMyersDiff { std::vector insert_; }; +EditPoint QuadraticSpaceMyersDiff::ExtendFrom(EditPoint p) const { + for (; p.base != base_end_ && p.target != target_end_; ++p.base, ++p.target) { + if (!ValuesEqual(p.base, p.target)) { + break; + } + } + return p; +} + +EditPoint QuadraticSpaceMyersDiff::GetEditPoint(int64_t edit_count, int64_t index) const { + DCHECK_GE(index, StorageOffset(edit_count)); + DCHECK_LT(index, StorageOffset(edit_count + 1)); + auto insertions_minus_deletions = 2 * (index - StorageOffset(edit_count)) - edit_count; + auto maximal_base = endpoint_base_[index]; + auto maximal_target = std::min( + target_begin_ + ((maximal_base - base_begin_) + insertions_minus_deletions), + target_end_); + return {maximal_base, maximal_target}; +} + Result> NullDiff(const Array& base, const Array& target, MemoryPool* pool) { bool insert = base.length() < target.length(); From 96f5ae93e7020a37a3f576cdf035d0a421b9b0e8 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Sun, 31 May 2020 20:06:19 -0500 Subject: [PATCH 4/5] Simplify --- cpp/src/arrow/array/diff.cc | 50 +++++++++++++++++-------------------- 1 file changed, 23 insertions(+), 27 deletions(-) diff --git a/cpp/src/arrow/array/diff.cc b/cpp/src/arrow/array/diff.cc index 8ddb99fda0d..67e08fbfd9c 100644 --- a/cpp/src/arrow/array/diff.cc +++ b/cpp/src/arrow/array/diff.cc @@ -198,7 +198,14 @@ class QuadraticSpaceMyersDiff { // increment the position within base and target (the elements skipped in this way were // present in both sequences) - EditPoint ExtendFrom(EditPoint p) const; + EditPoint ExtendFrom(EditPoint p) const { + for (; p.base != base_end_ && p.target != target_end_; ++p.base, ++p.target) { + if (!ValuesEqual(p.base, p.target)) { + break; + } + } + return p; + } // beginning of a range for storing per-edit state in endpoint_base_ and insert_ int64_t StorageOffset(int64_t edit_count) const { @@ -207,7 +214,17 @@ class QuadraticSpaceMyersDiff { // given edit_count and index, augment endpoint_base_[index] with the corresponding // position in target (which is only implicitly represented in edit_count, index) - EditPoint GetEditPoint(int64_t edit_count, int64_t index) const; + EditPoint GetEditPoint(int64_t edit_count, int64_t index) const { + DCHECK_GE(index, StorageOffset(edit_count)); + DCHECK_LT(index, StorageOffset(edit_count + 1)); + auto insertions_minus_deletions = + 2 * (index - StorageOffset(edit_count)) - edit_count; + auto maximal_base = endpoint_base_[index]; + auto maximal_target = std::min( + target_begin_ + ((maximal_base - base_begin_) + insertions_minus_deletions), + target_end_); + return {maximal_base, maximal_target}; + } void Next() { ++edit_count_; @@ -295,11 +312,10 @@ class QuadraticSpaceMyersDiff { } Result> Diff() { - QuadraticSpaceMyersDiff impl(base_, target_, pool_); - while (!impl.Done()) { - impl.Next(); + while (!Done()) { + Next(); } - return impl.GetEdits(pool_); + return GetEdits(pool_); } private: @@ -320,26 +336,6 @@ class QuadraticSpaceMyersDiff { std::vector insert_; }; -EditPoint QuadraticSpaceMyersDiff::ExtendFrom(EditPoint p) const { - for (; p.base != base_end_ && p.target != target_end_; ++p.base, ++p.target) { - if (!ValuesEqual(p.base, p.target)) { - break; - } - } - return p; -} - -EditPoint QuadraticSpaceMyersDiff::GetEditPoint(int64_t edit_count, int64_t index) const { - DCHECK_GE(index, StorageOffset(edit_count)); - DCHECK_LT(index, StorageOffset(edit_count + 1)); - auto insertions_minus_deletions = 2 * (index - StorageOffset(edit_count)) - edit_count; - auto maximal_base = endpoint_base_[index]; - auto maximal_target = std::min( - target_begin_ + ((maximal_base - base_begin_) + insertions_minus_deletions), - target_end_); - return {maximal_base, maximal_target}; -} - Result> NullDiff(const Array& base, const Array& target, MemoryPool* pool) { bool insert = base.length() < target.length(); @@ -377,7 +373,7 @@ Result> Diff(const Array& base, const Array& target } else if (base.type()->id() == Type::EXTENSION) { auto base_storage = checked_cast(base).storage(); auto target_storage = checked_cast(target).storage(); - return QuadraticSpaceMyersDiff(*base_storage, *target_storage, pool).Diff(); + return Diff(*base_storage, *target_storage, pool); } else if (base.type()->id() == Type::EXTENSION) { return Status::NotImplemented("diffing arrays of type ", *base.type()); } else { From 40f235dc32fe0078e87ae17f53a1de11b53811f1 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Mon, 1 Jun 2020 11:31:03 -0500 Subject: [PATCH 5/5] Code review comments. Add test to show that diffing DictionaryArray fails --- cpp/src/arrow/array/diff.cc | 12 +++++------- cpp/src/arrow/array/diff.h | 2 +- cpp/src/arrow/array/diff_test.cc | 3 +++ 3 files changed, 9 insertions(+), 8 deletions(-) diff --git a/cpp/src/arrow/array/diff.cc b/cpp/src/arrow/array/diff.cc index 67e08fbfd9c..7cc1a8e69ca 100644 --- a/cpp/src/arrow/array/diff.cc +++ b/cpp/src/arrow/array/diff.cc @@ -169,13 +169,11 @@ class QuadraticSpaceMyersDiff { bool ValuesEqual(int64_t base_index, int64_t target_index) const { bool base_null = base_.IsNull(base_index); bool target_null = target_.IsNull(target_index); - if (base_null && target_null) { - return true; - } else if (base_null || target_null) { - return false; - } else { - return value_comparator_(base_, base_index, target_, target_index); + if (base_null || target_null) { + // If only one is null, then this is false, otherwise true + return base_null && target_null; } + return value_comparator_(base_, base_index, target_, target_index); } // increment the position within base (the element pointed to was deleted) @@ -374,7 +372,7 @@ Result> Diff(const Array& base, const Array& target auto base_storage = checked_cast(base).storage(); auto target_storage = checked_cast(target).storage(); return Diff(*base_storage, *target_storage, pool); - } else if (base.type()->id() == Type::EXTENSION) { + } else if (base.type()->id() == Type::DICTIONARY) { return Status::NotImplemented("diffing arrays of type ", *base.type()); } else { return QuadraticSpaceMyersDiff(base, target, pool).Diff(); diff --git a/cpp/src/arrow/array/diff.h b/cpp/src/arrow/array/diff.h index 7c091ee5dae..e0b85fb90f1 100644 --- a/cpp/src/arrow/array/diff.h +++ b/cpp/src/arrow/array/diff.h @@ -54,7 +54,7 @@ class MemoryPool; /// \return an edit script array which can be applied to base to produce target ARROW_EXPORT Result> Diff(const Array& base, const Array& target, - MemoryPool* pool); + MemoryPool* pool = default_memory_pool()); /// \brief visitor interface for easy traversal of an edit script /// diff --git a/cpp/src/arrow/array/diff_test.cc b/cpp/src/arrow/array/diff_test.cc index 4917d4524d1..827c8894ac6 100644 --- a/cpp/src/arrow/array/diff_test.cc +++ b/cpp/src/arrow/array/diff_test.cc @@ -586,6 +586,9 @@ TEST_F(DiffTest, DictionaryDiffFormatter) { )"; ASSERT_EQ(formatted.str(), formatted_expected_indices); + // Note: Diff doesn't work at the moment with dictionary arrays + ASSERT_RAISES(NotImplemented, Diff(*base_, *target_)); + // differing dictionaries target_dict = ArrayFromJSON(utf8(), R"(["b", "c", "a"])"); target_indices = base_indices;