From 3d34e984a29853088c7c39250e61a84b702afe07 Mon Sep 17 00:00:00 2001 From: David Li Date: Mon, 29 Mar 2021 13:06:33 -0400 Subject: [PATCH 1/7] ARROW-12134: [C++] Add match_substring_regex kernel --- .../arrow/compute/kernels/scalar_string.cc | 98 +++++++++++++++---- .../compute/kernels/scalar_string_test.cc | 21 ++++ 2 files changed, 102 insertions(+), 17 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/scalar_string.cc b/cpp/src/arrow/compute/kernels/scalar_string.cc index 39869879561..ab9ce0cd6b2 100644 --- a/cpp/src/arrow/compute/kernels/scalar_string.cc +++ b/cpp/src/arrow/compute/kernels/scalar_string.cc @@ -411,40 +411,104 @@ void TransformMatchSubstring(const uint8_t* pattern, int64_t pattern_length, using MatchSubstringState = OptionsWrapper; -template +template class Matcher> struct MatchSubstring { using offset_type = typename Type::offset_type; static void Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) { - MatchSubstringOptions arg = MatchSubstringState::Get(ctx); - const uint8_t* pat = reinterpret_cast(arg.pattern.c_str()); - const int64_t pat_size = arg.pattern.length(); + // TODO Cache matcher accross invocations (for regex compilation) + Matcher matcher(ctx, MatchSubstringState::Get(ctx)); + if (ctx->HasError()) return; StringBoolTransform( ctx, batch, - [pat, pat_size](const void* offsets, const uint8_t* data, int64_t length, - int64_t output_offset, uint8_t* output) { - TransformMatchSubstring( - pat, pat_size, reinterpret_cast(offsets), data, length, - output_offset, output); + [&matcher](const void* offsets, const uint8_t* data, int64_t length, + int64_t output_offset, uint8_t* output) { + matcher.Match(reinterpret_cast(offsets), data, length, + output_offset, output); }, out); } }; +template +struct PlainSubstringMatcher { + const MatchSubstringOptions& options_; + + PlainSubstringMatcher(KernelContext* ctx, const MatchSubstringOptions& options) + : options_(options) {} + + void Match(const offset_type* offsets, const uint8_t* data, int64_t length, + int64_t output_offset, uint8_t* output) { + const uint8_t* pat = reinterpret_cast(options_.pattern.c_str()); + const int64_t pat_size = options_.pattern.length(); + TransformMatchSubstring(pat, pat_size, offsets, data, length, + output_offset, output); + } +}; + +template +struct RegexSubstringMatcher { + const MatchSubstringOptions& options_; + const RE2 regex_match_; + + RegexSubstringMatcher(KernelContext* ctx, const MatchSubstringOptions& options) + : options_(options), regex_match_(options_.pattern) { + if (!regex_match_.ok()) { + ctx->SetStatus(Status::Invalid("Regular expression error")); + } + } + + void Match(const offset_type* offsets, const uint8_t* data, int64_t length, + int64_t output_offset, uint8_t* output) { + FirstTimeBitmapWriter bitmap_writer(output, output_offset, length); + for (int64_t i = 0; i < length; ++i) { + const char* current_data = reinterpret_cast(data + offsets[i]); + int64_t current_length = offsets[i + 1] - offsets[i]; + auto piece = re2::StringPiece(current_data, current_length); + if (re2::RE2::PartialMatch(piece, regex_match_)) { + bitmap_writer.Set(); + } + bitmap_writer.Next(); + } + bitmap_writer.Finish(); + } +}; + const FunctionDoc match_substring_doc( "Match strings against literal pattern", ("For each string in `strings`, emit true iff it contains a given pattern.\n" "Null inputs emit null. The pattern must be given in MatchSubstringOptions."), {"strings"}, "MatchSubstringOptions"); +const FunctionDoc match_substring_regex_doc( + "Match strings against regex pattern", + ("For each string in `strings`, emit true iff it matches a given pattern at any " + "position.\n" + "Null inputs emit null. The pattern must be given in MatchSubstringOptions."), + {"strings"}, "MatchSubstringOptions"); + void AddMatchSubstring(FunctionRegistry* registry) { - auto func = std::make_shared("match_substring", Arity::Unary(), - &match_substring_doc); - auto exec_32 = MatchSubstring::Exec; - auto exec_64 = MatchSubstring::Exec; - DCHECK_OK(func->AddKernel({utf8()}, boolean(), exec_32, MatchSubstringState::Init)); - DCHECK_OK( - func->AddKernel({large_utf8()}, boolean(), exec_64, MatchSubstringState::Init)); - DCHECK_OK(registry->AddFunction(std::move(func))); + { + auto func = std::make_shared("match_substring", Arity::Unary(), + &match_substring_doc); + auto exec_32 = MatchSubstring::Exec; + auto exec_64 = MatchSubstring::Exec; + DCHECK_OK(func->AddKernel({utf8()}, boolean(), exec_32, MatchSubstringState::Init)); + DCHECK_OK( + func->AddKernel({large_utf8()}, boolean(), exec_64, MatchSubstringState::Init)); + DCHECK_OK(registry->AddFunction(std::move(func))); + } +#ifdef ARROW_WITH_RE2 + { + auto func = std::make_shared("match_substring_regex", Arity::Unary(), + &match_substring_regex_doc); + auto exec_32 = MatchSubstring::Exec; + auto exec_64 = MatchSubstring::Exec; + DCHECK_OK(func->AddKernel({utf8()}, boolean(), exec_32, MatchSubstringState::Init)); + DCHECK_OK( + func->AddKernel({large_utf8()}, boolean(), exec_64, MatchSubstringState::Init)); + DCHECK_OK(registry->AddFunction(std::move(func))); + } +#endif } // IsAlpha/Digit etc diff --git a/cpp/src/arrow/compute/kernels/scalar_string_test.cc b/cpp/src/arrow/compute/kernels/scalar_string_test.cc index 88622e842d1..2dd0a4d8c74 100644 --- a/cpp/src/arrow/compute/kernels/scalar_string_test.cc +++ b/cpp/src/arrow/compute/kernels/scalar_string_test.cc @@ -348,6 +348,27 @@ TYPED_TEST(TestStringKernels, MatchSubstring) { &options_double_char_2); } +#ifdef ARROW_WITH_RE2 +TYPED_TEST(TestStringKernels, MatchSubstringRegex) { + MatchSubstringOptions options{"ab"}; + this->CheckUnary("match_substring_regex", "[]", boolean(), "[]", &options); + this->CheckUnary("match_substring_regex", R"(["abc", "acb", "cab", null, "bac"])", + boolean(), "[true, false, true, null, false]", &options); + MatchSubstringOptions options_repeated{"(ab){2}"}; + this->CheckUnary("match_substring_regex", R"(["abab", "ab", "cababc", null, "bac"])", + boolean(), "[true, false, true, null, false]", &options_repeated); + MatchSubstringOptions options_digit{"\\d"}; + this->CheckUnary("match_substring_regex", R"(["aacb", "a2ab", "", "24"])", boolean(), + "[false, true, false, true]", &options_digit); + MatchSubstringOptions options_star{"a*b"}; + this->CheckUnary("match_substring_regex", R"(["aacb", "aab", "dab", "caaab", "b", ""])", + boolean(), "[true, true, true, true, true, false]", &options_star); + MatchSubstringOptions options_plus{"a+b"}; + this->CheckUnary("match_substring_regex", R"(["aacb", "aab", "dab", "caaab", "b", ""])", + boolean(), "[false, true, true, true, false, false]", &options_plus); +} +#endif + TYPED_TEST(TestStringKernels, SplitBasics) { SplitPatternOptions options{" "}; // basics From b99f09bb726be6f77ce339b39af640abcd19428b Mon Sep 17 00:00:00 2001 From: David Li Date: Tue, 30 Mar 2021 08:40:40 -0400 Subject: [PATCH 2/7] ARROW-12134: [Python] Add match_substring_regex to Python, docs --- cpp/src/arrow/compute/api_scalar.h | 2 +- docs/source/cpp/compute.rst | 31 ++++++++++++++++------------ python/pyarrow/compute.py | 18 ++++++++++++++++ python/pyarrow/tests/test_compute.py | 7 +++++++ 4 files changed, 44 insertions(+), 14 deletions(-) diff --git a/cpp/src/arrow/compute/api_scalar.h b/cpp/src/arrow/compute/api_scalar.h index 730836bd118..f59426d8f1b 100644 --- a/cpp/src/arrow/compute/api_scalar.h +++ b/cpp/src/arrow/compute/api_scalar.h @@ -45,7 +45,7 @@ struct ArithmeticOptions : public FunctionOptions { struct ARROW_EXPORT MatchSubstringOptions : public FunctionOptions { explicit MatchSubstringOptions(std::string pattern) : pattern(std::move(pattern)) {} - /// The exact substring to look for inside input values. + /// The exact substring (or regex, depending on kernel) to look for inside input values. std::string pattern; }; diff --git a/docs/source/cpp/compute.rst b/docs/source/cpp/compute.rst index 065b80736aa..a95af93ff77 100644 --- a/docs/source/cpp/compute.rst +++ b/docs/source/cpp/compute.rst @@ -522,26 +522,31 @@ These functions trim off characters on both sides (trim), or the left (ltrim) or Containment tests ~~~~~~~~~~~~~~~~~ -+--------------------+------------+------------------------------------+---------------+----------------------------------------+ -| Function name | Arity | Input types | Output type | Options class | -+====================+============+====================================+===============+========================================+ -| match_substring | Unary | String-like | Boolean (1) | :struct:`MatchSubstringOptions` | -+--------------------+------------+------------------------------------+---------------+----------------------------------------+ -| index_in | Unary | Boolean, Null, Numeric, Temporal, | Int32 (2) | :struct:`SetLookupOptions` | -| | | Binary- and String-like | | | -+--------------------+------------+------------------------------------+---------------+----------------------------------------+ -| is_in | Unary | Boolean, Null, Numeric, Temporal, | Boolean (3) | :struct:`SetLookupOptions` | -| | | Binary- and String-like | | | -+--------------------+------------+------------------------------------+---------------+----------------------------------------+ ++---------------------------+------------+------------------------------------+---------------+----------------------------------------+ +| Function name | Arity | Input types | Output type | Options class | ++===========================+============+====================================+===============+========================================+ +| match_substring | Unary | String-like | Boolean (1) | :struct:`MatchSubstringOptions` | ++---------------------------+------------+------------------------------------+---------------+----------------------------------------+ +| match_substring_regex | Unary | String-like | Boolean (1) | :struct:`MatchSubstringOptions` | ++---------------------------+------------+------------------------------------+---------------+----------------------------------------+ +| index_in | Unary | Boolean, Null, Numeric, Temporal, | Int32 (2) | :struct:`SetLookupOptions` | +| | | Binary- and String-like | | | ++---------------------------+------------+------------------------------------+---------------+----------------------------------------+ +| is_in | Unary | Boolean, Null, Numeric, Temporal, | Boolean (3) | :struct:`SetLookupOptions` | +| | | Binary- and String-like | | | ++---------------------------+------------+------------------------------------+---------------+----------------------------------------+ * \(1) Output is true iff :member:`MatchSubstringOptions::pattern` is a substring of the corresponding input element. -* \(2) Output is the index of the corresponding input element in +* \(2) Output is true iff :member:`MatchSubstringOptions::pattern` + matches the corresponding input element at any position. + +* \(3) Output is the index of the corresponding input element in :member:`SetLookupOptions::value_set`, if found there. Otherwise, output is null. -* \(3) Output is true iff the corresponding input element is equal to one +* \(4) Output is true iff the corresponding input element is equal to one of the elements in :member:`SetLookupOptions::value_set`. diff --git a/python/pyarrow/compute.py b/python/pyarrow/compute.py index 1b46a08c402..3928b9cb904 100644 --- a/python/pyarrow/compute.py +++ b/python/pyarrow/compute.py @@ -306,6 +306,24 @@ def match_substring(array, pattern): MatchSubstringOptions(pattern)) +def match_substring_regex(array, pattern): + """ + Test if regex *pattern* matches at any position a value of a string array. + + Parameters + ---------- + array : pyarrow.Array or pyarrow.ChunkedArray + pattern : str + regex pattern to search + + Returns + ------- + result : pyarrow.Array or pyarrow.ChunkedArray + """ + return call_function("match_substring_regex", [array], + MatchSubstringOptions(pattern)) + + def sum(array): """ Sum the values in a numerical (chunked) array. diff --git a/python/pyarrow/tests/test_compute.py b/python/pyarrow/tests/test_compute.py index 160375f93bd..94a6189f41c 100644 --- a/python/pyarrow/tests/test_compute.py +++ b/python/pyarrow/tests/test_compute.py @@ -279,6 +279,13 @@ def test_match_substring(): assert expected.equals(result) +def test_match_substring_regex(): + arr = pa.array(["ab", "abc", "ba", "c", None]) + result = pc.match_substring_regex(arr, "^a?b") + expected = pa.array([True, True, True, False, None]) + assert expected.equals(result) + + def test_trim(): # \u3000 is unicode whitespace arr = pa.array([" foo", None, " \u3000foo bar \t"]) From d15f37ab48b46ba13ce70a7439e39e006cf154a1 Mon Sep 17 00:00:00 2001 From: David Li Date: Tue, 30 Mar 2021 08:41:46 -0400 Subject: [PATCH 3/7] ARROW-12134: [C++] Fix typos --- cpp/src/arrow/compute/kernels/scalar_string.cc | 4 ++-- docs/source/python/api/compute.rst | 1 + 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/scalar_string.cc b/cpp/src/arrow/compute/kernels/scalar_string.cc index ab9ce0cd6b2..b2e3eaa101c 100644 --- a/cpp/src/arrow/compute/kernels/scalar_string.cc +++ b/cpp/src/arrow/compute/kernels/scalar_string.cc @@ -415,7 +415,7 @@ template class Matcher> struct MatchSubstring { using offset_type = typename Type::offset_type; static void Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) { - // TODO Cache matcher accross invocations (for regex compilation) + // TODO Cache matcher across invocations (for regex compilation) Matcher matcher(ctx, MatchSubstringState::Get(ctx)); if (ctx->HasError()) return; StringBoolTransform( @@ -1310,7 +1310,7 @@ struct ReplaceSubString { using State = OptionsWrapper; static void Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) { - // TODO Cache replacer accross invocations (for regex compilation) + // TODO Cache replacer across invocations (for regex compilation) Replacer replacer{ctx, State::Get(ctx)}; if (!ctx->HasError()) { Replace(ctx, batch, &replacer, out); diff --git a/docs/source/python/api/compute.rst b/docs/source/python/api/compute.rst index 2dafbd23c08..d6efc6a5fea 100644 --- a/docs/source/python/api/compute.rst +++ b/docs/source/python/api/compute.rst @@ -155,6 +155,7 @@ Containment tests index_in is_in match_substring + match_substring_regex Conversions ----------- From e00cc4c465ee6eba515852468a09fff104265ec6 Mon Sep 17 00:00:00 2001 From: David Li Date: Wed, 31 Mar 2021 07:45:05 -0400 Subject: [PATCH 4/7] ARROW-12134: [C++] Add necessary ARROW_WITH_RE2 guard --- cpp/src/arrow/compute/kernels/scalar_string.cc | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/scalar_string.cc b/cpp/src/arrow/compute/kernels/scalar_string.cc index b2e3eaa101c..50bc0b4f6a4 100644 --- a/cpp/src/arrow/compute/kernels/scalar_string.cc +++ b/cpp/src/arrow/compute/kernels/scalar_string.cc @@ -445,6 +445,13 @@ struct PlainSubstringMatcher { } }; +const FunctionDoc match_substring_doc( + "Match strings against literal pattern", + ("For each string in `strings`, emit true iff it contains a given pattern.\n" + "Null inputs emit null. The pattern must be given in MatchSubstringOptions."), + {"strings"}, "MatchSubstringOptions"); + +#ifdef ARROW_WITH_RE2 template struct RegexSubstringMatcher { const MatchSubstringOptions& options_; @@ -473,18 +480,13 @@ struct RegexSubstringMatcher { } }; -const FunctionDoc match_substring_doc( - "Match strings against literal pattern", - ("For each string in `strings`, emit true iff it contains a given pattern.\n" - "Null inputs emit null. The pattern must be given in MatchSubstringOptions."), - {"strings"}, "MatchSubstringOptions"); - const FunctionDoc match_substring_regex_doc( "Match strings against regex pattern", ("For each string in `strings`, emit true iff it matches a given pattern at any " "position.\n" "Null inputs emit null. The pattern must be given in MatchSubstringOptions."), {"strings"}, "MatchSubstringOptions"); +#endif void AddMatchSubstring(FunctionRegistry* registry) { { From 7b37ecf9033f2a5fead0e6dc573f9d9b05fc31a9 Mon Sep 17 00:00:00 2001 From: David Li Date: Wed, 31 Mar 2021 09:34:19 -0400 Subject: [PATCH 5/7] ARROW-12134: [C++] Don't template the string matcher --- .../arrow/compute/kernels/scalar_string.cc | 117 ++++++++---------- 1 file changed, 49 insertions(+), 68 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/scalar_string.cc b/cpp/src/arrow/compute/kernels/scalar_string.cc index 50bc0b4f6a4..f5689ce2107 100644 --- a/cpp/src/arrow/compute/kernels/scalar_string.cc +++ b/cpp/src/arrow/compute/kernels/scalar_string.cc @@ -368,80 +368,72 @@ void StringBoolTransform(KernelContext* ctx, const ExecBatch& batch, } } -template -void TransformMatchSubstring(const uint8_t* pattern, int64_t pattern_length, - const offset_type* offsets, const uint8_t* data, - int64_t length, int64_t output_offset, uint8_t* output) { - // This is an implementation of the Knuth-Morris-Pratt algorithm - - // Phase 1: Build the prefix table - std::vector prefix_table(pattern_length + 1); - offset_type prefix_length = -1; - prefix_table[0] = -1; - for (offset_type pos = 0; pos < pattern_length; ++pos) { - // The prefix cannot be expanded, reset. - while (prefix_length >= 0 && pattern[pos] != pattern[prefix_length]) { - prefix_length = prefix_table[prefix_length]; - } - prefix_length++; - prefix_table[pos + 1] = prefix_length; - } - - // Phase 2: Find the prefix in the data - FirstTimeBitmapWriter bitmap_writer(output, output_offset, length); - for (int64_t i = 0; i < length; ++i) { - const uint8_t* current_data = data + offsets[i]; - int64_t current_length = offsets[i + 1] - offsets[i]; - - int64_t pattern_pos = 0; - for (int64_t k = 0; k < current_length; k++) { - while ((pattern_pos >= 0) && (pattern[pattern_pos] != current_data[k])) { - pattern_pos = prefix_table[pattern_pos]; - } - pattern_pos++; - if (pattern_pos == pattern_length) { - bitmap_writer.Set(); - break; - } - } - bitmap_writer.Next(); - } - bitmap_writer.Finish(); -} - using MatchSubstringState = OptionsWrapper; -template class Matcher> +template struct MatchSubstring { using offset_type = typename Type::offset_type; static void Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) { // TODO Cache matcher across invocations (for regex compilation) - Matcher matcher(ctx, MatchSubstringState::Get(ctx)); + Matcher matcher(ctx, MatchSubstringState::Get(ctx)); if (ctx->HasError()) return; StringBoolTransform( ctx, batch, - [&matcher](const void* offsets, const uint8_t* data, int64_t length, + [&matcher](const void* raw_offsets, const uint8_t* data, int64_t length, int64_t output_offset, uint8_t* output) { - matcher.Match(reinterpret_cast(offsets), data, length, - output_offset, output); + const offset_type* offsets = reinterpret_cast(raw_offsets); + FirstTimeBitmapWriter bitmap_writer(output, output_offset, length); + for (int64_t i = 0; i < length; ++i) { + const char* current_data = reinterpret_cast(data + offsets[i]); + int64_t current_length = offsets[i + 1] - offsets[i]; + if (matcher.Match(util::string_view(current_data, current_length))) { + bitmap_writer.Set(); + } + bitmap_writer.Next(); + } + bitmap_writer.Finish(); }, out); } }; -template +// This is an implementation of the Knuth-Morris-Pratt algorithm struct PlainSubstringMatcher { const MatchSubstringOptions& options_; + std::vector prefix_table; PlainSubstringMatcher(KernelContext* ctx, const MatchSubstringOptions& options) - : options_(options) {} + : options_(options) { + // Phase 1: Build the prefix table + const auto pattern_length = options_.pattern.size(); + prefix_table.reserve(pattern_length + 1); + int64_t prefix_length = -1; + prefix_table[0] = -1; + for (size_t pos = 0; pos < pattern_length; ++pos) { + // The prefix cannot be expanded, reset. + while (prefix_length >= 0 && + options_.pattern[pos] != options_.pattern[prefix_length]) { + prefix_length = prefix_table[prefix_length]; + } + prefix_length++; + prefix_table[pos + 1] = prefix_length; + } + } - void Match(const offset_type* offsets, const uint8_t* data, int64_t length, - int64_t output_offset, uint8_t* output) { - const uint8_t* pat = reinterpret_cast(options_.pattern.c_str()); - const int64_t pat_size = options_.pattern.length(); - TransformMatchSubstring(pat, pat_size, offsets, data, length, - output_offset, output); + bool Match(util::string_view current) { + // Phase 2: Find the prefix in the data + const auto pattern_length = options_.pattern.size(); + int64_t pattern_pos = 0; + for (const auto c : current) { + while ((pattern_pos >= 0) && (options_.pattern[pattern_pos] != c)) { + pattern_pos = prefix_table[pattern_pos]; + } + pattern_pos++; + if (static_cast(pattern_pos) == pattern_length) { + return true; + } + } + return false; } }; @@ -452,7 +444,6 @@ const FunctionDoc match_substring_doc( {"strings"}, "MatchSubstringOptions"); #ifdef ARROW_WITH_RE2 -template struct RegexSubstringMatcher { const MatchSubstringOptions& options_; const RE2 regex_match_; @@ -464,19 +455,9 @@ struct RegexSubstringMatcher { } } - void Match(const offset_type* offsets, const uint8_t* data, int64_t length, - int64_t output_offset, uint8_t* output) { - FirstTimeBitmapWriter bitmap_writer(output, output_offset, length); - for (int64_t i = 0; i < length; ++i) { - const char* current_data = reinterpret_cast(data + offsets[i]); - int64_t current_length = offsets[i + 1] - offsets[i]; - auto piece = re2::StringPiece(current_data, current_length); - if (re2::RE2::PartialMatch(piece, regex_match_)) { - bitmap_writer.Set(); - } - bitmap_writer.Next(); - } - bitmap_writer.Finish(); + bool Match(util::string_view current) { + auto piece = re2::StringPiece(current.data(), current.length()); + return re2::RE2::PartialMatch(piece, regex_match_); } }; From 83b159d84a56ede3ad2d3e51e31d98e30a79f3fc Mon Sep 17 00:00:00 2001 From: David Li Date: Wed, 31 Mar 2021 12:47:32 -0400 Subject: [PATCH 6/7] ARROW-12134: [C++] Fix numbering in docs --- docs/source/cpp/compute.rst | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/source/cpp/compute.rst b/docs/source/cpp/compute.rst index a95af93ff77..715d5036964 100644 --- a/docs/source/cpp/compute.rst +++ b/docs/source/cpp/compute.rst @@ -527,12 +527,12 @@ Containment tests +===========================+============+====================================+===============+========================================+ | match_substring | Unary | String-like | Boolean (1) | :struct:`MatchSubstringOptions` | +---------------------------+------------+------------------------------------+---------------+----------------------------------------+ -| match_substring_regex | Unary | String-like | Boolean (1) | :struct:`MatchSubstringOptions` | +| match_substring_regex | Unary | String-like | Boolean (2) | :struct:`MatchSubstringOptions` | +---------------------------+------------+------------------------------------+---------------+----------------------------------------+ -| index_in | Unary | Boolean, Null, Numeric, Temporal, | Int32 (2) | :struct:`SetLookupOptions` | +| index_in | Unary | Boolean, Null, Numeric, Temporal, | Int32 (3) | :struct:`SetLookupOptions` | | | | Binary- and String-like | | | +---------------------------+------------+------------------------------------+---------------+----------------------------------------+ -| is_in | Unary | Boolean, Null, Numeric, Temporal, | Boolean (3) | :struct:`SetLookupOptions` | +| is_in | Unary | Boolean, Null, Numeric, Temporal, | Boolean (4) | :struct:`SetLookupOptions` | | | | Binary- and String-like | | | +---------------------------+------------+------------------------------------+---------------+----------------------------------------+ From e3e31bd2af39f5b2830d35ca2838e1c5a6e41559 Mon Sep 17 00:00:00 2001 From: David Li Date: Thu, 1 Apr 2021 08:07:02 -0400 Subject: [PATCH 7/7] ARROW-12134: [C++] Properly zero-initialize prefix table --- cpp/src/arrow/compute/kernels/scalar_string.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/arrow/compute/kernels/scalar_string.cc b/cpp/src/arrow/compute/kernels/scalar_string.cc index f5689ce2107..9ec1fe005d4 100644 --- a/cpp/src/arrow/compute/kernels/scalar_string.cc +++ b/cpp/src/arrow/compute/kernels/scalar_string.cc @@ -406,7 +406,7 @@ struct PlainSubstringMatcher { : options_(options) { // Phase 1: Build the prefix table const auto pattern_length = options_.pattern.size(); - prefix_table.reserve(pattern_length + 1); + prefix_table.resize(pattern_length + 1, /*value=*/0); int64_t prefix_length = -1; prefix_table[0] = -1; for (size_t pos = 0; pos < pattern_length; ++pos) {