Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions ci/conda_env_cpp.yml
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ ninja
pkg-config
python
rapidjson
re2
snappy
thrift-cpp>=0.11.0
zlib
Expand Down
1 change: 0 additions & 1 deletion ci/conda_env_gandiva.yml
Original file line number Diff line number Diff line change
Expand Up @@ -17,4 +17,3 @@

clangdev=11
llvmdev=11
re2
1 change: 0 additions & 1 deletion ci/conda_env_gandiva_win.yml
Original file line number Diff line number Diff line change
Expand Up @@ -18,4 +18,3 @@
# llvmdev=9 or later require Visual Studio 2017
clangdev=8
llvmdev=8
re2
12 changes: 12 additions & 0 deletions cpp/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -330,6 +330,10 @@ if(ARROW_BUILD_BENCHMARKS
set(ARROW_TESTING ON)
endif()

# Building Gandiva forces RE2 support on, regardless of the user-supplied
# ARROW_WITH_RE2 value (presumably because Gandiva depends on RE2 -- confirm).
if(ARROW_GANDIVA)
set(ARROW_WITH_RE2 ON)
endif()

if(ARROW_CUDA
OR ARROW_FLIGHT
OR ARROW_PARQUET
Expand Down Expand Up @@ -746,6 +750,14 @@ if(ARROW_WITH_UTF8PROC)
endif()
endif()

# Link Arrow against RE2 when regular-expression support is enabled.
if(ARROW_WITH_RE2)
  list(APPEND ARROW_LINK_LIBS RE2::re2)
  list(APPEND ARROW_STATIC_LINK_LIBS RE2::re2)
  # Only export RE2 as an install-interface dependency of the static library
  # when RE2 itself comes from the system. The decision must be based on
  # RE2's own source setting, not on utf8proc's.
  if(RE2_SOURCE STREQUAL "SYSTEM")
    list(APPEND ARROW_STATIC_INSTALL_INTERFACE_LIBS RE2::re2)
  endif()
endif()

add_custom_target(arrow_dependencies)
add_custom_target(arrow_benchmark_dependencies)
add_custom_target(arrow_test_dependencies)
Expand Down
2 changes: 2 additions & 0 deletions cpp/cmake_modules/DefineOptions.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -363,6 +363,8 @@ if("${CMAKE_SOURCE_DIR}" STREQUAL "${CMAKE_CURRENT_SOURCE_DIR}")

define_option(ARROW_WITH_UTF8PROC
"Build with support for Unicode properties using the utf8proc library" ON)
define_option(ARROW_WITH_RE2
"Build with support for regular expressions using the re2 library" ON)

#----------------------------------------------------------------------
if(MSVC_TOOLCHAIN)
Expand Down
6 changes: 5 additions & 1 deletion cpp/cmake_modules/ThirdpartyToolchain.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -257,6 +257,9 @@ if(NOT ARROW_COMPUTE)
# utf8proc is only potentially used in kernels for now
set(ARROW_WITH_UTF8PROC OFF)
endif()
# Turn RE2 off when neither the compute module nor Gandiva is being built
# (presumably these are its only consumers for now -- confirm).
if((NOT ARROW_COMPUTE) AND (NOT ARROW_GANDIVA))
set(ARROW_WITH_RE2 OFF)
endif()

# ----------------------------------------------------------------------
# Versions and URLs for toolchain builds, which also can be used to configure
Expand Down Expand Up @@ -2090,8 +2093,9 @@ macro(build_re2)
list(APPEND ARROW_BUNDLED_STATIC_LIBS RE2::re2)
endmacro()

if(ARROW_GANDIVA)
if(ARROW_WITH_RE2)
resolve_dependency(RE2)
add_definitions(-DARROW_WITH_RE2)

# TODO: Don't use global includes but rather target_include_directories
get_target_property(RE2_INCLUDE_DIR RE2::re2 INTERFACE_INCLUDE_DIRECTORIES)
Expand Down
7 changes: 7 additions & 0 deletions cpp/src/arrow/compute/api_scalar.h
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,13 @@ struct ARROW_EXPORT SplitPatternOptions : public SplitOptions {
std::string pattern;
};

/// \brief Options for kernels that take a regular expression evaluated
/// with the RE2 library
struct ARROW_EXPORT RE2Options : public FunctionOptions {
  /// \param regex the regular expression pattern; it is stored as given
  /// (the by-value argument is moved into the member rather than copied again)
  explicit RE2Options(std::string regex) : regex(std::move(regex)) {}

  /// Regular expression
  std::string regex;
};

/// Options for IsIn and IndexIn functions
struct ARROW_EXPORT SetLookupOptions : public FunctionOptions {
explicit SetLookupOptions(Datum value_set, bool skip_nulls)
Expand Down
119 changes: 119 additions & 0 deletions cpp/src/arrow/compute/kernels/scalar_string.cc
Original file line number Diff line number Diff line change
Expand Up @@ -23,9 +23,12 @@
#include <utf8proc.h>
#endif

#include <re2/re2.h>
#include "arrow/array/builder_binary.h"
#include "arrow/array/builder_nested.h"
#include "arrow/buffer_builder.h"

#include "arrow/builder.h"
#include "arrow/compute/api_scalar.h"
#include "arrow/compute/kernels/common.h"
#include "arrow/util/utf8.h"
Expand Down Expand Up @@ -1194,6 +1197,121 @@ void AddSplit(FunctionRegistry* registry) {
#endif
}

// ----------------------------------------------------------------------
// re2 regex

template <typename Type>
struct ExtractRE2 {
using ArrayType = typename TypeTraits<Type>::ArrayType;
using ScalarType = typename TypeTraits<Type>::ScalarType;
using BuilderType = typename TypeTraits<Type>::BuilderType;
using State = OptionsWrapper<RE2Options>;

static void Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
RE2Options options = State::Get(ctx);
RE2 regex(options.regex);

if (!regex.ok()) {
ctx->SetStatus(Status::Invalid("Regular expression error"));
return;
}
std::vector<std::shared_ptr<Field>> fields;
int group_count = regex.NumberOfCapturingGroups();
fields.reserve(group_count);
const std::map<int, std::string> name_map = regex.CapturingGroupNames();

// We need to pass RE2 a Args* array, which all point to a std::string
std::vector<std::string> found_values(group_count);
std::vector<re2::RE2::Arg> args;
std::vector<re2::RE2::Arg*> args_pointers;
args.reserve(group_count);
args_pointers.reserve(group_count);

for (int i = 0; i < group_count; i++) {
auto item = name_map.find(i + 1); // re2 starts counting from 1
if (item == name_map.end()) {
ctx->SetStatus(Status::Invalid("Regular expression contains unnamed groups"));
return;
}
fields.emplace_back(new Field(item->second, batch[0].type()));
args.emplace_back(&found_values[i]);
// since we reserved capacity, we're guaranteed std::vector does not reallocate
// (which would cause the pointer to be invalid)
args_pointers.push_back(&args[i]);
}
auto type = struct_(fields);

if (batch[0].kind() == Datum::ARRAY) {
std::unique_ptr<ArrayBuilder> array_builder_tmp;
MakeBuilder(ctx->memory_pool(), type, &array_builder_tmp);
std::shared_ptr<StructBuilder> struct_builder;
struct_builder.reset(checked_cast<StructBuilder*>(array_builder_tmp.release()));

const ArrayData& input = *batch[0].array();
KERNEL_RETURN_IF_ERROR(
ctx,
VisitArrayDataInline<Type>(
input,
[&](util::string_view s) {
re2::StringPiece piece(s.data(), s.length());
if (re2::RE2::FullMatchN(piece, regex, &args_pointers[0], group_count)) {
for (int i = 0; i < group_count; i++) {
BuilderType* builder =
static_cast<BuilderType*>(struct_builder->field_builder(i));
RETURN_NOT_OK(builder->Append(found_values[i]));
}
RETURN_NOT_OK(struct_builder->Append());
} else {
RETURN_NOT_OK(struct_builder->AppendNull());
}
return Status::OK();
},
[&]() {
RETURN_NOT_OK(struct_builder->AppendNull());
return Status::OK();
}));
std::shared_ptr<StructArray> struct_array =
std::make_shared<StructArray>(out->array());
KERNEL_RETURN_IF_ERROR(ctx, struct_builder->Finish(&struct_array));
ArrayData* output = out->mutable_array();
output->type = type;
output->child_data = struct_array->data()->child_data;

} else {
const auto& input = checked_cast<const ScalarType&>(*batch[0].scalar());
auto result = std::make_shared<StructScalar>(type);
if (input.is_valid) {
util::string_view s = static_cast<util::string_view>(*input.value);
re2::StringPiece piece(s.data(), s.length());
if (re2::RE2::FullMatchN(piece, regex, &args_pointers[0], group_count)) {
for (int i = 0; i < group_count; i++) {
result->value.push_back(std::make_shared<ScalarType>(found_values[i]));
}
result->is_valid = true;
} else {
result->is_valid = false;
}
} else {
result->is_valid = false;
}
out->value = result;
}
}
};

// User-facing documentation for the "utf8_extract_re2" compute function.
const FunctionDoc utf8_extract_re2_doc(
    "Extract substrings captured by a regular expression",
    ("For each string in `strings`, match the regular expression against the\n"
     "whole string and, if successful, emit a struct whose field names and\n"
     "values come from the regular expression's named capture groups.\n"
     "If the input is null or the regular expression does not match,\n"
     "a null output value is emitted.\n"
     "\n"
     "Regular expression matching is done using the Google RE2 library."),
    {"strings"}, "RE2Options");

// Registers the unary "utf8_extract_re2" scalar function, with one kernel for
// utf8 input and one for large_utf8 input. Both kernels are declared with an
// empty struct output type; Exec assigns the concrete struct type itself.
void AddExtractRE2(FunctionRegistry* registry) {
  using StringKernel = ExtractRE2<StringType>;
  using LargeStringKernel = ExtractRE2<LargeStringType>;

  auto extract_func = std::make_shared<ScalarFunction>(
      "utf8_extract_re2", Arity::Unary(), &utf8_extract_re2_doc);

  DCHECK_OK(extract_func->AddKernel({utf8()}, {struct_({})}, StringKernel::Exec,
                                    StringKernel::State::Init));
  DCHECK_OK(extract_func->AddKernel({large_utf8()}, {struct_({})},
                                    LargeStringKernel::Exec,
                                    LargeStringKernel::State::Init));

  DCHECK_OK(registry->AddFunction(std::move(extract_func)));
}
// Registers every RE2-backed string function with `registry`.
void AddRE2(FunctionRegistry* registry) {
  AddExtractRE2(registry);
}

// ----------------------------------------------------------------------
// strptime string parsing

Expand Down Expand Up @@ -1496,6 +1614,7 @@ void RegisterScalarStringAscii(FunctionRegistry* registry) {
#endif

AddSplit(registry);
AddRE2(registry);
AddBinaryLength(registry);
AddMatchSubstring(registry);
AddStrptime(registry);
Expand Down
14 changes: 14 additions & 0 deletions cpp/src/arrow/compute/kernels/scalar_string_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -416,6 +416,20 @@ TYPED_TEST(TestStringKernels, SplitWhitespaceUTF8Reverse) {
&options_max);
}

// Checks that utf8_extract_re2 turns each fully matching string into a struct
// with one field per named capture group.
TYPED_TEST(TestStringKernels, ExtractRE2) {
  // Two named groups: a letter ('a' or 'b') followed by a single digit.
  RE2Options options{"(?P<letter>[ab])(?P<digit>\\d)"};
  auto letter_field = field("letter", this->type());
  auto digit_field = field("digit", this->type());
  auto expected_type = struct_({letter_field, digit_field});
  // TODO: enable test when the following issue is fixed:
  // https://issues.apache.org/jira/browse/ARROW-10208
  // this->CheckUnary(
  //     "utf8_extract_re2", R"(["a1", "b2", "c3", null])", expected_type,
  //     R"([{"letter": "a", "digit": "1"}, {"letter": "b", "digit": "2"}, null,
  //     null])", &options);
  this->CheckUnary(
      "utf8_extract_re2", R"(["a1", "b2"])", expected_type,
      R"([{"letter": "a", "digit": "1"}, {"letter": "b", "digit": "2"}])", &options);
}

TYPED_TEST(TestStringKernels, Strptime) {
std::string input1 = R"(["5/1/2020", null, "12/11/1900"])";
std::string output1 = R"(["2020-05-01", null, "1900-12-11"])";
Expand Down
14 changes: 14 additions & 0 deletions docs/source/cpp/compute.rst
Original file line number Diff line number Diff line change
Expand Up @@ -434,6 +434,20 @@ when a positive ``max_splits`` is given.
(``'\t'``, ``'\n'``, ``'\v'``, ``'\f'``, ``'\r'`` and ``' '``) is seen
as separator.

String extraction
~~~~~~~~~~~~~~~~~

+--------------------+------------+------------------------------------+---------------+----------------------------------------+
| Function name      | Arity      | Input types                        | Output type   | Options class                          |
+====================+============+====================================+===============+========================================+
| utf8_extract_re2   | Unary      | String-like                        | Struct (1)    | :struct:`RE2Options`                   |
+--------------------+------------+------------------------------------+---------------+----------------------------------------+

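* \(1) Extract substrings defined by a regular expression using the Google RE2
  library. The regular expression must match the whole input string, and every
  capture group must be named. Struct field names refer to the named groups,
  e.g. 'letter' and 'digit' for the following regular expression:
  '(?P<letter>[ab])(?P<digit>\\d)'. A null value is emitted when the input is
  null or does not match.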



Structural transforms
~~~~~~~~~~~~~~~~~~~~~
Expand Down
12 changes: 12 additions & 0 deletions python/pyarrow/_compute.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -606,6 +606,18 @@ class MatchSubstringOptions(_MatchSubstringOptions):
self._set_options(pattern)


cdef class RE2Options(FunctionOptions):
    """
    Options for compute functions that take an RE2 regular expression.

    Parameters
    ----------
    regex : str
        The regular expression. It is converted with ``tobytes`` before
        being handed to the C++ ``RE2Options`` constructor.
    """
    cdef:
        # Owns the underlying C++ options object.
        unique_ptr[CRE2Options] re2_options

    def __init__(self, regex):
        self.re2_options.reset(new CRE2Options(tobytes(regex)))

    cdef const CFunctionOptions* get_options(self) except NULL:
        return self.re2_options.get()


cdef class _FilterOptions(FunctionOptions):
cdef:
CFilterOptions filter_options
Expand Down
1 change: 1 addition & 0 deletions python/pyarrow/compute.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@
MinMaxOptions,
ModeOptions,
PartitionNthOptions,
RE2Options,
SetLookupOptions,
StrptimeOptions,
TakeOptions,
Expand Down
5 changes: 5 additions & 0 deletions python/pyarrow/includes/libarrow.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -1716,6 +1716,11 @@ cdef extern from "arrow/compute/api.h" namespace "arrow::compute" nogil:
c_bool reverse)
c_string pattern

# Cython-side declaration of the C++ struct arrow::compute::RE2Options:
# a constructor taking the regular expression and the public `regex` member.
cdef cppclass CRE2Options \
"arrow::compute::RE2Options"(CFunctionOptions):
CRE2Options(c_string regex)
c_string regex

cdef cppclass CCastOptions" arrow::compute::CastOptions"(CFunctionOptions):
CCastOptions()
CCastOptions(c_bool safe)
Expand Down
6 changes: 6 additions & 0 deletions python/pyarrow/tests/test_compute.py
Original file line number Diff line number Diff line change
Expand Up @@ -523,6 +523,12 @@ def test_string_py_compat_boolean(function_name, variant):
assert arrow_func(ar)[0].as_py() == getattr(c, py_name)()


def test_extract_re2():
    # Each input is one letter followed by one digit; the two named groups
    # of the regex become the keys of the resulting structs.
    strings = pa.array(['a1', 'b2'])
    extracted = pc.utf8_extract_re2(
        strings, regex='(?P<letter>[ab])(?P<digit>\\d)')
    expected = [
        {'letter': 'a', 'digit': '1'},
        {'letter': 'b', 'digit': '2'},
    ]
    assert extracted.tolist() == expected


@pytest.mark.parametrize(('ty', 'values'), all_array_types)
def test_take(ty, values):
arr = pa.array(values, type=ty)
Expand Down
2 changes: 1 addition & 1 deletion testing