From 8cf906a83b9f715663476065791d4dd9b05dd613 Mon Sep 17 00:00:00 2001 From: Sutou Kouhei Date: Sat, 4 Sep 2021 08:15:34 +0900 Subject: [PATCH] ARROW-13889: [Ruby] Implement slicer by compute kernels --- c_glib/arrow-glib/compute.cpp | 163 +++++++++++++++++++++ c_glib/arrow-glib/compute.h | 16 +++ c_glib/arrow-glib/compute.hpp | 4 + c_glib/test/test-is-in.rb | 24 ++++ c_glib/test/test-set-lookup-options.rb | 43 ++++++ ruby/red-arrow/lib/arrow/datum.rb | 2 + ruby/red-arrow/lib/arrow/slicer.rb | 187 ++++++------------------- ruby/red-arrow/test/test-slicer.rb | 11 +- 8 files changed, 301 insertions(+), 149 deletions(-) create mode 100644 c_glib/test/test-set-lookup-options.rb diff --git a/c_glib/arrow-glib/compute.cpp b/c_glib/arrow-glib/compute.cpp index 825d296dd26..b0839799d9a 100644 --- a/c_glib/arrow-glib/compute.cpp +++ b/c_glib/arrow-glib/compute.cpp @@ -165,6 +165,9 @@ G_BEGIN_DECLS * #GArrowSortOptions is a class to customize the `sort_indices` * function. * + * #GArrowSetLookupOptions is a class to customize the `is_in` function + * and `index_in` function. + * * There are many functions to compute data on an array. */ @@ -2417,6 +2420,157 @@ garrow_sort_options_set_sort_keys(GArrowSortOptions *options, } +typedef struct GArrowSetLookupOptionsPrivate_ { + GArrowDatum *value_set; +} GArrowSetLookupOptionsPrivate; + +enum { + PROP_SET_LOOKUP_OPTIONS_VALUE_SET = 1, + PROP_SET_LOOKUP_OPTIONS_SKIP_NULLS, +}; + +G_DEFINE_TYPE_WITH_PRIVATE(GArrowSetLookupOptions, + garrow_set_lookup_options, + GARROW_TYPE_FUNCTION_OPTIONS) + +#define GARROW_SET_LOOKUP_OPTIONS_GET_PRIVATE(object) \ + static_cast( \ + garrow_set_lookup_options_get_instance_private( \ + GARROW_SET_LOOKUP_OPTIONS(object))) + +static void +garrow_set_lookup_options_dispose(GObject *object) +{ + auto priv = GARROW_SET_LOOKUP_OPTIONS_GET_PRIVATE(object); + + if (priv->value_set) { + g_object_unref(priv->value_set); + priv->value_set = NULL; + } + + G_OBJECT_CLASS(garrow_set_lookup_options_parent_class)->dispose(object); +} + +static void +garrow_set_lookup_options_set_property(GObject *object, + guint prop_id, + const GValue *value, + GParamSpec *pspec) +{ + auto priv = GARROW_SET_LOOKUP_OPTIONS_GET_PRIVATE(object); + auto options = + garrow_set_lookup_options_get_raw(GARROW_SET_LOOKUP_OPTIONS(object)); + + switch (prop_id) { + case PROP_SET_LOOKUP_OPTIONS_VALUE_SET: + priv->value_set = GARROW_DATUM(g_value_dup_object(value)); + options->value_set = garrow_datum_get_raw(priv->value_set); + break; + case PROP_SET_LOOKUP_OPTIONS_SKIP_NULLS: + options->skip_nulls = g_value_get_boolean(value); + break; + default: + G_OBJECT_WARN_INVALID_PROPERTY_ID(object, prop_id, pspec); + break; + } +} + +static void +garrow_set_lookup_options_get_property(GObject *object, + guint prop_id, + GValue *value, + GParamSpec *pspec) +{ + auto priv = GARROW_SET_LOOKUP_OPTIONS_GET_PRIVATE(object); + auto options = + garrow_set_lookup_options_get_raw(GARROW_SET_LOOKUP_OPTIONS(object)); + + switch (prop_id) { + case PROP_SET_LOOKUP_OPTIONS_VALUE_SET: + g_value_set_object(value, priv->value_set); + break; + case PROP_SET_LOOKUP_OPTIONS_SKIP_NULLS: + g_value_set_boolean(value, options->skip_nulls); + break; + default: + G_OBJECT_WARN_INVALID_PROPERTY_ID(object, prop_id, pspec); + break; + } +} + +static void +garrow_set_lookup_options_init(GArrowSetLookupOptions *object) +{ + auto priv = GARROW_FUNCTION_OPTIONS_GET_PRIVATE(object); + priv->options = static_cast( + new arrow::compute::SetLookupOptions()); +} + +static void +garrow_set_lookup_options_class_init(GArrowSetLookupOptionsClass *klass) +{ + auto gobject_class = G_OBJECT_CLASS(klass); + + gobject_class->dispose = garrow_set_lookup_options_dispose; + gobject_class->set_property = garrow_set_lookup_options_set_property; + gobject_class->get_property = garrow_set_lookup_options_get_property; + + + arrow::compute::SetLookupOptions options; + + GParamSpec *spec; + /** + * GArrowSetLookupOptions:value-set: + * + * The set of values to look up input values into. + * + * Since: 6.0.0 + */ + spec = g_param_spec_object("value-set", + "Value set", + "The set of values to look up input values into", + GARROW_TYPE_DATUM, + static_cast(G_PARAM_READWRITE | + G_PARAM_CONSTRUCT_ONLY)); + g_object_class_install_property(gobject_class, + PROP_SET_LOOKUP_OPTIONS_VALUE_SET, + spec); + + /** + * GArrowSetLookupOptions:skip-nulls: + * + * Whether NULLs are skipped or not. + * + * Since: 6.0.0 + */ + spec = g_param_spec_boolean("skip-nulls", + "Skip NULLs", + "Whether NULLs are skipped or not", + options.skip_nulls, + static_cast(G_PARAM_READWRITE)); + g_object_class_install_property(gobject_class, + PROP_SET_LOOKUP_OPTIONS_SKIP_NULLS, + spec); +} + +/** + * garrow_set_lookup_options_new: + * @value_set: A #GArrowArrayDatum or #GArrowChunkedArrayDatum to be looked up. + * + * Returns: A newly created #GArrowSetLookupOptions. + * + * Since: 6.0.0 + */ +GArrowSetLookupOptions * +garrow_set_lookup_options_new(GArrowDatum *value_set) +{ + return GARROW_SET_LOOKUP_OPTIONS( + g_object_new(GARROW_TYPE_SET_LOOKUP_OPTIONS, + "value-set", value_set, + NULL)); +} + + /** * garrow_array_cast: * @array: A #GArrowArray. @@ -3755,3 +3909,12 @@ garrow_sort_options_get_raw(GArrowSortOptions *options) return static_cast( garrow_function_options_get_raw(GARROW_FUNCTION_OPTIONS(options))); } + +arrow::compute::SetLookupOptions * +garrow_set_lookup_options_get_raw(GArrowSetLookupOptions *options) +{ + return static_cast( + garrow_function_options_get_raw(GARROW_FUNCTION_OPTIONS(options))); +} + + diff --git a/c_glib/arrow-glib/compute.h b/c_glib/arrow-glib/compute.h index 108b27ff7ba..239cc50f9e5 100644 --- a/c_glib/arrow-glib/compute.h +++ b/c_glib/arrow-glib/compute.h @@ -436,6 +436,22 @@ garrow_sort_options_add_sort_key(GArrowSortOptions *options, GArrowSortKey *sort_key); +#define GARROW_TYPE_SET_LOOKUP_OPTIONS (garrow_set_lookup_options_get_type()) +G_DECLARE_DERIVABLE_TYPE(GArrowSetLookupOptions, + garrow_set_lookup_options, + GARROW, + SET_LOOKUP_OPTIONS, + GArrowFunctionOptions) +struct _GArrowSetLookupOptionsClass +{ + GArrowFunctionOptionsClass parent_class; +}; + +GARROW_AVAILABLE_IN_6_0 +GArrowSetLookupOptions * +garrow_set_lookup_options_new(GArrowDatum *value_set); + + GArrowArray *garrow_array_cast(GArrowArray *array, GArrowDataType *target_data_type, GArrowCastOptions *options, diff --git a/c_glib/arrow-glib/compute.hpp b/c_glib/arrow-glib/compute.hpp index 01265eee2a8..c616f6c0226 100644 --- a/c_glib/arrow-glib/compute.hpp +++ b/c_glib/arrow-glib/compute.hpp @@ -89,3 +89,7 @@ garrow_sort_key_get_raw(GArrowSortKey *sort_key); arrow::compute::SortOptions * garrow_sort_options_get_raw(GArrowSortOptions *options); + + +arrow::compute::SetLookupOptions * +garrow_set_lookup_options_get_raw(GArrowSetLookupOptions *options); diff --git a/c_glib/test/test-is-in.rb b/c_glib/test/test-is-in.rb index ba44075d6b3..590b5e3798a 100644 --- a/c_glib/test/test-is-in.rb +++ b/c_glib/test/test-is-in.rb @@ -46,6 +46,16 @@ def test_null_in_both assert_equal(build_boolean_array([false, true, true, true]), left.is_in(right)) end + + def test_options + left = build_int16_array([1, 0, nil, 2]) + right = build_int16_array([2, 0, nil]) + is_in = Arrow::Function.find("is_in") + options = Arrow::SetLookupOptions.new(Arrow::ArrayDatum.new(right)) + assert_equal(build_boolean_array([false, true, true, true]), + is_in.execute([Arrow::ArrayDatum.new(left)], + options).value) + end end sub_test_case("ChunkedArray") do @@ -92,5 +102,19 @@ def test_null_in_both assert_equal(build_boolean_array([false, true, true, true]), left.is_in_chunked_array(right)) end + + def test_options + left = build_int16_array([1, 0, nil, 2]) + chunks = [ + build_int16_array([2, 0]), + build_int16_array([3, nil]) + ] + right = Arrow::ChunkedArray.new(chunks) + is_in = Arrow::Function.find("is_in") + options = Arrow::SetLookupOptions.new(Arrow::ChunkedArrayDatum.new(right)) + assert_equal(build_boolean_array([false, true, true, true]), + is_in.execute([Arrow::ArrayDatum.new(left)], + options).value) + end end end diff --git a/c_glib/test/test-set-lookup-options.rb b/c_glib/test/test-set-lookup-options.rb new file mode 100644 index 00000000000..779bacef683 --- /dev/null +++ b/c_glib/test/test-set-lookup-options.rb @@ -0,0 +1,43 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +class TestSetLookupOptions < Test::Unit::TestCase + include Helper::Buildable + + def test_new + value_set = Arrow::ArrayDatum.new(build_int8_array([1, 2, 3])) + options = Arrow::SetLookupOptions.new(value_set) + assert_equal(value_set, options.value_set) + end + + sub_test_case("instance methods") do + def setup + value_set = Arrow::ArrayDatum.new(build_int8_array([1, 2, 3])) + @options = Arrow::SetLookupOptions.new(value_set) + end + + def test_skip_nulls + assert do + not @options.skip_nulls? + end + @options.skip_nulls = true + assert do + @options.skip_nulls? + end + end + end +end diff --git a/ruby/red-arrow/lib/arrow/datum.rb b/ruby/red-arrow/lib/arrow/datum.rb index 99d1dae32f8..196a18f54ff 100644 --- a/ruby/red-arrow/lib/arrow/datum.rb +++ b/ruby/red-arrow/lib/arrow/datum.rb @@ -21,6 +21,8 @@ class << self # @api private def try_convert(value) case value + when Table + TableDatum.new(value) when Array ArrayDatum.new(value) when ChunkedArray diff --git a/ruby/red-arrow/lib/arrow/slicer.rb b/ruby/red-arrow/lib/arrow/slicer.rb index fa834766866..6cca7f75e9b 100644 --- a/ruby/red-arrow/lib/arrow/slicer.rb +++ b/ruby/red-arrow/lib/arrow/slicer.rb @@ -16,9 +16,6 @@ # under the License. module Arrow - # Experimental - # - # TODO: Almost codes should be implemented in Apache Arrow C++. class Slicer def initialize(table) @table = table @@ -43,6 +40,21 @@ def method_missing(name, *args, &block) super end + module Helper + class << self + def ensure_boolean(column) + case column.data_type + when Arrow::BooleanDataType + column.data + else + options = CastOptions.new + options.to_data_type = Arrow::BooleanDataType.new + Function.find("cast").execute([column.data], options).value + end + end + end + end + class Condition def evaluate message = "Slicer::Condition must define \#evaluate: #{inspect}" @@ -69,43 +81,28 @@ def initialize(condition1, condition2) end def evaluate - values1 = @condition1.evaluate.each - values2 = @condition2.evaluate.each - raw_array = [] - begin - loop do - value1 = values1.next - value2 = values2.next - if value1.nil? or value2.nil? - raw_array << nil - else - raw_array << evaluate_value(value1, value2) - end - end - rescue StopIteration - end - BooleanArray.new(raw_array) + function.execute([@condition1.evaluate, @condition2.evaluate]).value end end class AndCondition < LogicalCondition private - def evaluate_value(value1, value2) - value1 and value2 + def function + Function.find("and") end end class OrCondition < LogicalCondition private - def evaluate_value(value1, value2) - value1 or value2 + def function + Function.find("or") end end class XorCondition < LogicalCondition private - def evaluate_value(value1, value2) - value1 ^ value2 + def function + Function.find("xor") end end @@ -115,21 +112,7 @@ def initialize(column) end def evaluate - data = @column.data - - case @column.data_type - when BooleanDataType - data - else - if data.n_chunks == 1 - data.get_chunk(0).cast(BooleanDataType.new, nil) - else - arrays = data.each_chunk.collect do |chunk| - chunk.cast(BooleanDataType.new, nil) - end - ChunkedArray.new(arrays) - end - end + Helper.ensure_boolean(@column) end def !@ @@ -187,23 +170,8 @@ def initialize(column) end def evaluate - data = @column.data - raw_array = [] - data.each_chunk do |chunk| - if chunk.is_a?(BooleanArray) - boolean_array = chunk - else - boolean_array = chunk.cast(BooleanDataType.new, nil) - end - boolean_array.each do |value| - if value.nil? - raw_array << value - else - raw_array << !value - end - end - end - BooleanArray.new(raw_array) + data = Helper.ensure_boolean(@column) + Function.find("invert").execute([data]).value end def !@ @@ -222,19 +190,10 @@ def !@ end def evaluate - case @value - when nil - raw_array = @column.collect(&:nil?) - BooleanArray.new(raw_array) + if @value.nil? + Function.find("is_null").execute([@column.data]).value else - raw_array = @column.collect do |value| - if value.nil? - nil - else - @value == value - end - end - BooleanArray.new(raw_array) + Function.find("equal").execute([@column.data, @value]).value end end end @@ -250,25 +209,10 @@ def !@ end def evaluate - case @value - when nil - if @column.n_nulls.zero? - raw_array = [true] * @column.n_rows - else - raw_array = @column.n_rows.times.collect do |i| - @column.valid?(i) - end - end - BooleanArray.new(raw_array) + if @value.nil? + Function.find("is_valid").execute([@column.data]).value else - raw_array = @column.collect do |value| - if value.nil? - nil - else - @value != value - end - end - BooleanArray.new(raw_array) + Function.find("not_equal").execute([@column.data, @value]).value end end end @@ -284,14 +228,7 @@ def !@ end def evaluate - raw_array = @column.collect do |value| - if value.nil? - nil - else - @value > value - end - end - BooleanArray.new(raw_array) + Function.find("less").execute([@column.data, @value]).value end end @@ -306,14 +243,7 @@ def !@ end def evaluate - raw_array = @column.collect do |value| - if value.nil? - nil - else - @value >= value - end - end - BooleanArray.new(raw_array) + Function.find("less_equal").execute([@column.data, @value]).value end end @@ -328,14 +258,7 @@ def !@ end def evaluate - raw_array = @column.collect do |value| - if value.nil? - nil - else - @value < value - end - end - BooleanArray.new(raw_array) + Function.find("greater").execute([@column.data, @value]).value end end @@ -350,14 +273,7 @@ def !@ end def evaluate - raw_array = @column.collect do |value| - if value.nil? - nil - else - @value <= value - end - end - BooleanArray.new(raw_array) + Function.find("greater_equal").execute([@column.data, @value]).value end end @@ -372,18 +288,10 @@ def !@ end def evaluate - values_index = {} - @values.each do |value| - values_index[value] = true - end - raw_array = @column.collect do |value| - if value.nil? - nil - else - values_index.key?(value) - end - end - BooleanArray.new(raw_array) + values = @values + values = Array.new(values) unless values.is_a?(Array) + options = SetLookupOptions.new(values) + Function.find("is_in").execute([@column.data], options).value end end @@ -398,18 +306,11 @@ def !@ end def evaluate - values_index = {} - @values.each do |value| - values_index[value] = true - end - raw_array = @column.collect do |value| - if value.nil? - nil - else - not values_index.key?(value) - end - end - BooleanArray.new(raw_array) + values = @values + values = Array.new(values) unless values.is_a?(Array) + options = SetLookupOptions.new(values) + booleans = Function.find("is_in").execute([@column.data], options).value + Function.find("invert").execute([booleans]).value end end diff --git a/ruby/red-arrow/test/test-slicer.rb b/ruby/red-arrow/test/test-slicer.rb index b0f2dfa32c6..420086690a0 100644 --- a/ruby/red-arrow/test/test-slicer.rb +++ b/ruby/red-arrow/test/test-slicer.rb @@ -349,12 +349,11 @@ def setup slicer.count.in?([1, 4, 16, 64]) end assert_equal(<<-TABLE, sliced_table.to_s) - count visible -0 1 true -1 4 (null) -2 16 true -3 64 (null) -4 (null) (null) + count visible +0 1 true +1 4 (null) +2 16 true +3 64 (null) TABLE end