From d9d7270a8fa33ee58cea3d71b4bd882fd1175dc1 Mon Sep 17 00:00:00 2001 From: Matt Walker Date: Mon, 15 Apr 2013 12:59:45 -0500 Subject: [PATCH 01/36] Upgrade bundled gems, convert rubygems.org to SSL, point build notifications at #etsydoop on IRC --- .travis.yml | 3 +-- Gemfile | 4 ++-- Gemfile.lock | 6 +++--- 3 files changed, 6 insertions(+), 7 deletions(-) diff --git a/.travis.yml b/.travis.yml index 09f916d..1348cf0 100644 --- a/.travis.yml +++ b/.travis.yml @@ -2,5 +2,4 @@ language: ruby rvm: - jruby-18mode notifications: - recipients: - - mwalker@etsy.com + irc: "irc.freenode.org#etsydoop" diff --git a/Gemfile b/Gemfile index a65514d..ae50466 100644 --- a/Gemfile +++ b/Gemfile @@ -1,6 +1,6 @@ -source :rubygems +source 'https://rubygems.org' group :test do - gem 'rake', '0.8.7' + gem 'rake', '10.0.3' gem 'rspec', '1.1.11' end diff --git a/Gemfile.lock b/Gemfile.lock index bda8983..7b6363d 100644 --- a/Gemfile.lock +++ b/Gemfile.lock @@ -1,12 +1,12 @@ GEM - remote: http://rubygems.org/ + remote: https://rubygems.org/ specs: - rake (0.8.7) + rake (10.0.3) rspec (1.1.11) PLATFORMS java DEPENDENCIES - rake (= 0.8.7) + rake (= 10.0.3) rspec (= 1.1.11) From 23369d6d846de2dcd99346f525ddb9d2d25aedd2 Mon Sep 17 00:00:00 2001 From: Matt Walker Date: Mon, 15 Apr 2013 13:14:05 -0500 Subject: [PATCH 02/36] Update specs to pass against JRuby 1.7.3, add to supported list --- README.md | 2 +- spec/jruby_version_spec.rb | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index c6bf9c2..7cfa943 100644 --- a/README.md +++ b/README.md @@ -32,4 +32,4 @@ For operations you can apply to your dataflow within a pipe assembly, see the [A Note that the Ruby code you write merely constructs a Cascading job, so no JRuby runtime is required on your cluster. This stands in contrast with writing [Hadoop streaming jobs in Ruby](http://www.quora.com/How-do-the-different-options-for-Ruby-on-Hadoop-compare). 
To run cascading.jruby applications on a Hadoop cluster, you must use [Jading](https://github.com/etsy/jading) to package them into a job jar. -cascading.jruby has been tested on JRuby versions 1.2.0, 1.4.0, 1.5.3, 1.6.5, and 1.6.7.2. +cascading.jruby has been tested on JRuby versions 1.2.0, 1.4.0, 1.5.3, 1.6.5, 1.6.7.2, 1.7.0, and 1.7.3. diff --git a/spec/jruby_version_spec.rb b/spec/jruby_version_spec.rb index f6bf3a4..e78f9fa 100644 --- a/spec/jruby_version_spec.rb +++ b/spec/jruby_version_spec.rb @@ -19,7 +19,7 @@ end thrown.should == 'InvocationTargetException' - if JRUBY_VERSION == '1.7.0' + if JRUBY_VERSION == '1.7.0' || JRUBY_VERSION == '1.7.3' exception.java_class.should be Java::JavaLangReflect::InvocationTargetException.java_class else # How can this be? A nil exception? @@ -58,7 +58,7 @@ result = e.validate result.should == 0 end - when '1.5.3', '1.6.5', '1.6.7.2', '1.7.0' + when '1.5.3', '1.6.5', '1.6.7.2', '1.7.0', '1.7.3' it 'should handle Fixnum -> Integer for ExprStub#eval' do e = ExprStub.new('x:int + y:int') result = e.eval(:x => 2, :y => 3) From 73dcad9f1c019a24ef26f342cf9a18c04e1d5dca Mon Sep 17 00:00:00 2001 From: Matt Walker Date: Mon, 15 Apr 2013 16:07:15 -0500 Subject: [PATCH 03/36] Begin removing *args in Assembly; this is a non-backwards compatible change --- cascading.jruby.gemspec | 2 +- lib/cascading.rb | 2 + lib/cascading/aggregations.rb | 7 +- lib/cascading/assembly.rb | 322 ++++++++------------------- lib/cascading/ext/array.rb | 17 ++ lib/cascading/identity_operations.rb | 82 +++++++ lib/cascading/operations.rb | 50 ----- lib/cascading/regex_operations.rb | 128 +++++++++++ samples/branch.rb | 3 +- samples/group_by.rb | 2 +- samples/join.rb | 6 +- samples/logwordcount.rb | 2 +- samples/project.rb | 2 +- samples/rename.rb | 2 +- samples/replace.rb | 16 ++ samples/scorenames.rb | 2 +- samples/splitter.rb | 2 +- samples/sub_assembly.rb | 2 +- samples/ungroup.rb | 2 +- samples/union.rb | 2 +- samples/unique.rb | 2 +- 
spec/cascading_spec.rb | 4 +- spec/scope_spec.rb | 4 +- spec/spec_util.rb | 4 +- test/mock_assemblies.rb | 4 +- test/test_assembly.rb | 26 ++- test/test_local_execution.rb | 14 +- 27 files changed, 396 insertions(+), 315 deletions(-) create mode 100644 lib/cascading/identity_operations.rb create mode 100644 lib/cascading/regex_operations.rb create mode 100755 samples/replace.rb diff --git a/cascading.jruby.gemspec b/cascading.jruby.gemspec index a9c27c2..607df80 100644 --- a/cascading.jruby.gemspec +++ b/cascading.jruby.gemspec @@ -9,7 +9,7 @@ Gem::Specification.new do |s| s.description = "cascading.jruby is a small DSL above Cascading, written in JRuby" s.email = "mwalker@etsy.com" s.extra_rdoc_files = ["LICENSE.txt"] - s.files = ["lib/cascading.rb", "lib/cascading/aggregations.rb", "lib/cascading/assembly.rb", "lib/cascading/base.rb", "lib/cascading/cascade.rb", "lib/cascading/cascading.rb", "lib/cascading/cascading_exception.rb", "lib/cascading/expr_stub.rb", "lib/cascading/ext/array.rb", "lib/cascading/flow.rb", "lib/cascading/mode.rb", "lib/cascading/operations.rb", "lib/cascading/scope.rb", "lib/cascading/sub_assembly.rb", "lib/cascading/tap.rb"] + s.files = Dir.glob("lib/**/*.rb") s.homepage = "http://github.com/etsy/cascading.jruby" s.rdoc_options = ["--main", "README.md"] s.require_paths = ["lib"] diff --git a/lib/cascading.rb b/lib/cascading.rb index 64e61b3..a294b0b 100644 --- a/lib/cascading.rb +++ b/lib/cascading.rb @@ -14,6 +14,8 @@ module Cascading require 'cascading/flow' require 'cascading/mode' require 'cascading/operations' +require 'cascading/identity_operations' +require 'cascading/regex_operations' require 'cascading/scope' require 'cascading/tap' diff --git a/lib/cascading/aggregations.rb b/lib/cascading/aggregations.rb index 2980748..12a28de 100644 --- a/lib/cascading/aggregations.rb +++ b/lib/cascading/aggregations.rb @@ -5,8 +5,7 @@ module Cascading # Rules enforced by Aggregations: # * Contains either 1 Buffer or >= 1 Aggregator 
(explicitly checked) - # * No GroupBys, CoGroups, Joins, or Merges (methods for these pipes do not - # exist on Aggregations) + # * No GroupBys, CoGroups, Joins, or Merges (methods for these pipes do not exist on Aggregations) # * No Eaches (Aggregations#each does not exist) # * Aggregations may not branch (Aggregations#branch does not exist) # @@ -15,9 +14,7 @@ module Cascading # * Must follow a GroupBy or CoGroup (not a Join or Merge) # # Optimizations: - # * If the leading Group is a GroupBy and all subsequent Everies are - # Aggregators that have a corresponding AggregateBy, Aggregations can replace - # the GroupBy/Aggregator pipe with a single composite AggregateBy + # * If the leading Group is a GroupBy and all subsequent Everies are Aggregators that have a corresponding AggregateBy, Aggregations can replace the GroupBy/Aggregator pipe with a single composite AggregateBy class Aggregations include Operations diff --git a/lib/cascading/assembly.rb b/lib/cascading/assembly.rb index 684b4c1..a20d8ea 100644 --- a/lib/cascading/assembly.rb +++ b/lib/cascading/assembly.rb @@ -1,10 +1,30 @@ require 'cascading/base' require 'cascading/operations' +require 'cascading/identity_operations' +require 'cascading/regex_operations' require 'cascading/aggregations' require 'cascading/sub_assembly' require 'cascading/ext/array' module Cascading + # An Assembly is a sequence of Cascading pipes (Each, GroupBy, CoGroup, + # Every, and SubAssembly). This class will serve as your primary mechanism + # for doing work within a flow and contains all the functions and filters you + # will apply to a pipe (Eaches), as well as group_by, union, and join. For + # aggregators and buffers, please see Aggregations. 
+ # + # Function and filter DSL rules: + # * Use positional arguments for required parameters + # * Use params = {} for optional parameters + # * Use *args sparingly, specifically when you need to accept a varying length list of fields + # * If you require both a varying length list of fields and optional parameters, then see the Array#extract_options! extension + # * If you choose to name a required parameter, add it to params = {} and throw an exception if the caller does not provide it + # * If you have a require parameter that is provided by one of a set of params names, throw an exception if the caller does not provide at least one value (see :function and :filter in Assembly#each for an example) + # + # Function and filter DSL standard optional parameter names: + # [input] c.p.Each argument selector + # [into] c.o.Operation field declaration + # [output] c.p.Each output selector class Assembly < Cascading::Node include Operations @@ -105,12 +125,10 @@ def to_s "#{name} : head pipe : #{head_pipe} - tail pipe: #{tail_pipe}" end - def prepare_join(*args, &block) - options = args.extract_options! - - pipes, _ = populate_incoming_scopes(args) + def prepare_join(assembly_names, params, &block) + pipes, _ = populate_incoming_scopes(assembly_names) - group_fields_args = options[:on] + group_fields_args = params[:on] raise 'join requires :on parameter' unless group_fields_args if group_fields_args.kind_of?(String) @@ -131,9 +149,9 @@ def prepare_join(*args, &block) raise 'join requires non-empty :on parameter' if group_fields_args.empty? 
group_fields = group_fields.to_java(Java::CascadingTuple::Fields) incoming_fields = @incoming_scopes.map{ |s| s.values_fields } - declared_fields = fields(options[:declared_fields] || dedup_fields(*incoming_fields)) - joiner = options[:joiner] - is_hash_join = options[:hash] || false + declared_fields = fields(params[:declared_fields] || dedup_fields(*incoming_fields)) + joiner = params[:joiner] + is_hash_join = params[:hash] || false case joiner when :inner, 'inner', nil @@ -182,49 +200,47 @@ def prepare_join(*args, &block) # Builds a HashJoin pipe. This should be used carefully, as the right side # of the join is accumulated entirely in memory. Requires a list of assembly # names to join and :on to specify the join_fields. - def hash_join(*args, &block) - options = args.extract_options! - options[:hash] = true - args << options - prepare_join(*args, &block) + def hash_join(*args_with_params, &block) + params, assembly_names = args_with_params.extract_options!, args_with_params + params[:hash] = true + prepare_join(assembly_names, params, &block) end # Builds a join (CoGroup) pipe. Requires a list of assembly names to join # and :on to specify the group_fields. - def join(*args, &block) - options = args.extract_options! - options[:hash] = false - args << options - prepare_join(*args, &block) + def join(*args_with_params, &block) + params, assembly_names = args_with_params.extract_options!, args_with_params + params[:hash] = false + prepare_join(assembly_names, params, &block) end alias co_group join - def inner_join(*args, &block) - options = args.extract_options! - options[:joiner] = :inner - args << options - join(*args, &block) + def inner_join(*args_with_params, &block) + params = args_with_params.extract_options! + params[:joiner] = :inner + args_with_params << params + join(*args_with_params, &block) end - def left_join(*args, &block) - options = args.extract_options! 
- options[:joiner] = :left - args << options - join(*args, &block) + def left_join(*args_with_params, &block) + params = args_with_params.extract_options! + params[:joiner] = :left + args_with_params << params + join(*args_with_params, &block) end - def right_join(*args, &block) - options = args.extract_options! - options[:joiner] = :right - args << options - join(*args, &block) + def right_join(*args_with_params, &block) + params = args_with_params.extract_options! + params[:joiner] = :right + args_with_params << params + join(*args_with_params, &block) end - def outer_join(*args, &block) - options = args.extract_options! - options[:joiner] = :outer - args << options - join(*args, &block) + def outer_join(*args_with_params, &block) + params = args_with_params.extract_options! + params[:joiner] = :outer + args_with_params << params + join(*args_with_params, &block) end # Builds a new branch. @@ -236,13 +252,13 @@ def branch(name, &block) assembly end - # Builds a new GroupBy pipe that groups on the fields given in args. - # Any block passed to this method should contain only Everies. - def group_by(*args, &block) - options = args.extract_options! - group_fields = fields(args) - sort_fields = fields(options[:sort_by]) - reverse = options[:reverse] + # Builds a new GroupBy pipe that groups on the fields given in + # args_with_params. Any block passed to this method should contain only + # Everies. + def group_by(*args_with_params, &block) + params, group_fields = args_with_params.extract_options!, fields(args_with_params) + sort_fields = fields(params[:sort_by]) + reverse = params[:reverse] parameters = [tail_pipe, group_fields, sort_fields, reverse].compact apply_aggregations(Java::CascadingPipe::GroupBy.new(*parameters), [scope], &block) @@ -254,13 +270,13 @@ def group_by(*args, &block) # aggregations. # # By default, groups only on the first field (see line 189 of GroupBy.java) - def union(*args, &block) - options = args.extract_options! 
- group_fields = fields(options[:on]) - sort_fields = fields(options[:sort_by]) - reverse = options[:reverse] + def union(*args_with_params, &block) + params, assembly_names = args_with_params.extract_options!, args_with_params + group_fields = fields(params[:on]) + sort_fields = fields(params[:sort_by]) + reverse = params[:reverse] - pipes, _ = populate_incoming_scopes(args) + pipes, _ = populate_incoming_scopes(assembly_names) # Must provide group_fields to ensure field name propagation group_fields = fields(@incoming_scopes.first.values_fields.get(0)) unless group_fields @@ -287,85 +303,44 @@ def sub_assembly(sub_assembly, pipes = [tail_pipe], incoming_scopes = [scope]) sub_assembly end - # Builds a basic _each_ pipe, and adds it to the current assembly. - # -- - # Example: - # each 'line', :function => regex_splitter(['name', 'val1', 'val2', 'id'], :pattern => /[.,]*\s+/), :output => ['id', 'name', 'val1', 'val2'] - def each(*args) - options = args.extract_options! - - in_fields = fields(args) - out_fields = fields(options[:output]) - - operation = options[:filter] || options[:function] - raise 'c.p.Each does not support applying an output selector to a c.o.Filter' if options[:filter] && options[:output] + # Builds a basic each pipe, and adds it to the current assembly. + # + # Default arguments are all_fields, a default inherited from c.o.Each. 
+ def each(*args_with_params) + params, in_fields = args_with_params.extract_options!, fields(args_with_params) + out_fields = fields(params[:output]) # Default Fields.RESULTS from c.o.Each + operation = params[:filter] || params[:function] + raise 'each requires either :filter or :function' unless operation + raise 'c.p.Each does not support applying an output selector to a c.o.Filter' if params[:filter] && params[:output] parameters = [tail_pipe, in_fields, operation, out_fields].compact each = make_pipe(Java::CascadingPipe::Each, parameters) - raise ':function specified but c.o.Filter provided' if options[:function] && each.is_filter - raise ':filter specified but c.o.Function provided' if options[:filter] && each.is_function + raise ':function specified but c.o.Filter provided' if params[:function] && each.is_filter + raise ':filter specified but c.o.Function provided' if params[:filter] && each.is_function each end - # Restricts the current assembly to the specified fields. - # -- - # Example: - # project "field1", "field2" - def project(*args) - each fields(args), :function => Java::CascadingOperation::Identity.new - end - - # Removes the specified fields from the current assembly. - # -- - # Example: - # discard "field1", "field2" - def discard(*args) - discard_fields = fields(args) - keep_fields = difference_fields(scope.values_fields, discard_fields) - project(*keep_fields.to_a) - end - - # Renames fields according to the mapping provided. - # -- - # Example: - # rename "old_name" => "new_name" - def rename(name_map) - old_names = scope.values_fields.to_a - new_names = old_names.map{ |name| name_map[name] || name } - invalid = name_map.keys.sort - old_names - raise "invalid names: #{invalid.inspect}" unless invalid.empty? 
+ include IdentityOperations + include RegexOperations - each all_fields, :function => Java::CascadingOperation::Identity.new(fields(new_names)) - end - - def cast(type_map) - names = type_map.keys.sort - types = JAVA_TYPE_MAP.values_at(*type_map.values_at(*names)) - fields = fields(names) - types = types.to_java(java.lang.Class) - each fields, :function => Java::CascadingOperation::Identity.new(fields, types) - end + def assert(assertion, params = {}) + assertion_level = params[:level] || Java::CascadingOperation::AssertionLevel::STRICT - def copy(*args) - options = args.extract_options! - from = args[0] || all_fields - into = args[1] || options[:into] || all_fields - each fields(from), :function => Java::CascadingOperation::Identity.new(fields(into)), :output => all_fields + parameters = [tail_pipe, assertion_level, assertion] + make_pipe(Java::CascadingPipe::Each, parameters) end - # A pipe that does nothing. - def pass(*args) - each all_fields, :function => Java::CascadingOperation::Identity.new + # Builds a pipe that assert the size of the tuple is the size specified in parameter. + def assert_size_equals(size, params = {}) + assertion = Java::CascadingOperationAssertion::AssertSizeEquals.new(size) + assert(assertion, params) end - def assert(*args) - options = args.extract_options! - assertion = args[0] - assertion_level = options[:level] || Java::CascadingOperation::AssertionLevel::STRICT - - parameters = [tail_pipe, assertion_level, assertion] - make_pipe(Java::CascadingPipe::Each, parameters) + # Builds a pipe that assert the none of the fields in the tuple are null. + def assert_not_null(params = {}) + assertion = Java::CascadingOperationAssertion::AssertNotNull.new + assert(assertion, params) end # Builds a debugging pipe. @@ -376,97 +351,14 @@ def assert(*args) # The other named options are: # * :print_fields a boolean. If is set to true, then it prints every 10 tuples. # - def debug(*args) - options = args.extract_options! 
- print_fields = options[:print_fields] || true - parameters = [print_fields].compact - debug = Java::CascadingOperation::Debug.new(*parameters) - debug.print_tuple_every = options[:tuple_interval] || 1 - debug.print_fields_every = options[:fields_interval] || 10 + def debug(params = {}) + print_fields = params[:print_fields] || true + debug = Java::CascadingOperation::Debug.new(print_fields) + debug.print_tuple_every = params[:tuple_interval] || 1 + debug.print_fields_every = params[:fields_interval] || 10 each(all_fields, :filter => debug) end - # Builds a pipe that assert the size of the tuple is the size specified in parameter. - # - # The method accept an unique uname argument : a number indicating the size expected. - def assert_size_equals(*args) - options = args.extract_options! - assertion = Java::CascadingOperationAssertion::AssertSizeEquals.new(args[0]) - assert(assertion, options) - end - - # Builds a pipe that assert the none of the fields in the tuple are null. - def assert_not_null(*args) - options = args.extract_options! - assertion = Java::CascadingOperationAssertion::AssertNotNull.new - assert(assertion, options) - end - - # Builds a _parse_ pipe. This pipe will parse the fields specified in input (first unamed arguments), - # using a specified regex pattern. - # - # If provided, the unamed arguments must be the fields to be parsed. If not provided, then all incoming - # fields are used. - # - # The named options are: - # * :pattern a string or regex. Specifies the regular expression used for parsing the argument fields. - # * :output a string or array of strings. Specifies the outgoing fields (all fields will be output by default) - def parse(*args) - options = args.extract_options! 
- fields = args || all_fields - pattern = options[:pattern] - output = options[:output] || all_fields - each(fields, :function => regex_parser(pattern, options), :output => output) - end - - # Builds a pipe that splits a field into other fields, using a specified regular expression. - # - # The first unnamed argument is the field to be split. - # The second unnamed argument is an array of strings indicating the fields receiving the result of the split. - # - # The named options are: - # * :pattern a string or regex. Specifies the regular expression used for splitting the argument fields. - # * :output a string or array of strings. Specifies the outgoing fields (all fields will be output by default) - def split(*args) - options = args.extract_options! - fields = options[:into] || args[1] - pattern = options[:pattern] || /[.,]*\s+/ - output = options[:output] || all_fields - each(args[0], :function => regex_splitter(fields, :pattern => pattern), :output=>output) - end - - # Builds a pipe that splits a field into new rows, using a specified regular expression. - # - # The first unnamed argument is the field to be split. - # The second unnamed argument is the field receiving the result of the split. - # - # The named options are: - # * :pattern a string or regex. Specifies the regular expression used for splitting the argument fields. - # * :output a string or array of strings. Specifies the outgoing fields (all fields will be output by default) - def split_rows(*args) - options = args.extract_options! - fields = options[:into] || args[1] - pattern = options[:pattern] || /[.,]*\s+/ - output = options[:output] || all_fields - each(args[0], :function => regex_split_generator(fields, :pattern => pattern), :output=>output) - end - - # Builds a pipe that emits a new row for each regex group matched in a field, using a specified regular expression. - # - # The first unnamed argument is the field to be matched against. 
- # The second unnamed argument is the field receiving the result of the match. - # - # The named options are: - # * :pattern a string or regex. Specifies the regular expression used for matching the argument fields. - # * :output a string or array of strings. Specifies the outgoing fields (all fields will be output by default) - def match_rows(*args) - options = args.extract_options! - fields = options[:into] || args[1] - pattern = options[:pattern] || /[\w]+/ - output = options[:output] || all_fields - each(args[0], :function => regex_generator(fields, :pattern => pattern), :output=>output) - end - # Builds a pipe that parses the specified field as a date using hte provided format string. # The unamed argument specifies the field to format. # @@ -503,26 +395,6 @@ def format_date(*args) each args[0], :function => date_formatter(field, pattern, options[:timezone]), :output => output end - # Builds a pipe that perform a query/replace based on a regular expression. - # - # The first unamed argument specifies the input field. - # - # The named options are: - # * :pattern a string or regex. Specifies the pattern to look for in the input field. This non-optional argument - # can also be specified as a second _unamed_ argument. - # * :replacement a string. Specifies the replacement. - # * :output a string or array of strings. Specifies the outgoing fields (all fields will be output by default) - def replace(*args) - options = args.extract_options! - - pattern = options[:pattern] || args[1] - replacement = options[:replacement] || args[2] - into = options[:into] || "#{args[0]}_replaced" - output = options[:output] || all_fields - - each args[0], :function => regex_replace(into, pattern, replacement), :output => output - end - # Builds a pipe that inserts values into the current tuple. # # The method takes a hash as parameter. 
This hash contains as keys the names of the fields to insert diff --git a/lib/cascading/ext/array.rb b/lib/cascading/ext/array.rb index 9f7fd56..e6f636c 100644 --- a/lib/cascading/ext/array.rb +++ b/lib/cascading/ext/array.rb @@ -1,8 +1,25 @@ +# Extensions to Arrays in support of variable length lists of field names. This +# is not pretty, but supports DSL features like: +# group_by 'field1', 'field2', :sort_by => 'field3' do +# ... +# end +# +# The most obvious limitation of the approach is that function definitions of +# the form f(*args_with_params) are not self-documenting. To compensate for +# this, documentation of all arguments and optional parameters must be provided +# on the DSL method. class Array + # Use this extension to extract the optional parameters from a + # *args_with_params argument. + # So if you have a function: + # def f(*args_with_params) + # You can destructively process the args_with_params as follows: + # params, just_args = args_with_params.extract_options!, args_with_params def extract_options! last.is_a?(::Hash) ? pop : {} end + # Non-destructive form of Array#extract_options! def extract_options last.is_a?(::Hash) ? last : {} end diff --git a/lib/cascading/identity_operations.rb b/lib/cascading/identity_operations.rb new file mode 100644 index 0000000..8f2e7ea --- /dev/null +++ b/lib/cascading/identity_operations.rb @@ -0,0 +1,82 @@ +module Cascading + # Module of pipe assemblies that wrap the Cascading Identity operation. These + # are split out only to group similar functionality. + module IdentityOperations + # Restricts the current assembly to the specified fields in the order in + # which they are specified (can be used to reorder fields). + # + # Example: + # project 'field1', 'field2' + def project(*input_fields) + each fields(input_fields), :function => Java::CascadingOperation::Identity.new + end + + # Removes the specified fields from the current assembly. 
+ # + # Example: + # discard 'field1', 'field2' + def discard(*input_fields) + discard_fields = fields(input_fields) + keep_fields = difference_fields(scope.values_fields, discard_fields) + project(*keep_fields.to_a) + end + + # Renames fields according to the mapping provided, preserving the original + # field order. Throws an exception if non-existent fields are specified. + # + # Example: + # rename 'field1' => 'fieldA', 'field2' => 'fieldB' + # + # Produces: ['fieldA', 'fieldB'], assuming those were the only 2 input + # fields. + def rename(name_map) + original_fields = scope.values_fields.to_a + invalid = name_map.keys - original_fields + raise "Invalid field names in rename: #{invalid.inspect}" unless invalid.empty? + + renamed_fields = original_fields.map{ |name| name_map[name] || name } + + each original_fields, :function => Java::CascadingOperation::Identity.new(fields(renamed_fields)) + end + + # Coerces fields to the Java type selected from Cascading::JAVA_TYPE_MAP. + # + # Example: + # cast 'field1' => :int, 'field2' => :double + def cast(type_map) + input_fields = type_map.keys.sort + types = JAVA_TYPE_MAP.values_at(*type_map.values_at(*input_fields)) + input_fields = fields(input_fields) + types = types.to_java(java.lang.Class) + each input_fields, :function => Java::CascadingOperation::Identity.new(input_fields, types) + end + + # A field copy (not a pipe copy). Renames fields according to name_map, + # appending them to the fields in the assembly in the same order as the + # original fields from which they are copied. Throws an exception if + # non-existent fields are specified. + # + # Example: + # copy 'field1' => 'fieldA', 'field2' => 'fieldB' + # + # Produces: ['field1', 'field2', 'fieldA', 'fieldB'], assuming those were + # the only input fields. + def copy(name_map) + original_fields = scope.values_fields.to_a + invalid = name_map.keys - original_fields + raise "Invalid field names in copy: #{invalid.inspect}" unless invalid.empty? 
+ + # Original fields in name_map in their original order + input_fields = original_fields - (original_fields - name_map.keys) + into_fields = name_map.values_at(*input_fields) + + each input_fields, :function => Java::CascadingOperation::Identity.new(fields(into_fields)), :output => all_fields + end + + # A pipe copy (not a field copy). Can be used within a branch to copy a + # pipe. + def pass + each all_fields, :function => Java::CascadingOperation::Identity.new + end + end +end diff --git a/lib/cascading/operations.rb b/lib/cascading/operations.rb index c36ee9d..d09903a 100644 --- a/lib/cascading/operations.rb +++ b/lib/cascading/operations.rb @@ -39,44 +39,6 @@ def last_function(*args) aggregator_function(args, Java::CascadingOperationAggregator::Last) end - def regex_parser(*args) - options = args.extract_options! - - pattern = args[0].to_s - fields = Cascading.fields(options[:fields]) - groups = options[:groups].to_java(:int) if options[:groups] - parameters = [fields, pattern, groups].compact - - Java::CascadingOperationRegex::RegexParser.new(*parameters) - end - - def regex_splitter(*args) - options = args.extract_options! - - fields = Cascading.fields(args) - pattern = options[:pattern].to_s - parameters = [fields, pattern].compact - Java::CascadingOperationRegex::RegexSplitter.new(*parameters) - end - - def regex_split_generator(*args) - options = args.extract_options! - - fields = Cascading.fields(args) - pattern = options[:pattern].to_s - parameters = [fields, pattern].compact - Java::CascadingOperationRegex::RegexSplitGenerator.new(*parameters) - end - - def regex_generator(*args) - options = args.extract_options! - - fields = Cascading.fields(args) - pattern = options[:pattern].to_s - parameters = [fields, pattern].compact - Java::CascadingOperationRegex::RegexGenerator.new(*parameters) - end - def expression_function(*args) options = args.extract_options! 
@@ -177,18 +139,6 @@ def regex_filter(*args) Java::CascadingOperationRegex::RegexFilter.new(*parameters) end - def regex_replace(*args) - options = args.extract_options! - - fields = fields(args[0]) - pattern = args[1] - replacement = args[2] - replace_all = options[:replace_all] - - parameters = [fields, pattern.to_s, replacement.to_s, replace_all].compact - Java::CascadingOperationRegex::RegexReplace.new(*parameters) - end - def field_joiner(*args) options = args.extract_options! delimiter = options[:delimiter] || ',' diff --git a/lib/cascading/regex_operations.rb b/lib/cascading/regex_operations.rb new file mode 100644 index 0000000..9eb06f0 --- /dev/null +++ b/lib/cascading/regex_operations.rb @@ -0,0 +1,128 @@ +module Cascading + # Module of pipe assemblies that wrap operations defined in the Cascading + # cascading.operation.regex package. These are split out only to group + # similar functionality. + # + # All DSL regex pipes require an input_field, a regex, and either a single + # into_field or one or more into_fields. Requiring a single input field + # allows us to raise an exception early if the wrong input is specified and + # avoids the non-intuitive situation where the first of many fields is + # silently taken as in Cascading. Requiring a regex means you don't have to + # go looking for defaults in code. And into_field(s) means we can propagate + # field names through the dataflow.
+ # + # Mapping of DSL pipes into Cascading regex operations: + # parse:: {RegexParser}[http://docs.cascading.org/cascading/2.1/javadoc/cascading/operation/regex/RegexParser.html] + # split:: {RegexSplitter}[http://docs.cascading.org/cascading/2.1/javadoc/cascading/operation/regex/RegexSplitter.html] + # split\_rows:: {RegexSplitGenerator}[http://docs.cascading.org/cascading/2.1/javadoc/cascading/operation/regex/RegexSplitGenerator.html] + # match\_rows:: {RegexGenerator}[http://docs.cascading.org/cascading/2.1/javadoc/cascading/operation/regex/RegexGenerator.html] + # replace:: {RegexReplace}[http://docs.cascading.org/cascading/2.1/javadoc/cascading/operation/regex/RegexReplace.html] + module RegexOperations + # Parses the given input_field using the specified regular expression to + # produce one output per group in that expression. + # + # The named params are: + # [groups] Array of integers specifying which groups to capture if you want + # a subset of groups. + # + # Example: + # parse 'field1', /([\w]+)\s+([\w]+)/, ['out1', 'out2'], :groups => [1, 2] + def parse(input_field, regex, into_fields, params = {}) + groups = params[:groups].to_java(:int) if params[:groups] + output = params[:output] || all_fields # Overrides Cascading default + + input_field = fields(input_field) + raise "input_field must declare exactly one field, was '#{input_field}'" unless input_field.size == 1 + + parameters = [fields(into_fields), regex.to_s, groups].compact + each( + input_field, + :function => Java::CascadingOperationRegex::RegexParser.new(*parameters), + :output => output + ) + end + + # Splits the given input_field into multiple fields using the specified + # regular expression. 
+ # + # Example: + # split 'line', /\s+/, ['out1', 'out2'] + def split(input_field, regex, into_fields, params = {}) + output = params[:output] || all_fields # Overrides Cascading default + + input_field = fields(input_field) + raise "input_field must declare exactly one field, was '#{input_field}'" unless input_field.size == 1 + + each( + input_field, + :function => Java::CascadingOperationRegex::RegexSplitter.new(fields(into_fields), regex.to_s), + :output => output + ) + end + + # Splits the given input_field into new rows using the specified regular + # expression. + # + # Example: + # split_rows 'line', /\s+/, 'word' + def split_rows(input_field, regex, into_field, params = {}) + output = params[:output] || all_fields # Overrides Cascading default + + input_field = fields(input_field) + raise "input_field must declare exactly one field, was '#{input_field}'" unless input_field.size == 1 + into_field = fields(into_field) + raise "into_field must declare exactly one field, was '#{into_field}'" unless into_field.size == 1 + + each( + input_field, + :function => Java::CascadingOperationRegex::RegexSplitGenerator.new(into_field, regex.to_s), + :output => output + ) + end + + # Emits a new row for each regex group matched in input_field using the + # specified regular expression. 
+ # + # Example: + # match_rows 'line', /([\w]+)\s+([\w]+)/, 'word' + def match_rows(input_field, regex, into_field, params = {}) + output = params[:output] || all_fields # Overrides Cascading default + + input_field = fields(input_field) + raise "input_field must declare exactly one field, was '#{input_field}'" unless input_field.size == 1 + into_field = fields(into_field) + raise "into_field must declare exactly one field, was '#{into_field}'" unless into_field.size == 1 + + each( + input_field, + :function => Java::CascadingOperationRegex::RegexGenerator.new(into_field, regex.to_s), + :output => output + ) + end + + # Performs a query/replace on the given input_field using the specified + # regular expression and replacement. + # + # The named params are: + # [replace_all] Boolean indicating if all matches should be replaced; + # defaults to true (the Cascading default). + # + # Example: + # replace 'line', /[.,]*\s+/, 'tab_separated_line', "\t" + def replace(input_field, regex, into_field, replacement, params = {}) + output = params[:output] || all_fields # Overrides Cascading default + + input_field = fields(input_field) + raise "input_field must declare exactly one field, was '#{input_field}'" unless input_field.size == 1 + into_field = fields(into_field) + raise "into_field must declare exactly one field, was '#{into_field}'" unless into_field.size == 1 + + parameters = [into_field, regex.to_s, replacement.to_s, params[:replace_all]].compact + each( + input_field, + :function => Java::CascadingOperationRegex::RegexReplace.new(*parameters), + :output => output + ) + end + end +end diff --git a/samples/branch.rb b/samples/branch.rb index 64a6b74..3d98d4b 100755 --- a/samples/branch.rb +++ b/samples/branch.rb @@ -9,8 +9,7 @@ source 'input', tap('samples/data/data2.txt') assembly 'input' do - split 'line', ['name', 'score1', 'score2', 'id'], :pattern => /[.,]*\s+/ - + split 'line', /[.,]*\s+/, ['name', 'score1', 'score2', 'id'] branch 'branch1' do group_by
'score1' do count diff --git a/samples/group_by.rb b/samples/group_by.rb index ce44436..3083ebc 100755 --- a/samples/group_by.rb +++ b/samples/group_by.rb @@ -8,7 +8,7 @@ source 'input', tap('samples/data/data_group_by.tsv') assembly 'input' do - split 'line', ['id', 'city'], :output => ['id', 'city'] + split 'line', /\t/, ['id', 'city'], :output => ['id', 'city'] branch 'group_by' do group_by 'city', :sort_by => 'city' do diff --git a/samples/join.rb b/samples/join.rb index 6313ab9..7d50c62 100755 --- a/samples/join.rb +++ b/samples/join.rb @@ -10,15 +10,15 @@ source 'input3', tap('samples/data/data_join3.txt') assembly 'input1' do - split 'line', ['id', 'name'] + split 'line', /\t/, ['id', 'name'] end assembly 'input2' do - split 'line', ['id', 'age'] + split 'line', /\t/, ['id', 'age'] end assembly 'input3' do - split 'line', ['id', 'city'] + split 'line', /\t/, ['id', 'city'] end assembly 'join' do diff --git a/samples/logwordcount.rb b/samples/logwordcount.rb index 9e93bc8..b037bf5 100755 --- a/samples/logwordcount.rb +++ b/samples/logwordcount.rb @@ -10,7 +10,7 @@ source 'input', tap('samples/data/gutenberg/the_outline_of_science_vol_1') assembly 'input' do - split_rows 'line', 'word', :pattern => /[.,]*\s+/, :output => 'word' + split_rows 'line', /[.,]*\s+/, 'word', :output => 'word' group_by 'word' do count end diff --git a/samples/project.rb b/samples/project.rb index 908f5e1..50bce24 100755 --- a/samples/project.rb +++ b/samples/project.rb @@ -10,7 +10,7 @@ source 'input', tap('samples/data/data2.txt') assembly 'input' do - split 'line', ['name', 'score1', 'score2', 'id'], :output => ['name', 'score1', 'score2', 'id'] + split 'line', /[.,]*\s+/, ['name', 'score1', 'score2', 'id'], :output => ['name', 'score1', 'score2', 'id'] assert Java::CascadingOperationAssertion::AssertSizeEquals.new(4) project 'name', 'score1', 'score2' assert Java::CascadingOperationAssertion::AssertSizeEquals.new(3) diff --git a/samples/rename.rb b/samples/rename.rb index 
bbd47e1..bee222d 100755 --- a/samples/rename.rb +++ b/samples/rename.rb @@ -8,7 +8,7 @@ source 'input', tap('samples/data/data2.txt') assembly 'input' do - split 'line', ['name', 'score1', 'score2', 'id'], :output => ['name', 'score1', 'score2', 'id'] + split 'line', /[.,]*\s+/, ['name', 'score1', 'score2', 'id'], :output => ['name', 'score1', 'score2', 'id'] assert Java::CascadingOperationAssertion::AssertSizeEquals.new(4) rename 'name' => 'new_name', 'score1' => 'new_score1', 'score2' => 'new_score2' assert Java::CascadingOperationAssertion::AssertSizeEquals.new(4) diff --git a/samples/replace.rb b/samples/replace.rb new file mode 100755 index 0000000..40e600e --- /dev/null +++ b/samples/replace.rb @@ -0,0 +1,16 @@ +#! /usr/bin/env jruby +$: << File.join(File.dirname(__FILE__), '..', 'lib') + +require 'cascading' + +cascade 'replace', :mode => :local do + flow 'replace' do + source 'input', tap('samples/data/data2.txt') + + assembly 'input' do + replace 'line', /[.,]*\s+/, 'tab_separated_line', "\t", :output => 'tab_separated_line' + end + + sink 'input', tap('output/replace', :sink_mode => :replace) + end +end.complete diff --git a/samples/scorenames.rb b/samples/scorenames.rb index aadd23e..3cd3e51 100755 --- a/samples/scorenames.rb +++ b/samples/scorenames.rb @@ -10,7 +10,7 @@ source 'input', tap('samples/data/genealogy/names/dist.all.last') assembly 'input' do - split 'line', ['name', 'val1', 'val2', 'id'] + split 'line', /[.,]*\s+/, ['name', 'val1', 'val2', 'id'] insert 'val3' => expr('val2:double < 40.0 ? 
val1:double : val2:double') project 'name', 'val3', 'id' end diff --git a/samples/splitter.rb b/samples/splitter.rb index 81042c9..021feaf 100755 --- a/samples/splitter.rb +++ b/samples/splitter.rb @@ -8,7 +8,7 @@ source 'input', tap('samples/data/data2.txt') assembly 'input' do - split 'line', ['name', 'score1', 'score2', 'id'], :output => ['name', 'score1', 'score2', 'id'] + split 'line', /[.,]*\s+/, ['name', 'score1', 'score2', 'id'], :output => ['name', 'score1', 'score2', 'id'] group_by 'score1' do count end diff --git a/samples/sub_assembly.rb b/samples/sub_assembly.rb index a6101ec..a089c56 100755 --- a/samples/sub_assembly.rb +++ b/samples/sub_assembly.rb @@ -8,7 +8,7 @@ source 'input', tap('samples/data/data2.txt') assembly 'input' do - split 'line', ['name', 'score1', 'score2', 'id'], :output => ['name', 'score1', 'score2', 'id'] + split 'line', /[.,]*\s+/, ['name', 'score1', 'score2', 'id'], :output => ['name', 'score1', 'score2', 'id'] assert Java::CascadingOperationAssertion::AssertSizeEquals.new(4) sub_assembly Java::CascadingPipeAssembly::Discard.new(tail_pipe, fields('id')) assert Java::CascadingOperationAssertion::AssertSizeEquals.new(3) diff --git a/samples/ungroup.rb b/samples/ungroup.rb index 0a70545..24d8216 100755 --- a/samples/ungroup.rb +++ b/samples/ungroup.rb @@ -11,7 +11,7 @@ source 'input', tap('samples/data/ungroup.tsv') a = assembly 'input' do - split 'line', ['key', 'val1', 'val2', 'val3'], :output => ['key', 'val1', 'val2', 'val3'] + split 'line', /\t/, ['key', 'val1', 'val2', 'val3'], :output => ['key', 'val1', 'val2', 'val3'] branch 'ungroup_using_value_selectors' do #each all_fields, :function => Java::CascadingOperationFunction::UnGroup.new(fields(['new_key', 'val']), fields('key'), [fields('val1'), fields('val2'), fields('val3')].to_java(Java::CascadingTuple::Fields)), :output => ['new_key', 'val'] diff --git a/samples/union.rb b/samples/union.rb index 1e402ef..1e16772 100755 --- a/samples/union.rb +++ b/samples/union.rb @@ 
-10,7 +10,7 @@ source 'input', tap('samples/data/genealogy/names/dist.all.last') assembly 'input' do - split 'line', ['name', 'score1', 'score2', 'id'] + split 'line', /[.,]*\s+/, ['name', 'score1', 'score2', 'id'] branch 'branch1' do group_by 'score1', 'name' do diff --git a/samples/unique.rb b/samples/unique.rb index 3c75fa9..c2cc0c4 100755 --- a/samples/unique.rb +++ b/samples/unique.rb @@ -10,7 +10,7 @@ source 'input', tap('samples/data/data_group_by.tsv') assembly 'input' do - split 'line', ['id', 'city'], :output => ['id', 'city'] + split 'line', /\t/, ['id', 'city'], :output => ['id', 'city'] branch 'unique' do sub_assembly Java::CascadingPipeAssembly::Unique.new(tail_pipe, fields('city')) diff --git a/spec/cascading_spec.rb b/spec/cascading_spec.rb index a075661..faec9e1 100644 --- a/spec/cascading_spec.rb +++ b/spec/cascading_spec.rb @@ -85,12 +85,12 @@ source 'right', tap('spec/resource/join_input.txt', :scheme => text_line_scheme) assembly 'left' do - split 'line', ['x', 'y', 'z'], :pattern => /,/ + split 'line', /,/, ['x', 'y', 'z'] project 'x', 'y', 'z' end assembly 'right' do - split 'line', ['x', 'y', 'z'], :pattern => /,/ + split 'line', /,/, ['x', 'y', 'z'] project 'x', 'y', 'z' branch 'branch_join' do diff --git a/spec/scope_spec.rb b/spec/scope_spec.rb index 3bacd26..c32b277 100644 --- a/spec/scope_spec.rb +++ b/spec/scope_spec.rb @@ -22,7 +22,7 @@ check_scope :values_fields => ['offset', 'line'] assert_size_equals 2 - split 'line', ['x', 'y'], :pattern => /,/ + split 'line', /,/, ['x', 'y'] check_scope :values_fields => ['offset', 'line', 'x', 'y'] assert_size_equals 4 end @@ -33,7 +33,7 @@ check_scope :values_fields => ['offset', 'line'] assert_size_equals 2 - split 'line', ['x', 'y'], :pattern => /,/, :output => ['x', 'y'] + split 'line', /,/, ['x', 'y'], :output => ['x', 'y'] check_scope :values_fields => ['x', 'y'] assert_size_equals 2 end diff --git a/spec/spec_util.rb b/spec/spec_util.rb index 890b554..10c5c4c 100644 --- 
a/spec/spec_util.rb +++ b/spec/spec_util.rb @@ -63,13 +63,13 @@ def test_join_assembly(params = {}, &block) assembly 'left' do check_scope :values_fields => ['offset', 'line'] - split 'line', ['x', 'y', 'z'], :pattern => /,/ + split 'line', /,/, ['x', 'y', 'z'] check_scope :values_fields => ['offset', 'line', 'x', 'y', 'z'] end assembly 'right' do check_scope :values_fields => ['offset', 'line'] - split 'line', ['x', 'y', 'z'], :pattern => /,/ + split 'line', /,/, ['x', 'y', 'z'] check_scope :values_fields => ['offset', 'line', 'x', 'y', 'z'] end diff --git a/test/mock_assemblies.rb b/test/mock_assemblies.rb index cfa8284..c926b9f 100644 --- a/test/mock_assemblies.rb +++ b/test/mock_assemblies.rb @@ -39,11 +39,11 @@ def mock_two_input_assembly(&block) source 'test2', tap('test/data/data2.txt') assembly 'test1' do - split 'line', :pattern => /[.,]*\s+/, :into => ['name', 'score1', 'score2', 'id'], :output => ['name', 'score1', 'score2', 'id'] + split 'line', /[.,]*\s+/, ['name', 'score1', 'score2', 'id'], :output => ['name', 'score1', 'score2', 'id'] end assembly 'test2' do - split 'line', :pattern => /[.,]*\s+/, :into => ['name', 'id', 'town'], :output => ['name', 'id', 'town'] + split 'line', /[.,]*\s+/, ['name', 'id', 'town'], :output => ['name', 'id', 'town'] end assembly = assembly 'test', &block diff --git a/test/test_assembly.rb b/test/test_assembly.rb index bda8e28..fa48ba7 100644 --- a/test/test_assembly.rb +++ b/test/test_assembly.rb @@ -661,7 +661,7 @@ def test_sum_by_sub_assembly def test_empty_where assembly = mock_assembly do - split 'line', ['name', 'score1', 'score2', 'id'], :pattern => /[.,]*\s+/, :output => ['name', 'score1', 'score2', 'id'] + split 'line', /[.,]*\s+/, ['name', 'score1', 'score2', 'id'], :output => ['name', 'score1', 'score2', 'id'] where end assert_equal Java::CascadingPipe::Each, assembly.tail_pipe.class @@ -672,7 +672,7 @@ def test_empty_where def test_where assembly = mock_assembly do - split 'line', ['name', 'score1', 
'score2', 'id'], :pattern => /[.,]*\s+/, :output => ['name', 'score1', 'score2', 'id'] + split 'line', /[.,]*\s+/, ['name', 'score1', 'score2', 'id'], :output => ['name', 'score1', 'score2', 'id'] where 'score1:double < score2:double' end assert_equal Java::CascadingPipe::Each, assembly.tail_pipe.class @@ -681,7 +681,7 @@ def test_where def test_where_with_expression assembly = mock_assembly do - split 'line', ['name', 'score1', 'score2', 'id'], :pattern => /[.,]*\s+/, :output => ['name', 'score1', 'score2', 'id'] + split 'line', /[.,]*\s+/, ['name', 'score1', 'score2', 'id'], :output => ['name', 'score1', 'score2', 'id'] where :expression => 'score1:double < score2:double' end assert_equal Java::CascadingPipe::Each, assembly.tail_pipe.class @@ -690,7 +690,7 @@ def test_where_with_expression def test_where_with_import assembly = mock_assembly do - split 'line', ['name', 'score1', 'score2', 'id'], :pattern => /[.,]*\s+/, :output => ['name', 'score1', 'score2', 'id'] + split 'line', /[.,]*\s+/, ['name', 'score1', 'score2', 'id'], :output => ['name', 'score1', 'score2', 'id'] names = ['SMITH', 'JONES', 'BROWN'] where "import java.util.Arrays;\nArrays.asList(new String[] { \"#{names.join('", "')}\" }).contains(name:string)" end @@ -698,6 +698,24 @@ def test_where_with_import assert_equal Java::CascadingOperationExpression::ExpressionFilter, assembly.tail_pipe.operation.class end + def test_rename + assembly = mock_assembly do + split 'line', /[.,]*\s+/, ['name', 'score1', 'score2', 'id'], :output => ['name', 'score1', 'score2', 'id'] + rename 'score2' => 'new_score2', 'score1' => 'new_score1', 'name' => 'new_name' + end + # Original order preserved + assert_equal ['new_name', 'new_score1', 'new_score2', 'id'], assembly.scope.values_fields.to_a + end + + def test_copy + assembly = mock_assembly do + split 'line', /[.,]*\s+/, ['name', 'score1', 'score2', 'id'], :output => ['name', 'score1', 'score2', 'id'] + copy 'score2' => 'new_score2', 'id' => 'new_id', 'name' => 
'new_name' + end + # Original order preserved in copied fields + assert_equal ['name', 'score1', 'score2', 'id', 'new_name', 'new_score2', 'new_id'], assembly.scope.values_fields.to_a + end + def test_smoke_test_describe cascade 'smoke' do flow 'smoke' do diff --git a/test/test_local_execution.rb b/test/test_local_execution.rb index 5756acb..d262be1 100644 --- a/test/test_local_execution.rb +++ b/test/test_local_execution.rb @@ -36,7 +36,7 @@ def test_splitter source 'copy', tap('test/data/data1.txt') assembly 'copy' do - split 'line', :pattern => /[.,]*\s+/, :into=>['name', 'score1', 'score2', 'id'], :output => ['name', 'score1', 'score2', 'id'] + split 'line', /[.,]*\s+/, ['name', 'score1', 'score2', 'id'], :output => ['name', 'score1', 'score2', 'id'] assert_size_equals 4 assert_not_null debug :print_fields => true @@ -70,14 +70,14 @@ def test_join1 source 'data2', tap('test/data/data2.txt') assembly1 = assembly 'data1' do - split 'line', :pattern => /[.,]*\s+/, :into => ['name', 'score1', 'score2', 'id'], :output => ['name', 'score1', 'score2', 'id'] + split 'line', /[.,]*\s+/, ['name', 'score1', 'score2', 'id'], :output => ['name', 'score1', 'score2', 'id'] assert_size_equals 4 assert_not_null debug :print_fields => true end assembly2 = assembly 'data2' do - split 'line', :pattern => /[.,]*\s+/, :into => ['name', 'id', 'town'], :output => ['name', 'id', 'town'] + split 'line', /[.,]*\s+/, ['name', 'id', 'town'], :output => ['name', 'id', 'town'] assert_size_equals 3 assert_not_null debug :print_fields => true @@ -106,12 +106,12 @@ def test_join2 source 'data2', tap('test/data/data2.txt') assembly 'data1' do - split 'line', :pattern => /[.,]*\s+/, :into => ['name', 'score1', 'score2', 'id'], :output => ['name', 'score1', 'score2', 'id'] + split 'line', /[.,]*\s+/, ['name', 'score1', 'score2', 'id'], :output => ['name', 'score1', 'score2', 'id'] debug :print_fields => true end assembly 'data2' do - split 'line', :pattern => /[.,]*\s+/, :into => ['name', 'code', 
'town'], :output => ['name', 'code', 'town'] + split 'line', /[.,]*\s+/, ['name', 'code', 'town'], :output => ['name', 'code', 'town'] debug :print_fields => true end @@ -135,7 +135,7 @@ def test_union source 'data2', tap('test/data/data2.txt') assembly 'data1' do - split 'line', :pattern => /[.,]*\s+/, :into => ['name', 'score1', 'score2', 'id'], :output => ['name', 'score1', 'score2', 'id'] + split 'line', /[.,]*\s+/, ['name', 'score1', 'score2', 'id'], :output => ['name', 'score1', 'score2', 'id'] assert_size_equals 4 assert_not_null @@ -144,7 +144,7 @@ def test_union end assembly 'data2' do - split 'line', :pattern => /[.,]*\s+/, :into => ['name', 'code', 'town'], :output => ['name', 'code', 'town'] + split 'line', /[.,]*\s+/, ['name', 'code', 'town'], :output => ['name', 'code', 'town'] assert_size_equals 3 assert_not_null From 01ea0010d947b3936c34da32c8b7aba5b16bcd3a Mon Sep 17 00:00:00 2001 From: Matt Walker Date: Wed, 17 Apr 2013 12:20:32 -0500 Subject: [PATCH 04/36] Remove *args from text operations; this is a non-backwards compatible change --- lib/cascading.rb | 1 + lib/cascading/assembly.rb | 45 +-------------------- lib/cascading/operations.rb | 21 ---------- lib/cascading/text_operations.rb | 67 ++++++++++++++++++++++++++++++++ 4 files changed, 70 insertions(+), 64 deletions(-) create mode 100644 lib/cascading/text_operations.rb diff --git a/lib/cascading.rb b/lib/cascading.rb index a294b0b..59d6c34 100644 --- a/lib/cascading.rb +++ b/lib/cascading.rb @@ -16,6 +16,7 @@ module Cascading require 'cascading/operations' require 'cascading/identity_operations' require 'cascading/regex_operations' +require 'cascading/text_operations' require 'cascading/scope' require 'cascading/tap' diff --git a/lib/cascading/assembly.rb b/lib/cascading/assembly.rb index a20d8ea..441f3a5 100644 --- a/lib/cascading/assembly.rb +++ b/lib/cascading/assembly.rb @@ -2,6 +2,7 @@ require 'cascading/operations' require 'cascading/identity_operations' require 
'cascading/regex_operations' +require 'cascading/text_operations' require 'cascading/aggregations' require 'cascading/sub_assembly' require 'cascading/ext/array' @@ -323,6 +324,7 @@ def each(*args_with_params) include IdentityOperations include RegexOperations + include TextOperations def assert(assertion, params = {}) assertion_level = params[:level] || Java::CascadingOperation::AssertionLevel::STRICT @@ -359,42 +361,6 @@ def debug(params = {}) each(all_fields, :filter => debug) end - # Builds a pipe that parses the specified field as a date using hte provided format string. - # The unamed argument specifies the field to format. - # - # The named options are: - # * :into a string. It specifies the receiving field. By default, it will be named after - # the input argument. - # * :pattern a string. Specifies the date format. - # * :output a string or array of strings. Specifies the outgoing fields (all fields will be output by default) - def parse_date(*args) - options = args.extract_options! - field = options[:into] || "#{args[0]}_parsed" - output = options[:output] || all_fields - pattern = options[:pattern] || "yyyy/MM/dd" - - each args[0], :function => date_parser(field, pattern), :output => output - end - - # Builds a pipe that format a date using a specified format pattern. - # - # The unamed argument specifies the field to format. - # - # The named options are: - # * :into a string. It specifies the receiving field. By default, it will be named after - # the input argument. - # * :pattern a string. Specifies the date format. - # * :timezone a string. Specifies the timezone (defaults to UTC). - # * :output a string or array of strings. Specifies the outgoing fields (all fields will be output by default) - def format_date(*args) - options = args.extract_options! 
- field = options[:into] || "#{args[0]}_formatted" - pattern = options[:pattern] || "yyyy/MM/dd" - output = options[:output] || all_fields - - each args[0], :function => date_formatter(field, pattern, options[:timezone]), :output => output - end - # Builds a pipe that inserts values into the current tuple. # # The method takes a hash as parameter. This hash contains as keys the names of the fields to insert @@ -543,13 +509,6 @@ def distinct(*args) pass end - def join_fields(*args) - options = args.extract_options! - output = options[:output] || all_fields - - each args, :function => field_joiner(options), :output => output - end - # Ungroups, or unpivots, a tuple (see Cascading's UnGroup at http://docs.cascading.org/cascading/2.0/javadoc/cascading/operation/function/UnGroup.html). # # You must provide :key and you must provide only one of :value_selectors diff --git a/lib/cascading/operations.rb b/lib/cascading/operations.rb index d09903a..44b53ba 100644 --- a/lib/cascading/operations.rb +++ b/lib/cascading/operations.rb @@ -117,18 +117,6 @@ def expression_filter(*args) Java::CascadingOperationExpression::ExpressionFilter.new(*arguments) end - def date_parser(field, format) - fields = fields(field) - Java::CascadingOperationText::DateParser.new(fields, format) - end - - def date_formatter(fields, format, timezone=nil) - fields = fields(fields) - timezone = Java::JavaUtil::TimeZone.get_time_zone(timezone) if timezone - arguments = [fields, format, timezone].compact - Java::CascadingOperationText::DateFormatter.new(*arguments) - end - def regex_filter(*args) options = args.extract_options! @@ -138,14 +126,5 @@ def regex_filter(*args) parameters = [pattern.to_s, remove_match, match_each_element].compact Java::CascadingOperationRegex::RegexFilter.new(*parameters) end - - def field_joiner(*args) - options = args.extract_options! 
- delimiter = options[:delimiter] || ',' - fields = fields(options[:into]) - - parameters = [fields, delimiter].compact - Java::CascadingOperationText::FieldJoiner.new(*parameters) - end end end diff --git a/lib/cascading/text_operations.rb b/lib/cascading/text_operations.rb new file mode 100644 index 0000000..a54973c --- /dev/null +++ b/lib/cascading/text_operations.rb @@ -0,0 +1,67 @@ +module Cascading + # Module of pipe assemblies that wrap operations defined in the Cascading + # cascading.operation.text package. These are split out only to group + # similar functionality. + # + # Mapping of DSL pipes into Cascading text operations: + # parse\_date:: {DateParser}[http://docs.cascading.org/cascading/2.1/javadoc/cascading/operation/text/DateParser.html] + # format\_date:: {DateFormatter}[http://docs.cascading.org/cascading/2.1/javadoc/cascading/operation/text/DateFormatter.html] + # join\_fields:: {FieldJoiner}[http://docs.cascading.org/cascading/2.1/javadoc/cascading/operation/text/FieldJoiner.html] + module TextOperations + # Parses the given input_field as a date using the provided format string. + # + # Example: + # parse_date 'text_date', 'yyyy/MM/dd', 'timestamp' + def parse_date(input_field, date_format, into_field, params = {}) + output = params[:output] || all_fields # Overrides Cascading default + + input_field = fields(input_field) + raise "input_field must declare exactly one field, was '#{input_field}'" unless input_field.size == 1 + into_field = fields(into_field) + raise "into_field must declare exactly one field, was '#{into_field}'" unless into_field.size == 1 + + each( + input_field, + :function => Java::CascadingOperationText::DateParser.new(into_field, date_format), + :output => output + ) + end + + # Converts a timestamp into a formatted date string using the specified + # date_format.
+ # + # Example: + # format_date 'timestamp', 'yyyy/MM/dd', 'text_date' + def format_date(input_field, date_format, into_field, params = {}) + output = params[:output] || all_fields # Overrides Cascading default + + input_field = fields(input_field) + raise "input_field must declare exactly one field, was '#{input_field}'" unless input_field.size == 1 + into_field = fields(into_field) + raise "into_field must declare exactly one field, was '#{into_field}'" unless into_field.size == 1 + + each( + input_field, + :function => Java::CascadingOperationText::DateFormatter.new(into_field, date_format), + :output => output + ) + end + + # Joins multiple fields into a single field given a delimiter. + # + # Example: + # join_fields ['field1', 'field2'], ',', 'comma_separated' + def join_fields(input_fields, delimiter, into_field) + output = params[:output] || all_fields # Overrides Cascading default + + into_field = fields(into_field) + raise "into_field must declare exactly one field, was '#{into_field}'" unless into_field.size == 1 + + each( + input_fields, + :function => Java::CascadingOperationText::FieldJoiner.new(into_field, delimiter.to_s), + :output => output + ) + end + end +end From 984df8d08112c6e0e087bca31c3470ca909f6ea6 Mon Sep 17 00:00:00 2001 From: Matt Walker Date: Wed, 17 Apr 2013 12:33:16 -0500 Subject: [PATCH 05/36] Remove unused identity wrapper --- lib/cascading/operations.rb | 4 ---- test/test_assembly.rb | 6 +++--- 2 files changed, 3 insertions(+), 7 deletions(-) diff --git a/lib/cascading/operations.rb b/lib/cascading/operations.rb index 44b53ba..8c8aa4c 100644 --- a/lib/cascading/operations.rb +++ b/lib/cascading/operations.rb @@ -11,10 +11,6 @@ module Cascading # idiomatic Ruby (positional required params and a params hash for optional # args) should be used. See Cascading::Assembly#set_value for an example. 
module Operations - def identity - Java::CascadingOperation::Identity.new - end - def aggregator_function(args, aggregator_klass) options = args.extract_options! ignore = options[:ignore] diff --git a/test/test_assembly.rb b/test/test_assembly.rb index fa48ba7..e9b457b 100644 --- a/test/test_assembly.rb +++ b/test/test_assembly.rb @@ -23,7 +23,7 @@ def test_create_assembly_simple def test_each_identity assembly = mock_assembly do - each 'offset', :function => identity + each 'offset', :function => Java::CascadingOperation::Identity.new end flow = assembly.parent @@ -35,7 +35,7 @@ def test_each_identity def test_create_each # You can apply an Each to 0 fields assembly = mock_assembly do - each(:function => identity) + each(:function => Java::CascadingOperation::Identity.new) end assert_equal Java::CascadingPipe::Each, assembly.tail_pipe.class @@ -579,7 +579,7 @@ def test_branch_single assembly = mock_assembly do branch 'branch1' do branch 'branch2' do - each 'line', :function => identity + each 'line', :function => Java::CascadingOperation::Identity.new end end end From b4b44f344cce4dd02938d380a8067300c5bd9c27 Mon Sep 17 00:00:00 2001 From: Matt Walker Date: Wed, 17 Apr 2013 12:53:00 -0500 Subject: [PATCH 06/36] Refactor filter/reject/where into FilterOperations module; further refactoring is going to get messy --- lib/cascading.rb | 1 + lib/cascading/assembly.rb | 96 +------------------------- lib/cascading/filter_operations.rb | 106 +++++++++++++++++++++++++++++ 3 files changed, 109 insertions(+), 94 deletions(-) create mode 100644 lib/cascading/filter_operations.rb diff --git a/lib/cascading.rb b/lib/cascading.rb index 59d6c34..12ceb85 100644 --- a/lib/cascading.rb +++ b/lib/cascading.rb @@ -15,6 +15,7 @@ module Cascading require 'cascading/mode' require 'cascading/operations' require 'cascading/identity_operations' +require 'cascading/filter_operations' require 'cascading/regex_operations' require 'cascading/text_operations' require 'cascading/scope' diff --git 
a/lib/cascading/assembly.rb b/lib/cascading/assembly.rb index 441f3a5..ea30238 100644 --- a/lib/cascading/assembly.rb +++ b/lib/cascading/assembly.rb @@ -1,6 +1,7 @@ require 'cascading/base' require 'cascading/operations' require 'cascading/identity_operations' +require 'cascading/filter_operations' require 'cascading/regex_operations' require 'cascading/text_operations' require 'cascading/aggregations' @@ -323,6 +324,7 @@ def each(*args_with_params) end include IdentityOperations + include FilterOperations include RegexOperations include TextOperations @@ -385,100 +387,6 @@ def insert(args) end end - # Builds a pipe that filters the tuples based on an expression or a pattern (but not both !). - # - # The first unamed argument, if provided, is a filtering expression (using the Janino syntax). - # - # The named options are: - # * :pattern a string. Specifies a regular expression pattern used to filter the tuples. If this - # option is provided, then the filter is regular expression-based. This is incompatible with the _expression_ option. - # * :expression a string. Specifies a Janino expression used to filter the tuples. This option has the - # same effect than providing it as first unamed argument. If this option is provided, then the filter is Janino - # expression-based. This is incompatible with the _pattern_ option. - # * :validate a boolean. Passed into Cascading#expr to enable or disable - # expression validation. Defaults to true. - # * :validate_with a hash. Actual arguments used by Cascading#expr for - # expression validation. Defaults to {}. - def filter(*args) - options = args.extract_options! - from = options.delete(:from) || all_fields - expression = options.delete(:expression) || args.shift - regex = options.delete(:pattern) - validate = options.has_key?(:validate) ? options.delete(:validate) : true - validate_with = options.has_key?(:validate_with) ? 
options.delete(:validate_with) : {} - - if expression - stub = expr(expression, { :validate => validate, :validate_with => validate_with }) - types, expression = stub.types, stub.expression - - stub.validate_scope(scope) - each from, :filter => expression_filter( - :parameters => types, - :expression => expression - ) - elsif regex - each from, :filter => regex_filter(regex, options) - end - end - - def filter_null(*args) - options = args.extract_options! - each(args, :filter => Java::CascadingOperationFilter::FilterNull.new) - end - alias reject_null filter_null - - def filter_not_null(*args) - options = args.extract_options! - each(args, :filter => Java::CascadingOperationFilter::FilterNotNull.new) - end - alias where_null filter_not_null - - # Builds a pipe that rejects the tuples based on an expression. - # - # The first unamed argument, if provided, is a filtering expression (using the Janino syntax). - # - # The named options are: - # * :expression a string. Specifies a Janino expression used to filter the tuples. This option has the - # same effect than providing it as first unamed argument. If this option is provided, then the filter is Janino - # expression-based. - # * :validate a boolean. Passed into Cascading#expr to enable or disable - # expression validation. Defaults to true. - # * :validate_with a hash. Actual arguments used by Cascading#expr for - # expression validation. Defaults to {}. - def reject(*args) - options = args.extract_options - raise "Regex not allowed" if options && options[:pattern] - - filter(*args) - end - - # Builds a pipe that includes just the tuples matching an expression. - # - # The first unamed argument, if provided, is a filtering expression (using the Janino syntax). - # - # The named options are: - # * :expression a string. Specifies a Janino expression used to select the tuples. This option has the - # same effect than providing it as first unamed argument. 
If this option is provided, then the filter is Janino - # expression-based. - # * :validate a boolean. Passed into Cascading#expr to enable or disable - # expression validation. Defaults to true. - # * :validate_with a hash. Actual arguments used by Cascading#expr for - # expression validation. Defaults to {}. - def where(*args) - options = args.extract_options - raise "Regex not allowed" if options && options[:pattern] - - if options[:expression] - _, imports, expr = options[:expression].match(/^((?:\s*import.*;\s*)*)(.*)$/).to_a - options[:expression] = "#{imports}!(#{expr})" - elsif args[0] - _, imports, expr = args[0].match(/^((?:\s*import.*;\s*)*)(.*)$/).to_a - args[0] = "#{imports}!(#{expr})" - end - - filter(*args) - end - # Builds a pipe that evaluates the specified Janino expression and insert it in a new field in the tuple. # # The named options are: diff --git a/lib/cascading/filter_operations.rb b/lib/cascading/filter_operations.rb new file mode 100644 index 0000000..0485435 --- /dev/null +++ b/lib/cascading/filter_operations.rb @@ -0,0 +1,106 @@ +module Cascading + # Module of filtering operations. Unlike some of the other functional + # operations modules, this one does not just wrap operations defined by + # Cascading in cascading.operation.filter. Instead, it provides some useful + # high-level DSL pipes which map many Cascading operations into a smaller + # number of DSL statements. + # + # Still, some are direct wrappers: + # filter\_null:: {FilterNull}[http://docs.cascading.org/cascading/2.1/javadoc/cascading/operation/filter/FilterNull.html] + # filter\_not\_null:: {FilterNotNull}[http://docs.cascading.org/cascading/2.1/javadoc/cascading/operation/filter/FilterNotNull.html] + module FilterOperations + # Filter the current assembly based on an expression or regex, but not both. + # + # The first unamed argument, if provided, is a filtering expression (using the Janino syntax). + # + # The named options are: + # * :pattern a string. 
Specifies a regular expression pattern used to filter the tuples. If this + # option is provided, then the filter is regular expression-based. This is incompatible with the _expression_ option. + # * :expression a string. Specifies a Janino expression used to filter the tuples. This option has the + # same effect than providing it as first unamed argument. If this option is provided, then the filter is Janino + # expression-based. This is incompatible with the _pattern_ option. + # * :validate a boolean. Passed into Cascading#expr to enable or disable + # expression validation. Defaults to true. + # * :validate_with a hash. Actual arguments used by Cascading#expr for + # expression validation. Defaults to {}. + def filter(*args) + options = args.extract_options! + from = options.delete(:from) || all_fields + expression = options.delete(:expression) || args.shift + regex = options.delete(:pattern) + validate = options.has_key?(:validate) ? options.delete(:validate) : true + validate_with = options.has_key?(:validate_with) ? options.delete(:validate_with) : {} + + if expression + stub = expr(expression, { :validate => validate, :validate_with => validate_with }) + types, expression = stub.types, stub.expression + + stub.validate_scope(scope) + each from, :filter => expression_filter( + :parameters => types, + :expression => expression + ) + elsif regex + each from, :filter => regex_filter(regex, options) + end + end + + def filter_null(*args) + options = args.extract_options! + each(args, :filter => Java::CascadingOperationFilter::FilterNull.new) + end + alias reject_null filter_null + + def filter_not_null(*args) + options = args.extract_options! + each(args, :filter => Java::CascadingOperationFilter::FilterNotNull.new) + end + alias where_null filter_not_null + + # Builds a pipe that rejects the tuples based on an expression. + # + # The first unamed argument, if provided, is a filtering expression (using the Janino syntax). 
+ # + # The named options are: + # * :expression a string. Specifies a Janino expression used to filter the tuples. This option has the + # same effect than providing it as first unamed argument. If this option is provided, then the filter is Janino + # expression-based. + # * :validate a boolean. Passed into Cascading#expr to enable or disable + # expression validation. Defaults to true. + # * :validate_with a hash. Actual arguments used by Cascading#expr for + # expression validation. Defaults to {}. + def reject(*args) + options = args.extract_options + raise "Regex not allowed" if options && options[:pattern] + + filter(*args) + end + + # Builds a pipe that includes just the tuples matching an expression. + # + # The first unamed argument, if provided, is a filtering expression (using the Janino syntax). + # + # The named options are: + # * :expression a string. Specifies a Janino expression used to select the tuples. This option has the + # same effect than providing it as first unamed argument. If this option is provided, then the filter is Janino + # expression-based. + # * :validate a boolean. Passed into Cascading#expr to enable or disable + # expression validation. Defaults to true. + # * :validate_with a hash. Actual arguments used by Cascading#expr for + # expression validation. Defaults to {}. 
+ def where(*args) + options = args.extract_options + raise "Regex not allowed" if options && options[:pattern] + + if options[:expression] + _, imports, expr = options[:expression].match(/^((?:\s*import.*;\s*)*)(.*)$/).to_a + options[:expression] = "#{imports}!(#{expr})" + elsif args[0] + _, imports, expr = args[0].match(/^((?:\s*import.*;\s*)*)(.*)$/).to_a + args[0] = "#{imports}!(#{expr})" + end + + filter(*args) + end + end +end From 79d7d119253bfa505a14b1ce8e4febba4a181c88 Mon Sep 17 00:00:00 2001 From: Matt Walker Date: Wed, 17 Apr 2013 13:15:10 -0500 Subject: [PATCH 07/36] Remove * args from filter/reject/where; this is a non-backwards compatible change --- lib/cascading/expr_stub.rb | 12 +-- lib/cascading/filter_operations.rb | 130 ++++++++++++----------------- lib/cascading/operations.rb | 34 -------- test/test_assembly.rb | 20 ----- 4 files changed, 60 insertions(+), 136 deletions(-) diff --git a/lib/cascading/expr_stub.rb b/lib/cascading/expr_stub.rb index 014f70f..30c0a43 100644 --- a/lib/cascading/expr_stub.rb +++ b/lib/cascading/expr_stub.rb @@ -21,6 +21,12 @@ def initialize(expression) end end + # Extract Java names and types from @types hash + def names_and_types + names, types = split_hash(@types) + [names.to_java(java.lang.String), types.to_java(java.lang.Class)] + end + def to_s @input_expression end @@ -113,12 +119,6 @@ def evaluator end end - # Extract Java names and types from @types hash - def names_and_types - names, types = split_hash(@types) - [names.to_java(java.lang.String), types.to_java(java.lang.Class)] - end - # Makes best effort to convert Ruby numbers into the Java numeric type # exepcted by a Janino expression. 
However, if the conversion fails, it # returns the original value so that the exception thrown will be from diff --git a/lib/cascading/filter_operations.rb b/lib/cascading/filter_operations.rb index 0485435..2b4deca 100644 --- a/lib/cascading/filter_operations.rb +++ b/lib/cascading/filter_operations.rb @@ -11,96 +11,74 @@ module Cascading module FilterOperations # Filter the current assembly based on an expression or regex, but not both. # - # The first unamed argument, if provided, is a filtering expression (using the Janino syntax). - # - # The named options are: - # * :pattern a string. Specifies a regular expression pattern used to filter the tuples. If this - # option is provided, then the filter is regular expression-based. This is incompatible with the _expression_ option. - # * :expression a string. Specifies a Janino expression used to filter the tuples. This option has the - # same effect than providing it as first unamed argument. If this option is provided, then the filter is Janino - # expression-based. This is incompatible with the _pattern_ option. - # * :validate a boolean. Passed into Cascading#expr to enable or disable - # expression validation. Defaults to true. - # * :validate_with a hash. Actual arguments used by Cascading#expr for - # expression validation. Defaults to {}. - def filter(*args) - options = args.extract_options! - from = options.delete(:from) || all_fields - expression = options.delete(:expression) || args.shift - regex = options.delete(:pattern) - validate = options.has_key?(:validate) ? options.delete(:validate) : true - validate_with = options.has_key?(:validate_with) ? options.delete(:validate_with) : {} + # The named params are: + # [expression] A Janino expression used to filter. Has access to all :input + # fields. + # [validate] Boolean passed to Cascading#expr to enable or disable + # expression validation. Defaults to true. 
+ # [validate_with] Hash mapping field names to actual arguments used by + # Cascading#expr for expression validation. Defaults to {}. + # [regex] A regular expression used to filter. + # [remove_match] Boolean indicating if regex matches should be removed or + # kept. Defaults to false, which is a bit counterintuitive. + # [match_each_element] Boolean indicating if regex should match entire + # incoming tuple (joined with tabs) or each field + # individually. Defaults to false. + def filter(params = {}) + input_fields = params[:input] || all_fields + expression = params[:expression] + regex = params[:regex] + validate = params.has_key?(:validate) ? params[:validate] : true + validate_with = params[:validate_with] || {} if expression stub = expr(expression, { :validate => validate, :validate_with => validate_with }) - types, expression = stub.types, stub.expression - stub.validate_scope(scope) - each from, :filter => expression_filter( - :parameters => types, - :expression => expression - ) + + names, types = stub.names_and_types + each input_fields, :filter => Java::CascadingOperationExpression::ExpressionFilter.new( + stub.expression, + names, + types + ) elsif regex - each from, :filter => regex_filter(regex, options) + parameters = [regex.to_s, params[:remove_match] || false, params[:match_each_element] || false] + each input_fields, :filter => Java::CascadingOperationRegex::RegexFilter.new(*parameters) + else + raise 'filter requires one of :expression or :regex' end end - def filter_null(*args) - options = args.extract_options! - each(args, :filter => Java::CascadingOperationFilter::FilterNull.new) - end - alias reject_null filter_null - - def filter_not_null(*args) - options = args.extract_options! - each(args, :filter => Java::CascadingOperationFilter::FilterNotNull.new) + # Rejects tuples from the current assembly based on a Janino expression. + # This is just a wrapper for FilterOperations.filter.
+ def reject(expression, params = {}) + params[:expression] = expression + filter(params) end - alias where_null filter_not_null - # Builds a pipe that rejects the tuples based on an expression. - # - # The first unamed argument, if provided, is a filtering expression (using the Janino syntax). + # Keeps tuples from the current assembly based on a Janino expression. This + # is a wrapper for FilterOperations.filter. # - # The named options are: - # * :expression a string. Specifies a Janino expression used to filter the tuples. This option has the - # same effect than providing it as first unamed argument. If this option is provided, then the filter is Janino - # expression-based. - # * :validate a boolean. Passed into Cascading#expr to enable or disable - # expression validation. Defaults to true. - # * :validate_with a hash. Actual arguments used by Cascading#expr for - # expression validation. Defaults to {}. - def reject(*args) - options = args.extract_options - raise "Regex not allowed" if options && options[:pattern] - - filter(*args) + # Note that this is accomplished by inverting the given expression, and best + # attempt is made to support import statements prior to the expression. If + # this support should break, simply negate your expression and use + # FilterOperations.reject. + def where(expression, params = {}) + _, imports, expr = expression.match(/^((?:\s*import.*;\s*)*)(.*)$/).to_a + params[:expression] = "#{imports}!(#{expr})" + filter(params) end - # Builds a pipe that includes just the tuples matching an expression. - # - # The first unamed argument, if provided, is a filtering expression (using the Janino syntax). - # - # The named options are: - # * :expression a string. Specifies a Janino expression used to select the tuples. This option has the - # same effect than providing it as first unamed argument. If this option is provided, then the filter is Janino - # expression-based. - # * :validate a boolean. 
Passed into Cascading#expr to enable or disable - # expression validation. Defaults to true. - # * :validate_with a hash. Actual arguments used by Cascading#expr for - # expression validation. Defaults to {}. - def where(*args) - options = args.extract_options - raise "Regex not allowed" if options && options[:pattern] - - if options[:expression] - _, imports, expr = options[:expression].match(/^((?:\s*import.*;\s*)*)(.*)$/).to_a - options[:expression] = "#{imports}!(#{expr})" - elsif args[0] - _, imports, expr = args[0].match(/^((?:\s*import.*;\s*)*)(.*)$/).to_a - args[0] = "#{imports}!(#{expr})" - end + # Rejects tuples from the current assembly if any input field is null. + def filter_null(*input_fields) + each(input_fields, :filter => Java::CascadingOperationFilter::FilterNull.new) + end + alias reject_null filter_null - filter(*args) + # Rejects tuples from the current assembly if any input field is not null. + def filter_not_null(*input_fields) + each(input_fields, :filter => Java::CascadingOperationFilter::FilterNotNull.new) end + alias where_null filter_not_null end end diff --git a/lib/cascading/operations.rb b/lib/cascading/operations.rb index 8c8aa4c..94f6a73 100644 --- a/lib/cascading/operations.rb +++ b/lib/cascading/operations.rb @@ -88,39 +88,5 @@ def coerce_to_java(v) java.lang.String.new(v.to_s) end end - - def expression_filter(*args) - options = args.extract_options! - expression = (args[0] || options[:expression]).to_s - parameters = options[:parameters] - parameter_names = [] - parameter_types = [] - if parameters.is_a? ::Hash - parameters.each do |name, type| - parameter_names << name - parameter_types << type - end - parameter_names = parameter_names.to_java(java.lang.String) - parameter_types = parameter_types.to_java(java.lang.Class) - - arguments = [expression, parameter_names, parameter_types].compact - elsif !parameters.nil? 
- arguments = [expression, parameters.java_class].compact - else - arguments = [expression, java.lang.String.java_class].compact - end - - Java::CascadingOperationExpression::ExpressionFilter.new(*arguments) - end - - def regex_filter(*args) - options = args.extract_options! - - pattern = args[0] - remove_match = options[:remove_match] - match_each_element = options[:match_each_element] - parameters = [pattern.to_s, remove_match, match_each_element].compact - Java::CascadingOperationRegex::RegexFilter.new(*parameters) - end end end diff --git a/test/test_assembly.rb b/test/test_assembly.rb index e9b457b..a67f532 100644 --- a/test/test_assembly.rb +++ b/test/test_assembly.rb @@ -659,17 +659,6 @@ def test_sum_by_sub_assembly assert_equal ['line', 'sum'], assembly.scope.grouping_fields.to_a end - def test_empty_where - assembly = mock_assembly do - split 'line', /[.,]*\s+/, ['name', 'score1', 'score2', 'id'], :output => ['name', 'score1', 'score2', 'id'] - where - end - assert_equal Java::CascadingPipe::Each, assembly.tail_pipe.class - - # Empty where compiles away - assert_equal Java::CascadingOperationRegex::RegexSplitter, assembly.tail_pipe.operation.class - end - def test_where assembly = mock_assembly do split 'line', /[.,]*\s+/, ['name', 'score1', 'score2', 'id'], :output => ['name', 'score1', 'score2', 'id'] @@ -679,15 +668,6 @@ def test_where assert_equal Java::CascadingOperationExpression::ExpressionFilter, assembly.tail_pipe.operation.class end - def test_where_with_expression - assembly = mock_assembly do - split 'line', /[.,]*\s+/, ['name', 'score1', 'score2', 'id'], :output => ['name', 'score1', 'score2', 'id'] - where :expression => 'score1:double < score2:double' - end - assert_equal Java::CascadingPipe::Each, assembly.tail_pipe.class - assert_equal Java::CascadingOperationExpression::ExpressionFilter, assembly.tail_pipe.operation.class - end - def test_where_with_import assembly = mock_assembly do split 'line', /[.,]*\s+/, ['name', 'score1', 'score2', 
'id'], :output => ['name', 'score1', 'score2', 'id'] From c5ef54373e9fd42ea9c94dcbdcae08e4c2a90537 Mon Sep 17 00:00:00 2001 From: Matt Walker Date: Wed, 17 Apr 2013 14:39:41 -0500 Subject: [PATCH 08/36] Refactor Operations/AggregatorOperations for convenience in preparing to continue removing *args --- cascading.jruby.gemspec | 2 +- lib/cascading.rb | 1 + lib/cascading/aggregations.rb | 6 ++--- lib/cascading/aggregator_operations.rb | 27 ++++++++++++++++++++ lib/cascading/assembly.rb | 3 +-- lib/cascading/operations.rb | 35 -------------------------- test/test_aggregations.rb | 11 ++++++++ test/test_operations.rb | 10 -------- 8 files changed, 44 insertions(+), 51 deletions(-) create mode 100644 lib/cascading/aggregator_operations.rb diff --git a/cascading.jruby.gemspec b/cascading.jruby.gemspec index 607df80..21a077a 100644 --- a/cascading.jruby.gemspec +++ b/cascading.jruby.gemspec @@ -16,7 +16,7 @@ Gem::Specification.new do |s| s.rubyforge_project = "cascading.jruby" s.rubygems_version = "1.8.21" s.summary = "A JRuby DSL for Cascading" - s.test_files = ["test/test_aggregations.rb", "test/test_assembly.rb", "test/test_cascade.rb", "test/test_cascading.rb", "test/test_exceptions.rb", "test/test_flow.rb", "test/test_local_execution.rb", "test/test_operations.rb"] + s.test_files = Dir.glob("test/**/*.rb") if s.respond_to? 
:specification_version then s.specification_version = 3 diff --git a/lib/cascading.rb b/lib/cascading.rb index 12ceb85..407c9b3 100644 --- a/lib/cascading.rb +++ b/lib/cascading.rb @@ -14,6 +14,7 @@ module Cascading require 'cascading/flow' require 'cascading/mode' require 'cascading/operations' +require 'cascading/aggregator_operations' require 'cascading/identity_operations' require 'cascading/filter_operations' require 'cascading/regex_operations' diff --git a/lib/cascading/aggregations.rb b/lib/cascading/aggregations.rb index 12a28de..00a5dc6 100644 --- a/lib/cascading/aggregations.rb +++ b/lib/cascading/aggregations.rb @@ -1,4 +1,4 @@ -require 'cascading/operations' +require 'cascading/aggregator_operations' require 'cascading/scope' require 'cascading/ext/array' @@ -16,8 +16,6 @@ module Cascading # Optimizations: # * If the leading Group is a GroupBy and all subsequent Everies are Aggregators that have a corresponding AggregateBy, Aggregations can replace the GroupBy/Aggregator pipe with a single composite AggregateBy class Aggregations - include Operations - attr_reader :assembly, :tail_pipe, :scope, :aggregate_bys def initialize(assembly, group, incoming_scopes) @@ -84,6 +82,8 @@ def every(*args) make_pipe(Java::CascadingPipe::Every, parameters) end + include AggregatorOperations + def assert_group(*args) options = args.extract_options! diff --git a/lib/cascading/aggregator_operations.rb b/lib/cascading/aggregator_operations.rb new file mode 100644 index 0000000..69069b2 --- /dev/null +++ b/lib/cascading/aggregator_operations.rb @@ -0,0 +1,27 @@ +module Cascading + module AggregatorOperations + def aggregator_function(args, aggregator_klass) + options = args.extract_options! 
+ ignore = options[:ignore] + + parameters = [Cascading.fields(args), ignore].compact + aggregator_klass.new(*parameters) + end + + def first_function(*args) + aggregator_function(args, Java::CascadingOperationAggregator::First) + end + + def min_function(*args) + aggregator_function(args, Java::CascadingOperationAggregator::Min) + end + + def max_function(*args) + aggregator_function(args, Java::CascadingOperationAggregator::Max) + end + + def last_function(*args) + aggregator_function(args, Java::CascadingOperationAggregator::Last) + end + end +end diff --git a/lib/cascading/assembly.rb b/lib/cascading/assembly.rb index ea30238..295500c 100644 --- a/lib/cascading/assembly.rb +++ b/lib/cascading/assembly.rb @@ -28,8 +28,6 @@ module Cascading # [into] c.o.Operation field declaration # [output] c.p.Each output selector class Assembly < Cascading::Node - include Operations - attr_reader :head_pipe, :tail_pipe def initialize(name, parent, outgoing_scopes = {}) @@ -323,6 +321,7 @@ def each(*args_with_params) each end + include Operations include IdentityOperations include FilterOperations include RegexOperations diff --git a/lib/cascading/operations.rb b/lib/cascading/operations.rb index 94f6a73..ecb8c38 100644 --- a/lib/cascading/operations.rb +++ b/lib/cascading/operations.rb @@ -1,40 +1,5 @@ module Cascading - # The Cascading::Operations module is deprecated. The original idea from long - # ago is that it would be useful to mixin operator wrappers to places other - # than Cascading::Assembly, but this is not true. Instead, put Eaches in - # Cascading::Assembly, Everies in Cascading::Aggregations, and any more - # generally useful utility code directly in the Cascading module - # (cascading/cascading.rb). - # - # Further, the entire *args pattern should be deprecated as it leads to - # functions that can only be understood by reading their code. Instead, - # idiomatic Ruby (positional required params and a params hash for optional - # args) should be used. 
See Cascading::Assembly#set_value for an example. module Operations - def aggregator_function(args, aggregator_klass) - options = args.extract_options! - ignore = options[:ignore] - - parameters = [Cascading.fields(args), ignore].compact - aggregator_klass.new(*parameters) - end - - def first_function(*args) - aggregator_function(args, Java::CascadingOperationAggregator::First) - end - - def min_function(*args) - aggregator_function(args, Java::CascadingOperationAggregator::Min) - end - - def max_function(*args) - aggregator_function(args, Java::CascadingOperationAggregator::Max) - end - - def last_function(*args) - aggregator_function(args, Java::CascadingOperationAggregator::Last) - end - def expression_function(*args) options = args.extract_options! diff --git a/test/test_aggregations.rb b/test/test_aggregations.rb index 69e8fa3..da0900b 100644 --- a/test/test_aggregations.rb +++ b/test/test_aggregations.rb @@ -6,6 +6,17 @@ class TC_Aggregations < Test::Unit::TestCase include MockAssemblies + include AggregatorOperations + + def test_aggregator_function_ignore_values + min = min_function 'min_field', :ignore => [nil].to_java(:string) + assert_not_nil min + end + + def test_aggregator_function_ignore_tuples + first = first_function 'first_field', :ignore => [Java::CascadingTuple::Tuple.new(-1)].to_java(Java::CascadingTuple::Tuple) + assert_not_nil first + end # first chosen because it does not have a corresponding AggregateBy def test_create_group_by diff --git a/test/test_operations.rb b/test/test_operations.rb index 9f098d8..5d54d27 100644 --- a/test/test_operations.rb +++ b/test/test_operations.rb @@ -4,16 +4,6 @@ class TC_Operations < Test::Unit::TestCase include Operations - def test_aggregator_function_ignore_values - min = min_function 'min_field', :ignore => [nil].to_java(:string) - assert_not_nil min - end - - def test_aggregator_function_ignore_tuples - first = first_function 'first_field', :ignore => 
[Java::CascadingTuple::Tuple.new(-1)].to_java(Java::CascadingTuple::Tuple) - assert_not_nil first - end - def test_coerce_to_java_int result = coerce_to_java(1) From 7f716825beab5e69c9797c5d7309ca4ca3eb1daa Mon Sep 17 00:00:00 2001 From: Matt Walker Date: Wed, 17 Apr 2013 15:05:10 -0500 Subject: [PATCH 09/36] Move remainder of operations from Assembly into Operations catch-all and refactor insert --- lib/cascading/assembly.rb | 164 ----------------------------------- lib/cascading/operations.rb | 168 ++++++++++++++++++++++++++++++------ 2 files changed, 142 insertions(+), 190 deletions(-) diff --git a/lib/cascading/assembly.rb b/lib/cascading/assembly.rb index 295500c..51249ba 100644 --- a/lib/cascading/assembly.rb +++ b/lib/cascading/assembly.rb @@ -345,169 +345,5 @@ def assert_not_null(params = {}) assertion = Java::CascadingOperationAssertion::AssertNotNull.new assert(assertion, params) end - - # Builds a debugging pipe. - # - # Without arguments, it generate a simple debug pipe, that prints all tuple to the standard - # output. - # - # The other named options are: - # * :print_fields a boolean. If is set to true, then it prints every 10 tuples. - # - def debug(params = {}) - print_fields = params[:print_fields] || true - debug = Java::CascadingOperation::Debug.new(print_fields) - debug.print_tuple_every = params[:tuple_interval] || 1 - debug.print_fields_every = params[:fields_interval] || 10 - each(all_fields, :filter => debug) - end - - # Builds a pipe that inserts values into the current tuple. - # - # The method takes a hash as parameter. This hash contains as keys the names of the fields to insert - # and as values, the values they must contain. For example: - # - # insert {"who" => "Grégoire", "when" => Time.now.strftime("%Y-%m-%d") } - # - # will insert two new fields: a field _who_ containing the string "Grégoire", and a field _when_ containing - # the formatted current date. - # The methods outputs all fields. 
- # The named options are: - def insert(args) - args.keys.sort.each do |field_name| - value = args[field_name] - - if value.kind_of?(ExprStub) - value.validate_scope(scope) - each all_fields, :function => expression_function(field_name, :expression => value.expression, :parameters => value.types), :output => all_fields - else - each all_fields, :function => insert_function([field_name], :values => [value]), :output => all_fields - end - end - end - - # Builds a pipe that evaluates the specified Janino expression and insert it in a new field in the tuple. - # - # The named options are: - # * :from a string or array of strings. Specifies the input fields. - # * :express a string. The janino expression. - # * :into a string. Specified the name of the field to insert with the result of the evaluation. - # * :parameters a hash. Specifies the type mapping for the parameters. See Cascading::Operations.expression_function. - def eval_expression(*args) - options = args.extract_options! - - into = options.delete(:into) - from = options.delete(:from) || all_fields - output = options.delete(:output) || all_fields - options[:expression] ||= args.shift - options[:parameters] ||= args.shift - - each from, :function => expression_function(into, options), :output=>output - end - - # Builds a pipe that returns distinct tuples based on the provided fields. - # - # The method accepts optional unamed argument specifying the fields to base the distinct on - # (all fields, by default). - def distinct(*args) - raise "Distinct is badly broken" - fields = args[0] || all_fields - group_by *fields - pass - end - - # Ungroups, or unpivots, a tuple (see Cascading's UnGroup at http://docs.cascading.org/cascading/2.0/javadoc/cascading/operation/function/UnGroup.html). - # - # You must provide :key and you must provide only one of :value_selectors - # and :num_values. - # - # The named options are: - # * :key required array of field names to replicate on every - # output row in an ungrouped group. 
- # * :value_selectors an array of field names to ungroup. Each - # field will be ungrouped into an output tuple along with the key fields - # in the order provided. - # * :num_values an integer specifying the number of fields to - # ungroup into each output tuple (excluding the key fields). All input - # fields will be ungrouped. - # * :input an array of field names that specifies the fields to - # input to UnGroup. Defaults to all_fields. - # * :into an array of field names. Default set by UnGroup. - # * :output an array of field names that specifies the fields to - # produce as output of UnGroup. Defaults to all_fields. - def ungroup(*args) - options = args.extract_options! - input = options[:input] || all_fields - into = fields(options[:into]) - output = options[:output] || all_fields - key = fields(options[:key]) - - raise 'You must provide exactly one of :value_selectors or :num_values to ungroup' unless options.has_key?(:value_selectors) ^ options.has_key?(:num_values) - value_selectors = options[:value_selectors].map{ |vs| fields(vs) }.to_java(Java::CascadingTuple::Fields) if options.has_key?(:value_selectors) - num_values = options[:num_values] if options.has_key?(:num_values) - - parameters = [into, key, value_selectors, num_values].compact - each input, :function => Java::CascadingOperationFunction::UnGroup.new(*parameters), :output => output - end - - # Inserts one of two values into the dataflow based upon the result of the - # supplied filter on the input fields. This is primarily useful for - # creating indicators from filters. - # - # Parameters: - # * input name of field to apply the filter. - # * filter Cascading Filter to apply. - # * keep_value Java value to produce when the filter would keep - # the given input. - # * remove_value Java value to produce when the filter would - # remove the given input. - # - # The named options are: - # * :into an output field name, defaulting to 'filter_value'. 
- # * :output an array of field names that specifies the fields to - # retain in the output tuple. Defaults to all_fields. - def set_value(input, filter, keep_value, remove_value, params = {}) - into = fields(params[:into] || 'filter_value') - output = params[:output] || all_fields - each input, :function => Java::CascadingOperationFunction::SetValue.new(into, filter, keep_value, remove_value), :output => output - end - - # Efficient way of inserting a null indicator for any field, even one that - # cannot be coerced to a string. This is accomplished using Cascading's - # FilterNull and SetValue operators rather than Janino. 1 is produced if - # the field is null and 0 otherwise. - # - # Parameters: - # * input name of field to check for null. - # - # The named options are: - # * :into an output field name, defaulting to 'is_null'. - # * :output an array of field names that specifies the fields to - # retain in the output tuple. Defaults to all_fields. - def null_indicator(input, params = {}) - into = fields(params[:into] || 'is_null') - output = params[:output] || all_fields - set_value input, Java::CascadingOperationFilter::FilterNull.new, 1.to_java, 0.to_java, :into => into, :output => output - end - - # Given a field and a regex, returns an indicator that is 1 if the string - # contains at least 1 match and 0 otherwise. - # - # Parameters: - # * input field name or names that specifies the fields over which - # to perform the match. - # * pattern regex to apply to the input. - # - # The named options are: - # * :into an output field name, defaulting to 'regex_contains'. - # * :output an array of field names that specifies the fields to - # retain in the output tuple. Defaults to all_fields. 
- def regex_contains(input, pattern, params = {}) - input = fields(input) - pattern = pattern.to_s # Supports JRuby regexes - into = fields(params[:into] || 'regex_contains') - output = params[:output] || all_fields - set_value input, Java::CascadingOperationRegex::RegexFilter.new(pattern), 1.to_java, 0.to_java, :into => into, :output => output - end end end diff --git a/lib/cascading/operations.rb b/lib/cascading/operations.rb index ecb8c38..3ca03ff 100644 --- a/lib/cascading/operations.rb +++ b/lib/cascading/operations.rb @@ -1,40 +1,156 @@ module Cascading module Operations - def expression_function(*args) - options = args.extract_options! + # Builds a debugging pipe. + # + # Without arguments, it generate a simple debug pipe, that prints all tuple to the standard + # output. + # + # The other named options are: + # * :print_fields a boolean. If is set to true, then it prints every 10 tuples. + # + def debug(params = {}) + print_fields = params[:print_fields] || true + debug = Java::CascadingOperation::Debug.new(print_fields) + debug.print_tuple_every = params[:tuple_interval] || 1 + debug.print_fields_every = params[:fields_interval] || 10 + each(all_fields, :filter => debug) + end - fields = Cascading.fields(args) - expression = options[:expression].to_s - parameters = options[:parameters] - parameter_names = [] - parameter_types = [] - if parameters.is_a? ::Hash - parameters.each do |name, type| - parameter_names << name - parameter_types << type + # Inserts new fields into the current assembly. Values may be constants or + # expressions (see Cascading::expr). Fields will be inserted in + # lexicographic order (not necessarily the order provided). 
+ # + # Example: + # insert 'field1' => 'constant_string', 'field2' => 0, 'field3' => expr('fieldA:long + fieldB:long') + def insert(insert_map) + insert_map.keys.sort.each do |field_name| + value = insert_map[field_name] + + if value.kind_of?(ExprStub) + value.validate_scope(scope) + names, types = value.names_and_types + each( + all_fields, + :function => Java::CascadingOperationExpression::ExpressionFunction.new(fields(field_name), value.expression, names, types), + :output => all_fields + ) + else # value is a constant + each( + all_fields, + :function => Java::CascadingOperation::Insert.new(fields(field_name), to_java_comparable_array([value])), + :output => all_fields + ) end - parameter_names = parameter_names.to_java(java.lang.String) - parameter_types = parameter_types.to_java(java.lang.Class) - - arguments = [fields, expression, parameter_names, parameter_types].compact - elsif !parameters.nil? - arguments = [fields, expression, parameters.java_class].compact - else - arguments = [fields, expression, java.lang.String.java_class].compact end + end - Java::CascadingOperationExpression::ExpressionFunction.new(*arguments) + # Builds a pipe that returns distinct tuples based on the provided fields. + # + # The method accepts optional unamed argument specifying the fields to base the distinct on + # (all fields, by default). + def distinct(*args) + raise "Distinct is badly broken" + fields = args[0] || all_fields + group_by *fields + pass end - def insert_function(*args) - options=args.extract_options! - fields = Cascading.fields(args) - values = options[:values] + # Ungroups, or unpivots, a tuple (see Cascading's UnGroup at http://docs.cascading.org/cascading/2.0/javadoc/cascading/operation/function/UnGroup.html). + # + # You must provide :key and you must provide only one of :value_selectors + # and :num_values. + # + # The named options are: + # * :key required array of field names to replicate on every + # output row in an ungrouped group. 
+ # * :value_selectors an array of field names to ungroup. Each + # field will be ungrouped into an output tuple along with the key fields + # in the order provided. + # * :num_values an integer specifying the number of fields to + # ungroup into each output tuple (excluding the key fields). All input + # fields will be ungrouped. + # * :input an array of field names that specifies the fields to + # input to UnGroup. Defaults to all_fields. + # * :into an array of field names. Default set by UnGroup. + # * :output an array of field names that specifies the fields to + # produce as output of UnGroup. Defaults to all_fields. + def ungroup(*args) + options = args.extract_options! + input = options[:input] || all_fields + into = fields(options[:into]) + output = options[:output] || all_fields + key = fields(options[:key]) + + raise 'You must provide exactly one of :value_selectors or :num_values to ungroup' unless options.has_key?(:value_selectors) ^ options.has_key?(:num_values) + value_selectors = options[:value_selectors].map{ |vs| fields(vs) }.to_java(Java::CascadingTuple::Fields) if options.has_key?(:value_selectors) + num_values = options[:num_values] if options.has_key?(:num_values) - parameters = [fields, to_java_comparable_array(values)].compact - Java::CascadingOperation::Insert.new(*parameters) + parameters = [into, key, value_selectors, num_values].compact + each input, :function => Java::CascadingOperationFunction::UnGroup.new(*parameters), :output => output end + # Inserts one of two values into the dataflow based upon the result of the + # supplied filter on the input fields. This is primarily useful for + # creating indicators from filters. + # + # Parameters: + # * input name of field to apply the filter. + # * filter Cascading Filter to apply. + # * keep_value Java value to produce when the filter would keep + # the given input. + # * remove_value Java value to produce when the filter would + # remove the given input. 
+ # + # The named options are: + # * :into an output field name, defaulting to 'filter_value'. + # * :output an array of field names that specifies the fields to + # retain in the output tuple. Defaults to all_fields. + def set_value(input, filter, keep_value, remove_value, params = {}) + into = fields(params[:into] || 'filter_value') + output = params[:output] || all_fields + each input, :function => Java::CascadingOperationFunction::SetValue.new(into, filter, keep_value, remove_value), :output => output + end + + # Efficient way of inserting a null indicator for any field, even one that + # cannot be coerced to a string. This is accomplished using Cascading's + # FilterNull and SetValue operators rather than Janino. 1 is produced if + # the field is null and 0 otherwise. + # + # Parameters: + # * input name of field to check for null. + # + # The named options are: + # * :into an output field name, defaulting to 'is_null'. + # * :output an array of field names that specifies the fields to + # retain in the output tuple. Defaults to all_fields. + def null_indicator(input, params = {}) + into = fields(params[:into] || 'is_null') + output = params[:output] || all_fields + set_value input, Java::CascadingOperationFilter::FilterNull.new, 1.to_java, 0.to_java, :into => into, :output => output + end + + # Given a field and a regex, returns an indicator that is 1 if the string + # contains at least 1 match and 0 otherwise. + # + # Parameters: + # * input field name or names that specifies the fields over which + # to perform the match. + # * pattern regex to apply to the input. + # + # The named options are: + # * :into an output field name, defaulting to 'regex_contains'. + # * :output an array of field names that specifies the fields to + # retain in the output tuple. Defaults to all_fields. 
+ def regex_contains(input, pattern, params = {}) + input = fields(input) + pattern = pattern.to_s # Supports JRuby regexes + into = fields(params[:into] || 'regex_contains') + output = params[:output] || all_fields + set_value input, Java::CascadingOperationRegex::RegexFilter.new(pattern), 1.to_java, 0.to_java, :into => into, :output => output + end + + private + def to_java_comparable_array(arr) (arr.map do |v| coerce_to_java(v) From 5fefc74292fba214c52807df80bb37c9fe864c06 Mon Sep 17 00:00:00 2001 From: Matt Walker Date: Wed, 17 Apr 2013 15:06:23 -0500 Subject: [PATCH 10/36] Remove distinct; not sure why it was even there given that it raises immediately. Should re-engineer wrapper around Unique AggregateBy instead. --- lib/cascading/operations.rb | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/lib/cascading/operations.rb b/lib/cascading/operations.rb index 3ca03ff..29e08bc 100644 --- a/lib/cascading/operations.rb +++ b/lib/cascading/operations.rb @@ -44,17 +44,6 @@ def insert(insert_map) end end - # Builds a pipe that returns distinct tuples based on the provided fields. - # - # The method accepts optional unamed argument specifying the fields to base the distinct on - # (all fields, by default). - def distinct(*args) - raise "Distinct is badly broken" - fields = args[0] || all_fields - group_by *fields - pass - end - - # Ungroups, or unpivots, a tuple (see Cascading's UnGroup at http://docs.cascading.org/cascading/2.0/javadoc/cascading/operation/function/UnGroup.html).
# # You must provide :key and you must provide only one of :value_selectors From 1876c30cc12b88d29ed46c44e61ea1de38c897cb Mon Sep 17 00:00:00 2001 From: Matt Walker Date: Wed, 17 Apr 2013 15:12:56 -0500 Subject: [PATCH 11/36] Update debug pipe contract to match Cascading --- lib/cascading/operations.rb | 26 ++++++++++++++++---------- 1 file changed, 16 insertions(+), 10 deletions(-) diff --git a/lib/cascading/operations.rb b/lib/cascading/operations.rb index 29e08bc..3a20396 100644 --- a/lib/cascading/operations.rb +++ b/lib/cascading/operations.rb @@ -1,19 +1,25 @@ module Cascading module Operations - # Builds a debugging pipe. - # - # Without arguments, it generate a simple debug pipe, that prints all tuple to the standard - # output. - # - # The other named options are: - # * :print_fields a boolean. If is set to true, then it prints every 10 tuples. + # Debugs the current assembly at runtime, printing every tuple and fields + # every 10 tuples by default. # + # The named params are: + # [prefix] String to prefix prints with. + # [print_fields] Boolean controlling field printing, defaults to false. + # [tuple_interval] Integer specifying interval between printed tuples + # [fields_interval] Integer specifying interval between printing fields def debug(params = {}) - print_fields = params[:print_fields] || true - debug = Java::CascadingOperation::Debug.new(print_fields) + input_fields = params[:input] || all_fields + prefix = params[:prefix] + print_fields = params[:print_fields] + + parameters = [prefix, print_fields].compact + debug = Java::CascadingOperation::Debug.new(*parameters) + debug.print_tuple_every = params[:tuple_interval] || 1 debug.print_fields_every = params[:fields_interval] || 10 - each(all_fields, :filter => debug) + + each(input_fields, :filter => debug) end # Inserts new fields into the current assembly. 
Values may be constants or From f6b49660529b5711cfc60da992fa4f95a56cee5c Mon Sep 17 00:00:00 2001 From: Matt Walker Date: Wed, 17 Apr 2013 15:34:48 -0500 Subject: [PATCH 12/36] Refactor and document ungroup/set_value/etc.; this is a non-backwards compatible change --- lib/cascading/operations.rb | 113 +++++++++++------------------- lib/cascading/regex_operations.rb | 4 +- samples/ungroup.rb | 9 +-- 3 files changed, 46 insertions(+), 80 deletions(-) diff --git a/lib/cascading/operations.rb b/lib/cascading/operations.rb index 3a20396..675e1ca 100644 --- a/lib/cascading/operations.rb +++ b/lib/cascading/operations.rb @@ -8,6 +8,9 @@ module Operations # [print_fields] Boolean controlling field printing, defaults to false. # [tuple_interval] Integer specifying interval between printed tuples # [fields_interval] Integer specifying interval between printing fields + # + # Example: + # debug :prefix => 'DEBUG', :print_fields => true, :fields_interval => 1000 def debug(params = {}) input_fields = params[:input] || all_fields prefix = params[:prefix] @@ -50,60 +53,44 @@ def insert(insert_map) end end - # Ungroups, or unpivots, a tuple (see Cascading's UnGroup at http://docs.cascading.org/cascading/2.0/javadoc/cascading/operation/function/UnGroup.html). + # Ungroups, or unpivots, a tuple (see Cascading's {UnGroup}[http://docs.cascading.org/cascading/2.1/javadoc/cascading/operation/function/UnGroup.html]). # - # You must provide :key and you must provide only one of :value_selectors - # and :num_values. + # You must provide exactly one of :value_selectors and :num_values. + # + # The named params are: + # [value_selectors] Array of field names to ungroup. Each field will be + # ungrouped into an output tuple along with the key fields + # in the order provided. + # [num_values] Integer specifying the number of fields to ungroup into each + # output tuple (excluding the key fields). All input fields + # will be ungrouped. 
# - # The named options are: - # * :key required array of field names to replicate on every - # output row in an ungrouped group. - # * :value_selectors an array of field names to ungroup. Each - # field will be ungrouped into an output tuple along with the key fields - # in the order provided. - # * :num_values an integer specifying the number of fields to - # ungroup into each output tuple (excluding the key fields). All input - # fields will be ungrouped. - # * :input an array of field names that specifies the fields to - # input to UnGroup. Defaults to all_fields. - # * :into an array of field names. Default set by UnGroup. - # * :output an array of field names that specifies the fields to - # produce as output of UnGroup. Defaults to all_fields. - def ungroup(*args) - options = args.extract_options! - input = options[:input] || all_fields - into = fields(options[:into]) - output = options[:output] || all_fields - key = fields(options[:key]) + # Example: + # ungroup 'key', ['new_key', 'val], :value_selectors => ['val1', 'val2', 'val3'], :output => ['new_key', 'val'] + def ungroup(key, into_fields, params = {}) + input_fields = params[:input] || all_fields + output = params[:output] || all_fields - raise 'You must provide exactly one of :value_selectors or :num_values to ungroup' unless options.has_key?(:value_selectors) ^ options.has_key?(:num_values) - value_selectors = options[:value_selectors].map{ |vs| fields(vs) }.to_java(Java::CascadingTuple::Fields) if options.has_key?(:value_selectors) - num_values = options[:num_values] if options.has_key?(:num_values) + raise 'You must provide exactly one of :value_selectors or :num_values to ungroup' unless params.has_key?(:value_selectors) ^ params.has_key?(:num_values) + value_selectors = params[:value_selectors].map{ |vs| fields(vs) }.to_java(Java::CascadingTuple::Fields) if params.has_key?(:value_selectors) + num_values = params[:num_values] if params.has_key?(:num_values) - parameters = [into, key, 
value_selectors, num_values].compact - each input, :function => Java::CascadingOperationFunction::UnGroup.new(*parameters), :output => output + parameters = [fields(into_fields), fields(key), value_selectors, num_values].compact + each input_fields, :function => Java::CascadingOperationFunction::UnGroup.new(*parameters), :output => output end # Inserts one of two values into the dataflow based upon the result of the - # supplied filter on the input fields. This is primarily useful for - # creating indicators from filters. - # - # Parameters: - # * input name of field to apply the filter. - # * filter Cascading Filter to apply. - # * keep_value Java value to produce when the filter would keep - # the given input. - # * remove_value Java value to produce when the filter would - # remove the given input. + # supplied filter on the input_fields. This is primarily useful for + # creating indicators from filters. keep_value specifies the Java value to + # produce when the filter would keep the given input and remove_value + # specifies the Java value to produce when the filter would remove the given + # input. # - # The named options are: - # * :into an output field name, defaulting to 'filter_value'. - # * :output an array of field names that specifies the fields to - # retain in the output tuple. Defaults to all_fields. 
- def set_value(input, filter, keep_value, remove_value, params = {}) - into = fields(params[:into] || 'filter_value') + # Example: + # set_value 'field1', Java::CascadingOperationFilter::FilterNull.new, 1.to_java, 0.to_java, 'is_field1_null' + def set_value(input_fields, filter, keep_value, remove_value, into_field, params = {}) output = params[:output] || all_fields - each input, :function => Java::CascadingOperationFunction::SetValue.new(into, filter, keep_value, remove_value), :output => output + each input_fields, :function => Java::CascadingOperationFunction::SetValue.new(fields(into_field), filter, keep_value, remove_value), :output => output end # Efficient way of inserting a null indicator for any field, even one that @@ -111,37 +98,19 @@ def set_value(input, filter, keep_value, remove_value, params = {}) # FilterNull and SetValue operators rather than Janino. 1 is produced if # the field is null and 0 otherwise. # - # Parameters: - # * input name of field to check for null. - # - # The named options are: - # * :into an output field name, defaulting to 'is_null'. - # * :output an array of field names that specifies the fields to - # retain in the output tuple. Defaults to all_fields. - def null_indicator(input, params = {}) - into = fields(params[:into] || 'is_null') - output = params[:output] || all_fields - set_value input, Java::CascadingOperationFilter::FilterNull.new, 1.to_java, 0.to_java, :into => into, :output => output + # Example: + # null_indicator 'field1', 'is_field1_null' + def null_indicator(input_field, into_field, params = {}) + set_value input_field, Java::CascadingOperationFilter::FilterNull.new, 1.to_java, 0.to_java, into_field, :output => params[:output] end - # Given a field and a regex, returns an indicator that is 1 if the string + # Given an input_field and a regex, returns an indicator that is 1 if the string # contains at least 1 match and 0 otherwise. 
# - # Parameters: - # * input field name or names that specifies the fields over which - # to perform the match. - # * pattern regex to apply to the input. - # - # The named options are: - # * :into an output field name, defaulting to 'regex_contains'. - # * :output an array of field names that specifies the fields to - # retain in the output tuple. Defaults to all_fields. - def regex_contains(input, pattern, params = {}) - input = fields(input) - pattern = pattern.to_s # Supports JRuby regexes - into = fields(params[:into] || 'regex_contains') - output = params[:output] || all_fields - set_value input, Java::CascadingOperationRegex::RegexFilter.new(pattern), 1.to_java, 0.to_java, :into => into, :output => output + # Example: + # regex_contains 'field1', /\w+\s+\w+/, 'does_field1_contain_pair' + def regex_contains(input_field, regex, into_field, params = {}) + set_value input_field, Java::CascadingOperationRegex::RegexFilter.new(pattern.to_s), 1.to_java, 0.to_java, into_field, :output => params[:output] end private diff --git a/lib/cascading/regex_operations.rb b/lib/cascading/regex_operations.rb index 9eb06f0..35451bf 100644 --- a/lib/cascading/regex_operations.rb +++ b/lib/cascading/regex_operations.rb @@ -26,7 +26,7 @@ module RegexOperations # a subset of groups. # # Example: - # parse 'field1', /([\w]+)\s+([\w]+)/, ['out1', 'out2'], :groups => [1, 2] + # parse 'field1', /(\w+)\s+(\w+)/, ['out1', 'out2'], :groups => [1, 2] def parse(input_field, regex, into_fields, params = {}) groups = params[:groups].to_java(:int) if params[:groups] output = params[:output] || all_fields # Overrides Cascading default @@ -84,7 +84,7 @@ def split_rows(input_field, regex, into_field, params = {}) # specified regular expression. 
# # Example: - # match_rows 'line', /([\w+])\s+([\w+])/, 'word' + # match_rows 'line', /(\w+)\s+(\w+)/, 'word' def match_rows(input_field, regex, into_field, params = {}) output = params[:output] || all_fields # Overrides Cascading default diff --git a/samples/ungroup.rb b/samples/ungroup.rb index 24d8216..e8e94ab 100755 --- a/samples/ungroup.rb +++ b/samples/ungroup.rb @@ -14,19 +14,16 @@ split 'line', /\t/, ['key', 'val1', 'val2', 'val3'], :output => ['key', 'val1', 'val2', 'val3'] branch 'ungroup_using_value_selectors' do - #each all_fields, :function => Java::CascadingOperationFunction::UnGroup.new(fields(['new_key', 'val']), fields('key'), [fields('val1'), fields('val2'), fields('val3')].to_java(Java::CascadingTuple::Fields)), :output => ['new_key', 'val'] - ungroup :key => 'key', :value_selectors => ['val1', 'val2', 'val3'], :into => ['new_key', 'val'], :output => ['new_key', 'val'] + ungroup 'key', ['new_key', 'val'], :value_selectors => ['val1', 'val2', 'val3'], :output => ['new_key', 'val'] end branch 'ungroup_using_num_values' do - #each all_fields, :function => Java::CascadingOperationFunction::UnGroup.new(fields(['new_key', 'val']), fields('key'), 1), :output => ['new_key', 'val'] - ungroup :key => 'key', :num_values => 1, :into => ['new_key', 'val'], :output => ['new_key', 'val'] + ungroup 'key', ['new_key', 'val'], :num_values => 1, :output => ['new_key', 'val'] end # This pairs up the first and last two fields with no "key" branch 'ungroup_no_key' do - #each all_fields, :function => Java::CascadingOperationFunction::UnGroup.new(fields(['left', 'right']), fields([]), 2), :output => ['left', 'right'] - ungroup :key => [], :num_values => 2, :into => ['left', 'right'], :output => ['left', 'right'] + ungroup [], ['left', 'right'], :num_values => 2, :output => ['left', 'right'] end end From 65eae31cb21bd16c227cd6bf95b59a0db6545d05 Mon Sep 17 00:00:00 2001 From: Matt Walker Date: Wed, 17 Apr 2013 15:49:17 -0500 Subject: [PATCH 13/36] Add examples to 
FilterOperations documentation --- lib/cascading/filter_operations.rb | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/lib/cascading/filter_operations.rb b/lib/cascading/filter_operations.rb index 2b4deca..07291ad 100644 --- a/lib/cascading/filter_operations.rb +++ b/lib/cascading/filter_operations.rb @@ -24,6 +24,10 @@ module FilterOperations # [match_each_element] Boolean indicating if regex should match entire # incoming tuple (joined with tabs) or each field # individually. Defaults to false. + # + # Example: + # filter :input => 'field1', :regex => /\t/, :remove_match => true + # filter :expression => 'field1:long > 0 && "".equals(field2:string)', :remove_match => true def filter(params = {}) input_fields = params[:input] || all_fields expression = params[:expression] @@ -51,6 +55,9 @@ def filter(params = {}) # Rejects tuples from the current assembly based on a Janino expression. # This is just a wrapper for FilterOperations.filter. + # + # Example: + # reject 'field1:long > 0 && "".equals(field2:string)' def reject(expression, params = {}) params[:expression] = expression filter(params) @@ -63,6 +70,9 @@ def reject(expression, params = {}) # attempt is made to support import statements prior to the expression. If # this support should break, simply negate your expression and use # FilterOperations.reject. + # + # Example: + # where 'field1:long > 0 && "".equals(field2:string)' def where(expression, params = {}) _, imports, expr = expression.match(/^((?:\s*import.*;\s*)*)(.*)$/).to_a params[:expression] = "#{imports}!(#{expr})" @@ -70,12 +80,18 @@ def where(expression, params = {}) end # Rejects tuples from the current assembly if any input field is null. + # + # Example: + # filter_null 'field1', 'field2' def filter_null(*input_fields) each(input_fields, :filter => Java::CascadingOperationFilter::FilterNull.new) end alias reject_null filter_null # Rejects tuples from the current assembly if any input field is not null. 
+ # + # Example: + # filter_not_null 'field1', 'field2' def filter_not_null(*input_fields) each(input_fields, :filter => Java::CascadingOperationFilter::FilterNotNull.new) end From 94ebec5c6060e550abce8b5eb4d197152ef97773 Mon Sep 17 00:00:00 2001 From: Matt Walker Date: Wed, 17 Apr 2013 21:23:54 -0500 Subject: [PATCH 14/36] Minor cleanups in Cascading module --- lib/cascading/cascading.rb | 56 ++++++++++++++++++-------------------- 1 file changed, 27 insertions(+), 29 deletions(-) diff --git a/lib/cascading/cascading.rb b/lib/cascading/cascading.rb index 763d313..b597aa7 100644 --- a/lib/cascading/cascading.rb +++ b/lib/cascading/cascading.rb @@ -1,6 +1,8 @@ require 'cascading/expr_stub' module Cascading + # Mapping that defines a convenient syntax for specifying Java classes, used + # in Janino expressions and elsewhere. JAVA_TYPE_MAP = { :int => java.lang.Integer.java_class, :long => java.lang.Long.java_class, :bool => java.lang.Boolean.java_class, :double => java.lang.Double.java_class, @@ -80,23 +82,26 @@ def all_fields Java::CascadingTuple::Fields::ALL end - def union_fields(*fields) - fields(fields.inject([]){ |acc, arr| acc | arr.to_a }) - end - - def difference_fields(*fields) - fields(fields[1..-1].inject(fields.first.to_a){ |acc, arr| acc - arr.to_a }) + def last_grouping_fields + Java::CascadingTuple::Fields::VALUES end - def copy_fields(fields) - fields.select(all_fields) + # Computes fields formed by removing remove_fields from base_fields. Operates + # only on named fields, not positional fields. + def difference_fields(base_fields, remove_fields) + fields(base_fields.to_a - remove_fields.to_a) end + # Combines fields deduplicating them with trailing underscores as necessary. + # This is used in joins to avoid requiring the caller to unique fields before + # they are joined. def dedup_fields(*fields) raise 'Can only be applied to declarators' unless fields.all?{ |f| f.is_declarator? 
} fields(dedup_field_names(*fields.map{ |f| f.to_a })) end + # Helper used by dedup_fields that operates on arrays of field names rather + # than fields objects. def dedup_field_names(*names) names.inject([]) do |acc, arr| acc + arr.map{ |e| search_field_name(acc, e) } @@ -106,32 +111,25 @@ def dedup_field_names(*names) def search_field_name(names, candidate) names.include?(candidate) ? search_field_name(names, "#{candidate}_") : candidate end - - def last_grouping_fields - Java::CascadingTuple::Fields::VALUES - end - - def results_fields - Java::CascadingTuple::Fields::RESULTS - end + private :search_field_name # Creates a TextLine scheme (can be used in both Cascading local and hadoop # modes). Positional args are used if :source_fields is not # provided. # - # The named options are: - # * :source_fields a string or array of strings. Specifies the - # fields to be read from a source with this scheme. Defaults to ['offset', 'line']. - # * :sink_fields a string or array of strings. Specifies the fields - # to be written to a sink with this scheme. Defaults to all_fields. - # * :compression a symbol, either :enable or - # :disable, that governs the TextLine scheme's compression. Defaults - # to the default TextLine compression (only applies to c.s.h.TextLine). - def text_line_scheme(*args) - options = args.extract_options! - source_fields = fields(options[:source_fields] || (args.empty? ? ['offset', 'line'] : args)) - sink_fields = fields(options[:sink_fields]) || all_fields - sink_compression = case options[:compression] + # The named params are: + # [source_fields] Fields to be read from a source with this scheme. Defaults + # to ['offset', 'line']. + # [sink_fields] Fields to be written to a sink with this scheme. Defaults to + # all_fields. + # [compression] A symbol, either :enable or :disable, that + # governs the TextLine scheme's compression. Defaults to the + # default TextLine compression (only applies to c.s.h.TextLine). 
+ def text_line_scheme(*args_with_params) + params, source_fields = args_with_params.extract_options!, args_with_params + source_fields = fields(params[:source_fields] || (source_fields.empty? ? ['offset', 'line'] : source_fields)) + sink_fields = fields(params[:sink_fields]) || all_fields + sink_compression = case params[:compression] when :enable then Java::CascadingSchemeHadoop::TextLine::Compress::ENABLE when :disable then Java::CascadingSchemeHadoop::TextLine::Compress::DISABLE else Java::CascadingSchemeHadoop::TextLine::Compress::DEFAULT From 23caa2a922788639952779b93f783028ddde8219 Mon Sep 17 00:00:00 2001 From: Matt Walker Date: Thu, 18 Apr 2013 07:09:52 -0500 Subject: [PATCH 15/36] Clean up Aggregations in preparation for removing *args --- lib/cascading.rb | 1 - lib/cascading/aggregations.rb | 37 +++++++++++++++++++++++--- lib/cascading/aggregator_operations.rb | 27 ------------------- test/test_aggregations.rb | 11 -------- 4 files changed, 34 insertions(+), 42 deletions(-) delete mode 100644 lib/cascading/aggregator_operations.rb diff --git a/lib/cascading.rb b/lib/cascading.rb index 407c9b3..12ceb85 100644 --- a/lib/cascading.rb +++ b/lib/cascading.rb @@ -14,7 +14,6 @@ module Cascading require 'cascading/flow' require 'cascading/mode' require 'cascading/operations' -require 'cascading/aggregator_operations' require 'cascading/identity_operations' require 'cascading/filter_operations' require 'cascading/regex_operations' diff --git a/lib/cascading/aggregations.rb b/lib/cascading/aggregations.rb index 00a5dc6..7024948 100644 --- a/lib/cascading/aggregations.rb +++ b/lib/cascading/aggregations.rb @@ -1,8 +1,12 @@ -require 'cascading/aggregator_operations' require 'cascading/scope' require 'cascading/ext/array' module Cascading + # Aggregations is the context available to you within the block of a group_by, + # union, or join that allows you to apply Every pipes to the result of those + # operations. 
You may apply aggregators and buffers within this context + # subject to several rules laid out by Cascading. + # # Rules enforced by Aggregations: # * Contains either 1 Buffer or >= 1 Aggregator (explicitly checked) # * No GroupBys, CoGroups, Joins, or Merges (methods for these pipes do not exist on Aggregations) @@ -15,6 +19,11 @@ module Cascading # # Optimizations: # * If the leading Group is a GroupBy and all subsequent Everies are Aggregators that have a corresponding AggregateBy, Aggregations can replace the GroupBy/Aggregator pipe with a single composite AggregateBy + # + # Aggregator and buffer DSL standard optional parameter names: + # [input] c.p.Every argument selector + # [into] c.o.Operation field declaration + # [output] c.p.Every output selector class Aggregations attr_reader :assembly, :tail_pipe, :scope, :aggregate_bys @@ -82,8 +91,6 @@ def every(*args) make_pipe(Java::CascadingPipe::Every, parameters) end - include AggregatorOperations - def assert_group(*args) options = args.extract_options! @@ -184,5 +191,29 @@ def extract_field_map(args) end [field_map, options] end + + def aggregator_function(args, aggregator_klass) + options = args.extract_options! 
+ ignore = options[:ignore] + + parameters = [Cascading.fields(args), ignore].compact + aggregator_klass.new(*parameters) + end + + def first_function(*args) + aggregator_function(args, Java::CascadingOperationAggregator::First) + end + + def min_function(*args) + aggregator_function(args, Java::CascadingOperationAggregator::Min) + end + + def max_function(*args) + aggregator_function(args, Java::CascadingOperationAggregator::Max) + end + + def last_function(*args) + aggregator_function(args, Java::CascadingOperationAggregator::Last) + end end end diff --git a/lib/cascading/aggregator_operations.rb b/lib/cascading/aggregator_operations.rb deleted file mode 100644 index 69069b2..0000000 --- a/lib/cascading/aggregator_operations.rb +++ /dev/null @@ -1,27 +0,0 @@ -module Cascading - module AggregatorOperations - def aggregator_function(args, aggregator_klass) - options = args.extract_options! - ignore = options[:ignore] - - parameters = [Cascading.fields(args), ignore].compact - aggregator_klass.new(*parameters) - end - - def first_function(*args) - aggregator_function(args, Java::CascadingOperationAggregator::First) - end - - def min_function(*args) - aggregator_function(args, Java::CascadingOperationAggregator::Min) - end - - def max_function(*args) - aggregator_function(args, Java::CascadingOperationAggregator::Max) - end - - def last_function(*args) - aggregator_function(args, Java::CascadingOperationAggregator::Last) - end - end -end diff --git a/test/test_aggregations.rb b/test/test_aggregations.rb index da0900b..69e8fa3 100644 --- a/test/test_aggregations.rb +++ b/test/test_aggregations.rb @@ -6,17 +6,6 @@ class TC_Aggregations < Test::Unit::TestCase include MockAssemblies - include AggregatorOperations - - def test_aggregator_function_ignore_values - min = min_function 'min_field', :ignore => [nil].to_java(:string) - assert_not_nil min - end - - def test_aggregator_function_ignore_tuples - first = first_function 'first_field', :ignore => 
[Java::CascadingTuple::Tuple.new(-1)].to_java(Java::CascadingTuple::Tuple) - assert_not_nil first - end # first chosen because it does not have a corresponding AggregateBy def test_create_group_by From a1851c4100836e1720b30938a7c700d1061e7997 Mon Sep 17 00:00:00 2001 From: Matt Walker Date: Thu, 18 Apr 2013 08:00:55 -0500 Subject: [PATCH 16/36] Begin removing *args, remove some unnecessary metaprogramming, and make every raise an exception when aggregator/buffer are mismatched --- lib/cascading/aggregations.rb | 115 +++++++++++++++------------------- 1 file changed, 50 insertions(+), 65 deletions(-) diff --git a/lib/cascading/aggregations.rb b/lib/cascading/aggregations.rb index 7024948..fe8f96b 100644 --- a/lib/cascading/aggregations.rb +++ b/lib/cascading/aggregations.rb @@ -50,6 +50,8 @@ def make_pipe(type, parameters) @tail_pipe = pipe @scope = Scope.outgoing_scope(tail_pipe, [scope]) + + tail_pipe end private :make_pipe @@ -74,65 +76,53 @@ def finalize # Builds an every pipe and adds it to the current list of aggregations. # Note that this list may be either exactly 1 Buffer or any number of # Aggregators. - def every(*args) - options = args.extract_options! 
- - in_fields = fields(args) - out_fields = fields(options[:output]) - operation = options[:aggregator] || options[:buffer] - - if options[:aggregate_by] && aggregate_bys - aggregate_bys << options[:aggregate_by] + def every(*args_with_params) + params, in_fields = args_with_params.extract_options!, fields(args_with_params) + out_fields = fields(params[:output]) + operation = params[:aggregator] || params[:buffer] + raise 'every requires either :aggregator or :buffer' unless operation + + if params[:aggregate_by] && aggregate_bys + aggregate_bys << params[:aggregate_by] else @aggregate_bys = nil end parameters = [tail_pipe, in_fields, operation, out_fields].compact - make_pipe(Java::CascadingPipe::Every, parameters) - end + every = make_pipe(Java::CascadingPipe::Every, parameters) + raise ':aggregator specified but c.o.Buffer provided' if params[:aggregator] && every.is_buffer + raise ':buffer specified but c.o.Aggregator provided' if params[:buffer] && every.is_aggregator - def assert_group(*args) - options = args.extract_options! + every + end - assertion = args[0] - assertion_level = options[:level] || Java::CascadingOperation::AssertionLevel::STRICT + def assert_group(assertion, params = {}) + assertion_level = params[:level] || Java::CascadingOperation::AssertionLevel::STRICT parameters = [tail_pipe, assertion_level, assertion] make_pipe(Java::CascadingPipe::Every, parameters) end - def assert_group_size_equals(*args) - options = args.extract_options! + def assert_group_size_equals(size, params = {}) + assertion = Java::CascadingOperationAssertion::AssertGroupSizeEquals.new(size) + assert_group(assertion, params) + end - assertion = Java::CascadingOperationAssertion::AssertGroupSizeEquals.new(args[0]) - assert_group(assertion, options) + def min(*args) + composite_aggregator(args, Java::CascadingOperationAggregator::Min) end - # Builds a series of every pipes for aggregation. 
- # - # Args can either be a list of fields to aggregate and an options hash or - # a hash that maps input field name to output field name (similar to - # insert) and an options hash. - # - # Options include: - # * :ignore a Java Array of Objects (for min and max) or Tuples - # (for first and last) of values for the aggregator to ignore - # * function is a symbol that is the method to call to construct - # the Cascading Aggregator. - def composite_aggregator(args, function) - field_map, options = extract_field_map(args) + def max(*args) + composite_aggregator(args, Java::CascadingOperationAggregator::Max) + end - field_map.each do |in_field, out_field| - agg = self.send(function, out_field, options) - every(in_field, :aggregator => agg, :output => all_fields) - end - raise "Composite aggregator '#{function.to_s.gsub('_function', '')}' invoked on 0 fields" if field_map.empty? + def first(*args) + composite_aggregator(args, Java::CascadingOperationAggregator::First) end - def min(*args); composite_aggregator(args, :min_function); end - def max(*args); composite_aggregator(args, :max_function); end - def first(*args); composite_aggregator(args, :first_function); end - def last(*args); composite_aggregator(args, :last_function); end + def last(*args) + composite_aggregator(args, Java::CascadingOperationAggregator::Last) + end # Counts elements of a group. May optionally specify the name of the # output count field (defaults to 'count'). @@ -177,6 +167,25 @@ def average(*args) private + # Builds a series of every pipes for aggregation. + # + # Args can either be a list of fields to aggregate and an options hash or + # a hash that maps input field name to output field name (similar to + # insert) and an options hash. + # + # The named params are: + # [ignore] Java Array of Objects (for min and max) or Tuples (for first and + # last) of values for the aggregator to ignore. 
+ def composite_aggregator(args, aggregator) + field_map, params = extract_field_map(args) + + field_map.each do |in_field, out_field| + parameters = [fields(out_field), params[:ignore]].compact + every(in_field, :aggregator => aggregator.new(*parameters), :output => all_fields) + end + raise "Composite aggregator '#{aggregator}' invoked on 0 fields" if field_map.empty? + end + # Extracts a field mapping, input field => output field, by accepting a # hash in the first argument. If no hash is provided, then maps arguments # onto themselves which names outputs the same as inputs. Additionally @@ -191,29 +200,5 @@ def extract_field_map(args) end [field_map, options] end - - def aggregator_function(args, aggregator_klass) - options = args.extract_options! - ignore = options[:ignore] - - parameters = [Cascading.fields(args), ignore].compact - aggregator_klass.new(*parameters) - end - - def first_function(*args) - aggregator_function(args, Java::CascadingOperationAggregator::First) - end - - def min_function(*args) - aggregator_function(args, Java::CascadingOperationAggregator::Min) - end - - def max_function(*args) - aggregator_function(args, Java::CascadingOperationAggregator::Max) - end - - def last_function(*args) - aggregator_function(args, Java::CascadingOperationAggregator::Last) - end end end From fcdd43f9f71097c36e28107485cd75bf6f68fe82 Mon Sep 17 00:00:00 2001 From: Matt Walker Date: Thu, 18 Apr 2013 08:21:52 -0500 Subject: [PATCH 17/36] Correct use of split_rows in README --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 7cfa943..b8e6397 100644 --- a/README.md +++ b/README.md @@ -15,7 +15,7 @@ cascade 'wordcount', :mode => :local do source 'input', tap(input_path) assembly 'input' do - split_rows 'line', 'word', :pattern => /[.,]*\s+/, :output => 'word' + split_rows 'line', /[.,]*\s+/, 'word', :output => 'word' group_by 'word' do count end From 419d74c7d16da600cc4e9feb65d2227b9f6d9085 Mon Sep 17 
00:00:00 2001 From: Matt Walker Date: Thu, 18 Apr 2013 15:03:04 -0500 Subject: [PATCH 18/36] Remove additional *args; inspiration isn't striking, though, so this is just a rename --- lib/cascading/aggregations.rb | 42 +++++++++++++++++------------------ 1 file changed, 21 insertions(+), 21 deletions(-) diff --git a/lib/cascading/aggregations.rb b/lib/cascading/aggregations.rb index fe8f96b..cb85639 100644 --- a/lib/cascading/aggregations.rb +++ b/lib/cascading/aggregations.rb @@ -108,20 +108,20 @@ def assert_group_size_equals(size, params = {}) assert_group(assertion, params) end - def min(*args) - composite_aggregator(args, Java::CascadingOperationAggregator::Min) + def min(*args_with_params) + composite_aggregator(args_with_params, Java::CascadingOperationAggregator::Min) end - def max(*args) - composite_aggregator(args, Java::CascadingOperationAggregator::Max) + def max(*args_with_params) + composite_aggregator(args_with_params, Java::CascadingOperationAggregator::Max) end - def first(*args) - composite_aggregator(args, Java::CascadingOperationAggregator::First) + def first(*args_with_params) + composite_aggregator(args_with_params, Java::CascadingOperationAggregator::First) end - def last(*args) - composite_aggregator(args, Java::CascadingOperationAggregator::Last) + def last(*args_with_params) + composite_aggregator(args_with_params, Java::CascadingOperationAggregator::Last) end # Counts elements of a group. May optionally specify the name of the @@ -138,11 +138,11 @@ def count(name = 'count') # parameter (in which case they will be aggregated from the field named by # the key into the field named by the value after being sorted). The type # of the output sum may be controlled with the :type parameter. - def sum(*args) - options = args.extract_options! - type = JAVA_TYPE_MAP[options[:type]] + def sum(*args_with_params) + params, in_fields = args_with_params.extract_options!, args_with_params + type = JAVA_TYPE_MAP[params[:type]] - mapping = options[:mapping] ? 
options[:mapping].sort : args.zip(args) + mapping = params[:mapping] ? params[:mapping].sort : in_fields.zip(in_fields) mapping.each do |in_field, out_field| sum_aggregator = Java::CascadingOperationAggregator::Sum.new(*[fields(out_field), type].compact) # NOTE: SumBy requires a type in wip-286, unlike Sum (see Sum.java line 42 for default) @@ -153,9 +153,9 @@ def sum(*args) end # Averages one or more fields. The contract of average is identical to - # that of other composite aggregators, but it accepts no options. - def average(*args) - field_map, _ = extract_field_map(args) + # that of other composite aggregators, but it accepts no params. + def average(*fields_or_field_map) + field_map, _ = extract_field_map(fields_or_field_map) field_map.each do |in_field, out_field| average_aggregator = Java::CascadingOperationAggregator::Average.new(fields(out_field)) @@ -169,9 +169,9 @@ def average(*args) # Builds a series of every pipes for aggregation. # - # Args can either be a list of fields to aggregate and an options hash or + # Args can either be a list of fields to aggregate and an params hash or # a hash that maps input field name to output field name (similar to - # insert) and an options hash. + # insert) and an params hash. # # The named params are: # [ignore] Java Array of Objects (for min and max) or Tuples (for first and @@ -189,16 +189,16 @@ def composite_aggregator(args, aggregator) # Extracts a field mapping, input field => output field, by accepting a # hash in the first argument. If no hash is provided, then maps arguments # onto themselves which names outputs the same as inputs. Additionally - # extracts options from args. + # extracts params from args. def extract_field_map(args) if !args.empty? && args.first.kind_of?(Hash) field_map = args.shift.sort - options = args.extract_options! + params = args.extract_options! else - options = args.extract_options! + params = args.extract_options! 
field_map = args.zip(args) end - [field_map, options] + [field_map, params] end end end From 07d7c3a39942f8346a61a45cb55e47bc5b984d09 Mon Sep 17 00:00:00 2001 From: Matt Walker Date: Fri, 19 Apr 2013 05:45:53 -0500 Subject: [PATCH 19/36] I believe we inherited the name "params" from Rails, which doesn't apply here, so uniformly use "options" instead --- lib/cascading/aggregations.rb | 73 +++++++++--------- lib/cascading/assembly.rb | 118 ++++++++++++++--------------- lib/cascading/cascade.rb | 14 ++-- lib/cascading/cascading.rb | 36 ++++----- lib/cascading/expr_stub.rb | 10 +-- lib/cascading/ext/array.rb | 10 +-- lib/cascading/filter_operations.rb | 28 +++---- lib/cascading/flow.rb | 6 +- lib/cascading/operations.rb | 43 +++++------ lib/cascading/regex_operations.rb | 28 +++---- lib/cascading/tap.rb | 8 +- lib/cascading/text_operations.rb | 10 +-- spec/spec_util.rb | 24 +++--- 13 files changed, 205 insertions(+), 203 deletions(-) diff --git a/lib/cascading/aggregations.rb b/lib/cascading/aggregations.rb index cb85639..5d08168 100644 --- a/lib/cascading/aggregations.rb +++ b/lib/cascading/aggregations.rb @@ -76,52 +76,52 @@ def finalize # Builds an every pipe and adds it to the current list of aggregations. # Note that this list may be either exactly 1 Buffer or any number of # Aggregators. 
- def every(*args_with_params) - params, in_fields = args_with_params.extract_options!, fields(args_with_params) - out_fields = fields(params[:output]) - operation = params[:aggregator] || params[:buffer] + def every(*args_with_options) + options, in_fields = args_with_options.extract_options!, fields(args_with_options) + out_fields = fields(options[:output]) + operation = options[:aggregator] || options[:buffer] raise 'every requires either :aggregator or :buffer' unless operation - if params[:aggregate_by] && aggregate_bys - aggregate_bys << params[:aggregate_by] + if options[:aggregate_by] && aggregate_bys + aggregate_bys << options[:aggregate_by] else @aggregate_bys = nil end parameters = [tail_pipe, in_fields, operation, out_fields].compact every = make_pipe(Java::CascadingPipe::Every, parameters) - raise ':aggregator specified but c.o.Buffer provided' if params[:aggregator] && every.is_buffer - raise ':buffer specified but c.o.Aggregator provided' if params[:buffer] && every.is_aggregator + raise ':aggregator specified but c.o.Buffer provided' if options[:aggregator] && every.is_buffer + raise ':buffer specified but c.o.Aggregator provided' if options[:buffer] && every.is_aggregator every end - def assert_group(assertion, params = {}) - assertion_level = params[:level] || Java::CascadingOperation::AssertionLevel::STRICT + def assert_group(assertion, options = {}) + assertion_level = options[:level] || Java::CascadingOperation::AssertionLevel::STRICT parameters = [tail_pipe, assertion_level, assertion] make_pipe(Java::CascadingPipe::Every, parameters) end - def assert_group_size_equals(size, params = {}) + def assert_group_size_equals(size, options = {}) assertion = Java::CascadingOperationAssertion::AssertGroupSizeEquals.new(size) - assert_group(assertion, params) + assert_group(assertion, options) end - def min(*args_with_params) - composite_aggregator(args_with_params, Java::CascadingOperationAggregator::Min) + def min(*args_with_options) + 
composite_aggregator(args_with_options, Java::CascadingOperationAggregator::Min) end - def max(*args_with_params) - composite_aggregator(args_with_params, Java::CascadingOperationAggregator::Max) + def max(*args_with_options) + composite_aggregator(args_with_options, Java::CascadingOperationAggregator::Max) end - def first(*args_with_params) - composite_aggregator(args_with_params, Java::CascadingOperationAggregator::First) + def first(*args_with_options) + composite_aggregator(args_with_options, Java::CascadingOperationAggregator::First) end - def last(*args_with_params) - composite_aggregator(args_with_params, Java::CascadingOperationAggregator::Last) + def last(*args_with_options) + composite_aggregator(args_with_options, Java::CascadingOperationAggregator::Last) end # Counts elements of a group. May optionally specify the name of the @@ -138,11 +138,11 @@ def count(name = 'count') # parameter (in which case they will be aggregated from the field named by # the key into the field named by the value after being sorted). The type # of the output sum may be controlled with the :type parameter. - def sum(*args_with_params) - params, in_fields = args_with_params.extract_options!, args_with_params - type = JAVA_TYPE_MAP[params[:type]] + def sum(*args_with_options) + options, in_fields = args_with_options.extract_options!, args_with_options + type = JAVA_TYPE_MAP[options[:type]] - mapping = params[:mapping] ? params[:mapping].sort : in_fields.zip(in_fields) + mapping = options[:mapping] ? options[:mapping].sort : in_fields.zip(in_fields) mapping.each do |in_field, out_field| sum_aggregator = Java::CascadingOperationAggregator::Sum.new(*[fields(out_field), type].compact) # NOTE: SumBy requires a type in wip-286, unlike Sum (see Sum.java line 42 for default) @@ -153,7 +153,7 @@ def sum(*args_with_params) end # Averages one or more fields. The contract of average is identical to - # that of other composite aggregators, but it accepts no params. 
+ # that of other composite aggregators, but it accepts no options. def average(*fields_or_field_map) field_map, _ = extract_field_map(fields_or_field_map) @@ -169,19 +169,22 @@ def average(*fields_or_field_map) # Builds a series of every pipes for aggregation. # - # Args can either be a list of fields to aggregate and an params hash or + # Args can either be a list of fields to aggregate and an options hash or # a hash that maps input field name to output field name (similar to - # insert) and an params hash. + # insert) and an options hash. # - # The named params are: + # The named options are: # [ignore] Java Array of Objects (for min and max) or Tuples (for first and # last) of values for the aggregator to ignore. def composite_aggregator(args, aggregator) - field_map, params = extract_field_map(args) + field_map, options = extract_field_map(args) field_map.each do |in_field, out_field| - parameters = [fields(out_field), params[:ignore]].compact - every(in_field, :aggregator => aggregator.new(*parameters), :output => all_fields) + every( + in_field, + :aggregator => aggregator.new(*[fields(out_field), options[:ignore]].compact), + :output => all_fields + ) end raise "Composite aggregator '#{aggregator}' invoked on 0 fields" if field_map.empty? end @@ -189,16 +192,16 @@ def composite_aggregator(args, aggregator) # Extracts a field mapping, input field => output field, by accepting a # hash in the first argument. If no hash is provided, then maps arguments # onto themselves which names outputs the same as inputs. Additionally - # extracts params from args. + # extracts options from args. def extract_field_map(args) if !args.empty? && args.first.kind_of?(Hash) field_map = args.shift.sort - params = args.extract_options! + options = args.extract_options! else - params = args.extract_options! + options = args.extract_options! 
field_map = args.zip(args) end - [field_map, params] + [field_map, options] end end end diff --git a/lib/cascading/assembly.rb b/lib/cascading/assembly.rb index 51249ba..ba74f06 100644 --- a/lib/cascading/assembly.rb +++ b/lib/cascading/assembly.rb @@ -17,11 +17,11 @@ module Cascading # # Function and filter DSL rules: # * Use positional arguments for required parameters - # * Use params = {} for optional parameters + # * Use options = {} for optional parameters # * Use *args sparingly, specifically when you need to accept a varying length list of fields # * If you require both a varying length list of fields and optional parameters, then see the Array#extract_options! extension - # * If you choose to name a required parameter, add it to params = {} and throw an exception if the caller does not provide it - # * If you have a require parameter that is provided by one of a set of params names, throw an exception if the caller does not provide at least one value (see :function and :filter in Assembly#each for an example) + # * If you choose to name a required parameter, add it to options = {} and throw an exception if the caller does not provide it + # * If you have a require parameter that is provided by one of a set of options names, throw an exception if the caller does not provide at least one value (see :function and :filter in Assembly#each for an example) # # Function and filter DSL standard optional parameter names: # [input] c.p.Each argument selector @@ -125,10 +125,10 @@ def to_s "#{name} : head pipe : #{head_pipe} - tail pipe: #{tail_pipe}" end - def prepare_join(assembly_names, params, &block) + def prepare_join(assembly_names, options, &block) pipes, _ = populate_incoming_scopes(assembly_names) - group_fields_args = params[:on] + group_fields_args = options[:on] raise 'join requires :on parameter' unless group_fields_args if group_fields_args.kind_of?(String) @@ -149,9 +149,9 @@ def prepare_join(assembly_names, params, &block) raise 'join requires 
non-empty :on parameter' if group_fields_args.empty? group_fields = group_fields.to_java(Java::CascadingTuple::Fields) incoming_fields = @incoming_scopes.map{ |s| s.values_fields } - declared_fields = fields(params[:declared_fields] || dedup_fields(*incoming_fields)) - joiner = params[:joiner] - is_hash_join = params[:hash] || false + declared_fields = fields(options[:declared_fields] || dedup_fields(*incoming_fields)) + joiner = options[:joiner] + is_hash_join = options[:hash] || false case joiner when :inner, 'inner', nil @@ -200,47 +200,47 @@ def prepare_join(assembly_names, params, &block) # Builds a HashJoin pipe. This should be used carefully, as the right side # of the join is accumulated entirely in memory. Requires a list of assembly # names to join and :on to specify the join_fields. - def hash_join(*args_with_params, &block) - params, assembly_names = args_with_params.extract_options!, args_with_params - params[:hash] = true - prepare_join(assembly_names, params, &block) + def hash_join(*args_with_options, &block) + options, assembly_names = args_with_options.extract_options!, args_with_options + options[:hash] = true + prepare_join(assembly_names, options, &block) end # Builds a join (CoGroup) pipe. Requires a list of assembly names to join # and :on to specify the group_fields. - def join(*args_with_params, &block) - params, assembly_names = args_with_params.extract_options!, args_with_params - params[:hash] = false - prepare_join(assembly_names, params, &block) + def join(*args_with_options, &block) + options, assembly_names = args_with_options.extract_options!, args_with_options + options[:hash] = false + prepare_join(assembly_names, options, &block) end alias co_group join - def inner_join(*args_with_params, &block) - params = args_with_params.extract_options! - params[:joiner] = :inner - args_with_params << params - join(*args_with_params, &block) + def inner_join(*args_with_options, &block) + options = args_with_options.extract_options! 
+ options[:joiner] = :inner + args_with_options << options + join(*args_with_options, &block) end - def left_join(*args_with_params, &block) - params = args_with_params.extract_options! - params[:joiner] = :left - args_with_params << params - join(*args_with_params, &block) + def left_join(*args_with_options, &block) + options = args_with_options.extract_options! + options[:joiner] = :left + args_with_options << options + join(*args_with_options, &block) end - def right_join(*args_with_params, &block) - params = args_with_params.extract_options! - params[:joiner] = :right - args_with_params << params - join(*args_with_params, &block) + def right_join(*args_with_options, &block) + options = args_with_options.extract_options! + options[:joiner] = :right + args_with_options << options + join(*args_with_options, &block) end - def outer_join(*args_with_params, &block) - params = args_with_params.extract_options! - params[:joiner] = :outer - args_with_params << params - join(*args_with_params, &block) + def outer_join(*args_with_options, &block) + options = args_with_options.extract_options! + options[:joiner] = :outer + args_with_options << options + join(*args_with_options, &block) end # Builds a new branch. @@ -253,12 +253,12 @@ def branch(name, &block) end # Builds a new GroupBy pipe that groups on the fields given in - # args_with_params. Any block passed to this method should contain only + # args_with_options. Any block passed to this method should contain only # Everies. 
- def group_by(*args_with_params, &block) - params, group_fields = args_with_params.extract_options!, fields(args_with_params) - sort_fields = fields(params[:sort_by]) - reverse = params[:reverse] + def group_by(*args_with_options, &block) + options, group_fields = args_with_options.extract_options!, fields(args_with_options) + sort_fields = fields(options[:sort_by]) + reverse = options[:reverse] parameters = [tail_pipe, group_fields, sort_fields, reverse].compact apply_aggregations(Java::CascadingPipe::GroupBy.new(*parameters), [scope], &block) @@ -270,11 +270,11 @@ def group_by(*args_with_params, &block) # aggregations. # # By default, groups only on the first field (see line 189 of GroupBy.java) - def union(*args_with_params, &block) - params, assembly_names = args_with_params.extract_options!, args_with_params - group_fields = fields(params[:on]) - sort_fields = fields(params[:sort_by]) - reverse = params[:reverse] + def union(*args_with_options, &block) + options, assembly_names = args_with_options.extract_options!, args_with_options + group_fields = fields(options[:on]) + sort_fields = fields(options[:sort_by]) + reverse = options[:reverse] pipes, _ = populate_incoming_scopes(assembly_names) @@ -306,17 +306,17 @@ def sub_assembly(sub_assembly, pipes = [tail_pipe], incoming_scopes = [scope]) # Builds a basic each pipe, and adds it to the current assembly. # # Default arguments are all_fields, a default inherited from c.o.Each. 
- def each(*args_with_params) - params, in_fields = args_with_params.extract_options!, fields(args_with_params) - out_fields = fields(params[:output]) # Default Fields.RESULTS from c.o.Each - operation = params[:filter] || params[:function] + def each(*args_with_options) + options, in_fields = args_with_options.extract_options!, fields(args_with_options) + out_fields = fields(options[:output]) # Default Fields.RESULTS from c.o.Each + operation = options[:filter] || options[:function] raise 'each requires either :filter or :function' unless operation - raise 'c.p.Each does not support applying an output selector to a c.o.Filter' if params[:filter] && params[:output] + raise 'c.p.Each does not support applying an output selector to a c.o.Filter' if options[:filter] && options[:output] parameters = [tail_pipe, in_fields, operation, out_fields].compact each = make_pipe(Java::CascadingPipe::Each, parameters) - raise ':function specified but c.o.Filter provided' if params[:function] && each.is_filter - raise ':filter specified but c.o.Function provided' if params[:filter] && each.is_function + raise ':function specified but c.o.Filter provided' if options[:function] && each.is_filter + raise ':filter specified but c.o.Function provided' if options[:filter] && each.is_function each end @@ -327,23 +327,23 @@ def each(*args_with_params) include RegexOperations include TextOperations - def assert(assertion, params = {}) - assertion_level = params[:level] || Java::CascadingOperation::AssertionLevel::STRICT + def assert(assertion, options = {}) + assertion_level = options[:level] || Java::CascadingOperation::AssertionLevel::STRICT parameters = [tail_pipe, assertion_level, assertion] make_pipe(Java::CascadingPipe::Each, parameters) end # Builds a pipe that assert the size of the tuple is the size specified in parameter. 
- def assert_size_equals(size, params = {}) + def assert_size_equals(size, options = {}) assertion = Java::CascadingOperationAssertion::AssertSizeEquals.new(size) - assert(assertion, params) + assert(assertion, options) end # Builds a pipe that assert the none of the fields in the tuple are null. - def assert_not_null(params = {}) + def assert_not_null(options = {}) assertion = Java::CascadingOperationAssertion::AssertNotNull.new - assert(assertion, params) + assert(assertion, options) end end end diff --git a/lib/cascading/cascade.rb b/lib/cascading/cascade.rb index 3a9de80..4246ccb 100644 --- a/lib/cascading/cascade.rb +++ b/lib/cascading/cascade.rb @@ -17,9 +17,9 @@ class Cascade < Cascading::Node # Cascading::Flow#initialize for details on how flows handle properties. # Optionally accepts a :mode which will be used as the default mode for all # child flows. See Cascading::Mode.parse for details. - def initialize(name, params = {}) - @properties = params[:properties] || {} - @mode = params[:mode] + def initialize(name, options = {}) + @properties = options[:properties] || {} + @mode = options[:mode] super(name, nil) # A Cascade cannot have a parent self.class.add(name, self) end @@ -28,13 +28,13 @@ def initialize(name, params = {}) # :properties which will override the default properties stroed in this # cascade. Optionally accepts a :mode, which will override the default # mode stored in this cascade. - def flow(name, params = {}, &block) + def flow(name, options = {}, &block) raise "Could not build flow '#{name}'; block required" unless block_given? 
- params[:properties] ||= properties.dup - params[:mode] ||= mode + options[:properties] ||= properties.dup + options[:mode] ||= mode - flow = Flow.new(name, self, params) + flow = Flow.new(name, self, options) add_child(flow) flow.instance_eval(&block) flow diff --git a/lib/cascading/cascading.rb b/lib/cascading/cascading.rb index b597aa7..215dd2a 100644 --- a/lib/cascading/cascading.rb +++ b/lib/cascading/cascading.rb @@ -28,13 +28,13 @@ module Cascading # Builds a top-level cascade given a name and a block. Optionally accepts a # :mode, as explained in Cascading::Cascade#initialize. - def cascade(name, params = {}, &block) + def cascade(name, options = {}, &block) raise "Could not build cascade '#{name}'; block required" unless block_given? - raise 'Cascading::cascade does not accept the :properties param only the global $jobconf_properties' if params[:properties] + raise 'Cascading::cascade does not accept the :properties param only the global $jobconf_properties' if options[:properties] - params[:properties] = $jobconf_properties.dup if defined?($jobconf_properties) && $jobconf_properties + options[:properties] = $jobconf_properties.dup if defined?($jobconf_properties) && $jobconf_properties - cascade = Cascade.new(name, params) + cascade = Cascade.new(name, options) cascade.instance_eval(&block) cascade end @@ -42,13 +42,13 @@ def cascade(name, params = {}, &block) # Builds a top-level flow given a name and block for applications built of # flows with no cascades. Optionally accepts a :mode, as explained in # Cascading::Flow#initialize. - def flow(name, params = {}, &block) + def flow(name, options = {}, &block) raise "Could not build flow '#{name}'; block required" unless block_given? 
- raise 'Cascading::flow does not accept the :properties param only the global $jobconf_properties' if params[:properties] + raise 'Cascading::flow does not accept the :properties param only the global $jobconf_properties' if options[:properties] - params[:properties] = $jobconf_properties.dup if defined?($jobconf_properties) && $jobconf_properties + options[:properties] = $jobconf_properties.dup if defined?($jobconf_properties) && $jobconf_properties - flow = Flow.new(name, nil, params) + flow = Flow.new(name, nil, options) flow.instance_eval(&block) flow end @@ -59,8 +59,8 @@ def describe alias desc describe # See ExprStub.expr - def expr(expression, params = {}) - ExprStub.expr(expression, params) + def expr(expression, options = {}) + ExprStub.expr(expression, options) end # Creates a cascading.tuple.Fields instance from a string or an array of strings. @@ -117,7 +117,7 @@ def search_field_name(names, candidate) # modes). Positional args are used if :source_fields is not # provided. # - # The named params are: + # The named options are: # [source_fields] Fields to be read from a source with this scheme. Defaults # to ['offset', 'line']. # [sink_fields] Fields to be written to a sink with this scheme. Defaults to @@ -125,11 +125,11 @@ def search_field_name(names, candidate) # [compression] A symbol, either :enable or :disable, that # governs the TextLine scheme's compression. Defaults to the # default TextLine compression (only applies to c.s.h.TextLine). - def text_line_scheme(*args_with_params) - params, source_fields = args_with_params.extract_options!, args_with_params - source_fields = fields(params[:source_fields] || (source_fields.empty? ? 
['offset', 'line'] : source_fields)) - sink_fields = fields(params[:sink_fields]) || all_fields - sink_compression = case params[:compression] + def text_line_scheme(*args_with_options) + options, source_fields = args_with_options.extract_options!, args_with_options + source_fields = fields(options[:source_fields] || (source_fields.empty? ? ['offset', 'line'] : source_fields)) + sink_fields = fields(options[:sink_fields]) || all_fields + sink_compression = case options[:compression] when :enable then Java::CascadingSchemeHadoop::TextLine::Compress::ENABLE when :disable then Java::CascadingSchemeHadoop::TextLine::Compress::DISABLE else Java::CascadingSchemeHadoop::TextLine::Compress::DEFAULT @@ -160,8 +160,8 @@ def multi_sink_tap(*taps) end # Creates a Cascading::Tap given a path and optional :scheme and :sink_mode. - def tap(path, params = {}) - Tap.new(path, params) + def tap(path, options = {}) + Tap.new(path, options) end # Constructs properties to be passed to Flow#complete or Cascade#complete diff --git a/lib/cascading/expr_stub.rb b/lib/cascading/expr_stub.rb index 30c0a43..c17d3eb 100644 --- a/lib/cascading/expr_stub.rb +++ b/lib/cascading/expr_stub.rb @@ -33,15 +33,15 @@ def to_s # Convenience constructor for an ExprStub that optionally performs # validation. Takes a string to use as a Janino expression and an optional - # params hash. By default, the param :validate is set to true (performs + # options hash. By default, the param :validate is set to true (performs # expression validation using default actual argument values) and the param # :validate_with is set to {} (which doesn't override any of the default # actual argument values used for validation). - def self.expr(expression, params = {}) - params = { :validate => true, :validate_with => {} }.merge(params) + def self.expr(expression, options = {}) + options = { :validate => true, :validate_with => {} }.merge(options) expr_stub = expression.kind_of?(ExprStub) ? 
expression : ExprStub.new(expression).compile - expr_stub.validate(params[:validate_with]) if params[:validate] - puts "Expression validation is disabled for '#{expression}'" unless params[:validate] + expr_stub.validate(options[:validate_with]) if options[:validate] + puts "Expression validation is disabled for '#{expression}'" unless options[:validate] expr_stub end diff --git a/lib/cascading/ext/array.rb b/lib/cascading/ext/array.rb index e6f636c..c8def8e 100644 --- a/lib/cascading/ext/array.rb +++ b/lib/cascading/ext/array.rb @@ -5,16 +5,16 @@ # end # # The most obvious limitation of the approach is that function definitions of -# the form f(*args_with_params) are not self-documenting. To compensate for +# the form f(*args_with_options) are not self-documenting. To compensate for # this, documentation of all arguments and optional parameters must be provided # on the DSL method. class Array # Use this extension to extract the optional parameters from a - # *args_with_params argument. + # *args_with_options argument. # So if you have a function: - # def f(*args_with_params) - # You can destructively process the args_with_params as follows: - # params, just_args = args_with_params.extract_options!, args_with_params + # def f(*args_with_options) + # You can destructively process the args_with_options as follows: + # options, just_args = args_with_options.extract_options!, args_with_options def extract_options! last.is_a?(::Hash) ? pop : {} end diff --git a/lib/cascading/filter_operations.rb b/lib/cascading/filter_operations.rb index 07291ad..3f5cfd4 100644 --- a/lib/cascading/filter_operations.rb +++ b/lib/cascading/filter_operations.rb @@ -11,7 +11,7 @@ module Cascading module FilterOperations # Filter the current assembly based on an expression or regex, but not both. # - # The named params are: + # The named options are: # [expression] A Janino expression used to filter. Has access to all :input # fields. 
# [validate] Boolean passed to Cascading#expr to enable or disable @@ -28,12 +28,12 @@ module FilterOperations # Example: # filter :input => 'field1', :regex => /\t/, :remove_match => true # filter :expression => 'field1:long > 0 && "".equals(field2:string)', :remove_match => true - def filter(params = {}) - input_fields = params[:input] || all_fields - expression = params[:expression] - regex = params[:regex] - validate = params.has_key?(:validate) ? params[:validate] : true - validate_with = params[:validate_with] || {} + def filter(options = {}) + input_fields = options[:input] || all_fields + expression = options[:expression] + regex = options[:regex] + validate = options.has_key?(:validate) ? options[:validate] : true + validate_with = options[:validate_with] || {} if expression stub = expr(expression, { :validate => validate, :validate_with => validate_with }) @@ -46,7 +46,7 @@ def filter(params = {}) types ) elsif regex - parameters = [regex.to_s, params[:remove_match], params[:match_each_element]].compact + parameters = [regex.to_s, options[:remove_match], options[:match_each_element]].compact each input_fields, :filter => Java::CascadingOperationRegex::RegexFilter.new(*parameters) else raise 'filter requires one of :expression or :regex' @@ -58,9 +58,9 @@ def filter(params = {}) # # Example: # reject 'field1:long > 0 && "".equals(field2:string)' - def reject(expression, params = {}) - params[:expression] = expression - filter(params) + def reject(expression, options = {}) + options[:expression] = expression + filter(options) end # Keeps tuples from the current assembly based on a Janino expression. 
This @@ -73,10 +73,10 @@ def reject(expression, params = {}) # # Example: # where 'field1:long > 0 && "".equals(field2:string)' - def where(expression, params = {}) + def where(expression, options = {}) _, imports, expr = expression.match(/^((?:\s*import.*;\s*)*)(.*)$/).to_a - params[:expression] = "#{imports}!(#{expr})" - filter(params) + options[:expression] = "#{imports}!(#{expr})" + filter(options) end # Rejects tuples from the current assembly if any input field is null. diff --git a/lib/cascading/flow.rb b/lib/cascading/flow.rb index 631cc96..cc37c87 100644 --- a/lib/cascading/flow.rb +++ b/lib/cascading/flow.rb @@ -18,10 +18,10 @@ class Flow < Cascading::Node # properties are propagated through cascades. Optionally accepts a :mode # which will determine the execution mode of this flow. See # Cascading::Mode.parse for details. - def initialize(name, parent, params = {}) + def initialize(name, parent, options = {}) @sources, @sinks, @incoming_scopes, @outgoing_scopes, @listeners = {}, {}, {}, {}, [] - @properties = params[:properties] || {} - @mode = Mode.parse(params[:mode]) + @properties = options[:properties] || {} + @mode = Mode.parse(options[:mode]) @flow_scope = Scope.flow_scope(name) super(name, parent) self.class.add(name, self) diff --git a/lib/cascading/operations.rb b/lib/cascading/operations.rb index 675e1ca..c816b11 100644 --- a/lib/cascading/operations.rb +++ b/lib/cascading/operations.rb @@ -3,7 +3,7 @@ module Operations # Debugs the current assembly at runtime, printing every tuple and fields # every 10 tuples by default. # - # The named params are: + # The named options are: # [prefix] String to prefix prints with. # [print_fields] Boolean controlling field printing, defaults to false. 
# [tuple_interval] Integer specifying interval between printed tuples @@ -11,16 +11,15 @@ module Operations # # Example: # debug :prefix => 'DEBUG', :print_fields => true, :fields_interval => 1000 - def debug(params = {}) - input_fields = params[:input] || all_fields - prefix = params[:prefix] - print_fields = params[:print_fields] + def debug(options = {}) + input_fields = options[:input] || all_fields + prefix = options[:prefix] + print_fields = options[:print_fields] - parameters = [prefix, print_fields].compact - debug = Java::CascadingOperation::Debug.new(*parameters) + debug = Java::CascadingOperation::Debug.new(*[prefix, print_fields].compact) - debug.print_tuple_every = params[:tuple_interval] || 1 - debug.print_fields_every = params[:fields_interval] || 10 + debug.print_tuple_every = options[:tuple_interval] || 1 + debug.print_fields_every = options[:fields_interval] || 10 each(input_fields, :filter => debug) end @@ -57,7 +56,7 @@ def insert(insert_map) # # You must provide exactly one of :value_selectors and :num_values. # - # The named params are: + # The named options are: # [value_selectors] Array of field names to ungroup. Each field will be # ungrouped into an output tuple along with the key fields # in the order provided. 
@@ -67,13 +66,13 @@ def insert(insert_map) # # Example: # ungroup 'key', ['new_key', 'val], :value_selectors => ['val1', 'val2', 'val3'], :output => ['new_key', 'val'] - def ungroup(key, into_fields, params = {}) - input_fields = params[:input] || all_fields - output = params[:output] || all_fields + def ungroup(key, into_fields, options = {}) + input_fields = options[:input] || all_fields + output = options[:output] || all_fields - raise 'You must provide exactly one of :value_selectors or :num_values to ungroup' unless params.has_key?(:value_selectors) ^ params.has_key?(:num_values) - value_selectors = params[:value_selectors].map{ |vs| fields(vs) }.to_java(Java::CascadingTuple::Fields) if params.has_key?(:value_selectors) - num_values = params[:num_values] if params.has_key?(:num_values) + raise 'You must provide exactly one of :value_selectors or :num_values to ungroup' unless options.has_key?(:value_selectors) ^ options.has_key?(:num_values) + value_selectors = options[:value_selectors].map{ |vs| fields(vs) }.to_java(Java::CascadingTuple::Fields) if options.has_key?(:value_selectors) + num_values = options[:num_values] if options.has_key?(:num_values) parameters = [fields(into_fields), fields(key), value_selectors, num_values].compact each input_fields, :function => Java::CascadingOperationFunction::UnGroup.new(*parameters), :output => output @@ -88,8 +87,8 @@ def ungroup(key, into_fields, params = {}) # # Example: # set_value 'field1', Java::CascadingOperationFilter::FilterNull.new, 1.to_java, 0.to_java, 'is_field1_null' - def set_value(input_fields, filter, keep_value, remove_value, into_field, params = {}) - output = params[:output] || all_fields + def set_value(input_fields, filter, keep_value, remove_value, into_field, options = {}) + output = options[:output] || all_fields each input_fields, :function => Java::CascadingOperationFunction::SetValue.new(fields(into_field), filter, keep_value, remove_value), :output => output end @@ -100,8 +99,8 @@ def 
set_value(input_fields, filter, keep_value, remove_value, into_field, params # # Example: # null_indicator 'field1', 'is_field1_null' - def null_indicator(input_field, into_field, params = {}) - set_value input_field, Java::CascadingOperationFilter::FilterNull.new, 1.to_java, 0.to_java, into_field, :output => params[:output] + def null_indicator(input_field, into_field, options = {}) + set_value input_field, Java::CascadingOperationFilter::FilterNull.new, 1.to_java, 0.to_java, into_field, :output => options[:output] end # Given an input_field and a regex, returns an indicator that is 1 if the string @@ -109,8 +108,8 @@ def null_indicator(input_field, into_field, params = {}) # # Example: # regex_contains 'field1', /\w+\s+\w+/, 'does_field1_contain_pair' - def regex_contains(input_field, regex, into_field, params = {}) - set_value input_field, Java::CascadingOperationRegex::RegexFilter.new(pattern.to_s), 1.to_java, 0.to_java, into_field, :output => params[:output] + def regex_contains(input_field, regex, into_field, options = {}) + set_value input_field, Java::CascadingOperationRegex::RegexFilter.new(pattern.to_s), 1.to_java, 0.to_java, into_field, :output => options[:output] end private diff --git a/lib/cascading/regex_operations.rb b/lib/cascading/regex_operations.rb index 35451bf..c7824fc 100644 --- a/lib/cascading/regex_operations.rb +++ b/lib/cascading/regex_operations.rb @@ -21,15 +21,15 @@ module RegexOperations # Parses the given input_field using the specified regular expression to # produce one output per group in that expression. # - # The named params are: + # The named options are: # [groups] Array of integers specifying which groups to capture if you want # a subset of groups. 
# # Example: # parse 'field1', /(\w+)\s+(\w+)/, ['out1', 'out2'], :groups => [1, 2] - def parse(input_field, regex, into_fields, params = {}) - groups = params[:groups].to_java(:int) if params[:groups] - output = params[:output] || all_fields # Overrides Cascading default + def parse(input_field, regex, into_fields, options = {}) + groups = options[:groups].to_java(:int) if options[:groups] + output = options[:output] || all_fields # Overrides Cascading default input_field = fields(input_field) raise "input_field must declare exactly one field, was '#{input_field}'" unless input_field.size == 1 @@ -47,8 +47,8 @@ def parse(input_field, regex, into_fields, params = {}) # # Example: # split 'line', /\s+/, ['out1', 'out2'] - def split(input_field, regex, into_fields, params = {}) - output = params[:output] || all_fields # Overrides Cascading default + def split(input_field, regex, into_fields, options = {}) + output = options[:output] || all_fields # Overrides Cascading default input_field = fields(input_field) raise "input_field must declare exactly one field, was '#{input_field}'" unless input_field.size == 1 @@ -65,8 +65,8 @@ def split(input_field, regex, into_fields, params = {}) # # Example: # split_rows 'line', /\s+/, 'word' - def split_rows(input_field, regex, into_field, params = {}) - output = params[:output] || all_fields # Overrides Cascading default + def split_rows(input_field, regex, into_field, options = {}) + output = options[:output] || all_fields # Overrides Cascading default input_field = fields(input_field) raise "input_field must declare exactly one field, was '#{input_field}'" unless input_field.size == 1 @@ -85,8 +85,8 @@ def split_rows(input_field, regex, into_field, params = {}) # # Example: # match_rows 'line', /(\w+)\s+(\w+)/, 'word' - def match_rows(input_field, regex, into_field, params = {}) - output = params[:output] || all_fields # Overrides Cascading default + def match_rows(input_field, regex, into_field, options = {}) + output = 
options[:output] || all_fields # Overrides Cascading default input_field = fields(input_field) raise "input_field must declare exactly one field, was '#{input_field}'" unless input_field.size == 1 @@ -103,21 +103,21 @@ def match_rows(input_field, regex, into_field, params = {}) # Performs a query/replace on the given input_field using the specified # regular expression and replacement. # - # The named params are: + # The named options are: # [replace_all] Boolean indicating if all matches should be replaced; # defaults to true (the Cascading default). # # Example: # replace 'line', /[.,]*\s+/, 'tab_separated_line', "\t" - def replace(input_field, regex, into_field, replacement, params = {}) - output = params[:output] || all_fields # Overrides Cascading default + def replace(input_field, regex, into_field, replacement, options = {}) + output = options[:output] || all_fields # Overrides Cascading default input_field = fields(input_field) raise "input_field must declare exactly one field, was '#{input_field}'" unless input_field.size == 1 into_field = fields(into_field) raise "into_field must declare exactly one field, was '#{into_field}'" unless into_field.size == 1 - parameters = [into_field, regex.to_s, replacement.to_s, params[:replace_all]].compact + parameters = [into_field, regex.to_s, replacement.to_s, options[:replace_all]].compact each( input_field, :function => Java::CascadingOperationRegex::RegexReplace.new(*parameters), diff --git a/lib/cascading/tap.rb b/lib/cascading/tap.rb index c1fb254..528aa70 100644 --- a/lib/cascading/tap.rb +++ b/lib/cascading/tap.rb @@ -32,17 +32,17 @@ def hadoop? 
class Tap < BaseTap attr_reader :scheme, :path, :sink_mode - def initialize(path, params = {}) + def initialize(path, options = {}) @path = path - @scheme = params[:scheme] || text_line_scheme + @scheme = options[:scheme] || text_line_scheme raise "Scheme must provide one of :local_scheme or :hadoop_scheme; received: '#{scheme.inspect}'" unless scheme[:local_scheme] || scheme[:hadoop_scheme] - @sink_mode = case params[:sink_mode] || :keep + @sink_mode = case options[:sink_mode] || :keep when :keep, 'keep' then Java::CascadingTap::SinkMode::KEEP when :replace, 'replace' then Java::CascadingTap::SinkMode::REPLACE when :append, 'append' then Java::CascadingTap::SinkMode::APPEND - else raise "Unrecognized sink mode '#{params[:sink_mode]}'" + else raise "Unrecognized sink mode '#{options[:sink_mode]}'" end local_scheme = scheme[:local_scheme] diff --git a/lib/cascading/text_operations.rb b/lib/cascading/text_operations.rb index a54973c..39e8339 100644 --- a/lib/cascading/text_operations.rb +++ b/lib/cascading/text_operations.rb @@ -12,8 +12,8 @@ module TextOperations # # Example: # parse_date 'text_date', 'yyyy/MM/dd', 'timestamp' - def parse_date(input_field, date_format, into_field, params = {}) - output = params[:output] || all_fields # Overrides Cascading default + def parse_date(input_field, date_format, into_field, options = {}) + output = options[:output] || all_fields # Overrides Cascading default input_field = fields(input_field) raise "input_field must declare exactly one field, was '#{input_field}'" unless input_field.size == 1 @@ -32,8 +32,8 @@ def parse_date(input_field, date_format, into_field, params = {}) # # Example: # format_date 'timestamp', 'yyyy/MM/dd', 'text_date' - def format_date(input_field, date_format, into_field, params = {}) - output = params[:output] || all_fields # Overrides Cascading default + def format_date(input_field, date_format, into_field, options = {}) + output = options[:output] || all_fields # Overrides Cascading default 
input_field = fields(input_field) raise "input_field must declare exactly one field, was '#{input_field}'" unless input_field.size == 1 @@ -52,7 +52,7 @@ def format_date(input_field, date_format, into_field, params = {}) # Example: # join_fields ['field1', 'field2'], ',', 'comma_separated' def join_fields(input_fields, delimiter, into_field) - output = params[:output] || all_fields # Overrides Cascading default + output = options[:output] || all_fields # Overrides Cascading default into_field = fields(into_field) raise "into_field must declare exactly one field, was '#{into_field}'" unless into_field.size == 1 diff --git a/spec/spec_util.rb b/spec/spec_util.rb index 10c5c4c..127a994 100644 --- a/spec/spec_util.rb +++ b/spec/spec_util.rb @@ -2,14 +2,14 @@ BUILD_DIR = 'build/spec' module ScopeTests - def check_scope(params = {}) - name_params = [params[:source]].compact - scope = scope(*name_params) - values_fields = params[:values_fields] - grouping_fields = params[:grouping_fields] || values_fields + def check_scope(options = {}) + name_options = [options[:source]].compact + scope = scope(*name_options) + values_fields = options[:values_fields] + grouping_fields = options[:grouping_fields] || values_fields - debug = params[:debug] - debug_scope(*name_params) if debug + debug = options[:debug] + debug_scope(*name_options) if debug scope.values_fields.to_a.should == values_fields scope.grouping_fields.to_a.should == grouping_fields @@ -29,8 +29,8 @@ def test_flow(&block) cascade.complete end -def test_assembly(params = {}, &block) - branches = params[:branches] || [] +def test_assembly(options = {}, &block) + branches = options[:branches] || [] test_flow do source 'input', tap('spec/resource/test_input.txt', :scheme => text_line_scheme) @@ -49,9 +49,9 @@ def test_assembly(params = {}, &block) end end -def test_join_assembly(params = {}, &block) - branches = params[:branches] || [] - post_join_block = params[:post_join_block] +def test_join_assembly(options = {}, 
&block) + branches = options[:branches] || [] + post_join_block = options[:post_join_block] test_flow do source 'left', tap('spec/resource/join_input.txt', :scheme => text_line_scheme) From 9ba7db1c12b3cb0cc5b7fae96e1b23e3384b42b5 Mon Sep 17 00:00:00 2001 From: Matt Walker Date: Fri, 19 Apr 2013 06:17:19 -0500 Subject: [PATCH 20/36] Alias filter operations to their Cascading equivalents, but prefer verbs to nouns for operations --- lib/cascading/assembly.rb | 5 +++++ lib/cascading/filter_operations.rb | 7 ++++--- lib/cascading/regex_operations.rb | 5 +++++ 3 files changed, 14 insertions(+), 3 deletions(-) diff --git a/lib/cascading/assembly.rb b/lib/cascading/assembly.rb index ba74f06..a304dc3 100644 --- a/lib/cascading/assembly.rb +++ b/lib/cascading/assembly.rb @@ -27,6 +27,11 @@ module Cascading # [input] c.p.Each argument selector # [into] c.o.Operation field declaration # [output] c.p.Each output selector + # + # A note on aliases: when a DSL method uniquely wraps a single Cascading + # operation, we attempt to provide an alias that matches the Cascading + # operation. However, Cascading operations are often nouns rather than verbs, + # and the latter are preferable for a dataflow DSL. class Assembly < Cascading::Node attr_reader :head_pipe, :tail_pipe diff --git a/lib/cascading/filter_operations.rb b/lib/cascading/filter_operations.rb index 3f5cfd4..8ae26d8 100644 --- a/lib/cascading/filter_operations.rb +++ b/lib/cascading/filter_operations.rb @@ -27,15 +27,16 @@ module FilterOperations # # Example: # filter :input => 'field1', :regex => /\t/, :remove_match => true - # filter :expression => 'field1:long > 0 && "".equals(field2:string)', :remove_match => true + # filter :expression => 'field1:long > 0 && "".equals(field2:string)' def filter(options = {}) input_fields = options[:input] || all_fields expression = options[:expression] regex = options[:regex] - validate = options.has_key?(:validate) ? 
options[:validate] : true - validate_with = options[:validate_with] || {} if expression + validate = options.has_key?(:validate) ? options[:validate] : true + validate_with = options[:validate_with] || {} + stub = expr(expression, { :validate => validate, :validate_with => validate_with }) stub.validate_scope(scope) diff --git a/lib/cascading/regex_operations.rb b/lib/cascading/regex_operations.rb index c7824fc..daa02fc 100644 --- a/lib/cascading/regex_operations.rb +++ b/lib/cascading/regex_operations.rb @@ -41,6 +41,7 @@ def parse(input_field, regex, into_fields, options = {}) :output => output ) end + alias regex_parser parse # Splits the given input_field into multiple fields using the specified # regular expression. @@ -59,6 +60,7 @@ def split(input_field, regex, into_fields, options = {}) :output => output ) end + alias regex_splitter split # Splits the given input_field into new rows using the specified regular # expression. @@ -79,6 +81,7 @@ def split_rows(input_field, regex, into_field, options = {}) :output => output ) end + alias regex_split_generator split_rows # Emits a new row for each regex group matched in input_field using the # specified regular expression. @@ -99,6 +102,7 @@ def match_rows(input_field, regex, into_field, options = {}) :output => output ) end + alias regex_generator match_rows # Performs a query/replace on the given input_field using the specified # regular expression and replacement. 
@@ -124,5 +128,6 @@ def replace(input_field, regex, into_field, replacement, options = {}) :output => output ) end + alias regex_replace replace end end From 381410189c6caab40403711f11a282b9460ab710 Mon Sep 17 00:00:00 2001 From: Matt Walker Date: Wed, 24 Apr 2013 11:28:24 -0500 Subject: [PATCH 21/36] Document Cascading module and Taps --- README.md | 2 +- cascading.jruby.gemspec | 2 +- lib/cascading/cascading.rb | 108 +++++++++++++++++++++++++++++++++---- lib/cascading/tap.rb | 50 ++++++++++++----- 4 files changed, 138 insertions(+), 24 deletions(-) diff --git a/README.md b/README.md index b8e6397..7105c6f 100644 --- a/README.md +++ b/README.md @@ -30,6 +30,6 @@ cascading.jruby provides a clean Ruby interface to Cascading, but doesn't attemp For operations you can apply to your dataflow within a pipe assembly, see the [Assembly](http://rubydoc.info/gems/cascading.jruby/0.0.10/Cascading/Assembly) class. For operations available within a block passed to a group_by, union, or join, see the [Aggregations](http://rubydoc.info/gems/cascading.jruby/0.0.10/Cascading/Aggregations) class. -Note that the Ruby code you write merely constructs a Cascading job, so no JRuby runtime is required on your cluster. This stands in contrast with writing [Hadoop streaming jobs in Ruby](http://www.quora.com/How-do-the-different-options-for-Ruby-on-Hadoop-compare). To run cascading.jruby applications on a Hadoop cluster, you must use [Jading](https://github.com/etsy/jading) to package them into a job jar. +Note that the Ruby code you write merely constructs a Cascading job, so no JRuby runtime is required on your cluster. This stands in contrast with writing [Hadoop streaming jobs in Ruby](http://www.quora.com/How-do-the-different-options-for-Ruby-on-Hadoop-compare). To run cascading.jruby applications on a Hadoop cluster, you must use [Jading](https://github.com/mrwalker/jading) to package them into a job jar. 
cascading.jruby has been tested on JRuby versions 1.2.0, 1.4.0, 1.5.3, 1.6.5, 1.6.7.2, 1.7.0, and 1.7.3. diff --git a/cascading.jruby.gemspec b/cascading.jruby.gemspec index 21a077a..7b60f10 100644 --- a/cascading.jruby.gemspec +++ b/cascading.jruby.gemspec @@ -8,7 +8,7 @@ Gem::Specification.new do |s| s.authors = ["Matt Walker", "Gr\303\251goire Marabout"] s.description = "cascading.jruby is a small DSL above Cascading, written in JRuby" s.email = "mwalker@etsy.com" - s.extra_rdoc_files = ["LICENSE.txt"] + s.extra_rdoc_files = ["README.md", "LICENSE.txt"] s.files = Dir.glob("lib/**/*.rb") s.homepage = "http://github.com/etsy/cascading.jruby" s.rdoc_options = ["--main", "README.md"] diff --git a/lib/cascading/cascading.rb b/lib/cascading/cascading.rb index 215dd2a..9d4755a 100644 --- a/lib/cascading/cascading.rb +++ b/lib/cascading/cascading.rb @@ -1,5 +1,30 @@ +require 'cascading/cascade' +require 'cascading/flow' require 'cascading/expr_stub' +# The Cascading module contains all of the cascading.jruby DSL. Inserting the +# following into your script: +# require 'rubygems' +# require 'cascading' +# includes this module at the top level, making all of its features available. +# +# To build a dataflow like the one in the README.md or +# {samples}[http://github.com/mrwalker/cascading.jruby/tree/master/samples], +# start by looking at Cascade or Flow. These are the +# highest level structures you'll use to put together your job. +# +# Within a flow, you'll connect sources to sinks by way of Assembly, which +# refers to "pipe assemblies" from Cascading. Within an Assembly, you'll use +# functions and filters (see Operations, IdentityOperations, RegexOperations, +# FilterOperations, and TextOperations) as well as Assembly#group_by, +# Assembly#union, and Assembly#join. You can provide those last pipes with a +# block that can select operations from Aggregations. 
+# +# Finally, you'll want to address the execution of your job, whether it be +# locally testing or running remotely on a Hadoop cluster. See the Mode class +# for the available modes, and parameterize your script such that it can operate +# in Cascading local mode locally and in Hadoop mode when run in a jar produced +# with {Jading}[http://github.com/mrwalker/jading]. module Cascading # Mapping that defines a convenient syntax for specifying Java classes, used # in Janino expressions and elsewhere. @@ -26,8 +51,21 @@ module Cascading # directly building their own cascades and flows so that jading can send them # default properties. - # Builds a top-level cascade given a name and a block. Optionally accepts a - # :mode, as explained in Cascading::Cascade#initialize. + # Builds a top-level cascade given a name and a block. + # + # The named options are: + # [mode] See Cascade#initialize + # + # Example: + # cascade 'wordcount', :mode => :local do + # flow 'first_step' do + # ... + # end + # + # flow 'second_step' do + # ... + # end + # end def cascade(name, options = {}, &block) raise "Could not build cascade '#{name}'; block required" unless block_given? raise 'Cascading::cascade does not accept the :properties param only the global $jobconf_properties' if options[:properties] @@ -40,8 +78,21 @@ def cascade(name, options = {}, &block) end # Builds a top-level flow given a name and block for applications built of - # flows with no cascades. Optionally accepts a :mode, as explained in - # Cascading::Flow#initialize. + # flows with no cascades. + # + # The named options are: + # [mode] See Flow#initialize + # + # Example: + # flow 'wordcount', :mode => :local do + # assembly 'first_step' do + # ... + # end + # + # assembly 'second_step' do + # ... + # end + # end def flow(name, options = {}, &block) raise "Could not build flow '#{name}'; block required" unless block_given? 
raise 'Cascading::flow does not accept the :properties param only the global $jobconf_properties' if options[:properties] @@ -53,6 +104,11 @@ def flow(name, options = {}, &block) flow end + # Produces a textual description of all Cascades in the global registry. The + # description details structure, sources, sinks, and the input and output + # fields of each assembly. + # + # NOTE: will be moved to Job in later version def describe Cascade.all.map{ |cascade| cascade.describe }.join("\n") end @@ -63,7 +119,14 @@ def expr(expression, options = {}) ExprStub.expr(expression, options) end - # Creates a cascading.tuple.Fields instance from a string or an array of strings. + # Utility method for creating Cascading c.t.Fields from a field name (string) + # or list of field names (array of strings). If the input fields is already a + # c.t.Fields or nil, it is passed through. This allows for flexible use of + # the method at multiple layers in the DSL. + # + # Example: + # cascading_fields = fields(['first', 'second', 'third']) + # # cascading_fields.to_a == ['first', 'second', 'third'] def fields(fields) if fields.nil? return nil @@ -78,16 +141,24 @@ def fields(fields) return Java::CascadingTuple::Fields.new([fields].flatten.map{ |f| f.kind_of?(Fixnum) ? java.lang.Integer.new(f) : f }.to_java(java.lang.Comparable)) end + # Convenience method wrapping c.t.Fields::ALL def all_fields Java::CascadingTuple::Fields::ALL end + # Convenience method wrapping c.t.Fields::VALUES def last_grouping_fields Java::CascadingTuple::Fields::VALUES end # Computes fields formed by removing remove_fields from base_fields. Operates # only on named fields, not positional fields. 
+ # + # Example: + # base_fields = fields(['a', 'b', 'c']) + # remove_fields = fields(['b']) + # result_fields = difference_fields(base_fields, remove_fields) + # # result_fields.to_a == ['a', 'c'] def difference_fields(base_fields, remove_fields) fields(base_fields.to_a - remove_fields.to_a) end @@ -102,6 +173,13 @@ def dedup_fields(*fields) # Helper used by dedup_fields that operates on arrays of field names rather # than fields objects. + # + # Example: + # left_names = ['a', 'b'] + # mid_names = ['a', 'c'] + # right_names = ['a', 'd'] + # deduped_names = dedup_field_names(left_names, mid_names, right_names) + # # deduped_names == ['a', 'b', 'a_', 'c', 'a__', 'd'] def dedup_field_names(*names) names.inject([]) do |acc, arr| acc + arr.map{ |e| search_field_name(acc, e) } @@ -114,15 +192,14 @@ def search_field_name(names, candidate) private :search_field_name # Creates a TextLine scheme (can be used in both Cascading local and hadoop - # modes). Positional args are used if :source_fields is not - # provided. + # modes). Positional args are used if :source_fields is not provided. # # The named options are: # [source_fields] Fields to be read from a source with this scheme. Defaults # to ['offset', 'line']. # [sink_fields] Fields to be written to a sink with this scheme. Defaults to # all_fields. - # [compression] A symbol, either :enable or :disable, that + # [compression] A symbol, either :enable or :disable, that # governs the TextLine scheme's compression. Defaults to the # default TextLine compression (only applies to c.s.h.TextLine). def text_line_scheme(*args_with_options) @@ -151,15 +228,28 @@ def sequence_file_scheme(*fields) } end + # Convenience access to MultiTap.multi_source_tap. This constructor is more + # "DSL-like" because it allows you to pass taps directly as actual args rather + # than in an array: + # multi_source_tap tap1, tap2, tap3, ..., tapn + # + # See MultiTap.multi_source_tap for more details. 
def multi_source_tap(*taps) MultiTap.multi_source_tap(taps) end + # Convenience access to MultiTap.multi_sink_tap. This constructor is more + # "DSL-like" because it allows you to pass taps directly as actual args rather + # than in an array: + # multi_sink_tap tap1, tap2, tap3, ..., tapn + # + # See MultiTap.multi_sink_tap for more details. def multi_sink_tap(*taps) MultiTap.multi_sink_tap(taps) end - # Creates a Cascading::Tap given a path and optional :scheme and :sink_mode. + # Convenience constructor for a Tap, that accepts the same options as that + # class' constructor. See Tap for more details. def tap(path, options = {}) Tap.new(path, options) end diff --git a/lib/cascading/tap.rb b/lib/cascading/tap.rb index 528aa70..9ba4391 100644 --- a/lib/cascading/tap.rb +++ b/lib/cascading/tap.rb @@ -1,37 +1,52 @@ module Cascading - # A Cascading::BaseTap wraps up a pair of Cascading taps, one for Cascading - # local mode and the other for Hadoop mode. + # A BaseTap wraps up a pair of Cascading taps, one for Cascading local mode + # and the other for Hadoop mode. Note that these are optional, but at least + # one must be provided for most taps. A SequenceFile is a notable example of + # a Scheme for which there is no Cascading local mode version, so a Tap you + # build with it will have no local_tap. class BaseTap attr_reader :local_tap, :hadoop_tap + # Constructor that accepts the local_tap and hadoop_tap, which may be nil def initialize(local_tap, hadoop_tap) @local_tap = local_tap @hadoop_tap = hadoop_tap end + # Passes through printing the local_tap and hadoop_tap def to_s "Local: #{local_tap}, Hadoop: #{hadoop_tap}" end + # Returns false if the local_tap is nil, true otherwise def local? !local_tap.nil? end + # Returns false if the hadoop_tap is nil, true otherwise def hadoop? !hadoop_tap.nil? end end - # A Cascading::Tap represents a non-aggregate tap with a scheme, path, and - # optional sink_mode.
c.t.l.FileTap is used in Cascading local mode and - # c.t.h.Hfs is used in Hadoop mode. Whether or not these can be created is - # governed by the :scheme parameter, which must contain at least one of - # :local_scheme or :hadoop_scheme. Schemes like TextLine are supported in - # both modes (by Cascading), but SequenceFile is only supported in Hadoop - # mode. + # A Tap represents a non-aggregate tap with a scheme, path, and optional + # sink_mode. c.t.l.FileTap is used in Cascading local mode and c.t.h.Hfs is + # used in Hadoop mode. Whether or not these can be created is governed by the + # :scheme parameter, which must contain at least one of :local_scheme or + # :hadoop_scheme. Schemes like TextLine are supported in both modes (by + # Cascading), but SequenceFile is only supported in Hadoop mode. class Tap < BaseTap attr_reader :scheme, :path, :sink_mode + # Builds a Tap given a required path + # + # The named options are: + # [scheme] A Hash which must contain at least one of :local_scheme or + # :hadoop_scheme but may contain both. Default is + # text_line_scheme, which works in both modes. + # [sink_mode] A symbol or string that may be :keep, :replace, or :append, + # and corresponds to the c.t.SinkMode enumeration. The default + # is :keep, which matches Cascading's default. def initialize(path, options = {}) @path = path @@ -53,19 +68,28 @@ def initialize(path, options = {}) end end - # A Cascading::MultiTap represents one of Cascading's aggregate taps and is - # built via static constructors that accept an array of Cascading::Taps. In - # order for a mode (Cascading local or Hadoop) to be supported, all provided - # taps must support it. + # A MultiTap represents one of Cascading's aggregate taps and is built via + # static constructors that accept an array of Taps. In order for a mode + # (Cascading local or Hadoop) to be supported, all provided taps must support + # it. 
class MultiTap < BaseTap + # Do not call this constructor directly; instead, use one of + # MultiTap.multi_source_tap or MultiTap.multi_sink_tap. def initialize(local_tap, hadoop_tap) super(local_tap, hadoop_tap) end + # Static constructor that builds a MultiTap wrapping a c.t.MultiSourceTap + # from the given array of Taps. The resulting MultiTap will only be + # available in Cascading local mode or Hadoop mode if all input taps support + # them. def self.multi_source_tap(taps) multi_tap(taps, Java::CascadingTap::MultiSourceTap) end + # Static constructor that builds a MultiTap wrapping a c.t.MultiSinkTap from + # the given array of Taps. The resulting MultiTap will only be available in + # Cascading local mode or Hadoop mode if all input taps support them. def self.multi_sink_tap(taps) multi_tap(taps, Java::CascadingTap::MultiSinkTap) end From e74945343b02747655bc0214ccf66d37dc097844 Mon Sep 17 00:00:00 2001 From: Matt Walker Date: Wed, 24 Apr 2013 17:06:53 -0500 Subject: [PATCH 22/36] Allow Mode.parse to pass through an already-constructed Mode object to avoid confusion when people call cascade/flow constructors --- lib/cascading/mode.rb | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/lib/cascading/mode.rb b/lib/cascading/mode.rb index 91898e0..9b073b8 100644 --- a/lib/cascading/mode.rb +++ b/lib/cascading/mode.rb @@ -7,10 +7,13 @@ module Cascading class Mode attr_reader :local - # Hadoop mode is the default. You must explicitly request Cascading local - # mode with values 'local' or :local. + # Parses a specification of which mode, Cascading local mode or Hadoop mode, + # to execute in. Defaults to Hadoop mode. You may explicitly request + # Cascading local mode with values 'local' or :local. If you pass a Mode + # object to this method, it will be passed through. 
def self.parse(mode) case mode + when Mode then mode when 'local', :local then Mode.new(true) else Mode.new(false) end From bd793fcb6a72fee4c78d4b2f76d0a072307e8e37 Mon Sep 17 00:00:00 2001 From: Matt Walker Date: Wed, 24 Apr 2013 17:22:26 -0500 Subject: [PATCH 23/36] Document Cascade --- lib/cascading/cascade.rb | 59 +++++++++++++++++++++++++++++++------- lib/cascading/cascading.rb | 12 ++++---- lib/cascading/flow.rb | 29 +++++++++++++------ 3 files changed, 76 insertions(+), 24 deletions(-) diff --git a/lib/cascading/cascade.rb b/lib/cascading/cascade.rb index 4246ccb..7951e87 100644 --- a/lib/cascading/cascade.rb +++ b/lib/cascading/cascade.rb @@ -2,6 +2,13 @@ require 'yaml' module Cascading + # A Cascade wraps a c.c.Cascade. A Cascade is composed of Flows, which are + # constructed using the Cascade#flow method within the block passed to the + # Cascading::cascade constructor. Many flows may be nested within a Cascade. + # + # Note that you are not required to use a Cascade to wrap your job. Instead, + # you could start with a top-level Flow, which you might prefer if you have + # no need of a c.c.Cascade's make-like semantics wrt sinks. class Cascade < Cascading::Node extend Registerable @@ -10,13 +17,16 @@ class Cascade < Cascading::Node # Do not use this constructor directly; instead, use Cascading::cascade to # build cascades. # - # Builds a cascade given the specified name. Optionally accepts - # :properties which will be used as the default properties for all child - # flows. Properties must be a Ruby Hash with string keys and values and - # will be copied before being passed into each flow in the cascade. See - # Cascading::Flow#initialize for details on how flows handle properties. - # Optionally accepts a :mode which will be used as the default mode for all - # child flows. See Cascading::Mode.parse for details. + # Builds a Cascade given the specified name. 
+ # + # The named options are: + # [properties] Properties hash which will be used as the default properties + # for all child flows. Properties must be a Ruby Hash with + # string keys and values and will be copied before being + # passed into each flow in the cascade. See Flow#initialize + # for details on how flows handle properties. + # [mode] Mode which will be used as the default mode for all child flows. + # See Mode.parse for details. def initialize(name, options = {}) @properties = options[:properties] || {} @mode = options[:mode] @@ -24,10 +34,23 @@ def initialize(name, options = {}) self.class.add(name, self) end - # Builds a child flow given a name and block. Optionally accepts - # :properties which will override the default properties stroed in this - # cascade. Optionally accepts a :mode, which will override the default - # mode stored in this cascade. + # Builds a child Flow in this Cascade given a name and block. + # + # The named options are: + # [properties] Properties hash which will override the default properties + # stored in this cascade. + # [mode] Mode which will override the default mode stored in this cascade. + # + # Example: + # cascade 'wordcount', :mode => :local do + # flow 'first_step' do + # ... + # end + # + # flow 'second_step' do + # ... + # end + # end def flow(name, options = {}, &block) raise "Could not build flow '#{name}'; block required" unless block_given? @@ -40,16 +63,26 @@ def flow(name, options = {}, &block) flow end + # Produces a textual description of this Cascade. The description details + # the structure of the Cascade, the sources and sinks of each Flow, and the + # input and output fields of each Assembly. The offset parameter allows + # for this describe to be nested within a calling context, which lets us + # indent the structural hierarchy of a job. 
def describe(offset = '') "#{offset}#{name}:cascade\n#{child_names.map{ |child| children[child].describe("#{offset} ") }.join("\n")}" end + # Writes out the DOT file describing the structure of this Cascade. + # + # NOTE: will be at Job in later version and also present on Flow def draw(dir) @children.each do |name, flow| flow.connect.writeDOT("#{dir}/#{name}.dot") end end + # Builds a map, keyed by flow name, of the sink metadata for each child + # flow. Currently, this contains only the field names of each sink. def sink_metadata @children.inject({}) do |sink_fields, (name, flow)| sink_fields[name] = flow.sink_metadata @@ -57,12 +90,16 @@ def sink_metadata end end + # Writes the mapping produced by Cascade#sink_metadata to a file at the + # given path in YAML. def write_sink_metadata(file_name) File.open(file_name, 'w') do |file| YAML.dump(sink_metadata, file) end end + # Connects this Cascade, producing a c.c.Cascade, which is then completed, + # executing it. Child flows are connected, so no parameters are required. def complete begin Java::CascadingCascade::CascadeConnector.new.connect(name, make_flows(@children)).complete diff --git a/lib/cascading/cascading.rb b/lib/cascading/cascading.rb index 9d4755a..59846c4 100644 --- a/lib/cascading/cascading.rb +++ b/lib/cascading/cascading.rb @@ -51,9 +51,10 @@ module Cascading # directly building their own cascades and flows so that jading can send them # default properties. - # Builds a top-level cascade given a name and a block. + # Builds a top-level Cascade given a name and a block. # # The named options are: + # [properties] See Cascade#initialize # [mode] See Cascade#initialize # # Example: @@ -77,11 +78,12 @@ def cascade(name, options = {}, &block) cascade end - # Builds a top-level flow given a name and block for applications built of + # Builds a top-level Flow given a name and block for applications built of # flows with no cascades. 
# # The named options are: - # [mode] See Cascade#initialize + # [properties] See Flow#initialize + # [mode] See Flow#initialize # # Example: # flow 'wordcount', :mode => :local do @@ -105,8 +107,8 @@ def flow(name, options = {}, &block) end # Produces a textual description of all Cascades in the global registry. The - # description details structure, sources, sinks, and the input and output - # fields of each assembly. + # description details the structure of the Cascades, the sources and sinks of + # each Flow, and the input and output fields of each Assembly. # # NOTE: will be moved to Job in later version def describe diff --git a/lib/cascading/flow.rb b/lib/cascading/flow.rb index cc37c87..a76271a 100644 --- a/lib/cascading/flow.rb +++ b/lib/cascading/flow.rb @@ -1,6 +1,10 @@ require 'cascading/assembly' module Cascading + # A Flow wraps a c.f.Flow. A Flow is composed of Assemblies, which are + # constructed using the Flow#assembly method within the block passed to the + # Cascading::flow or Cascade#flow constructor. Many Assemblies may be nested + # within a Flow. class Flow < Cascading::Node extend Registerable @@ -10,14 +14,16 @@ class Flow < Cascading::Node # Do not use this constructor directly. Instead, use Cascading::flow to # build top-level flows and Cascade#flow to build flows within a Cascade. # - # Builds a flow given a name and a parent node (a cascade or nil). - # Optionally accepts :properties which allows external configuration of - # this flow. The flow will side-effect the properties during composition, - # then pass the modified properties along to the FlowConnector for - # execution. See Cascading::Cascade#initialize for details on how - # properties are propagated through cascades. Optionally accepts a :mode - # which will determine the execution mode of this flow. See - # Cascading::Mode.parse for details. + # Builds a Flow given a name and a parent node (a Cascade or nil). 
+ # + # The named options are: + # [properties] Properties hash which allows external configuration of this + # flow. The flow will side-effect the properties during + # composition, then pass the modified properties along to the + # FlowConnector for execution. See Cascade#initialize for + # details on how properties are propagated through cascades. + # [mode] Mode which will determine the execution mode of this flow. See + # Mode.parse for details. def initialize(name, parent, options = {}) @sources, @sinks, @incoming_scopes, @outgoing_scopes, @listeners = {}, {}, {}, {}, [] @properties = options[:properties] || {} @@ -49,6 +55,11 @@ def sink(name, tap) sinks[name] = tap end + # Produces a textual description of this Flow. The description details the + # structure of the Flow, its sources and sinks, and the input and output + # fields of each Assembly. The offset parameter allows for this describe + # to be nested within a calling context, which lets us indent the + # structural hierarchy of a job. def describe(offset = '') description = "#{offset}#{name}:flow\n" description += "#{sources.keys.map{ |source| "#{offset} #{source}:source :: #{incoming_scopes[source].values_fields.to_a.inspect}" }.join("\n")}\n" @@ -69,6 +80,8 @@ def debug_scope(name = nil) puts "Scope for '#{name}':\n #{scope}" end + # Builds a map, keyed by sink name, of the sink metadata for each sink. + # Currently, this contains only the field names of each sink. 
def sink_metadata @sinks.keys.inject({}) do |sink_metadata, sink_name| raise "Cannot sink undefined assembly '#{sink_name}'" unless @outgoing_scopes[sink_name] From bb70b9d6801e12bac177066f4f6f03cd812cc882 Mon Sep 17 00:00:00 2001 From: Matt Walker Date: Thu, 25 Apr 2013 13:39:45 -0500 Subject: [PATCH 24/36] Document Flow; also fixes the cascading.spill.list.threshold property, which was using the deprecated cascading.cogroup.spill.threshold --- lib/cascading/assembly.rb | 34 +++++++++++++++++- lib/cascading/cascade.rb | 2 +- lib/cascading/flow.rb | 74 ++++++++++++++++++++++++++++++++------- 3 files changed, 96 insertions(+), 14 deletions(-) diff --git a/lib/cascading/assembly.rb b/lib/cascading/assembly.rb index a304dc3..8638eb4 100644 --- a/lib/cascading/assembly.rb +++ b/lib/cascading/assembly.rb @@ -35,6 +35,16 @@ module Cascading class Assembly < Cascading::Node attr_reader :head_pipe, :tail_pipe + # Do not use this constructor directly; instead, use Flow#assembly or + # Assembly#branch to build assemblies. + # + # Builds an Assembly given a name, parent, and optional outgoing_scopes + # (necessary only for branching). + # + # An assembly's name is quite important as it will determine: + # * The sources from which it will read, if any + # * The name to be used in joins or unions downstream + # * The name to be used to sink the output of the assembly downstream def initialize(name, parent, outgoing_scopes = {}) super(name, parent) @@ -248,7 +258,29 @@ def outer_join(*args_with_options, &block) join(*args_with_options, &block) end - # Builds a new branch. + # Builds a child Assembly that branches this Assembly given a name and + # block. + # + # An assembly's name is quite important as it will determine: + # * The sources from which it will read, if any + # * The name to be used in joins or unions downstream + # * The name to be used to sink the output of the assembly downstream + # + # Many branches may be built within an assembly. 
The result of a branch is + # the same as the Flow#assembly constructor, an Assembly object. + # + # Example: + # assembly 'some_work' do + # ... + # + # branch 'more_work' do + # ... + # end + # + # branch 'yet_more_work' do + # ... + # end + # end def branch(name, &block) raise "Could not build branch '#{name}'; block required" unless block_given? assembly = Assembly.new(name, self, @outgoing_scopes) diff --git a/lib/cascading/cascade.rb b/lib/cascading/cascade.rb index 7951e87..1fa3138 100644 --- a/lib/cascading/cascade.rb +++ b/lib/cascading/cascade.rb @@ -17,7 +17,7 @@ class Cascade < Cascading::Node # Do not use this constructor directly; instead, use Cascading::cascade to # build cascades. # - # Builds a Cascade given the specified name. + # Builds a Cascade given a name. # # The named options are: # [properties] Properties hash which will be used as the default properties diff --git a/lib/cascading/flow.rb b/lib/cascading/flow.rb index a76271a..8254264 100644 --- a/lib/cascading/flow.rb +++ b/lib/cascading/flow.rb @@ -33,6 +33,27 @@ def initialize(name, parent, options = {}) self.class.add(name, self) end + # Builds a child Assembly in this Flow given a name and block. + # + # An assembly's name is quite important as it will determine: + # * The sources from which it will read, if any + # * The name to be used in joins or unions downstream + # * The name to be used to sink the output of the assembly downstream + # + # Many assemblies may be built within a flow. The Assembly#branch method + # is used for creating nested assemblies and produces objects of the same + # type as this constructor. + # + # Example: + # flow 'wordcount', :mode => :local do + # assembly 'first_step' do + # ... + # end + # + # assembly 'second_step' do + # ... + # end + # end def assembly(name, &block) raise "Could not build assembly '#{name}'; block required" unless block_given? 
assembly = Assembly.new(name, self, @outgoing_scopes) @@ -68,12 +89,20 @@ def describe(offset = '') description end + # Accesses the outgoing scope of this Flow at the point at which it is + # called by default, or for the child specified by the given name, if + # specified. This is useful for grabbing the values_fields at any point in + # the construction of the Flow. See Scope for details. def scope(name = nil) raise 'Must specify name if no children have been defined yet' unless name || last_child name ||= last_child.name @outgoing_scopes[name] end + # Prints information about the scope of this Flow at the point at which it + # is called by default, or for the child specified by the given name, if + # specified. This allows you to trace the propagation of field names + # through your job and is handy for debugging. See Scope for details. def debug_scope(name = nil) scope = scope(name) name ||= last_child.name @@ -92,7 +121,16 @@ def sink_metadata end end - # TODO: support all codecs, support list of codecs + # Property modifier that sets the codec and type of the compression for all + # sinks in this flow. Currently only supports o.a.h.i.c.DefaultCodec and + # o.a.h.i.c.GzipCodec, and the NONE, RECORD, or BLOCK compression + # types defined in o.a.h.i.SequenceFile. + # + # codec may be symbols like :default or :gzip and type may be symbols like + # :none, :record, or :block. + # + # Example: + # compress_output :default, :block def compress_output(codec, type) properties['mapred.output.compress'] = 'true' properties['mapred.output.compression.codec'] = case codec @@ -108,22 +146,28 @@ def compress_output(codec, type) end end + # Set the cascading.spill.list.threshold property in this flow's + # properties. See c.t.c.SpillableProps for details.
def set_spill_threshold(threshold) - properties['cascading.cogroup.spill.threshold'] = threshold.to_s + properties['cascading.spill.list.threshold'] = threshold.to_s end + # Adds the given path to the mapred.cache.files list property. def add_file_to_distributed_cache(file) add_to_distributed_cache(file, "mapred.cache.files") end + # Adds the given path to the mapred.cache.archives list property. def add_archive_to_distributed_cache(file) add_to_distributed_cache(file, "mapred.cache.archives") end + # Appends a FlowListener to the list of listeners for this flow. def add_listener(listener) @listeners << listener end + # Handles locating a file cached from S3 on local disk. TODO: remove def emr_local_path_for_distributed_cache_file(file) # NOTE this needs to be *appended* to the property mapred.local.dir if file =~ /^s3n?:\/\// @@ -135,16 +179,9 @@ def emr_local_path_for_distributed_cache_file(file) end end - def add_to_distributed_cache(file, property) - v = properties[property] - - if v - properties[property] = [v.split(/,/), file].flatten.join(",") - else - properties[property] = file - end - end - + # Connects this Flow, producing a c.f.Flow without completing it (the Flow + # is not executed). This method is used by Cascade to connect its child + # Flows. To connect and complete a Flow, see Flow#complete. def connect puts "Connecting flow '#{name}' with properties:" properties.keys.sort.each do |key| @@ -162,6 +199,9 @@ def connect mode.connect_flow(properties, name, sources, sinks, pipes) end + # Completes this Flow after connecting it. This results in execution of + # the c.f.Flow built from this Flow. Use this method when executing a + # top-level Flow. 
def complete begin flow = connect @@ -174,6 +214,16 @@ def complete private + def add_to_distributed_cache(file, property) + v = properties[property] + + if v + properties[property] = [v.split(/,/), file].flatten.join(",") + else + properties[property] = file + end + end + def make_tap_parameter(taps, pipe_accessor) taps.inject({}) do |map, (name, tap)| assembly = find_child(name) From c41ebceca74c453813876b04e160f122f3218c4f Mon Sep 17 00:00:00 2001 From: Matt Walker Date: Thu, 25 Apr 2013 16:22:43 -0500 Subject: [PATCH 25/36] Document Assembly --- lib/cascading/assembly.rb | 469 +++++++++++++++++++++++++++----------- 1 file changed, 330 insertions(+), 139 deletions(-) diff --git a/lib/cascading/assembly.rb b/lib/cascading/assembly.rb index 8638eb4..8733423 100644 --- a/lib/cascading/assembly.rb +++ b/lib/cascading/assembly.rb @@ -62,6 +62,11 @@ def initialize(name, parent, outgoing_scopes = {}) @incoming_scopes = [scope] end + # Produces a textual description of this Assembly. The description details + # the structure of the Assembly, its input and output fields and any + # children (branches). The offset parameter allows for this describe to be + # nested within a calling context, which lets us indent the structural + # hierarchy of a job. def describe(offset = '') incoming_scopes_desc = "#{@incoming_scopes.map{ |incoming_scope| incoming_scope.values_fields.to_a.inspect }.join(', ')}" incoming_scopes_desc = "(#{incoming_scopes_desc})" unless @incoming_scopes.size == 1 @@ -70,151 +75,63 @@ def describe(offset = '') description end + # Rather than the immediate parent, this method returns the parent flow of + # this Assembly. If this is a branch, we must traverse the parents of + # parent assemblies. def parent_flow return parent if parent.kind_of?(Flow) parent.parent_flow end + # Accesses the outgoing scope of this Assembly at the point at which it is + # called. This is useful for grabbing the values_fields at any point in + # the construction of the Assembly. 
See Scope for details. def scope @outgoing_scopes[name] end + # Prints information about the scope of this Assembly at the point at which + # it is called. This allows you to trace the propagation of field names + # through your job and is handy for debugging. See Scope for details. def debug_scope puts "Current scope for '#{name}':\n #{scope}\n----------\n" end - def make_pipe(type, parameters) - @tail_pipe = type.new(*parameters) - @outgoing_scopes[name] = Scope.outgoing_scope(tail_pipe, [scope]) - - tail_pipe - end - private :make_pipe - - def populate_incoming_scopes(assembly_names, group_fields_args = {}) - # NOTE: this overrides the existing incoming_scopes, which changes the - # way describe will function on this assembly - pipes, @incoming_scopes, group_fields = [], [], [] - assembly_names.each do |assembly_name| - assembly = parent_flow.find_child(assembly_name) - raise "Could not find assembly '#{assembly_name}' from '#{name}'" unless assembly - - pipes << assembly.tail_pipe - @incoming_scopes << assembly.scope - group_fields << fields(group_fields_args[assembly_name]) if group_fields_args[assembly_name] - end - [pipes, group_fields] - end - private :populate_incoming_scopes - - def apply_aggregations(group, incoming_scopes, &block) - aggregations = Aggregations.new(self, group, incoming_scopes) - aggregations.instance_eval(&block) if block_given? - - # Sorting of any type means that we cannot use the AggregateBy optimization - if aggregations.can_aggregate_by? 
&& !group.is_sorted && !group.is_sort_reversed - grouping_fields = group.key_selectors.values.first - group.key_selectors.values.each do |key_fields| - raise "Grouping fields mismatch: #{grouping_fields} expected; #{key_fields} found from #{group.key_selectors}" unless key_fields == grouping_fields - end - - aggregate_by = sub_assembly(Java::CascadingPipeAssembly::AggregateBy.new( - name, - group.previous, - grouping_fields, - aggregations.aggregate_bys.to_java(Java::CascadingPipeAssembly::AggregateBy) - ), group.previous, incoming_scopes) - - aggregate_by - else - aggregations.finalize if block_given? - @tail_pipe = aggregations.tail_pipe - @outgoing_scopes[name] = aggregations.scope - - group - end - end - private :apply_aggregations - + # Prints detail about this Assembly including its name, head pipe, and tail + # pipe. def to_s "#{name} : head pipe : #{head_pipe} - tail pipe: #{tail_pipe}" end - def prepare_join(assembly_names, options, &block) - pipes, _ = populate_incoming_scopes(assembly_names) - - group_fields_args = options[:on] - raise 'join requires :on parameter' unless group_fields_args - - if group_fields_args.kind_of?(String) - group_fields_args = [group_fields_args] - end - - group_fields = [] - if group_fields_args.kind_of?(Array) - pipes.size.times do - group_fields << fields(group_fields_args) - end - elsif group_fields_args.kind_of?(Hash) - pipes, group_fields = populate_incoming_scopes(group_fields_args.keys.sort, group_fields_args) - else - raise "Unsupported data type for :on in join: '#{group_fields_args.class}'" - end - - raise 'join requires non-empty :on parameter' if group_fields_args.empty? 
- group_fields = group_fields.to_java(Java::CascadingTuple::Fields) - incoming_fields = @incoming_scopes.map{ |s| s.values_fields } - declared_fields = fields(options[:declared_fields] || dedup_fields(*incoming_fields)) - joiner = options[:joiner] - is_hash_join = options[:hash] || false - - case joiner - when :inner, 'inner', nil - joiner = Java::CascadingPipeJoiner::InnerJoin.new - when :left, 'left' - joiner = Java::CascadingPipeJoiner::LeftJoin.new - when :right, 'right' - joiner = Java::CascadingPipeJoiner::RightJoin.new - when :outer, 'outer' - joiner = Java::CascadingPipeJoiner::OuterJoin.new - when Array - joiner = joiner.map do |t| - case t - when true, 1, :inner then true - when false, 0, :outer then false - else fail "invalid mixed joiner entry: #{t}" - end - end - joiner = Java::CascadingPipeJoiner::MixedJoin.new(joiner.to_java(:boolean)) - end - - if is_hash_join - raise ArgumentError, "hash joins don't support aggregations" if block_given? - parameters = [ - pipes.to_java(Java::CascadingPipe::Pipe), - group_fields, - declared_fields, - joiner - ] - group_assembly = Java::CascadingPipe::HashJoin.new(*parameters) - else - result_group_fields = dedup_fields(*group_fields) - parameters = [ - pipes.to_java(Java::CascadingPipe::Pipe), - group_fields, - declared_fields, - result_group_fields, - joiner - ] - group_assembly = Java::CascadingPipe::CoGroup.new(*parameters) - end - apply_aggregations(group_assembly, @incoming_scopes, &block) - end - private :prepare_join - # Builds a HashJoin pipe. This should be used carefully, as the right side - # of the join is accumulated entirely in memory. Requires a list of assembly - # names to join and :on to specify the join_fields. + # of the join is accumulated entirely in memory. Requires a list of + # assembly names to join and :on to specify the join_fields. 
Note that a + # hash_join "takes over" the Assembly in which it is built, so it is + # typically the first statement within the block of the assembly or branch. + # The block passed to this method will be evaluated in the context of + # Aggregations, not Assembly. + # + # The named options are: + # [on] The keys of the join, an array of strings if they are the same in + # all inputs, or a hash mapping assembly names to key names if they + # differ across inputs. + # [declared_fields] By default, a deduplicated array of incoming field + # names (see Cascading::dedup_fields). Specifies the + # names of the fields that will be available to + # aggregations or post-join if no aggregations are + # specified. + # [joiner] A specification of the c.p.j.Joiner to use. Values like :inner + # and 'inner', :right and 'right' are accepted, as well as an + # array specifying mixed joins. Typically, this is not provided, + # but one of the higher level join methods on Assembly is used + # directly (like Assembly#inner_join or Assembly#right_join). + # + # Example: + # assembly 'join_left_right' do + # hash_join 'left', 'right', :on => ['key1', 'key2'], :joiner => :inner do + # sum 'val1', 'val2', :type => :long + # end + # end def hash_join(*args_with_options, &block) options, assembly_names = args_with_options.extract_options!, args_with_options options[:hash] = true @@ -222,7 +139,32 @@ def hash_join(*args_with_options, &block) end # Builds a join (CoGroup) pipe. Requires a list of assembly names to join - # and :on to specify the group_fields. + # and :on to specify the group_fields. Note that a join "takes over" the + # Assembly in which it is built, so it is typically the first statement + # within the block of the assembly or branch. The block passed to this + # method will be evaluated in the context of Aggregations, not Assembly. 
+ # + # The named options are: + # [on] The keys of the join, an array of strings if they are the same in + # all inputs, or a hash mapping assembly names to key names if they + # differ across inputs. + # [declared_fields] By default, a deduplicated array of incoming field + # names (see Cascading::dedup_fields). Specifies the + # names of the fields that will be available to + # aggregations or post-join if no aggregations are + # specified. + # [joiner] A specification of the c.p.j.Joiner to use. Values like :inner + # and 'inner', :right and 'right' are accepted, as well as an + # array specifying mixed joins. Typically, this is not provided, + # but one of the higher level join methods on Assembly is used + # directly (like Assembly#inner_join or Assembly#right_join). + # + # Example: + # assembly 'join_left_right' do + # join 'left', 'right', :on => ['key1', 'key2'], :joiner => :inner do + # sum 'val1', 'val2', :type => :long + # end + # end def join(*args_with_options, &block) options, assembly_names = args_with_options.extract_options!, args_with_options options[:hash] = false @@ -230,6 +172,25 @@ def join(*args_with_options, &block) end alias co_group join + # Builds an inner join (CoGroup) pipe. Requires a list of assembly names to + # join and :on to specify the group_fields. + # + # The named options are: + # [on] The keys of the join, an array of strings if they are the same in + # all inputs, or a hash mapping assembly names to key names if they + # differ across inputs. + # [declared_fields] By default, a deduplicated array of incoming field + # names (see Cascading::dedup_fields). Specifies the + # names of the fields that will be available to + # aggregations or post-join if no aggregations are + # specified. 
+ # + # Example: + # assembly 'join_left_right' do + # inner_join 'left', 'right', :on => ['key1', 'key2'] do + # sum 'val1', 'val2', :type => :long + # end + # end def inner_join(*args_with_options, &block) options = args_with_options.extract_options! options[:joiner] = :inner @@ -237,6 +198,25 @@ def inner_join(*args_with_options, &block) join(*args_with_options, &block) end + # Builds a left join (CoGroup) pipe. Requires a list of assembly names to + # join and :on to specify the group_fields. + # + # The named options are: + # [on] The keys of the join, an array of strings if they are the same in + # all inputs, or a hash mapping assembly names to key names if they + # differ across inputs. + # [declared_fields] By default, a deduplicated array of incoming field + # names (see Cascading::dedup_fields). Specifies the + # names of the fields that will be available to + # aggregations or post-join if no aggregations are + # specified. + # + # Example: + # assembly 'join_left_right' do + # left_join 'left', 'right', :on => ['key1', 'key2'] do + # sum 'val1', 'val2', :type => :long + # end + # end def left_join(*args_with_options, &block) options = args_with_options.extract_options! options[:joiner] = :left @@ -244,6 +224,25 @@ def left_join(*args_with_options, &block) join(*args_with_options, &block) end + # Builds a right join (CoGroup) pipe. Requires a list of assembly names to + # join and :on to specify the group_fields. + # + # The named options are: + # [on] The keys of the join, an array of strings if they are the same in + # all inputs, or a hash mapping assembly names to key names if they + # differ across inputs. + # [declared_fields] By default, a deduplicated array of incoming field + # names (see Cascading::dedup_fields). Specifies the + # names of the fields that will be available to + # aggregations or post-join if no aggregations are + # specified.
+ # + # Example: + # assembly 'join_left_right' do + # right_join 'left', 'right', :on => ['key1', 'key2'] do + # sum 'val1', 'val2', :type => :long + # end + # end def right_join(*args_with_options, &block) options = args_with_options.extract_options! options[:joiner] = :right @@ -251,6 +250,25 @@ def right_join(*args_with_options, &block) join(*args_with_options, &block) end + # Builds an outer join (CoGroup) pipe. Requires a list of assembly names to + # join and :on to specify the group_fields. + # + # The named options are: + # [on] The keys of the join, an array of strings if they are the same in + # all inputs, or a hash mapping assembly names to key names if they + # differ across inputs. + # [declared_fields] By default, a deduplicated array of incoming field + # names (see Cascading::dedup_fields). Specifies the + # names of the fields that will be available to + # aggregations or post-join if no aggregations are + # specified. + # + # Example: + # assembly 'join_left_right' do + # outer_join 'left', 'right', :on => ['key1', 'key2'] do + # sum 'val1', 'val2', :type => :long + # end + # end def outer_join(*args_with_options, &block) options = args_with_options.extract_options! options[:joiner] = :outer @@ -290,8 +308,23 @@ def branch(name, &block) end # Builds a new GroupBy pipe that groups on the fields given in - # args_with_options. Any block passed to this method should contain only - # Everies. + # args_with_options. The block passed to this method will be evaluated in + # the context of Aggregations, not Assembly. + # + # The named options are: + # [sort_by] Optional keys for within-group sort. + # [reverse] Boolean that can reverse the order of within-group sorting + # (only makes sense given :sort_by keys). + # + # Example: + # assembly 'total' do + # ... 
+ # insert 'const' => 1 + # group_by 'const' do + # count + # sum 'val1', 'val2', :type => :long + # end + # end def group_by(*args_with_options, &block) options, group_fields = args_with_options.extract_options!, fields(args_with_options) sort_fields = fields(options[:sort_by]) @@ -304,9 +337,24 @@ def group_by(*args_with_options, &block) # Unifies multiple incoming pipes sharing the same field structure using a # GroupBy. Accepts :on like join and :sort_by and :reverse like group_by, # as well as a block which may be used for a sequence of Every - # aggregations. + # aggregations. The block passed to this method will be evaluated in the + # context of Aggregations, not Assembly. # # By default, groups only on the first field (see line 189 of GroupBy.java) + # + # The named options are: + # [on] The keys of the union, which defaults to the first field in the + # first input assembly. + # [sort_by] Optional keys for sorting. + # [reverse] Boolean that can reverse the order of sorting + # (only makes sense given :sort_by keys). + # + # Example: + # assembly 'union_left_right' do + # union 'left', 'right' do + # sum 'val1', 'val2', :type => :long + # end + # end def union(*args_with_options, &block) options, assembly_names = args_with_options.extract_options!, args_with_options group_fields = fields(options[:on]) @@ -326,10 +374,15 @@ def union(*args_with_options, &block) end alias :union_pipes :union - # Allows you to plugin c.p.SubAssemblies to a cascading.jruby Assembly - # under certain assumptions. Note the default is to extend the tail pipe - # of this Assembly using a linear SubAssembly. See SubAssembly class for - # details. + # Allows you to plugin c.p.SubAssemblies to an Assembly under certain + # assumptions. Note the default is to extend the tail pipe of this + # Assembly using a linear SubAssembly. See SubAssembly class for details. + # + # Example: + # assembly 'id_rows' do + # ... 
+ # sub_assembly Java::CascadingPipeAssembly::Discard.new(tail_pipe, fields('id')) + # end def sub_assembly(sub_assembly, pipes = [tail_pipe], incoming_scopes = [scope]) sub_assembly = SubAssembly.new(self, sub_assembly) sub_assembly.finalize(pipes, incoming_scopes) @@ -340,9 +393,19 @@ def sub_assembly(sub_assembly, pipes = [tail_pipe], incoming_scopes = [scope]) sub_assembly end - # Builds a basic each pipe, and adds it to the current assembly. + # Builds a basic each pipe and adds it to the current Assembly. Default + # arguments are all_fields, a default inherited from c.o.Each. Exactly one + # of :function and :filter must be specified and filters do not support an + # :output selector. + # + # The named options are: + # [filter] A Cascading Filter, mutually exclusive with :function. + # [function] A Cascading Function, mutually exclusive with :filter. + # [output] c.p.Each output selector, only valid with :function. # - # Default arguments are all_fields, a default inherited from c.o.Each. + # Example: + # each fields(input_fields), :function => Java::CascadingOperation::Identity.new + # each 'field1', 'field2', :function => Java::CascadingOperation::Identity.new def each(*args_with_options) options, in_fields = args_with_options.extract_options!, fields(args_with_options) out_fields = fields(options[:output]) # Default Fields.RESULTS from c.o.Each @@ -364,6 +427,11 @@ def each(*args_with_options) include RegexOperations include TextOperations + # Builds an each assertion pipe given a c.o.a.Assertion and adds it to the + # current Assembly. + # + # The named options are: + # [level] The assertion level; defaults to strict. def assert(assertion, options = {}) assertion_level = options[:level] || Java::CascadingOperation::AssertionLevel::STRICT @@ -371,16 +439,139 @@ def assert(assertion, options = {}) make_pipe(Java::CascadingPipe::Each, parameters) end - # Builds a pipe that assert the size of the tuple is the size specified in parameter. 
+ # Builds a pipe that asserts the size of the tuple is the specified size. def assert_size_equals(size, options = {}) assertion = Java::CascadingOperationAssertion::AssertSizeEquals.new(size) assert(assertion, options) end - # Builds a pipe that assert the none of the fields in the tuple are null. + # Builds a pipe that asserts none of the fields in the tuple are null. def assert_not_null(options = {}) assertion = Java::CascadingOperationAssertion::AssertNotNull.new assert(assertion, options) end + + private + + def make_pipe(type, parameters) + @tail_pipe = type.new(*parameters) + @outgoing_scopes[name] = Scope.outgoing_scope(tail_pipe, [scope]) + + tail_pipe + end + + def populate_incoming_scopes(assembly_names, group_fields_args = {}) + # NOTE: this overrides the existing incoming_scopes, which changes the + # way describe will function on this assembly + pipes, @incoming_scopes, group_fields = [], [], [] + assembly_names.each do |assembly_name| + assembly = parent_flow.find_child(assembly_name) + raise "Could not find assembly '#{assembly_name}' from '#{name}'" unless assembly + + pipes << assembly.tail_pipe + @incoming_scopes << assembly.scope + group_fields << fields(group_fields_args[assembly_name]) if group_fields_args[assembly_name] + end + [pipes, group_fields] + end + + def apply_aggregations(group, incoming_scopes, &block) + aggregations = Aggregations.new(self, group, incoming_scopes) + aggregations.instance_eval(&block) if block_given? + + # Sorting of any type means that we cannot use the AggregateBy optimization + if aggregations.can_aggregate_by?
&& !group.is_sorted && !group.is_sort_reversed + grouping_fields = group.key_selectors.values.first + group.key_selectors.values.each do |key_fields| + raise "Grouping fields mismatch: #{grouping_fields} expected; #{key_fields} found from #{group.key_selectors}" unless key_fields == grouping_fields + end + + aggregate_by = sub_assembly(Java::CascadingPipeAssembly::AggregateBy.new( + name, + group.previous, + grouping_fields, + aggregations.aggregate_bys.to_java(Java::CascadingPipeAssembly::AggregateBy) + ), group.previous, incoming_scopes) + + aggregate_by + else + aggregations.finalize if block_given? + @tail_pipe = aggregations.tail_pipe + @outgoing_scopes[name] = aggregations.scope + + group + end + end + + def prepare_join(assembly_names, options, &block) + pipes, _ = populate_incoming_scopes(assembly_names) + + group_fields_args = options[:on] + raise 'join requires :on parameter' unless group_fields_args + + if group_fields_args.kind_of?(String) + group_fields_args = [group_fields_args] + end + + group_fields = [] + if group_fields_args.kind_of?(Array) + pipes.size.times do + group_fields << fields(group_fields_args) + end + elsif group_fields_args.kind_of?(Hash) + pipes, group_fields = populate_incoming_scopes(group_fields_args.keys.sort, group_fields_args) + else + raise "Unsupported data type for :on in join: '#{group_fields_args.class}'" + end + + raise 'join requires non-empty :on parameter' if group_fields_args.empty? 
+ group_fields = group_fields.to_java(Java::CascadingTuple::Fields) + incoming_fields = @incoming_scopes.map{ |s| s.values_fields } + declared_fields = fields(options[:declared_fields] || dedup_fields(*incoming_fields)) + joiner = options[:joiner] + is_hash_join = options[:hash] || false + + case joiner + when :inner, 'inner', nil + joiner = Java::CascadingPipeJoiner::InnerJoin.new + when :left, 'left' + joiner = Java::CascadingPipeJoiner::LeftJoin.new + when :right, 'right' + joiner = Java::CascadingPipeJoiner::RightJoin.new + when :outer, 'outer' + joiner = Java::CascadingPipeJoiner::OuterJoin.new + when Array + joiner = joiner.map do |t| + case t + when true, 1, :inner then true + when false, 0, :outer then false + else fail "invalid mixed joiner entry: #{t}" + end + end + joiner = Java::CascadingPipeJoiner::MixedJoin.new(joiner.to_java(:boolean)) + end + + if is_hash_join + raise ArgumentError, "hash joins don't support aggregations" if block_given? + parameters = [ + pipes.to_java(Java::CascadingPipe::Pipe), + group_fields, + declared_fields, + joiner + ] + group_assembly = Java::CascadingPipe::HashJoin.new(*parameters) + else + result_group_fields = dedup_fields(*group_fields) + parameters = [ + pipes.to_java(Java::CascadingPipe::Pipe), + group_fields, + declared_fields, + result_group_fields, + joiner + ] + group_assembly = Java::CascadingPipe::CoGroup.new(*parameters) + end + apply_aggregations(group_assembly, @incoming_scopes, &block) + end end end From c98bda2b23eda6a2f419003be16b52c4c15e3810 Mon Sep 17 00:00:00 2001 From: Matt Walker Date: Fri, 26 Apr 2013 06:25:20 -0500 Subject: [PATCH 26/36] Document Node --- lib/cascading/base.rb | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/lib/cascading/base.rb b/lib/cascading/base.rb index 80c30aa..bedb5c6 100644 --- a/lib/cascading/base.rb +++ b/lib/cascading/base.rb @@ -1,7 +1,22 @@ module Cascading + # A Node is a Cascade, Flow, or Assembly, all of which are composite + # structures 
that describe the hierarchical structure of your job. A Cascade + # may contain many Flows and a Flow and Assembly may contain many Assemblies + # (branches in the case of the Assembly). Nodes are named, contain parent + # and child pointers, and keep track of their children both by name and by + # insertion order. + # + # Nodes must be uniquely named within the scope of their parent so that they + # unambiguously looked up for connecting pipes within a flow. However, we + # only ensure that children are uniquely named upon insertion; full + # uniqueness isn't required until Node#find_child is called (this allows for + # name reuse in a few limited circumstances that was important when migrating + # the Etsy workload to enforce these constraints). class Node attr_accessor :name, :parent, :children, :child_names, :last_child + # A Node requires a name and a parent when it is constructed. Children are + # added later with Node#add_child. def initialize(name, parent) @name = name @parent = parent @@ -23,10 +38,15 @@ def add_child(node) node end + # The qualified name of a node is formed from the name of all nodes in the + # path from the root to that node. def qualified_name parent ? "#{parent.qualified_name}.#{name}" : name end + # Produces a textual description of this Node. This method is overridden + # by all classes inheriting Node, so it serves mainly as a template for + # describing a node with children. def describe(offset = '') "#{offset}#{name}:node\n#{child_names.map{ |child| children[child].describe("#{offset} ") }.join("\n")}" end @@ -44,6 +64,8 @@ def find_child(name) all_children_with_name.first end + # Returns the root Node, the topmost parent of the hierarchy (typically a + # Cascade or Flow). 
def root return self unless parent parent.root From 2390f2f66a1965b0cd2f0ee02223aa71c385393a Mon Sep 17 00:00:00 2001 From: Matt Walker Date: Fri, 26 Apr 2013 06:32:10 -0500 Subject: [PATCH 27/36] HashJoin may not be followed by aggregations, so remove block entirely from contract to clarify; this is a non-backwards compatible change --- lib/cascading/aggregations.rb | 2 +- lib/cascading/assembly.rb | 16 ++++++++-------- test/test_assembly.rb | 2 +- 3 files changed, 10 insertions(+), 10 deletions(-) diff --git a/lib/cascading/aggregations.rb b/lib/cascading/aggregations.rb index 5d08168..b44a7fa 100644 --- a/lib/cascading/aggregations.rb +++ b/lib/cascading/aggregations.rb @@ -15,7 +15,7 @@ module Cascading # # Externally enforced rules: # * May be empty (in which case, Aggregations is not instantiated) - # * Must follow a GroupBy or CoGroup (not a Join or Merge) + # * Must follow a GroupBy or CoGroup (not a HashJoin or Merge) # # Optimizations: # * If the leading Group is a GroupBy and all subsequent Everies are Aggregators that have a corresponding AggregateBy, Aggregations can replace the GroupBy/Aggregator pipe with a single composite AggregateBy diff --git a/lib/cascading/assembly.rb b/lib/cascading/assembly.rb index 8733423..70e2e8c 100644 --- a/lib/cascading/assembly.rb +++ b/lib/cascading/assembly.rb @@ -108,8 +108,9 @@ def to_s # assembly names to join and :on to specify the join_fields. Note that a # hash_join "takes over" the Assembly in which it is built, so it is # typically the first statement within the block of the assembly or branch. - # The block passed to this method will be evaluated in the context of - # Aggregations, not Assembly. + # Additionally, a hash join does not accept a block for aggregations like + # other joins; this restriction is enforced here, but comes directly from + # Cascading. 
# # The named options are: # [on] The keys of the join, an array of strings if they are the same in @@ -128,14 +129,14 @@ def to_s # # Example: # assembly 'join_left_right' do - # hash_join 'left', 'right', :on => ['key1', 'key2'], :joiner => :inner do - # sum 'val1', 'val2', :type => :long - # end + # hash_join 'left', 'right', :on => ['key1', 'key2'], :joiner => :inner # end - def hash_join(*args_with_options, &block) + def hash_join(*args_with_options) + raise ArgumentError, "HashJoin doesn't support aggregations so the block provided to hash_join will be ignored" if block_given? + options, assembly_names = args_with_options.extract_options!, args_with_options options[:hash] = true - prepare_join(assembly_names, options, &block) + prepare_join(assembly_names, options) end # Builds a join (CoGroup) pipe. Requires a list of assembly names to join @@ -552,7 +553,6 @@ def prepare_join(assembly_names, options, &block) end if is_hash_join - raise ArgumentError, "hash joins don't support aggregations" if block_given? 
parameters = [ pipes.to_java(Java::CascadingPipe::Pipe), group_fields, diff --git a/test/test_assembly.rb b/test/test_assembly.rb index a67f532..1b03ffd 100644 --- a/test/test_assembly.rb +++ b/test/test_assembly.rb @@ -547,7 +547,7 @@ def test_hash_join_with_block end end end - assert_equal "hash joins don't support aggregations", ex.message + assert_equal "HashJoin doesn't support aggregations so the block provided to hash_join will be ignored", ex.message end def test_branch_unique From 1de715f1b7ef9f64b7dee5ca5be2516d5bc4397f Mon Sep 17 00:00:00 2001 From: Matt Walker Date: Fri, 26 Apr 2013 07:19:57 -0500 Subject: [PATCH 28/36] Document Aggregations --- lib/cascading/aggregations.rb | 181 +++++++++++++++++++++++++++++----- lib/cascading/assembly.rb | 1 + 2 files changed, 156 insertions(+), 26 deletions(-) diff --git a/lib/cascading/aggregations.rb b/lib/cascading/aggregations.rb index b44a7fa..b08a5d7 100644 --- a/lib/cascading/aggregations.rb +++ b/lib/cascading/aggregations.rb @@ -27,6 +27,13 @@ module Cascading class Aggregations attr_reader :assembly, :tail_pipe, :scope, :aggregate_bys + # Do not use this constructor directly; instead, pass a block containing + # the desired aggregations to a group_by, union, or join and it will be + # instantiated for you. + # + # Builds the context in which a sequence of Every aggregations may be + # evaluated in the given assembly appended to the given group pipe and with + # the given incoming_scopes. def initialize(assembly, group, incoming_scopes) @assembly = assembly @tail_pipe = group @@ -36,25 +43,14 @@ def initialize(assembly, group, incoming_scopes) @aggregate_bys = tail_pipe.is_group_by ? [] : nil end + # Prints information about the scope of these Aggregations at the point at + # which it is called. This allows you to trace the propagation of field + # names through your job and is handy for debugging. See Scope for + # details. 
def debug_scope puts "Current scope of aggregations for '#{assembly.name}':\n #{scope}\n----------\n" end - def make_pipe(type, parameters) - pipe = type.new(*parameters) - - # Enforce 1 Buffer or >= 1 Aggregator rule - if tail_pipe.kind_of?(Java::CascadingPipe::Every) - raise 'Buffer must be sole aggregation' if tail_pipe.buffer? || (tail_pipe.aggregator? && pipe.buffer?) - end - - @tail_pipe = pipe - @scope = Scope.outgoing_scope(tail_pipe, [scope]) - - tail_pipe - end - private :make_pipe - # We can replace these aggregations with the corresponding composite # AggregateBy if the leading Group was a GroupBy and all subsequent # Aggregators had a corresponding AggregateBy (which we've encoded in the @@ -75,7 +71,22 @@ def finalize # Builds an every pipe and adds it to the current list of aggregations. # Note that this list may be either exactly 1 Buffer or any number of - # Aggregators. + # Aggregators. Exactly one of :aggregator or :buffer must be specified and + # :aggregator may be accompanied by a corresponding :aggregate_by. + # + # The named options are: + # [aggregator] A Cascading Aggregator, mutually exclusive with :buffer. + # [aggregate_by] A Cascading AggregateBy that corresponds to the given + # :aggregator. Only makes sense with the :aggregator option + # and does not exist for all Aggregators. Providing nothing + # or nil will cause all Aggregations to operate as normal, + # without being compiled into a composite AggregateBy. + # [buffer] A Cascading Buffer, mutually exclusive with :aggregator. + # [output] c.p.Every output selector. 
+ # + # Example: + # every 'field1', 'field2', :aggregator => sum_aggregator, :aggregate_by => sum_by, :output => all_fields + # every fields(input_fields), :buffer => Java::SomePackage::SomeBuffer.new, :output => all_fields def every(*args_with_options) options, in_fields = args_with_options.extract_options!, fields(args_with_options) out_fields = fields(options[:output]) @@ -96,6 +107,11 @@ def every(*args_with_options) every end + # Builds an every assertion pipe given a c.o.a.Assertion and adds it to the + # current list of aggregations. Note this breaks a chain of AggregateBys. + # + # The named options are: + # [level] The assertion level; defaults to strict. def assert_group(assertion, options = {}) assertion_level = options[:level] || Java::CascadingOperation::AssertionLevel::STRICT @@ -103,41 +119,128 @@ def assert_group(assertion, options = {}) make_pipe(Java::CascadingPipe::Every, parameters) end + # Builds a pipe that asserts the size of the current group is the specified + # size for all groups. def assert_group_size_equals(size, options = {}) assertion = Java::CascadingOperationAssertion::AssertGroupSizeEquals.new(size) assert_group(assertion, options) end + # Computes the minima of the specified fields within each group. Fields + # may be a list or a map for renaming. Note that fields are sorted by + # input name when a map is provided. + # + # The named options are: + # [ignore] Java Array of Objects of values to be ignored. + # + # Examples: + # assembly 'aggregate' do + # ... + # insert 'const' => 1 + # group_by 'const' do + # min 'field1', 'field2' + # min 'field3' => 'fieldA', 'field4' => 'fieldB' + # end + # discard 'const' + # end def min(*args_with_options) composite_aggregator(args_with_options, Java::CascadingOperationAggregator::Min) end + # Computes the maxima of the specified fields within each group. Fields + # may be a list or a map for renaming. Note that fields are sorted by + # input name when a map is provided. 
+ # + # The named options are: + # [ignore] Java Array of Objects of values to be ignored. + # + # Examples: + # assembly 'aggregate' do + # ... + # insert 'const' => 1 + # group_by 'const' do + # max 'field1', 'field2' + # max 'field3' => 'fieldA', 'field4' => 'fieldB' + # end + # discard 'const' + # end def max(*args_with_options) composite_aggregator(args_with_options, Java::CascadingOperationAggregator::Max) end + # Returns the first value within each group for the specified fields. + # Fields may be a list or a map for renaming. Note that fields are sorted + # by input name when a map is provided. + # + # The named options are: + # [ignore] Java Array of Tuples which should be ignored + # + # Examples: + # assembly 'aggregate' do + # ... + # group_by 'key1', 'key2' do + # first 'field1', 'field2' + # first 'field3' => 'fieldA', 'field4' => 'fieldB' + # end + # end def first(*args_with_options) composite_aggregator(args_with_options, Java::CascadingOperationAggregator::First) end + # Returns the last value within each group for the specified fields. + # Fields may be a list or a map for renaming. Note that fields are sorted + # by input name when a map is provided. + # + # The named options are: + # [ignore] Java Array of Tuples which should be ignored + # + # Examples: + # assembly 'aggregate' do + # ... + # group_by 'key1', 'key2' do + # last 'field1', 'field2' + # last 'field3' => 'fieldA', 'field4' => 'fieldB' + # end + # end def last(*args_with_options) composite_aggregator(args_with_options, Java::CascadingOperationAggregator::Last) end - # Counts elements of a group. May optionally specify the name of the - # output count field (defaults to 'count'). + # Counts elements of each group. May optionally specify the name of the + # output count field, which defaults to 'count'. + # + # Examples: + # assembly 'aggregate' do + # ... 
+ # group_by 'key1', 'key2' do + # count + # count 'key1_key2_count' + # end + # end def count(name = 'count') count_aggregator = Java::CascadingOperationAggregator::Count.new(fields(name)) count_by = Java::CascadingPipeAssembly::CountBy.new(fields(name)) every(last_grouping_fields, :aggregator => count_aggregator, :output => all_fields, :aggregate_by => count_by) end - # Sums one or more fields. Fields to be summed may either be provided as - # the arguments to sum (in which case they will be aggregated into a field - # of the same name in the given order), or via a hash using the :mapping - # parameter (in which case they will be aggregated from the field named by - # the key into the field named by the value after being sorted). The type - # of the output sum may be controlled with the :type parameter. + # Sums the specified fields within each group. Fields may be a list or + # provided through the :mapping option for renaming. Note that fields are + # sorted by name when a map is provided. + # + # The named options are: + # [mapping] Map of input to output field names if renaming is desired. + # Results in output fields sorted by input field. + # [type] Controls the type of the output, specified using values from the + # JAVA_TYPE_MAP as in Janino expressions (:double, :long, etc.) + # + # Examples: + # assembly 'aggregate' do + # ... + # group_by 'key1', 'key2' do + # sum 'field1', 'field2', :type => :long + # sum :mapping => { 'field3' => 'fieldA', 'field4' => 'fieldB' }, :type => :double + # end + # end def sum(*args_with_options) options, in_fields = args_with_options.extract_options!, args_with_options type = JAVA_TYPE_MAP[options[:type]] @@ -152,8 +255,20 @@ def sum(*args_with_options) raise "sum invoked on 0 fields (note :mapping must be provided to explicitly rename fields)" if mapping.empty? end - # Averages one or more fields. The contract of average is identical to - # that of other composite aggregators, but it accepts no options. 
+ # Averages the specified fields within each group. Fields may be a list or + # a map for renaming. Note that fields are sorted by input name when a map + # is provided. + # + # Examples: + # assembly 'aggregate' do + # ... + # insert 'const' => 1 + # group_by 'const' do + # average 'field1', 'field2' + # average 'field3' => 'fieldA', 'field4' => 'fieldB' + # end + # discard 'const' + # end def average(*fields_or_field_map) field_map, _ = extract_field_map(fields_or_field_map) @@ -167,6 +282,20 @@ def average(*fields_or_field_map) private + def make_pipe(type, parameters) + pipe = type.new(*parameters) + + # Enforce 1 Buffer or >= 1 Aggregator rule + if tail_pipe.kind_of?(Java::CascadingPipe::Every) + raise 'Buffer must be sole aggregation' if tail_pipe.buffer? || (tail_pipe.aggregator? && pipe.buffer?) + end + + @tail_pipe = pipe + @scope = Scope.outgoing_scope(tail_pipe, [scope]) + + tail_pipe + end + # Builds a series of every pipes for aggregation. # # Args can either be a list of fields to aggregate and an options hash or diff --git a/lib/cascading/assembly.rb b/lib/cascading/assembly.rb index 70e2e8c..eb4e032 100644 --- a/lib/cascading/assembly.rb +++ b/lib/cascading/assembly.rb @@ -325,6 +325,7 @@ def branch(name, &block) # count # sum 'val1', 'val2', :type => :long # end + # discard 'const' # end def group_by(*args_with_options, &block) options, group_fields = args_with_options.extract_options!, fields(args_with_options) From 8b2d62d7662f6e62834d1b9da99cafdc18cee4c3 Mon Sep 17 00:00:00 2001 From: Matt Walker Date: Fri, 26 Apr 2013 07:36:50 -0500 Subject: [PATCH 29/36] Update ExprStub documentation --- lib/cascading/aggregations.rb | 2 +- lib/cascading/expr_stub.rb | 30 ++++++++++++++++++++++-------- 2 files changed, 23 insertions(+), 9 deletions(-) diff --git a/lib/cascading/aggregations.rb b/lib/cascading/aggregations.rb index b08a5d7..fb7f2a9 100644 --- a/lib/cascading/aggregations.rb +++ b/lib/cascading/aggregations.rb @@ -231,7 +231,7 @@ def count(name =
'count') # [mapping] Map of input to output field names if renaming is desired. # Results in output fields sorted by input field. # [type] Controls the type of the output, specified using values from the - # JAVA_TYPE_MAP as in Janino expressions (:double, :long, etc.) + # Cascading::JAVA_TYPE_MAP as in Janino expressions (:double, :long, etc.) # # Examples: # assembly 'aggregate' do diff --git a/lib/cascading/expr_stub.rb b/lib/cascading/expr_stub.rb index c17d3eb..d1f96ae 100644 --- a/lib/cascading/expr_stub.rb +++ b/lib/cascading/expr_stub.rb @@ -3,15 +3,15 @@ class ExprStub attr_accessor :expression, :types, :input_expression # ExprStub requires a Janino expression decorated with field types. For - # example: '"Found: " + (x:int + y:int) + " " + z:string'. Type names are - # defined in Cascading::JAVA_TYPE_MAP. + # example: + # expr('"Found: " + (x:int + y:int) + " " + z:string') + # Type names are defined in Cascading::JAVA_TYPE_MAP. def initialize(expression) @input_expression = expression @expression = expression.dup @types = {} # Simple regexp based parser for types - JAVA_TYPE_MAP.each do |sym, klass| @expression.gsub!(/[A-Za-z0-9_]+:#{sym.to_s}/) do |match| name = match.split(/:/).first.gsub(/\s+/, "") @@ -21,22 +21,33 @@ def initialize(expression) end end - # Extract Java names and types from @types hash + # Extract Java names and types from @types hash. Cascading constructors + # often require two separate Java Arrays in this fashion. def names_and_types names, types = split_hash(@types) [names.to_java(java.lang.String), types.to_java(java.lang.Class)] end + # Prints the original input expression. def to_s @input_expression end # Convenience constructor for an ExprStub that optionally performs # validation. Takes a string to use as a Janino expression and an optional - # options hash. 
By default, the param :validate is set to true (performs - # expression validation using default actual argument values) and the param - # :validate_with is set to {} (which doesn't override any of the default - # actual argument values used for validation). + # options hash. + # + # The named options are: + # [validate] A boolean indicating whether expression validation using + # default actual argument values should be performed. Defaults + # to true. + # [validate_with] A hash mapping field names (or symbols) to the value that + # should be used for validation. Strings default to nil, + # so if you have previously filtered nulls you might use a + # marker value like 'nulls_filtered'. Defaults to {}. + # + # Example: + # insert 'x_eq_y' => expr('x:string.equals(y:string)', :validate_with => { :x => 'nulls_filtered' }) def self.expr(expression, options = {}) options = { :validate => true, :validate_with => {} }.merge(options) expr_stub = expression.kind_of?(ExprStub) ? expression : ExprStub.new(expression).compile @@ -74,6 +85,9 @@ def validate(actual_args = {}) self.eval(test_values.merge(actual_args)) end + # Given a scope, validates that the fields required by this ExprStub are + # available in the values fields of the scope. Returns those values fields + # which are unused in the expression. def validate_scope(scope) validate_fields(scope.values_fields.to_a) end From 01009a3e8b36a96a90ad82089d681a6ad6764219 Mon Sep 17 00:00:00 2001 From: Matt Walker Date: Fri, 26 Apr 2013 08:27:42 -0500 Subject: [PATCH 30/36] Update Mode documentation --- lib/cascading/mode.rb | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/lib/cascading/mode.rb b/lib/cascading/mode.rb index 9b073b8..dbbe8f7 100644 --- a/lib/cascading/mode.rb +++ b/lib/cascading/mode.rb @@ -1,9 +1,8 @@ module Cascading - # A Cascading::Mode encapsulates the idea of the execution mode for your - # flows. 
The default is Hadoop mode, but you can request that your code run - # in Cascading local mode. If you subsequently use a tap or a scheme that - # has no local implementation, the mode will be converted back to Hadoop - # mode. + # A Mode encapsulates the idea of the execution mode for your flows. The + # default is Hadoop mode, but you can request that your code run in Cascading + # local mode. If you subsequently use a tap or a scheme that has no local + # implementation, the mode will be converted back to Hadoop mode. class Mode attr_reader :local @@ -19,6 +18,8 @@ def self.parse(mode) end end + # Constructs a Mode given a flag indicating if it should be Cascading local + # mode. def initialize(local) @local = local end @@ -37,9 +38,9 @@ def source_tap(name, tap) end # Builds a c.f.Flow given properties, name, sources, sinks, and pipes from - # a Cascading::Flow. The current mode is adjusted based on the taps and - # schemes of the sources and sinks, then the correct taps are selected - # before building the flow. + # a Flow. The current mode is adjusted based on the taps and schemes of + # the sources and sinks, then the correct taps are selected before building + # the flow. def connect_flow(properties, name, sources, sinks, pipes) update_local_mode(sources, sinks) sources = select_taps(sources) From 9bdff8cce26e747aebe8dff442451c4032330612 Mon Sep 17 00:00:00 2001 From: Matt Walker Date: Fri, 26 Apr 2013 08:32:37 -0500 Subject: [PATCH 31/36] Update SubAssembly documentation --- lib/cascading/sub_assembly.rb | 13 ++++++++----- lib/cascading/tap.rb | 2 +- 2 files changed, 9 insertions(+), 6 deletions(-) diff --git a/lib/cascading/sub_assembly.rb b/lib/cascading/sub_assembly.rb index 022c120..61d11e3 100644 --- a/lib/cascading/sub_assembly.rb +++ b/lib/cascading/sub_assembly.rb @@ -4,17 +4,15 @@ module Cascading # Allows you to plugin c.p.SubAssemblies to a cascading.jruby Assembly. 
# # Assumptions: - # * You will either use the tail_pipe of the calling Assembly, or overwrite - # its incoming_scopes (as do join and union) - # * Your subassembly will have only 1 tail pipe; branching is not - # supported. This allows you to continue operating upon the tail of the - # SubAssembly within the calling Assembly + # * You will either use the tail_pipe of the calling Assembly, or overwrite its incoming_scopes (as do join and union) + # * Your subassembly will have only 1 tail pipe; branching is not supported. This allows you to continue operating upon the tail of the SubAssembly within the calling Assembly # * You will not use nested c.p.SubAssemblies # # This is a low-level tool, so be careful. class SubAssembly attr_reader :assembly, :sub_assembly, :tail_pipe, :scope + # Construct a SubAssembly within the given Assembly def initialize(assembly, sub_assembly) @assembly = assembly @sub_assembly = sub_assembly @@ -26,6 +24,11 @@ def initialize(assembly, sub_assembly) raise 'SubAssembly must set exactly 1 tail in constructor' unless sub_assembly.tails.size == 1 end + # Complete the addition of the SubAssembly to the Assembly. Propagates + # Scope through the SubAssembly and updates the tail_pipe of the + # SubAssembly for passing back to the enclosing Assembly. May accept many + # incoming pipes, but typically only receives the tail_pipe of the + # enclosing Assembly. def finalize(pipes, incoming_scopes) # Build adjacency list for sub_assembly graph = {} diff --git a/lib/cascading/tap.rb b/lib/cascading/tap.rb index 9ba4391..3583a5f 100644 --- a/lib/cascading/tap.rb +++ b/lib/cascading/tap.rb @@ -2,7 +2,7 @@ module Cascading # A BaseTap wraps up a pair of Cascading taps, one for Cascading local mode # and the other for Hadoop mode. Note that these are optional, but at least # one must be provided for most taps.
A SequenceFile is a notable example of - # a Scheme for which their is no Cascading local mode version, so a Tap you + # a Scheme for which there is no Cascading local mode version, so a Tap you # build with it will have no local_tap. class BaseTap attr_reader :local_tap, :hadoop_tap From 9b023feedb6bee49d6fecf9340a489844a954a8e Mon Sep 17 00:00:00 2001 From: Matt Walker Date: Fri, 26 Apr 2013 08:45:41 -0500 Subject: [PATCH 32/36] Document Scope --- lib/cascading/scope.rb | 41 ++++++++++++++++++++++++++++++++--------- 1 file changed, 32 insertions(+), 9 deletions(-) diff --git a/lib/cascading/scope.rb b/lib/cascading/scope.rb index e7e619a..446b396 100644 --- a/lib/cascading/scope.rb +++ b/lib/cascading/scope.rb @@ -1,23 +1,35 @@ module Cascading + # Scope is a wrapper for a the private Cascading c.f.p.Scope object used to + # connect the dataflow graph by resolving fields. cascading.jruby wraps this + # facility so that it may be used to propagate field names at composition + # time (not Cascading plan time) in the same way they will later be + # propagated by the planner. class Scope attr_accessor :scope + # Construct a Scope given the Cascading c.f.p.Scope to wrap. def initialize(scope) @scope = scope end + # Copy one Scope into another; relies upon the copy constructor of + # c.f.p.Scope. def copy Scope.new(Java::CascadingFlowPlanner::Scope.new(@scope)) end + # Build a c.f.p.Scope for a Flow, which is empty except for its name. def self.flow_scope(name) Java::CascadingFlowPlanner::Scope.new(name) end + # Build an empty Scope, wrapping an empty c.f.p.Scope. def self.empty_scope(name) Scope.new(Java::CascadingFlowPlanner::Scope.new(name)) end + # Build a Scope for a single source Tap. The flow_scope is propagated + # through this call into a new Scope. 
def self.source_scope(name, tap, flow_scope) incoming_scopes = java.util.HashSet.new incoming_scopes.add(flow_scope) @@ -27,28 +39,30 @@ def self.source_scope(name, tap, flow_scope) Scope.new(java_scope) end + # Build a Scope for an arbitrary flow element. This is used to update the + # Scope at each stage in a pipe Assembly. def self.outgoing_scope(flow_element, incoming_scopes) java_scopes = incoming_scopes.compact.map{ |s| s.scope } Scope.new(outgoing_scope_for(flow_element, java.util.HashSet.new(java_scopes))) end + # The values fields of the Scope, which indicate the fields in the current + # dataflow tuple. def values_fields @scope.out_values_fields end + # The grouping fields of the Scope, which indicate the keys of a + # group/cogroup. def grouping_fields @scope.out_grouping_fields end - def scope_fields_to_s(accessor) - begin - fields = @scope.send(accessor) - fields.nil? ? 'null' : fields.to_s - rescue Exception => e - 'ERROR' - end - end - + # Prints a detailed description of this Scope, including its type and + # various selectors, fields, and key fields. Data is bubbled up directly + # from the Cascading c.f.p.Scope. This output can be useful for debugging + # the propagation of fields through your job (see Flow#debug_scope and + # Assembly#debug_scope, which both rely upon this method). def to_s kind = 'Unknown' kind = 'Tap' if @scope.tap? @@ -77,6 +91,15 @@ def to_s private + def scope_fields_to_s(accessor) + begin + fields = @scope.send(accessor) + fields.nil? ?
'null' : fields.to_s + rescue Exception => e + 'ERROR' + end + end + def self.outgoing_scope_for(flow_element, incoming_scopes) begin flow_element.outgoing_scope_for(incoming_scopes) From 53dfc4df47cbc3eaede53fa9dc9d6924b2cd8e75 Mon Sep 17 00:00:00 2001 From: Matt Walker Date: Fri, 26 Apr 2013 13:16:19 -0500 Subject: [PATCH 33/36] Relicense cascading.jruby from LGPL to Apache 2.0 --- LICENSE.txt | 173 ++++------------------------------------------------ 1 file changed, 13 insertions(+), 160 deletions(-) diff --git a/LICENSE.txt b/LICENSE.txt index fc8a5de..331d4b0 100644 --- a/LICENSE.txt +++ b/LICENSE.txt @@ -1,165 +1,18 @@ - GNU LESSER GENERAL PUBLIC LICENSE - Version 3, 29 June 2007 +License: + Project and contact information: http://github.com/mrwalker/cascading.jruby - Copyright (C) 2007 Free Software Foundation, Inc. - Everyone is permitted to copy and distribute verbatim copies - of this license document, but changing it is not allowed. + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 - This version of the GNU Lesser General Public License incorporates -the terms and conditions of version 3 of the GNU General Public -License, supplemented by the additional permissions listed below. + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. - 0. Additional Definitions. +Third-party Licenses: - As used herein, "this License" refers to version 3 of the GNU Lesser -General Public License, and the "GNU GPL" refers to version 3 of the GNU -General Public License. 
- - "The Library" refers to a covered work governed by this License, -other than an Application or a Combined Work as defined below. - - An "Application" is any work that makes use of an interface provided -by the Library, but which is not otherwise based on the Library. -Defining a subclass of a class defined by the Library is deemed a mode -of using an interface provided by the Library. - - A "Combined Work" is a work produced by combining or linking an -Application with the Library. The particular version of the Library -with which the Combined Work was made is also called the "Linked -Version". - - The "Minimal Corresponding Source" for a Combined Work means the -Corresponding Source for the Combined Work, excluding any source code -for portions of the Combined Work that, considered in isolation, are -based on the Application, and not on the Linked Version. - - The "Corresponding Application Code" for a Combined Work means the -object code and/or source code for the Application, including any data -and utility programs needed for reproducing the Combined Work from the -Application, but excluding the System Libraries of the Combined Work. - - 1. Exception to Section 3 of the GNU GPL. - - You may convey a covered work under sections 3 and 4 of this License -without being bound by section 3 of the GNU GPL. - - 2. Conveying Modified Versions. 
- - If you modify a copy of the Library, and, in your modifications, a -facility refers to a function or data to be supplied by an Application -that uses the facility (other than as an argument passed when the -facility is invoked), then you may convey a copy of the modified -version: - - a) under this License, provided that you make a good faith effort to - ensure that, in the event an Application does not supply the - function or data, the facility still operates, and performs - whatever part of its purpose remains meaningful, or - - b) under the GNU GPL, with none of the additional permissions of - this License applicable to that copy. - - 3. Object Code Incorporating Material from Library Header Files. - - The object code form of an Application may incorporate material from -a header file that is part of the Library. You may convey such object -code under terms of your choice, provided that, if the incorporated -material is not limited to numerical parameters, data structure -layouts and accessors, or small macros, inline functions and templates -(ten or fewer lines in length), you do both of the following: - - a) Give prominent notice with each copy of the object code that the - Library is used in it and that the Library and its use are - covered by this License. - - b) Accompany the object code with a copy of the GNU GPL and this license - document. - - 4. Combined Works. - - You may convey a Combined Work under terms of your choice that, -taken together, effectively do not restrict modification of the -portions of the Library contained in the Combined Work and reverse -engineering for debugging such modifications, if you also do each of -the following: - - a) Give prominent notice with each copy of the Combined Work that - the Library is used in it and that the Library and its use are - covered by this License. - - b) Accompany the Combined Work with a copy of the GNU GPL and this license - document. 
- - c) For a Combined Work that displays copyright notices during - execution, include the copyright notice for the Library among - these notices, as well as a reference directing the user to the - copies of the GNU GPL and this license document. - - d) Do one of the following: - - 0) Convey the Minimal Corresponding Source under the terms of this - License, and the Corresponding Application Code in a form - suitable for, and under terms that permit, the user to - recombine or relink the Application with a modified version of - the Linked Version to produce a modified Combined Work, in the - manner specified by section 6 of the GNU GPL for conveying - Corresponding Source. - - 1) Use a suitable shared library mechanism for linking with the - Library. A suitable mechanism is one that (a) uses at run time - a copy of the Library already present on the user's computer - system, and (b) will operate properly with a modified version - of the Library that is interface-compatible with the Linked - Version. - - e) Provide Installation Information, but only if you would otherwise - be required to provide such information under section 6 of the - GNU GPL, and only to the extent that such information is - necessary to install and execute a modified version of the - Combined Work produced by recombining or relinking the - Application with a modified version of the Linked Version. (If - you use option 4d0, the Installation Information must accompany - the Minimal Corresponding Source and Corresponding Application - Code. If you use option 4d1, you must provide the Installation - Information in the manner specified by section 6 of the GNU GPL - for conveying Corresponding Source.) - - 5. Combined Libraries. 
- - You may place library facilities that are a work based on the -Library side by side in a single library together with other library -facilities that are not Applications and are not covered by this -License, and convey such a combined library under terms of your -choice, if you do both of the following: - - a) Accompany the combined library with a copy of the same work based - on the Library, uncombined with any other library facilities, - conveyed under the terms of this License. - - b) Give prominent notice with the combined library that part of it - is a work based on the Library, and explaining where to find the - accompanying uncombined form of the same work. - - 6. Revised Versions of the GNU Lesser General Public License. - - The Free Software Foundation may publish revised and/or new versions -of the GNU Lesser General Public License from time to time. Such new -versions will be similar in spirit to the present version, but may -differ in detail to address new problems or concerns. - - Each version is given a distinguishing version number. If the -Library as you received it specifies that a certain numbered version -of the GNU Lesser General Public License "or any later version" -applies to it, you have the option of following the terms and -conditions either of that published version or of any later version -published by the Free Software Foundation. If the Library as you -received it does not specify a version number of the GNU Lesser -General Public License, you may choose any version of the GNU Lesser -General Public License ever published by the Free Software Foundation. - - If the Library as you received it specifies that a proxy can decide -whether future versions of the GNU Lesser General Public License shall -apply, that proxy's public statement of acceptance of any version is -permanent authorization for you to choose that version for the -Library. + All third-party dependencies are listed in ivy.xml. 
From 3bc44b90904b1df0c9ebbe26179ee7ac9480b5c8 Mon Sep 17 00:00:00 2001 From: Matt Walker Date: Fri, 26 Apr 2013 13:10:44 -0500 Subject: [PATCH 34/36] Prep for 1.0.0 release --- README.md | 2 +- cascading.jruby.gemspec | 2 +- lib/cascading.rb | 12 +++++++----- 3 files changed, 9 insertions(+), 7 deletions(-) diff --git a/README.md b/README.md index 7105c6f..85ef09c 100644 --- a/README.md +++ b/README.md @@ -28,7 +28,7 @@ end.complete cascading.jruby provides a clean Ruby interface to Cascading, but doesn't attempt to add abstractions on top of it. Therefore, you should be acquainted with the [Cascading](http://docs.cascading.org/cascading/2.0/userguide/html/) [API](http://docs.cascading.org/cascading/2.0/javadoc/) before you begin. -For operations you can apply to your dataflow within a pipe assembly, see the [Assembly](http://rubydoc.info/gems/cascading.jruby/0.0.10/Cascading/Assembly) class. For operations available within a block passed to a group_by, union, or join, see the [Aggregations](http://rubydoc.info/gems/cascading.jruby/0.0.10/Cascading/Aggregations) class. +For operations you can apply to your dataflow within a pipe assembly, see the [Assembly](http://rubydoc.info/gems/cascading.jruby/1.0.0/Cascading/Assembly) class. For operations available within a block passed to a group_by, union, or join, see the [Aggregations](http://rubydoc.info/gems/cascading.jruby/1.0.0/Cascading/Aggregations) class. Note that the Ruby code you write merely constructs a Cascading job, so no JRuby runtime is required on your cluster. This stands in contrast with writing [Hadoop streaming jobs in Ruby](http://www.quora.com/How-do-the-different-options-for-Ruby-on-Hadoop-compare). To run cascading.jruby applications on a Hadoop cluster, you must use [Jading](https://github.com/mrwalker/jading) to package them into a job jar. 
diff --git a/cascading.jruby.gemspec b/cascading.jruby.gemspec index 7b60f10..50f39b4 100644 --- a/cascading.jruby.gemspec +++ b/cascading.jruby.gemspec @@ -2,7 +2,7 @@ Gem::Specification.new do |s| s.name = "cascading.jruby" - s.version = "0.0.10" + s.version = "1.0.0" s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version= s.authors = ["Matt Walker", "Gr\303\251goire Marabout"] diff --git a/lib/cascading.rb b/lib/cascading.rb index 12ceb85..9d19936 100644 --- a/lib/cascading.rb +++ b/lib/cascading.rb @@ -2,24 +2,26 @@ module Cascading # :stopdoc: - VERSION = '0.0.10' + VERSION = '1.0.0' end +require 'cascading/aggregations' require 'cascading/assembly' require 'cascading/base' require 'cascading/cascade' require 'cascading/cascading' require 'cascading/cascading_exception' require 'cascading/expr_stub' +require 'cascading/filter_operations' require 'cascading/flow' +require 'cascading/identity_operations' require 'cascading/mode' require 'cascading/operations' -require 'cascading/identity_operations' -require 'cascading/filter_operations' require 'cascading/regex_operations' -require 'cascading/text_operations' require 'cascading/scope' +require 'cascading/sub_assembly' require 'cascading/tap' +require 'cascading/text_operations' -# include module to make them available at top package +# include module to make it available at top level include Cascading From 547833cea0ed11bc7a6e826f988cfbb721703e08 Mon Sep 17 00:00:00 2001 From: Matt Walker Date: Fri, 26 Apr 2013 13:58:52 -0500 Subject: [PATCH 35/36] Cleanup gemspec --- cascading.jruby.gemspec | 31 ++++++++++++++++--------------- 1 file changed, 16 insertions(+), 15 deletions(-) diff --git a/cascading.jruby.gemspec b/cascading.jruby.gemspec index 50f39b4..10057fa 100644 --- a/cascading.jruby.gemspec +++ b/cascading.jruby.gemspec @@ -1,24 +1,25 @@ # -*- encoding: utf-8 -*- +#$: << File.join(File.dirname(__FILE__), '..', 'lib') +#require 'cascading' 
Gem::Specification.new do |s| s.name = "cascading.jruby" - s.version = "1.0.0" - - s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version= + # TODO: in 2.0.0, Job will encapsulate Cascading module, so we can directly + # grab the version from there; for now, just hack it + #s.version = Cascading::VERSION + s.version = '1.0.0' + s.date = Time.now.strftime('%Y-%m-%d') + s.summary = "A JRuby DSL for Cascading" + s.homepage = "http://github.com/mrwalker/cascading.jruby" + s.email = "matt.r.walker@gmail.com" s.authors = ["Matt Walker", "Gr\303\251goire Marabout"] - s.description = "cascading.jruby is a small DSL above Cascading, written in JRuby" - s.email = "mwalker@etsy.com" - s.extra_rdoc_files = ["README.md", "LICENSE.txt"] + s.files = Dir.glob("lib/**/*.rb") - s.homepage = "http://github.com/etsy/cascading.jruby" - s.rdoc_options = ["--main", "README.md"] - s.require_paths = ["lib"] - s.rubyforge_project = "cascading.jruby" - s.rubygems_version = "1.8.21" - s.summary = "A JRuby DSL for Cascading" s.test_files = Dir.glob("test/**/*.rb") + s.require_paths = ["lib"] + + s.rdoc_options = ["--main", "README.md"] + s.extra_rdoc_files = ["README.md", "LICENSE.txt"] - if s.respond_to? 
:specification_version then - s.specification_version = 3 - end + s.description = "cascading.jruby is a small DSL above Cascading, written in JRuby" end From 2c0131f063a108a38418324f8c588b0aa8841962 Mon Sep 17 00:00:00 2001 From: Matt Walker Date: Sun, 11 Jan 2015 04:52:19 -0600 Subject: [PATCH 36/36] Doc fix --- lib/cascading/scope.rb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/cascading/scope.rb b/lib/cascading/scope.rb index 446b396..0062449 100644 --- a/lib/cascading/scope.rb +++ b/lib/cascading/scope.rb @@ -1,5 +1,5 @@ module Cascading - # Scope is a wrapper for a the private Cascading c.f.p.Scope object used to + # Scope is a wrapper for the private Cascading c.f.p.Scope object used to # connect the dataflow graph by resolving fields. cascading.jruby wraps this # facility so that it may be used to propagate field names at composition # time (not Cascading plan time) in the same way they will later be