diff --git a/.travis.yml b/.travis.yml
index 09f916d..1348cf0 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -2,5 +2,4 @@ language: ruby
rvm:
- jruby-18mode
notifications:
- recipients:
- - mwalker@etsy.com
+ irc: "irc.freenode.org#etsydoop"
diff --git a/Gemfile b/Gemfile
index a65514d..ae50466 100644
--- a/Gemfile
+++ b/Gemfile
@@ -1,6 +1,6 @@
-source :rubygems
+source 'https://rubygems.org'
group :test do
- gem 'rake', '0.8.7'
+ gem 'rake', '10.0.3'
gem 'rspec', '1.1.11'
end
diff --git a/Gemfile.lock b/Gemfile.lock
index bda8983..7b6363d 100644
--- a/Gemfile.lock
+++ b/Gemfile.lock
@@ -1,12 +1,12 @@
GEM
- remote: http://rubygems.org/
+ remote: https://rubygems.org/
specs:
- rake (0.8.7)
+ rake (10.0.3)
rspec (1.1.11)
PLATFORMS
java
DEPENDENCIES
- rake (= 0.8.7)
+ rake (= 10.0.3)
rspec (= 1.1.11)
diff --git a/LICENSE.txt b/LICENSE.txt
index fc8a5de..331d4b0 100644
--- a/LICENSE.txt
+++ b/LICENSE.txt
@@ -1,165 +1,18 @@
- GNU LESSER GENERAL PUBLIC LICENSE
- Version 3, 29 June 2007
+License:
+ Project and contact information: http://github.com/mrwalker/cascading.jruby
- Copyright (C) 2007 Free Software Foundation, Inc.
- Everyone is permitted to copy and distribute verbatim copies
- of this license document, but changing it is not allowed.
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+ http://www.apache.org/licenses/LICENSE-2.0
- This version of the GNU Lesser General Public License incorporates
-the terms and conditions of version 3 of the GNU General Public
-License, supplemented by the additional permissions listed below.
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
- 0. Additional Definitions.
+Third-party Licenses:
- As used herein, "this License" refers to version 3 of the GNU Lesser
-General Public License, and the "GNU GPL" refers to version 3 of the GNU
-General Public License.
-
- "The Library" refers to a covered work governed by this License,
-other than an Application or a Combined Work as defined below.
-
- An "Application" is any work that makes use of an interface provided
-by the Library, but which is not otherwise based on the Library.
-Defining a subclass of a class defined by the Library is deemed a mode
-of using an interface provided by the Library.
-
- A "Combined Work" is a work produced by combining or linking an
-Application with the Library. The particular version of the Library
-with which the Combined Work was made is also called the "Linked
-Version".
-
- The "Minimal Corresponding Source" for a Combined Work means the
-Corresponding Source for the Combined Work, excluding any source code
-for portions of the Combined Work that, considered in isolation, are
-based on the Application, and not on the Linked Version.
-
- The "Corresponding Application Code" for a Combined Work means the
-object code and/or source code for the Application, including any data
-and utility programs needed for reproducing the Combined Work from the
-Application, but excluding the System Libraries of the Combined Work.
-
- 1. Exception to Section 3 of the GNU GPL.
-
- You may convey a covered work under sections 3 and 4 of this License
-without being bound by section 3 of the GNU GPL.
-
- 2. Conveying Modified Versions.
-
- If you modify a copy of the Library, and, in your modifications, a
-facility refers to a function or data to be supplied by an Application
-that uses the facility (other than as an argument passed when the
-facility is invoked), then you may convey a copy of the modified
-version:
-
- a) under this License, provided that you make a good faith effort to
- ensure that, in the event an Application does not supply the
- function or data, the facility still operates, and performs
- whatever part of its purpose remains meaningful, or
-
- b) under the GNU GPL, with none of the additional permissions of
- this License applicable to that copy.
-
- 3. Object Code Incorporating Material from Library Header Files.
-
- The object code form of an Application may incorporate material from
-a header file that is part of the Library. You may convey such object
-code under terms of your choice, provided that, if the incorporated
-material is not limited to numerical parameters, data structure
-layouts and accessors, or small macros, inline functions and templates
-(ten or fewer lines in length), you do both of the following:
-
- a) Give prominent notice with each copy of the object code that the
- Library is used in it and that the Library and its use are
- covered by this License.
-
- b) Accompany the object code with a copy of the GNU GPL and this license
- document.
-
- 4. Combined Works.
-
- You may convey a Combined Work under terms of your choice that,
-taken together, effectively do not restrict modification of the
-portions of the Library contained in the Combined Work and reverse
-engineering for debugging such modifications, if you also do each of
-the following:
-
- a) Give prominent notice with each copy of the Combined Work that
- the Library is used in it and that the Library and its use are
- covered by this License.
-
- b) Accompany the Combined Work with a copy of the GNU GPL and this license
- document.
-
- c) For a Combined Work that displays copyright notices during
- execution, include the copyright notice for the Library among
- these notices, as well as a reference directing the user to the
- copies of the GNU GPL and this license document.
-
- d) Do one of the following:
-
- 0) Convey the Minimal Corresponding Source under the terms of this
- License, and the Corresponding Application Code in a form
- suitable for, and under terms that permit, the user to
- recombine or relink the Application with a modified version of
- the Linked Version to produce a modified Combined Work, in the
- manner specified by section 6 of the GNU GPL for conveying
- Corresponding Source.
-
- 1) Use a suitable shared library mechanism for linking with the
- Library. A suitable mechanism is one that (a) uses at run time
- a copy of the Library already present on the user's computer
- system, and (b) will operate properly with a modified version
- of the Library that is interface-compatible with the Linked
- Version.
-
- e) Provide Installation Information, but only if you would otherwise
- be required to provide such information under section 6 of the
- GNU GPL, and only to the extent that such information is
- necessary to install and execute a modified version of the
- Combined Work produced by recombining or relinking the
- Application with a modified version of the Linked Version. (If
- you use option 4d0, the Installation Information must accompany
- the Minimal Corresponding Source and Corresponding Application
- Code. If you use option 4d1, you must provide the Installation
- Information in the manner specified by section 6 of the GNU GPL
- for conveying Corresponding Source.)
-
- 5. Combined Libraries.
-
- You may place library facilities that are a work based on the
-Library side by side in a single library together with other library
-facilities that are not Applications and are not covered by this
-License, and convey such a combined library under terms of your
-choice, if you do both of the following:
-
- a) Accompany the combined library with a copy of the same work based
- on the Library, uncombined with any other library facilities,
- conveyed under the terms of this License.
-
- b) Give prominent notice with the combined library that part of it
- is a work based on the Library, and explaining where to find the
- accompanying uncombined form of the same work.
-
- 6. Revised Versions of the GNU Lesser General Public License.
-
- The Free Software Foundation may publish revised and/or new versions
-of the GNU Lesser General Public License from time to time. Such new
-versions will be similar in spirit to the present version, but may
-differ in detail to address new problems or concerns.
-
- Each version is given a distinguishing version number. If the
-Library as you received it specifies that a certain numbered version
-of the GNU Lesser General Public License "or any later version"
-applies to it, you have the option of following the terms and
-conditions either of that published version or of any later version
-published by the Free Software Foundation. If the Library as you
-received it does not specify a version number of the GNU Lesser
-General Public License, you may choose any version of the GNU Lesser
-General Public License ever published by the Free Software Foundation.
-
- If the Library as you received it specifies that a proxy can decide
-whether future versions of the GNU Lesser General Public License shall
-apply, that proxy's public statement of acceptance of any version is
-permanent authorization for you to choose that version for the
-Library.
+ All third-party dependencies are listed in ivy.xml.
diff --git a/README.md b/README.md
index c6bf9c2..85ef09c 100644
--- a/README.md
+++ b/README.md
@@ -15,7 +15,7 @@ cascade 'wordcount', :mode => :local do
source 'input', tap(input_path)
assembly 'input' do
- split_rows 'line', 'word', :pattern => /[.,]*\s+/, :output => 'word'
+ split_rows 'line', /[.,]*\s+/, 'word', :output => 'word'
group_by 'word' do
count
end
@@ -28,8 +28,8 @@ end.complete
cascading.jruby provides a clean Ruby interface to Cascading, but doesn't attempt to add abstractions on top of it. Therefore, you should be acquainted with the [Cascading](http://docs.cascading.org/cascading/2.0/userguide/html/) [API](http://docs.cascading.org/cascading/2.0/javadoc/) before you begin.
-For operations you can apply to your dataflow within a pipe assembly, see the [Assembly](http://rubydoc.info/gems/cascading.jruby/0.0.10/Cascading/Assembly) class. For operations available within a block passed to a group_by, union, or join, see the [Aggregations](http://rubydoc.info/gems/cascading.jruby/0.0.10/Cascading/Aggregations) class.
+For operations you can apply to your dataflow within a pipe assembly, see the [Assembly](http://rubydoc.info/gems/cascading.jruby/1.0.0/Cascading/Assembly) class. For operations available within a block passed to a group_by, union, or join, see the [Aggregations](http://rubydoc.info/gems/cascading.jruby/1.0.0/Cascading/Aggregations) class.
-Note that the Ruby code you write merely constructs a Cascading job, so no JRuby runtime is required on your cluster. This stands in contrast with writing [Hadoop streaming jobs in Ruby](http://www.quora.com/How-do-the-different-options-for-Ruby-on-Hadoop-compare). To run cascading.jruby applications on a Hadoop cluster, you must use [Jading](https://github.com/etsy/jading) to package them into a job jar.
+Note that the Ruby code you write merely constructs a Cascading job, so no JRuby runtime is required on your cluster. This stands in contrast with writing [Hadoop streaming jobs in Ruby](http://www.quora.com/How-do-the-different-options-for-Ruby-on-Hadoop-compare). To run cascading.jruby applications on a Hadoop cluster, you must use [Jading](https://github.com/mrwalker/jading) to package them into a job jar.
-cascading.jruby has been tested on JRuby versions 1.2.0, 1.4.0, 1.5.3, 1.6.5, and 1.6.7.2.
+cascading.jruby has been tested on JRuby versions 1.2.0, 1.4.0, 1.5.3, 1.6.5, 1.6.7.2, 1.7.0, and 1.7.3.
diff --git a/cascading.jruby.gemspec b/cascading.jruby.gemspec
index a9c27c2..10057fa 100644
--- a/cascading.jruby.gemspec
+++ b/cascading.jruby.gemspec
@@ -1,24 +1,25 @@
# -*- encoding: utf-8 -*-
+#$: << File.join(File.dirname(__FILE__), '..', 'lib')
+#require 'cascading'
Gem::Specification.new do |s|
s.name = "cascading.jruby"
- s.version = "0.0.10"
-
- s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
+ # TODO: in 2.0.0, Job will encapsulate Cascading module, so we can directly
+ # grab the version from there; for now, just hack it
+ #s.version = Cascading::VERSION
+ s.version = '1.0.0'
+ s.date = Time.now.strftime('%Y-%m-%d')
+ s.summary = "A JRuby DSL for Cascading"
+ s.homepage = "http://github.com/mrwalker/cascading.jruby"
+ s.email = "matt.r.walker@gmail.com"
s.authors = ["Matt Walker", "Gr\303\251goire Marabout"]
- s.description = "cascading.jruby is a small DSL above Cascading, written in JRuby"
- s.email = "mwalker@etsy.com"
- s.extra_rdoc_files = ["LICENSE.txt"]
- s.files = ["lib/cascading.rb", "lib/cascading/aggregations.rb", "lib/cascading/assembly.rb", "lib/cascading/base.rb", "lib/cascading/cascade.rb", "lib/cascading/cascading.rb", "lib/cascading/cascading_exception.rb", "lib/cascading/expr_stub.rb", "lib/cascading/ext/array.rb", "lib/cascading/flow.rb", "lib/cascading/mode.rb", "lib/cascading/operations.rb", "lib/cascading/scope.rb", "lib/cascading/sub_assembly.rb", "lib/cascading/tap.rb"]
- s.homepage = "http://github.com/etsy/cascading.jruby"
- s.rdoc_options = ["--main", "README.md"]
+
+ s.files = Dir.glob("lib/**/*.rb")
+ s.test_files = Dir.glob("test/**/*.rb")
s.require_paths = ["lib"]
- s.rubyforge_project = "cascading.jruby"
- s.rubygems_version = "1.8.21"
- s.summary = "A JRuby DSL for Cascading"
- s.test_files = ["test/test_aggregations.rb", "test/test_assembly.rb", "test/test_cascade.rb", "test/test_cascading.rb", "test/test_exceptions.rb", "test/test_flow.rb", "test/test_local_execution.rb", "test/test_operations.rb"]
- if s.respond_to? :specification_version then
- s.specification_version = 3
- end
+ s.rdoc_options = ["--main", "README.md"]
+ s.extra_rdoc_files = ["README.md", "LICENSE.txt"]
+
+ s.description = "cascading.jruby is a small DSL above Cascading, written in JRuby"
end
diff --git a/lib/cascading.rb b/lib/cascading.rb
index 64e61b3..9d19936 100644
--- a/lib/cascading.rb
+++ b/lib/cascading.rb
@@ -2,20 +2,26 @@
module Cascading
# :stopdoc:
- VERSION = '0.0.10'
+ VERSION = '1.0.0'
end
+require 'cascading/aggregations'
require 'cascading/assembly'
require 'cascading/base'
require 'cascading/cascade'
require 'cascading/cascading'
require 'cascading/cascading_exception'
require 'cascading/expr_stub'
+require 'cascading/filter_operations'
require 'cascading/flow'
+require 'cascading/identity_operations'
require 'cascading/mode'
require 'cascading/operations'
+require 'cascading/regex_operations'
require 'cascading/scope'
+require 'cascading/sub_assembly'
require 'cascading/tap'
+require 'cascading/text_operations'
-# include module to make them available at top package
+# include module to make it available at top level
include Cascading
diff --git a/lib/cascading/aggregations.rb b/lib/cascading/aggregations.rb
index 2980748..fb7f2a9 100644
--- a/lib/cascading/aggregations.rb
+++ b/lib/cascading/aggregations.rb
@@ -1,28 +1,39 @@
-require 'cascading/operations'
require 'cascading/scope'
require 'cascading/ext/array'
module Cascading
+ # Aggregations is the context available to you within the block of a group_by,
+ # union, or join that allows you to apply Every pipes to the result of those
+ # operations. You may apply aggregators and buffers within this context
+ # subject to several rules laid out by Cascading.
+ #
# Rules enforced by Aggregations:
# * Contains either 1 Buffer or >= 1 Aggregator (explicitly checked)
- # * No GroupBys, CoGroups, Joins, or Merges (methods for these pipes do not
- # exist on Aggregations)
+ # * No GroupBys, CoGroups, Joins, or Merges (methods for these pipes do not exist on Aggregations)
# * No Eaches (Aggregations#each does not exist)
# * Aggregations may not branch (Aggregations#branch does not exist)
#
# Externally enforced rules:
# * May be empty (in which case, Aggregations is not instantiated)
- # * Must follow a GroupBy or CoGroup (not a Join or Merge)
+ # * Must follow a GroupBy or CoGroup (not a HashJoin or Merge)
#
# Optimizations:
- # * If the leading Group is a GroupBy and all subsequent Everies are
- # Aggregators that have a corresponding AggregateBy, Aggregations can replace
- # the GroupBy/Aggregator pipe with a single composite AggregateBy
+ # * If the leading Group is a GroupBy and all subsequent Everies are Aggregators that have a corresponding AggregateBy, Aggregations can replace the GroupBy/Aggregator pipe with a single composite AggregateBy
+ #
+ # Aggregator and buffer DSL standard optional parameter names:
+ # [input] c.p.Every argument selector
+ # [into] c.o.Operation field declaration
+ # [output] c.p.Every output selector
class Aggregations
- include Operations
-
attr_reader :assembly, :tail_pipe, :scope, :aggregate_bys
+ # Do not use this constructor directly; instead, pass a block containing
+ # the desired aggregations to a group_by, union, or join and it will be
+ # instantiated for you.
+ #
+ # Builds the context in which a sequence of Every aggregations may be
+ # evaluated in the given assembly appended to the given group pipe and with
+ # the given incoming_scopes.
def initialize(assembly, group, incoming_scopes)
@assembly = assembly
@tail_pipe = group
@@ -32,23 +43,14 @@ def initialize(assembly, group, incoming_scopes)
@aggregate_bys = tail_pipe.is_group_by ? [] : nil
end
+ # Prints information about the scope of these Aggregations at the point at
+ # which it is called. This allows you to trace the propagation of field
+ # names through your job and is handy for debugging. See Scope for
+ # details.
def debug_scope
puts "Current scope of aggregations for '#{assembly.name}':\n #{scope}\n----------\n"
end
- def make_pipe(type, parameters)
- pipe = type.new(*parameters)
-
- # Enforce 1 Buffer or >= 1 Aggregator rule
- if tail_pipe.kind_of?(Java::CascadingPipe::Every)
- raise 'Buffer must be sole aggregation' if tail_pipe.buffer? || (tail_pipe.aggregator? && pipe.buffer?)
- end
-
- @tail_pipe = pipe
- @scope = Scope.outgoing_scope(tail_pipe, [scope])
- end
- private :make_pipe
-
# We can replace these aggregations with the corresponding composite
# AggregateBy if the leading Group was a GroupBy and all subsequent
# Aggregators had a corresponding AggregateBy (which we've encoded in the
@@ -69,13 +71,27 @@ def finalize
# Builds an every pipe and adds it to the current list of aggregations.
# Note that this list may be either exactly 1 Buffer or any number of
- # Aggregators.
- def every(*args)
- options = args.extract_options!
-
- in_fields = fields(args)
+ # Aggregators. Exactly one of :aggregator or :buffer must be specified and
+ # :aggregator may be accompanied by a corresponding :aggregate_by.
+ #
+ # The named options are:
+ # [aggregator] A Cascading Aggregator, mutually exclusive with :buffer.
+ # [aggregate_by] A Cascading AggregateBy that corresponds to the given
+ # :aggregator. Only makes sense with the :aggregator option
+ # and does not exist for all Aggregators. Providing nothing
+ # or nil will cause all Aggregations to operate as normal,
+ # without being compiled into a composite AggregateBy.
+ # [buffer] A Cascading Buffer, mutually exclusive with :aggregator.
+ # [output] c.p.Every output selector.
+ #
+ # Example:
+ # every 'field1', 'field2', :aggregator => sum_aggregator, :aggregate_by => sum_by, :output => all_fields
+ # every fields(input_fields), :buffer => Java::SomePackage::SomeBuffer.new, :output => all_fields
+ def every(*args_with_options)
+ options, in_fields = args_with_options.extract_options!, fields(args_with_options)
out_fields = fields(options[:output])
operation = options[:aggregator] || options[:buffer]
+ raise 'every requires either :aggregator or :buffer' unless operation
if options[:aggregate_by] && aggregate_bys
aggregate_bys << options[:aggregate_by]
@@ -84,71 +100,152 @@ def every(*args)
end
parameters = [tail_pipe, in_fields, operation, out_fields].compact
- make_pipe(Java::CascadingPipe::Every, parameters)
- end
+ every = make_pipe(Java::CascadingPipe::Every, parameters)
+ raise ':aggregator specified but c.o.Buffer provided' if options[:aggregator] && every.is_buffer
+ raise ':buffer specified but c.o.Aggregator provided' if options[:buffer] && every.is_aggregator
- def assert_group(*args)
- options = args.extract_options!
+ every
+ end
- assertion = args[0]
+ # Builds an every assertion pipe given a c.o.a.Assertion and adds it to the
+ # current list of aggregations. Note this breaks a chain of AggregateBys.
+ #
+ # The named options are:
+ # [level] The assertion level; defaults to strict.
+ def assert_group(assertion, options = {})
assertion_level = options[:level] || Java::CascadingOperation::AssertionLevel::STRICT
parameters = [tail_pipe, assertion_level, assertion]
make_pipe(Java::CascadingPipe::Every, parameters)
end
- def assert_group_size_equals(*args)
- options = args.extract_options!
-
- assertion = Java::CascadingOperationAssertion::AssertGroupSizeEquals.new(args[0])
+ # Builds a pipe that asserts the size of the current group is the specified
+ # size for all groups.
+ def assert_group_size_equals(size, options = {})
+ assertion = Java::CascadingOperationAssertion::AssertGroupSizeEquals.new(size)
assert_group(assertion, options)
end
- # Builds a series of every pipes for aggregation.
+ # Computes the minima of the specified fields within each group. Fields
+ # may be a list or a map for renaming. Note that fields are sorted by
+ # input name when a map is provided.
#
- # Args can either be a list of fields to aggregate and an options hash or
- # a hash that maps input field name to output field name (similar to
- # insert) and an options hash.
+ # The named options are:
+ # [ignore] Java Array of Objects of values to be ignored.
#
- # Options include:
- # * :ignore a Java Array of Objects (for min and max) or Tuples
- # (for first and last) of values for the aggregator to ignore
- # * function is a symbol that is the method to call to construct
- # the Cascading Aggregator.
- def composite_aggregator(args, function)
- field_map, options = extract_field_map(args)
+ # Examples:
+ # assembly 'aggregate' do
+ # ...
+ # insert 'const' => 1
+ # group_by 'const' do
+ # min 'field1', 'field2'
+ # min 'field3' => 'fieldA', 'field4' => 'fieldB'
+ # end
+ # discard 'const'
+ # end
+ def min(*args_with_options)
+ composite_aggregator(args_with_options, Java::CascadingOperationAggregator::Min)
+ end
- field_map.each do |in_field, out_field|
- agg = self.send(function, out_field, options)
- every(in_field, :aggregator => agg, :output => all_fields)
- end
- raise "Composite aggregator '#{function.to_s.gsub('_function', '')}' invoked on 0 fields" if field_map.empty?
+ # Computes the maxima of the specified fields within each group. Fields
+ # may be a list or a map for renaming. Note that fields are sorted by
+ # input name when a map is provided.
+ #
+ # The named options are:
+ # [ignore] Java Array of Objects of values to be ignored.
+ #
+ # Examples:
+ # assembly 'aggregate' do
+ # ...
+ # insert 'const' => 1
+ # group_by 'const' do
+ # max 'field1', 'field2'
+ # max 'field3' => 'fieldA', 'field4' => 'fieldB'
+ # end
+ # discard 'const'
+ # end
+ def max(*args_with_options)
+ composite_aggregator(args_with_options, Java::CascadingOperationAggregator::Max)
+ end
+
+ # Returns the first value within each group for the specified fields.
+ # Fields may be a list or a map for renaming. Note that fields are sorted
+ # by input name when a map is provided.
+ #
+ # The named options are:
+ # [ignore] Java Array of Tuples which should be ignored
+ #
+ # Examples:
+ # assembly 'aggregate' do
+ # ...
+ # group_by 'key1', 'key2' do
+ # first 'field1', 'field2'
+ # first 'field3' => 'fieldA', 'field4' => 'fieldB'
+ # end
+ # end
+ def first(*args_with_options)
+ composite_aggregator(args_with_options, Java::CascadingOperationAggregator::First)
end
- def min(*args); composite_aggregator(args, :min_function); end
- def max(*args); composite_aggregator(args, :max_function); end
- def first(*args); composite_aggregator(args, :first_function); end
- def last(*args); composite_aggregator(args, :last_function); end
+ # Returns the last value within each group for the specified fields.
+ # Fields may be a list or a map for renaming. Note that fields are sorted
+ # by input name when a map is provided.
+ #
+ # The named options are:
+ # [ignore] Java Array of Tuples which should be ignored
+ #
+ # Examples:
+ # assembly 'aggregate' do
+ # ...
+ # group_by 'key1', 'key2' do
+ # last 'field1', 'field2'
+ # last 'field3' => 'fieldA', 'field4' => 'fieldB'
+ # end
+ # end
+ def last(*args_with_options)
+ composite_aggregator(args_with_options, Java::CascadingOperationAggregator::Last)
+ end
- # Counts elements of a group. May optionally specify the name of the
- # output count field (defaults to 'count').
+ # Counts elements of each group. May optionally specify the name of the
+ # output count field, which defaults to 'count'.
+ #
+ # Examples:
+ # assembly 'aggregate' do
+ # ...
+ # group_by 'key1', 'key2' do
+ # count
+ # count 'key1_key2_count'
+ # end
+ # end
def count(name = 'count')
count_aggregator = Java::CascadingOperationAggregator::Count.new(fields(name))
count_by = Java::CascadingPipeAssembly::CountBy.new(fields(name))
every(last_grouping_fields, :aggregator => count_aggregator, :output => all_fields, :aggregate_by => count_by)
end
- # Sums one or more fields. Fields to be summed may either be provided as
- # the arguments to sum (in which case they will be aggregated into a field
- # of the same name in the given order), or via a hash using the :mapping
- # parameter (in which case they will be aggregated from the field named by
- # the key into the field named by the value after being sorted). The type
- # of the output sum may be controlled with the :type parameter.
- def sum(*args)
- options = args.extract_options!
+ # Sums the specified fields within each group. Fields may be a list or
+ # provided through the :mapping option for renaming. Note that fields are
+ # sorted by name when a map is provided.
+ #
+ # The named options are:
+ # [mapping] Map of input to output field names if renaming is desired.
+ # Results in output fields sorted by input field.
+ # [type] Controls the type of the output, specified using values from the
+ # Cascading::JAVA_TYPE_MAP as in Janino expressions (:double, :long, etc.)
+ #
+ # Examples:
+ # assembly 'aggregate' do
+ # ...
+ # group_by 'key1', 'key2' do
+ # sum 'field1', 'field2', :type => :long
+ # sum :mapping => { 'field3' => 'fieldA', 'field4' => 'fieldB' }, :type => :double
+ # end
+ # end
+ def sum(*args_with_options)
+ options, in_fields = args_with_options.extract_options!, args_with_options
type = JAVA_TYPE_MAP[options[:type]]
- mapping = options[:mapping] ? options[:mapping].sort : args.zip(args)
+ mapping = options[:mapping] ? options[:mapping].sort : in_fields.zip(in_fields)
mapping.each do |in_field, out_field|
sum_aggregator = Java::CascadingOperationAggregator::Sum.new(*[fields(out_field), type].compact)
# NOTE: SumBy requires a type in wip-286, unlike Sum (see Sum.java line 42 for default)
@@ -158,10 +255,22 @@ def sum(*args)
raise "sum invoked on 0 fields (note :mapping must be provided to explicitly rename fields)" if mapping.empty?
end
- # Averages one or more fields. The contract of average is identical to
- # that of other composite aggregators, but it accepts no options.
- def average(*args)
- field_map, _ = extract_field_map(args)
+ # Averages the specified fields within each group. Fields may be a list or
+ # a map for renaming. Note that fields are sorted by input name when a map
+ # is provided.
+ #
+ # Examples:
+ # assembly 'aggregate' do
+ # ...
+ # insert 'const' => 1
+ # group_by 'const' do
+ # average 'field1', 'field2'
+ # average 'field3' => 'fieldA', 'field4' => 'fieldB'
+ # end
+ # discard 'const'
+ # end
+ def average(*fields_or_field_map)
+ field_map, _ = extract_field_map(fields_or_field_map)
field_map.each do |in_field, out_field|
average_aggregator = Java::CascadingOperationAggregator::Average.new(fields(out_field))
@@ -173,6 +282,42 @@ def average(*args)
private
+ def make_pipe(type, parameters)
+ pipe = type.new(*parameters)
+
+ # Enforce 1 Buffer or >= 1 Aggregator rule
+ if tail_pipe.kind_of?(Java::CascadingPipe::Every)
+ raise 'Buffer must be sole aggregation' if tail_pipe.buffer? || (tail_pipe.aggregator? && pipe.buffer?)
+ end
+
+ @tail_pipe = pipe
+ @scope = Scope.outgoing_scope(tail_pipe, [scope])
+
+ tail_pipe
+ end
+
+ # Builds a series of every pipes for aggregation.
+ #
+ # Args can either be a list of fields to aggregate and an options hash or
+ # a hash that maps input field name to output field name (similar to
+ # insert) and an options hash.
+ #
+ # The named options are:
+ # [ignore] Java Array of Objects (for min and max) or Tuples (for first and
+ # last) of values for the aggregator to ignore.
+ def composite_aggregator(args, aggregator)
+ field_map, options = extract_field_map(args)
+
+ field_map.each do |in_field, out_field|
+ every(
+ in_field,
+ :aggregator => aggregator.new(*[fields(out_field), options[:ignore]].compact),
+ :output => all_fields
+ )
+ end
+ raise "Composite aggregator '#{aggregator}' invoked on 0 fields" if field_map.empty?
+ end
+
# Extracts a field mapping, input field => output field, by accepting a
# hash in the first argument. If no hash is provided, then maps arguments
# onto themselves which names outputs the same as inputs. Additionally
diff --git a/lib/cascading/assembly.rb b/lib/cascading/assembly.rb
index 684b4c1..eb4e032 100644
--- a/lib/cascading/assembly.rb
+++ b/lib/cascading/assembly.rb
@@ -1,15 +1,50 @@
require 'cascading/base'
require 'cascading/operations'
+require 'cascading/identity_operations'
+require 'cascading/filter_operations'
+require 'cascading/regex_operations'
+require 'cascading/text_operations'
require 'cascading/aggregations'
require 'cascading/sub_assembly'
require 'cascading/ext/array'
module Cascading
+ # An Assembly is a sequence of Cascading pipes (Each, GroupBy, CoGroup,
+ # Every, and SubAssembly). This class will serve as your primary mechanism
+ # for doing work within a flow and contains all the functions and filters you
+ # will apply to a pipe (Eaches), as well as group_by, union, and join. For
+ # aggregators and buffers, please see Aggregations.
+ #
+ # Function and filter DSL rules:
+ # * Use positional arguments for required parameters
+ # * Use options = {} for optional parameters
+ # * Use *args sparingly, specifically when you need to accept a varying length list of fields
+ # * If you require both a varying length list of fields and optional parameters, then see the Array#extract_options! extension
+ # * If you choose to name a required parameter, add it to options = {} and throw an exception if the caller does not provide it
+ # * If you have a required parameter that is provided by one of a set of option names, throw an exception if the caller does not provide at least one value (see :function and :filter in Assembly#each for an example)
+ #
+ # Function and filter DSL standard optional parameter names:
+ # [input] c.p.Each argument selector
+ # [into] c.o.Operation field declaration
+ # [output] c.p.Each output selector
+ #
+ # A note on aliases: when a DSL method uniquely wraps a single Cascading
+ # operation, we attempt to provide an alias that matches the Cascading
+ # operation. However, Cascading operations are often nouns rather than verbs,
+ # and the latter are preferable for a dataflow DSL.
class Assembly < Cascading::Node
- include Operations
-
attr_reader :head_pipe, :tail_pipe
+ # Do not use this constructor directly; instead, use Flow#assembly or
+ # Assembly#branch to build assemblies.
+ #
+ # Builds an Assembly given a name, parent, and optional outgoing_scopes
+ # (necessary only for branching).
+ #
+ # An assembly's name is quite important as it will determine:
+ # * The sources from which it will read, if any
+ # * The name to be used in joins or unions downstream
+ # * The name to be used to sink the output of the assembly downstream
def initialize(name, parent, outgoing_scopes = {})
super(name, parent)
@@ -27,6 +62,11 @@ def initialize(name, parent, outgoing_scopes = {})
@incoming_scopes = [scope]
end
+ # Produces a textual description of this Assembly. The description details
+ # the structure of the Assembly, its input and output fields and any
+ # children (branches). The offset parameter allows for this describe to be
+ # nested within a calling context, which lets us indent the structural
+ # hierarchy of a job.
def describe(offset = '')
incoming_scopes_desc = "#{@incoming_scopes.map{ |incoming_scope| incoming_scope.values_fields.to_a.inspect }.join(', ')}"
incoming_scopes_desc = "(#{incoming_scopes_desc})" unless @incoming_scopes.size == 1
@@ -35,199 +75,231 @@ def describe(offset = '')
description
end
+ # Rather than the immediate parent, this method returns the parent flow of
+ # this Assembly. If this is a branch, we must traverse the parents of
+ # parent assemblies.
def parent_flow
return parent if parent.kind_of?(Flow)
parent.parent_flow
end
+ # Accesses the outgoing scope of this Assembly at the point at which it is
+ # called. This is useful for grabbing the values_fields at any point in
+ # the construction of the Assembly. See Scope for details.
def scope
@outgoing_scopes[name]
end
+ # Prints information about the scope of this Assembly at the point at which
+ # it is called. This allows you to trace the propagation of field names
+ # through your job and is handy for debugging. See Scope for details.
def debug_scope
puts "Current scope for '#{name}':\n #{scope}\n----------\n"
end
- def make_pipe(type, parameters)
- @tail_pipe = type.new(*parameters)
- @outgoing_scopes[name] = Scope.outgoing_scope(tail_pipe, [scope])
-
- tail_pipe
- end
- private :make_pipe
-
- def populate_incoming_scopes(assembly_names, group_fields_args = {})
- # NOTE: this overrides the existing incoming_scopes, which changes the
- # way describe will function on this assembly
- pipes, @incoming_scopes, group_fields = [], [], []
- assembly_names.each do |assembly_name|
- assembly = parent_flow.find_child(assembly_name)
- raise "Could not find assembly '#{assembly_name}' from '#{name}'" unless assembly
-
- pipes << assembly.tail_pipe
- @incoming_scopes << assembly.scope
- group_fields << fields(group_fields_args[assembly_name]) if group_fields_args[assembly_name]
- end
- [pipes, group_fields]
- end
- private :populate_incoming_scopes
-
- def apply_aggregations(group, incoming_scopes, &block)
- aggregations = Aggregations.new(self, group, incoming_scopes)
- aggregations.instance_eval(&block) if block_given?
-
- # Sorting of any type means that we cannot use the AggregateBy optimization
- if aggregations.can_aggregate_by? && !group.is_sorted && !group.is_sort_reversed
- grouping_fields = group.key_selectors.values.first
- group.key_selectors.values.each do |key_fields|
- raise "Grouping fields mismatch: #{grouping_fields} expected; #{key_fields} found from #{group.key_selectors}" unless key_fields == grouping_fields
- end
-
- aggregate_by = sub_assembly(Java::CascadingPipeAssembly::AggregateBy.new(
- name,
- group.previous,
- grouping_fields,
- aggregations.aggregate_bys.to_java(Java::CascadingPipeAssembly::AggregateBy)
- ), group.previous, incoming_scopes)
-
- aggregate_by
- else
- aggregations.finalize if block_given?
- @tail_pipe = aggregations.tail_pipe
- @outgoing_scopes[name] = aggregations.scope
-
- group
- end
- end
- private :apply_aggregations
-
+ # Prints detail about this Assembly including its name, head pipe, and tail
+ # pipe.
def to_s
"#{name} : head pipe : #{head_pipe} - tail pipe: #{tail_pipe}"
end
- def prepare_join(*args, &block)
- options = args.extract_options!
-
- pipes, _ = populate_incoming_scopes(args)
-
- group_fields_args = options[:on]
- raise 'join requires :on parameter' unless group_fields_args
-
- if group_fields_args.kind_of?(String)
- group_fields_args = [group_fields_args]
- end
-
- group_fields = []
- if group_fields_args.kind_of?(Array)
- pipes.size.times do
- group_fields << fields(group_fields_args)
- end
- elsif group_fields_args.kind_of?(Hash)
- pipes, group_fields = populate_incoming_scopes(group_fields_args.keys.sort, group_fields_args)
- else
- raise "Unsupported data type for :on in join: '#{group_fields_args.class}'"
- end
-
- raise 'join requires non-empty :on parameter' if group_fields_args.empty?
- group_fields = group_fields.to_java(Java::CascadingTuple::Fields)
- incoming_fields = @incoming_scopes.map{ |s| s.values_fields }
- declared_fields = fields(options[:declared_fields] || dedup_fields(*incoming_fields))
- joiner = options[:joiner]
- is_hash_join = options[:hash] || false
-
- case joiner
- when :inner, 'inner', nil
- joiner = Java::CascadingPipeJoiner::InnerJoin.new
- when :left, 'left'
- joiner = Java::CascadingPipeJoiner::LeftJoin.new
- when :right, 'right'
- joiner = Java::CascadingPipeJoiner::RightJoin.new
- when :outer, 'outer'
- joiner = Java::CascadingPipeJoiner::OuterJoin.new
- when Array
- joiner = joiner.map do |t|
- case t
- when true, 1, :inner then true
- when false, 0, :outer then false
- else fail "invalid mixed joiner entry: #{t}"
- end
- end
- joiner = Java::CascadingPipeJoiner::MixedJoin.new(joiner.to_java(:boolean))
- end
-
- if is_hash_join
- raise ArgumentError, "hash joins don't support aggregations" if block_given?
- parameters = [
- pipes.to_java(Java::CascadingPipe::Pipe),
- group_fields,
- declared_fields,
- joiner
- ]
- group_assembly = Java::CascadingPipe::HashJoin.new(*parameters)
- else
- result_group_fields = dedup_fields(*group_fields)
- parameters = [
- pipes.to_java(Java::CascadingPipe::Pipe),
- group_fields,
- declared_fields,
- result_group_fields,
- joiner
- ]
- group_assembly = Java::CascadingPipe::CoGroup.new(*parameters)
- end
- apply_aggregations(group_assembly, @incoming_scopes, &block)
- end
- private :prepare_join
-
# Builds a HashJoin pipe. This should be used carefully, as the right side
- # of the join is accumulated entirely in memory. Requires a list of assembly
- # names to join and :on to specify the join_fields.
- def hash_join(*args, &block)
- options = args.extract_options!
+ # of the join is accumulated entirely in memory. Requires a list of
+ # assembly names to join and :on to specify the join_fields. Note that a
+ # hash_join "takes over" the Assembly in which it is built, so it is
+ # typically the first statement within the block of the assembly or branch.
+ # Additionally, a hash join does not accept a block for aggregations like
+ # other joins; this restriction is enforced here, but comes directly from
+ # Cascading.
+ #
+ # The named options are:
+ # [on] The keys of the join, an array of strings if they are the same in
+ # all inputs, or a hash mapping assembly names to key names if they
+ # differ across inputs.
+ # [declared_fields] By default, a deduplicated array of incoming field
+ # names (see Cascading::dedup_fields). Specifies the
+ # names of the fields that will be available to
+ # aggregations or post-join if no aggregations are
+ # specified.
+ # [joiner] A specification of the c.p.j.Joiner to use. Values like :inner
+ # and 'inner', :right and 'right' are accepted, as well as an
+ # array specifying mixed joins. Typically, this is not provided,
+ # but one of the higher level join methods on Assembly is used
+ # directly (like Assembly#inner_join or Assembly#right_join).
+ #
+ # Example:
+ # assembly 'join_left_right' do
+ # hash_join 'left', 'right', :on => ['key1', 'key2'], :joiner => :inner
+ # end
+ def hash_join(*args_with_options)
+ raise ArgumentError, "HashJoin doesn't support aggregations so the block provided to hash_join will be ignored" if block_given?
+
+ options, assembly_names = args_with_options.extract_options!, args_with_options
options[:hash] = true
- args << options
- prepare_join(*args, &block)
+ prepare_join(assembly_names, options)
end
# Builds a join (CoGroup) pipe. Requires a list of assembly names to join
- # and :on to specify the group_fields.
- def join(*args, &block)
- options = args.extract_options!
+ # and :on to specify the group_fields. Note that a join "takes over" the
+ # Assembly in which it is built, so it is typically the first statement
+ # within the block of the assembly or branch. The block passed to this
+ # method will be evaluated in the context of Aggregations, not Assembly.
+ #
+ # The named options are:
+ # [on] The keys of the join, an array of strings if they are the same in
+ # all inputs, or a hash mapping assembly names to key names if they
+ # differ across inputs.
+ # [declared_fields] By default, a deduplicated array of incoming field
+ # names (see Cascading::dedup_fields). Specifies the
+ # names of the fields that will be available to
+ # aggregations or post-join if no aggregations are
+ # specified.
+ # [joiner] A specification of the c.p.j.Joiner to use. Values like :inner
+ # and 'inner', :right and 'right' are accepted, as well as an
+ # array specifying mixed joins. Typically, this is not provided,
+ # but one of the higher level join methods on Assembly is used
+ # directly (like Assembly#inner_join or Assembly#right_join).
+ #
+ # Example:
+ # assembly 'join_left_right' do
+ # join 'left', 'right', :on => ['key1', 'key2'], :joiner => :inner do
+ # sum 'val1', 'val2', :type => :long
+ # end
+ # end
+ def join(*args_with_options, &block)
+ options, assembly_names = args_with_options.extract_options!, args_with_options
options[:hash] = false
- args << options
- prepare_join(*args, &block)
+ prepare_join(assembly_names, options, &block)
end
alias co_group join
- def inner_join(*args, &block)
- options = args.extract_options!
+ # Builds an inner join (CoGroup) pipe. Requires a list of assembly names to
+ # join and :on to specify the group_fields.
+ #
+ # The named options are:
+ # [on] The keys of the join, an array of strings if they are the same in
+ # all inputs, or a hash mapping assembly names to key names if they
+ # differ across inputs.
+ # [declared_fields] By default, a deduplicated array of incoming field
+ # names (see Cascading::dedup_fields). Specifies the
+ # names of the fields that will be available to
+ # aggregations or post-join if no aggregations are
+ # specified.
+ #
+ # Example:
+ # assembly 'join_left_right' do
+  #     inner_join 'left', 'right', :on => ['key1', 'key2'] do
+ # sum 'val1', 'val2', :type => :long
+ # end
+ # end
+ def inner_join(*args_with_options, &block)
+ options = args_with_options.extract_options!
options[:joiner] = :inner
- args << options
- join(*args, &block)
+ args_with_options << options
+ join(*args_with_options, &block)
end
- def left_join(*args, &block)
- options = args.extract_options!
+ # Builds a left join (CoGroup) pipe. Requires a list of assembly names to
+ # join and :on to specify the group_fields.
+ #
+ # The named options are:
+ # [on] The keys of the join, an array of strings if they are the same in
+ # all inputs, or a hash mapping assembly names to key names if they
+ # differ across inputs.
+ # [declared_fields] By default, a deduplicated array of incoming field
+ # names (see Cascading::dedup_fields). Specifies the
+ # names of the fields that will be available to
+ # aggregations or post-join if no aggregations are
+ # specified.
+ #
+ # Example:
+ # assembly 'join_left_right' do
+ # left_join 'left', 'right', :on => ['key1', 'key2'] do
+ # sum 'val1', 'val2', :type => :long
+ # end
+ # end
+ def left_join(*args_with_options, &block)
+ options = args_with_options.extract_options!
options[:joiner] = :left
- args << options
- join(*args, &block)
+ args_with_options << options
+ join(*args_with_options, &block)
end
- def right_join(*args, &block)
- options = args.extract_options!
+ # Builds a right join (CoGroup) pipe. Requires a list of assembly names to
+ # join and :on to specify the group_fields.
+ #
+ # The named options are:
+ # [on] The keys of the join, an array of strings if they are the same in
+ # all inputs, or a hash mapping assembly names to key names if they
+ # differ across inputs.
+ # [declared_fields] By default, a deduplicated array of incoming field
+ # names (see Cascading::dedup_fields). Specifies the
+ # names of the fields that will be available to
+ # aggregations or post-join if no aggregations are
+ # specified.
+ #
+ # Example:
+ # assembly 'join_left_right' do
+ # right_join 'left', 'right', :on => ['key1', 'key2'] do
+ # sum 'val1', 'val2', :type => :long
+ # end
+ # end
+ def right_join(*args_with_options, &block)
+ options = args_with_options.extract_options!
options[:joiner] = :right
- args << options
- join(*args, &block)
+ args_with_options << options
+ join(*args_with_options, &block)
end
- def outer_join(*args, &block)
- options = args.extract_options!
+ # Builds an outer join (CoGroup) pipe. Requires a list of assembly names to
+ # join and :on to specify the group_fields.
+ #
+ # The named options are:
+ # [on] The keys of the join, an array of strings if they are the same in
+ # all inputs, or a hash mapping assembly names to key names if they
+ # differ across inputs.
+ # [declared_fields] By default, a deduplicated array of incoming field
+ # names (see Cascading::dedup_fields). Specifies the
+ # names of the fields that will be available to
+ # aggregations or post-join if no aggregations are
+ # specified.
+ #
+ # Example:
+ # assembly 'join_left_right' do
+ # outer_join 'left', 'right', :on => ['key1', 'key2'] do
+ # sum 'val1', 'val2', :type => :long
+ # end
+ # end
+ def outer_join(*args_with_options, &block)
+ options = args_with_options.extract_options!
options[:joiner] = :outer
- args << options
- join(*args, &block)
+ args_with_options << options
+ join(*args_with_options, &block)
end
- # Builds a new branch.
+ # Builds a child Assembly that branches this Assembly given a name and
+ # block.
+ #
+ # An assembly's name is quite important as it will determine:
+ # * The sources from which it will read, if any
+ # * The name to be used in joins or unions downstream
+ # * The name to be used to sink the output of the assembly downstream
+ #
+ # Many branches may be built within an assembly. The result of a branch is
+ # the same as the Flow#assembly constructor, an Assembly object.
+ #
+ # Example:
+ # assembly 'some_work' do
+ # ...
+ #
+ # branch 'more_work' do
+ # ...
+ # end
+ #
+ # branch 'yet_more_work' do
+ # ...
+ # end
+ # end
def branch(name, &block)
raise "Could not build branch '#{name}'; block required" unless block_given?
assembly = Assembly.new(name, self, @outgoing_scopes)
@@ -236,11 +308,27 @@ def branch(name, &block)
assembly
end
- # Builds a new GroupBy pipe that groups on the fields given in args.
- # Any block passed to this method should contain only Everies.
- def group_by(*args, &block)
- options = args.extract_options!
- group_fields = fields(args)
+ # Builds a new GroupBy pipe that groups on the fields given in
+ # args_with_options. The block passed to this method will be evaluated in
+ # the context of Aggregations, not Assembly.
+ #
+ # The named options are:
+ # [sort_by] Optional keys for within-group sort.
+ # [reverse] Boolean that can reverse the order of within-group sorting
+ # (only makes sense given :sort_by keys).
+ #
+ # Example:
+ # assembly 'total' do
+ # ...
+ # insert 'const' => 1
+ # group_by 'const' do
+ # count
+ # sum 'val1', 'val2', :type => :long
+ # end
+ # discard 'const'
+ # end
+ def group_by(*args_with_options, &block)
+ options, group_fields = args_with_options.extract_options!, fields(args_with_options)
sort_fields = fields(options[:sort_by])
reverse = options[:reverse]
@@ -251,16 +339,31 @@ def group_by(*args, &block)
# Unifies multiple incoming pipes sharing the same field structure using a
# GroupBy. Accepts :on like join and :sort_by and :reverse like group_by,
# as well as a block which may be used for a sequence of Every
- # aggregations.
+ # aggregations. The block passed to this method will be evaluated in the
+ # context of Aggregations, not Assembly.
#
# By default, groups only on the first field (see line 189 of GroupBy.java)
- def union(*args, &block)
- options = args.extract_options!
+ #
+ # The named options are:
+ # [on] The keys of the union, which defaults to the first field in the
+ # first input assembly.
+ # [sort_by] Optional keys for sorting.
+ # [reverse] Boolean that can reverse the order of sorting
+ # (only makes sense given :sort_by keys).
+ #
+ # Example:
+ # assembly 'union_left_right' do
+ # union 'left', 'right' do
+ # sum 'val1', 'val2', :type => :long
+ # end
+ # end
+ def union(*args_with_options, &block)
+ options, assembly_names = args_with_options.extract_options!, args_with_options
group_fields = fields(options[:on])
sort_fields = fields(options[:sort_by])
reverse = options[:reverse]
- pipes, _ = populate_incoming_scopes(args)
+ pipes, _ = populate_incoming_scopes(assembly_names)
# Must provide group_fields to ensure field name propagation
group_fields = fields(@incoming_scopes.first.values_fields.get(0)) unless group_fields
@@ -273,10 +376,15 @@ def union(*args, &block)
end
alias :union_pipes :union
- # Allows you to plugin c.p.SubAssemblies to a cascading.jruby Assembly
- # under certain assumptions. Note the default is to extend the tail pipe
- # of this Assembly using a linear SubAssembly. See SubAssembly class for
- # details.
+ # Allows you to plugin c.p.SubAssemblies to an Assembly under certain
+ # assumptions. Note the default is to extend the tail pipe of this
+ # Assembly using a linear SubAssembly. See SubAssembly class for details.
+ #
+ # Example:
+ # assembly 'id_rows' do
+ # ...
+ # sub_assembly Java::CascadingPipeAssembly::Discard.new(tail_pipe, fields('id'))
+ # end
def sub_assembly(sub_assembly, pipes = [tail_pipe], incoming_scopes = [scope])
sub_assembly = SubAssembly.new(self, sub_assembly)
sub_assembly.finalize(pipes, incoming_scopes)
@@ -287,17 +395,24 @@ def sub_assembly(sub_assembly, pipes = [tail_pipe], incoming_scopes = [scope])
sub_assembly
end
- # Builds a basic _each_ pipe, and adds it to the current assembly.
- # --
+ # Builds a basic each pipe and adds it to the current Assembly. Default
+ # arguments are all_fields, a default inherited from c.o.Each. Exactly one
+ # of :function and :filter must be specified and filters do not support an
+ # :output selector.
+ #
+ # The named options are:
+ # [filter] A Cascading Filter, mutually exclusive with :function.
+ # [function] A Cascading Function, mutually exclusive with :filter.
+ # [output] c.p.Each output selector, only valid with :function.
+ #
# Example:
- # each 'line', :function => regex_splitter(['name', 'val1', 'val2', 'id'], :pattern => /[.,]*\s+/), :output => ['id', 'name', 'val1', 'val2']
- def each(*args)
- options = args.extract_options!
-
- in_fields = fields(args)
- out_fields = fields(options[:output])
-
+ # each fields(input_fields), :function => Java::CascadingOperation::Identity.new
+ # each 'field1', 'field2', :function => Java::CascadingOperation::Identity.new
+ def each(*args_with_options)
+ options, in_fields = args_with_options.extract_options!, fields(args_with_options)
+ out_fields = fields(options[:output]) # Default Fields.RESULTS from c.o.Each
operation = options[:filter] || options[:function]
+ raise 'each requires either :filter or :function' unless operation
raise 'c.p.Each does not support applying an output selector to a c.o.Filter' if options[:filter] && options[:output]
parameters = [tail_pipe, in_fields, operation, out_fields].compact
@@ -308,468 +423,156 @@ def each(*args)
each
end
- # Restricts the current assembly to the specified fields.
- # --
- # Example:
- # project "field1", "field2"
- def project(*args)
- each fields(args), :function => Java::CascadingOperation::Identity.new
- end
-
- # Removes the specified fields from the current assembly.
- # --
- # Example:
- # discard "field1", "field2"
- def discard(*args)
- discard_fields = fields(args)
- keep_fields = difference_fields(scope.values_fields, discard_fields)
- project(*keep_fields.to_a)
- end
-
- # Renames fields according to the mapping provided.
- # --
- # Example:
- # rename "old_name" => "new_name"
- def rename(name_map)
- old_names = scope.values_fields.to_a
- new_names = old_names.map{ |name| name_map[name] || name }
- invalid = name_map.keys.sort - old_names
- raise "invalid names: #{invalid.inspect}" unless invalid.empty?
-
- each all_fields, :function => Java::CascadingOperation::Identity.new(fields(new_names))
- end
-
- def cast(type_map)
- names = type_map.keys.sort
- types = JAVA_TYPE_MAP.values_at(*type_map.values_at(*names))
- fields = fields(names)
- types = types.to_java(java.lang.Class)
- each fields, :function => Java::CascadingOperation::Identity.new(fields, types)
- end
-
- def copy(*args)
- options = args.extract_options!
- from = args[0] || all_fields
- into = args[1] || options[:into] || all_fields
- each fields(from), :function => Java::CascadingOperation::Identity.new(fields(into)), :output => all_fields
- end
-
- # A pipe that does nothing.
- def pass(*args)
- each all_fields, :function => Java::CascadingOperation::Identity.new
- end
+ include Operations
+ include IdentityOperations
+ include FilterOperations
+ include RegexOperations
+ include TextOperations
- def assert(*args)
- options = args.extract_options!
- assertion = args[0]
+ # Builds an each assertion pipe given a c.o.a.Assertion and adds it to the
+ # current Assembly.
+ #
+ # The named options are:
+ # [level] The assertion level; defaults to strict.
+ def assert(assertion, options = {})
assertion_level = options[:level] || Java::CascadingOperation::AssertionLevel::STRICT
parameters = [tail_pipe, assertion_level, assertion]
make_pipe(Java::CascadingPipe::Each, parameters)
end
- # Builds a debugging pipe.
- #
- # Without arguments, it generate a simple debug pipe, that prints all tuple to the standard
- # output.
- #
- # The other named options are:
- # * :print_fields a boolean. If is set to true, then it prints every 10 tuples.
- #
- def debug(*args)
- options = args.extract_options!
- print_fields = options[:print_fields] || true
- parameters = [print_fields].compact
- debug = Java::CascadingOperation::Debug.new(*parameters)
- debug.print_tuple_every = options[:tuple_interval] || 1
- debug.print_fields_every = options[:fields_interval] || 10
- each(all_fields, :filter => debug)
- end
-
- # Builds a pipe that assert the size of the tuple is the size specified in parameter.
- #
- # The method accept an unique uname argument : a number indicating the size expected.
- def assert_size_equals(*args)
- options = args.extract_options!
- assertion = Java::CascadingOperationAssertion::AssertSizeEquals.new(args[0])
+ # Builds a pipe that asserts the size of the tuple is the specified size.
+ def assert_size_equals(size, options = {})
+ assertion = Java::CascadingOperationAssertion::AssertSizeEquals.new(size)
assert(assertion, options)
end
- # Builds a pipe that assert the none of the fields in the tuple are null.
- def assert_not_null(*args)
- options = args.extract_options!
+    # Builds a pipe that asserts none of the fields in the tuple are null.
+ def assert_not_null(options = {})
assertion = Java::CascadingOperationAssertion::AssertNotNull.new
assert(assertion, options)
end
- # Builds a _parse_ pipe. This pipe will parse the fields specified in input (first unamed arguments),
- # using a specified regex pattern.
- #
- # If provided, the unamed arguments must be the fields to be parsed. If not provided, then all incoming
- # fields are used.
- #
- # The named options are:
- # * :pattern a string or regex. Specifies the regular expression used for parsing the argument fields.
- # * :output a string or array of strings. Specifies the outgoing fields (all fields will be output by default)
- def parse(*args)
- options = args.extract_options!
- fields = args || all_fields
- pattern = options[:pattern]
- output = options[:output] || all_fields
- each(fields, :function => regex_parser(pattern, options), :output => output)
- end
+ private
- # Builds a pipe that splits a field into other fields, using a specified regular expression.
- #
- # The first unnamed argument is the field to be split.
- # The second unnamed argument is an array of strings indicating the fields receiving the result of the split.
- #
- # The named options are:
- # * :pattern a string or regex. Specifies the regular expression used for splitting the argument fields.
- # * :output a string or array of strings. Specifies the outgoing fields (all fields will be output by default)
- def split(*args)
- options = args.extract_options!
- fields = options[:into] || args[1]
- pattern = options[:pattern] || /[.,]*\s+/
- output = options[:output] || all_fields
- each(args[0], :function => regex_splitter(fields, :pattern => pattern), :output=>output)
- end
-
- # Builds a pipe that splits a field into new rows, using a specified regular expression.
- #
- # The first unnamed argument is the field to be split.
- # The second unnamed argument is the field receiving the result of the split.
- #
- # The named options are:
- # * :pattern a string or regex. Specifies the regular expression used for splitting the argument fields.
- # * :output a string or array of strings. Specifies the outgoing fields (all fields will be output by default)
- def split_rows(*args)
- options = args.extract_options!
- fields = options[:into] || args[1]
- pattern = options[:pattern] || /[.,]*\s+/
- output = options[:output] || all_fields
- each(args[0], :function => regex_split_generator(fields, :pattern => pattern), :output=>output)
- end
-
- # Builds a pipe that emits a new row for each regex group matched in a field, using a specified regular expression.
- #
- # The first unnamed argument is the field to be matched against.
- # The second unnamed argument is the field receiving the result of the match.
- #
- # The named options are:
- # * :pattern a string or regex. Specifies the regular expression used for matching the argument fields.
- # * :output a string or array of strings. Specifies the outgoing fields (all fields will be output by default)
- def match_rows(*args)
- options = args.extract_options!
- fields = options[:into] || args[1]
- pattern = options[:pattern] || /[\w]+/
- output = options[:output] || all_fields
- each(args[0], :function => regex_generator(fields, :pattern => pattern), :output=>output)
- end
-
- # Builds a pipe that parses the specified field as a date using hte provided format string.
- # The unamed argument specifies the field to format.
- #
- # The named options are:
- # * :into a string. It specifies the receiving field. By default, it will be named after
- # the input argument.
- # * :pattern a string. Specifies the date format.
- # * :output a string or array of strings. Specifies the outgoing fields (all fields will be output by default)
- def parse_date(*args)
- options = args.extract_options!
- field = options[:into] || "#{args[0]}_parsed"
- output = options[:output] || all_fields
- pattern = options[:pattern] || "yyyy/MM/dd"
-
- each args[0], :function => date_parser(field, pattern), :output => output
- end
+ def make_pipe(type, parameters)
+ @tail_pipe = type.new(*parameters)
+ @outgoing_scopes[name] = Scope.outgoing_scope(tail_pipe, [scope])
- # Builds a pipe that format a date using a specified format pattern.
- #
- # The unamed argument specifies the field to format.
- #
- # The named options are:
- # * :into a string. It specifies the receiving field. By default, it will be named after
- # the input argument.
- # * :pattern a string. Specifies the date format.
- # * :timezone a string. Specifies the timezone (defaults to UTC).
- # * :output a string or array of strings. Specifies the outgoing fields (all fields will be output by default)
- def format_date(*args)
- options = args.extract_options!
- field = options[:into] || "#{args[0]}_formatted"
- pattern = options[:pattern] || "yyyy/MM/dd"
- output = options[:output] || all_fields
-
- each args[0], :function => date_formatter(field, pattern, options[:timezone]), :output => output
+ tail_pipe
end
- # Builds a pipe that perform a query/replace based on a regular expression.
- #
- # The first unamed argument specifies the input field.
- #
- # The named options are:
- # * :pattern a string or regex. Specifies the pattern to look for in the input field. This non-optional argument
- # can also be specified as a second _unamed_ argument.
- # * :replacement a string. Specifies the replacement.
- # * :output a string or array of strings. Specifies the outgoing fields (all fields will be output by default)
- def replace(*args)
- options = args.extract_options!
-
- pattern = options[:pattern] || args[1]
- replacement = options[:replacement] || args[2]
- into = options[:into] || "#{args[0]}_replaced"
- output = options[:output] || all_fields
-
- each args[0], :function => regex_replace(into, pattern, replacement), :output => output
- end
+ def populate_incoming_scopes(assembly_names, group_fields_args = {})
+ # NOTE: this overrides the existing incoming_scopes, which changes the
+ # way describe will function on this assembly
+ pipes, @incoming_scopes, group_fields = [], [], []
+ assembly_names.each do |assembly_name|
+ assembly = parent_flow.find_child(assembly_name)
+ raise "Could not find assembly '#{assembly_name}' from '#{name}'" unless assembly
- # Builds a pipe that inserts values into the current tuple.
- #
- # The method takes a hash as parameter. This hash contains as keys the names of the fields to insert
- # and as values, the values they must contain. For example:
- #
- # insert {"who" => "Grégoire", "when" => Time.now.strftime("%Y-%m-%d") }
- #
- # will insert two new fields: a field _who_ containing the string "Grégoire", and a field _when_ containing
- # the formatted current date.
- # The methods outputs all fields.
- # The named options are:
- def insert(args)
- args.keys.sort.each do |field_name|
- value = args[field_name]
-
- if value.kind_of?(ExprStub)
- value.validate_scope(scope)
- each all_fields, :function => expression_function(field_name, :expression => value.expression, :parameters => value.types), :output => all_fields
- else
- each all_fields, :function => insert_function([field_name], :values => [value]), :output => all_fields
- end
+ pipes << assembly.tail_pipe
+ @incoming_scopes << assembly.scope
+ group_fields << fields(group_fields_args[assembly_name]) if group_fields_args[assembly_name]
end
+ [pipes, group_fields]
end
- # Builds a pipe that filters the tuples based on an expression or a pattern (but not both !).
- #
- # The first unamed argument, if provided, is a filtering expression (using the Janino syntax).
- #
- # The named options are:
- # * :pattern a string. Specifies a regular expression pattern used to filter the tuples. If this
- # option is provided, then the filter is regular expression-based. This is incompatible with the _expression_ option.
- # * :expression a string. Specifies a Janino expression used to filter the tuples. This option has the
- # same effect than providing it as first unamed argument. If this option is provided, then the filter is Janino
- # expression-based. This is incompatible with the _pattern_ option.
- # * :validate a boolean. Passed into Cascading#expr to enable or disable
- # expression validation. Defaults to true.
- # * :validate_with a hash. Actual arguments used by Cascading#expr for
- # expression validation. Defaults to {}.
- def filter(*args)
- options = args.extract_options!
- from = options.delete(:from) || all_fields
- expression = options.delete(:expression) || args.shift
- regex = options.delete(:pattern)
- validate = options.has_key?(:validate) ? options.delete(:validate) : true
- validate_with = options.has_key?(:validate_with) ? options.delete(:validate_with) : {}
-
- if expression
- stub = expr(expression, { :validate => validate, :validate_with => validate_with })
- types, expression = stub.types, stub.expression
-
- stub.validate_scope(scope)
- each from, :filter => expression_filter(
- :parameters => types,
- :expression => expression
- )
- elsif regex
- each from, :filter => regex_filter(regex, options)
- end
- end
+ def apply_aggregations(group, incoming_scopes, &block)
+ aggregations = Aggregations.new(self, group, incoming_scopes)
+ aggregations.instance_eval(&block) if block_given?
- def filter_null(*args)
- options = args.extract_options!
- each(args, :filter => Java::CascadingOperationFilter::FilterNull.new)
- end
- alias reject_null filter_null
+ # Sorting of any type means that we cannot use the AggregateBy optimization
+ if aggregations.can_aggregate_by? && !group.is_sorted && !group.is_sort_reversed
+ grouping_fields = group.key_selectors.values.first
+ group.key_selectors.values.each do |key_fields|
+ raise "Grouping fields mismatch: #{grouping_fields} expected; #{key_fields} found from #{group.key_selectors}" unless key_fields == grouping_fields
+ end
- def filter_not_null(*args)
- options = args.extract_options!
- each(args, :filter => Java::CascadingOperationFilter::FilterNotNull.new)
- end
- alias where_null filter_not_null
+ aggregate_by = sub_assembly(Java::CascadingPipeAssembly::AggregateBy.new(
+ name,
+ group.previous,
+ grouping_fields,
+ aggregations.aggregate_bys.to_java(Java::CascadingPipeAssembly::AggregateBy)
+ ), group.previous, incoming_scopes)
- # Builds a pipe that rejects the tuples based on an expression.
- #
- # The first unamed argument, if provided, is a filtering expression (using the Janino syntax).
- #
- # The named options are:
- # * :expression a string. Specifies a Janino expression used to filter the tuples. This option has the
- # same effect than providing it as first unamed argument. If this option is provided, then the filter is Janino
- # expression-based.
- # * :validate a boolean. Passed into Cascading#expr to enable or disable
- # expression validation. Defaults to true.
- # * :validate_with a hash. Actual arguments used by Cascading#expr for
- # expression validation. Defaults to {}.
- def reject(*args)
- options = args.extract_options
- raise "Regex not allowed" if options && options[:pattern]
-
- filter(*args)
- end
+ aggregate_by
+ else
+ aggregations.finalize if block_given?
+ @tail_pipe = aggregations.tail_pipe
+ @outgoing_scopes[name] = aggregations.scope
- # Builds a pipe that includes just the tuples matching an expression.
- #
- # The first unamed argument, if provided, is a filtering expression (using the Janino syntax).
- #
- # The named options are:
- # * :expression a string. Specifies a Janino expression used to select the tuples. This option has the
- # same effect than providing it as first unamed argument. If this option is provided, then the filter is Janino
- # expression-based.
- # * :validate a boolean. Passed into Cascading#expr to enable or disable
- # expression validation. Defaults to true.
- # * :validate_with a hash. Actual arguments used by Cascading#expr for
- # expression validation. Defaults to {}.
- def where(*args)
- options = args.extract_options
- raise "Regex not allowed" if options && options[:pattern]
-
- if options[:expression]
- _, imports, expr = options[:expression].match(/^((?:\s*import.*;\s*)*)(.*)$/).to_a
- options[:expression] = "#{imports}!(#{expr})"
- elsif args[0]
- _, imports, expr = args[0].match(/^((?:\s*import.*;\s*)*)(.*)$/).to_a
- args[0] = "#{imports}!(#{expr})"
+ group
end
-
- filter(*args)
end
- # Builds a pipe that evaluates the specified Janino expression and insert it in a new field in the tuple.
- #
- # The named options are:
- # * :from a string or array of strings. Specifies the input fields.
- # * :express a string. The janino expression.
- # * :into a string. Specified the name of the field to insert with the result of the evaluation.
- # * :parameters a hash. Specifies the type mapping for the parameters. See Cascading::Operations.expression_function.
- def eval_expression(*args)
- options = args.extract_options!
-
- into = options.delete(:into)
- from = options.delete(:from) || all_fields
- output = options.delete(:output) || all_fields
- options[:expression] ||= args.shift
- options[:parameters] ||= args.shift
-
- each from, :function => expression_function(into, options), :output=>output
- end
+ def prepare_join(assembly_names, options, &block)
+ pipes, _ = populate_incoming_scopes(assembly_names)
- # Builds a pipe that returns distinct tuples based on the provided fields.
- #
- # The method accepts optional unamed argument specifying the fields to base the distinct on
- # (all fields, by default).
- def distinct(*args)
- raise "Distinct is badly broken"
- fields = args[0] || all_fields
- group_by *fields
- pass
- end
-
- def join_fields(*args)
- options = args.extract_options!
- output = options[:output] || all_fields
+ group_fields_args = options[:on]
+ raise 'join requires :on parameter' unless group_fields_args
- each args, :function => field_joiner(options), :output => output
- end
+ if group_fields_args.kind_of?(String)
+ group_fields_args = [group_fields_args]
+ end
- # Ungroups, or unpivots, a tuple (see Cascading's UnGroup at http://docs.cascading.org/cascading/2.0/javadoc/cascading/operation/function/UnGroup.html).
- #
- # You must provide :key and you must provide only one of :value_selectors
- # and :num_values.
- #
- # The named options are:
- # * :key required array of field names to replicate on every
- # output row in an ungrouped group.
- # * :value_selectors an array of field names to ungroup. Each
- # field will be ungrouped into an output tuple along with the key fields
- # in the order provided.
- # * :num_values an integer specifying the number of fields to
- # ungroup into each output tuple (excluding the key fields). All input
- # fields will be ungrouped.
- # * :input an array of field names that specifies the fields to
- # input to UnGroup. Defaults to all_fields.
- # * :into an array of field names. Default set by UnGroup.
- # * :output an array of field names that specifies the fields to
- # produce as output of UnGroup. Defaults to all_fields.
- def ungroup(*args)
- options = args.extract_options!
- input = options[:input] || all_fields
- into = fields(options[:into])
- output = options[:output] || all_fields
- key = fields(options[:key])
-
- raise 'You must provide exactly one of :value_selectors or :num_values to ungroup' unless options.has_key?(:value_selectors) ^ options.has_key?(:num_values)
- value_selectors = options[:value_selectors].map{ |vs| fields(vs) }.to_java(Java::CascadingTuple::Fields) if options.has_key?(:value_selectors)
- num_values = options[:num_values] if options.has_key?(:num_values)
-
- parameters = [into, key, value_selectors, num_values].compact
- each input, :function => Java::CascadingOperationFunction::UnGroup.new(*parameters), :output => output
- end
+ group_fields = []
+ if group_fields_args.kind_of?(Array)
+ pipes.size.times do
+ group_fields << fields(group_fields_args)
+ end
+ elsif group_fields_args.kind_of?(Hash)
+ pipes, group_fields = populate_incoming_scopes(group_fields_args.keys.sort, group_fields_args)
+ else
+ raise "Unsupported data type for :on in join: '#{group_fields_args.class}'"
+ end
- # Inserts one of two values into the dataflow based upon the result of the
- # supplied filter on the input fields. This is primarily useful for
- # creating indicators from filters.
- #
- # Parameters:
- # * input name of field to apply the filter.
- # * filter Cascading Filter to apply.
- # * keep_value Java value to produce when the filter would keep
- # the given input.
- # * remove_value Java value to produce when the filter would
- # remove the given input.
- #
- # The named options are:
- # * :into an output field name, defaulting to 'filter_value'.
- # * :output an array of field names that specifies the fields to
- # retain in the output tuple. Defaults to all_fields.
- def set_value(input, filter, keep_value, remove_value, params = {})
- into = fields(params[:into] || 'filter_value')
- output = params[:output] || all_fields
- each input, :function => Java::CascadingOperationFunction::SetValue.new(into, filter, keep_value, remove_value), :output => output
- end
+ raise 'join requires non-empty :on parameter' if group_fields_args.empty?
+ group_fields = group_fields.to_java(Java::CascadingTuple::Fields)
+ incoming_fields = @incoming_scopes.map{ |s| s.values_fields }
+ declared_fields = fields(options[:declared_fields] || dedup_fields(*incoming_fields))
+ joiner = options[:joiner]
+ is_hash_join = options[:hash] || false
- # Efficient way of inserting a null indicator for any field, even one that
- # cannot be coerced to a string. This is accomplished using Cascading's
- # FilterNull and SetValue operators rather than Janino. 1 is produced if
- # the field is null and 0 otherwise.
- #
- # Parameters:
- # * input name of field to check for null.
- #
- # The named options are:
- # * :into an output field name, defaulting to 'is_null'.
- # * :output an array of field names that specifies the fields to
- # retain in the output tuple. Defaults to all_fields.
- def null_indicator(input, params = {})
- into = fields(params[:into] || 'is_null')
- output = params[:output] || all_fields
- set_value input, Java::CascadingOperationFilter::FilterNull.new, 1.to_java, 0.to_java, :into => into, :output => output
- end
+ case joiner
+ when :inner, 'inner', nil
+ joiner = Java::CascadingPipeJoiner::InnerJoin.new
+ when :left, 'left'
+ joiner = Java::CascadingPipeJoiner::LeftJoin.new
+ when :right, 'right'
+ joiner = Java::CascadingPipeJoiner::RightJoin.new
+ when :outer, 'outer'
+ joiner = Java::CascadingPipeJoiner::OuterJoin.new
+ when Array
+ joiner = joiner.map do |t|
+ case t
+ when true, 1, :inner then true
+ when false, 0, :outer then false
+ else fail "invalid mixed joiner entry: #{t}"
+ end
+ end
+ joiner = Java::CascadingPipeJoiner::MixedJoin.new(joiner.to_java(:boolean))
+ end
- # Given a field and a regex, returns an indicator that is 1 if the string
- # contains at least 1 match and 0 otherwise.
- #
- # Parameters:
- # * input field name or names that specifies the fields over which
- # to perform the match.
- # * pattern regex to apply to the input.
- #
- # The named options are:
- # * :into an output field name, defaulting to 'regex_contains'.
- # * :output an array of field names that specifies the fields to
- # retain in the output tuple. Defaults to all_fields.
- def regex_contains(input, pattern, params = {})
- input = fields(input)
- pattern = pattern.to_s # Supports JRuby regexes
- into = fields(params[:into] || 'regex_contains')
- output = params[:output] || all_fields
- set_value input, Java::CascadingOperationRegex::RegexFilter.new(pattern), 1.to_java, 0.to_java, :into => into, :output => output
+ if is_hash_join
+ parameters = [
+ pipes.to_java(Java::CascadingPipe::Pipe),
+ group_fields,
+ declared_fields,
+ joiner
+ ]
+ group_assembly = Java::CascadingPipe::HashJoin.new(*parameters)
+ else
+ result_group_fields = dedup_fields(*group_fields)
+ parameters = [
+ pipes.to_java(Java::CascadingPipe::Pipe),
+ group_fields,
+ declared_fields,
+ result_group_fields,
+ joiner
+ ]
+ group_assembly = Java::CascadingPipe::CoGroup.new(*parameters)
+ end
+ apply_aggregations(group_assembly, @incoming_scopes, &block)
end
end
end
diff --git a/lib/cascading/base.rb b/lib/cascading/base.rb
index 80c30aa..bedb5c6 100644
--- a/lib/cascading/base.rb
+++ b/lib/cascading/base.rb
@@ -1,7 +1,22 @@
module Cascading
+ # A Node is a Cascade, Flow, or Assembly, all of which are composite
+ # structures that describe the hierarchical structure of your job. A Cascade
+ # may contain many Flows and a Flow and Assembly may contain many Assemblies
+ # (branches in the case of the Assembly). Nodes are named, contain parent
+ # and child pointers, and keep track of their children both by name and by
+ # insertion order.
+ #
+ # Nodes must be uniquely named within the scope of their parent so that they
+ # can be unambiguously looked up for connecting pipes within a flow. However, we
+ # only ensure that children are uniquely named upon insertion; full
+ # uniqueness isn't required until Node#find_child is called (this allows for
+ # name reuse in a few limited circumstances that was important when migrating
+ # the Etsy workload to enforce these constraints).
class Node
attr_accessor :name, :parent, :children, :child_names, :last_child
+ # A Node requires a name and a parent when it is constructed. Children are
+ # added later with Node#add_child.
def initialize(name, parent)
@name = name
@parent = parent
@@ -23,10 +38,15 @@ def add_child(node)
node
end
+ # The qualified name of a node is formed from the name of all nodes in the
+ # path from the root to that node.
def qualified_name
parent ? "#{parent.qualified_name}.#{name}" : name
end
+ # Produces a textual description of this Node. This method is overridden
+ # by all classes inheriting Node, so it serves mainly as a template for
+ # describing a node with children.
def describe(offset = '')
"#{offset}#{name}:node\n#{child_names.map{ |child| children[child].describe("#{offset} ") }.join("\n")}"
end
@@ -44,6 +64,8 @@ def find_child(name)
all_children_with_name.first
end
+ # Returns the root Node, the topmost parent of the hierarchy (typically a
+ # Cascade or Flow).
def root
return self unless parent
parent.root
diff --git a/lib/cascading/cascade.rb b/lib/cascading/cascade.rb
index 3a9de80..1fa3138 100644
--- a/lib/cascading/cascade.rb
+++ b/lib/cascading/cascade.rb
@@ -2,6 +2,13 @@
require 'yaml'
module Cascading
+ # A Cascade wraps a c.c.Cascade. A Cascade is composed of Flows, which are
+ # constructed using the Cascade#flow method within the block passed to the
+ # Cascading::cascade constructor. Many flows may be nested within a Cascade.
+ #
+ # Note that you are not required to use a Cascade to wrap your job. Instead,
+ # you could start with a top-level Flow, which you might prefer if you have
+ # no need of a c.c.Cascade's make-like semantics wrt sinks.
class Cascade < Cascading::Node
extend Registerable
@@ -10,46 +17,72 @@ class Cascade < Cascading::Node
# Do not use this constructor directly; instead, use Cascading::cascade to
# build cascades.
#
- # Builds a cascade given the specified name. Optionally accepts
- # :properties which will be used as the default properties for all child
- # flows. Properties must be a Ruby Hash with string keys and values and
- # will be copied before being passed into each flow in the cascade. See
- # Cascading::Flow#initialize for details on how flows handle properties.
- # Optionally accepts a :mode which will be used as the default mode for all
- # child flows. See Cascading::Mode.parse for details.
- def initialize(name, params = {})
- @properties = params[:properties] || {}
- @mode = params[:mode]
+ # Builds a Cascade given a name.
+ #
+ # The named options are:
+ # [properties] Properties hash which will be used as the default properties
+ # for all child flows. Properties must be a Ruby Hash with
+ # string keys and values and will be copied before being
+ # passed into each flow in the cascade. See Flow#initialize
+ # for details on how flows handle properties.
+ # [mode] Mode which will be used as the default mode for all child flows.
+ # See Mode.parse for details.
+ def initialize(name, options = {})
+ @properties = options[:properties] || {}
+ @mode = options[:mode]
super(name, nil) # A Cascade cannot have a parent
self.class.add(name, self)
end
- # Builds a child flow given a name and block. Optionally accepts
- # :properties which will override the default properties stroed in this
- # cascade. Optionally accepts a :mode, which will override the default
- # mode stored in this cascade.
- def flow(name, params = {}, &block)
+ # Builds a child Flow in this Cascade given a name and block.
+ #
+ # The named options are:
+ # [properties] Properties hash which will override the default properties
+ # stored in this cascade.
+ # [mode] Mode which will override the default mode stored in this cascade.
+ #
+ # Example:
+ # cascade 'wordcount', :mode => :local do
+ # flow 'first_step' do
+ # ...
+ # end
+ #
+ # flow 'second_step' do
+ # ...
+ # end
+ # end
+ def flow(name, options = {}, &block)
raise "Could not build flow '#{name}'; block required" unless block_given?
- params[:properties] ||= properties.dup
- params[:mode] ||= mode
+ options[:properties] ||= properties.dup
+ options[:mode] ||= mode
- flow = Flow.new(name, self, params)
+ flow = Flow.new(name, self, options)
add_child(flow)
flow.instance_eval(&block)
flow
end
+ # Produces a textual description of this Cascade. The description details
+ # the structure of the Cascade, the sources and sinks of each Flow, and the
+ # input and output fields of each Assembly. The offset parameter allows
+ # for this describe to be nested within a calling context, which lets us
+ # indent the structural hierarchy of a job.
def describe(offset = '')
"#{offset}#{name}:cascade\n#{child_names.map{ |child| children[child].describe("#{offset} ") }.join("\n")}"
end
+ # Writes out the DOT file describing the structure of this Cascade.
+ #
+ # NOTE: will be at Job in later version and also present on Flow
def draw(dir)
@children.each do |name, flow|
flow.connect.writeDOT("#{dir}/#{name}.dot")
end
end
+ # Builds a map, keyed by flow name, of the sink metadata for each child
+ # flow. Currently, this contains only the field names of each sink.
def sink_metadata
@children.inject({}) do |sink_fields, (name, flow)|
sink_fields[name] = flow.sink_metadata
@@ -57,12 +90,16 @@ def sink_metadata
end
end
+ # Writes the mapping produced by Cascade#sink_metadata to a file at the
+ # given path in YAML.
def write_sink_metadata(file_name)
File.open(file_name, 'w') do |file|
YAML.dump(sink_metadata, file)
end
end
+ # Connects this Cascade, producing a c.c.Cascade, which is then completed,
+ # executing it. Child flows are connected, so no parameters are required.
def complete
begin
Java::CascadingCascade::CascadeConnector.new.connect(name, make_flows(@children)).complete
diff --git a/lib/cascading/cascading.rb b/lib/cascading/cascading.rb
index 763d313..59846c4 100644
--- a/lib/cascading/cascading.rb
+++ b/lib/cascading/cascading.rb
@@ -1,6 +1,33 @@
+require 'cascading/cascade'
+require 'cascading/flow'
require 'cascading/expr_stub'
+# The Cascading module contains all of the cascading.jruby DSL. Inserting the
+# following into your script:
+# require 'rubygems'
+# require 'cascading'
+# includes this module at the top level, making all of its features available.
+#
+# To build a dataflow like the one in the README.md or
+# {samples}[http://github.com/mrwalker/cascading.jruby/tree/master/samples],
+# start by looking at Cascade or Flow. These are the
+# highest level structures you'll use to put together your job.
+#
+# Within a flow, you'll connect sources to sinks by way of Assembly, which
+# refers to "pipe assemblies" from Cascading. Within an Assembly, you'll use
+# functions and filters (see Operations, IdentityOperations, RegexOperations,
+# FilterOperations, and TextOperations) as well as Assembly#group_by,
+# Assembly#union, and Assembly#join. You can provide those last pipes with a
+# block that can select operations from Aggregations.
+#
+# Finally, you'll want to address the execution of your job, whether it be
+# locally testing or running remotely on a Hadoop cluster. See the Mode class
+# for the available modes, and parameterize your script such that it can operate
+# in Cascading local mode locally and in Hadoop mode when run in a jar produced
+# with {Jading}[http://github.com/mrwalker/jading].
module Cascading
+ # Mapping that defines a convenient syntax for specifying Java classes, used
+ # in Janino expressions and elsewhere.
JAVA_TYPE_MAP = {
:int => java.lang.Integer.java_class, :long => java.lang.Long.java_class,
:bool => java.lang.Boolean.java_class, :double => java.lang.Double.java_class,
@@ -24,44 +51,84 @@ module Cascading
# directly building their own cascades and flows so that jading can send them
# default properties.
- # Builds a top-level cascade given a name and a block. Optionally accepts a
- # :mode, as explained in Cascading::Cascade#initialize.
- def cascade(name, params = {}, &block)
+ # Builds a top-level Cascade given a name and a block.
+ #
+ # The named options are:
+ # [properties] See Cascade#initialize
+ # [mode] See Cascade#initialize
+ #
+ # Example:
+ # cascade 'wordcount', :mode => :local do
+ # flow 'first_step' do
+ # ...
+ # end
+ #
+ # flow 'second_step' do
+ # ...
+ # end
+ # end
+ def cascade(name, options = {}, &block)
raise "Could not build cascade '#{name}'; block required" unless block_given?
- raise 'Cascading::cascade does not accept the :properties param only the global $jobconf_properties' if params[:properties]
+ raise 'Cascading::cascade does not accept the :properties param only the global $jobconf_properties' if options[:properties]
- params[:properties] = $jobconf_properties.dup if defined?($jobconf_properties) && $jobconf_properties
+ options[:properties] = $jobconf_properties.dup if defined?($jobconf_properties) && $jobconf_properties
- cascade = Cascade.new(name, params)
+ cascade = Cascade.new(name, options)
cascade.instance_eval(&block)
cascade
end
- # Builds a top-level flow given a name and block for applications built of
- # flows with no cascades. Optionally accepts a :mode, as explained in
- # Cascading::Flow#initialize.
- def flow(name, params = {}, &block)
+ # Builds a top-level Flow given a name and block for applications built of
+ # flows with no cascades.
+ #
+ # The named options are:
+ # [properties] See Flow#initialize
+ # [mode] See Flow#initialize
+ #
+ # Example:
+ # flow 'wordcount', :mode => :local do
+ # assembly 'first_step' do
+ # ...
+ # end
+ #
+ # assembly 'second_step' do
+ # ...
+ # end
+ # end
+ def flow(name, options = {}, &block)
raise "Could not build flow '#{name}'; block required" unless block_given?
- raise 'Cascading::flow does not accept the :properties param only the global $jobconf_properties' if params[:properties]
+ raise 'Cascading::flow does not accept the :properties param only the global $jobconf_properties' if options[:properties]
- params[:properties] = $jobconf_properties.dup if defined?($jobconf_properties) && $jobconf_properties
+ options[:properties] = $jobconf_properties.dup if defined?($jobconf_properties) && $jobconf_properties
- flow = Flow.new(name, nil, params)
+ flow = Flow.new(name, nil, options)
flow.instance_eval(&block)
flow
end
+ # Produces a textual description of all Cascades in the global registry. The
+ # description details the structure of the Cascades, the sources and sinks of
+ # each Flow, and the input and output fields of each Assembly.
+ #
+ # NOTE: will be moved to Job in later version
def describe
Cascade.all.map{ |cascade| cascade.describe }.join("\n")
end
alias desc describe
# See ExprStub.expr
- def expr(expression, params = {})
- ExprStub.expr(expression, params)
+ def expr(expression, options = {})
+ ExprStub.expr(expression, options)
end
- # Creates a cascading.tuple.Fields instance from a string or an array of strings.
+ # Utility method for creating Cascading c.t.Fields from a field name (string)
+ # or list of field names (array of strings). If the input is already a
+ # c.t.Fields or nil, it is passed through. This allows for flexible use of
+ # the method at multiple layers in the DSL.
+ #
+ # Example:
+ # cascading_fields = fields(['first', 'second', 'third'])
+ # # cascading_fields.to_a == ['first', 'second', 'third']
def fields(fields)
if fields.nil?
return nil
@@ -76,27 +143,45 @@ def fields(fields)
return Java::CascadingTuple::Fields.new([fields].flatten.map{ |f| f.kind_of?(Fixnum) ? java.lang.Integer.new(f) : f }.to_java(java.lang.Comparable))
end
+ # Convenience method wrapping c.t.Fields::ALL
def all_fields
Java::CascadingTuple::Fields::ALL
end
- def union_fields(*fields)
- fields(fields.inject([]){ |acc, arr| acc | arr.to_a })
- end
-
- def difference_fields(*fields)
- fields(fields[1..-1].inject(fields.first.to_a){ |acc, arr| acc - arr.to_a })
+ # Convenience method wrapping c.t.Fields::VALUES
+ def last_grouping_fields
+ Java::CascadingTuple::Fields::VALUES
end
- def copy_fields(fields)
- fields.select(all_fields)
+ # Computes fields formed by removing remove_fields from base_fields. Operates
+ # only on named fields, not positional fields.
+ #
+ # Example:
+ # base_fields = fields(['a', 'b', 'c'])
+ # remove_fields = fields(['b'])
+ # result_fields = difference_fields(base_fields, remove_fields)
+ # # result_fields.to_a == ['a', 'c']
+ def difference_fields(base_fields, remove_fields)
+ fields(base_fields.to_a - remove_fields.to_a)
end
+ # Combines fields deduplicating them with trailing underscores as necessary.
+ # This is used in joins to avoid requiring the caller to unique fields before
+ # they are joined.
def dedup_fields(*fields)
raise 'Can only be applied to declarators' unless fields.all?{ |f| f.is_declarator? }
fields(dedup_field_names(*fields.map{ |f| f.to_a }))
end
+ # Helper used by dedup_fields that operates on arrays of field names rather
+ # than fields objects.
+ #
+ # Example:
+ # left_names = ['a', 'b']
+ # mid_names = ['a', 'c']
+ # right_names = ['a', 'd']
+ # deduped_names = dedup_field_names(left_names, mid_names, right_names)
+ # # deduped_names == ['a', 'b', 'a_', 'c', 'a__', 'd']
def dedup_field_names(*names)
names.inject([]) do |acc, arr|
acc + arr.map{ |e| search_field_name(acc, e) }
@@ -106,30 +191,22 @@ def dedup_field_names(*names)
def search_field_name(names, candidate)
names.include?(candidate) ? search_field_name(names, "#{candidate}_") : candidate
end
-
- def last_grouping_fields
- Java::CascadingTuple::Fields::VALUES
- end
-
- def results_fields
- Java::CascadingTuple::Fields::RESULTS
- end
+ private :search_field_name
# Creates a TextLine scheme (can be used in both Cascading local and hadoop
- # modes). Positional args are used if :source_fields is not
- # provided.
+ # modes). Positional args are used if :source_fields is not provided.
#
# The named options are:
- # * :source_fields a string or array of strings. Specifies the
- # fields to be read from a source with this scheme. Defaults to ['offset', 'line'].
- # * :sink_fields a string or array of strings. Specifies the fields
- # to be written to a sink with this scheme. Defaults to all_fields.
- # * :compression a symbol, either :enable or
- # :disable, that governs the TextLine scheme's compression. Defaults
- # to the default TextLine compression (only applies to c.s.h.TextLine).
- def text_line_scheme(*args)
- options = args.extract_options!
- source_fields = fields(options[:source_fields] || (args.empty? ? ['offset', 'line'] : args))
+ # [source_fields] Fields to be read from a source with this scheme. Defaults
+ # to ['offset', 'line'].
+ # [sink_fields] Fields to be written to a sink with this scheme. Defaults to
+ # all_fields.
+ # [compression] A symbol, either :enable or :disable, that
+ # governs the TextLine scheme's compression. Defaults to the
+ # default TextLine compression (only applies to c.s.h.TextLine).
+ def text_line_scheme(*args_with_options)
+ options, source_fields = args_with_options.extract_options!, args_with_options
+ source_fields = fields(options[:source_fields] || (source_fields.empty? ? ['offset', 'line'] : source_fields))
sink_fields = fields(options[:sink_fields]) || all_fields
sink_compression = case options[:compression]
when :enable then Java::CascadingSchemeHadoop::TextLine::Compress::ENABLE
@@ -153,17 +230,30 @@ def sequence_file_scheme(*fields)
}
end
+ # Convenience access to MultiTap.multi_source_tap. This constructor is more
+ # "DSL-like" because it allows you to pass taps directly as actual args rather
+ # than in an array:
+ # multi_source_tap tap1, tap2, tap3, ..., tapn
+ #
+ # See MultiTap.multi_source_tap for more details.
def multi_source_tap(*taps)
MultiTap.multi_source_tap(taps)
end
+ # Convenience access to MultiTap.multi_sink_tap. This constructor is more
+ # "DSL-like" because it allows you to pass taps directly as actual args rather
+ # than in an array:
+ # multi_sink_tap tap1, tap2, tap3, ..., tapn
+ #
+ # See MultiTap.multi_sink_tap for more details.
def multi_sink_tap(*taps)
MultiTap.multi_sink_tap(taps)
end
- # Creates a Cascading::Tap given a path and optional :scheme and :sink_mode.
- def tap(path, params = {})
- Tap.new(path, params)
+ # Convenience constructor for a Tap, that accepts the same options as that
+ # class' constructor. See Tap for more details.
+ def tap(path, options = {})
+ Tap.new(path, options)
end
# Constructs properties to be passed to Flow#complete or Cascade#complete
diff --git a/lib/cascading/expr_stub.rb b/lib/cascading/expr_stub.rb
index 014f70f..d1f96ae 100644
--- a/lib/cascading/expr_stub.rb
+++ b/lib/cascading/expr_stub.rb
@@ -3,15 +3,15 @@ class ExprStub
attr_accessor :expression, :types, :input_expression
# ExprStub requires a Janino expression decorated with field types. For
- # example: '"Found: " + (x:int + y:int) + " " + z:string'. Type names are
- # defined in Cascading::JAVA_TYPE_MAP.
+ # example:
+ # expr('"Found: " + (x:int + y:int) + " " + z:string')
+ # Type names are defined in Cascading::JAVA_TYPE_MAP.
def initialize(expression)
@input_expression = expression
@expression = expression.dup
@types = {}
# Simple regexp based parser for types
-
JAVA_TYPE_MAP.each do |sym, klass|
@expression.gsub!(/[A-Za-z0-9_]+:#{sym.to_s}/) do |match|
name = match.split(/:/).first.gsub(/\s+/, "")
@@ -21,21 +21,38 @@ def initialize(expression)
end
end
+ # Extract Java names and types from @types hash. Cascading constructors
+ # often require two separate Java Arrays in this fashion.
+ def names_and_types
+ names, types = split_hash(@types)
+ [names.to_java(java.lang.String), types.to_java(java.lang.Class)]
+ end
+
+ # Prints the original input expression.
def to_s
@input_expression
end
# Convenience constructor for an ExprStub that optionally performs
# validation. Takes a string to use as a Janino expression and an optional
- # params hash. By default, the param :validate is set to true (performs
- # expression validation using default actual argument values) and the param
- # :validate_with is set to {} (which doesn't override any of the default
- # actual argument values used for validation).
- def self.expr(expression, params = {})
- params = { :validate => true, :validate_with => {} }.merge(params)
+ # options hash.
+ #
+ # The named options are:
+ # [validate] A boolean indicating whether expression validation using
+ # default actual argument values should be performed. Defaults
+ # to true.
+ # [validate_with] A hash mapping field names (or symbols) to the value that
+ # should be used for validation. Strings default to nil,
+ # so if you have previously filtered nulls you might use a
+ # marker value like 'nulls_filtered'. Defaults to {}.
+ #
+ # Example:
+ # insert 'x_eq_y' => expr('x:string.equals(y:string)', :validate_with => { :x => 'nulls_filtered' })
+ def self.expr(expression, options = {})
+ options = { :validate => true, :validate_with => {} }.merge(options)
expr_stub = expression.kind_of?(ExprStub) ? expression : ExprStub.new(expression).compile
- expr_stub.validate(params[:validate_with]) if params[:validate]
- puts "Expression validation is disabled for '#{expression}'" unless params[:validate]
+ expr_stub.validate(options[:validate_with]) if options[:validate]
+ puts "Expression validation is disabled for '#{expression}'" unless options[:validate]
expr_stub
end
@@ -68,6 +85,9 @@ def validate(actual_args = {})
self.eval(test_values.merge(actual_args))
end
+ # Given a scope, validates that the fields required by this ExprStub are
+ # available in the values fields of the scope. Returns those values fields
+ # which are unused in the expression.
def validate_scope(scope)
validate_fields(scope.values_fields.to_a)
end
@@ -113,12 +133,6 @@ def evaluator
end
end
- # Extract Java names and types from @types hash
- def names_and_types
- names, types = split_hash(@types)
- [names.to_java(java.lang.String), types.to_java(java.lang.Class)]
- end
-
# Makes best effort to convert Ruby numbers into the Java numeric type
# exepcted by a Janino expression. However, if the conversion fails, it
# returns the original value so that the exception thrown will be from
diff --git a/lib/cascading/ext/array.rb b/lib/cascading/ext/array.rb
index 9f7fd56..c8def8e 100644
--- a/lib/cascading/ext/array.rb
+++ b/lib/cascading/ext/array.rb
@@ -1,8 +1,25 @@
+# Extensions to Arrays in support of variable length lists of field names. This
+# is not pretty, but supports DSL features like:
+# group_by 'field1', 'field2', :sort_by => 'field3' do
+# ...
+# end
+#
+# The most obvious limitation of the approach is that function definitions of
+# the form f(*args_with_options) are not self-documenting. To compensate for
+# this, documentation of all arguments and optional parameters must be provided
+# on the DSL method.
class Array
+ # Use this extension to extract the optional parameters from a
+ # *args_with_options argument.
+ # So if you have a function:
+ # def f(*args_with_options)
+ # You can destructively process the args_with_options as follows:
+ # options, just_args = args_with_options.extract_options!, args_with_options
def extract_options!
last.is_a?(::Hash) ? pop : {}
end
+ # Non-destructive form of Array#extract_options!
def extract_options
last.is_a?(::Hash) ? last : {}
end
diff --git a/lib/cascading/filter_operations.rb b/lib/cascading/filter_operations.rb
new file mode 100644
index 0000000..8ae26d8
--- /dev/null
+++ b/lib/cascading/filter_operations.rb
@@ -0,0 +1,101 @@
+module Cascading
+ # Module of filtering operations. Unlike some of the other functional
+ # operations modules, this one does not just wrap operations defined by
+ # Cascading in cascading.operation.filter. Instead, it provides some useful
+ # high-level DSL pipes which map many Cascading operations into a smaller
+ # number of DSL statements.
+ #
+ # Still, some are direct wrappers:
+ # filter\_null:: {FilterNull}[http://docs.cascading.org/cascading/2.1/javadoc/cascading/operation/filter/FilterNull.html]
+ # filter\_not\_null:: {FilterNotNull}[http://docs.cascading.org/cascading/2.1/javadoc/cascading/operation/filter/FilterNotNull.html]
+ module FilterOperations
+ # Filter the current assembly based on an expression or regex, but not both.
+ #
+ # The named options are:
+ # [expression] A Janino expression used to filter. Has access to all :input
+ # fields.
+ # [validate] Boolean passed to Cascading#expr to enable or disable
+ # expression validation. Defaults to true.
+ # [validate_with] Hash mapping field names to actual arguments used by
+ # Cascading#expr for expression validation. Defaults to {}.
+ # [regex] A regular expression used to filter.
+ # [remove_match] Boolean indicating if regex matches should be removed or
+ # kept. Defaults to false, which is a bit counterintuitive.
+ # [match_each_element] Boolean indicating if regex should match entire
+ # incoming tuple (joined with tabs) or each field
+ # individually. Defaults to false.
+ #
+ # Example:
+ # filter :input => 'field1', :regex => /\t/, :remove_match => true
+ # filter :expression => 'field1:long > 0 && "".equals(field2:string)'
+ def filter(options = {})
+ input_fields = options[:input] || all_fields
+ expression = options[:expression]
+ regex = options[:regex]
+
+ if expression
+ validate = options.has_key?(:validate) ? options[:validate] : true
+ validate_with = options[:validate_with] || {}
+
+ stub = expr(expression, { :validate => validate, :validate_with => validate_with })
+ stub.validate_scope(scope)
+
+ names, types = stub.names_and_types
+ each input_fields, :filter => Java::CascadingOperationExpression::ExpressionFilter.new(
+ stub.expression,
+ names,
+ types
+ )
+ elsif regex
+ parameters = [regex.to_s, options[:remove_match], options[:match_each_element]].compact
+ each input_fields, :filter => Java::CascadingOperationRegex::RegexFilter.new(*parameters)
+ else
+ raise 'filter requires one of :expression or :regex'
+ end
+ end
+
+ # Rejects tuples from the current assembly based on a Janino expression.
+ # This is just a wrapper for FilterOperations.filter.
+ #
+ # Example:
+ # reject 'field1:long > 0 && "".equals(field2:string)'
+ def reject(expression, options = {})
+ options[:expression] = expression
+ filter(options)
+ end
+
+ # Keeps tuples from the current assembly based on a Janino expression. This
+ # is a wrapper for FilterOperations.filter.
+ #
+ # Note that this is accomplished by inverting the given expression, and best
+ # attempt is made to support import statements prior to the expression. If
+ # this support should break, simply negate your expression and use
+ # FilterOperations.reject.
+ #
+ # Example:
+ # where 'field1:long > 0 && "".equals(field2:string)'
+ def where(expression, options = {})
+ _, imports, expr = expression.match(/^((?:\s*import.*;\s*)*)(.*)$/).to_a
+ options[:expression] = "#{imports}!(#{expr})"
+ filter(options)
+ end
+
+ # Rejects tuples from the current assembly if any input field is null.
+ #
+ # Example:
+ # filter_null 'field1', 'field2'
+ def filter_null(*input_fields)
+ each(input_fields, :filter => Java::CascadingOperationFilter::FilterNull.new)
+ end
+ alias reject_null filter_null
+
+ # Rejects tuples from the current assembly if any input field is not null.
+ #
+ # Example:
+ # filter_not_null 'field1', 'field2'
+ def filter_not_null(*input_fields)
+ each(input_fields, :filter => Java::CascadingOperationFilter::FilterNotNull.new)
+ end
+ alias where_null filter_not_null
+ end
+end
diff --git a/lib/cascading/flow.rb b/lib/cascading/flow.rb
index 631cc96..8254264 100644
--- a/lib/cascading/flow.rb
+++ b/lib/cascading/flow.rb
@@ -1,6 +1,10 @@
require 'cascading/assembly'
module Cascading
+ # A Flow wraps a c.f.Flow. A Flow is composed of Assemblies, which are
+ # constructed using the Flow#assembly method within the block passed to the
+ # Cascading::flow or Cascade#flow constructor. Many Assemblies may be nested
+ # within a Flow.
class Flow < Cascading::Node
extend Registerable
@@ -10,23 +14,46 @@ class Flow < Cascading::Node
# Do not use this constructor directly. Instead, use Cascading::flow to
# build top-level flows and Cascade#flow to build flows within a Cascade.
#
- # Builds a flow given a name and a parent node (a cascade or nil).
- # Optionally accepts :properties which allows external configuration of
- # this flow. The flow will side-effect the properties during composition,
- # then pass the modified properties along to the FlowConnector for
- # execution. See Cascading::Cascade#initialize for details on how
- # properties are propagated through cascades. Optionally accepts a :mode
- # which will determine the execution mode of this flow. See
- # Cascading::Mode.parse for details.
- def initialize(name, parent, params = {})
+ # Builds a Flow given a name and a parent node (a Cascade or nil).
+ #
+ # The named options are:
+ # [properties] Properties hash which allows external configuration of this
+ # flow. The flow will side-effect the properties during
+ # composition, then pass the modified properties along to the
+ # FlowConnector for execution. See Cascade#initialize for
+ # details on how properties are propagated through cascades.
+ # [mode] Mode which will determine the execution mode of this flow. See
+ # Mode.parse for details.
+ def initialize(name, parent, options = {})
@sources, @sinks, @incoming_scopes, @outgoing_scopes, @listeners = {}, {}, {}, {}, []
- @properties = params[:properties] || {}
- @mode = Mode.parse(params[:mode])
+ @properties = options[:properties] || {}
+ @mode = Mode.parse(options[:mode])
@flow_scope = Scope.flow_scope(name)
super(name, parent)
self.class.add(name, self)
end
+ # Builds a child Assembly in this Flow given a name and block.
+ #
+ # An assembly's name is quite important as it will determine:
+ # * The sources from which it will read, if any
+ # * The name to be used in joins or unions downstream
+ # * The name to be used to sink the output of the assembly downstream
+ #
+ # Many assemblies may be built within a flow. The Assembly#branch method
+ # is used for creating nested assemblies and produces objects of the same
+ # type as this constructor.
+ #
+ # Example:
+ # flow 'wordcount', :mode => :local do
+ # assembly 'first_step' do
+ # ...
+ # end
+ #
+ # assembly 'second_step' do
+ # ...
+ # end
+ # end
def assembly(name, &block)
raise "Could not build assembly '#{name}'; block required" unless block_given?
assembly = Assembly.new(name, self, @outgoing_scopes)
@@ -49,6 +76,11 @@ def sink(name, tap)
sinks[name] = tap
end
+ # Produces a textual description of this Flow. The description details the
+ # structure of the Flow, its sources and sinks, and the input and output
+ # fields of each Assembly. The offset parameter allows for this describe
+ # to be nested within a calling context, which lets us indent the
+ # structural hierarchy of a job.
def describe(offset = '')
description = "#{offset}#{name}:flow\n"
description += "#{sources.keys.map{ |source| "#{offset} #{source}:source :: #{incoming_scopes[source].values_fields.to_a.inspect}" }.join("\n")}\n"
@@ -57,18 +89,28 @@ def describe(offset = '')
description
end
+ # Accesses the outgoing scope of this Flow at the point at which it is
+ # called by default, or for the child specified by the given name, if
+ # specified. This is useful for grabbing the values_fields at any point in
+ # the construction of the Flow. See Scope for details.
def scope(name = nil)
raise 'Must specify name if no children have been defined yet' unless name || last_child
name ||= last_child.name
@outgoing_scopes[name]
end
+ # Prints information about the scope of this Flow at the point at which it
+ # is called by default, or for the child specified by the given name, if
+ # specified. This allows you to trace the propagation of field names
+ # through your job and is handy for debugging. See Scope for details.
def debug_scope(name = nil)
scope = scope(name)
name ||= last_child.name
puts "Scope for '#{name}':\n #{scope}"
end
+ # Builds a map, keyed by sink name, of the sink metadata for each sink.
+ # Currently, this contains only the field names of each sink.
def sink_metadata
@sinks.keys.inject({}) do |sink_metadata, sink_name|
raise "Cannot sink undefined assembly '#{sink_name}'" unless @outgoing_scopes[sink_name]
@@ -79,7 +121,16 @@ def sink_metadata
end
end
- # TODO: support all codecs, support list of codecs
+ # Property modifier that sets the codec and type of the compression for all
+ # sinks in this flow. Currently only supports o.a.h.i.c.DefaultCodec and
+ # o.a.h.i.c.GzipCodec, and the NONE, RECORD, or BLOCK compression
+ # types defined in o.a.h.i.SequenceFile.
+ #
+ # codec may be symbols like :default or :gzip and type may be symbols like
+ # :none, :record, or :block.
+ #
+ # Example:
+ # compress_output :default, :block
def compress_output(codec, type)
properties['mapred.output.compress'] = 'true'
properties['mapred.output.compression.codec'] = case codec
@@ -95,22 +146,28 @@ def compress_output(codec, type)
end
end
+ # Set the cascading.spill.list.threshold property in this flow's
+ # properties. See c.t.c.SpillableProps for details.
def set_spill_threshold(threshold)
- properties['cascading.cogroup.spill.threshold'] = threshold.to_s
+ properties['cascading.spill.list.threshold'] = threshold.to_s
end
+ # Adds the given path to the mapred.cache.files list property.
def add_file_to_distributed_cache(file)
add_to_distributed_cache(file, "mapred.cache.files")
end
+ # Adds the given path to the mapred.cache.archives list property.
def add_archive_to_distributed_cache(file)
add_to_distributed_cache(file, "mapred.cache.archives")
end
+ # Appends a FlowListener to the list of listeners for this flow.
def add_listener(listener)
@listeners << listener
end
+ # Handles locating a file cached from S3 on local disk. TODO: remove
def emr_local_path_for_distributed_cache_file(file)
# NOTE this needs to be *appended* to the property mapred.local.dir
if file =~ /^s3n?:\/\//
@@ -122,16 +179,9 @@ def emr_local_path_for_distributed_cache_file(file)
end
end
- def add_to_distributed_cache(file, property)
- v = properties[property]
-
- if v
- properties[property] = [v.split(/,/), file].flatten.join(",")
- else
- properties[property] = file
- end
- end
-
+ # Connects this Flow, producing a c.f.Flow without completing it (the Flow
+ # is not executed). This method is used by Cascade to connect its child
+ # Flows. To connect and complete a Flow, see Flow#complete.
def connect
puts "Connecting flow '#{name}' with properties:"
properties.keys.sort.each do |key|
@@ -149,6 +199,9 @@ def connect
mode.connect_flow(properties, name, sources, sinks, pipes)
end
+ # Completes this Flow after connecting it. This results in execution of
+ # the c.f.Flow built from this Flow. Use this method when executing a
+ # top-level Flow.
def complete
begin
flow = connect
@@ -161,6 +214,16 @@ def complete
private
+ def add_to_distributed_cache(file, property)
+ v = properties[property]
+
+ if v
+ properties[property] = [v.split(/,/), file].flatten.join(",")
+ else
+ properties[property] = file
+ end
+ end
+
def make_tap_parameter(taps, pipe_accessor)
taps.inject({}) do |map, (name, tap)|
assembly = find_child(name)
diff --git a/lib/cascading/identity_operations.rb b/lib/cascading/identity_operations.rb
new file mode 100644
index 0000000..8f2e7ea
--- /dev/null
+++ b/lib/cascading/identity_operations.rb
@@ -0,0 +1,82 @@
+module Cascading
+ # Module of pipe assemblies that wrap the Cascading Identity operation. These
+ # are split out only to group similar functionality.
+ module IdentityOperations
+ # Restricts the current assembly to the specified fields in the order in
+ # which they are specified (can be used to reorder fields).
+ #
+ # Example:
+ # project 'field1', 'field2'
+ def project(*input_fields)
+ each fields(input_fields), :function => Java::CascadingOperation::Identity.new
+ end
+
+ # Removes the specified fields from the current assembly.
+ #
+ # Example:
+ # discard 'field1', 'field2'
+ def discard(*input_fields)
+ discard_fields = fields(input_fields)
+ keep_fields = difference_fields(scope.values_fields, discard_fields)
+ project(*keep_fields.to_a)
+ end
+
+ # Renames fields according to the mapping provided, preserving the original
+ # field order. Throws an exception if non-existent fields are specified.
+ #
+ # Example:
+ # rename 'field1' => 'fieldA', 'field2' => 'fieldB'
+ #
+ # Produces: ['fieldA', 'fieldB'], assuming those were the only 2 input
+ # fields.
+ def rename(name_map)
+ original_fields = scope.values_fields.to_a
+ invalid = name_map.keys - original_fields
+ raise "Invalid field names in rename: #{invalid.inspect}" unless invalid.empty?
+
+ renamed_fields = original_fields.map{ |name| name_map[name] || name }
+
+ each original_fields, :function => Java::CascadingOperation::Identity.new(fields(renamed_fields))
+ end
+
+ # Coerces fields to the Java type selected from Cascading::JAVA_TYPE_MAP.
+ #
+ # Example:
+ # cast 'field1' => :int, 'field2' => :double
+ def cast(type_map)
+ input_fields = type_map.keys.sort
+ types = JAVA_TYPE_MAP.values_at(*type_map.values_at(*input_fields))
+ input_fields = fields(input_fields)
+ types = types.to_java(java.lang.Class)
+ each input_fields, :function => Java::CascadingOperation::Identity.new(input_fields, types)
+ end
+
+ # A field copy (not a pipe copy). Renames fields according to name_map,
+ # appending them to the fields in the assembly in the same order as the
+ # original fields from which they are copied. Throws an exception if
+ # non-existent fields are specified.
+ #
+ # Example:
+ # copy 'field1' => 'fieldA', 'field2' => 'fieldB'
+ #
+ # Produces: ['field1', 'field2', 'fieldA', 'fieldB'], assuming those were
+ # the only input fields.
+ def copy(name_map)
+ original_fields = scope.values_fields.to_a
+ invalid = name_map.keys - original_fields
+ raise "Invalid field names in copy: #{invalid.inspect}" unless invalid.empty?
+
+ # Original fields in name_map in their original order
+ input_fields = original_fields - (original_fields - name_map.keys)
+ into_fields = name_map.values_at(*input_fields)
+
+ each input_fields, :function => Java::CascadingOperation::Identity.new(fields(into_fields)), :output => all_fields
+ end
+
+ # A pipe copy (not a field copy). Can be used within a branch to copy a
+ # pipe.
+ def pass
+ each all_fields, :function => Java::CascadingOperation::Identity.new
+ end
+ end
+end
diff --git a/lib/cascading/mode.rb b/lib/cascading/mode.rb
index 91898e0..dbbe8f7 100644
--- a/lib/cascading/mode.rb
+++ b/lib/cascading/mode.rb
@@ -1,21 +1,25 @@
module Cascading
- # A Cascading::Mode encapsulates the idea of the execution mode for your
- # flows. The default is Hadoop mode, but you can request that your code run
- # in Cascading local mode. If you subsequently use a tap or a scheme that
- # has no local implementation, the mode will be converted back to Hadoop
- # mode.
+ # A Mode encapsulates the idea of the execution mode for your flows. The
+ # default is Hadoop mode, but you can request that your code run in Cascading
+ # local mode. If you subsequently use a tap or a scheme that has no local
+ # implementation, the mode will be converted back to Hadoop mode.
class Mode
attr_reader :local
- # Hadoop mode is the default. You must explicitly request Cascading local
- # mode with values 'local' or :local.
+ # Parses a specification of which mode, Cascading local mode or Hadoop mode,
+ # to execute in. Defaults to Hadoop mode. You may explicitly request
+ # Cascading local mode with values 'local' or :local. If you pass a Mode
+ # object to this method, it will be passed through.
def self.parse(mode)
case mode
+ when Mode then mode
when 'local', :local then Mode.new(true)
else Mode.new(false)
end
end
+ # Constructs a Mode given a flag indicating if it should be Cascading local
+ # mode.
def initialize(local)
@local = local
end
@@ -34,9 +38,9 @@ def source_tap(name, tap)
end
# Builds a c.f.Flow given properties, name, sources, sinks, and pipes from
- # a Cascading::Flow. The current mode is adjusted based on the taps and
- # schemes of the sources and sinks, then the correct taps are selected
- # before building the flow.
+ # a Flow. The current mode is adjusted based on the taps and schemes of
+ # the sources and sinks, then the correct taps are selected before building
+ # the flow.
def connect_flow(properties, name, sources, sinks, pipes)
update_local_mode(sources, sinks)
sources = select_taps(sources)
diff --git a/lib/cascading/operations.rb b/lib/cascading/operations.rb
index c36ee9d..c816b11 100644
--- a/lib/cascading/operations.rb
+++ b/lib/cascading/operations.rb
@@ -1,116 +1,118 @@
module Cascading
- # The Cascading::Operations module is deprecated. The original idea from long
- # ago is that it would be useful to mixin operator wrappers to places other
- # than Cascading::Assembly, but this is not true. Instead, put Eaches in
- # Cascading::Assembly, Everies in Cascading::Aggregations, and any more
- # generally useful utility code directly in the Cascading module
- # (cascading/cascading.rb).
- #
- # Further, the entire *args pattern should be deprecated as it leads to
- # functions that can only be understood by reading their code. Instead,
- # idiomatic Ruby (positional required params and a params hash for optional
- # args) should be used. See Cascading::Assembly#set_value for an example.
module Operations
- def identity
- Java::CascadingOperation::Identity.new
- end
-
- def aggregator_function(args, aggregator_klass)
- options = args.extract_options!
- ignore = options[:ignore]
-
- parameters = [Cascading.fields(args), ignore].compact
- aggregator_klass.new(*parameters)
- end
-
- def first_function(*args)
- aggregator_function(args, Java::CascadingOperationAggregator::First)
- end
-
- def min_function(*args)
- aggregator_function(args, Java::CascadingOperationAggregator::Min)
- end
-
- def max_function(*args)
- aggregator_function(args, Java::CascadingOperationAggregator::Max)
- end
-
- def last_function(*args)
- aggregator_function(args, Java::CascadingOperationAggregator::Last)
- end
-
- def regex_parser(*args)
- options = args.extract_options!
-
- pattern = args[0].to_s
- fields = Cascading.fields(options[:fields])
- groups = options[:groups].to_java(:int) if options[:groups]
- parameters = [fields, pattern, groups].compact
-
- Java::CascadingOperationRegex::RegexParser.new(*parameters)
- end
-
- def regex_splitter(*args)
- options = args.extract_options!
-
- fields = Cascading.fields(args)
- pattern = options[:pattern].to_s
- parameters = [fields, pattern].compact
- Java::CascadingOperationRegex::RegexSplitter.new(*parameters)
- end
-
- def regex_split_generator(*args)
- options = args.extract_options!
-
- fields = Cascading.fields(args)
- pattern = options[:pattern].to_s
- parameters = [fields, pattern].compact
- Java::CascadingOperationRegex::RegexSplitGenerator.new(*parameters)
- end
-
- def regex_generator(*args)
- options = args.extract_options!
-
- fields = Cascading.fields(args)
- pattern = options[:pattern].to_s
- parameters = [fields, pattern].compact
- Java::CascadingOperationRegex::RegexGenerator.new(*parameters)
- end
-
- def expression_function(*args)
- options = args.extract_options!
-
- fields = Cascading.fields(args)
- expression = options[:expression].to_s
- parameters = options[:parameters]
- parameter_names = []
- parameter_types = []
- if parameters.is_a? ::Hash
- parameters.each do |name, type|
- parameter_names << name
- parameter_types << type
+ # Debugs the current assembly at runtime, printing every tuple and fields
+ # every 10 tuples by default.
+ #
+ # The named options are:
+ # [prefix] String to prefix prints with.
+ # [print_fields] Boolean controlling field printing, defaults to false.
+ # [tuple_interval] Integer specifying interval between printed tuples
+ # [fields_interval] Integer specifying interval between printing fields
+ #
+ # Example:
+ # debug :prefix => 'DEBUG', :print_fields => true, :fields_interval => 1000
+ def debug(options = {})
+ input_fields = options[:input] || all_fields
+ prefix = options[:prefix]
+ print_fields = options[:print_fields]
+
+ debug = Java::CascadingOperation::Debug.new(*[prefix, print_fields].compact)
+
+ debug.print_tuple_every = options[:tuple_interval] || 1
+ debug.print_fields_every = options[:fields_interval] || 10
+
+ each(input_fields, :filter => debug)
+ end
+
+ # Inserts new fields into the current assembly. Values may be constants or
+ # expressions (see Cascading::expr). Fields will be inserted in
+ # lexicographic order (not necessarily the order provided).
+ #
+ # Example:
+ # insert 'field1' => 'constant_string', 'field2' => 0, 'field3' => expr('fieldA:long + fieldB:long')
+ def insert(insert_map)
+ insert_map.keys.sort.each do |field_name|
+ value = insert_map[field_name]
+
+ if value.kind_of?(ExprStub)
+ value.validate_scope(scope)
+ names, types = value.names_and_types
+ each(
+ all_fields,
+ :function => Java::CascadingOperationExpression::ExpressionFunction.new(fields(field_name), value.expression, names, types),
+ :output => all_fields
+ )
+ else # value is a constant
+ each(
+ all_fields,
+ :function => Java::CascadingOperation::Insert.new(fields(field_name), to_java_comparable_array([value])),
+ :output => all_fields
+ )
end
- parameter_names = parameter_names.to_java(java.lang.String)
- parameter_types = parameter_types.to_java(java.lang.Class)
-
- arguments = [fields, expression, parameter_names, parameter_types].compact
- elsif !parameters.nil?
- arguments = [fields, expression, parameters.java_class].compact
- else
- arguments = [fields, expression, java.lang.String.java_class].compact
end
-
- Java::CascadingOperationExpression::ExpressionFunction.new(*arguments)
end
- def insert_function(*args)
- options=args.extract_options!
- fields = Cascading.fields(args)
- values = options[:values]
-
- parameters = [fields, to_java_comparable_array(values)].compact
- Java::CascadingOperation::Insert.new(*parameters)
- end
+ # Ungroups, or unpivots, a tuple (see Cascading's {UnGroup}[http://docs.cascading.org/cascading/2.1/javadoc/cascading/operation/function/UnGroup.html]).
+ #
+ # You must provide exactly one of :value_selectors and :num_values.
+ #
+ # The named options are:
+ # [value_selectors] Array of field names to ungroup. Each field will be
+ # ungrouped into an output tuple along with the key fields
+ # in the order provided.
+ # [num_values] Integer specifying the number of fields to ungroup into each
+ # output tuple (excluding the key fields). All input fields
+ # will be ungrouped.
+ #
+ # Example:
+ # ungroup 'key', ['new_key', 'val'], :value_selectors => ['val1', 'val2', 'val3'], :output => ['new_key', 'val']
+ def ungroup(key, into_fields, options = {})
+ input_fields = options[:input] || all_fields
+ output = options[:output] || all_fields
+
+ raise 'You must provide exactly one of :value_selectors or :num_values to ungroup' unless options.has_key?(:value_selectors) ^ options.has_key?(:num_values)
+ value_selectors = options[:value_selectors].map{ |vs| fields(vs) }.to_java(Java::CascadingTuple::Fields) if options.has_key?(:value_selectors)
+ num_values = options[:num_values] if options.has_key?(:num_values)
+
+ parameters = [fields(into_fields), fields(key), value_selectors, num_values].compact
+ each input_fields, :function => Java::CascadingOperationFunction::UnGroup.new(*parameters), :output => output
+ end
+
+ # Inserts one of two values into the dataflow based upon the result of the
+ # supplied filter on the input_fields. This is primarily useful for
+ # creating indicators from filters. keep_value specifies the Java value to
+ # produce when the filter would keep the given input and remove_value
+ # specifies the Java value to produce when the filter would remove the given
+ # input.
+ #
+ # Example:
+ # set_value 'field1', Java::CascadingOperationFilter::FilterNull.new, 1.to_java, 0.to_java, 'is_field1_null'
+ def set_value(input_fields, filter, keep_value, remove_value, into_field, options = {})
+ output = options[:output] || all_fields
+ each input_fields, :function => Java::CascadingOperationFunction::SetValue.new(fields(into_field), filter, keep_value, remove_value), :output => output
+ end
+
+ # Efficient way of inserting a null indicator for any field, even one that
+ # cannot be coerced to a string. This is accomplished using Cascading's
+ # FilterNull and SetValue operators rather than Janino. 1 is produced if
+ # the field is null and 0 otherwise.
+ #
+ # Example:
+ # null_indicator 'field1', 'is_field1_null'
+ def null_indicator(input_field, into_field, options = {})
+ set_value input_field, Java::CascadingOperationFilter::FilterNull.new, 1.to_java, 0.to_java, into_field, :output => options[:output]
+ end
+
+ # Given an input_field and a regex, returns an indicator that is 1 if the string
+ # contains at least 1 match and 0 otherwise.
+ #
+ # Example:
+ # regex_contains 'field1', /\w+\s+\w+/, 'does_field1_contain_pair'
+ def regex_contains(input_field, regex, into_field, options = {})
+ set_value input_field, Java::CascadingOperationRegex::RegexFilter.new(regex.to_s), 1.to_java, 0.to_java, into_field, :output => options[:output]
+ end
+
+ private
def to_java_comparable_array(arr)
(arr.map do |v|
@@ -130,72 +132,5 @@ def coerce_to_java(v)
java.lang.String.new(v.to_s)
end
end
-
- def expression_filter(*args)
- options = args.extract_options!
- expression = (args[0] || options[:expression]).to_s
- parameters = options[:parameters]
- parameter_names = []
- parameter_types = []
- if parameters.is_a? ::Hash
- parameters.each do |name, type|
- parameter_names << name
- parameter_types << type
- end
- parameter_names = parameter_names.to_java(java.lang.String)
- parameter_types = parameter_types.to_java(java.lang.Class)
-
- arguments = [expression, parameter_names, parameter_types].compact
- elsif !parameters.nil?
- arguments = [expression, parameters.java_class].compact
- else
- arguments = [expression, java.lang.String.java_class].compact
- end
-
- Java::CascadingOperationExpression::ExpressionFilter.new(*arguments)
- end
-
- def date_parser(field, format)
- fields = fields(field)
- Java::CascadingOperationText::DateParser.new(fields, format)
- end
-
- def date_formatter(fields, format, timezone=nil)
- fields = fields(fields)
- timezone = Java::JavaUtil::TimeZone.get_time_zone(timezone) if timezone
- arguments = [fields, format, timezone].compact
- Java::CascadingOperationText::DateFormatter.new(*arguments)
- end
-
- def regex_filter(*args)
- options = args.extract_options!
-
- pattern = args[0]
- remove_match = options[:remove_match]
- match_each_element = options[:match_each_element]
- parameters = [pattern.to_s, remove_match, match_each_element].compact
- Java::CascadingOperationRegex::RegexFilter.new(*parameters)
- end
-
- def regex_replace(*args)
- options = args.extract_options!
-
- fields = fields(args[0])
- pattern = args[1]
- replacement = args[2]
- replace_all = options[:replace_all]
-
- parameters = [fields, pattern.to_s, replacement.to_s, replace_all].compact
- Java::CascadingOperationRegex::RegexReplace.new(*parameters)
- end
-
- def field_joiner(*args)
- options = args.extract_options!
- delimiter = options[:delimiter] || ','
- fields = fields(options[:into])
-
- parameters = [fields, delimiter].compact
- Java::CascadingOperationText::FieldJoiner.new(*parameters)
- end
end
end
diff --git a/lib/cascading/regex_operations.rb b/lib/cascading/regex_operations.rb
new file mode 100644
index 0000000..daa02fc
--- /dev/null
+++ b/lib/cascading/regex_operations.rb
@@ -0,0 +1,133 @@
+module Cascading
+ # Module of pipe assemblies that wrap operations defined in the Cascading
+ # cascading.operation.regex package. These are split out only to group
+ # similar functionality.
+ #
+ # All DSL regex pipes require an input_field, a regex, and either a single
+ # into_field or one or more into_fields. Requiring a single input field
+ # allows us to raise an exception early if the wrong input is specified and
+ # avoids the non-intuitive situation where the first of many fields is
+ # silently taken as in Cascading. Requiring a regex means you don't have to
+ # go looking for defaults in code. And into_field(s) means we can propagate
+ # field names through the dataflow.
+ #
+ # Mapping of DSL pipes into Cascading regex operations:
+ # parse:: {RegexParser}[http://docs.cascading.org/cascading/2.1/javadoc/cascading/operation/regex/RegexParser.html]
+ # split:: {RegexSplitter}[http://docs.cascading.org/cascading/2.1/javadoc/cascading/operation/regex/RegexSplitter.html]
+ # split\_rows:: {RegexSplitGenerator}[http://docs.cascading.org/cascading/2.1/javadoc/cascading/operation/regex/RegexSplitGenerator.html]
+ # match\_rows:: {RegexGenerator}[http://docs.cascading.org/cascading/2.1/javadoc/cascading/operation/regex/RegexGenerator.html]
+ # replace:: {RegexReplace}[http://docs.cascading.org/cascading/2.1/javadoc/cascading/operation/regex/RegexReplace.html]
+ module RegexOperations
+ # Parses the given input_field using the specified regular expression to
+ # produce one output per group in that expression.
+ #
+ # The named options are:
+ # [groups] Array of integers specifying which groups to capture if you want
+ # a subset of groups.
+ #
+ # Example:
+ # parse 'field1', /(\w+)\s+(\w+)/, ['out1', 'out2'], :groups => [1, 2]
+ def parse(input_field, regex, into_fields, options = {})
+ groups = options[:groups].to_java(:int) if options[:groups]
+ output = options[:output] || all_fields # Overrides Cascading default
+
+ input_field = fields(input_field)
+ raise "input_field must declare exactly one field, was '#{input_field}'" unless input_field.size == 1
+
+ parameters = [fields(into_fields), regex.to_s, groups].compact
+ each(
+ input_field,
+ :function => Java::CascadingOperationRegex::RegexParser.new(*parameters),
+ :output => output
+ )
+ end
+ alias regex_parser parse
+
+ # Splits the given input_field into multiple fields using the specified
+ # regular expression.
+ #
+ # Example:
+ # split 'line', /\s+/, ['out1', 'out2']
+ def split(input_field, regex, into_fields, options = {})
+ output = options[:output] || all_fields # Overrides Cascading default
+
+ input_field = fields(input_field)
+ raise "input_field must declare exactly one field, was '#{input_field}'" unless input_field.size == 1
+
+ each(
+ input_field,
+ :function => Java::CascadingOperationRegex::RegexSplitter.new(fields(into_fields), regex.to_s),
+ :output => output
+ )
+ end
+ alias regex_splitter split
+
+ # Splits the given input_field into new rows using the specified regular
+ # expression.
+ #
+ # Example:
+ # split_rows 'line', /\s+/, 'word'
+ def split_rows(input_field, regex, into_field, options = {})
+ output = options[:output] || all_fields # Overrides Cascading default
+
+ input_field = fields(input_field)
+ raise "input_field must declare exactly one field, was '#{input_field}'" unless input_field.size == 1
+ into_field = fields(into_field)
+ raise "into_field must declare exactly one field, was '#{into_field}'" unless into_field.size == 1
+
+ each(
+ input_field,
+ :function => Java::CascadingOperationRegex::RegexSplitGenerator.new(into_field, regex.to_s),
+ :output => output
+ )
+ end
+ alias regex_split_generator split_rows
+
+ # Emits a new row for each regex group matched in input_field using the
+ # specified regular expression.
+ #
+ # Example:
+ # match_rows 'line', /(\w+)\s+(\w+)/, 'word'
+ def match_rows(input_field, regex, into_field, options = {})
+ output = options[:output] || all_fields # Overrides Cascading default
+
+ input_field = fields(input_field)
+ raise "input_field must declare exactly one field, was '#{input_field}'" unless input_field.size == 1
+ into_field = fields(into_field)
+ raise "into_field must declare exactly one field, was '#{into_field}'" unless into_field.size == 1
+
+ each(
+ input_field,
+ :function => Java::CascadingOperationRegex::RegexGenerator.new(into_field, regex.to_s),
+ :output => output
+ )
+ end
+ alias regex_generator match_rows
+
+ # Performs a query/replace on the given input_field using the specified
+ # regular expression and replacement.
+ #
+ # The named options are:
+ # [replace_all] Boolean indicating if all matches should be replaced;
+ # defaults to true (the Cascading default).
+ #
+ # Example:
+ # replace 'line', /[.,]*\s+/, 'tab_separated_line', "\t"
+ def replace(input_field, regex, into_field, replacement, options = {})
+ output = options[:output] || all_fields # Overrides Cascading default
+
+ input_field = fields(input_field)
+ raise "input_field must declare exactly one field, was '#{input_field}'" unless input_field.size == 1
+ into_field = fields(into_field)
+ raise "into_field must declare exactly one field, was '#{into_field}'" unless into_field.size == 1
+
+ parameters = [into_field, regex.to_s, replacement.to_s, options[:replace_all]].compact
+ each(
+ input_field,
+ :function => Java::CascadingOperationRegex::RegexReplace.new(*parameters),
+ :output => output
+ )
+ end
+ alias regex_replace replace
+ end
+end
diff --git a/lib/cascading/scope.rb b/lib/cascading/scope.rb
index e7e619a..0062449 100644
--- a/lib/cascading/scope.rb
+++ b/lib/cascading/scope.rb
@@ -1,23 +1,35 @@
module Cascading
+ # Scope is a wrapper for the private Cascading c.f.p.Scope object used to
+ # connect the dataflow graph by resolving fields. cascading.jruby wraps this
+ # facility so that it may be used to propagate field names at composition
+ # time (not Cascading plan time) in the same way they will later be
+ # propagated by the planner.
class Scope
attr_accessor :scope
+ # Construct a Scope given the Cascading c.f.p.Scope to wrap.
def initialize(scope)
@scope = scope
end
+ # Copy one Scope into another; relies upon the copy constructor of
+ # c.f.p.Scope.
def copy
Scope.new(Java::CascadingFlowPlanner::Scope.new(@scope))
end
+ # Build a c.f.p.Scope for a Flow, which is empty except for its name.
def self.flow_scope(name)
Java::CascadingFlowPlanner::Scope.new(name)
end
+ # Build an empty Scope, wrapping an empty c.f.p.Scope.
def self.empty_scope(name)
Scope.new(Java::CascadingFlowPlanner::Scope.new(name))
end
+ # Build a Scope for a single source Tap. The flow_scope is propagated
+ # through this call into a new Scope.
def self.source_scope(name, tap, flow_scope)
incoming_scopes = java.util.HashSet.new
incoming_scopes.add(flow_scope)
@@ -27,28 +39,30 @@ def self.source_scope(name, tap, flow_scope)
Scope.new(java_scope)
end
+ # Build a Scope for an arbitrary flow element. This is used to update the
+ # Scope at each stage in a pipe Assembly.
def self.outgoing_scope(flow_element, incoming_scopes)
java_scopes = incoming_scopes.compact.map{ |s| s.scope }
Scope.new(outgoing_scope_for(flow_element, java.util.HashSet.new(java_scopes)))
end
+ # The values fields of the Scope, which indicate the fields in the current
+ # dataflow tuple.
def values_fields
@scope.out_values_fields
end
+ # The grouping fields of the Scope, which indicate the keys of a
+ # group/cogroup.
def grouping_fields
@scope.out_grouping_fields
end
- def scope_fields_to_s(accessor)
- begin
- fields = @scope.send(accessor)
- fields.nil? ? 'null' : fields.to_s
- rescue Exception => e
- 'ERROR'
- end
- end
-
+ # Prints a detailed description of this Scope, including its type and
+ # various selectors, fields, and key fields. Data is bubbled up directly
+ # from the Cascading c.f.p.Scope. This output can be useful for debugging
+ # the propagation of fields through your job (see Flow#debug_scope and
+ # Assembly#debug_scope, which both rely upon this method).
def to_s
kind = 'Unknown'
kind = 'Tap' if @scope.tap?
@@ -77,6 +91,15 @@ def to_s
private
+ def scope_fields_to_s(accessor)
+ begin
+ fields = @scope.send(accessor)
+ fields.nil? ? 'null' : fields.to_s
+ rescue Exception => e
+ 'ERROR'
+ end
+ end
+
def self.outgoing_scope_for(flow_element, incoming_scopes)
begin
flow_element.outgoing_scope_for(incoming_scopes)
diff --git a/lib/cascading/sub_assembly.rb b/lib/cascading/sub_assembly.rb
index 022c120..61d11e3 100644
--- a/lib/cascading/sub_assembly.rb
+++ b/lib/cascading/sub_assembly.rb
@@ -4,17 +4,15 @@ module Cascading
# Allows you to plugin c.p.SubAssemblies to a cascading.jruby Assembly.
#
# Assumptions:
- # * You will either use the tail_pipe of the calling Assembly, or overwrite
- # its incoming_scopes (as do join and union)
- # * Your subassembly will have only 1 tail pipe; branching is not
- # supported. This allows you to continue operating upon the tail of the
- # SubAssembly within the calling Assembly
+ # * You will either use the tail_pipe of the calling Assembly, or overwrite its incoming_scopes (as do join and union)
+ # * Your subassembly will have only 1 tail pipe; branching is not supported. This allows you to continue operating upon the tail of the SubAssembly within the calling Assembly
# * You will not use nested c.p.SubAssemblies
#
# This is a low-level tool, so be careful.
class SubAssembly
attr_reader :assembly, :sub_assembly, :tail_pipe, :scope
+ # Construct a SubAssembly within the given Assembly
def initialize(assembly, sub_assembly)
@assembly = assembly
@sub_assembly = sub_assembly
@@ -26,6 +24,11 @@ def initialize(assembly, sub_assembly)
raise 'SubAssembly must set exactly 1 tail in constructor' unless sub_assembly.tails.size == 1
end
+ # Complete the addition of the SubAssembly to the Assembly. Propagates
+ # Scope through the SubAssembly and updates the tail_pipe of the
+ # SubAssembly for passing back to the enclosing Assembly. May accept many
+ # incoming pipes, but typically only receives the tail_pipe of the
+ # enclosing Assembly.
def finalize(pipes, incoming_scopes)
# Build adjacency list for sub_assembly
graph = {}
diff --git a/lib/cascading/tap.rb b/lib/cascading/tap.rb
index c1fb254..3583a5f 100644
--- a/lib/cascading/tap.rb
+++ b/lib/cascading/tap.rb
@@ -1,48 +1,63 @@
module Cascading
- # A Cascading::BaseTap wraps up a pair of Cascading taps, one for Cascading
- # local mode and the other for Hadoop mode.
+ # A BaseTap wraps up a pair of Cascading taps, one for Cascading local mode
+ # and the other for Hadoop mode. Note that these are optional, but at least
+ # one must be provided for most taps. A SequenceFile is a notable example of
+ # a Scheme for which there is no Cascading local mode version, so a Tap you
+ # build with it will have no local_tap.
class BaseTap
attr_reader :local_tap, :hadoop_tap
+ # Constructor that accepts the local_tap and hadoop_tap, which may be nil
def initialize(local_tap, hadoop_tap)
@local_tap = local_tap
@hadoop_tap = hadoop_tap
end
+ # Passes through printing the local_tap and hadoop_tap
def to_s
"Local: #{local_tap}, Hadoop: #{hadoop_tap}"
end
+ # Returns false if the local_tap is nil, true otherwise
def local?
!local_tap.nil?
end
+ # Returns false if the hadoop_tap is nil, true otherwise
def hadoop?
!hadoop_tap.nil?
end
end
- # A Cascading::Tap represents a non-aggregate tap with a scheme, path, and
- # optional sink_mode. c.t.l.FileTap is used in Cascading local mode and
- # c.t.h.Hfs is used in Hadoop mode. Whether or not these can be created is
- # governed by the :scheme parameter, which must contain at least one of
- # :local_scheme or :hadoop_scheme. Schemes like TextLine are supported in
- # both modes (by Cascading), but SequenceFile is only supported in Hadoop
- # mode.
+ # A Tap represents a non-aggregate tap with a scheme, path, and optional
+ # sink_mode. c.t.l.FileTap is used in Cascading local mode and c.t.h.Hfs is
+ # used in Hadoop mode. Whether or not these can be created is governed by the
+ # :scheme parameter, which must contain at least one of :local_scheme or
+ # :hadoop_scheme. Schemes like TextLine are supported in both modes (by
+ # Cascading), but SequenceFile is only supported in Hadoop mode.
class Tap < BaseTap
attr_reader :scheme, :path, :sink_mode
- def initialize(path, params = {})
+ # Builds a Tap given a required path
+ #
+ # The named options are:
+ # [scheme] A Hash which must contain at least one of :local_scheme or
+ # :hadoop_scheme but may contain both. Default is
+ # text_line_scheme, which works in both modes.
+ # [sink_mode] A symbol or string that may be :keep, :replace, or :append,
+ # and corresponds to the c.t.SinkMode enumeration. The default
+ # is :keep, which matches Cascading's default.
+ def initialize(path, options = {})
@path = path
- @scheme = params[:scheme] || text_line_scheme
+ @scheme = options[:scheme] || text_line_scheme
raise "Scheme must provide one of :local_scheme or :hadoop_scheme; received: '#{scheme.inspect}'" unless scheme[:local_scheme] || scheme[:hadoop_scheme]
- @sink_mode = case params[:sink_mode] || :keep
+ @sink_mode = case options[:sink_mode] || :keep
when :keep, 'keep' then Java::CascadingTap::SinkMode::KEEP
when :replace, 'replace' then Java::CascadingTap::SinkMode::REPLACE
when :append, 'append' then Java::CascadingTap::SinkMode::APPEND
- else raise "Unrecognized sink mode '#{params[:sink_mode]}'"
+ else raise "Unrecognized sink mode '#{options[:sink_mode]}'"
end
local_scheme = scheme[:local_scheme]
@@ -53,19 +68,28 @@ def initialize(path, params = {})
end
end
- # A Cascading::MultiTap represents one of Cascading's aggregate taps and is
- # built via static constructors that accept an array of Cascading::Taps. In
- # order for a mode (Cascading local or Hadoop) to be supported, all provided
- # taps must support it.
+ # A MultiTap represents one of Cascading's aggregate taps and is built via
+ # static constructors that accept an array of Taps. In order for a mode
+ # (Cascading local or Hadoop) to be supported, all provided taps must support
+ # it.
class MultiTap < BaseTap
+ # Do not call this constructor directly; instead, use one of
+ # MultiTap.multi_source_tap or MultiTap.multi_sink_tap.
def initialize(local_tap, hadoop_tap)
super(local_tap, hadoop_tap)
end
+ # Static constructor that builds a MultiTap wrapping a c.t.MultiSourceTap
+ # from the given array of Taps. The resulting MultiTap will only be
+ # available in Cascading local mode or Hadoop mode if all input taps support
+ # them.
def self.multi_source_tap(taps)
multi_tap(taps, Java::CascadingTap::MultiSourceTap)
end
+ # Static constructor that builds a MultiTap wrapping a c.t.MultiSinkTap from
+ # the given array of Taps. The resulting MultiTap will only be available in
+ # Cascading local mode or Hadoop mode if all input taps support them.
def self.multi_sink_tap(taps)
multi_tap(taps, Java::CascadingTap::MultiSinkTap)
end
diff --git a/lib/cascading/text_operations.rb b/lib/cascading/text_operations.rb
new file mode 100644
index 0000000..39e8339
--- /dev/null
+++ b/lib/cascading/text_operations.rb
@@ -0,0 +1,67 @@
+module Cascading
+ # Module of pipe assemblies that wrap operations defined in the Cascading
+ # cascading.operation.text package. These are split out only to group
+ # similar functionality.
+ #
+ # Mapping of DSL pipes into Cascading text operations:
+ # parse\_date:: {DateParser}[http://docs.cascading.org/cascading/2.1/javadoc/cascading/operation/text/DateParser.html]
+ # format\_date:: {DateFormatter}[http://docs.cascading.org/cascading/2.1/javadoc/cascading/operation/text/DateFormatter.html]
+ # join\_fields:: {FieldJoiner}[http://docs.cascading.org/cascading/2.1/javadoc/cascading/operation/text/FieldJoiner.html]
+ module TextOperations
+ # Parses the given input_field as a date using the provided format string.
+ #
+ # Example:
+ # parse_date 'text_date', 'yyyy/MM/dd', 'timestamp'
+ def parse_date(input_field, date_format, into_field, options = {})
+ output = options[:output] || all_fields # Overrides Cascading default
+
+ input_field = fields(input_field)
+ raise "input_field must declare exactly one field, was '#{input_field}'" unless input_field.size == 1
+ into_field = fields(into_field)
+ raise "into_field must declare exactly one field, was '#{into_field}'" unless into_field.size == 1
+
+ each(
+ input_field,
+ :function => Java::CascadingOperationText::DateParser.new(into_field, date_format),
+ :output => output
+ )
+ end
+
+ # Converts a timestamp into a formatted date string using the specified
+ # date_format.
+ #
+ # Example:
+ # format_date 'timestamp', 'yyyy/MM/dd', 'text_date'
+ def format_date(input_field, date_format, into_field, options = {})
+ output = options[:output] || all_fields # Overrides Cascading default
+
+ input_field = fields(input_field)
+ raise "input_field must declare exactly one field, was '#{input_field}'" unless input_field.size == 1
+ into_field = fields(into_field)
+ raise "into_field must declare exactly one field, was '#{into_field}'" unless into_field.size == 1
+
+ each(
+ input_field,
+ :function => Java::CascadingOperationText::DateFormatter.new(into_field, date_format),
+ :output => output
+ )
+ end
+
+ # Joins multiple fields into a single field given a delimiter.
+ #
+ # Example:
+ # join_fields ['field1', 'field2'], ',', 'comma_separated'
+ def join_fields(input_fields, delimiter, into_field, options = {})
+ output = options[:output] || all_fields # Overrides Cascading default
+
+ into_field = fields(into_field)
+ raise "into_field must declare exactly one field, was '#{into_field}'" unless into_field.size == 1
+
+ each(
+ input_fields,
+ :function => Java::CascadingOperationText::FieldJoiner.new(into_field, delimiter.to_s),
+ :output => output
+ )
+ end
+ end
+end
diff --git a/samples/branch.rb b/samples/branch.rb
index 64a6b74..3d98d4b 100755
--- a/samples/branch.rb
+++ b/samples/branch.rb
@@ -9,8 +9,7 @@
source 'input', tap('samples/data/data2.txt')
assembly 'input' do
- split 'line', ['name', 'score1', 'score2', 'id'], :pattern => /[.,]*\s+/
-
+ split 'line', /[.,]*\s+/, ['name', 'score1', 'score2', 'id']
branch 'branch1' do
group_by 'score1' do
count
diff --git a/samples/group_by.rb b/samples/group_by.rb
index ce44436..3083ebc 100755
--- a/samples/group_by.rb
+++ b/samples/group_by.rb
@@ -8,7 +8,7 @@
source 'input', tap('samples/data/data_group_by.tsv')
assembly 'input' do
- split 'line', ['id', 'city'], :output => ['id', 'city']
+ split 'line', /\t/, ['id', 'city'], :output => ['id', 'city']
branch 'group_by' do
group_by 'city', :sort_by => 'city' do
diff --git a/samples/join.rb b/samples/join.rb
index 6313ab9..7d50c62 100755
--- a/samples/join.rb
+++ b/samples/join.rb
@@ -10,15 +10,15 @@
source 'input3', tap('samples/data/data_join3.txt')
assembly 'input1' do
- split 'line', ['id', 'name']
+ split 'line', /\t/, ['id', 'name']
end
assembly 'input2' do
- split 'line', ['id', 'age']
+ split 'line', /\t/, ['id', 'age']
end
assembly 'input3' do
- split 'line', ['id', 'city']
+ split 'line', /\t/, ['id', 'city']
end
assembly 'join' do
diff --git a/samples/logwordcount.rb b/samples/logwordcount.rb
index 9e93bc8..b037bf5 100755
--- a/samples/logwordcount.rb
+++ b/samples/logwordcount.rb
@@ -10,7 +10,7 @@
source 'input', tap('samples/data/gutenberg/the_outline_of_science_vol_1')
assembly 'input' do
- split_rows 'line', 'word', :pattern => /[.,]*\s+/, :output => 'word'
+ split_rows 'line', /[.,]*\s+/, 'word', :output => 'word'
group_by 'word' do
count
end
diff --git a/samples/project.rb b/samples/project.rb
index 908f5e1..50bce24 100755
--- a/samples/project.rb
+++ b/samples/project.rb
@@ -10,7 +10,7 @@
source 'input', tap('samples/data/data2.txt')
assembly 'input' do
- split 'line', ['name', 'score1', 'score2', 'id'], :output => ['name', 'score1', 'score2', 'id']
+ split 'line', /[.,]*\s+/, ['name', 'score1', 'score2', 'id'], :output => ['name', 'score1', 'score2', 'id']
assert Java::CascadingOperationAssertion::AssertSizeEquals.new(4)
project 'name', 'score1', 'score2'
assert Java::CascadingOperationAssertion::AssertSizeEquals.new(3)
diff --git a/samples/rename.rb b/samples/rename.rb
index bbd47e1..bee222d 100755
--- a/samples/rename.rb
+++ b/samples/rename.rb
@@ -8,7 +8,7 @@
source 'input', tap('samples/data/data2.txt')
assembly 'input' do
- split 'line', ['name', 'score1', 'score2', 'id'], :output => ['name', 'score1', 'score2', 'id']
+ split 'line', /[.,]*\s+/, ['name', 'score1', 'score2', 'id'], :output => ['name', 'score1', 'score2', 'id']
assert Java::CascadingOperationAssertion::AssertSizeEquals.new(4)
rename 'name' => 'new_name', 'score1' => 'new_score1', 'score2' => 'new_score2'
assert Java::CascadingOperationAssertion::AssertSizeEquals.new(4)
diff --git a/samples/replace.rb b/samples/replace.rb
new file mode 100755
index 0000000..40e600e
--- /dev/null
+++ b/samples/replace.rb
@@ -0,0 +1,16 @@
+#! /usr/bin/env jruby
+$: << File.join(File.dirname(__FILE__), '..', 'lib')
+
+require 'cascading'
+
+cascade 'replace', :mode => :local do
+ flow 'replace' do
+ source 'input', tap('samples/data/data2.txt')
+
+ assembly 'input' do
+ replace 'line', /[.,]*\s+/, 'tab_separated_line', "\t", :output => 'tab_separated_line'
+ end
+
+ sink 'input', tap('output/replace', :sink_mode => :replace)
+ end
+end.complete
diff --git a/samples/scorenames.rb b/samples/scorenames.rb
index aadd23e..3cd3e51 100755
--- a/samples/scorenames.rb
+++ b/samples/scorenames.rb
@@ -10,7 +10,7 @@
source 'input', tap('samples/data/genealogy/names/dist.all.last')
assembly 'input' do
- split 'line', ['name', 'val1', 'val2', 'id']
+ split 'line', /[.,]*\s+/, ['name', 'val1', 'val2', 'id']
insert 'val3' => expr('val2:double < 40.0 ? val1:double : val2:double')
project 'name', 'val3', 'id'
end
diff --git a/samples/splitter.rb b/samples/splitter.rb
index 81042c9..021feaf 100755
--- a/samples/splitter.rb
+++ b/samples/splitter.rb
@@ -8,7 +8,7 @@
source 'input', tap('samples/data/data2.txt')
assembly 'input' do
- split 'line', ['name', 'score1', 'score2', 'id'], :output => ['name', 'score1', 'score2', 'id']
+ split 'line', /[.,]*\s+/, ['name', 'score1', 'score2', 'id'], :output => ['name', 'score1', 'score2', 'id']
group_by 'score1' do
count
end
diff --git a/samples/sub_assembly.rb b/samples/sub_assembly.rb
index a6101ec..a089c56 100755
--- a/samples/sub_assembly.rb
+++ b/samples/sub_assembly.rb
@@ -8,7 +8,7 @@
source 'input', tap('samples/data/data2.txt')
assembly 'input' do
- split 'line', ['name', 'score1', 'score2', 'id'], :output => ['name', 'score1', 'score2', 'id']
+ split 'line', /[.,]*\s+/, ['name', 'score1', 'score2', 'id'], :output => ['name', 'score1', 'score2', 'id']
assert Java::CascadingOperationAssertion::AssertSizeEquals.new(4)
sub_assembly Java::CascadingPipeAssembly::Discard.new(tail_pipe, fields('id'))
assert Java::CascadingOperationAssertion::AssertSizeEquals.new(3)
diff --git a/samples/ungroup.rb b/samples/ungroup.rb
index 0a70545..e8e94ab 100755
--- a/samples/ungroup.rb
+++ b/samples/ungroup.rb
@@ -11,22 +11,19 @@
source 'input', tap('samples/data/ungroup.tsv')
a = assembly 'input' do
- split 'line', ['key', 'val1', 'val2', 'val3'], :output => ['key', 'val1', 'val2', 'val3']
+ split 'line', /\t/, ['key', 'val1', 'val2', 'val3'], :output => ['key', 'val1', 'val2', 'val3']
branch 'ungroup_using_value_selectors' do
- #each all_fields, :function => Java::CascadingOperationFunction::UnGroup.new(fields(['new_key', 'val']), fields('key'), [fields('val1'), fields('val2'), fields('val3')].to_java(Java::CascadingTuple::Fields)), :output => ['new_key', 'val']
- ungroup :key => 'key', :value_selectors => ['val1', 'val2', 'val3'], :into => ['new_key', 'val'], :output => ['new_key', 'val']
+ ungroup 'key', ['new_key', 'val'], :value_selectors => ['val1', 'val2', 'val3'], :output => ['new_key', 'val']
end
branch 'ungroup_using_num_values' do
- #each all_fields, :function => Java::CascadingOperationFunction::UnGroup.new(fields(['new_key', 'val']), fields('key'), 1), :output => ['new_key', 'val']
- ungroup :key => 'key', :num_values => 1, :into => ['new_key', 'val'], :output => ['new_key', 'val']
+ ungroup 'key', ['new_key', 'val'], :num_values => 1, :output => ['new_key', 'val']
end
# This pairs up the first and last two fields with no "key"
branch 'ungroup_no_key' do
- #each all_fields, :function => Java::CascadingOperationFunction::UnGroup.new(fields(['left', 'right']), fields([]), 2), :output => ['left', 'right']
- ungroup :key => [], :num_values => 2, :into => ['left', 'right'], :output => ['left', 'right']
+ ungroup [], ['left', 'right'], :num_values => 2, :output => ['left', 'right']
end
end
diff --git a/samples/union.rb b/samples/union.rb
index 1e402ef..1e16772 100755
--- a/samples/union.rb
+++ b/samples/union.rb
@@ -10,7 +10,7 @@
source 'input', tap('samples/data/genealogy/names/dist.all.last')
assembly 'input' do
- split 'line', ['name', 'score1', 'score2', 'id']
+ split 'line', /[.,]*\s+/, ['name', 'score1', 'score2', 'id']
branch 'branch1' do
group_by 'score1', 'name' do
diff --git a/samples/unique.rb b/samples/unique.rb
index 3c75fa9..c2cc0c4 100755
--- a/samples/unique.rb
+++ b/samples/unique.rb
@@ -10,7 +10,7 @@
source 'input', tap('samples/data/data_group_by.tsv')
assembly 'input' do
- split 'line', ['id', 'city'], :output => ['id', 'city']
+ split 'line', /\t/, ['id', 'city'], :output => ['id', 'city']
branch 'unique' do
sub_assembly Java::CascadingPipeAssembly::Unique.new(tail_pipe, fields('city'))
diff --git a/spec/cascading_spec.rb b/spec/cascading_spec.rb
index a075661..faec9e1 100644
--- a/spec/cascading_spec.rb
+++ b/spec/cascading_spec.rb
@@ -85,12 +85,12 @@
source 'right', tap('spec/resource/join_input.txt', :scheme => text_line_scheme)
assembly 'left' do
- split 'line', ['x', 'y', 'z'], :pattern => /,/
+ split 'line', /,/, ['x', 'y', 'z']
project 'x', 'y', 'z'
end
assembly 'right' do
- split 'line', ['x', 'y', 'z'], :pattern => /,/
+ split 'line', /,/, ['x', 'y', 'z']
project 'x', 'y', 'z'
branch 'branch_join' do
diff --git a/spec/jruby_version_spec.rb b/spec/jruby_version_spec.rb
index f6bf3a4..e78f9fa 100644
--- a/spec/jruby_version_spec.rb
+++ b/spec/jruby_version_spec.rb
@@ -19,7 +19,7 @@
end
thrown.should == 'InvocationTargetException'
- if JRUBY_VERSION == '1.7.0'
+ if JRUBY_VERSION == '1.7.0' || JRUBY_VERSION == '1.7.3'
exception.java_class.should be Java::JavaLangReflect::InvocationTargetException.java_class
else
# How can this be? A nil exception?
@@ -58,7 +58,7 @@
result = e.validate
result.should == 0
end
- when '1.5.3', '1.6.5', '1.6.7.2', '1.7.0'
+ when '1.5.3', '1.6.5', '1.6.7.2', '1.7.0', '1.7.3'
it 'should handle Fixnum -> Integer for ExprStub#eval' do
e = ExprStub.new('x:int + y:int')
result = e.eval(:x => 2, :y => 3)
diff --git a/spec/scope_spec.rb b/spec/scope_spec.rb
index 3bacd26..c32b277 100644
--- a/spec/scope_spec.rb
+++ b/spec/scope_spec.rb
@@ -22,7 +22,7 @@
check_scope :values_fields => ['offset', 'line']
assert_size_equals 2
- split 'line', ['x', 'y'], :pattern => /,/
+ split 'line', /,/, ['x', 'y']
check_scope :values_fields => ['offset', 'line', 'x', 'y']
assert_size_equals 4
end
@@ -33,7 +33,7 @@
check_scope :values_fields => ['offset', 'line']
assert_size_equals 2
- split 'line', ['x', 'y'], :pattern => /,/, :output => ['x', 'y']
+ split 'line', /,/, ['x', 'y'], :output => ['x', 'y']
check_scope :values_fields => ['x', 'y']
assert_size_equals 2
end
diff --git a/spec/spec_util.rb b/spec/spec_util.rb
index 890b554..127a994 100644
--- a/spec/spec_util.rb
+++ b/spec/spec_util.rb
@@ -2,14 +2,14 @@
BUILD_DIR = 'build/spec'
module ScopeTests
- def check_scope(params = {})
- name_params = [params[:source]].compact
- scope = scope(*name_params)
- values_fields = params[:values_fields]
- grouping_fields = params[:grouping_fields] || values_fields
+ def check_scope(options = {})
+ name_options = [options[:source]].compact
+ scope = scope(*name_options)
+ values_fields = options[:values_fields]
+ grouping_fields = options[:grouping_fields] || values_fields
- debug = params[:debug]
- debug_scope(*name_params) if debug
+ debug = options[:debug]
+ debug_scope(*name_options) if debug
scope.values_fields.to_a.should == values_fields
scope.grouping_fields.to_a.should == grouping_fields
@@ -29,8 +29,8 @@ def test_flow(&block)
cascade.complete
end
-def test_assembly(params = {}, &block)
- branches = params[:branches] || []
+def test_assembly(options = {}, &block)
+ branches = options[:branches] || []
test_flow do
source 'input', tap('spec/resource/test_input.txt', :scheme => text_line_scheme)
@@ -49,9 +49,9 @@ def test_assembly(params = {}, &block)
end
end
-def test_join_assembly(params = {}, &block)
- branches = params[:branches] || []
- post_join_block = params[:post_join_block]
+def test_join_assembly(options = {}, &block)
+ branches = options[:branches] || []
+ post_join_block = options[:post_join_block]
test_flow do
source 'left', tap('spec/resource/join_input.txt', :scheme => text_line_scheme)
@@ -63,13 +63,13 @@ def test_join_assembly(params = {}, &block)
assembly 'left' do
check_scope :values_fields => ['offset', 'line']
- split 'line', ['x', 'y', 'z'], :pattern => /,/
+ split 'line', /,/, ['x', 'y', 'z']
check_scope :values_fields => ['offset', 'line', 'x', 'y', 'z']
end
assembly 'right' do
check_scope :values_fields => ['offset', 'line']
- split 'line', ['x', 'y', 'z'], :pattern => /,/
+ split 'line', /,/, ['x', 'y', 'z']
check_scope :values_fields => ['offset', 'line', 'x', 'y', 'z']
end
diff --git a/test/mock_assemblies.rb b/test/mock_assemblies.rb
index cfa8284..c926b9f 100644
--- a/test/mock_assemblies.rb
+++ b/test/mock_assemblies.rb
@@ -39,11 +39,11 @@ def mock_two_input_assembly(&block)
source 'test2', tap('test/data/data2.txt')
assembly 'test1' do
- split 'line', :pattern => /[.,]*\s+/, :into => ['name', 'score1', 'score2', 'id'], :output => ['name', 'score1', 'score2', 'id']
+ split 'line', /[.,]*\s+/, ['name', 'score1', 'score2', 'id'], :output => ['name', 'score1', 'score2', 'id']
end
assembly 'test2' do
- split 'line', :pattern => /[.,]*\s+/, :into => ['name', 'id', 'town'], :output => ['name', 'id', 'town']
+ split 'line', /[.,]*\s+/, ['name', 'id', 'town'], :output => ['name', 'id', 'town']
end
assembly = assembly 'test', &block
diff --git a/test/test_assembly.rb b/test/test_assembly.rb
index bda8e28..1b03ffd 100644
--- a/test/test_assembly.rb
+++ b/test/test_assembly.rb
@@ -23,7 +23,7 @@ def test_create_assembly_simple
def test_each_identity
assembly = mock_assembly do
- each 'offset', :function => identity
+ each 'offset', :function => Java::CascadingOperation::Identity.new
end
flow = assembly.parent
@@ -35,7 +35,7 @@ def test_each_identity
def test_create_each
# You can apply an Each to 0 fields
assembly = mock_assembly do
- each(:function => identity)
+ each(:function => Java::CascadingOperation::Identity.new)
end
assert_equal Java::CascadingPipe::Each, assembly.tail_pipe.class
@@ -547,7 +547,7 @@ def test_hash_join_with_block
end
end
end
- assert_equal "hash joins don't support aggregations", ex.message
+ assert_equal "HashJoin doesn't support aggregations so the block provided to hash_join will be ignored", ex.message
end
def test_branch_unique
@@ -579,7 +579,7 @@ def test_branch_single
assembly = mock_assembly do
branch 'branch1' do
branch 'branch2' do
- each 'line', :function => identity
+ each 'line', :function => Java::CascadingOperation::Identity.new
end
end
end
@@ -659,43 +659,41 @@ def test_sum_by_sub_assembly
assert_equal ['line', 'sum'], assembly.scope.grouping_fields.to_a
end
- def test_empty_where
+ def test_where
assembly = mock_assembly do
- split 'line', ['name', 'score1', 'score2', 'id'], :pattern => /[.,]*\s+/, :output => ['name', 'score1', 'score2', 'id']
- where
+ split 'line', /[.,]*\s+/, ['name', 'score1', 'score2', 'id'], :output => ['name', 'score1', 'score2', 'id']
+ where 'score1:double < score2:double'
end
assert_equal Java::CascadingPipe::Each, assembly.tail_pipe.class
-
- # Empty where compiles away
- assert_equal Java::CascadingOperationRegex::RegexSplitter, assembly.tail_pipe.operation.class
+ assert_equal Java::CascadingOperationExpression::ExpressionFilter, assembly.tail_pipe.operation.class
end
- def test_where
+ def test_where_with_import
assembly = mock_assembly do
- split 'line', ['name', 'score1', 'score2', 'id'], :pattern => /[.,]*\s+/, :output => ['name', 'score1', 'score2', 'id']
- where 'score1:double < score2:double'
+ split 'line', /[.,]*\s+/, ['name', 'score1', 'score2', 'id'], :output => ['name', 'score1', 'score2', 'id']
+ names = ['SMITH', 'JONES', 'BROWN']
+ where "import java.util.Arrays;\nArrays.asList(new String[] { \"#{names.join('", "')}\" }).contains(name:string)"
end
assert_equal Java::CascadingPipe::Each, assembly.tail_pipe.class
assert_equal Java::CascadingOperationExpression::ExpressionFilter, assembly.tail_pipe.operation.class
end
- def test_where_with_expression
+ def test_rename
assembly = mock_assembly do
- split 'line', ['name', 'score1', 'score2', 'id'], :pattern => /[.,]*\s+/, :output => ['name', 'score1', 'score2', 'id']
- where :expression => 'score1:double < score2:double'
+ split 'line', /[.,]*\s+/, ['name', 'score1', 'score2', 'id'], :output => ['name', 'score1', 'score2', 'id']
+ rename 'score2' => 'new_score2', 'score1' => 'new_score1', 'name' => 'new_name'
end
- assert_equal Java::CascadingPipe::Each, assembly.tail_pipe.class
- assert_equal Java::CascadingOperationExpression::ExpressionFilter, assembly.tail_pipe.operation.class
+ # Original order preserved
+ assert_equal ['new_name', 'new_score1', 'new_score2', 'id'], assembly.scope.values_fields.to_a
end
- def test_where_with_import
+ def test_copy
assembly = mock_assembly do
- split 'line', ['name', 'score1', 'score2', 'id'], :pattern => /[.,]*\s+/, :output => ['name', 'score1', 'score2', 'id']
- names = ['SMITH', 'JONES', 'BROWN']
- where "import java.util.Arrays;\nArrays.asList(new String[] { \"#{names.join('", "')}\" }).contains(name:string)"
+ split 'line', /[.,]*\s+/, ['name', 'score1', 'score2', 'id'], :output => ['name', 'score1', 'score2', 'id']
+ copy 'score2' => 'new_score2', 'id' => 'new_id', 'name' => 'new_name'
end
- assert_equal Java::CascadingPipe::Each, assembly.tail_pipe.class
- assert_equal Java::CascadingOperationExpression::ExpressionFilter, assembly.tail_pipe.operation.class
+ # Original order preserved in copied fields
+ assert_equal ['name', 'score1', 'score2', 'id', 'new_name', 'new_score2', 'new_id'], assembly.scope.values_fields.to_a
end
def test_smoke_test_describe
diff --git a/test/test_local_execution.rb b/test/test_local_execution.rb
index 5756acb..d262be1 100644
--- a/test/test_local_execution.rb
+++ b/test/test_local_execution.rb
@@ -36,7 +36,7 @@ def test_splitter
source 'copy', tap('test/data/data1.txt')
assembly 'copy' do
- split 'line', :pattern => /[.,]*\s+/, :into=>['name', 'score1', 'score2', 'id'], :output => ['name', 'score1', 'score2', 'id']
+ split 'line', /[.,]*\s+/, ['name', 'score1', 'score2', 'id'], :output => ['name', 'score1', 'score2', 'id']
assert_size_equals 4
assert_not_null
debug :print_fields => true
@@ -70,14 +70,14 @@ def test_join1
source 'data2', tap('test/data/data2.txt')
assembly1 = assembly 'data1' do
- split 'line', :pattern => /[.,]*\s+/, :into => ['name', 'score1', 'score2', 'id'], :output => ['name', 'score1', 'score2', 'id']
+ split 'line', /[.,]*\s+/, ['name', 'score1', 'score2', 'id'], :output => ['name', 'score1', 'score2', 'id']
assert_size_equals 4
assert_not_null
debug :print_fields => true
end
assembly2 = assembly 'data2' do
- split 'line', :pattern => /[.,]*\s+/, :into => ['name', 'id', 'town'], :output => ['name', 'id', 'town']
+ split 'line', /[.,]*\s+/, ['name', 'id', 'town'], :output => ['name', 'id', 'town']
assert_size_equals 3
assert_not_null
debug :print_fields => true
@@ -106,12 +106,12 @@ def test_join2
source 'data2', tap('test/data/data2.txt')
assembly 'data1' do
- split 'line', :pattern => /[.,]*\s+/, :into => ['name', 'score1', 'score2', 'id'], :output => ['name', 'score1', 'score2', 'id']
+ split 'line', /[.,]*\s+/, ['name', 'score1', 'score2', 'id'], :output => ['name', 'score1', 'score2', 'id']
debug :print_fields => true
end
assembly 'data2' do
- split 'line', :pattern => /[.,]*\s+/, :into => ['name', 'code', 'town'], :output => ['name', 'code', 'town']
+ split 'line', /[.,]*\s+/, ['name', 'code', 'town'], :output => ['name', 'code', 'town']
debug :print_fields => true
end
@@ -135,7 +135,7 @@ def test_union
source 'data2', tap('test/data/data2.txt')
assembly 'data1' do
- split 'line', :pattern => /[.,]*\s+/, :into => ['name', 'score1', 'score2', 'id'], :output => ['name', 'score1', 'score2', 'id']
+ split 'line', /[.,]*\s+/, ['name', 'score1', 'score2', 'id'], :output => ['name', 'score1', 'score2', 'id']
assert_size_equals 4
assert_not_null
@@ -144,7 +144,7 @@ def test_union
end
assembly 'data2' do
- split 'line', :pattern => /[.,]*\s+/, :into => ['name', 'code', 'town'], :output => ['name', 'code', 'town']
+ split 'line', /[.,]*\s+/, ['name', 'code', 'town'], :output => ['name', 'code', 'town']
assert_size_equals 3
assert_not_null
diff --git a/test/test_operations.rb b/test/test_operations.rb
index 9f098d8..5d54d27 100644
--- a/test/test_operations.rb
+++ b/test/test_operations.rb
@@ -4,16 +4,6 @@
class TC_Operations < Test::Unit::TestCase
include Operations
- def test_aggregator_function_ignore_values
- min = min_function 'min_field', :ignore => [nil].to_java(:string)
- assert_not_nil min
- end
-
- def test_aggregator_function_ignore_tuples
- first = first_function 'first_field', :ignore => [Java::CascadingTuple::Tuple.new(-1)].to_java(Java::CascadingTuple::Tuple)
- assert_not_nil first
- end
-
def test_coerce_to_java_int
result = coerce_to_java(1)