From cce10d234be1b71720de68d8ee18eec690effd07 Mon Sep 17 00:00:00 2001 From: Feynman Liang Date: Mon, 17 Aug 2015 13:44:05 -0700 Subject: [PATCH 1/5] Add Prefix Span documentation --- docs/mllib-frequent-pattern-mining.md | 88 +++++++++++++++++++++++++++ docs/mllib-guide.md | 1 + 2 files changed, 89 insertions(+) diff --git a/docs/mllib-frequent-pattern-mining.md b/docs/mllib-frequent-pattern-mining.md index bcc066a185526..72ccd223efa61 100644 --- a/docs/mllib-frequent-pattern-mining.md +++ b/docs/mllib-frequent-pattern-mining.md @@ -96,3 +96,91 @@ for (FPGrowth.FreqItemset itemset: model.freqItemsets().toJavaRDD().coll + +## Prefix Span + +Prefix Span is a sequential pattern mining algorithm described in +[Mortazavi-Asl et al., Mining Sequential Patterns by Pattern-Growth: The +PrefixSpan Approach](http://dx.doi.org/10.1109%2FTKDE.2004.77). We refer +the reader to the referenced paper for formalizing the sequential +pattern mining problem. + +MLlib's FP-growth implementation takes the following parameters: + +* `minSupport`: the minimum support required to be considered a frequent + sequential pattern. +* `maxPatternLength`: the maximum length of a frequent sequential + pattern. Any frequent pattern exceeding this length will not be + included in the results. +* `maxLocalProjDBSize`: the maximum number of items allowed in a + prefix-projected database before local iterative processing of the + projected databse begins. This parameter should be tuned with respect + to the size of your executors. + + + +**Examples** + +
+
+ +[`PrefixSpan`](api/scala/index.html#org.apache.spark.mllib.fpm.PrefixSpan) implements the +Prefix Span algorithm. +Calling `PrefixSpan.run` returns a +[`PrefixSpanModel`](api/scala/index.html#org.apache.spark.mllib.fpm.PrefixSpan) +that stores the frequent sequences with their frequencies. + +{% highlight scala %} +import org.apache.spark.mllib.fpm.PrefixSpan + +val sequences = Seq( + Array(Array(1, 2), Array(3)), + Array(Array(1), Array(3, 2), Array(1, 2)), + Array(Array(1, 2), Array(5)), + Array(Array(6))) +val rdd = sc.parallelize(sequences, 2).cache() + +val prefixSpan = new PrefixSpan() + .setMinSupport(0.5) + .setMaxPatternLength(5) +val model = prefixSpan.run(rdd) +model.freqSequences.collect().foreach { freqSequence => + println(freqSequence.sequence.map(_.mkString("(", ",", ")")).mkString("[",",","]") + ", " + freqSequence.freq) +} +{% endhighlight %} + +
+ +
+ +[`PrefixSpan`](api/java/org/apache/spark/mllib/fpm/PrefixSpan.html) implements the +Prefix Span algorithm. +Calling `PrefixSpan.run` returns a +[`PrefixSpanModel`](api/java/org/apache/spark/mllib/fpm/PrefixSpanModel.html) +that stores the frequent sequences with their frequencies. + +{% highlight java %} +import java.util.Arrays; +import java.util.List; + +import org.apache.spark.mllib.fpm.PrefixSpan; +import org.apache.spark.mllib.fpm.PrefixSpanModel; + +JavaRDD<List<List<Integer>>> sequences = sc.parallelize(Arrays.asList( + Arrays.asList(Arrays.asList(1, 2), Arrays.asList(3)), + Arrays.asList(Arrays.asList(1), Arrays.asList(3, 2), Arrays.asList(1, 2)), + Arrays.asList(Arrays.asList(1, 2), Arrays.asList(5)), + Arrays.asList(Arrays.asList(6)) +), 2); +PrefixSpan prefixSpan = new PrefixSpan() + .setMinSupport(0.5) + .setMaxPatternLength(5); +PrefixSpanModel<Integer> model = prefixSpan.run(sequences); +for (PrefixSpan.FreqSequence<Integer> freqSeq: model.freqSequences().toJavaRDD().collect()) { + System.out.println(freqSeq.javaSequence() + ", " + freqSeq.freq()); +} +{% endhighlight %} + +
+
+ diff --git a/docs/mllib-guide.md b/docs/mllib-guide.md index e8000ff478300..096f6f1952126 100644 --- a/docs/mllib-guide.md +++ b/docs/mllib-guide.md @@ -48,6 +48,7 @@ This lists functionality included in `spark.mllib`, the main MLlib API. * [Feature extraction and transformation](mllib-feature-extraction.html) * [Frequent pattern mining](mllib-frequent-pattern-mining.html) * [FP-growth](mllib-frequent-pattern-mining.html#fp-growth) + * prefix span * [Evaluation Metrics](mllib-evaluation-metrics.html) * [Optimization (developer)](mllib-optimization.html) * [stochastic gradient descent](mllib-optimization.html#stochastic-gradient-descent-sgd) From 1729b5928a051a0c738aa686ea89078e61dc312c Mon Sep 17 00:00:00 2001 From: Feynman Liang Date: Mon, 17 Aug 2015 14:09:33 -0700 Subject: [PATCH 2/5] Java documentation fix --- docs/mllib-frequent-pattern-mining.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/docs/mllib-frequent-pattern-mining.md b/docs/mllib-frequent-pattern-mining.md index 72ccd223efa61..190c8c0daa882 100644 --- a/docs/mllib-frequent-pattern-mining.md +++ b/docs/mllib-frequent-pattern-mining.md @@ -145,7 +145,8 @@ val prefixSpan = new PrefixSpan() .setMaxPatternLength(5) val model = prefixSpan.run(rdd) model.freqSequences.collect().foreach { freqSequence => - println(freqSequence.sequence.map(_.mkString("(", ",", ")")).mkString("[",",","]") + ", " + freqSequence.freq) +println( + freqSequence.sequence.map(_.mkString("[", ", ", "]")).mkString("[", ", ", "]") + ", " + freqSequence.freq) } {% endhighlight %} From 737b2271caf6a86805e6c34a74e26d428058924a Mon Sep 17 00:00:00 2001 From: Feynman Liang Date: Mon, 17 Aug 2015 15:26:31 -0700 Subject: [PATCH 3/5] Add link to prefix span section --- docs/mllib-guide.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/mllib-guide.md b/docs/mllib-guide.md index 096f6f1952126..3532d8534a77b 100644 --- a/docs/mllib-guide.md +++ b/docs/mllib-guide.md @@ -48,7 +48,7 @@ This lists 
functionality included in `spark.mllib`, the main MLlib API. * [Feature extraction and transformation](mllib-feature-extraction.html) * [Frequent pattern mining](mllib-frequent-pattern-mining.html) * [FP-growth](mllib-frequent-pattern-mining.html#fp-growth) - * prefix span + * [prefix span](mllib-frequent-pattern-mining.html#prefix-span) * [Evaluation Metrics](mllib-evaluation-metrics.html) * [Optimization (developer)](mllib-optimization.html) * [stochastic gradient descent](mllib-optimization.html#stochastic-gradient-descent-sgd) From b9842e6c9b0b4318d637dcc0c00cee8acab0f243 Mon Sep 17 00:00:00 2001 From: Feynman Liang Date: Mon, 17 Aug 2015 16:05:12 -0700 Subject: [PATCH 4/5] Code review feedback --- docs/mllib-frequent-pattern-mining.md | 35 ++++++++++++++++----------- docs/mllib-guide.md | 2 +- 2 files changed, 22 insertions(+), 15 deletions(-) diff --git a/docs/mllib-frequent-pattern-mining.md b/docs/mllib-frequent-pattern-mining.md index 190c8c0daa882..998f54f313f5a 100644 --- a/docs/mllib-frequent-pattern-mining.md +++ b/docs/mllib-frequent-pattern-mining.md @@ -97,15 +97,15 @@ for (FPGrowth.FreqItemset itemset: model.freqItemsets().toJavaRDD().coll -## Prefix Span +## PrefixSpan -Prefix Span is a sequential pattern mining algorithm described in -[Mortazavi-Asl et al., Mining Sequential Patterns by Pattern-Growth: The +PrefixSpan is a sequential pattern mining algorithm described in +[Pei et al., Mining Sequential Patterns by Pattern-Growth: The PrefixSpan Approach](http://dx.doi.org/10.1109%2FTKDE.2004.77). We refer the reader to the referenced paper for formalizing the sequential pattern mining problem. -MLlib's FP-growth implementation takes the following parameters: +MLlib's PrefixSpan implementation takes the following parameters: * `minSupport`: the minimum support required to be considered a frequent sequential pattern. 
@@ -121,11 +121,19 @@ MLlib's FP-growth implementation takes the following parameters: **Examples** +The following example illustrates PrefixSpan running on the sequences +(using same notation as Pei et al): + + <(12)3> + <1(32)(12)> + <(12)5> + <6> +
[`PrefixSpan`](api/scala/index.html#org.apache.spark.mllib.fpm.PrefixSpan) implements the -Prefix Span algorithm. +PrefixSpan algorithm. Calling `PrefixSpan.run` returns a [`PrefixSpanModel`](api/scala/index.html#org.apache.spark.mllib.fpm.PrefixSpan) that stores the frequent sequences with their frequencies. @@ -133,17 +141,16 @@ that stores the frequent sequences with their frequencies. {% highlight scala %} import org.apache.spark.mllib.fpm.PrefixSpan -val sequences = Seq( - Array(Array(1, 2), Array(3)), - Array(Array(1), Array(3, 2), Array(1, 2)), - Array(Array(1, 2), Array(5)), - Array(Array(6))) -val rdd = sc.parallelize(sequences, 2).cache() - +val sequences = sc.parallelize(Seq( + Array(Array(1, 2), Array(3)), + Array(Array(1), Array(3, 2), Array(1, 2)), + Array(Array(1, 2), Array(5)), + Array(Array(6)) + ), 2).cache() val prefixSpan = new PrefixSpan() .setMinSupport(0.5) .setMaxPatternLength(5) -val model = prefixSpan.run(rdd) +val model = prefixSpan.run(sequences) model.freqSequences.collect().foreach { freqSequence => println( freqSequence.sequence.map(_.mkString("[", ", ", "]")).mkString("[", ", ", "]") + ", " + freqSequence.freq) @@ -155,7 +162,7 @@ println(
[`PrefixSpan`](api/java/org/apache/spark/mllib/fpm/PrefixSpan.html) implements the -Prefix Span algorithm. +PrefixSpan algorithm. Calling `PrefixSpan.run` returns a [`PrefixSpanModel`](api/java/org/apache/spark/mllib/fpm/PrefixSpanModel.html) that stores the frequent sequences with their frequencies. diff --git a/docs/mllib-guide.md b/docs/mllib-guide.md index 3532d8534a77b..7851175b98230 100644 --- a/docs/mllib-guide.md +++ b/docs/mllib-guide.md @@ -48,7 +48,7 @@ This lists functionality included in `spark.mllib`, the main MLlib API. * [Feature extraction and transformation](mllib-feature-extraction.html) * [Frequent pattern mining](mllib-frequent-pattern-mining.html) * [FP-growth](mllib-frequent-pattern-mining.html#fp-growth) - * [prefix span](mllib-frequent-pattern-mining.html#prefix-span) + * [PrefixSpan](mllib-frequent-pattern-mining.html#prefixspan) * [Evaluation Metrics](mllib-evaluation-metrics.html) * [Optimization (developer)](mllib-optimization.html) * [stochastic gradient descent](mllib-optimization.html#stochastic-gradient-descent-sgd) From cec483bff50a83802e507f69570d767e753b3da4 Mon Sep 17 00:00:00 2001 From: Feynman Liang Date: Mon, 17 Aug 2015 17:40:37 -0700 Subject: [PATCH 5/5] Code review fixes --- docs/mllib-frequent-pattern-mining.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/mllib-frequent-pattern-mining.md b/docs/mllib-frequent-pattern-mining.md index 998f54f313f5a..8ea4389266484 100644 --- a/docs/mllib-frequent-pattern-mining.md +++ b/docs/mllib-frequent-pattern-mining.md @@ -117,17 +117,17 @@ MLlib's PrefixSpan implementation takes the following parameters: projected databse begins. This parameter should be tuned with respect to the size of your executors. - - **Examples** The following example illustrates PrefixSpan running on the sequences (using same notation as Pei et al): +~~~ <(12)3> <1(32)(12)> <(12)5> <6> +~~~
@@ -135,7 +135,7 @@ The following example illustrates PrefixSpan running on the sequences [`PrefixSpan`](api/scala/index.html#org.apache.spark.mllib.fpm.PrefixSpan) implements the PrefixSpan algorithm. Calling `PrefixSpan.run` returns a -[`PrefixSpanModel`](api/scala/index.html#org.apache.spark.mllib.fpm.PrefixSpan) +[`PrefixSpanModel`](api/scala/index.html#org.apache.spark.mllib.fpm.PrefixSpanModel) that stores the frequent sequences with their frequencies. {% highlight scala %}