From cce10d234be1b71720de68d8ee18eec690effd07 Mon Sep 17 00:00:00 2001
From: Feynman Liang <fliang@databricks.com>
Date: Mon, 17 Aug 2015 13:44:05 -0700
Subject: [PATCH 1/5] Add Prefix Span documentation

---
 docs/mllib-frequent-pattern-mining.md | 88 +++++++++++++++++++++++++++
 docs/mllib-guide.md                   |  1 +
 2 files changed, 89 insertions(+)
diff --git a/docs/mllib-frequent-pattern-mining.md b/docs/mllib-frequent-pattern-mining.md
index bcc066a185526..72ccd223efa61 100644
--- a/docs/mllib-frequent-pattern-mining.md
+++ b/docs/mllib-frequent-pattern-mining.md
@@ -96,3 +96,91 @@ for (FPGrowth.FreqItemset<String> itemset: model.freqItemsets().toJavaRDD().coll
 
 </div>
 </div>
+
+## Prefix Span
+
+Prefix Span is a sequential pattern mining algorithm described in
+[Mortazavi-Asl et al., Mining Sequential Patterns by Pattern-Growth: The
+PrefixSpan Approach](http://dx.doi.org/10.1109%2FTKDE.2004.77). We refer
+the reader to the referenced paper for formalizing the sequential
+pattern mining problem.
+
+MLlib's FP-growth implementation takes the following parameters:
+
+* `minSupport`: the minimum support required to be considered a frequent
+  sequential pattern.
+* `maxPatternLength`: the maximum length of a frequent sequential
+  pattern. Any frequent pattern exceeding this length will not be
+  included in the results.
+* `maxLocalProjDBSize`: the maximum number of items allowed in a
+  prefix-projected database before local iterative processing of the
+  projected databse begins. This parameter should be tuned with respect
+  to the size of your executors.
+
+
+
+**Examples**
+
+<div class="codetabs">
+<div data-lang="scala" markdown="1">
+
+[`PrefixSpan`](api/scala/index.html#org.apache.spark.mllib.fpm.PrefixSpan) implements the
+Prefix Span algorithm.
+Calling `PrefixSpan.run` returns a
+[`PrefixSpanModel`](api/scala/index.html#org.apache.spark.mllib.fpm.PrefixSpan)
+that stores the frequent sequences with their frequencies.
+
+{% highlight scala %}
+import org.apache.spark.mllib.fpm.PrefixSpan
+
+val sequences = Seq(
+  Array(Array(1, 2), Array(3)),
+  Array(Array(1), Array(3, 2), Array(1, 2)),
+  Array(Array(1, 2), Array(5)),
+  Array(Array(6)))
+val rdd = sc.parallelize(sequences, 2).cache()
+
+val prefixSpan = new PrefixSpan()
+  .setMinSupport(0.5)
+  .setMaxPatternLength(5)
+val model = prefixSpan.run(rdd)
+model.freqSequences.collect().foreach { freqSequence =>
+  println(freqSequence.sequence.map(_.mkString("(", ",", ")")).mkString("[",",","]") + ", " + freqSequence.freq)
+}
+{% endhighlight %}
+
+</div>
+
+<div data-lang="java" markdown="1">
+
+[`PrefixSpan`](api/java/org/apache/spark/mllib/fpm/PrefixSpan.html) implements the
+Prefix Span algorithm.
+Calling `PrefixSpan.run` returns a
+[`PrefixSpanModel`](api/java/org/apache/spark/mllib/fpm/PrefixSpanModel.html)
+that stores the frequent sequences with their frequencies.
+
+{% highlight java %}
+import java.util.Arrays;
+import java.util.List;
+
+import org.apache.spark.mllib.fpm.PrefixSpan;
+import org.apache.spark.mllib.fpm.PrefixSpanModel;
+
+JavaRDD<List<List<Integer>>> sequences = sc.parallelize(Arrays.asList(
+  Arrays.asList(Arrays.asList(1, 2), Arrays.asList(3)),
+  Arrays.asList(Arrays.asList(1), Arrays.asList(3, 2), Arrays.asList(1, 2)),
+  Arrays.asList(Arrays.asList(1, 2), Arrays.asList(5)),
+  Arrays.asList(Arrays.asList(6))
+), 2);
+PrefixSpan prefixSpan = new PrefixSpan()
+  .setMinSupport(0.5)
+  .setMaxPatternLength(5);
+PrefixSpanModel<Integer> model = prefixSpan.run(sequences);
+for (PrefixSpan.FreqSequence<Integer> freqSeq: model.freqSequences().toJavaRDD().collect()) {
+  System.out.println(freqSeq.javaSequence() + ", " + freqSeq.freq());
+}
+{% endhighlight %}
+
+</div>
+</div>
+
diff --git a/docs/mllib-guide.md b/docs/mllib-guide.md
index e8000ff478300..096f6f1952126 100644
--- a/docs/mllib-guide.md
+++ b/docs/mllib-guide.md
@@ -48,6 +48,7 @@ This lists functionality included in `spark.mllib`, the main MLlib API.
 * [Feature extraction and transformation](mllib-feature-extraction.html)
 * [Frequent pattern mining](mllib-frequent-pattern-mining.html)
   * [FP-growth](mllib-frequent-pattern-mining.html#fp-growth)
+  * prefix span
 * [Evaluation Metrics](mllib-evaluation-metrics.html)
 * [Optimization (developer)](mllib-optimization.html)
   * [stochastic gradient descent](mllib-optimization.html#stochastic-gradient-descent-sgd)

From 1729b5928a051a0c738aa686ea89078e61dc312c Mon Sep 17 00:00:00 2001
From: Feynman Liang <fliang@databricks.com>
Date: Mon, 17 Aug 2015 14:09:33 -0700
Subject: [PATCH 2/5] Java documentation fix

---
 docs/mllib-frequent-pattern-mining.md | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/docs/mllib-frequent-pattern-mining.md b/docs/mllib-frequent-pattern-mining.md
index 72ccd223efa61..190c8c0daa882 100644
--- a/docs/mllib-frequent-pattern-mining.md
+++ b/docs/mllib-frequent-pattern-mining.md
@@ -145,7 +145,8 @@ val prefixSpan = new PrefixSpan()
   .setMaxPatternLength(5)
 val model = prefixSpan.run(rdd)
 model.freqSequences.collect().foreach { freqSequence =>
-  println(freqSequence.sequence.map(_.mkString("(", ",", ")")).mkString("[",",","]") + ", " + freqSequence.freq)
+println(
+  freqSequence.sequence.map(_.mkString("[", ", ", "]")).mkString("[", ", ", "]") + ", " + freqSequence.freq)
 }
 {% endhighlight %}
 

From 737b2271caf6a86805e6c34a74e26d428058924a Mon Sep 17 00:00:00 2001
From: Feynman Liang <fliang@databricks.com>
Date: Mon, 17 Aug 2015 15:26:31 -0700
Subject: [PATCH 3/5] Add link to prefix span section

---
 docs/mllib-guide.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/mllib-guide.md b/docs/mllib-guide.md
index 096f6f1952126..3532d8534a77b 100644
--- a/docs/mllib-guide.md
+++ b/docs/mllib-guide.md
@@ -48,7 +48,7 @@ This lists functionality included in `spark.mllib`, the main MLlib API.
 * [Feature extraction and transformation](mllib-feature-extraction.html)
 * [Frequent pattern mining](mllib-frequent-pattern-mining.html)
   * [FP-growth](mllib-frequent-pattern-mining.html#fp-growth)
-  * prefix span
+  * [prefix span](mllib-frequent-pattern-mining.html#prefix-span)
 * [Evaluation Metrics](mllib-evaluation-metrics.html)
 * [Optimization (developer)](mllib-optimization.html)
   * [stochastic gradient descent](mllib-optimization.html#stochastic-gradient-descent-sgd)

From b9842e6c9b0b4318d637dcc0c00cee8acab0f243 Mon Sep 17 00:00:00 2001
From: Feynman Liang <fliang@databricks.com>
Date: Mon, 17 Aug 2015 16:05:12 -0700
Subject: [PATCH 4/5] Code review feedback

---
 docs/mllib-frequent-pattern-mining.md | 35 ++++++++++++++++-----------
 docs/mllib-guide.md                   |  2 +-
 2 files changed, 22 insertions(+), 15 deletions(-)

diff --git a/docs/mllib-frequent-pattern-mining.md b/docs/mllib-frequent-pattern-mining.md
index 190c8c0daa882..998f54f313f5a 100644
--- a/docs/mllib-frequent-pattern-mining.md
+++ b/docs/mllib-frequent-pattern-mining.md
@@ -97,15 +97,15 @@ for (FPGrowth.FreqItemset<String> itemset: model.freqItemsets().toJavaRDD().coll
 </div>
 </div>
 
-## Prefix Span
+## PrefixSpan
 
-Prefix Span is a sequential pattern mining algorithm described in
-[Mortazavi-Asl et al., Mining Sequential Patterns by Pattern-Growth: The
+PrefixSpan is a sequential pattern mining algorithm described in
+[Pei et al., Mining Sequential Patterns by Pattern-Growth: The
 PrefixSpan Approach](http://dx.doi.org/10.1109%2FTKDE.2004.77). We refer
 the reader to the referenced paper for formalizing the sequential
 pattern mining problem.
 
-MLlib's FP-growth implementation takes the following parameters:
+MLlib's PrefixSpan implementation takes the following parameters:
 
 * `minSupport`: the minimum support required to be considered a frequent
   sequential pattern.
@@ -121,11 +121,19 @@ MLlib's FP-growth implementation takes the following parameters:
 
 **Examples**
 
+The following example illustrates PrefixSpan running on the sequences
+(using same notation as Pei et al):
+
+  <(12)3>
+  <1(32)(12)>
+  <(12)5>
+  <6>
+
 <div class="codetabs">
 <div data-lang="scala" markdown="1">
 
 [`PrefixSpan`](api/scala/index.html#org.apache.spark.mllib.fpm.PrefixSpan) implements the
-Prefix Span algorithm.
+PrefixSpan algorithm.
 Calling `PrefixSpan.run` returns a
 [`PrefixSpanModel`](api/scala/index.html#org.apache.spark.mllib.fpm.PrefixSpan)
 that stores the frequent sequences with their frequencies.
@@ -133,17 +141,16 @@ that stores the frequent sequences with their frequencies.
 {% highlight scala %}
 import org.apache.spark.mllib.fpm.PrefixSpan
 
-val sequences = Seq(
-  Array(Array(1, 2), Array(3)),
-  Array(Array(1), Array(3, 2), Array(1, 2)),
-  Array(Array(1, 2), Array(5)),
-  Array(Array(6)))
-val rdd = sc.parallelize(sequences, 2).cache()
-
+val sequences = sc.parallelize(Seq(
+    Array(Array(1, 2), Array(3)),
+    Array(Array(1), Array(3, 2), Array(1, 2)),
+    Array(Array(1, 2), Array(5)),
+    Array(Array(6))
+  ), 2).cache()
 val prefixSpan = new PrefixSpan()
   .setMinSupport(0.5)
   .setMaxPatternLength(5)
-val model = prefixSpan.run(rdd)
+val model = prefixSpan.run(sequences)
 model.freqSequences.collect().foreach { freqSequence =>
 println(
   freqSequence.sequence.map(_.mkString("[", ", ", "]")).mkString("[", ", ", "]") + ", " + freqSequence.freq)
@@ -155,7 +162,7 @@ println(
 <div data-lang="java" markdown="1">
 
 [`PrefixSpan`](api/java/org/apache/spark/mllib/fpm/PrefixSpan.html) implements the
-Prefix Span algorithm.
+PrefixSpan algorithm.
 Calling `PrefixSpan.run` returns a
 [`PrefixSpanModel`](api/java/org/apache/spark/mllib/fpm/PrefixSpanModel.html)
 that stores the frequent sequences with their frequencies.
diff --git a/docs/mllib-guide.md b/docs/mllib-guide.md
index 3532d8534a77b..7851175b98230 100644
--- a/docs/mllib-guide.md
+++ b/docs/mllib-guide.md
@@ -48,7 +48,7 @@ This lists functionality included in `spark.mllib`, the main MLlib API.
 * [Feature extraction and transformation](mllib-feature-extraction.html)
 * [Frequent pattern mining](mllib-frequent-pattern-mining.html)
   * [FP-growth](mllib-frequent-pattern-mining.html#fp-growth)
-  * [prefix span](mllib-frequent-pattern-mining.html#prefix-span)
+  * [PrefixSpan](mllib-frequent-pattern-mining.html#prefixspan)
 * [Evaluation Metrics](mllib-evaluation-metrics.html)
 * [Optimization (developer)](mllib-optimization.html)
   * [stochastic gradient descent](mllib-optimization.html#stochastic-gradient-descent-sgd)

From cec483bff50a83802e507f69570d767e753b3da4 Mon Sep 17 00:00:00 2001
From: Feynman Liang <fliang@databricks.com>
Date: Mon, 17 Aug 2015 17:40:37 -0700
Subject: [PATCH 5/5] Code review fixes

---
 docs/mllib-frequent-pattern-mining.md | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/docs/mllib-frequent-pattern-mining.md b/docs/mllib-frequent-pattern-mining.md
index 998f54f313f5a..8ea4389266484 100644
--- a/docs/mllib-frequent-pattern-mining.md
+++ b/docs/mllib-frequent-pattern-mining.md
@@ -117,17 +117,17 @@ MLlib's PrefixSpan implementation takes the following parameters:
-  projected databse begins. This parameter should be tuned with respect
+  projected database begins. This parameter should be tuned with respect
   to the size of your executors.
 
-
-
 **Examples**
 
 The following example illustrates PrefixSpan running on the sequences
-(using same notation as Pei et al):
+(using the same notation as Pei et al.):
 
+~~~
   <(12)3>
   <1(32)(12)>
   <(12)5>
   <6>
+~~~
 
 <div class="codetabs">
 <div data-lang="scala" markdown="1">
@@ -135,7 +135,7 @@ The following example illustrates PrefixSpan running on the sequences
 [`PrefixSpan`](api/scala/index.html#org.apache.spark.mllib.fpm.PrefixSpan) implements the
 PrefixSpan algorithm.
 Calling `PrefixSpan.run` returns a
-[`PrefixSpanModel`](api/scala/index.html#org.apache.spark.mllib.fpm.PrefixSpan)
+[`PrefixSpanModel`](api/scala/index.html#org.apache.spark.mllib.fpm.PrefixSpanModel)
 that stores the frequent sequences with their frequencies.
 
 {% highlight scala %}