From da414d86fdeeeaf17c392daa1c72c5b047997552 Mon Sep 17 00:00:00 2001
From: Alan Gardner
Date: Fri, 5 Dec 2014 16:10:31 -0500
Subject: [PATCH 1/3] Added classProbabilities method

---
 .../mllib/classification/NaiveBayes.scala     | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)

diff --git a/mllib/src/main/scala/org/apache/spark/mllib/classification/NaiveBayes.scala b/mllib/src/main/scala/org/apache/spark/mllib/classification/NaiveBayes.scala
index 8c8e4a161aa5b..424367738b03b 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/classification/NaiveBayes.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/classification/NaiveBayes.scala
@@ -65,6 +65,24 @@ class NaiveBayesModel private[mllib] (
   override def predict(testData: Vector): Double = {
     labels(brzArgmax(brzPi + brzTheta * testData.toBreeze))
   }
+
+  def classProbabilities(testData: RDD[Vector]):
+    RDD[scala.collection.mutable.Map[Double, Double]] = {
+    val bcModel = testData.context.broadcast(this)
+    testData.mapPartitions { iter =>
+      val model = bcModel.value
+      iter.map(model.classProbabilities)
+    }
+  }
+
+  def classProbabilities(testData: Vector): scala.collection.mutable.Map[Double, Double] = {
+    val posteriors = (brzPi + brzTheta * testData.toBreeze)
+    val probs:scala.collection.mutable.Map[Double,Double] =
+      scala.collection.mutable.Map.empty[Double, Double]
+    posteriors.foreachPair((k,v) => probs += (labels(k) -> v))
+    probs
+  }
+
 }
 
 /**

From a97d0f89ebfa72186800d2d1592e404d8ebb3d85 Mon Sep 17 00:00:00 2001
From: Alan Gardner
Date: Fri, 5 Dec 2014 16:49:15 -0500
Subject: [PATCH 2/3] Import mutable to be less verbose

---
 .../apache/spark/mllib/classification/NaiveBayes.scala   | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/mllib/src/main/scala/org/apache/spark/mllib/classification/NaiveBayes.scala b/mllib/src/main/scala/org/apache/spark/mllib/classification/NaiveBayes.scala
index 424367738b03b..5c0ffcb035974 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/classification/NaiveBayes.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/classification/NaiveBayes.scala
@@ -24,6 +24,7 @@ import org.apache.spark.SparkContext._
 import org.apache.spark.mllib.linalg.{DenseVector, SparseVector, Vector}
 import org.apache.spark.mllib.regression.LabeledPoint
 import org.apache.spark.rdd.RDD
+import scala.collection.mutable
 
 /**
  * Model for Naive Bayes Classifiers.
@@ -67,7 +68,7 @@ class NaiveBayesModel private[mllib] (
   }
 
   def classProbabilities(testData: RDD[Vector]):
-    RDD[scala.collection.mutable.Map[Double, Double]] = {
+    RDD[mutable.Map[Double, Double]] = {
     val bcModel = testData.context.broadcast(this)
     testData.mapPartitions { iter =>
       val model = bcModel.value
@@ -75,10 +76,10 @@ class NaiveBayesModel private[mllib] (
     }
   }
 
-  def classProbabilities(testData: Vector): scala.collection.mutable.Map[Double, Double] = {
+  def classProbabilities(testData: Vector): mutable.Map[Double, Double] = {
     val posteriors = (brzPi + brzTheta * testData.toBreeze)
-    val probs:scala.collection.mutable.Map[Double,Double] =
-      scala.collection.mutable.Map.empty[Double, Double]
+    val probs:mutable.Map[Double,Double] =
+      mutable.Map.empty[Double, Double]
     posteriors.foreachPair((k,v) => probs += (labels(k) -> v))
     probs
   }

From 7d6b5b4801c9402bfdfedf0eb8d9f87be8345efa Mon Sep 17 00:00:00 2001
From: Alan Gardner
Date: Fri, 5 Dec 2014 23:05:49 -0500
Subject: [PATCH 3/3] Normalize posteriors, change signature to Map interface

---
Reviewer note: `brzPi + brzTheta * x` yields unnormalized log-posteriors, so
dividing each entry by the plain sum does not produce probabilities. The
normalization below is done in log space (log-sum-exp, shifted by the maximum
for numerical stability). The post-image blob id on the index line was not
regenerated after this change.

 .../org/apache/spark/mllib/classification/NaiveBayes.scala | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/mllib/src/main/scala/org/apache/spark/mllib/classification/NaiveBayes.scala b/mllib/src/main/scala/org/apache/spark/mllib/classification/NaiveBayes.scala
index 5c0ffcb035974..76d2be3c632a3 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/classification/NaiveBayes.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/classification/NaiveBayes.scala
@@ -68,7 +68,7 @@ class NaiveBayesModel private[mllib] (
   }
 
   def classProbabilities(testData: RDD[Vector]):
-    RDD[mutable.Map[Double, Double]] = {
+    RDD[scala.collection.Map[Double, Double]] = {
     val bcModel = testData.context.broadcast(this)
     testData.mapPartitions { iter =>
       val model = bcModel.value
@@ -76,11 +76,15 @@ class NaiveBayesModel private[mllib] (
     }
   }
 
-  def classProbabilities(testData: Vector): mutable.Map[Double, Double] = {
+  def classProbabilities(testData: Vector): scala.collection.Map[Double, Double] = {
     val posteriors = (brzPi + brzTheta * testData.toBreeze)
+    // `posteriors` holds unnormalized log-posteriors, so normalize in log space with
+    // log-sum-exp (shifted by the max to avoid underflow) instead of dividing by the raw sum.
+    val maxLog = posteriors.valuesIterator.max
+    val logNorm = maxLog + math.log(posteriors.valuesIterator.map(v => math.exp(v - maxLog)).sum)
     val probs:mutable.Map[Double,Double] =
       mutable.Map.empty[Double, Double]
-    posteriors.foreachPair((k,v) => probs += (labels(k) -> v))
+    posteriors.foreachPair((k,v) => probs += (labels(k) -> math.exp(v - logNorm)))
     probs
   }
 