From f7435874ddd0ede6f00893090fb9ea2fc34ee8d0 Mon Sep 17 00:00:00 2001
From: Zheng RuiFeng
Date: Mon, 9 May 2016 21:31:49 +0800
Subject: [PATCH 1/4] create pr

---
 .../org/apache/spark/ml/ann/BreezeUtil.scala  |  2 +-
 .../scala/org/apache/spark/ml/ann/Layer.scala | 36 ++++++++++---------
 2 files changed, 20 insertions(+), 18 deletions(-)

diff --git a/mllib/src/main/scala/org/apache/spark/ml/ann/BreezeUtil.scala b/mllib/src/main/scala/org/apache/spark/ml/ann/BreezeUtil.scala
index 7429f9d652ac5..29e4c965e8821 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/ann/BreezeUtil.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/ann/BreezeUtil.scala
@@ -55,7 +55,7 @@ private[ann] object BreezeUtil {
    * @param y y
    */
  def dgemv(alpha: Double, a: BDM[Double], x: BDV[Double], beta: Double, y: BDV[Double]): Unit = {
-    require(a.cols == x.length, "A & b Dimension mismatch!")
+    require(a.cols == x.length, "A & x Dimension mismatch!")
     NativeBLAS.dgemv(transposeString(a), a.rows, a.cols,
       alpha, a.data, a.offset, a.majorStride, x.data, x.offset, x.stride,
       beta, y.data, y.offset, y.stride)
diff --git a/mllib/src/main/scala/org/apache/spark/ml/ann/Layer.scala b/mllib/src/main/scala/org/apache/spark/ml/ann/Layer.scala
index 3588ac1e95bee..a27ee51874fa2 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/ann/Layer.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/ann/Layer.scala
@@ -64,8 +64,9 @@ private[ann] trait Layer extends Serializable {
    * @return the layer model
    */
   def createModel(initialWeights: BDV[Double]): LayerModel
+
   /**
-   * Returns the instance of the layer with random generated weights
+   * Returns the instance of the layer with random generated weights.
    *
    * @param weights vector for weights initialization, must be equal to weightSize
    * @param random random number generator
@@ -83,11 +84,11 @@ private[ann] trait LayerModel extends Serializable {
   val weights: BDV[Double]
 
   /**
-   * Evaluates the data (process the data through the layer)
+   * Evaluates the data (process the data through the layer).
    * Output is allocated based on the size provided by the
-   * LayerModel implementation and the stack (batch) size
+   * LayerModel implementation and the stack (batch) size.
    * Developer is responsible for checking the size of output
-   * when writing to it
+   * when writing to it.
    *
    * @param data data
    * @param output output (modified in place)
@@ -95,11 +96,11 @@ private[ann] trait LayerModel extends Serializable {
   def eval(data: BDM[Double], output: BDM[Double]): Unit
 
   /**
-   * Computes the delta for back propagation
+   * Computes the delta for back propagation.
    * Delta is allocated based on the size provided by the
-   * LayerModel implementation and the stack (batch) size
+   * LayerModel implementation and the stack (batch) size.
    * Developer is responsible for checking the size of
-   * prevDelta when writing to it
+   * prevDelta when writing to it.
    *
    * @param delta delta of this layer
    * @param output output of this layer
@@ -108,10 +109,10 @@ private[ann] trait LayerModel extends Serializable {
   def computePrevDelta(delta: BDM[Double], output: BDM[Double], prevDelta: BDM[Double]): Unit
 
   /**
-   * Computes the gradient
-   * cumGrad is a wrapper on the part of the weight vector
-   * size of cumGrad is based on weightSize provided by
-   * implementation of LayerModel
+   * Computes the gradient.
+   * cumGrad is a wrapper on the part of the weight vector.
+   * Size of cumGrad is based on weightSize provided by
+   * implementation of LayerModel.
    *
    * @param delta delta for this layer
    * @param input input data
@@ -197,11 +198,11 @@ private[ann] object AffineLayerModel {
   }
 
   /**
-   * Initialize weights randomly in the interval
-   * Uses [Bottou-88] heuristic [-a/sqrt(in); a/sqrt(in)]
-   * where a is chosen in a such way that the weight variance corresponds
+   * Initialize weights randomly in the interval.
+   * Uses [Bottou-88] heuristic [-a/sqrt(in); a/sqrt(in)],
+   * where `a` is chosen in such a way that the weight variance corresponds
    * to the points to the maximal curvature of the activation function
-   * (which is approximately 2.38 for a standard sigmoid)
+   * (which is approximately 2.38 for a standard sigmoid).
    *
    * @param numIn number of inputs
    * @param numOut number of outputs
@@ -306,7 +307,7 @@ private[ann] class FunctionalLayer (val activationFunction: ActivationFunction)
 /**
  * Functional layer model. Holds no weights.
  *
- * @param layer functiona layer
+ * @param layer functional layer
  */
 private[ann] class FunctionalLayerModel private[ann] (val layer: FunctionalLayer)
   extends LayerModel {
@@ -352,6 +353,7 @@ private[ann] trait TopologyModel extends Serializable {
    * Array of layer models
    */
   val layerModels: Array[LayerModel]
+
   /**
    * Forward propagation
    *
@@ -410,7 +412,7 @@ private[ml] object FeedForwardTopology {
    * Creates a multi-layer perceptron
    *
    * @param layerSizes sizes of layers including input and output size
-   * @param softmaxOnTop wether to use SoftMax or Sigmoid function for an output layer.
+   * @param softmaxOnTop whether to use SoftMax or Sigmoid function for an output layer.
    *                Softmax is default
    * @return multilayer perceptron topology
    */

From b2e7ca7feb98af511365cdc09b41f78b475f7488 Mon Sep 17 00:00:00 2001
From: Zheng RuiFeng
Date: Mon, 9 May 2016 21:35:34 +0800
Subject: [PATCH 2/4] fix one nit in ml-guide.md

---
 docs/ml-guide.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/ml-guide.md b/docs/ml-guide.md
index 99167873cd02d..cc353df1ecd00 100644
--- a/docs/ml-guide.md
+++ b/docs/ml-guide.md
@@ -257,7 +257,7 @@ Currently, `spark.ml` supports model selection using the [`CrossValidator`](api/
 
 The `Evaluator` can be a [`RegressionEvaluator`](api/scala/index.html#org.apache.spark.ml.evaluation.RegressionEvaluator)
 for regression problems, a [`BinaryClassificationEvaluator`](api/scala/index.html#org.apache.spark.ml.evaluation.BinaryClassificationEvaluator)
-for binary data, or a [`MultiClassClassificationEvaluator`](api/scala/index.html#org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator)
+for binary data, or a [`MulticlassClassificationEvaluator`](api/scala/index.html#org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator)
 for multiclass problems. The default metric used to choose the best `ParamMap` can be overridden
 by the `setMetricName` method in each of these evaluators.
 
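
A note on the [Bottou-88] heuristic documented in PATCH 1/4 above: the cleaned-up
comment describes uniform initialization in [-a/sqrt(in); a/sqrt(in)] with `a`
approximately 2.38 for a standard sigmoid. A minimal, self-contained Scala sketch
of that heuristic (the helper name and flat weight layout are illustrative only,
not the actual AffineLayerModel code):

    import java.util.Random

    // Uniform in [-a/sqrt(numIn), a/sqrt(numIn)] with a ~ 2.38, so that the
    // weight variance sits near the maximal-curvature point of a standard sigmoid.
    def bottouInit(numIn: Int, numOut: Int, random: Random): Array[Double] = {
      val a = 2.38 / math.sqrt(numIn.toDouble)
      Array.fill(numIn * numOut)((2.0 * random.nextDouble() - 1.0) * a)
    }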
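The guide text touched by PATCH 2/4 also explains how the evaluator drives model
selection; a minimal sketch of that wiring (the estimator choice, grid values, and
metric name below are placeholders, not part of the patch):

    import org.apache.spark.ml.classification.LogisticRegression
    import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator
    import org.apache.spark.ml.tuning.{CrossValidator, ParamGridBuilder}

    val lr = new LogisticRegression()

    // Candidate ParamMaps; CrossValidator keeps the one with the best metric.
    val grid = new ParamGridBuilder()
      .addGrid(lr.regParam, Array(0.01, 0.1))
      .build()

    // setMetricName overrides the evaluator's default metric, as the guide says.
    val evaluator = new MulticlassClassificationEvaluator()
      .setMetricName("weightedPrecision")

    val cv = new CrossValidator()
      .setEstimator(lr)
      .setEvaluator(evaluator)
      .setEstimatorParamMaps(grid)
      .setNumFolds(3)

    // val model = cv.fit(training)  // training: DataFrame with label/features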
From 9ab32c2d4a66c9a8f4992e4c93ac0c88a9cde4c8 Mon Sep 17 00:00:00 2001
From: Zheng RuiFeng
Date: Thu, 12 May 2016 21:48:29 +0800
Subject: [PATCH 3/4] fix some typos in sql

---
 .../main/scala/org/apache/spark/sql/SparkSession.scala   | 2 +-
 .../main/scala/org/apache/spark/sql/api/r/SQLUtils.scala | 8 +++-----
 .../apache/spark/sql/expressions/scalalang/typed.scala   | 2 +-
 3 files changed, 5 insertions(+), 7 deletions(-)

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/SparkSession.scala b/sql/core/src/main/scala/org/apache/spark/sql/SparkSession.scala
index 02c9dc03ae82e..d94408ba9eb32 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/SparkSession.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/SparkSession.scala
@@ -39,7 +39,7 @@ import org.apache.spark.sql.catalyst.plans.logical.{LocalRelation, LogicalPlan,
 import org.apache.spark.sql.execution._
 import org.apache.spark.sql.execution.datasources.LogicalRelation
 import org.apache.spark.sql.execution.ui.SQLListener
-import org.apache.spark.sql.internal.{CatalogImpl, SessionState, SharedState, SQLConf}
+import org.apache.spark.sql.internal.{CatalogImpl, SessionState, SharedState}
 import org.apache.spark.sql.sources.BaseRelation
 import org.apache.spark.sql.types.{DataType, LongType, StructType}
 import org.apache.spark.sql.util.ExecutionListenerManager
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/api/r/SQLUtils.scala b/sql/core/src/main/scala/org/apache/spark/sql/api/r/SQLUtils.scala
index 36173a49250b5..ffb606f2c66de 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/api/r/SQLUtils.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/api/r/SQLUtils.scala
@@ -26,9 +26,7 @@ import org.apache.spark.api.r.SerDe
 import org.apache.spark.broadcast.Broadcast
 import org.apache.spark.rdd.RDD
 import org.apache.spark.sql.{DataFrame, Row, SaveMode, SQLContext}
-import org.apache.spark.sql.catalyst.encoders.RowEncoder
 import org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema
-import org.apache.spark.sql.Encoder
 import org.apache.spark.sql.types._
 
 private[sql] object SQLUtils {
@@ -75,7 +73,7 @@ private[sql] object SQLUtils {
         org.apache.spark.sql.types.MapType(getSQLDataType(keyType), getSQLDataType(valueType))
       case r"\Astruct<(.+)${fieldsStr}>\Z" =>
         if (fieldsStr(fieldsStr.length - 1) == ',') {
-          throw new IllegalArgumentException(s"Invaid type $dataType")
+          throw new IllegalArgumentException(s"Invalid type $dataType")
         }
         val fields = fieldsStr.split(",")
         val structFields = fields.map { field =>
@@ -83,11 +81,11 @@ private[sql] object SQLUtils {
             case r"\A(.+)${fieldName}:(.+)${fieldType}\Z" =>
               createStructField(fieldName, fieldType, true)
 
-            case _ => throw new IllegalArgumentException(s"Invaid type $dataType")
+            case _ => throw new IllegalArgumentException(s"Invalid type $dataType")
           }
         }
         createStructType(structFields)
-      case _ => throw new IllegalArgumentException(s"Invaid type $dataType")
+      case _ => throw new IllegalArgumentException(s"Invalid type $dataType")
     }
   }
 
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/expressions/scalalang/typed.scala b/sql/core/src/main/scala/org/apache/spark/sql/expressions/scalalang/typed.scala
index f46a4a7879788..60d7b7d0894d0 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/expressions/scalalang/typed.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/expressions/scalalang/typed.scala
@@ -38,7 +38,7 @@ object typed {
   // The reason we have separate files for Java and Scala is because in the Scala version, we can
   // use tighter types (primitive types) for return types, whereas in the Java version we can only
   // use boxed primitive types.
-  // For example, avg in the Scala veresion returns Scala primitive Double, whose bytecode
+  // For example, avg in the Scala version returns Scala primitive Double, whose bytecode
   // signature is just a java.lang.Object; avg in the Java version returns java.lang.Double.
 
   // TODO: This is pretty hacky. Maybe we should have an object for implicit encoders.

From 9b2a5aa9f88cbb35a4843298a7c295297b4ce378 Mon Sep 17 00:00:00 2001
From: Zheng RuiFeng
Date: Fri, 13 May 2016 11:08:54 +0800
Subject: [PATCH 4/4] rename matrix args in BreezeUtil to upper

---
 .../org/apache/spark/ml/ann/BreezeUtil.scala | 33 ++++++++++---------
 1 file changed, 17 insertions(+), 16 deletions(-)

diff --git a/mllib/src/main/scala/org/apache/spark/ml/ann/BreezeUtil.scala b/mllib/src/main/scala/org/apache/spark/ml/ann/BreezeUtil.scala
index 29e4c965e8821..6bbe7e1cb2134 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/ann/BreezeUtil.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/ann/BreezeUtil.scala
@@ -26,38 +26,39 @@ import com.github.fommil.netlib.BLAS.{getInstance => NativeBLAS}
 private[ann] object BreezeUtil {
 
   // TODO: switch to MLlib BLAS interface
-  private def transposeString(a: BDM[Double]): String = if (a.isTranspose) "T" else "N"
+  private def transposeString(A: BDM[Double]): String = if (A.isTranspose) "T" else "N"
 
   /**
    * DGEMM: C := alpha * A * B + beta * C
    * @param alpha alpha
-   * @param a A
-   * @param b B
+   * @param A A
+   * @param B B
    * @param beta beta
-   * @param c C
+   * @param C C
    */
-  def dgemm(alpha: Double, a: BDM[Double], b: BDM[Double], beta: Double, c: BDM[Double]): Unit = {
+  def dgemm(alpha: Double, A: BDM[Double], B: BDM[Double], beta: Double, C: BDM[Double]): Unit = {
     // TODO: add code if matrices isTranspose!!!
-    require(a.cols == b.rows, "A & B Dimension mismatch!")
-    require(a.rows == c.rows, "A & C Dimension mismatch!")
-    require(b.cols == c.cols, "A & C Dimension mismatch!")
-    NativeBLAS.dgemm(transposeString(a), transposeString(b), c.rows, c.cols, a.cols,
-      alpha, a.data, a.offset, a.majorStride, b.data, b.offset, b.majorStride,
-      beta, c.data, c.offset, c.rows)
+    require(A.cols == B.rows, "A & B Dimension mismatch!")
+    require(A.rows == C.rows, "A & C Dimension mismatch!")
+    require(B.cols == C.cols, "A & C Dimension mismatch!")
+    NativeBLAS.dgemm(transposeString(A), transposeString(B), C.rows, C.cols, A.cols,
+      alpha, A.data, A.offset, A.majorStride, B.data, B.offset, B.majorStride,
+      beta, C.data, C.offset, C.rows)
   }
 
   /**
    * DGEMV: y := alpha * A * x + beta * y
    * @param alpha alpha
-   * @param a A
+   * @param A A
    * @param x x
    * @param beta beta
    * @param y y
    */
-  def dgemv(alpha: Double, a: BDM[Double], x: BDV[Double], beta: Double, y: BDV[Double]): Unit = {
-    require(a.cols == x.length, "A & x Dimension mismatch!")
-    NativeBLAS.dgemv(transposeString(a), a.rows, a.cols,
-      alpha, a.data, a.offset, a.majorStride, x.data, x.offset, x.stride,
+  def dgemv(alpha: Double, A: BDM[Double], x: BDV[Double], beta: Double, y: BDV[Double]): Unit = {
+    require(A.cols == x.length, "A & x Dimension mismatch!")
+    require(A.rows == y.length, "A & y Dimension mismatch!")
+    NativeBLAS.dgemv(transposeString(A), A.rows, A.cols,
+      alpha, A.data, A.offset, A.majorStride, x.data, x.offset, x.stride,
       beta, y.data, y.offset, y.stride)
   }
 }
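
For reference, a small usage sketch of the renamed wrappers and the BLAS
conventions their docs state (BreezeUtil is private[ann], so this only compiles
from within org.apache.spark.ml.ann; the values are arbitrary):

    import breeze.linalg.{DenseMatrix => BDM, DenseVector => BDV}

    val A = new BDM(2, 3, Array(1.0, 2.0, 3.0, 4.0, 5.0, 6.0)) // 2x3, column-major
    val B = new BDM(3, 2, Array(1.0, 0.0, 0.0, 0.0, 1.0, 0.0)) // 3x2
    val C = BDM.zeros[Double](2, 2)

    // C := 1.0 * A * B + 0.0 * C, i.e. C becomes the plain product A * B
    BreezeUtil.dgemm(1.0, A, B, 0.0, C)

    val x = BDV(1.0, 1.0, 1.0)
    val y = BDV.zeros[Double](2)

    // y := 1.0 * A * x + 0.0 * y; the new require rejects a wrongly sized y
    BreezeUtil.dgemv(1.0, A, x, 0.0, y)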