2 changes: 1 addition & 1 deletion docs/ml-guide.md
@@ -257,7 +257,7 @@ Currently, `spark.ml` supports model selection using the [`CrossValidator`](api/

The `Evaluator` can be a [`RegressionEvaluator`](api/scala/index.html#org.apache.spark.ml.evaluation.RegressionEvaluator)
for regression problems, a [`BinaryClassificationEvaluator`](api/scala/index.html#org.apache.spark.ml.evaluation.BinaryClassificationEvaluator)
-for binary data, or a [`MultiClassClassificationEvaluator`](api/scala/index.html#org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator)
+for binary data, or a [`MulticlassClassificationEvaluator`](api/scala/index.html#org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator)
for multiclass problems. The default metric used to choose the best `ParamMap` can be overridden by the `setMetricName`
method in each of these evaluators.

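For readers of the corrected passage, a minimal sketch of the pattern it describes — overriding an evaluator's default metric inside model selection. The estimator, grid values, and metric name below are illustrative, not part of this change:

```scala
import org.apache.spark.ml.classification.LogisticRegression
import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator
import org.apache.spark.ml.tuning.{CrossValidator, ParamGridBuilder}

val lr = new LogisticRegression()

// MulticlassClassificationEvaluator defaults to "f1"; setMetricName
// overrides it, as the guide text states.
val evaluator = new MulticlassClassificationEvaluator()
  .setMetricName("weightedPrecision")

val paramGrid = new ParamGridBuilder()
  .addGrid(lr.regParam, Array(0.01, 0.1))
  .build()

val cv = new CrossValidator()
  .setEstimator(lr)
  .setEvaluator(evaluator)
  .setEstimatorParamMaps(paramGrid)
  .setNumFolds(3)
```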
33 changes: 17 additions & 16 deletions mllib/src/main/scala/org/apache/spark/ml/ann/BreezeUtil.scala
@@ -26,38 +26,39 @@ import com.github.fommil.netlib.BLAS.{getInstance => NativeBLAS}
private[ann] object BreezeUtil {

// TODO: switch to MLlib BLAS interface
-private def transposeString(a: BDM[Double]): String = if (a.isTranspose) "T" else "N"
+private def transposeString(A: BDM[Double]): String = if (A.isTranspose) "T" else "N"

/**
* DGEMM: C := alpha * A * B + beta * C
* @param alpha alpha
-* @param a A
-* @param b B
+* @param A A
+* @param B B
* @param beta beta
-* @param c C
+* @param C C
*/
-def dgemm(alpha: Double, a: BDM[Double], b: BDM[Double], beta: Double, c: BDM[Double]): Unit = {
+def dgemm(alpha: Double, A: BDM[Double], B: BDM[Double], beta: Double, C: BDM[Double]): Unit = {
// TODO: add code if matrices isTranspose!!!
-require(a.cols == b.rows, "A & B Dimension mismatch!")
-require(a.rows == c.rows, "A & C Dimension mismatch!")
-require(b.cols == c.cols, "A & C Dimension mismatch!")
-NativeBLAS.dgemm(transposeString(a), transposeString(b), c.rows, c.cols, a.cols,
-alpha, a.data, a.offset, a.majorStride, b.data, b.offset, b.majorStride,
-beta, c.data, c.offset, c.rows)
+require(A.cols == B.rows, "A & B Dimension mismatch!")
+require(A.rows == C.rows, "A & C Dimension mismatch!")
+require(B.cols == C.cols, "B & C Dimension mismatch!")
+NativeBLAS.dgemm(transposeString(A), transposeString(B), C.rows, C.cols, A.cols,
+alpha, A.data, A.offset, A.majorStride, B.data, B.offset, B.majorStride,
+beta, C.data, C.offset, C.rows)
}

/**
* DGEMV: y := alpha * A * x + beta * y
* @param alpha alpha
-* @param a A
+* @param A A
* @param x x
* @param beta beta
* @param y y
*/
-def dgemv(alpha: Double, a: BDM[Double], x: BDV[Double], beta: Double, y: BDV[Double]): Unit = {
-require(a.cols == x.length, "A & b Dimension mismatch!")
-NativeBLAS.dgemv(transposeString(a), a.rows, a.cols,
-alpha, a.data, a.offset, a.majorStride, x.data, x.offset, x.stride,
+def dgemv(alpha: Double, A: BDM[Double], x: BDV[Double], beta: Double, y: BDV[Double]): Unit = {
+require(A.cols == x.length, "A & x Dimension mismatch!")
+require(A.rows == y.length, "A & y Dimension mismatch!")
+NativeBLAS.dgemv(transposeString(A), A.rows, A.cols,
+alpha, A.data, A.offset, A.majorStride, x.data, x.offset, x.stride,
beta, y.data, y.offset, y.stride)
}
}
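Since the wrappers above delegate straight to netlib, it may help to see the same contracts written with plain Breeze operators. A hedged reference sketch — it shows the intended semantics only, not the actual BLAS call path, and allocates temporaries the real code avoids:

```scala
import breeze.linalg.{DenseMatrix => BDM, DenseVector => BDV}

// DGEMM semantics: C := alpha * A * B + beta * C (result written into C)
def dgemmReference(alpha: Double, A: BDM[Double], B: BDM[Double],
                   beta: Double, C: BDM[Double]): Unit = {
  C := (A * B) * alpha + C * beta
}

// DGEMV semantics: y := alpha * A * x + beta * y (result written into y)
def dgemvReference(alpha: Double, A: BDM[Double], x: BDV[Double],
                   beta: Double, y: BDV[Double]): Unit = {
  y := (A * x) * alpha + y * beta
}
```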
36 changes: 19 additions & 17 deletions mllib/src/main/scala/org/apache/spark/ml/ann/Layer.scala
@@ -64,8 +64,9 @@ private[ann] trait Layer extends Serializable {
* @return the layer model
*/
def createModel(initialWeights: BDV[Double]): LayerModel
+
/**
-* Returns the instance of the layer with random generated weights
+* Returns the instance of the layer with random generated weights.
[Review thread on this line]
Member: This could just become the @return text, which is otherwise unuseful.
Contributor Author: This method already has a @return text.
Member: Yes, I see, but it just says "returns the layer model", which is what the text above already says in more detail. I didn't see a point in repeating it.
*
* @param weights vector for weights initialization, must be equal to weightSize
* @param random random number generator
@@ -83,23 +84,23 @@ private[ann] trait LayerModel extends Serializable {

val weights: BDV[Double]
/**
-* Evaluates the data (process the data through the layer)
+* Evaluates the data (process the data through the layer).
* Output is allocated based on the size provided by the
-* LayerModel implementation and the stack (batch) size
+* LayerModel implementation and the stack (batch) size.
* Developer is responsible for checking the size of output
-* when writing to it
+* when writing to it.
*
* @param data data
* @param output output (modified in place)
*/
def eval(data: BDM[Double], output: BDM[Double]): Unit

/**
-* Computes the delta for back propagation
+* Computes the delta for back propagation.
* Delta is allocated based on the size provided by the
-* LayerModel implementation and the stack (batch) size
+* LayerModel implementation and the stack (batch) size.
* Developer is responsible for checking the size of
-* prevDelta when writing to it
+* prevDelta when writing to it.
*
* @param delta delta of this layer
* @param output output of this layer
@@ -108,10 +109,10 @@ private[ann] trait LayerModel extends Serializable {
def computePrevDelta(delta: BDM[Double], output: BDM[Double], prevDelta: BDM[Double]): Unit

/**
-* Computes the gradient
-* cumGrad is a wrapper on the part of the weight vector
-* size of cumGrad is based on weightSize provided by
-* implementation of LayerModel
+* Computes the gradient.
+* cumGrad is a wrapper on the part of the weight vector.
+* Size of cumGrad is based on weightSize provided by
+* implementation of LayerModel.
*
* @param delta delta for this layer
* @param input input data
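The javadocs in this trait describe an in-place contract: the caller allocates output, prevDelta, and cumGrad, and the model writes into them. As a hedged illustration (not Spark's code), an eval for a sigmoid layer honoring that contract might look like:

```scala
import breeze.linalg.{DenseMatrix => BDM}

// Writes sigmoid(data) into the caller-allocated `output`; columns are the
// stacked (batch) examples, matching the javadoc's allocation contract.
def sigmoidEval(data: BDM[Double], output: BDM[Double]): Unit = {
  require(output.rows == data.rows && output.cols == data.cols,
    "output must be pre-allocated to the input and batch size")
  for (j <- 0 until data.cols; i <- 0 until data.rows) {
    output(i, j) = 1.0 / (1.0 + math.exp(-data(i, j)))
  }
}
```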
@@ -197,11 +198,11 @@ private[ann] object AffineLayerModel {
}

/**
-* Initialize weights randomly in the interval
-* Uses [Bottou-88] heuristic [-a/sqrt(in); a/sqrt(in)]
-* where a is chosen in a such way that the weight variance corresponds
+* Initialize weights randomly in the interval.
+* Uses [Bottou-88] heuristic [-a/sqrt(in); a/sqrt(in)],
+* where `a` is chosen in such a way that the weight variance corresponds
* to the points to the maximal curvature of the activation function
-* (which is approximately 2.38 for a standard sigmoid)
+* (which is approximately 2.38 for a standard sigmoid).
*
* @param numIn number of inputs
* @param numOut number of outputs
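As a worked illustration of the heuristic just documented (a hypothetical helper, not this file's implementation): each weight is drawn uniformly from [-a/sqrt(numIn), a/sqrt(numIn)] with a ≈ 2.38:

```scala
import java.util.Random

// [Bottou-88]-style initialization: uniform in [-a/sqrt(in), a/sqrt(in)],
// with a ~= 2.38 matching the maximal-curvature point of a standard sigmoid.
def bottouInitSketch(numIn: Int, numOut: Int, random: Random): Array[Double] = {
  val a = 2.38
  val bound = a / math.sqrt(numIn)
  Array.fill(numIn * numOut)((random.nextDouble() * 2.0 - 1.0) * bound)
}
```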
@@ -306,7 +307,7 @@ private[ann] class FunctionalLayer (val activationFunction: ActivationFunction)
/**
* Functional layer model. Holds no weights.
*
-* @param layer functiona layer
+* @param layer functional layer
*/
private[ann] class FunctionalLayerModel private[ann] (val layer: FunctionalLayer)
extends LayerModel {
@@ -352,6 +353,7 @@ private[ann] trait TopologyModel extends Serializable {
* Array of layer models
*/
val layerModels: Array[LayerModel]
+
/**
* Forward propagation
*
@@ -410,7 +412,7 @@ private[ml] object FeedForwardTopology {
* Creates a multi-layer perceptron
*
* @param layerSizes sizes of layers including input and output size
-* @param softmaxOnTop wether to use SoftMax or Sigmoid function for an output layer.
+* @param softmaxOnTop whether to use SoftMax or Sigmoid function for an output layer.
* Softmax is default
* @return multilayer perceptron topology
*/
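For context on the softmaxOnTop flag documented above, a hedged usage sketch (FeedForwardTopology.multiLayerPerceptron is private[ml], so this only compiles inside Spark's ml package; the layer sizes are illustrative):

```scala
// A 784-64-10 perceptron. softmaxOnTop = true selects SoftMax rather than
// Sigmoid for the 10-unit output layer, e.g. for one-hot encoded labels.
val topology = FeedForwardTopology.multiLayerPerceptron(
  layerSizes = Array(784, 64, 10),
  softmaxOnTop = true)
```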
@@ -39,7 +39,7 @@ import org.apache.spark.sql.catalyst.plans.logical.{LocalRelation, LogicalPlan,
import org.apache.spark.sql.execution._
import org.apache.spark.sql.execution.datasources.LogicalRelation
import org.apache.spark.sql.execution.ui.SQLListener
-import org.apache.spark.sql.internal.{CatalogImpl, SessionState, SharedState, SQLConf}
+import org.apache.spark.sql.internal.{CatalogImpl, SessionState, SharedState}
import org.apache.spark.sql.sources.BaseRelation
import org.apache.spark.sql.types.{DataType, LongType, StructType}
import org.apache.spark.sql.util.ExecutionListenerManager
@@ -26,9 +26,7 @@ import org.apache.spark.api.r.SerDe
import org.apache.spark.broadcast.Broadcast
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{DataFrame, Row, SaveMode, SQLContext}
-import org.apache.spark.sql.catalyst.encoders.RowEncoder
import org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema
-import org.apache.spark.sql.Encoder
import org.apache.spark.sql.types._

private[sql] object SQLUtils {
@@ -75,19 +73,19 @@ private[sql] object SQLUtils {
org.apache.spark.sql.types.MapType(getSQLDataType(keyType), getSQLDataType(valueType))
case r"\Astruct<(.+)${fieldsStr}>\Z" =>
if (fieldsStr(fieldsStr.length - 1) == ',') {
-throw new IllegalArgumentException(s"Invaid type $dataType")
+throw new IllegalArgumentException(s"Invalid type $dataType")
}
val fields = fieldsStr.split(",")
val structFields = fields.map { field =>
field match {
case r"\A(.+)${fieldName}:(.+)${fieldType}\Z" =>
createStructField(fieldName, fieldType, true)

-case _ => throw new IllegalArgumentException(s"Invaid type $dataType")
+case _ => throw new IllegalArgumentException(s"Invalid type $dataType")
}
}
createStructType(structFields)
-case _ => throw new IllegalArgumentException(s"Invaid type $dataType")
+case _ => throw new IllegalArgumentException(s"Invalid type $dataType")
}
}

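To make the corrected error message concrete, here is the kind of input this parser accepts and rejects. A hedged sketch: getSQLDataType lives in the private[sql] SQLUtils object, so these calls only compile inside Spark, and the exact set of accepted primitive type names depends on the Spark version:

```scala
// Well-formed SparkR struct type string: parses into a StructType.
SQLUtils.getSQLDataType("struct<name:string,age:integer>")

// Trailing comma after the last field: rejected with
//   java.lang.IllegalArgumentException: Invalid type struct<name:string,age:integer,>
SQLUtils.getSQLDataType("struct<name:string,age:integer,>")
```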
@@ -38,7 +38,7 @@ object typed {
// The reason we have separate files for Java and Scala is because in the Scala version, we can
// use tighter types (primitive types) for return types, whereas in the Java version we can only
// use boxed primitive types.
-// For example, avg in the Scala veresion returns Scala primitive Double, whose bytecode
+// For example, avg in the Scala version returns Scala primitive Double, whose bytecode
// signature is just a java.lang.Object; avg in the Java version returns java.lang.Double.

// TODO: This is pretty hacky. Maybe we should have an object for implicit encoders.
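A short sketch of the boxing difference this comment describes, assuming a Dataset[Rec] named ds with spark.implicits._ in scope, and the scalalang/javalang package layout of later Spark releases:

```scala
import org.apache.spark.sql.expressions.scalalang.typed

case class Rec(key: String, value: Double)

// Scala side: typed.avg takes Rec => Double (primitive), so the aggregation
// works on unboxed doubles even though the bytecode signature erases to Object.
val avgByKey = ds.groupByKey(_.key).agg(typed.avg(_.value))

// The Java counterpart, org.apache.spark.sql.expressions.javalang.typed,
// returns java.lang.Double instead, since Java generics cannot hold primitives.
```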