From 602c27f60a92524ff522ba5e40142e8a9159bacc Mon Sep 17 00:00:00 2001 From: wangmiao1981 Date: Fri, 26 May 2017 23:27:04 -0700 Subject: [PATCH 01/12] add constraint logit --- R/pkg/R/mllib_classification.R | 33 +++++++++++++++++-- .../ml/r/LogisticRegressionWrapper.scala | 32 ++++++++++++++++-- 2 files changed, 61 insertions(+), 4 deletions(-) diff --git a/R/pkg/R/mllib_classification.R b/R/pkg/R/mllib_classification.R index 306a9b8676539..d3e9cc93ed09c 100644 --- a/R/pkg/R/mllib_classification.R +++ b/R/pkg/R/mllib_classification.R @@ -239,8 +239,14 @@ function(object, path, overwrite = FALSE) { setMethod("spark.logit", signature(data = "SparkDataFrame", formula = "formula"), function(data, formula, regParam = 0.0, elasticNetParam = 0.0, maxIter = 100, tol = 1E-6, family = "auto", standardization = TRUE, - thresholds = 0.5, weightCol = NULL, aggregationDepth = 2) { + thresholds = 0.5, weightCol = NULL, aggregationDepth = 2, + lowerBoundsOnCoefficients = NULL, upperBoundsOnCoefficients = NULL, + lowerBoundsOnIntercepts = NULL, upperBoundsOnIntercepts = NULL) { formula <- paste(deparse(formula), collapse = "") + lrow <- 0 + lcol <- 0 + urow <- 0 + ucol <- 0 if (!is.null(weightCol) && weightCol == "") { weightCol <- NULL @@ -248,12 +254,35 @@ setMethod("spark.logit", signature(data = "SparkDataFrame", formula = "formula") weightCol <- as.character(weightCol) } + if (!is.null(lowerBoundsOnIntercepts)) { + lowerBoundsOnIntercepts <- as.array(lowerBoundsOnIntercepts) + } + + if (!is.null(upperBoundsOnIntercepts)) { + upperBoundsOnIntercepts <- as.array(upperBoundsOnIntercepts) + } + + if (!is.null(lowerBoundsOnCoefficients)) { + lrow = nrow(lowerBoundsOnCoefficients) + lcol = ncol(lowerBoundsOnCoefficients) + lowerBoundsOnCoefficients <- as.array(as.vector(lowerBoundsOnCoefficients)) + } + + if (!is.null(upperBoundsOnIntercepts)) { + urow = nrow(upperBoundsOnIntercepts) + ucol = ncol(upperBoundsOnIntercepts) + upperBoundsOnIntercepts <- as.array(as.vector(upperBoundsOnIntercepts)) + } + jobj <- callJStatic("org.apache.spark.ml.r.LogisticRegressionWrapper", "fit", data@sdf, formula, as.numeric(regParam), as.numeric(elasticNetParam), as.integer(maxIter), as.numeric(tol), as.character(family), as.logical(standardization), as.array(thresholds), - weightCol, as.integer(aggregationDepth)) + weightCol, as.integer(aggregationDepth), + as.integer(lrow), as.integer(lcol), as.integer(urow), as.integer(ucol), + lowerBoundsOnCoefficients, upperBoundsOnCoefficients, + lowerBoundsOnIntercepts, upperBoundsOnIntercepts) new("LogisticRegressionModel", jobj = jobj) }) diff --git a/mllib/src/main/scala/org/apache/spark/ml/r/LogisticRegressionWrapper.scala b/mllib/src/main/scala/org/apache/spark/ml/r/LogisticRegressionWrapper.scala index 703bcdf4ca725..c4422d2b7f09c 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/r/LogisticRegressionWrapper.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/r/LogisticRegressionWrapper.scala @@ -25,7 +25,7 @@ import org.json4s.jackson.JsonMethods._ import org.apache.spark.ml.{Pipeline, PipelineModel} import org.apache.spark.ml.classification.{LogisticRegression, LogisticRegressionModel} import org.apache.spark.ml.feature.{IndexToString, RFormula} -import org.apache.spark.ml.linalg.Vector +import org.apache.spark.ml.linalg.{Matrices, Vector, Vectors} import org.apache.spark.ml.r.RWrapperUtils._ import org.apache.spark.ml.util._ import org.apache.spark.sql.{DataFrame, Dataset} @@ -97,7 +97,15 @@ private[r] object LogisticRegressionWrapper standardization: Boolean, thresholds: Array[Double], weightCol: String, - aggregationDepth: Int + aggregationDepth: Int, + lrow: Int, + lcol: Int, + urow: Int, + ucol: Int, + lowerBoundsOnCoefficients: Array[Double], + upperBoundsOnCoefficients: Array[Double], + lowerBoundsOnIntercepts: Array[Double], + upperBoundsOnIntercepts: Array[Double] ): LogisticRegressionWrapper = { val rFormula = new RFormula() @@ -133,6 +141,26 @@ private[r] object LogisticRegressionWrapper if (weightCol != null) lr.setWeightCol(weightCol) + if (lrow != 0 && lcol != 0 && lowerBoundsOnCoefficients != null) { + val coef = Matrices.dense(lrow, lcol, lowerBoundsOnCoefficients) + lr.setLowerBoundsOnCoefficients(coef) + } + + if (urow != 0 && ucol != 0 && upperBoundsOnCoefficients != null) { + val coef = Matrices.dense(urow, ucol, upperBoundsOnCoefficients) + lr.setUpperBoundsOnCoefficients(coef) + } + + if (lowerBoundsOnIntercepts != null) { + val intercept = Vectors.dense(lowerBoundsOnIntercepts) + lr.setLowerBoundsOnIntercepts(intercept) + } + + if (upperBoundsOnIntercepts != null) { + val intercept = Vectors.dense(upperBoundsOnIntercepts) + lr.setUpperBoundsOnIntercepts(intercept) + } + val idxToStr = new IndexToString() .setInputCol(PREDICTED_LABEL_INDEX_COL) .setOutputCol(PREDICTED_LABEL_COL) From 2dd0b7d101afe2074f34161c14ca232c3c748417 Mon Sep 17 00:00:00 2001 From: wangmiao1981 Date: Sat, 27 May 2017 00:29:25 -0700 Subject: [PATCH 02/12] add unit test and doc --- R/pkg/R/mllib_classification.R | 22 +++++++++++++++---- .../fulltests/test_mllib_classification.R | 20 +++++++++++++++++ 2 files changed, 38 insertions(+), 4 deletions(-) diff --git a/R/pkg/R/mllib_classification.R b/R/pkg/R/mllib_classification.R index d3e9cc93ed09c..75683cf053328 100644 --- a/R/pkg/R/mllib_classification.R +++ b/R/pkg/R/mllib_classification.R @@ -202,6 +202,20 @@ function(object, path, overwrite = FALSE) { #' @param aggregationDepth The depth for treeAggregate (greater than or equal to 2). If the dimensions of features #' or the number of partitions are large, this param could be adjusted to a larger size. #' This is an expert parameter. Default value should be good for most cases. +#' @param lowerBoundsOnCoefficients The lower bounds on coefficients if fitting under bound constrained optimization. +#' The bound matrix must be compatible with the shape (1, number of features) for binomial +#' regression, or (number of classes, number of features) for multinomial regression. +#' It is a R matrix. +#' @param upperBoundsOnCoefficients The upper bounds on coefficients if fitting under bound constrained optimization. +#' The bound matrix must be compatible with the shape (1, number of features) for binomial +#' regression, or (number of classes, number of features) for multinomial regression. +#' It is a R matrix. +#' @param lowerBoundsOnIntercepts The lower bounds on intercepts if fitting under bound constrained optimization. +#' The bounds vector size must be equal with 1 for binomial regression, or the number +#' of classes for multinomial regression. +#' @param upperBoundsOnIntercepts The upper bounds on intercepts if fitting under bound constrained optimization. +#' The bound vector size must be equal with 1 for binomial regression, or the number +#' of classes for multinomial regression. #' @param ... additional arguments passed to the method. #' @return \code{spark.logit} returns a fitted logistic regression model. #' @rdname spark.logit @@ -268,10 +282,10 @@ setMethod("spark.logit", signature(data = "SparkDataFrame", formula = "formula") lowerBoundsOnCoefficients <- as.array(as.vector(lowerBoundsOnCoefficients)) } - if (!is.null(upperBoundsOnIntercepts)) { - urow = nrow(upperBoundsOnIntercepts) - ucol = ncol(upperBoundsOnIntercepts) - upperBoundsOnIntercepts <- as.array(as.vector(upperBoundsOnIntercepts)) + if (!is.null(upperBoundsOnCoefficients)) { + urow = nrow(upperBoundsOnCoefficients) + ucol = ncol(upperBoundsOnCoefficients) + upperBoundsOnCoefficients <- as.array(as.vector(upperBoundsOnCoefficients)) } jobj <- callJStatic("org.apache.spark.ml.r.LogisticRegressionWrapper", "fit", diff --git a/R/pkg/tests/fulltests/test_mllib_classification.R b/R/pkg/tests/fulltests/test_mllib_classification.R index 726e9d9a20b1c..4e1a9cb0fd820 100644 --- a/R/pkg/tests/fulltests/test_mllib_classification.R +++ b/R/pkg/tests/fulltests/test_mllib_classification.R @@ -200,6 +200,26 @@ test_that("spark.logit", { coefs <- summary$coefficients[, "Estimate"] expect_true(all(abs(coefsR - coefs) < 0.1)) + # Test binomial logistic regression againt two classes with upperBoundsOnCoefficients + # and upperBoundsOnIntercepts + u <- matrix(c(1.0, 0.0, 1.0, 0.0), nrow = 1, ncol = 4) + model <- spark.logit(training, Species ~ ., upperBoundsOnCoefficients = u, + upperBoundsOnIntercepts = 1.0) + summary <- summary(model) + coefsR <- c(-11.13331, 1.00000, 0.00000, 1.00000, 0.00000) + coefs <- summary$coefficients[, "Estimate"] + expect_true(all(abs(coefsR - coefs) < 0.1)) + + # Test binomial logistic regression againt two classes with lowerBoundsOnCoefficients + # and lowerBoundsOnIntercepts + l <- matrix(c(0.0, -1.0, 0.0, -1.0), nrow = 1, ncol = 4) + model <- spark.logit(training, Species ~ ., lowerBoundsOnCoefficients = l, + lowerBoundsOnIntercepts = 0.0) + summary <- summary(model) + coefsR <- c(0, 0, -1, 0, 1.902192) + coefs <- summary$coefficients[, "Estimate"] + expect_true(all(abs(coefsR - coefs) < 0.1)) + # Test prediction with string label prediction <- predict(model, training) expect_equal(typeof(take(select(prediction, "prediction"), 1)$prediction), "character") From 5b272fd3ea0f4636f0ebca999c5858b99726845c Mon Sep 17 00:00:00 2001 From: wangmiao1981 Date: Sat, 27 May 2017 00:39:32 -0700 Subject: [PATCH 03/12] fix <- --- R/pkg/R/mllib_classification.R | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/R/pkg/R/mllib_classification.R b/R/pkg/R/mllib_classification.R index 75683cf053328..7c0d2c1d4b8ca 100644 --- a/R/pkg/R/mllib_classification.R +++ b/R/pkg/R/mllib_classification.R @@ -277,14 +277,14 @@ setMethod("spark.logit", signature(data = "SparkDataFrame", formula = "formula") } if (!is.null(lowerBoundsOnCoefficients)) { - lrow = nrow(lowerBoundsOnCoefficients) - lcol = ncol(lowerBoundsOnCoefficients) + lrow <- nrow(lowerBoundsOnCoefficients) + lcol <- ncol(lowerBoundsOnCoefficients) lowerBoundsOnCoefficients <- as.array(as.vector(lowerBoundsOnCoefficients)) } if (!is.null(upperBoundsOnCoefficients)) { - urow = nrow(upperBoundsOnCoefficients) - ucol = ncol(upperBoundsOnCoefficients) + urow <- nrow(upperBoundsOnCoefficients) + ucol <- ncol(upperBoundsOnCoefficients) upperBoundsOnCoefficients <- as.array(as.vector(upperBoundsOnCoefficients)) } From eb65a1d96c91ea7798014a5ccdf790c6fbe406b7 Mon Sep 17 00:00:00 2001 From: wangmiao1981 Date: Sat, 27 May 2017 00:46:04 -0700 Subject: [PATCH 04/12] fix style --- R/pkg/R/mllib_classification.R | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/R/pkg/R/mllib_classification.R b/R/pkg/R/mllib_classification.R index 7c0d2c1d4b8ca..56a15e5a981b7 100644 --- a/R/pkg/R/mllib_classification.R +++ b/R/pkg/R/mllib_classification.R @@ -294,7 +294,8 @@ setMethod("spark.logit", signature(data = "SparkDataFrame", formula = "formula") as.numeric(tol), as.character(family), as.logical(standardization), as.array(thresholds), weightCol, as.integer(aggregationDepth), - as.integer(lrow), as.integer(lcol), as.integer(urow), as.integer(ucol), + as.integer(lrow), as.integer(lcol), + as.integer(urow), as.integer(ucol), lowerBoundsOnCoefficients, upperBoundsOnCoefficients, lowerBoundsOnIntercepts, upperBoundsOnIntercepts) new("LogisticRegressionModel", jobj = jobj) From 62a436bba8523e0693c75841e90dcb1f6625f4e2 Mon Sep 17 00:00:00 2001 From: wangmiao1981 Date: Sat, 27 May 2017 09:18:01 -0700 Subject: [PATCH 05/12] change test order --- .../fulltests/test_mllib_classification.R | 40 +++++++++---------- 1 file changed, 20 insertions(+), 20 deletions(-) diff --git a/R/pkg/tests/fulltests/test_mllib_classification.R b/R/pkg/tests/fulltests/test_mllib_classification.R index 4e1a9cb0fd820..60b9a3d21508a 100644 --- a/R/pkg/tests/fulltests/test_mllib_classification.R +++ b/R/pkg/tests/fulltests/test_mllib_classification.R @@ -200,26 +200,6 @@ test_that("spark.logit", { coefs <- summary$coefficients[, "Estimate"] expect_true(all(abs(coefsR - coefs) < 0.1)) - # Test binomial logistic regression againt two classes with upperBoundsOnCoefficients - # and upperBoundsOnIntercepts - u <- matrix(c(1.0, 0.0, 1.0, 0.0), nrow = 1, ncol = 4) - model <- spark.logit(training, Species ~ ., upperBoundsOnCoefficients = u, - upperBoundsOnIntercepts = 1.0) - summary <- summary(model) - coefsR <- c(-11.13331, 1.00000, 0.00000, 1.00000, 0.00000) - coefs <- summary$coefficients[, "Estimate"] - expect_true(all(abs(coefsR - coefs) < 0.1)) - - # Test binomial logistic regression againt two classes with lowerBoundsOnCoefficients - # and lowerBoundsOnIntercepts - l <- matrix(c(0.0, -1.0, 0.0, -1.0), nrow = 1, ncol = 4) - model <- spark.logit(training, Species ~ ., lowerBoundsOnCoefficients = l, - lowerBoundsOnIntercepts = 0.0) - summary <- summary(model) - coefsR <- c(0, 0, -1, 0, 1.902192) - coefs <- summary$coefficients[, "Estimate"] - expect_true(all(abs(coefsR - coefs) < 0.1)) - # Test prediction with string label prediction <- predict(model, training) expect_equal(typeof(take(select(prediction, "prediction"), 1)$prediction), "character") @@ -243,6 +223,26 @@ test_that("spark.logit", { model2 <- spark.logit(df2, label ~ feature, weightCol = "weight") prediction2 <- collect(select(predict(model2, df2), "prediction")) expect_equal(sort(prediction2$prediction), c("0.0", "0.0", "0.0", "0.0", "0.0")) + + # Test binomial logistic regression againt two classes with upperBoundsOnCoefficients + # and upperBoundsOnIntercepts + u <- matrix(c(1.0, 0.0, 1.0, 0.0), nrow = 1, ncol = 4) + model <- spark.logit(training, Species ~ ., upperBoundsOnCoefficients = u, + upperBoundsOnIntercepts = 1.0) + summary <- summary(model) + coefsR <- c(-11.13331, 1.00000, 0.00000, 1.00000, 0.00000) + coefs <- summary$coefficients[, "Estimate"] + expect_true(all(abs(coefsR - coefs) < 0.1)) + + # Test binomial logistic regression againt two classes with lowerBoundsOnCoefficients + # and lowerBoundsOnIntercepts + l <- matrix(c(0.0, -1.0, 0.0, -1.0), nrow = 1, ncol = 4) + model <- spark.logit(training, Species ~ ., lowerBoundsOnCoefficients = l, + lowerBoundsOnIntercepts = 0.0) + summary <- summary(model) + coefsR <- c(0, 0, -1, 0, 1.902192) + coefs <- summary$coefficients[, "Estimate"] + expect_true(all(abs(coefsR - coefs) < 0.1)) }) test_that("spark.mlp", { From 43640f43474d37c05d6e74cf5b602c6c71c0bc3d Mon Sep 17 00:00:00 2001 From: wangmiao1981 Date: Sun, 28 May 2017 22:57:16 -0700 Subject: [PATCH 06/12] add check and test for matrix type --- R/pkg/R/mllib_classification.R | 6 ++++++ R/pkg/tests/fulltests/test_mllib_classification.R | 8 +++++++- 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/R/pkg/R/mllib_classification.R b/R/pkg/R/mllib_classification.R index 56a15e5a981b7..e91dc8db3b175 100644 --- a/R/pkg/R/mllib_classification.R +++ b/R/pkg/R/mllib_classification.R @@ -277,12 +277,18 @@ setMethod("spark.logit", signature(data = "SparkDataFrame", formula = "formula") } if (!is.null(lowerBoundsOnCoefficients)) { + if (class(lowerBoundsOnCoefficients) != "matrix") { + stop("lowerBoundsOnCoefficients must be a matrix.") + } lrow <- nrow(lowerBoundsOnCoefficients) lcol <- ncol(lowerBoundsOnCoefficients) lowerBoundsOnCoefficients <- as.array(as.vector(lowerBoundsOnCoefficients)) } if (!is.null(upperBoundsOnCoefficients)) { + if (class(upperBoundsOnCoefficients) != "matrix") { + stop("upperBoundsOnCoefficients must be a matrix.") + } urow <- nrow(upperBoundsOnCoefficients) ucol <- ncol(upperBoundsOnCoefficients) upperBoundsOnCoefficients <- as.array(as.vector(upperBoundsOnCoefficients)) diff --git a/R/pkg/tests/fulltests/test_mllib_classification.R b/R/pkg/tests/fulltests/test_mllib_classification.R index 60b9a3d21508a..0af4a8353ab3f 100644 --- a/R/pkg/tests/fulltests/test_mllib_classification.R +++ b/R/pkg/tests/fulltests/test_mllib_classification.R @@ -233,16 +233,22 @@ test_that("spark.logit", { coefsR <- c(-11.13331, 1.00000, 0.00000, 1.00000, 0.00000) coefs <- summary$coefficients[, "Estimate"] expect_true(all(abs(coefsR - coefs) < 0.1)) + # Test upperBoundsOnCoefficients should be matrix + expect_error(spark.logit(training, Species ~ ., upperBoundsOnCoefficients = as.array(c(1, 2)), + upperBoundsOnIntercepts = 1.0)) # Test binomial logistic regression againt two classes with lowerBoundsOnCoefficients # and lowerBoundsOnIntercepts l <- matrix(c(0.0, -1.0, 0.0, -1.0), nrow = 1, ncol = 4) model <- spark.logit(training, Species ~ ., lowerBoundsOnCoefficients = l, - lowerBoundsOnIntercepts = 0.0) + lowerBoundsOnIntercepts = 0.0) summary <- summary(model) coefsR <- c(0, 0, -1, 0, 1.902192) coefs <- summary$coefficients[, "Estimate"] expect_true(all(abs(coefsR - coefs) < 0.1)) + # Test lowerBoundsOnCoefficients should be matrix + expect_error(spark.logit(training, Species ~ ., lowerBoundsOnCoefficients = as.array(c(1, 2)), + lowerBoundsOnIntercepts = 0.0)) }) test_that("spark.mlp", { From 9ab89c76b6994eaa86e19aa466f28dc6c110c6a2 Mon Sep 17 00:00:00 2001 From: wangmiao1981 Date: Sun, 28 May 2017 23:00:47 -0700 Subject: [PATCH 07/12] extra spaces --- R/pkg/tests/fulltests/test_mllib_classification.R | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/R/pkg/tests/fulltests/test_mllib_classification.R b/R/pkg/tests/fulltests/test_mllib_classification.R index 0af4a8353ab3f..e148392e69642 100644 --- a/R/pkg/tests/fulltests/test_mllib_classification.R +++ b/R/pkg/tests/fulltests/test_mllib_classification.R @@ -228,14 +228,14 @@ test_that("spark.logit", { # and upperBoundsOnIntercepts u <- matrix(c(1.0, 0.0, 1.0, 0.0), nrow = 1, ncol = 4) model <- spark.logit(training, Species ~ ., upperBoundsOnCoefficients = u, - upperBoundsOnIntercepts = 1.0) + upperBoundsOnIntercepts = 1.0) summary <- summary(model) coefsR <- c(-11.13331, 1.00000, 0.00000, 1.00000, 0.00000) coefs <- summary$coefficients[, "Estimate"] expect_true(all(abs(coefsR - coefs) < 0.1)) # Test upperBoundsOnCoefficients should be matrix expect_error(spark.logit(training, Species ~ ., upperBoundsOnCoefficients = as.array(c(1, 2)), - upperBoundsOnIntercepts = 1.0)) + upperBoundsOnIntercepts = 1.0)) # Test binomial logistic regression againt two classes with lowerBoundsOnCoefficients # and lowerBoundsOnIntercepts @@ -248,7 +248,7 @@ test_that("spark.logit", { expect_true(all(abs(coefsR - coefs) < 0.1)) # Test lowerBoundsOnCoefficients should be matrix expect_error(spark.logit(training, Species ~ ., lowerBoundsOnCoefficients = as.array(c(1, 2)), - lowerBoundsOnIntercepts = 0.0)) + lowerBoundsOnIntercepts = 0.0)) }) test_that("spark.mlp", { From b8d683cb3308a85c198293d9e390ca908b06be63 Mon Sep 17 00:00:00 2001 From: wangmiao1981 Date: Fri, 2 Jun 2017 16:13:58 -0700 Subject: [PATCH 08/12] address review comments --- R/pkg/R/mllib_classification.R | 26 ++++++++++++------- .../ml/r/LogisticRegressionWrapper.scala | 18 +++++++------ 2 files changed, 26 insertions(+), 18 deletions(-) diff --git a/R/pkg/R/mllib_classification.R b/R/pkg/R/mllib_classification.R index e91dc8db3b175..d9dec59fabb62 100644 --- a/R/pkg/R/mllib_classification.R +++ b/R/pkg/R/mllib_classification.R @@ -257,10 +257,8 @@ setMethod("spark.logit", signature(data = "SparkDataFrame", formula = "formula") lowerBoundsOnCoefficients = NULL, upperBoundsOnCoefficients = NULL, lowerBoundsOnIntercepts = NULL, upperBoundsOnIntercepts = NULL) { formula <- paste(deparse(formula), collapse = "") - lrow <- 0 - lcol <- 0 - urow <- 0 - ucol <- 0 + row <- 0 + col <- 0 if (!is.null(weightCol) && weightCol == "") { weightCol <- NULL @@ -280,8 +278,8 @@ setMethod("spark.logit", signature(data = "SparkDataFrame", formula = "formula") if (class(lowerBoundsOnCoefficients) != "matrix") { stop("lowerBoundsOnCoefficients must be a matrix.") } - lrow <- nrow(lowerBoundsOnCoefficients) - lcol <- ncol(lowerBoundsOnCoefficients) + row <- nrow(lowerBoundsOnCoefficients) + col <- ncol(lowerBoundsOnCoefficients) lowerBoundsOnCoefficients <- as.array(as.vector(lowerBoundsOnCoefficients)) } @@ -289,8 +287,17 @@ setMethod("spark.logit", signature(data = "SparkDataFrame", formula = "formula") if (class(upperBoundsOnCoefficients) != "matrix") { stop("upperBoundsOnCoefficients must be a matrix.") } - urow <- nrow(upperBoundsOnCoefficients) - ucol <- ncol(upperBoundsOnCoefficients) + + if(!is.null(lowerBoundsOnCoefficients) & (row != nrow(upperBoundsOnCoefficients) + | col != ncol(upperBoundsOnCoefficients))) { + stop("dimension of upperBoundsOnCoefficients is not the same as lowerBoundsOnCoefficients") + } + + if (is.null(lowerBoundsOnCoefficients)) { + row <- nrow(upperBoundsOnCoefficients) + col <- ncol(upperBoundsOnCoefficients) + } + upperBoundsOnCoefficients <- as.array(as.vector(upperBoundsOnCoefficients)) } @@ -300,8 +307,7 @@ setMethod("spark.logit", signature(data = "SparkDataFrame", formula = "formula") as.numeric(tol), as.character(family), as.logical(standardization), as.array(thresholds), weightCol, as.integer(aggregationDepth), - as.integer(lrow), as.integer(lcol), - as.integer(urow), as.integer(ucol), + as.integer(row), as.integer(col), lowerBoundsOnCoefficients, upperBoundsOnCoefficients, lowerBoundsOnIntercepts, upperBoundsOnIntercepts) new("LogisticRegressionModel", jobj = jobj) diff --git a/mllib/src/main/scala/org/apache/spark/ml/r/LogisticRegressionWrapper.scala b/mllib/src/main/scala/org/apache/spark/ml/r/LogisticRegressionWrapper.scala index c4422d2b7f09c..b96481acf46d7 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/r/LogisticRegressionWrapper.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/r/LogisticRegressionWrapper.scala @@ -98,10 +98,8 @@ private[r] object LogisticRegressionWrapper thresholds: Array[Double], weightCol: String, aggregationDepth: Int, - lrow: Int, - lcol: Int, - urow: Int, - ucol: Int, + numRowsOfBoundsOnCoefficients: Int, + numColsOfBoundsOnCoefficients: Int, lowerBoundsOnCoefficients: Array[Double], upperBoundsOnCoefficients: Array[Double], lowerBoundsOnIntercepts: Array[Double], @@ -141,13 +139,17 @@ private[r] object LogisticRegressionWrapper if (weightCol != null) lr.setWeightCol(weightCol) - if (lrow != 0 && lcol != 0 && lowerBoundsOnCoefficients != null) { - val coef = Matrices.dense(lrow, lcol, lowerBoundsOnCoefficients) + if (numRowsOfBoundsOnCoefficients != 0 && + numColsOfBoundsOnCoefficients != 0 && lowerBoundsOnCoefficients != null) { + val coef = Matrices.dense(numRowsOfBoundsOnCoefficients, + numColsOfBoundsOnCoefficients, lowerBoundsOnCoefficients) lr.setLowerBoundsOnCoefficients(coef) } - if (urow != 0 && ucol != 0 && upperBoundsOnCoefficients != null) { - val coef = Matrices.dense(urow, ucol, upperBoundsOnCoefficients) + if (numRowsOfBoundsOnCoefficients != 0 && + numColsOfBoundsOnCoefficients != 0 && upperBoundsOnCoefficients != null) { + val coef = Matrices.dense(numRowsOfBoundsOnCoefficients, + numColsOfBoundsOnCoefficients, upperBoundsOnCoefficients) lr.setUpperBoundsOnCoefficients(coef) } From 98709ac23828c4c801c029da4de1406f973cf7ae Mon Sep 17 00:00:00 2001 From: wangmiao1981 Date: Fri, 2 Jun 2017 16:43:16 -0700 Subject: [PATCH 09/12] add unit test and fix R style issue --- R/pkg/R/mllib_classification.R | 5 +++-- R/pkg/tests/fulltests/test_mllib_classification.R | 14 ++++++++++++++ 2 files changed, 17 insertions(+), 2 deletions(-) diff --git a/R/pkg/R/mllib_classification.R b/R/pkg/R/mllib_classification.R index d9dec59fabb62..a66fed8f4c2d7 100644 --- a/R/pkg/R/mllib_classification.R +++ b/R/pkg/R/mllib_classification.R @@ -288,9 +288,10 @@ setMethod("spark.logit", signature(data = "SparkDataFrame", formula = "formula") stop("upperBoundsOnCoefficients must be a matrix.") } - if(!is.null(lowerBoundsOnCoefficients) & (row != nrow(upperBoundsOnCoefficients) + if (!is.null(lowerBoundsOnCoefficients) & (row != nrow(upperBoundsOnCoefficients) | col != ncol(upperBoundsOnCoefficients))) { - stop("dimension of upperBoundsOnCoefficients is not the same as lowerBoundsOnCoefficients") + stop(paste("dimension of upperBoundsOnCoefficients ", + "is not the same as lowerBoundsOnCoefficients", sep = "")) } if (is.null(lowerBoundsOnCoefficients)) { diff --git a/R/pkg/tests/fulltests/test_mllib_classification.R b/R/pkg/tests/fulltests/test_mllib_classification.R index e148392e69642..3d75f4ce11ec8 100644 --- a/R/pkg/tests/fulltests/test_mllib_classification.R +++ b/R/pkg/tests/fulltests/test_mllib_classification.R @@ -249,6 +249,20 @@ test_that("spark.logit", { # Test lowerBoundsOnCoefficients should be matrix expect_error(spark.logit(training, Species ~ ., lowerBoundsOnCoefficients = as.array(c(1, 2)), lowerBoundsOnIntercepts = 0.0)) + + # Test multinomial logistic regression with lowerBoundsOnCoefficients + # and lowerBoundsOnIntercepts + l <- matrix(c(0.0, -1.0, 0.0, -1.0, 0.0, -1.0, 0.0, -1.0), nrow = 2, ncol = 4) + model <- spark.logit(training, Species ~ ., family = "multinomial", + lowerBoundsOnCoefficients = l, + lowerBoundsOnIntercepts = as.array(c(0.0, 0.0))) + summary <- summary(model) + versicolorCoefsR <- c(42.639465, 7.258104, 14.330814, 16.298243, 11.716429) + virginicaCoefsR <- c(0.0002970796, 4.79274, 7.65047, 25.72793, 30.0021) + versicolorCoefs <- summary$coefficients[, "versicolor"] + virginicaCoefs <- summary$coefficients[, "virginica"] + expect_true(all(abs(versicolorCoefsR - versicolorCoefs) < 0.1)) + expect_true(all(abs(virginicaCoefsR - virginicaCoefs) < 0.1)) }) test_that("spark.mlp", { From 8c88a4fd0bb1f988610e74696a1b57806f408b2c Mon Sep 17 00:00:00 2001 From: wangmiao1981 Date: Fri, 2 Jun 2017 21:24:17 -0700 Subject: [PATCH 10/12] address review comments --- R/pkg/R/mllib_classification.R | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/R/pkg/R/mllib_classification.R b/R/pkg/R/mllib_classification.R index a66fed8f4c2d7..fe4f2aeed4e00 100644 --- a/R/pkg/R/mllib_classification.R +++ b/R/pkg/R/mllib_classification.R @@ -290,7 +290,7 @@ setMethod("spark.logit", signature(data = "SparkDataFrame", formula = "formula") if (!is.null(lowerBoundsOnCoefficients) & (row != nrow(upperBoundsOnCoefficients) | col != ncol(upperBoundsOnCoefficients))) { - stop(paste("dimension of upperBoundsOnCoefficients ", + stop(paste0("dimension of upperBoundsOnCoefficients ", "is not the same as lowerBoundsOnCoefficients", sep = "")) } @@ -307,8 +307,7 @@ setMethod("spark.logit", signature(data = "SparkDataFrame", formula = "formula") as.numeric(elasticNetParam), as.integer(maxIter), as.numeric(tol), as.character(family), as.logical(standardization), as.array(thresholds), - weightCol, as.integer(aggregationDepth), - as.integer(row), as.integer(col), + weightCol, as.integer(aggregationDepth), row, col, lowerBoundsOnCoefficients, upperBoundsOnCoefficients, lowerBoundsOnIntercepts, upperBoundsOnIntercepts) new("LogisticRegressionModel", jobj = jobj) From 8bebb982c5cfa786a4d82567268908ab0ce242c4 Mon Sep 17 00:00:00 2001 From: wangmiao1981 Date: Sat, 3 Jun 2017 16:11:25 -0700 Subject: [PATCH 11/12] revert change of as.integer --- R/pkg/R/mllib_classification.R | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/R/pkg/R/mllib_classification.R b/R/pkg/R/mllib_classification.R index fe4f2aeed4e00..2ab3088868856 100644 --- a/R/pkg/R/mllib_classification.R +++ b/R/pkg/R/mllib_classification.R @@ -307,7 +307,8 @@ setMethod("spark.logit", signature(data = "SparkDataFrame", formula = "formula") as.numeric(elasticNetParam), as.integer(maxIter), as.numeric(tol), as.character(family), as.logical(standardization), as.array(thresholds), - weightCol, as.integer(aggregationDepth), row, col, + weightCol, as.integer(aggregationDepth), + as.integer(row), as.integer(col), lowerBoundsOnCoefficients, upperBoundsOnCoefficients, lowerBoundsOnIntercepts, upperBoundsOnIntercepts) new("LogisticRegressionModel", jobj = jobj) From 45b62cc9b070105ff2553ffc7b1e5788dc8134e5 Mon Sep 17 00:00:00 2001 From: wangmiao1981 Date: Fri, 16 Jun 2017 17:05:46 +0800 Subject: [PATCH 12/12] address review comments --- R/pkg/R/mllib_classification.R | 8 ++++---- .../spark/ml/classification/LogisticRegression.scala | 8 ++++---- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/R/pkg/R/mllib_classification.R b/R/pkg/R/mllib_classification.R index 2ab3088868856..1177a0f7f722b 100644 --- a/R/pkg/R/mllib_classification.R +++ b/R/pkg/R/mllib_classification.R @@ -211,10 +211,10 @@ function(object, path, overwrite = FALSE) { #' regression, or (number of classes, number of features) for multinomial regression. #' It is a R matrix. #' @param lowerBoundsOnIntercepts The lower bounds on intercepts if fitting under bound constrained optimization. -#' The bounds vector size must be equal with 1 for binomial regression, or the number +#' The bounds vector size must be equal to 1 for binomial regression, or the number #' of classes for multinomial regression. #' @param upperBoundsOnIntercepts The upper bounds on intercepts if fitting under bound constrained optimization. -#' The bound vector size must be equal with 1 for binomial regression, or the number +#' The bound vector size must be equal to 1 for binomial regression, or the number #' of classes for multinomial regression. #' @param ... additional arguments passed to the method. #' @return \code{spark.logit} returns a fitted logistic regression model. @@ -288,8 +288,8 @@ setMethod("spark.logit", signature(data = "SparkDataFrame", formula = "formula") stop("upperBoundsOnCoefficients must be a matrix.") } - if (!is.null(lowerBoundsOnCoefficients) & (row != nrow(upperBoundsOnCoefficients) - | col != ncol(upperBoundsOnCoefficients))) { + if (!is.null(lowerBoundsOnCoefficients) && (row != nrow(upperBoundsOnCoefficients) + || col != ncol(upperBoundsOnCoefficients))) { stop(paste0("dimension of upperBoundsOnCoefficients ", "is not the same as lowerBoundsOnCoefficients", sep = "")) } diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala index 567af0488e1b4..b234bc4c2df4f 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala @@ -214,7 +214,7 @@ private[classification] trait LogisticRegressionParams extends ProbabilisticClas /** * The lower bounds on intercepts if fitting under bound constrained optimization. - * The bounds vector size must be equal with 1 for binomial regression, or the number + * The bounds vector size must be equal to 1 for binomial regression, or the number * of classes for multinomial regression. Otherwise, it throws exception. * Default is none. * @@ -230,7 +230,7 @@ private[classification] trait LogisticRegressionParams extends ProbabilisticClas /** * The upper bounds on intercepts if fitting under bound constrained optimization. - * The bound vector size must be equal with 1 for binomial regression, or the number + * The bound vector size must be equal to 1 for binomial regression, or the number * of classes for multinomial regression. Otherwise, it throws exception. * Default is none. * @@ -451,12 +451,12 @@ class LogisticRegression @Since("1.2.0") ( } if (isSet(lowerBoundsOnIntercepts)) { require($(lowerBoundsOnIntercepts).size == numCoefficientSets, "The size of " + - "lowerBoundsOnIntercepts must be equal with 1 for binomial regression, or the number of " + + "lowerBoundsOnIntercepts must be equal to 1 for binomial regression, or the number of " + s"classes for multinomial regression, but found: ${getLowerBoundsOnIntercepts.size}.") } if (isSet(upperBoundsOnIntercepts)) { require($(upperBoundsOnIntercepts).size == numCoefficientSets, "The size of " + - "upperBoundsOnIntercepts must be equal with 1 for binomial regression, or the number of " + + "upperBoundsOnIntercepts must be equal to 1 for binomial regression, or the number of " + s"classes for multinomial regression, but found: ${getUpperBoundsOnIntercepts.size}.") } if (isSet(lowerBoundsOnCoefficients) && isSet(upperBoundsOnCoefficients)) {