From 56478be3bf7f64784e90d954bb188f5adaf79bd4 Mon Sep 17 00:00:00 2001 From: Zheng RuiFeng Date: Sat, 30 Apr 2016 12:40:58 +0800 Subject: [PATCH 01/12] create --- .../src/main/python/ml/one_vs_rest_example.py | 101 ++++++++++++++++++ 1 file changed, 101 insertions(+) create mode 100644 examples/src/main/python/ml/one_vs_rest_example.py diff --git a/examples/src/main/python/ml/one_vs_rest_example.py b/examples/src/main/python/ml/one_vs_rest_example.py new file mode 100644 index 0000000000000..b6aef39bd3c40 --- /dev/null +++ b/examples/src/main/python/ml/one_vs_rest_example.py @@ -0,0 +1,101 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +from __future__ import print_function + +import sys +from optparse import OptionParser + +from pyspark import SparkContext + +# $example on$ +from pyspark.ml.classification import LogisticRegression, OneVsRest +from pyspark.mllib.evaluation import MulticlassMetrics +from pyspark.sql import Row, SQLContext +# $example off$ + +""" +An example runner for Multiclass to Binary Reduction with One Vs Rest. +The example uses Logistic Regression as the base classifier. All parameters that +can be specified on the base classifier can be passed in to the runner options. +Run with: + + bin/spark-submit examples/src/main/python/ml/one_vs_rest_example.py +""" + +class Params: + def __init__(self, input, testInput, maxIter, tol, fitIntercept, regParam, + elasticNetParam, fracTest): + self.input = input + self.testInput = testInput + self.maxIter = maxIter + self.tol = tol + self.fitIntercept = fitIntercept + self.regParam = regParam + self.elasticNetParam = elasticNetParam + self.fracTest = fracTest + + +def parse(args): + parser = OptionParser() + + return Params + +if __name__ == "__main__": + + params = parse(sys.argv) + + sc = SparkContext(appName="OneVsRestExample") + sqlContext = SQLContext(sc) + + # $example on$ + inputData = sqlContext.read.format("libsvm").load(params.input) + # compute the train/test split: if testInput is not provided use part of input. + if params.testInput is not None: + train = inputData + test = sqlContext.read.format("libsvm").load(params.testInput) + else: + f = params.fracTest + (train, test) = inputData.randomSplit([1 - f, f]) + + lrParams = {'maxIter': params.maxIter, 'tol': params.tol, 'fitIntercept': params.fitIntercept} + if params.regParam is not None: + lrParams['regParam'] = params.regParam + if params.elasticNetParam is not None: + lrParams['elasticNetParam'] = params.elasticNetParam + + # instantiate the base classifier + lr = LogisticRegression(**lrParams) + + # instantiate the One Vs Rest Classifier. + ovr = OneVsRest(classifier=lr) + + # train the multiclass model. + ovrModel = ovr.fit(train) + + # score the model on test data. 
+ predictions = ovrModel.transform(test) + + # evaluate the model + predictionsAndLabels = predictions.select("prediction", "label").rdd.map(row => (row.getDouble(0), row.getDouble(1))) + + evaluator = MulticlassClassificationEvaluator( + labelCol="indexedLabel", predictionCol="prediction", metricName="precision") + accuracy = evaluator.evaluate(predictions) + # $example off$ + + sc.stop() From 8bcc0c4875aaec7c7d93e7ff5fab650a95c2dc86 Mon Sep 17 00:00:00 2001 From: Zheng RuiFeng Date: Tue, 3 May 2016 18:32:13 +0800 Subject: [PATCH 02/12] finish pr --- .../src/main/python/ml/one_vs_rest_example.py | 57 ++++++++++++------- 1 file changed, 35 insertions(+), 22 deletions(-) diff --git a/examples/src/main/python/ml/one_vs_rest_example.py b/examples/src/main/python/ml/one_vs_rest_example.py index b6aef39bd3c40..263e5fd92e019 100644 --- a/examples/src/main/python/ml/one_vs_rest_example.py +++ b/examples/src/main/python/ml/one_vs_rest_example.py @@ -17,15 +17,15 @@ from __future__ import print_function -import sys from optparse import OptionParser +import sys from pyspark import SparkContext # $example on$ from pyspark.ml.classification import LogisticRegression, OneVsRest from pyspark.mllib.evaluation import MulticlassMetrics -from pyspark.sql import Row, SQLContext +from pyspark.sql import SQLContext # $example off$ """ @@ -37,29 +37,30 @@ bin/spark-submit examples/src/main/python/ml/one_vs_rest_example.py """ -class Params: - def __init__(self, input, testInput, maxIter, tol, fitIntercept, regParam, - elasticNetParam, fracTest): - self.input = input - self.testInput = testInput - self.maxIter = maxIter - self.tol = tol - self.fitIntercept = fitIntercept - self.regParam = regParam - self.elasticNetParam = elasticNetParam - self.fracTest = fracTest - +parser = OptionParser() +parser.add_option("--input", type="string", help="input path to labeled examples. This path must be specified") +parser.add_option("--fracTest", type="float", default=0.2, help="fraction of data to hold out for testing. If given option testInput, this option is ignored. default: 0.2") +parser.add_option("--testInput", type="string", default=None, help="iinput path to test dataset. If given, option fracTest is ignored") +parser.add_option("--maxIter", type="int", default=100, help="maximum number of iterations for Logistic Regression. default: 100") +parser.add_option("--tol", type="float", default=1e-6, help="the convergence tolerance of iterations for Logistic Regression. default: 1e-6") +parser.add_option("--fitIntercept", type="string", default="true", help="fit intercept for Logistic Regression. default: true") +parser.add_option("--regParam", type="float", default=None, help="the regularization parameter for Logistic Regression. default: None") +parser.add_option("--elasticNetParam", type="float", default=None, help="the ElasticNet mixing parameter for Logistic Regression. default: None") def parse(args): - parser = OptionParser() - - return Params + (params, args) = parser.parse_args(args) + assert params.input != None, "input is required" + assert 0 <= params.fracTest < 1, "fracTest value incorrect; should be in [0,1)." 
+ assert params.fitIntercept in ("true", "false") + params.fitIntercept = params.fitIntercept == "true" + return params if __name__ == "__main__": + print(sys.argv) params = parse(sys.argv) - sc = SparkContext(appName="OneVsRestExample") + sc = SparkContext(appName="PythonOneVsRestExample") sqlContext = SQLContext(sc) # $example on$ @@ -91,11 +92,23 @@ def parse(args): predictions = ovrModel.transform(test) # evaluate the model - predictionsAndLabels = predictions.select("prediction", "label").rdd.map(row => (row.getDouble(0), row.getDouble(1))) + predictionAndLabels = predictions.rdd.map(lambda r: (r.prediction, r.label)) + + metrics = MulticlassMetrics(predictionAndLabels) + + confusionMatrix = metrics.confusionMatrix() + + # compute the false positive rate per label + numClasses = train.select('label').distinct().count() + + fprs = [(p, metrics.falsePositiveRate(float(p))) for p in range(numClasses)] + + print("Confusion Matrix") + print(confusionMatrix) - evaluator = MulticlassClassificationEvaluator( - labelCol="indexedLabel", predictionCol="prediction", metricName="precision") - accuracy = evaluator.evaluate(predictions) + print("label\tfpr") + for label, fpr in fprs: + print(str(label) + "\t" + str(fpr)) # $example off$ sc.stop() From 3d671738655cd29889a3637117605341dd60303f Mon Sep 17 00:00:00 2001 From: Zheng RuiFeng Date: Thu, 5 May 2016 13:01:20 +0800 Subject: [PATCH 03/12] use argparse --- .../src/main/python/ml/one_vs_rest_example.py | 44 ++++++++++++------- 1 file changed, 27 insertions(+), 17 deletions(-) diff --git a/examples/src/main/python/ml/one_vs_rest_example.py b/examples/src/main/python/ml/one_vs_rest_example.py index 263e5fd92e019..2bcd65b799688 100644 --- a/examples/src/main/python/ml/one_vs_rest_example.py +++ b/examples/src/main/python/ml/one_vs_rest_example.py @@ -17,8 +17,7 @@ from __future__ import print_function -from optparse import OptionParser -import sys +import argparse from pyspark import SparkContext @@ -37,28 +36,39 @@ bin/spark-submit examples/src/main/python/ml/one_vs_rest_example.py """ -parser = OptionParser() -parser.add_option("--input", type="string", help="input path to labeled examples. This path must be specified") -parser.add_option("--fracTest", type="float", default=0.2, help="fraction of data to hold out for testing. If given option testInput, this option is ignored. default: 0.2") -parser.add_option("--testInput", type="string", default=None, help="iinput path to test dataset. If given, option fracTest is ignored") -parser.add_option("--maxIter", type="int", default=100, help="maximum number of iterations for Logistic Regression. default: 100") -parser.add_option("--tol", type="float", default=1e-6, help="the convergence tolerance of iterations for Logistic Regression. default: 1e-6") -parser.add_option("--fitIntercept", type="string", default="true", help="fit intercept for Logistic Regression. default: true") -parser.add_option("--regParam", type="float", default=None, help="the regularization parameter for Logistic Regression. default: None") -parser.add_option("--elasticNetParam", type="float", default=None, help="the ElasticNet mixing parameter for Logistic Regression. default: None") - -def parse(args): - (params, args) = parser.parse_args(args) - assert params.input != None, "input is required" +def parse(): + parser = argparse.ArgumentParser() + parser.add_argument("--input", + help="input path to labeled examples. 
This path must be specified") + parser.add_argument("--fracTest", type=float, default=0.2, + help="fraction of data to hold out for testing. If given option testInput," + " this option is ignored. default: 0.2") + parser.add_argument("--testInput", + help="iinput path to test dataset. If given, option fracTest is ignored") + parser.add_argument("--maxIter", type=int, default=100, + help="maximum number of iterations for Logistic Regression. default: 100") + parser.add_argument("--tol", type=float, default=1e-6, + help="the convergence tolerance of iterations for Logistic Regression." + " default: 1e-6") + parser.add_argument("--fitIntercept", default="true", + help="fit intercept for Logistic Regression. default: true") + parser.add_argument("--regParam", type=float, + help="the regularization parameter for Logistic Regression. default: None") + parser.add_argument("--elasticNetParam", type=float, + help="the ElasticNet mixing parameter for Logistic Regression. default:" + " None") + params = parser.parse_args() + + assert params.input is not None, "input is required" assert 0 <= params.fracTest < 1, "fracTest value incorrect; should be in [0,1)." assert params.fitIntercept in ("true", "false") params.fitIntercept = params.fitIntercept == "true" + return params if __name__ == "__main__": - print(sys.argv) - params = parse(sys.argv) + params = parse() sc = SparkContext(appName="PythonOneVsRestExample") sqlContext = SQLContext(sc) From 4e08c908ceb0bfbf4b9fdb6d63662d68eb09acdf Mon Sep 17 00:00:00 2001 From: Zheng RuiFeng Date: Thu, 5 May 2016 13:11:19 +0800 Subject: [PATCH 04/12] fix py style --- examples/src/main/python/ml/one_vs_rest_example.py | 1 + 1 file changed, 1 insertion(+) diff --git a/examples/src/main/python/ml/one_vs_rest_example.py b/examples/src/main/python/ml/one_vs_rest_example.py index 2bcd65b799688..8103caa763b37 100644 --- a/examples/src/main/python/ml/one_vs_rest_example.py +++ b/examples/src/main/python/ml/one_vs_rest_example.py @@ -36,6 +36,7 @@ bin/spark-submit examples/src/main/python/ml/one_vs_rest_example.py """ + def parse(): parser = argparse.ArgumentParser() parser.add_argument("--input", From b80b1ec24d1c68a026f735e3039a57cb7f07fda0 Mon Sep 17 00:00:00 2001 From: Zheng RuiFeng Date: Thu, 5 May 2016 13:49:32 +0800 Subject: [PATCH 05/12] add link in user guide doc --- docs/ml-classification-regression.md | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/docs/ml-classification-regression.md b/docs/ml-classification-regression.md index eaf4f6d843368..f6a6937e29f04 100644 --- a/docs/ml-classification-regression.md +++ b/docs/ml-classification-regression.md @@ -300,6 +300,13 @@ Refer to the [Java API docs](api/java/org/apache/spark/ml/classification/OneVsRe {% include_example java/org/apache/spark/examples/ml/JavaOneVsRestExample.java %} + +
+ +Refer to the [Python API docs](api/python/pyspark.ml.html#pyspark.ml.classification.OneVsRest) for more details. + +{% include_example python/ml/one_vs_rest_example.py %} +
## Naive Bayes From 985a060909fe6d5b5122d89cb566a4ae51a81e2d Mon Sep 17 00:00:00 2001 From: Zheng RuiFeng Date: Fri, 6 May 2016 10:21:59 +0800 Subject: [PATCH 06/12] update OvR examples, del the args-parsing --- .../examples/ml/JavaOneVsRestExample.java | 192 +++--------------- .../src/main/python/ml/one_vs_rest_example.py | 83 ++------ .../spark/examples/ml/OneVsRestExample.scala | 140 ++----------- 3 files changed, 63 insertions(+), 352 deletions(-) diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaOneVsRestExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaOneVsRestExample.java index e0cb752224f75..de55653e39d94 100644 --- a/examples/src/main/java/org/apache/spark/examples/ml/JavaOneVsRestExample.java +++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaOneVsRestExample.java @@ -17,102 +17,66 @@ package org.apache.spark.examples.ml; -import org.apache.commons.cli.*; - // $example on$ import org.apache.spark.ml.classification.LogisticRegression; import org.apache.spark.ml.classification.OneVsRest; import org.apache.spark.ml.classification.OneVsRestModel; -import org.apache.spark.ml.util.MetadataUtils; import org.apache.spark.mllib.evaluation.MulticlassMetrics; import org.apache.spark.mllib.linalg.Matrix; -import org.apache.spark.mllib.linalg.Vector; import org.apache.spark.sql.Dataset; import org.apache.spark.sql.Row; -import org.apache.spark.sql.SparkSession; -import org.apache.spark.sql.types.StructField; // $example off$ +import org.apache.spark.sql.SparkSession; + /** * An example runner for Multiclass to Binary Reduction with One Vs Rest. - * The example uses Logistic Regression as the base classifier. All parameters that - * can be specified on the base classifier can be passed in to the runner options. + * The example uses Logistic Regression as the base classifier. * Run with *
- * bin/run-example ml.JavaOneVsRestExample [options]
+ * bin/run-example ml.JavaOneVsRestExample
  * </pre>
*/ public class JavaOneVsRestExample { - - private static class Params { - String input; - String testInput = null; - Integer maxIter = 100; - double tol = 1E-6; - boolean fitIntercept = true; - Double regParam = null; - Double elasticNetParam = null; - double fracTest = 0.2; - } - public static void main(String[] args) { - // parse the arguments - Params params = parse(args); SparkSession spark = SparkSession .builder() .appName("JavaOneVsRestExample") .getOrCreate(); // $example on$ - // configure the base classifier - LogisticRegression classifier = new LogisticRegression() - .setMaxIter(params.maxIter) - .setTol(params.tol) - .setFitIntercept(params.fitIntercept); + // load data file. + Dataset inputData = spark.read().format("libsvm") + .load("data/mllib/sample_multiclass_classification_data.txt"); - if (params.regParam != null) { - classifier.setRegParam(params.regParam); - } - if (params.elasticNetParam != null) { - classifier.setElasticNetParam(params.elasticNetParam); - } + // generate the train/test split. + Dataset[] tmp = inputData.randomSplit(new double[]{0.8, 0.2}); + Dataset train = tmp[0]; + Dataset test = tmp[1]; - // instantiate the One Vs Rest Classifier - OneVsRest ovr = new OneVsRest().setClassifier(classifier); + // configure the base classifier. + LogisticRegression classifier = new LogisticRegression() + .setMaxIter(10) + .setTol(1E-6) + .setFitIntercept(true); - String input = params.input; - Dataset inputData = spark.read().format("libsvm").load(input); - Dataset train; - Dataset test; - - // compute the train/ test split: if testInput is not provided use part of input - String testInput = params.testInput; - if (testInput != null) { - train = inputData; - // compute the number of features in the training set. - int numFeatures = inputData.first().getAs(1).size(); - test = spark.read().format("libsvm").option("numFeatures", - String.valueOf(numFeatures)).load(testInput); - } else { - double f = params.fracTest; - Dataset[] tmp = inputData.randomSplit(new double[]{1 - f, f}, 12345); - train = tmp[0]; - test = tmp[1]; - } + // instantiate the One Vs Rest Classifier. + OneVsRest ovr = new OneVsRest().setClassifier(classifier); - // train the multiclass model + // train the multiclass model. OneVsRestModel ovrModel = ovr.fit(train.cache()); - // score the model on test data - Dataset predictions = ovrModel.transform(test.cache()) + // score the model on test data. + Dataset predictions = ovrModel.transform(test) .select("prediction", "label"); - // obtain metrics + // obtain metrics. MulticlassMetrics metrics = new MulticlassMetrics(predictions); - StructField predictionColSchema = predictions.schema().apply("prediction"); - Integer numClasses = (Integer) MetadataUtils.getNumClasses(predictionColSchema).get(); - // compute the false positive rate per label + Matrix confusionMatrix = metrics.confusionMatrix(); + + // compute the false positive rate per label. 
+ int numClasses = confusionMatrix.numRows(); StringBuilder results = new StringBuilder(); results.append("label\tfpr\n"); for (int label = 0; label < numClasses; label++) { @@ -122,8 +86,6 @@ public static void main(String[] args) { results.append("\n"); } - Matrix confusionMatrix = metrics.confusionMatrix(); - // output the Confusion Matrix System.out.println("Confusion Matrix"); System.out.println(confusionMatrix); System.out.println(); @@ -133,106 +95,4 @@ public static void main(String[] args) { spark.stop(); } - private static Params parse(String[] args) { - Options options = generateCommandlineOptions(); - CommandLineParser parser = new PosixParser(); - Params params = new Params(); - - try { - CommandLine cmd = parser.parse(options, args); - String value; - if (cmd.hasOption("input")) { - params.input = cmd.getOptionValue("input"); - } - if (cmd.hasOption("maxIter")) { - value = cmd.getOptionValue("maxIter"); - params.maxIter = Integer.parseInt(value); - } - if (cmd.hasOption("tol")) { - value = cmd.getOptionValue("tol"); - params.tol = Double.parseDouble(value); - } - if (cmd.hasOption("fitIntercept")) { - value = cmd.getOptionValue("fitIntercept"); - params.fitIntercept = Boolean.parseBoolean(value); - } - if (cmd.hasOption("regParam")) { - value = cmd.getOptionValue("regParam"); - params.regParam = Double.parseDouble(value); - } - if (cmd.hasOption("elasticNetParam")) { - value = cmd.getOptionValue("elasticNetParam"); - params.elasticNetParam = Double.parseDouble(value); - } - if (cmd.hasOption("testInput")) { - value = cmd.getOptionValue("testInput"); - params.testInput = value; - } - if (cmd.hasOption("fracTest")) { - value = cmd.getOptionValue("fracTest"); - params.fracTest = Double.parseDouble(value); - } - - } catch (ParseException e) { - printHelpAndQuit(options); - } - return params; - } - - @SuppressWarnings("static") - private static Options generateCommandlineOptions() { - Option input = OptionBuilder.withArgName("input") - .hasArg() - .isRequired() - .withDescription("input path to labeled examples. This path must be specified") - .create("input"); - Option testInput = OptionBuilder.withArgName("testInput") - .hasArg() - .withDescription("input path to test examples") - .create("testInput"); - Option fracTest = OptionBuilder.withArgName("testInput") - .hasArg() - .withDescription("fraction of data to hold out for testing." + - " If given option testInput, this option is ignored. default: 0.2") - .create("fracTest"); - Option maxIter = OptionBuilder.withArgName("maxIter") - .hasArg() - .withDescription("maximum number of iterations for Logistic Regression. default:100") - .create("maxIter"); - Option tol = OptionBuilder.withArgName("tol") - .hasArg() - .withDescription("the convergence tolerance of iterations " + - "for Logistic Regression. default: 1E-6") - .create("tol"); - Option fitIntercept = OptionBuilder.withArgName("fitIntercept") - .hasArg() - .withDescription("fit intercept for logistic regression. 
default true") - .create("fitIntercept"); - Option regParam = OptionBuilder.withArgName( "regParam" ) - .hasArg() - .withDescription("the regularization parameter for Logistic Regression.") - .create("regParam"); - Option elasticNetParam = OptionBuilder.withArgName("elasticNetParam" ) - .hasArg() - .withDescription("the ElasticNet mixing parameter for Logistic Regression.") - .create("elasticNetParam"); - - Options options = new Options() - .addOption(input) - .addOption(testInput) - .addOption(fracTest) - .addOption(maxIter) - .addOption(tol) - .addOption(fitIntercept) - .addOption(regParam) - .addOption(elasticNetParam); - - return options; - } - - private static void printHelpAndQuit(Options options) { - HelpFormatter formatter = new HelpFormatter(); - formatter.printHelp("JavaOneVsRestExample", options); - System.exit(-1); - } } diff --git a/examples/src/main/python/ml/one_vs_rest_example.py b/examples/src/main/python/ml/one_vs_rest_example.py index 8103caa763b37..41b2c46fe1c35 100644 --- a/examples/src/main/python/ml/one_vs_rest_example.py +++ b/examples/src/main/python/ml/one_vs_rest_example.py @@ -17,80 +17,36 @@ from __future__ import print_function -import argparse - -from pyspark import SparkContext - # $example on$ from pyspark.ml.classification import LogisticRegression, OneVsRest from pyspark.mllib.evaluation import MulticlassMetrics -from pyspark.sql import SQLContext # $example off$ +from pyspark.sql import SparkSession """ An example runner for Multiclass to Binary Reduction with One Vs Rest. -The example uses Logistic Regression as the base classifier. All parameters that -can be specified on the base classifier can be passed in to the runner options. +The example uses Logistic Regression as the base classifier. Run with: - bin/spark-submit examples/src/main/python/ml/one_vs_rest_example.py """ -def parse(): - parser = argparse.ArgumentParser() - parser.add_argument("--input", - help="input path to labeled examples. This path must be specified") - parser.add_argument("--fracTest", type=float, default=0.2, - help="fraction of data to hold out for testing. If given option testInput," - " this option is ignored. default: 0.2") - parser.add_argument("--testInput", - help="iinput path to test dataset. If given, option fracTest is ignored") - parser.add_argument("--maxIter", type=int, default=100, - help="maximum number of iterations for Logistic Regression. default: 100") - parser.add_argument("--tol", type=float, default=1e-6, - help="the convergence tolerance of iterations for Logistic Regression." - " default: 1e-6") - parser.add_argument("--fitIntercept", default="true", - help="fit intercept for Logistic Regression. default: true") - parser.add_argument("--regParam", type=float, - help="the regularization parameter for Logistic Regression. default: None") - parser.add_argument("--elasticNetParam", type=float, - help="the ElasticNet mixing parameter for Logistic Regression. default:" - " None") - params = parser.parse_args() - - assert params.input is not None, "input is required" - assert 0 <= params.fracTest < 1, "fracTest value incorrect; should be in [0,1)." - assert params.fitIntercept in ("true", "false") - params.fitIntercept = params.fitIntercept == "true" - - return params - if __name__ == "__main__": + spark = SparkSession \ + .builder \ + .appName("OneHotEncoderExample") \ + .getOrCreate() - params = parse() + # $example on$ + # load data file. 
+ inputData = spark.read.format("libsvm") \ + .load("data/mllib/sample_multiclass_classification_data.txt") - sc = SparkContext(appName="PythonOneVsRestExample") - sqlContext = SQLContext(sc) + # generate the train/test split. + (train, test) = inputData.randomSplit([0.8, 0.2]) - # $example on$ - inputData = sqlContext.read.format("libsvm").load(params.input) - # compute the train/test split: if testInput is not provided use part of input. - if params.testInput is not None: - train = inputData - test = sqlContext.read.format("libsvm").load(params.testInput) - else: - f = params.fracTest - (train, test) = inputData.randomSplit([1 - f, f]) - - lrParams = {'maxIter': params.maxIter, 'tol': params.tol, 'fitIntercept': params.fitIntercept} - if params.regParam is not None: - lrParams['regParam'] = params.regParam - if params.elasticNetParam is not None: - lrParams['elasticNetParam'] = params.elasticNetParam - - # instantiate the base classifier + # instantiate the base classifier. + lrParams = {'maxIter': 10, 'tol': 1E-6, 'fitIntercept': True} lr = LogisticRegression(**lrParams) # instantiate the One Vs Rest Classifier. @@ -102,24 +58,21 @@ def parse(): # score the model on test data. predictions = ovrModel.transform(test) - # evaluate the model + # obtain metrics. predictionAndLabels = predictions.rdd.map(lambda r: (r.prediction, r.label)) - metrics = MulticlassMetrics(predictionAndLabels) confusionMatrix = metrics.confusionMatrix() - # compute the false positive rate per label - numClasses = train.select('label').distinct().count() - + # compute the false positive rate per label. + numClasses = confusionMatrix.numRows fprs = [(p, metrics.falsePositiveRate(float(p))) for p in range(numClasses)] print("Confusion Matrix") print(confusionMatrix) - print("label\tfpr") for label, fpr in fprs: print(str(label) + "\t" + str(fpr)) # $example off$ - sc.stop() + spark.stop() diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/OneVsRestExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/OneVsRestExample.scala index fc73ae07ff6c6..411cae8e25767 100644 --- a/examples/src/main/scala/org/apache/spark/examples/ml/OneVsRestExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/ml/OneVsRestExample.scala @@ -18,171 +18,69 @@ // scalastyle:off println package org.apache.spark.examples.ml -import java.util.concurrent.TimeUnit.{NANOSECONDS => NANO} - -import scopt.OptionParser - // $example on$ -import org.apache.spark.examples.mllib.AbstractParams import org.apache.spark.ml.classification.{LogisticRegression, OneVsRest} -import org.apache.spark.ml.util.MetadataUtils import org.apache.spark.mllib.evaluation.MulticlassMetrics -import org.apache.spark.mllib.linalg.Vector -import org.apache.spark.sql.DataFrame +import org.apache.spark.sql.{DataFrame, Dataset} // $example off$ import org.apache.spark.sql.SparkSession /** * An example runner for Multiclass to Binary Reduction with One Vs Rest. - * The example uses Logistic Regression as the base classifier. All parameters that - * can be specified on the base classifier can be passed in to the runner options. + * The example uses Logistic Regression as the base classifier. * Run with * {{{ * ./bin/run-example ml.OneVsRestExample [options] * }}} - * For local mode, run - * {{{ - * ./bin/spark-submit --class org.apache.spark.examples.ml.OneVsRestExample --driver-memory 1g - * [examples JAR path] [options] - * }}} - * If you use it as a template to create your own app, please use `spark-submit` to submit your app. 
*/ -object OneVsRestExample { - - case class Params private[ml] ( - input: String = null, - testInput: Option[String] = None, - maxIter: Int = 100, - tol: Double = 1E-6, - fitIntercept: Boolean = true, - regParam: Option[Double] = None, - elasticNetParam: Option[Double] = None, - fracTest: Double = 0.2) extends AbstractParams[Params] +object OneVsRestExample { def main(args: Array[String]) { - val defaultParams = Params() - - val parser = new OptionParser[Params]("OneVsRest Example") { - head("OneVsRest Example: multiclass to binary reduction using OneVsRest") - opt[String]("input") - .text("input path to labeled examples. This path must be specified") - .required() - .action((x, c) => c.copy(input = x)) - opt[Double]("fracTest") - .text(s"fraction of data to hold out for testing. If given option testInput, " + - s"this option is ignored. default: ${defaultParams.fracTest}") - .action((x, c) => c.copy(fracTest = x)) - opt[String]("testInput") - .text("input path to test dataset. If given, option fracTest is ignored") - .action((x, c) => c.copy(testInput = Some(x))) - opt[Int]("maxIter") - .text(s"maximum number of iterations for Logistic Regression." + - s" default: ${defaultParams.maxIter}") - .action((x, c) => c.copy(maxIter = x)) - opt[Double]("tol") - .text(s"the convergence tolerance of iterations for Logistic Regression." + - s" default: ${defaultParams.tol}") - .action((x, c) => c.copy(tol = x)) - opt[Boolean]("fitIntercept") - .text(s"fit intercept for Logistic Regression." + - s" default: ${defaultParams.fitIntercept}") - .action((x, c) => c.copy(fitIntercept = x)) - opt[Double]("regParam") - .text(s"the regularization parameter for Logistic Regression.") - .action((x, c) => c.copy(regParam = Some(x))) - opt[Double]("elasticNetParam") - .text(s"the ElasticNet mixing parameter for Logistic Regression.") - .action((x, c) => c.copy(elasticNetParam = Some(x))) - checkConfig { params => - if (params.fracTest < 0 || params.fracTest >= 1) { - failure(s"fracTest ${params.fracTest} value incorrect; should be in [0,1).") - } else { - success - } - } - } - parser.parse(args, defaultParams).map { params => - run(params) - }.getOrElse { - sys.exit(1) - } - } - - private def run(params: Params) { val spark = SparkSession .builder - .appName(s"OneVsRestExample with $params") + .appName(s"OneVsRestExample") .getOrCreate() // $example on$ - val inputData = spark.read.format("libsvm").load(params.input) - // compute the train/test split: if testInput is not provided use part of input. - val data = params.testInput match { - case Some(t) => - // compute the number of features in the training set. - val numFeatures = inputData.first().getAs[Vector](1).size - val testData = spark.read.option("numFeatures", numFeatures.toString) - .format("libsvm").load(t) - Array[DataFrame](inputData, testData) - case None => - val f = params.fracTest - inputData.randomSplit(Array(1 - f, f), seed = 12345) - } - val Array(train, test) = data.map(_.cache()) + // load data file. + val inputData: DataFrame = spark.read.format("libsvm") + .load("data/mllib/sample_multiclass_classification_data.txt") + + // generate the train/test split. 
+ val Array(train, test) = inputData.randomSplit(Array(0.8, 0.2)) // instantiate the base classifier val classifier = new LogisticRegression() - .setMaxIter(params.maxIter) - .setTol(params.tol) - .setFitIntercept(params.fitIntercept) - - // Set regParam, elasticNetParam if specified in params - params.regParam.foreach(classifier.setRegParam) - params.elasticNetParam.foreach(classifier.setElasticNetParam) + .setMaxIter(10) + .setTol(1E-6) + .setFitIntercept(true) // instantiate the One Vs Rest Classifier. - val ovr = new OneVsRest() ovr.setClassifier(classifier) // train the multiclass model. - val (trainingDuration, ovrModel) = time(ovr.fit(train)) + val ovrModel = ovr.fit(train) // score the model on test data. - val (predictionDuration, predictions) = time(ovrModel.transform(test)) + val predictions = ovrModel.transform(test) - // evaluate the model - val predictionsAndLabels = predictions.select("prediction", "label") - .rdd.map(row => (row.getDouble(0), row.getDouble(1))) - - val metrics = new MulticlassMetrics(predictionsAndLabels) + // obtain metrics. + val metrics = new MulticlassMetrics(predictions.as[(Double, Double)].rdd) val confusionMatrix = metrics.confusionMatrix - // compute the false positive rate per label - val predictionColSchema = predictions.schema("prediction") - val numClasses = MetadataUtils.getNumClasses(predictionColSchema).get + // compute the false positive rate per label. + val numClasses = confusionMatrix.numRows val fprs = Range(0, numClasses).map(p => (p, metrics.falsePositiveRate(p.toDouble))) - println(s" Training Time ${trainingDuration} sec\n") - - println(s" Prediction Time ${predictionDuration} sec\n") - println(s" Confusion Matrix\n ${confusionMatrix.toString}\n") - println("label\tfpr") - println(fprs.map {case (label, fpr) => label + "\t" + fpr}.mkString("\n")) // $example off$ spark.stop() } - private def time[R](block: => R): (Long, R) = { - val t0 = System.nanoTime() - val result = block // call-by-name - val t1 = System.nanoTime() - (NANO.toSeconds(t1 - t0), result) - } } // scalastyle:on println From ff84ccc125f8f3cda4a521a1489b81eb0651da8d Mon Sep 17 00:00:00 2001 From: Zheng RuiFeng Date: Fri, 6 May 2016 10:58:10 +0800 Subject: [PATCH 07/12] add implicits for dataset.as --- .../scala/org/apache/spark/examples/ml/OneVsRestExample.scala | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/OneVsRestExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/OneVsRestExample.scala index 411cae8e25767..a4cfd81adff4a 100644 --- a/examples/src/main/scala/org/apache/spark/examples/ml/OneVsRestExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/ml/OneVsRestExample.scala @@ -21,7 +21,7 @@ package org.apache.spark.examples.ml // $example on$ import org.apache.spark.ml.classification.{LogisticRegression, OneVsRest} import org.apache.spark.mllib.evaluation.MulticlassMetrics -import org.apache.spark.sql.{DataFrame, Dataset} +import org.apache.spark.sql.DataFrame // $example off$ import org.apache.spark.sql.SparkSession @@ -41,6 +41,8 @@ object OneVsRestExample { .appName(s"OneVsRestExample") .getOrCreate() + import spark.implicits._ + // $example on$ // load data file. 
val inputData: DataFrame = spark.read.format("libsvm") From aec9994813d9c7e141a5e61cc38843626dc4046d Mon Sep 17 00:00:00 2001 From: Zheng RuiFeng Date: Fri, 6 May 2016 11:01:26 +0800 Subject: [PATCH 08/12] fix one nit --- .../scala/org/apache/spark/examples/ml/OneVsRestExample.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/OneVsRestExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/OneVsRestExample.scala index a4cfd81adff4a..680eac3de4320 100644 --- a/examples/src/main/scala/org/apache/spark/examples/ml/OneVsRestExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/ml/OneVsRestExample.scala @@ -30,7 +30,7 @@ import org.apache.spark.sql.SparkSession * The example uses Logistic Regression as the base classifier. * Run with * {{{ - * ./bin/run-example ml.OneVsRestExample [options] + * ./bin/run-example ml.OneVsRestExample * }}} */ From 78f503d51c745f42bd5a40fdc55e8b8070d41fa7 Mon Sep 17 00:00:00 2001 From: Zheng RuiFeng Date: Sun, 8 May 2016 10:34:58 +0800 Subject: [PATCH 09/12] use MulticlassClassificationEvaluator --- .../examples/ml/JavaOneVsRestExample.java | 27 +++++-------------- .../src/main/python/ml/one_vs_rest_example.py | 26 ++++++------------ .../spark/examples/ml/OneVsRestExample.scala | 24 ++++++----------- 3 files changed, 23 insertions(+), 54 deletions(-) diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaOneVsRestExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaOneVsRestExample.java index de55653e39d94..0e32d736839a0 100644 --- a/examples/src/main/java/org/apache/spark/examples/ml/JavaOneVsRestExample.java +++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaOneVsRestExample.java @@ -21,7 +21,7 @@ import org.apache.spark.ml.classification.LogisticRegression; import org.apache.spark.ml.classification.OneVsRest; import org.apache.spark.ml.classification.OneVsRestModel; -import org.apache.spark.mllib.evaluation.MulticlassMetrics; +import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator; import org.apache.spark.mllib.linalg.Matrix; import org.apache.spark.sql.Dataset; import org.apache.spark.sql.Row; @@ -70,26 +70,13 @@ public static void main(String[] args) { Dataset predictions = ovrModel.transform(test) .select("prediction", "label"); - // obtain metrics. - MulticlassMetrics metrics = new MulticlassMetrics(predictions); + // obtain evaluator. + MulticlassClassificationEvaluator evaluator = new MulticlassClassificationEvaluator() + .setMetricName("precision"); - Matrix confusionMatrix = metrics.confusionMatrix(); - - // compute the false positive rate per label. - int numClasses = confusionMatrix.numRows(); - StringBuilder results = new StringBuilder(); - results.append("label\tfpr\n"); - for (int label = 0; label < numClasses; label++) { - results.append(label); - results.append("\t"); - results.append(metrics.falsePositiveRate((double) label)); - results.append("\n"); - } - - System.out.println("Confusion Matrix"); - System.out.println(confusionMatrix); - System.out.println(); - System.out.println(results); + // compute the classification error on test data. 
+ double precision = evaluator.evaluate(predictions); + System.out.print("Test Error : " + (1 - precision)); // $example off$ spark.stop(); diff --git a/examples/src/main/python/ml/one_vs_rest_example.py b/examples/src/main/python/ml/one_vs_rest_example.py index 41b2c46fe1c35..574fdd235f9fd 100644 --- a/examples/src/main/python/ml/one_vs_rest_example.py +++ b/examples/src/main/python/ml/one_vs_rest_example.py @@ -19,7 +19,7 @@ # $example on$ from pyspark.ml.classification import LogisticRegression, OneVsRest -from pyspark.mllib.evaluation import MulticlassMetrics +from pyspark.ml.evaluation import MulticlassClassificationEvaluator # $example off$ from pyspark.sql import SparkSession @@ -34,7 +34,7 @@ if __name__ == "__main__": spark = SparkSession \ .builder \ - .appName("OneHotEncoderExample") \ + .appName("PythonOneVsRestExample") \ .getOrCreate() # $example on$ @@ -46,8 +46,7 @@ (train, test) = inputData.randomSplit([0.8, 0.2]) # instantiate the base classifier. - lrParams = {'maxIter': 10, 'tol': 1E-6, 'fitIntercept': True} - lr = LogisticRegression(**lrParams) + lr = LogisticRegression(maxIter=10, tol=1E-6, fitIntercept=True) # instantiate the One Vs Rest Classifier. ovr = OneVsRest(classifier=lr) @@ -58,21 +57,12 @@ # score the model on test data. predictions = ovrModel.transform(test) - # obtain metrics. - predictionAndLabels = predictions.rdd.map(lambda r: (r.prediction, r.label)) - metrics = MulticlassMetrics(predictionAndLabels) + # obtain evaluator. + evaluator = MulticlassClassificationEvaluator(metricName="precision") - confusionMatrix = metrics.confusionMatrix() - - # compute the false positive rate per label. - numClasses = confusionMatrix.numRows - fprs = [(p, metrics.falsePositiveRate(float(p))) for p in range(numClasses)] - - print("Confusion Matrix") - print(confusionMatrix) - print("label\tfpr") - for label, fpr in fprs: - print(str(label) + "\t" + str(fpr)) + # compute the classification error on test data. + precision = evaluator.evaluate(predictions) + print("Test Error : " + str(1 - precision)) # $example off$ spark.stop() diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/OneVsRestExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/OneVsRestExample.scala index 680eac3de4320..2ca750a26d0c9 100644 --- a/examples/src/main/scala/org/apache/spark/examples/ml/OneVsRestExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/ml/OneVsRestExample.scala @@ -20,7 +20,7 @@ package org.apache.spark.examples.ml // $example on$ import org.apache.spark.ml.classification.{LogisticRegression, OneVsRest} -import org.apache.spark.mllib.evaluation.MulticlassMetrics +import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator import org.apache.spark.sql.DataFrame // $example off$ import org.apache.spark.sql.SparkSession @@ -41,8 +41,6 @@ object OneVsRestExample { .appName(s"OneVsRestExample") .getOrCreate() - import spark.implicits._ - // $example on$ // load data file. val inputData: DataFrame = spark.read.format("libsvm") @@ -58,8 +56,7 @@ object OneVsRestExample { .setFitIntercept(true) // instantiate the One Vs Rest Classifier. - val ovr = new OneVsRest() - ovr.setClassifier(classifier) + val ovr = new OneVsRest().setClassifier(classifier) // train the multiclass model. val ovrModel = ovr.fit(train) @@ -67,18 +64,13 @@ object OneVsRestExample { // score the model on test data. val predictions = ovrModel.transform(test) - // obtain metrics. 
- val metrics = new MulticlassMetrics(predictions.as[(Double, Double)].rdd) - - val confusionMatrix = metrics.confusionMatrix - - // compute the false positive rate per label. - val numClasses = confusionMatrix.numRows - val fprs = Range(0, numClasses).map(p => (p, metrics.falsePositiveRate(p.toDouble))) + // obtain evaluator. + val evaluator = new MulticlassClassificationEvaluator() + .setMetricName("precision") - println(s" Confusion Matrix\n ${confusionMatrix.toString}\n") - println("label\tfpr") - println(fprs.map {case (label, fpr) => label + "\t" + fpr}.mkString("\n")) + // compute the classification error on test data. + val precision = evaluator.evaluate(predictions) + println(s"Test Error : ${1 - precision}") // $example off$ spark.stop() From 4538b647ef85864cad412dd727c4c47aa5006d65 Mon Sep 17 00:00:00 2001 From: Zheng RuiFeng Date: Sun, 8 May 2016 17:49:54 +0800 Subject: [PATCH 10/12] java print -> println --- .../java/org/apache/spark/examples/ml/JavaOneVsRestExample.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaOneVsRestExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaOneVsRestExample.java index 0e32d736839a0..2c234e218be42 100644 --- a/examples/src/main/java/org/apache/spark/examples/ml/JavaOneVsRestExample.java +++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaOneVsRestExample.java @@ -76,7 +76,7 @@ public static void main(String[] args) { // compute the classification error on test data. double precision = evaluator.evaluate(predictions); - System.out.print("Test Error : " + (1 - precision)); + System.out.println("Test Error : " + (1 - precision)); // $example off$ spark.stop(); From 77ff733f298fe5042070dfb5cbcf33519f9a6a18 Mon Sep 17 00:00:00 2001 From: Zheng RuiFeng Date: Wed, 11 May 2016 09:36:18 +0800 Subject: [PATCH 11/12] del one unused import and update comments --- .../org/apache/spark/examples/ml/JavaOneVsRestExample.java | 5 ++--- examples/src/main/python/ml/one_vs_rest_example.py | 4 ++-- .../org/apache/spark/examples/ml/OneVsRestExample.scala | 4 ++-- 3 files changed, 6 insertions(+), 7 deletions(-) diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaOneVsRestExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaOneVsRestExample.java index 2c234e218be42..f22d5bab0342a 100644 --- a/examples/src/main/java/org/apache/spark/examples/ml/JavaOneVsRestExample.java +++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaOneVsRestExample.java @@ -22,7 +22,6 @@ import org.apache.spark.ml.classification.OneVsRest; import org.apache.spark.ml.classification.OneVsRestModel; import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator; -import org.apache.spark.mllib.linalg.Matrix; import org.apache.spark.sql.Dataset; import org.apache.spark.sql.Row; // $example off$ @@ -30,8 +29,8 @@ /** - * An example runner for Multiclass to Binary Reduction with One Vs Rest. - * The example uses Logistic Regression as the base classifier. + * An example of Multiclass to Binary Reduction with One Vs Rest, + * using Logistic Regression as the base classifier. * Run with *
  * bin/run-example ml.JavaOneVsRestExample
diff --git a/examples/src/main/python/ml/one_vs_rest_example.py b/examples/src/main/python/ml/one_vs_rest_example.py
index 574fdd235f9fd..971156d0dd293 100644
--- a/examples/src/main/python/ml/one_vs_rest_example.py
+++ b/examples/src/main/python/ml/one_vs_rest_example.py
@@ -24,8 +24,8 @@
 from pyspark.sql import SparkSession
 
 """
-An example runner for Multiclass to Binary Reduction with One Vs Rest.
-The example uses Logistic Regression as the base classifier.
+An example of Multiclass to Binary Reduction with One Vs Rest,
+using Logistic Regression as the base classifier.
 Run with:
   bin/spark-submit examples/src/main/python/ml/one_vs_rest_example.py
 """
diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/OneVsRestExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/OneVsRestExample.scala
index 2ca750a26d0c9..0b333cf629419 100644
--- a/examples/src/main/scala/org/apache/spark/examples/ml/OneVsRestExample.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/ml/OneVsRestExample.scala
@@ -26,8 +26,8 @@ import org.apache.spark.sql.DataFrame
 import org.apache.spark.sql.SparkSession
 
 /**
- * An example runner for Multiclass to Binary Reduction with One Vs Rest.
- * The example uses Logistic Regression as the base classifier.
+ * An example of Multiclass to Binary Reduction with One Vs Rest,
+ * using Logistic Regression as the base classifier.
  * Run with
  * {{{
  * ./bin/run-example ml.OneVsRestExample

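
A note on the metric used in these revisions: for single-label multiclass predictions, metricName="precision" on MulticlassClassificationEvaluator is micro-averaged precision, which works out to overall accuracy. Later Spark releases deprecate the "precision" name in favor of an explicit "accuracy" option; a sketch of the equivalent call, assuming the `predictions` DataFrame built in the examples above:

    from pyspark.ml.evaluation import MulticlassClassificationEvaluator

    # "accuracy" replaces the deprecated "precision" name in later Spark
    # releases; both yield the same value for single-label predictions.
    evaluator = MulticlassClassificationEvaluator(metricName="accuracy")
    accuracy = evaluator.evaluate(predictions)
    print("Test Error : " + str(1 - accuracy))
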
From 9049002d48bee41a389782c082a7fd4752a6d3fe Mon Sep 17 00:00:00 2001
From: Zheng RuiFeng 
Date: Wed, 11 May 2016 15:24:38 +0800
Subject: [PATCH 12/12] rename cache in Java

---
 .../java/org/apache/spark/examples/ml/JavaOneVsRestExample.java | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaOneVsRestExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaOneVsRestExample.java
index f22d5bab0342a..5bf455ebfed23 100644
--- a/examples/src/main/java/org/apache/spark/examples/ml/JavaOneVsRestExample.java
+++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaOneVsRestExample.java
@@ -63,7 +63,7 @@ public static void main(String[] args) {
     OneVsRest ovr = new OneVsRest().setClassifier(classifier);
 
     // train the multiclass model.
-    OneVsRestModel ovrModel = ovr.fit(train.cache());
+    OneVsRestModel ovrModel = ovr.fit(train);
 
     // score the model on test data.
     Dataset<Row> predictions = ovrModel.transform(test)
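
The confusion matrix and per-label false positive rates printed by the earlier revisions (dropped in PATCH 09 in favor of a single summary number) can still be recovered alongside the final example. A minimal sketch using the RDD-based MulticlassMetrics API, assuming `ovrModel` and `test` as defined in the example above:

    from pyspark.mllib.evaluation import MulticlassMetrics

    # assumes `ovrModel` and `test` from the final example above.
    predictionAndLabels = ovrModel.transform(test) \
        .select("prediction", "label") \
        .rdd.map(lambda r: (float(r[0]), float(r[1])))

    metrics = MulticlassMetrics(predictionAndLabels)
    confusionMatrix = metrics.confusionMatrix()

    print("Confusion Matrix")
    print(confusionMatrix)
    print("label\tfpr")
    for label in range(confusionMatrix.numRows):
        print(str(label) + "\t" + str(metrics.falsePositiveRate(float(label))))
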