From c1bc4fd1639a75afd914f3bd7f280a3880548798 Mon Sep 17 00:00:00 2001 From: Niketan Pansare Date: Fri, 5 Aug 2016 08:17:47 -0700 Subject: [PATCH 01/14] [SYSTEMML-234] [SYSTEMML-208] Added mllearn library to support scikit-learn and MLPipeline --- .../java/org/apache/sysml/api/MLContext.java | 27 ++- .../java/org/apache/sysml/api/MLOutput.java | 39 +++- .../org/apache/sysml/api/python/SystemML.py | 174 +++++++++++++- .../spark/utils/RDDConverterUtilsExt.java | 43 ++++ .../sysml/api/ml/LogisticRegression.scala | 218 ++++++++++++++---- 5 files changed, 441 insertions(+), 60 deletions(-) diff --git a/src/main/java/org/apache/sysml/api/MLContext.java b/src/main/java/org/apache/sysml/api/MLContext.java index a03c8b7753e..32b0ce949e5 100644 --- a/src/main/java/org/apache/sysml/api/MLContext.java +++ b/src/main/java/org/apache/sysml/api/MLContext.java @@ -477,6 +477,25 @@ public void registerInput(String varName, RDD rdd, String format, long r registerInput(varName, rdd.toJavaRDD().mapToPair(new ConvertStringToLongTextPair()), format, rlen, clen, nnz, null); } + public void registerInput(String varName, MatrixBlock mb) throws DMLRuntimeException { + MatrixCharacteristics mc = new MatrixCharacteristics(mb.getNumRows(), mb.getNumColumns(), OptimizerUtils.DEFAULT_BLOCKSIZE, OptimizerUtils.DEFAULT_BLOCKSIZE, mb.getNonZeros()); + registerInput(varName, mb, mc); + } + + public void registerInput(String varName, MatrixBlock mb, MatrixCharacteristics mc) throws DMLRuntimeException { + if(_variables == null) + _variables = new LocalVariableMap(); + if(_inVarnames == null) + _inVarnames = new ArrayList(); + + MatrixObject mo = new MatrixObject(ValueType.DOUBLE, "temp", new MatrixFormatMetaData(mc, OutputInfo.BinaryBlockOutputInfo, InputInfo.BinaryBlockInputInfo)); + mo.acquireModify(mb); + mo.release(); + _variables.put(varName, mo); + _inVarnames.add(varName); + checkIfRegisteringInputAllowed(); + } + // All CSV related methods call this ... It provides access to dimensions, nnz, file properties. 
private void registerInput(String varName, JavaPairRDD textOrCsv_rdd, String format, long rlen, long clen, long nnz, FileFormatProperties props) throws DMLRuntimeException { if(!(DMLScript.rtplatform == RUNTIME_PLATFORM.SPARK || DMLScript.rtplatform == RUNTIME_PLATFORM.HYBRID_SPARK)) { @@ -1237,8 +1256,6 @@ private synchronized MLOutput compileAndExecuteScript(String dmlScriptFilePath, if(DMLScript.rtplatform == RUNTIME_PLATFORM.SPARK || DMLScript.rtplatform == RUNTIME_PLATFORM.HYBRID_SPARK) { - Map> retVal = null; - // Depending on whether registerInput/registerOutput was called initialize the variables String[] inputs; String[] outputs; if(_inVarnames != null) { @@ -1268,10 +1285,6 @@ private synchronized MLOutput compileAndExecuteScript(String dmlScriptFilePath, for( String ovar : _outVarnames ) { if( _variables.keySet().contains(ovar) ) { - if(retVal == null) { - retVal = new HashMap>(); - } - retVal.put(ovar, ((SparkExecutionContext) ec).getBinaryBlockRDDHandleForVariable(ovar)); outMetadata.put(ovar, ec.getMatrixCharacteristics(ovar)); // For converting output to dataframe } else { @@ -1280,7 +1293,7 @@ private synchronized MLOutput compileAndExecuteScript(String dmlScriptFilePath, } } - return new MLOutput(retVal, outMetadata); + return new MLOutput(_variables, ec, outMetadata); } else { throw new DMLRuntimeException("Unsupported runtime:" + DMLScript.rtplatform.name()); diff --git a/src/main/java/org/apache/sysml/api/MLOutput.java b/src/main/java/org/apache/sysml/api/MLOutput.java index a3e601910c7..3ef68a9f151 100644 --- a/src/main/java/org/apache/sysml/api/MLOutput.java +++ b/src/main/java/org/apache/sysml/api/MLOutput.java @@ -39,13 +39,16 @@ import org.apache.spark.sql.types.StructField; import org.apache.spark.sql.types.StructType; import org.apache.sysml.runtime.DMLRuntimeException; +import org.apache.sysml.runtime.controlprogram.LocalVariableMap; +import org.apache.sysml.runtime.controlprogram.context.ExecutionContext; +import org.apache.sysml.runtime.controlprogram.context.SparkExecutionContext; import org.apache.sysml.runtime.instructions.spark.functions.GetMLBlock; import org.apache.sysml.runtime.instructions.spark.utils.RDDConverterUtilsExt; import org.apache.sysml.runtime.matrix.MatrixCharacteristics; import org.apache.sysml.runtime.matrix.data.MatrixBlock; import org.apache.sysml.runtime.matrix.data.MatrixIndexes; import org.apache.sysml.runtime.util.UtilFunctions; - +import org.apache.sysml.runtime.controlprogram.caching.MatrixObject; import scala.Tuple2; /** @@ -54,25 +57,39 @@ */ public class MLOutput { - - - Map> _outputs; + private LocalVariableMap _variables; + private ExecutionContext _ec; private Map _outMetadata = null; - public MLOutput(Map> outputs, Map outMetadata) { - this._outputs = outputs; + public MLOutput(LocalVariableMap variables, ExecutionContext ec, Map outMetadata) { + this._variables = variables; + this._ec = ec; this._outMetadata = outMetadata; } + public MatrixBlock getMatrixBlock(String varName) throws DMLRuntimeException { + if( _variables.keySet().contains(varName) ) { + MatrixObject mo = _ec.getMatrixObject(varName); + MatrixBlock mb = mo.acquireRead(); + mo.release(); + return mb; + } + else { + throw new DMLRuntimeException("Variable " + varName + " not found in the output symbol table."); + } + } + public JavaPairRDD getBinaryBlockedRDD(String varName) throws DMLRuntimeException { - if(_outputs.containsKey(varName)) { - return _outputs.get(varName); + if( _variables.keySet().contains(varName) ) { + return ((SparkExecutionContext) 
_ec).getBinaryBlockRDDHandleForVariable(varName); + } + else { + throw new DMLRuntimeException("Variable " + varName + " not found in the output symbol table."); } - throw new DMLRuntimeException("Variable " + varName + " not found in the output symbol table."); } public MatrixCharacteristics getMatrixCharacteristics(String varName) throws DMLRuntimeException { - if(_outputs.containsKey(varName)) { + if(_outMetadata.containsKey(varName)) { return _outMetadata.get(varName); } throw new DMLRuntimeException("Variable " + varName + " not found in the output symbol table."); @@ -246,7 +263,7 @@ public Iterable>> call(Tuple2>(startRowIndex + i, new Tuple2(kv._1.getColumnIndex(), partialRow))); + retVal.add(new Tuple2>(startRowIndex + i + 1, new Tuple2(kv._1.getColumnIndex(), partialRow))); } return retVal; } diff --git a/src/main/java/org/apache/sysml/api/python/SystemML.py b/src/main/java/org/apache/sysml/api/python/SystemML.py index 8ad3117db65..3dfef67eb17 100644 --- a/src/main/java/org/apache/sysml/api/python/SystemML.py +++ b/src/main/java/org/apache/sysml/api/python/SystemML.py @@ -25,7 +25,13 @@ import os from pyspark.sql import DataFrame, SQLContext from pyspark.rdd import RDD - +import numpy as np +import pandas as pd +import sklearn as sk +from pyspark.ml.feature import VectorAssembler +from pyspark.mllib.linalg import Vectors +import sys +from pyspark.ml import Estimator, Model class MLContext(object): @@ -57,6 +63,7 @@ def __init__(self, sc, *args): setForcedSparkExecType = (args[1] if len(args) > 1 else False) self.sc = sc self.ml = sc._jvm.org.apache.sysml.api.MLContext(sc._jsc, monitorPerformance, setForcedSparkExecType) + self.sqlCtx = SQLContext(sc) except Py4JError: traceback.print_exc() @@ -171,7 +178,6 @@ def registerInput(self, varName, src, *args): else: raise TypeError('Arguments do not match MLContext-API') except Py4JJavaError: - traceback.print_exc() def registerOutput(self, varName): @@ -232,6 +238,10 @@ def getDF(self, sqlContext, varName): except Py4JJavaError: traceback.print_exc() + def getPandasDF(self, sqlContext, varName): + df = self.toDF(sqlContext, varName).sort('ID').drop('ID') + return df.toPandas() + def getMLMatrix(self, sqlContext, varName): raise Exception('Not supported in Python MLContext') #try: @@ -247,3 +257,163 @@ def getStringRDD(self, varName, format): # return rdd #except Py4JJavaError: # traceback.print_exc() + +def getNumCols(numPyArr): + if len(numPyArr.shape) == 1: + return 1 + else: + return numPyArr.shape[1] + +def convertToJavaMatrix(sc, src): + if isinstance(src, np.ndarray): + from array import array + numCols = getNumCols(src) + numRows = src.shape[0] + if src.dtype.type is np.float64: + arr = src.reshape(-1) + else: + arr = array('d', src.reshape(-1)) + buf = bytearray(arr.tostring()) + return sc._jvm.org.apache.sysml.runtime.instructions.spark.utils.RDDConverterUtilsExt.convertPy4JArrayToMB(buf, numRows, numCols) + else: + raise Exception('Type is not supported') + +def convertToNumpyArr(sc, mb): + numRows = mb.getNumRows() + numCols = mb.getNumColumns() + buf = sc._jvm.org.apache.sysml.runtime.instructions.spark.utils.RDDConverterUtilsExt.convertMBtoPy4JDenseArr(mb) + return np.frombuffer(buf, count=numRows*numCols, dtype=np.float64) + +class mllearn: + # Or we can create new Python project with package structure + class LogisticRegression(Estimator): + + def __init__(self, sqlCtx, penalty='l2', fit_intercept=True, max_iter=100, max_inner_iter=0, tol=0.000001, C=1.0, solver='newton-cg', transferUsingDF=False): + self.sqlCtx = 
sqlCtx + self.sc = sqlCtx._sc + self.log = self.sc._jvm.org.apache.sysml.api.ml.LogisticRegression("lr", self.sc._jsc.sc()) + self.transferUsingDF = transferUsingDF + if penalty != 'l2': + raise Exception('Only l2 penalty is supported') + if fit_intercept: + self.icpt = 1 + else: + self.icpt = 0 + self.max_iter = max_iter + self.max_inner_iter = max_inner_iter + self.tol = tol + if C == 0: + raise Exception('C cannot be 0') + reg = 1/C + self.reg = reg + self.updateLog() + if solver != 'newton-cg': + raise Exception('Only newton-cg solver supported') + + def updateLog(self): + self.log.setMaxOuterIter(self.max_iter) + self.log.setMaxInnerIter(self.max_inner_iter) + self.log.setRegParam(self.reg) + self.log.setTol(self.tol) + self.log.setIcpt(self.icpt) + + def convertToPDF(self, X): + if isinstance(X, np.ndarray): + colNames = [] + numCols = getNumCols(X) + for i in range(0, numCols): + colNames = colNames + [ str('C' + str(i))] + pdfX = pd.DataFrame(X, columns=colNames) + elif isinstance(X, pd.core.frame.DataFrame): + pdfX = X + else: + raise Exception('The input type not supported') + return pdfX + + def tolist(self, inputCols): + if isinstance(inputCols, pd.indexes.base.Index): + return inputCols.get_values().tolist() + elif isinstance(inputCols, list): + return inputCols + else: + raise Exception('inputCols should be of type pandas.indexes.base.Index or list') + + def assemble(self, pdf, inputCols, outputCol): + tmpDF = self.sqlCtx.createDataFrame(pdf, self.tolist(pdf.columns)) + assembler = VectorAssembler(inputCols=self.tolist(inputCols), outputCol=outputCol) + return assembler.transform(tmpDF) + + def _fit(self, X): + if hasattr(X, '_jdf') and 'features' in X.columns and 'label' in X.columns: + self.model = self.log.fit(X._jdf) + return self + else: + raise Exception('Incorrect usage: Expected dataframe as input with features/label as columns') + + # TOOD: Ignoring kwargs + def fit(self, X, *args, **kwargs): + self.updateLog() + numArgs = len(args) + 1 + if numArgs == 1: + return self._fit(X) + elif numArgs == 2 and (isinstance(X, np.ndarray) or isinstance(X, pd.core.frame.DataFrame)): + y = args[0] + if self.transferUsingDF: + pdfX = self.convertToPDF(X) + pdfY = self.convertToPDF(y) + if getNumCols(pdfY) != 1: + raise Exception('y should be a column vector') + if pdfX.shape[0] != pdfY.shape[0]: + raise Exception('Number of rows of X and y should match') + colNames = pdfX.columns + pdfX['label'] = pdfY[pdfY.columns[0]] + df = self.assemble(pdfX, colNames, 'features').select('features', 'label') + self.model = self.log.fit(df._jdf) + else: + numColsy = getNumCols(y) + if numColsy != 1: + raise Exception('Expected y to be a column vector') + self.model = self.log.fit(convertToJavaMatrix(self.sc, X), convertToJavaMatrix(self.sc, y)) + self.model.setOutputRawPredictions(False) + return self + else: + raise Exception('Unsupported input type') + + def transform(self, X): + return self.predict(X) + + def predict(self, X): + if isinstance(X, np.ndarray) or isinstance(X, pd.core.frame.DataFrame): + if self.transferUsingDF: + pdfX = self.convertToPDF(X) + df = self.assemble(pdfX, pdfX.columns, 'features').select('features') + retjDF = self.model.transform(df._jdf) + retDF = DataFrame(retjDF, self.sqlCtx) + retPDF = retDF.sort('ID').select('prediction').toPandas() + if isinstance(X, np.ndarray): + return retPDF.as_matrix().flatten() + else: + return retPDF + else: + retNumPy = convertToNumpyArr(self.sc, self.model.transform(convertToJavaMatrix(self.sc, X))) + if isinstance(X, np.ndarray): + 
return retNumPy + else: + return retNumPy # TODO: Convert to Pandas + elif hasattr(X, '_jdf'): + if 'features' in X.columns: + # No need to assemble as input DF is likely coming via MLPipeline + df = X + else: + assembler = VectorAssembler(inputCols=X.columns, outputCol='features') + df = assembler.transform(X) + retjDF = self.model.transform(df._jdf) + retDF = DataFrame(retjDF, self.sqlCtx) + # Return DF + return retDF.sort('ID') + else: + raise Exception('Unsupported input type') + + def score(self, X, y): + return sk.metrics.accuracy_score(y, self.predict(X)) + \ No newline at end of file diff --git a/src/main/java/org/apache/sysml/runtime/instructions/spark/utils/RDDConverterUtilsExt.java b/src/main/java/org/apache/sysml/runtime/instructions/spark/utils/RDDConverterUtilsExt.java index f022e40b947..114e78fc3d3 100644 --- a/src/main/java/org/apache/sysml/runtime/instructions/spark/utils/RDDConverterUtilsExt.java +++ b/src/main/java/org/apache/sysml/runtime/instructions/spark/utils/RDDConverterUtilsExt.java @@ -46,6 +46,8 @@ import org.apache.spark.sql.types.DataTypes; import org.apache.spark.sql.types.StructField; import org.apache.spark.sql.types.StructType; +import java.nio.ByteBuffer; +import java.nio.ByteOrder; import scala.Tuple2; @@ -260,6 +262,47 @@ public static JavaPairRDD dataFrameToBinaryBlock(Jav return dataFrameToBinaryBlock(sc, df, mcOut, false, columns); } + public static MatrixBlock convertPy4JArrayToMB(byte [] data, int rlen, int clen) throws DMLRuntimeException { + return convertPy4JArrayToMB(data, rlen, clen, false); + } + + public static MatrixBlock convertPy4JArrayToMB(byte [] data, int rlen, int clen, boolean isSparse) throws DMLRuntimeException { + MatrixBlock mb = new MatrixBlock(rlen, clen, isSparse, -1); + if(isSparse) { + throw new DMLRuntimeException("Convertion to sparse format not supported"); + } + else { + double [] denseBlock = new double[rlen*clen]; + ByteBuffer buf = ByteBuffer.wrap(data); + buf.order(ByteOrder.nativeOrder()); + for(int i = 0; i < rlen*clen; i++) { + denseBlock[i] = buf.getDouble(); + } + mb.init( denseBlock, rlen, clen ); + } + mb.examSparsity(); + return mb; + } + + public static byte [] convertMBtoPy4JDenseArr(MatrixBlock mb) throws DMLRuntimeException { + byte [] ret = null; + if(mb.isInSparseFormat()) { + throw new DMLRuntimeException("Sparse to dense conversion is not yet implemented"); + } + else { + double [] denseBlock = mb.getDenseBlock(); + if(denseBlock == null) { + throw new DMLRuntimeException("Sparse to dense conversion is not yet implemented"); + } + int times = Double.SIZE / Byte.SIZE; + ret = new byte[denseBlock.length * times]; + for(int i=0;i < denseBlock.length;i++){ + ByteBuffer.wrap(ret, i*times, times).order(ByteOrder.nativeOrder()).putDouble(denseBlock[i]); + } + } + return ret; + } + /** * Converts DataFrame into binary blocked RDD. * Note: mcOut will be set if you don't know the dimensions. 
diff --git a/src/main/scala/org/apache/sysml/api/ml/LogisticRegression.scala b/src/main/scala/org/apache/sysml/api/ml/LogisticRegression.scala index 2fabde1b14a..c679ff6d212 100644 --- a/src/main/scala/org/apache/sysml/api/ml/LogisticRegression.scala +++ b/src/main/scala/org/apache/sysml/api/ml/LogisticRegression.scala @@ -34,6 +34,10 @@ import org.apache.spark.SparkConf import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.regression.LabeledPoint import scala.reflect.ClassTag +import scala.collection.immutable.HashMap +import org.apache.spark.sql.functions.udf +import org.apache.sysml.runtime.matrix.data.MatrixBlock +import org.apache.sysml.runtime.DMLRuntimeException trait HasIcpt extends Params { final val icpt: Param[Int] = new Param[Int](this, "icpt", "Intercept presence, shifting and rescaling X columns") @@ -81,12 +85,62 @@ class LogisticRegression(override val uid: String, val sc: SparkContext) extends copyValues(that, extra) } override def transformSchema(schema: StructType): StructType = schema + + // Note: will update the y_mb as this will be called by Python mllearn + def fit(X_mb: MatrixBlock, y_mb: MatrixBlock): LogisticRegressionModel = { + val ml = new MLContext(sc) + val labelMapping = new java.util.HashMap[String, Int] + val revLabelMapping = new java.util.HashMap[Int, String] + if(y_mb.getNumColumns != 1) { + throw new RuntimeException("Expected a column vector for y") + } + if(y_mb.isInSparseFormat()) { + throw new DMLRuntimeException("Sparse block is not implemented for fit") + } + else { + val denseBlock = y_mb.getDenseBlock() + var id:Int = 1 + for(i <- 0 until denseBlock.length) { + val v = denseBlock(i).toString() + if(!labelMapping.containsKey(v)) { + labelMapping.put(v, id) + revLabelMapping.put(id, v) + id += 1 + } + denseBlock.update(i, labelMapping.get(v)) + } + } + + val mloutput = { + val paramsMap: Map[String, String] = Map( + "icpt" -> this.getIcpt.toString(), + "reg" -> this.getRegParam.toString(), + "tol" -> this.getTol.toString, + "moi" -> this.getMaxOuterIte.toString, + "mii" -> this.getMaxInnerIter.toString, + + "X" -> " ", + "Y" -> " ", + "B" -> " ") + ml.registerInput("X", X_mb); + ml.registerInput("Y_vec", y_mb); + ml.registerOutput("B_out"); + ml.executeScript(ScriptsUtils.getDMLScript(LogisticRegression.scriptPath), paramsMap) + } + new LogisticRegressionModel("logisticRegression")(mloutput, revLabelMapping, sc) + } override def fit(df: DataFrame): LogisticRegressionModel = { val ml = new MLContext(df.rdd.sparkContext) val mcXin = new MatrixCharacteristics() val Xin = RDDConverterUtils.vectorDataFrameToBinaryBlock(sc, df, mcXin, false, "features") - val yin = df.select("label").rdd.map { _.apply(0).toString() } - + val temp = df.select("label").distinct.rdd.map(_.apply(0).toString).collect() + val labelMapping = new java.util.HashMap[String, Int] + val revLabelMapping = new java.util.HashMap[Int, String] + for(i <- 0 until temp.length) { + labelMapping.put(temp(i), i+1) + revLabelMapping.put(i+1, temp(i)) + } + val yin = df.select("label").rdd.map( x => labelMapping.get(x.apply(0).toString).toString ) val mloutput = { val paramsMap: Map[String, String] = Map( "icpt" -> this.getIcpt.toString(), @@ -102,67 +156,151 @@ class LogisticRegression(override val uid: String, val sc: SparkContext) extends ml.registerInput("Y_vec", yin, "csv"); ml.registerOutput("B_out"); ml.executeScript(ScriptsUtils.getDMLScript(LogisticRegression.scriptPath), paramsMap) - //ml.execute(ScriptsUtils.resolvePath(LogisticRegression.scriptPath), 
paramsMap) } - new LogisticRegressionModel("logisticRegression")(mloutput) + new LogisticRegressionModel("logisticRegression")(mloutput, revLabelMapping, sc) } } object LogisticRegressionModel { final val scriptPath = "scripts" + File.separator + "algorithms" + File.separator + "GLM-predict.dml" } +class LogisticRegressionModelSerializableData(val labelMapping: java.util.HashMap[Int, String]) extends Serializable { + def mapLabelStr(x:Double):String = { + if(labelMapping.containsKey(x.toInt)) + labelMapping.get(x.toInt) + else + throw new RuntimeException("Incorrect label mapping") + } + def mapLabelDouble(x:Double):Double = { + if(labelMapping.containsKey(x.toInt)) + labelMapping.get(x.toInt).toDouble + else + throw new RuntimeException("Incorrect label mapping") + } + val mapLabel_udf = { + try { + val it = labelMapping.values().iterator() + while(it.hasNext()) { + it.next().toDouble + } + udf(mapLabelDouble _) + } catch { + case e: Exception => udf(mapLabelStr _) + } + } +} /** * Logistic Regression Scala API */ class LogisticRegressionModel( override val uid: String)( - val mloutput: MLOutput) extends Model[LogisticRegressionModel] with HasIcpt + val mloutput: MLOutput, val labelMapping: java.util.HashMap[Int, String], val sc: SparkContext) extends Model[LogisticRegressionModel] with HasIcpt with HasRegParam with HasTol with HasMaxOuterIter with HasMaxInnerIter { override def copy(extra: ParamMap): LogisticRegressionModel = { - val that = new LogisticRegressionModel(uid)(mloutput) + val that = new LogisticRegressionModel(uid)(mloutput, labelMapping, sc) copyValues(that, extra) } + var outputRawPredictions = true + def setOutputRawPredictions(outRawPred:Boolean): Unit = { outputRawPredictions = outRawPred } override def transformSchema(schema: StructType): StructType = schema - override def transform(df: DataFrame): DataFrame = { - val ml = new MLContext(df.rdd.sparkContext) - - val mcXin = new MatrixCharacteristics() - val Xin = RDDConverterUtils.vectorDataFrameToBinaryBlock(df.rdd.sparkContext, df, mcXin, false, "features") - - val mlscoreoutput = { - val paramsMap: Map[String, String] = Map( + + def transform(X: MatrixBlock): MatrixBlock = { + if(outputRawPredictions) { + throw new RuntimeException("Outputting raw prediction is not supported") + } + else { + val isSingleNode = true + val ret = computePredictedLabels(doGLMPredict(isSingleNode, null, X), isSingleNode).getMatrixBlock("Prediction"); + if(ret.getNumColumns != 1) { + throw new RuntimeException("Expected predicted label to be a column vector") + } + if(ret.isInSparseFormat()) { + throw new RuntimeException("Since predicted label is a column vector, expected it to be in dense format") + } + else { + updateLabels(true, null, ret, null) + } + return ret + } + } + + def updateLabels(isSingleNode:Boolean, df:DataFrame, X: MatrixBlock, labelColName:String): DataFrame = { + if(isSingleNode) { + for(i <- 0 until X.getNumRows) { + val v:Int = X.getValue(i, 0).toInt + if(labelMapping.containsKey(v)) { + X.setValue(i, 0, labelMapping.get(v).toDouble) + } + else { + throw new RuntimeException("No mapping found for " + v + " in " + labelMapping.toString()) + } + } + return null + } + else { + val serObj = new LogisticRegressionModelSerializableData(labelMapping) + return df.withColumn(labelColName, serObj.mapLabel_udf(df(labelColName))) + .withColumnRenamed(labelColName, "prediction") + } + } + + def doGLMPredict(isSingleNode:Boolean, df:DataFrame, X: MatrixBlock): MLOutput = { + val ml = new MLContext(sc) + val paramsMap: Map[String, 
String] = Map( "X" -> " ", - "B" -> " ") - ml.registerInput("X", Xin, mcXin); - ml.registerInput("B_full", mloutput.getBinaryBlockedRDD("B_out"), mloutput.getMatrixCharacteristics("B_out")); + "B" -> " ", + "dfam" -> "3") + if(isSingleNode) { + ml.registerInput("X", X); + ml.registerInput("B_full", mloutput.getMatrixBlock("B_out"), mloutput.getMatrixCharacteristics("B_out")); + } + else { + val mcXin = new MatrixCharacteristics() + val Xin = RDDConverterUtils.vectorDataFrameToBinaryBlock(df.rdd.sparkContext, df, mcXin, false, "features") + ml.registerInput("X", Xin, mcXin); + ml.registerInput("B_full", mloutput.getBinaryBlockedRDD("B_out"), mloutput.getMatrixCharacteristics("B_out")); + } ml.registerOutput("means"); ml.executeScript(ScriptsUtils.getDMLScript(LogisticRegressionModel.scriptPath), paramsMap) + } + + def computePredictedLabels(mlscoreoutput:MLOutput, isSingleNode:Boolean): MLOutput = { + val mlNew = new MLContext(sc) + if(isSingleNode) { + mlNew.registerInput("Prob", mlscoreoutput.getMatrixBlock("means"), mlscoreoutput.getMatrixCharacteristics("means")); + } + else { + mlNew.registerInput("Prob", mlscoreoutput.getBinaryBlockedRDD("means"), mlscoreoutput.getMatrixCharacteristics("means")); } + mlNew.registerOutput("Prediction") + mlNew.executeScript( + """ + Prob = read("temp1"); + Prediction = rowIndexMax(Prob); # assuming one-based label mapping + write(Prediction, "tempOut", "csv"); + """) + } + + def joinUsingID(df1:DataFrame, df2:DataFrame):DataFrame = { + val tempDF1 = df1.withColumnRenamed("ID", "ID1") + tempDF1.join(df2, tempDF1.col("ID1").equalTo(df2.col("ID"))).drop("ID1") + } + + override def transform(df: DataFrame): DataFrame = { + val ml = new MLContext(df.rdd.sparkContext) - val prob = mlscoreoutput.getDF(df.sqlContext, "means", true).withColumnRenamed("C1", "probability") - - val mlNew = new MLContext(df.rdd.sparkContext) - mlNew.registerInput("X", Xin, mcXin); - mlNew.registerInput("B_full", mloutput.getBinaryBlockedRDD("B_out"), mloutput.getMatrixCharacteristics("B_out")); - mlNew.registerInput("Prob", mlscoreoutput.getBinaryBlockedRDD("means"), mlscoreoutput.getMatrixCharacteristics("means")); - mlNew.registerOutput("Prediction"); - mlNew.registerOutput("rawPred"); - - val outNew = mlNew.executeScript("Prob = read(\"temp1\"); " - + "Prediction = rowIndexMax(Prob); " - + "write(Prediction, \"tempOut\", \"csv\")" - + "X = read(\"temp2\");" - + "B_full = read(\"temp3\");" - + "rawPred = 1 / (1 + exp(- X * t(B_full)) );" // Raw prediction logic: - + "write(rawPred, \"tempOut1\", \"csv\")"); - - val pred = outNew.getDF(df.sqlContext, "Prediction").withColumnRenamed("C1", "prediction").withColumnRenamed("ID", "ID1") - val rawPred = outNew.getDF(df.sqlContext, "rawPred", true).withColumnRenamed("C1", "rawPrediction").withColumnRenamed("ID", "ID2") - var predictionsNProb = prob.join(pred, prob.col("ID").equalTo(pred.col("ID1"))).select("ID", "probability", "prediction") - predictionsNProb = predictionsNProb.join(rawPred, predictionsNProb.col("ID").equalTo(rawPred.col("ID2"))).select("ID", "probability", "prediction", "rawPrediction") - val dataset1 = RDDConverterUtils.addIDToDataFrame(df, df.sqlContext, "ID") - dataset1.join(predictionsNProb, dataset1.col("ID").equalTo(predictionsNProb.col("ID"))) + val isSingleNode = false + val glmPredOut = doGLMPredict(isSingleNode, df, null) + val predLabelOut = computePredictedLabels(glmPredOut, isSingleNode) + val predictedDF = updateLabels(isSingleNode, predLabelOut.getDF(df.sqlContext, "Prediction"), null, "C1").select("ID", 
"prediction") + val prob = glmPredOut.getDF(df.sqlContext, "means", true).withColumnRenamed("C1", "probability").select("ID", "probability") + val dataset = RDDConverterUtils.addIDToDataFrame(df, df.sqlContext, "ID") + + if(outputRawPredictions) { + // Not supported: rawPred = 1 / (1 + exp(- X * t(B_full)) ); + } + return joinUsingID(dataset, joinUsingID(prob, predictedDF)) } } From c4ab1e6ebde04687e4b47cae64f45e7b4dc6511f Mon Sep 17 00:00:00 2001 From: Niketan Pansare Date: Fri, 5 Aug 2016 15:38:01 -0700 Subject: [PATCH 02/14] [SYSTEMML-234] [SYSTEMML-208] Updated the code as per Manoj's suggestion --- .../org/apache/sysml/api/python/SystemML.py | 21 +++++++------------ 1 file changed, 8 insertions(+), 13 deletions(-) diff --git a/src/main/java/org/apache/sysml/api/python/SystemML.py b/src/main/java/org/apache/sysml/api/python/SystemML.py index 3dfef67eb17..753527c8292 100644 --- a/src/main/java/org/apache/sysml/api/python/SystemML.py +++ b/src/main/java/org/apache/sysml/api/python/SystemML.py @@ -259,24 +259,19 @@ def getStringRDD(self, varName, format): # traceback.print_exc() def getNumCols(numPyArr): - if len(numPyArr.shape) == 1: + if numPyArr.ndim == 1: return 1 else: return numPyArr.shape[1] def convertToJavaMatrix(sc, src): - if isinstance(src, np.ndarray): - from array import array - numCols = getNumCols(src) - numRows = src.shape[0] - if src.dtype.type is np.float64: - arr = src.reshape(-1) - else: - arr = array('d', src.reshape(-1)) - buf = bytearray(arr.tostring()) - return sc._jvm.org.apache.sysml.runtime.instructions.spark.utils.RDDConverterUtilsExt.convertPy4JArrayToMB(buf, numRows, numCols) - else: - raise Exception('Type is not supported') + src = np.asarray(src) + numCols = getNumCols(src) + numRows = src.shape[0] + arr = src.ravel().astype(np.float64) + buf = bytearray(arr.tostring()) + return sc._jvm.org.apache.sysml.runtime.instructions.spark.utils.RDDConverterUtilsExt.convertPy4JArrayToMB(buf, numRows, numCols) + def convertToNumpyArr(sc, mb): numRows = mb.getNumRows() From 5f8c532742816e717d8951a8a8b298b8fb41dc31 Mon Sep 17 00:00:00 2001 From: Niketan Pansare Date: Fri, 5 Aug 2016 15:55:38 -0700 Subject: [PATCH 03/14] [SYSTEMML-234] [SYSTEMML-208] Minor code refactoring --- .../org/apache/sysml/api/python/SystemML.py | 60 ++++++++++--------- 1 file changed, 32 insertions(+), 28 deletions(-) diff --git a/src/main/java/org/apache/sysml/api/python/SystemML.py b/src/main/java/org/apache/sysml/api/python/SystemML.py index 753527c8292..bba4db8dfad 100644 --- a/src/main/java/org/apache/sysml/api/python/SystemML.py +++ b/src/main/java/org/apache/sysml/api/python/SystemML.py @@ -23,6 +23,7 @@ from py4j.protocol import Py4JJavaError, Py4JError import traceback import os +from pyspark.context import SparkContext from pyspark.sql import DataFrame, SQLContext from pyspark.rdd import RDD import numpy as np @@ -259,25 +260,31 @@ def getStringRDD(self, varName, format): # traceback.print_exc() def getNumCols(numPyArr): - if numPyArr.ndim == 1: - return 1 - else: - return numPyArr.shape[1] + if numPyArr.ndim == 1: + return 1 + else: + return numPyArr.shape[1] -def convertToJavaMatrix(sc, src): - src = np.asarray(src) - numCols = getNumCols(src) - numRows = src.shape[0] - arr = src.ravel().astype(np.float64) - buf = bytearray(arr.tostring()) - return sc._jvm.org.apache.sysml.runtime.instructions.spark.utils.RDDConverterUtilsExt.convertPy4JArrayToMB(buf, numRows, numCols) - +def convertToMatrixBlock(sc, src): + if isinstance(sc, SparkContext): + src = np.asarray(src) + numCols = 
getNumCols(src) + numRows = src.shape[0] + arr = src.ravel().astype(np.float64) + buf = bytearray(arr.tostring()) + return sc._jvm.org.apache.sysml.runtime.instructions.spark.utils.RDDConverterUtilsExt.convertPy4JArrayToMB(buf, numRows, numCols) + else: + raise TypeError('sc needs to be of type SparkContext') # TODO: We can generalize this by creating py4j gateway ourselves + def convertToNumpyArr(sc, mb): - numRows = mb.getNumRows() - numCols = mb.getNumColumns() - buf = sc._jvm.org.apache.sysml.runtime.instructions.spark.utils.RDDConverterUtilsExt.convertMBtoPy4JDenseArr(mb) - return np.frombuffer(buf, count=numRows*numCols, dtype=np.float64) + if isinstance(sc, SparkContext): + numRows = mb.getNumRows() + numCols = mb.getNumColumns() + buf = sc._jvm.org.apache.sysml.runtime.instructions.spark.utils.RDDConverterUtilsExt.convertMBtoPy4JDenseArr(mb) + return np.frombuffer(buf, count=numRows*numCols, dtype=np.float64) + else: + raise TypeError('sc needs to be of type SparkContext') # TODO: We can generalize this by creating py4j gateway ourselves class mllearn: # Or we can create new Python project with package structure @@ -290,15 +297,12 @@ def __init__(self, sqlCtx, penalty='l2', fit_intercept=True, max_iter=100, max_i self.transferUsingDF = transferUsingDF if penalty != 'l2': raise Exception('Only l2 penalty is supported') - if fit_intercept: - self.icpt = 1 - else: - self.icpt = 0 + self.icpt = int(fit_intercept) self.max_iter = max_iter self.max_inner_iter = max_inner_iter self.tol = tol - if C == 0: - raise Exception('C cannot be 0') + if C < 0: + raise Exception('C has to be positive') reg = 1/C self.reg = reg self.updateLog() @@ -312,7 +316,7 @@ def updateLog(self): self.log.setTol(self.tol) self.log.setIcpt(self.icpt) - def convertToPDF(self, X): + def convertToPandasDF(self, X): if isinstance(X, np.ndarray): colNames = [] numCols = getNumCols(X) @@ -354,8 +358,8 @@ def fit(self, X, *args, **kwargs): elif numArgs == 2 and (isinstance(X, np.ndarray) or isinstance(X, pd.core.frame.DataFrame)): y = args[0] if self.transferUsingDF: - pdfX = self.convertToPDF(X) - pdfY = self.convertToPDF(y) + pdfX = self.convertToPandasDF(X) + pdfY = self.convertToPandasDF(y) if getNumCols(pdfY) != 1: raise Exception('y should be a column vector') if pdfX.shape[0] != pdfY.shape[0]: @@ -368,7 +372,7 @@ def fit(self, X, *args, **kwargs): numColsy = getNumCols(y) if numColsy != 1: raise Exception('Expected y to be a column vector') - self.model = self.log.fit(convertToJavaMatrix(self.sc, X), convertToJavaMatrix(self.sc, y)) + self.model = self.log.fit(convertToMatrixBlock(self.sc, X), convertToMatrixBlock(self.sc, y)) self.model.setOutputRawPredictions(False) return self else: @@ -380,7 +384,7 @@ def transform(self, X): def predict(self, X): if isinstance(X, np.ndarray) or isinstance(X, pd.core.frame.DataFrame): if self.transferUsingDF: - pdfX = self.convertToPDF(X) + pdfX = self.convertToPandasDF(X) df = self.assemble(pdfX, pdfX.columns, 'features').select('features') retjDF = self.model.transform(df._jdf) retDF = DataFrame(retjDF, self.sqlCtx) @@ -390,7 +394,7 @@ def predict(self, X): else: return retPDF else: - retNumPy = convertToNumpyArr(self.sc, self.model.transform(convertToJavaMatrix(self.sc, X))) + retNumPy = convertToNumpyArr(self.sc, self.model.transform(convertToMatrixBlock(self.sc, X))) if isinstance(X, np.ndarray): return retNumPy else: From f223a0aadaffc2f78fd45444950025dbef1d0e38 Mon Sep 17 00:00:00 2001 From: Niketan Pansare Date: Fri, 5 Aug 2016 15:57:34 -0700 Subject: [PATCH 04/14] 
[SYSTEMML-234] [SYSTEMML-208] Taking care of division issue --- src/main/java/org/apache/sysml/api/python/SystemML.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/main/java/org/apache/sysml/api/python/SystemML.py b/src/main/java/org/apache/sysml/api/python/SystemML.py index bba4db8dfad..3f3f453afd4 100644 --- a/src/main/java/org/apache/sysml/api/python/SystemML.py +++ b/src/main/java/org/apache/sysml/api/python/SystemML.py @@ -33,6 +33,7 @@ from pyspark.mllib.linalg import Vectors import sys from pyspark.ml import Estimator, Model +from __future__ import division class MLContext(object): @@ -303,7 +304,7 @@ def __init__(self, sqlCtx, penalty='l2', fit_intercept=True, max_iter=100, max_i self.tol = tol if C < 0: raise Exception('C has to be positive') - reg = 1/C + reg = 1.0 / C self.reg = reg self.updateLog() if solver != 'newton-cg': From e7371aa0d617a25e94829ddfcc2e5dc11330bce4 Mon Sep 17 00:00:00 2001 From: Niketan Pansare Date: Fri, 5 Aug 2016 16:15:17 -0700 Subject: [PATCH 05/14] [SYSTEMML-234] [SYSTEMML-208] Avoid divide by zero --- src/main/java/org/apache/sysml/api/python/SystemML.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/main/java/org/apache/sysml/api/python/SystemML.py b/src/main/java/org/apache/sysml/api/python/SystemML.py index 3f3f453afd4..fca3f1e8cc4 100644 --- a/src/main/java/org/apache/sysml/api/python/SystemML.py +++ b/src/main/java/org/apache/sysml/api/python/SystemML.py @@ -302,7 +302,7 @@ def __init__(self, sqlCtx, penalty='l2', fit_intercept=True, max_iter=100, max_i self.max_iter = max_iter self.max_inner_iter = max_inner_iter self.tol = tol - if C < 0: + if C <= 0: raise Exception('C has to be positive') reg = 1.0 / C self.reg = reg From 9fff023802701cbf45b57bf2349a319bc68d0280 Mon Sep 17 00:00:00 2001 From: Niketan Pansare Date: Fri, 5 Aug 2016 17:01:12 -0700 Subject: [PATCH 06/14] [SYSTEMML-234] [SYSTEMML-208] Refactored convertToPandas --- .../org/apache/sysml/api/python/SystemML.py | 26 ++++++------------- 1 file changed, 8 insertions(+), 18 deletions(-) diff --git a/src/main/java/org/apache/sysml/api/python/SystemML.py b/src/main/java/org/apache/sysml/api/python/SystemML.py index fca3f1e8cc4..d2051aac8d7 100644 --- a/src/main/java/org/apache/sysml/api/python/SystemML.py +++ b/src/main/java/org/apache/sysml/api/python/SystemML.py @@ -20,6 +20,7 @@ # #------------------------------------------------------------- +from __future__ import division from py4j.protocol import Py4JJavaError, Py4JError import traceback import os @@ -33,7 +34,7 @@ from pyspark.mllib.linalg import Vectors import sys from pyspark.ml import Estimator, Model -from __future__ import division + class MLContext(object): @@ -318,17 +319,9 @@ def updateLog(self): self.log.setIcpt(self.icpt) def convertToPandasDF(self, X): - if isinstance(X, np.ndarray): - colNames = [] - numCols = getNumCols(X) - for i in range(0, numCols): - colNames = colNames + [ str('C' + str(i))] - pdfX = pd.DataFrame(X, columns=colNames) - elif isinstance(X, pd.core.frame.DataFrame): - pdfX = X - else: - raise Exception('The input type not supported') - return pdfX + if not instance(X, pd.DataFrame): + return pd.DataFrame(X, columns=['C' + str(i) for i in range(numCols)]) + return X def tolist(self, inputCols): if isinstance(inputCols, pd.indexes.base.Index): @@ -350,14 +343,11 @@ def _fit(self, X): else: raise Exception('Incorrect usage: Expected dataframe as input with features/label as columns') - # TOOD: Ignoring kwargs - def fit(self, X, *args, **kwargs): + def 
fit(self, X, y=None, params=None): self.updateLog() - numArgs = len(args) + 1 - if numArgs == 1: + if y is None: return self._fit(X) - elif numArgs == 2 and (isinstance(X, np.ndarray) or isinstance(X, pd.core.frame.DataFrame)): - y = args[0] + elif y is not None and (isinstance(X, np.ndarray) or isinstance(X, pd.core.frame.DataFrame)): if self.transferUsingDF: pdfX = self.convertToPandasDF(X) pdfY = self.convertToPandasDF(y) From 41f1668b0280b23cf4b0d5dc5e8c92d9922ea9c3 Mon Sep 17 00:00:00 2001 From: Niketan Pansare Date: Sat, 6 Aug 2016 18:33:23 -0700 Subject: [PATCH 07/14] [SYSTEMML-234] [SYSTEMML-208] Fixed bugs in MLContext and added LinearRegression Only scikit learn way of usage tested --- .../java/org/apache/sysml/api/MLContext.java | 96 +++++------- .../org/apache/sysml/api/python/SystemML.py | 132 +++++++++------- .../sysml/api/ml/LinearRegression.scala | 146 ++++++++++++++++++ .../sysml/api/ml/LogisticRegression.scala | 88 +++-------- .../apache/sysml/api/ml/PredictionUtils.scala | 69 +++++++++ 5 files changed, 351 insertions(+), 180 deletions(-) create mode 100644 src/main/scala/org/apache/sysml/api/ml/LinearRegression.scala create mode 100644 src/main/scala/org/apache/sysml/api/ml/PredictionUtils.scala diff --git a/src/main/java/org/apache/sysml/api/MLContext.java b/src/main/java/org/apache/sysml/api/MLContext.java index 32b0ce949e5..32dc544f8e2 100644 --- a/src/main/java/org/apache/sysml/api/MLContext.java +++ b/src/main/java/org/apache/sysml/api/MLContext.java @@ -1241,74 +1241,56 @@ private MLOutput compileAndExecuteScript(String dmlScriptFilePath, String [] arg * @throws ParseException */ private synchronized MLOutput compileAndExecuteScript(String dmlScriptFilePath, String [] args, boolean isFile, boolean isNamedArgument, boolean isPyDML, String configFilePath) throws IOException, DMLException { - try { - if(getActiveMLContext() != null) { - throw new DMLRuntimeException("SystemML (and hence by definition MLContext) doesnot support parallel execute() calls from same or different MLContexts. " - + "As a temporary fix, please do explicit synchronization, i.e. synchronized(MLContext.class) { ml.execute(...) } "); + // Set active MLContext. + _activeMLContext = this; + + if(_monitorUtils != null) { + _monitorUtils.resetMonitoringData(); + } + + if(DMLScript.rtplatform == RUNTIME_PLATFORM.SPARK || DMLScript.rtplatform == RUNTIME_PLATFORM.HYBRID_SPARK) { + + // Depending on whether registerInput/registerOutput was called initialize the variables + String[] inputs; String[] outputs; + if(_inVarnames != null) { + inputs = _inVarnames.toArray(new String[0]); + } + else { + inputs = new String[0]; + } + if(_outVarnames != null) { + outputs = _outVarnames.toArray(new String[0]); + } + else { + outputs = new String[0]; } + Map outMetadata = new HashMap(); - // Set active MLContext. 
- _activeMLContext = this; + Map argVals = DMLScript.createArgumentsMap(isNamedArgument, args); - if(_monitorUtils != null) { - _monitorUtils.resetMonitoringData(); - } + // Run the DML script + ExecutionContext ec = executeUsingSimplifiedCompilationChain(dmlScriptFilePath, isFile, argVals, isPyDML, inputs, outputs, _variables, configFilePath); - if(DMLScript.rtplatform == RUNTIME_PLATFORM.SPARK || DMLScript.rtplatform == RUNTIME_PLATFORM.HYBRID_SPARK) { - - // Depending on whether registerInput/registerOutput was called initialize the variables - String[] inputs; String[] outputs; - if(_inVarnames != null) { - inputs = _inVarnames.toArray(new String[0]); - } - else { - inputs = new String[0]; + // Now collect the output + if(_outVarnames != null) { + if(_variables == null) { + throw new DMLRuntimeException("The symbol table returned after executing the script is empty"); } - if(_outVarnames != null) { - outputs = _outVarnames.toArray(new String[0]); - } - else { - outputs = new String[0]; - } - Map outMetadata = new HashMap(); - - Map argVals = DMLScript.createArgumentsMap(isNamedArgument, args); - - // Run the DML script - ExecutionContext ec = executeUsingSimplifiedCompilationChain(dmlScriptFilePath, isFile, argVals, isPyDML, inputs, outputs, _variables, configFilePath); - // Now collect the output - if(_outVarnames != null) { - if(_variables == null) { - throw new DMLRuntimeException("The symbol table returned after executing the script is empty"); + for( String ovar : _outVarnames ) { + if( _variables.keySet().contains(ovar) ) { + outMetadata.put(ovar, ec.getMatrixCharacteristics(ovar)); // For converting output to dataframe } - - for( String ovar : _outVarnames ) { - if( _variables.keySet().contains(ovar) ) { - outMetadata.put(ovar, ec.getMatrixCharacteristics(ovar)); // For converting output to dataframe - } - else { - throw new DMLException("Error: The variable " + ovar + " is not available as output after the execution of the DMLScript."); - } + else { + throw new DMLException("Error: The variable " + ovar + " is not available as output after the execution of the DMLScript."); } } - - return new MLOutput(_variables, ec, outMetadata); } - else { - throw new DMLRuntimeException("Unsupported runtime:" + DMLScript.rtplatform.name()); - } - - } - finally { - // Remove global dml config and all thread-local configs - // TODO enable cleanup whenever invalid GNMF MLcontext is fixed - // (the test is invalid because it assumes that status of previous execute is kept) - //ConfigurationManager.setGlobalConfig(new DMLConfig()); - //ConfigurationManager.clearLocalConfigs(); - // Reset active MLContext. 
- _activeMLContext = null; + return new MLOutput(_variables, ec, outMetadata); + } + else { + throw new DMLRuntimeException("Unsupported runtime:" + DMLScript.rtplatform.name()); } } diff --git a/src/main/java/org/apache/sysml/api/python/SystemML.py b/src/main/java/org/apache/sysml/api/python/SystemML.py index d2051aac8d7..85cc4138285 100644 --- a/src/main/java/org/apache/sysml/api/python/SystemML.py +++ b/src/main/java/org/apache/sysml/api/python/SystemML.py @@ -288,83 +288,55 @@ def convertToNumpyArr(sc, mb): else: raise TypeError('sc needs to be of type SparkContext') # TODO: We can generalize this by creating py4j gateway ourselves -class mllearn: - # Or we can create new Python project with package structure - class LogisticRegression(Estimator): - - def __init__(self, sqlCtx, penalty='l2', fit_intercept=True, max_iter=100, max_inner_iter=0, tol=0.000001, C=1.0, solver='newton-cg', transferUsingDF=False): - self.sqlCtx = sqlCtx - self.sc = sqlCtx._sc - self.log = self.sc._jvm.org.apache.sysml.api.ml.LogisticRegression("lr", self.sc._jsc.sc()) - self.transferUsingDF = transferUsingDF - if penalty != 'l2': - raise Exception('Only l2 penalty is supported') - self.icpt = int(fit_intercept) - self.max_iter = max_iter - self.max_inner_iter = max_inner_iter - self.tol = tol - if C <= 0: - raise Exception('C has to be positive') - reg = 1.0 / C - self.reg = reg - self.updateLog() - if solver != 'newton-cg': - raise Exception('Only newton-cg solver supported') - - def updateLog(self): - self.log.setMaxOuterIter(self.max_iter) - self.log.setMaxInnerIter(self.max_inner_iter) - self.log.setRegParam(self.reg) - self.log.setTol(self.tol) - self.log.setIcpt(self.icpt) - - def convertToPandasDF(self, X): - if not instance(X, pd.DataFrame): - return pd.DataFrame(X, columns=['C' + str(i) for i in range(numCols)]) - return X - - def tolist(self, inputCols): - if isinstance(inputCols, pd.indexes.base.Index): - return inputCols.get_values().tolist() - elif isinstance(inputCols, list): - return inputCols - else: - raise Exception('inputCols should be of type pandas.indexes.base.Index or list') - - def assemble(self, pdf, inputCols, outputCol): - tmpDF = self.sqlCtx.createDataFrame(pdf, self.tolist(pdf.columns)) - assembler = VectorAssembler(inputCols=self.tolist(inputCols), outputCol=outputCol) - return assembler.transform(tmpDF) +def convertToPandasDF(X): + if not isinstance(X, pd.DataFrame): + return pd.DataFrame(X, columns=['C' + str(i) for i in range(getNumCols(X))]) + return X +def tolist(inputCols): + if isinstance(inputCols, pd.indexes.base.Index): + return inputCols.get_values().tolist() + elif isinstance(inputCols, list): + return inputCols + else: + raise Exception('inputCols should be of type pandas.indexes.base.Index or list') + +def assemble(sqlCtx, pdf, inputCols, outputCol): + tmpDF = sqlCtx.createDataFrame(pdf, tolist(pdf.columns)) + assembler = VectorAssembler(inputCols=tolist(inputCols), outputCol=outputCol) + return assembler.transform(tmpDF) + +class mllearn: + class BaseSystemMLEstimator(Estimator): def _fit(self, X): if hasattr(X, '_jdf') and 'features' in X.columns and 'label' in X.columns: - self.model = self.log.fit(X._jdf) + self.model = self.estimator.fit(X._jdf) return self else: raise Exception('Incorrect usage: Expected dataframe as input with features/label as columns') def fit(self, X, y=None, params=None): - self.updateLog() if y is None: return self._fit(X) elif y is not None and (isinstance(X, np.ndarray) or isinstance(X, pd.core.frame.DataFrame)): if 
self.transferUsingDF: - pdfX = self.convertToPandasDF(X) - pdfY = self.convertToPandasDF(y) + pdfX = convertToPandasDF(X) + pdfY = convertToPandasDF(y) if getNumCols(pdfY) != 1: raise Exception('y should be a column vector') if pdfX.shape[0] != pdfY.shape[0]: raise Exception('Number of rows of X and y should match') colNames = pdfX.columns pdfX['label'] = pdfY[pdfY.columns[0]] - df = self.assemble(pdfX, colNames, 'features').select('features', 'label') - self.model = self.log.fit(df._jdf) + df = assemble(self.sqlCtx, pdfX, colNames, 'features').select('features', 'label') + self.model = self.estimator.fit(df._jdf) else: numColsy = getNumCols(y) if numColsy != 1: raise Exception('Expected y to be a column vector') - self.model = self.log.fit(convertToMatrixBlock(self.sc, X), convertToMatrixBlock(self.sc, y)) - self.model.setOutputRawPredictions(False) + self.model = self.estimator.fit(convertToMatrixBlock(self.sc, X), convertToMatrixBlock(self.sc, y)) + if self.setOutputRawPredictionsToFalse: + self.model.setOutputRawPredictions(False) return self else: raise Exception('Unsupported input type') @@ -375,8 +347,8 @@ def transform(self, X): def predict(self, X): if isinstance(X, np.ndarray) or isinstance(X, pd.core.frame.DataFrame): if self.transferUsingDF: - pdfX = self.convertToPandasDF(X) - df = self.assemble(pdfX, pdfX.columns, 'features').select('features') + pdfX = convertToPandasDF(X) + df = assemble(self.sqlCtx, pdfX, pdfX.columns, 'features').select('features') retjDF = self.model.transform(df._jdf) retDF = DataFrame(retjDF, self.sqlCtx) retPDF = retDF.sort('ID').select('prediction').toPandas() @@ -405,5 +377,49 @@ def predict(self, X): raise Exception('Unsupported input type') def score(self, X, y): - return sk.metrics.accuracy_score(y, self.predict(X)) + return sk.metrics.accuracy_score(y, self.predict(X)) + + # Or we can create new Python project with package structure + class LogisticRegression(BaseSystemMLEstimator): + + def __init__(self, sqlCtx, penalty='l2', fit_intercept=True, max_iter=100, max_inner_iter=0, tol=0.000001, C=1.0, solver='newton-cg', transferUsingDF=False): + self.sqlCtx = sqlCtx + self.sc = sqlCtx._sc + self.uid = "logReg" + self.estimator = self.sc._jvm.org.apache.sysml.api.ml.LogisticRegression(self.uid, self.sc._jsc.sc()) + self.estimator.setMaxOuterIter(max_iter) + self.estimator.setMaxInnerIter(max_inner_iter) + if C <= 0: + raise Exception('C has to be positive') + reg = 1.0 / C + self.estimator.setRegParam(reg) + self.estimator.setTol(tol) + self.estimator.setIcpt(int(fit_intercept)) + self.transferUsingDF = transferUsingDF + self.setOutputRawPredictionsToFalse = True + if penalty != 'l2': + raise Exception('Only l2 penalty is supported') + if solver != 'newton-cg': + raise Exception('Only newton-cg solver supported') + + class LinearRegression(BaseSystemMLEstimator): + + def __init__(self, sqlCtx, fit_intercept=True, max_iter=100, tol=0.000001, C=1.0, solver='newton-cg', transferUsingDF=False): + self.sqlCtx = sqlCtx + self.sc = sqlCtx._sc + self.uid = "lr" + if solver == 'newton-cg' or solver == 'direct-solve': + self.estimator = self.sc._jvm.org.apache.sysml.api.ml.LinearRegression(self.uid, self.sc._jsc.sc(), solver) + else: + raise Exception('Only newton-cg solver supported') + self.estimator.setMaxIter(max_iter) + if C <= 0: + raise Exception('C has to be positive') + reg = 1.0 / C + self.estimator.setRegParam(reg) + self.estimator.setTol(tol) + self.estimator.setIcpt(int(fit_intercept)) + self.transferUsingDF = transferUsingDF + 
self.setOutputRawPredictionsToFalse = False + \ No newline at end of file diff --git a/src/main/scala/org/apache/sysml/api/ml/LinearRegression.scala b/src/main/scala/org/apache/sysml/api/ml/LinearRegression.scala new file mode 100644 index 00000000000..28d3dcceaab --- /dev/null +++ b/src/main/scala/org/apache/sysml/api/ml/LinearRegression.scala @@ -0,0 +1,146 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.sysml.api.ml + +import java.io.File +import org.apache.spark.SparkContext +import org.apache.spark.ml.{ Model, Estimator } +import org.apache.spark.sql.DataFrame +import org.apache.spark.sql.types.StructType +import org.apache.spark.ml.param.ParamMap +import org.apache.sysml.api.{ MLContext, MLOutput } +import org.apache.sysml.runtime.matrix.MatrixCharacteristics +import org.apache.sysml.runtime.matrix.data.MatrixBlock +import org.apache.sysml.runtime.DMLRuntimeException +import org.apache.sysml.runtime.instructions.spark.utils.{ RDDConverterUtilsExt => RDDConverterUtils } + +object LinearRegression { + final val scriptPathCG = "scripts" + File.separator + "algorithms" + File.separator + "LinearRegCG.dml" + final val scriptPathDS = "scripts" + File.separator + "algorithms" + File.separator + "LinearRegDS.dml" +} + +// algorithm = "direct-solve", "conjugate-gradient" +class LinearRegression(override val uid: String, val sc: SparkContext, val solver:String="direct-solve") extends Estimator[LinearRegressionModel] with HasIcpt + with HasRegParam with HasTol with HasMaxOuterIter { + + def setIcpt(value: Int) = set(icpt, value) + def setMaxIter(value: Int) = set(maxOuterIter, value) + def setRegParam(value: Double) = set(regParam, value) + def setTol(value: Double) = set(tol, value) + + override def copy(extra: ParamMap): Estimator[LinearRegressionModel] = { + val that = new LinearRegression(uid, sc, solver) + copyValues(that, extra) + } + def transformSchema(schema: StructType): StructType = schema + + // Note: will update the y_mb as this will be called by Python mllearn + def fit(X_mb: MatrixBlock, y_mb: MatrixBlock): LinearRegressionModel = { + val ml = new MLContext(sc) + if(y_mb.getNumColumns != 1) { + throw new RuntimeException("Expected a column vector for y") + } + val mloutput = { + ml.registerInput("X", X_mb); + ml.registerInput("y", y_mb); + ml.registerOutput("beta_out"); + if(solver.compareTo("direct-solve") == 0) + ml.executeScript(ScriptsUtils.getDMLScript(LinearRegression.scriptPathDS), getParamMap()) + else if(solver.compareTo("newton-cg") == 0) { + ml.executeScript(ScriptsUtils.getDMLScript(LinearRegression.scriptPathCG), getParamMap()) + } + else { + throw new DMLRuntimeException("The algorithm should be direct-solve or conjugate-gradient") + } + } + new 
LinearRegressionModel("linearRegression")(mloutput, sc) + } + + def getParamMap(): Map[String, String] = { + Map( "icpt" -> this.getIcpt.toString(), + "reg" -> this.getRegParam.toString(), + "tol" -> this.getTol.toString, + "maxi" -> this.getMaxOuterIte.toString, + + "X" -> " ", + "Y" -> " ", + "B" -> " ", + "O" -> " ", + "Log" -> " ", + "fmt" -> "binary") + } + + def fit(df: DataFrame): LinearRegressionModel = { + val ml = new MLContext(df.rdd.sparkContext) + val mcXin = new MatrixCharacteristics() + val Xin = RDDConverterUtils.vectorDataFrameToBinaryBlock(sc, df, mcXin, false, "features") + val yin = df.select("label") + val mloutput = { + ml.registerInput("X", Xin, mcXin); + ml.registerInput("y", yin); + ml.registerOutput("beta_out"); + if(solver.compareTo("direct-solve") == 0) + ml.executeScript(ScriptsUtils.getDMLScript(LinearRegression.scriptPathDS), getParamMap()) + else if(solver.compareTo("newton-cg") == 0) { + ml.executeScript(ScriptsUtils.getDMLScript(LinearRegression.scriptPathCG), getParamMap()) + } + else { + throw new DMLRuntimeException("The algorithm should be direct-solve or conjugate-gradient") + } + } + new LinearRegressionModel("linearRegression")(mloutput, sc) + } +} + +class LinearRegressionModel(override val uid: String)(val mloutput: MLOutput, val sc: SparkContext) extends Model[LinearRegressionModel] with HasIcpt + with HasRegParam with HasTol with HasMaxOuterIter { + override def copy(extra: ParamMap): LinearRegressionModel = { + val that = new LinearRegressionModel(uid)(mloutput, sc) + copyValues(that, extra) + } + + override def transformSchema(schema: StructType): StructType = schema + + def transform(df: DataFrame): DataFrame = { + val isSingleNode = false + val glmPredOut = PredictionUtils.doGLMPredict(isSingleNode, df, null, sc, mloutput, "beta_out", getPredictParams()) + val predictedDF = glmPredOut.getDF(df.sqlContext, "means").withColumnRenamed("C1", "prediction") + val dataset = RDDConverterUtils.addIDToDataFrame(df, df.sqlContext, "ID") + return PredictionUtils.joinUsingID(dataset, predictedDF) + } + + def transform(X: MatrixBlock): MatrixBlock = { + val isSingleNode = true + return PredictionUtils.doGLMPredict(isSingleNode, null, X, sc, mloutput, "beta_out", getPredictParams()).getMatrixBlock("means") + } + + + def getPredictParams(): Map[String, String] = { + Map("X" -> " ", + "B" -> " ", + // Gaussian distribution + "dfam" -> "1", "vpow" -> "0.0", + // identity link function + "link" -> "1", "lpow" -> "1.0" +// // Dispersion value: TODO +// ,"disp" -> "5.0" + ) + } +} \ No newline at end of file diff --git a/src/main/scala/org/apache/sysml/api/ml/LogisticRegression.scala b/src/main/scala/org/apache/sysml/api/ml/LogisticRegression.scala index c679ff6d212..69e4126ac7c 100644 --- a/src/main/scala/org/apache/sysml/api/ml/LogisticRegression.scala +++ b/src/main/scala/org/apache/sysml/api/ml/LogisticRegression.scala @@ -112,7 +112,16 @@ class LogisticRegression(override val uid: String, val sc: SparkContext) extends } val mloutput = { - val paramsMap: Map[String, String] = Map( + ml.registerInput("X", X_mb); + ml.registerInput("Y_vec", y_mb); + ml.registerOutput("B_out"); + ml.executeScript(ScriptsUtils.getDMLScript(LogisticRegression.scriptPath), getParamMap()) + } + new LogisticRegressionModel("logisticRegression")(mloutput, revLabelMapping, sc) + } + + def getParamMap():Map[String, String] = { + Map( "icpt" -> this.getIcpt.toString(), "reg" -> this.getRegParam.toString(), "tol" -> this.getTol.toString, @@ -122,12 +131,6 @@ class 
LogisticRegression(override val uid: String, val sc: SparkContext) extends "X" -> " ", "Y" -> " ", "B" -> " ") - ml.registerInput("X", X_mb); - ml.registerInput("Y_vec", y_mb); - ml.registerOutput("B_out"); - ml.executeScript(ScriptsUtils.getDMLScript(LogisticRegression.scriptPath), paramsMap) - } - new LogisticRegressionModel("logisticRegression")(mloutput, revLabelMapping, sc) } override def fit(df: DataFrame): LogisticRegressionModel = { val ml = new MLContext(df.rdd.sparkContext) @@ -142,20 +145,10 @@ class LogisticRegression(override val uid: String, val sc: SparkContext) extends } val yin = df.select("label").rdd.map( x => labelMapping.get(x.apply(0).toString).toString ) val mloutput = { - val paramsMap: Map[String, String] = Map( - "icpt" -> this.getIcpt.toString(), - "reg" -> this.getRegParam.toString(), - "tol" -> this.getTol.toString, - "moi" -> this.getMaxOuterIte.toString, - "mii" -> this.getMaxInnerIter.toString, - - "X" -> " ", - "Y" -> " ", - "B" -> " ") ml.registerInput("X", Xin, mcXin); ml.registerInput("Y_vec", yin, "csv"); ml.registerOutput("B_out"); - ml.executeScript(ScriptsUtils.getDMLScript(LogisticRegression.scriptPath), paramsMap) + ml.executeScript(ScriptsUtils.getDMLScript(LogisticRegression.scriptPath), getParamMap()) } new LogisticRegressionModel("logisticRegression")(mloutput, revLabelMapping, sc) } @@ -211,7 +204,9 @@ class LogisticRegressionModel( } else { val isSingleNode = true - val ret = computePredictedLabels(doGLMPredict(isSingleNode, null, X), isSingleNode).getMatrixBlock("Prediction"); + val ret = PredictionUtils.computePredictedClassLabelsFromProbability( + PredictionUtils.doGLMPredict(isSingleNode, null, X, sc, mloutput, "B_out", getPredictParams), + isSingleNode, sc).getMatrixBlock("Prediction"); if(ret.getNumColumns != 1) { throw new RuntimeException("Expected predicted label to be a column vector") } @@ -245,54 +240,11 @@ class LogisticRegressionModel( } } - def doGLMPredict(isSingleNode:Boolean, df:DataFrame, X: MatrixBlock): MLOutput = { - val ml = new MLContext(sc) - val paramsMap: Map[String, String] = Map( - "X" -> " ", - "B" -> " ", - "dfam" -> "3") - if(isSingleNode) { - ml.registerInput("X", X); - ml.registerInput("B_full", mloutput.getMatrixBlock("B_out"), mloutput.getMatrixCharacteristics("B_out")); - } - else { - val mcXin = new MatrixCharacteristics() - val Xin = RDDConverterUtils.vectorDataFrameToBinaryBlock(df.rdd.sparkContext, df, mcXin, false, "features") - ml.registerInput("X", Xin, mcXin); - ml.registerInput("B_full", mloutput.getBinaryBlockedRDD("B_out"), mloutput.getMatrixCharacteristics("B_out")); - } - ml.registerOutput("means"); - ml.executeScript(ScriptsUtils.getDMLScript(LogisticRegressionModel.scriptPath), paramsMap) - } - - def computePredictedLabels(mlscoreoutput:MLOutput, isSingleNode:Boolean): MLOutput = { - val mlNew = new MLContext(sc) - if(isSingleNode) { - mlNew.registerInput("Prob", mlscoreoutput.getMatrixBlock("means"), mlscoreoutput.getMatrixCharacteristics("means")); - } - else { - mlNew.registerInput("Prob", mlscoreoutput.getBinaryBlockedRDD("means"), mlscoreoutput.getMatrixCharacteristics("means")); - } - mlNew.registerOutput("Prediction") - mlNew.executeScript( - """ - Prob = read("temp1"); - Prediction = rowIndexMax(Prob); # assuming one-based label mapping - write(Prediction, "tempOut", "csv"); - """) - } - - def joinUsingID(df1:DataFrame, df2:DataFrame):DataFrame = { - val tempDF1 = df1.withColumnRenamed("ID", "ID1") - tempDF1.join(df2, tempDF1.col("ID1").equalTo(df2.col("ID"))).drop("ID1") - } 
override def transform(df: DataFrame): DataFrame = { - val ml = new MLContext(df.rdd.sparkContext) - val isSingleNode = false - val glmPredOut = doGLMPredict(isSingleNode, df, null) - val predLabelOut = computePredictedLabels(glmPredOut, isSingleNode) + val glmPredOut = PredictionUtils.doGLMPredict(isSingleNode, df, null, sc, mloutput, "B_out", getPredictParams()) + val predLabelOut = PredictionUtils.computePredictedClassLabelsFromProbability(glmPredOut, isSingleNode, sc) val predictedDF = updateLabels(isSingleNode, predLabelOut.getDF(df.sqlContext, "Prediction"), null, "C1").select("ID", "prediction") val prob = glmPredOut.getDF(df.sqlContext, "means", true).withColumnRenamed("C1", "probability").select("ID", "probability") val dataset = RDDConverterUtils.addIDToDataFrame(df, df.sqlContext, "ID") @@ -300,7 +252,13 @@ class LogisticRegressionModel( if(outputRawPredictions) { // Not supported: rawPred = 1 / (1 + exp(- X * t(B_full)) ); } - return joinUsingID(dataset, joinUsingID(prob, predictedDF)) + return PredictionUtils.joinUsingID(dataset, PredictionUtils.joinUsingID(prob, predictedDF)) + } + + def getPredictParams(): Map[String, String] = { + Map("X" -> " ", + "B" -> " ", + "dfam" -> "3") } } diff --git a/src/main/scala/org/apache/sysml/api/ml/PredictionUtils.scala b/src/main/scala/org/apache/sysml/api/ml/PredictionUtils.scala new file mode 100644 index 00000000000..60c5b8c0c2b --- /dev/null +++ b/src/main/scala/org/apache/sysml/api/ml/PredictionUtils.scala @@ -0,0 +1,69 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.sysml.api.ml + +import org.apache.sysml.api.{ MLContext, MLOutput } +import org.apache.spark.sql.DataFrame +import org.apache.spark.SparkContext +import org.apache.sysml.runtime.matrix.data.MatrixBlock +import org.apache.sysml.runtime.DMLRuntimeException +import org.apache.sysml.runtime.matrix.MatrixCharacteristics +import org.apache.sysml.runtime.instructions.spark.utils.{ RDDConverterUtilsExt => RDDConverterUtils } + +object PredictionUtils { + + def doGLMPredict(isSingleNode:Boolean, df:DataFrame, X: MatrixBlock, sc:SparkContext, mloutput:MLOutput, B:String, paramsMap: Map[String, String]): MLOutput = { + val ml = new MLContext(sc) + if(isSingleNode) { + ml.registerInput("X", X); + ml.registerInput("B_full", mloutput.getMatrixBlock(B), mloutput.getMatrixCharacteristics(B)); + } + else { + val mcXin = new MatrixCharacteristics() + val Xin = RDDConverterUtils.vectorDataFrameToBinaryBlock(df.rdd.sparkContext, df, mcXin, false, "features") + ml.registerInput("X", Xin, mcXin); + ml.registerInput("B_full", mloutput.getBinaryBlockedRDD(B), mloutput.getMatrixCharacteristics(B)); + } + ml.registerOutput("means"); + ml.executeScript(ScriptsUtils.getDMLScript(LogisticRegressionModel.scriptPath), paramsMap) + } + + def joinUsingID(df1:DataFrame, df2:DataFrame):DataFrame = { + val tempDF1 = df1.withColumnRenamed("ID", "ID1") + tempDF1.join(df2, tempDF1.col("ID1").equalTo(df2.col("ID"))).drop("ID1") + } + + def computePredictedClassLabelsFromProbability(mlscoreoutput:MLOutput, isSingleNode:Boolean, sc:SparkContext): MLOutput = { + val mlNew = new MLContext(sc) + if(isSingleNode) { + mlNew.registerInput("Prob", mlscoreoutput.getMatrixBlock("means"), mlscoreoutput.getMatrixCharacteristics("means")); + } + else { + mlNew.registerInput("Prob", mlscoreoutput.getBinaryBlockedRDD("means"), mlscoreoutput.getMatrixCharacteristics("means")); + } + mlNew.registerOutput("Prediction") + mlNew.executeScript( + """ + Prob = read("temp1"); + Prediction = rowIndexMax(Prob); # assuming one-based label mapping + write(Prediction, "tempOut", "csv"); + """) + } +} \ No newline at end of file From 397d7294b0e64184532d10d83b9216d8a6103d50 Mon Sep 17 00:00:00 2001 From: Niketan Pansare Date: Sun, 7 Aug 2016 15:58:06 -0700 Subject: [PATCH 08/14] [SYSTEMML-234] [SYSTEMML-208] Added SVM and python test cases --- scripts/algorithms/l2-svm.dml | 5 +- scripts/algorithms/m-svm.dml | 5 +- .../java/org/apache/sysml/api/MLContext.java | 1 - .../org/apache/sysml/api/python/SystemML.py | 25 ++- .../java/org/apache/sysml/api/python/test.py | 126 ++++++++++++ .../sysml/api/ml/LinearRegression.scala | 1 - .../sysml/api/ml/LogisticRegression.scala | 88 +-------- .../apache/sysml/api/ml/PredictionUtils.scala | 89 ++++++++- .../scala/org/apache/sysml/api/ml/SVM.scala | 187 ++++++++++++++++++ 9 files changed, 436 insertions(+), 91 deletions(-) create mode 100644 src/main/java/org/apache/sysml/api/python/test.py create mode 100644 src/main/scala/org/apache/sysml/api/ml/SVM.scala diff --git a/scripts/algorithms/l2-svm.dml b/scripts/algorithms/l2-svm.dml index fa404185729..1117c711caf 100644 --- a/scripts/algorithms/l2-svm.dml +++ b/scripts/algorithms/l2-svm.dml @@ -160,4 +160,7 @@ extra_model_params[4,1] = dimensions w = t(append(t(w), t(extra_model_params))) write(w, $model, format=cmdLine_fmt) -write(debug_str, $Log) +logFile = $Log +if(logFile != " ") { + write(debug_str, logFile) +} \ No newline at end of file diff --git a/scripts/algorithms/m-svm.dml b/scripts/algorithms/m-svm.dml index e4a7cadb695..04f8a768157 
100644 --- a/scripts/algorithms/m-svm.dml +++ b/scripts/algorithms/m-svm.dml @@ -175,4 +175,7 @@ for(iter_class in 1:ncol(debug_mat)){ debug_str = append(debug_str, iter_class + "," + iter + "," + obj) } } -write(debug_str, $Log) +logFile = $Log +if(logFile != " ") { + write(debug_str, logFile) +} \ No newline at end of file diff --git a/src/main/java/org/apache/sysml/api/MLContext.java b/src/main/java/org/apache/sysml/api/MLContext.java index 32dc544f8e2..54f313e7026 100644 --- a/src/main/java/org/apache/sysml/api/MLContext.java +++ b/src/main/java/org/apache/sysml/api/MLContext.java @@ -65,7 +65,6 @@ import org.apache.sysml.runtime.controlprogram.caching.MatrixObject; import org.apache.sysml.runtime.controlprogram.context.ExecutionContext; import org.apache.sysml.runtime.controlprogram.context.ExecutionContextFactory; -import org.apache.sysml.runtime.controlprogram.context.SparkExecutionContext; import org.apache.sysml.runtime.instructions.Instruction; import org.apache.sysml.runtime.instructions.cp.Data; import org.apache.sysml.runtime.instructions.spark.data.RDDObject; diff --git a/src/main/java/org/apache/sysml/api/python/SystemML.py b/src/main/java/org/apache/sysml/api/python/SystemML.py index 85cc4138285..6f711f11767 100644 --- a/src/main/java/org/apache/sysml/api/python/SystemML.py +++ b/src/main/java/org/apache/sysml/api/python/SystemML.py @@ -30,6 +30,7 @@ import numpy as np import pandas as pd import sklearn as sk +from sklearn import metrics from pyspark.ml.feature import VectorAssembler from pyspark.mllib.linalg import Vectors import sys @@ -377,7 +378,7 @@ def predict(self, X): raise Exception('Unsupported input type') def score(self, X, y): - return sk.metrics.accuracy_score(y, self.predict(X)) + return metrics.accuracy_score(y, self.predict(X)) # Or we can create new Python project with package structure class LogisticRegression(BaseSystemMLEstimator): @@ -421,5 +422,25 @@ def __init__(self, sqlCtx, fit_intercept=True, max_iter=100, tol=0.000001, C=1.0 self.estimator.setIcpt(int(fit_intercept)) self.transferUsingDF = transferUsingDF self.setOutputRawPredictionsToFalse = False - + + def score(self, X, y): + return metrics.r2_score(y, self.predict(X), multioutput='variance_weighted') + + + class SVM(BaseSystemMLEstimator): + + def __init__(self, sqlCtx, fit_intercept=True, max_iter=100, tol=0.000001, C=1.0, is_multi_class=False, transferUsingDF=False): + self.sqlCtx = sqlCtx + self.sc = sqlCtx._sc + self.uid = "svm" + self.estimator = self.sc._jvm.org.apache.sysml.api.ml.SVM(self.uid, self.sc._jsc.sc(), is_multi_class) + self.estimator.setMaxIter(max_iter) + if C <= 0: + raise Exception('C has to be positive') + reg = 1.0 / C + self.estimator.setRegParam(reg) + self.estimator.setTol(tol) + self.estimator.setIcpt(int(fit_intercept)) + self.transferUsingDF = transferUsingDF + self.setOutputRawPredictionsToFalse = False \ No newline at end of file diff --git a/src/main/java/org/apache/sysml/api/python/test.py b/src/main/java/org/apache/sysml/api/python/test.py new file mode 100644 index 00000000000..9a9ee055b2d --- /dev/null +++ b/src/main/java/org/apache/sysml/api/python/test.py @@ -0,0 +1,126 @@ +from sklearn import datasets, neighbors +import SystemML as sml +from pyspark.sql import SQLContext +from pyspark.context import SparkContext +import unittest +from pyspark.ml.evaluation import MulticlassClassificationEvaluator +from pyspark.ml import Pipeline +from pyspark.ml.feature import HashingTF, Tokenizer +import numpy as np + +sc = SparkContext() +sqlCtx = SQLContext(sc) + +# 
Currently not integrated with JUnit test +# ~/spark-1.6.1-scala-2.11/bin/spark-submit --master local[*] --driver-class-path SystemML.jar test.py +class TestMLLearn(unittest.TestCase): + def testLogisticSK1(self): + digits = datasets.load_digits() + X_digits = digits.data + y_digits = digits.target + n_samples = len(X_digits) + X_train = X_digits[:.9 * n_samples] + y_train = y_digits[:.9 * n_samples] + X_test = X_digits[.9 * n_samples:] + y_test = y_digits[.9 * n_samples:] + logistic = sml.mllearn.LogisticRegression(sqlCtx) + score = logistic.fit(X_train, y_train).score(X_test, y_test) + self.failUnless(score > 0.9) + + def testLogisticSK2(self): + digits = datasets.load_digits() + X_digits = digits.data + y_digits = digits.target + n_samples = len(X_digits) + X_train = X_digits[:.9 * n_samples] + y_train = y_digits[:.9 * n_samples] + X_test = X_digits[.9 * n_samples:] + y_test = y_digits[.9 * n_samples:] + # Convert to DataFrame for i/o: current way to transfer data + logistic = sml.mllearn.LogisticRegression(sqlCtx, transferUsingDF=True) + score = logistic.fit(X_train, y_train).score(X_test, y_test) + self.failUnless(score > 0.9) + + def testLogisticMLPipeline1(self): + training = sqlCtx.createDataFrame([ + (0L, "a b c d e spark", 1.0), + (1L, "b d", 2.0), + (2L, "spark f g h", 1.0), + (3L, "hadoop mapreduce", 2.0), + (4L, "b spark who", 1.0), + (5L, "g d a y", 2.0), + (6L, "spark fly", 1.0), + (7L, "was mapreduce", 2.0), + (8L, "e spark program", 1.0), + (9L, "a e c l", 2.0), + (10L, "spark compile", 1.0), + (11L, "hadoop software", 2.0) + ], ["id", "text", "label"]) + tokenizer = Tokenizer(inputCol="text", outputCol="words") + hashingTF = HashingTF(inputCol="words", outputCol="features", numFeatures=20) + lr = sml.mllearn.LogisticRegression(sqlCtx) + pipeline = Pipeline(stages=[tokenizer, hashingTF, lr]) + model = pipeline.fit(training) + test = sqlCtx.createDataFrame([ + (12L, "spark i j k", 1.0), + (13L, "l m n", 2.0), + (14L, "mapreduce spark", 1.0), + (15L, "apache hadoop", 2.0)], ["id", "text", "label"]) + result = model.transform(test) + predictionAndLabels = result.select("prediction", "label") + evaluator = MulticlassClassificationEvaluator() + score = evaluator.evaluate(predictionAndLabels) + self.failUnless(score == 1.0) + + def testLinearRegressionSK1(self): + diabetes = datasets.load_diabetes() + diabetes_X = diabetes.data[:, np.newaxis, 2] + diabetes_X_train = diabetes_X[:-20] + diabetes_X_test = diabetes_X[-20:] + diabetes_y_train = diabetes.target[:-20] + diabetes_y_test = diabetes.target[-20:] + regr = sml.mllearn.LinearRegression(sqlCtx) + regr.fit(diabetes_X_train, diabetes_y_train) + score = regr.score(diabetes_X_test, diabetes_y_test) + self.failUnless(score > 0.4) # TODO: Improve r2-score (may be I am using it incorrectly) + + def testLinearRegressionSK2(self): + diabetes = datasets.load_diabetes() + diabetes_X = diabetes.data[:, np.newaxis, 2] + diabetes_X_train = diabetes_X[:-20] + diabetes_X_test = diabetes_X[-20:] + diabetes_y_train = diabetes.target[:-20] + diabetes_y_test = diabetes.target[-20:] + regr = sml.mllearn.LinearRegression(sqlCtx, transferUsingDF=True) + regr.fit(diabetes_X_train, diabetes_y_train) + score = regr.score(diabetes_X_test, diabetes_y_test) + self.failUnless(score > 0.4) # TODO: Improve r2-score (may be I am using it incorrectly) + + def testSVMSK1(self): + digits = datasets.load_digits() + X_digits = digits.data + y_digits = digits.target + n_samples = len(X_digits) + X_train = X_digits[:.9 * n_samples] + y_train = y_digits[:.9 * 
n_samples] + X_test = X_digits[.9 * n_samples:] + y_test = y_digits[.9 * n_samples:] + svm = sml.mllearn.SVM(sqlCtx, is_multi_class=True) + score = svm.fit(X_train, y_train).score(X_test, y_test) + self.failUnless(score > 0.9) + + def testSVMSK2(self): + digits = datasets.load_digits() + X_digits = digits.data + y_digits = digits.target + n_samples = len(X_digits) + X_train = X_digits[:.9 * n_samples] + y_train = y_digits[:.9 * n_samples] + X_test = X_digits[.9 * n_samples:] + y_test = y_digits[.9 * n_samples:] + svm = sml.mllearn.SVM(sqlCtx, is_multi_class=True, transferUsingDF=True) + score = svm.fit(X_train, y_train).score(X_test, y_test) + self.failUnless(score > 0.9) + +if __name__ == '__main__': + unittest.main() diff --git a/src/main/scala/org/apache/sysml/api/ml/LinearRegression.scala b/src/main/scala/org/apache/sysml/api/ml/LinearRegression.scala index 28d3dcceaab..7f22f8f717b 100644 --- a/src/main/scala/org/apache/sysml/api/ml/LinearRegression.scala +++ b/src/main/scala/org/apache/sysml/api/ml/LinearRegression.scala @@ -51,7 +51,6 @@ class LinearRegression(override val uid: String, val sc: SparkContext, val solve } def transformSchema(schema: StructType): StructType = schema - // Note: will update the y_mb as this will be called by Python mllearn def fit(X_mb: MatrixBlock, y_mb: MatrixBlock): LinearRegressionModel = { val ml = new MLContext(sc) if(y_mb.getNumColumns != 1) { diff --git a/src/main/scala/org/apache/sysml/api/ml/LogisticRegression.scala b/src/main/scala/org/apache/sysml/api/ml/LogisticRegression.scala index 69e4126ac7c..f9ddf9c9964 100644 --- a/src/main/scala/org/apache/sysml/api/ml/LogisticRegression.scala +++ b/src/main/scala/org/apache/sysml/api/ml/LogisticRegression.scala @@ -35,7 +35,6 @@ import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.regression.LabeledPoint import scala.reflect.ClassTag import scala.collection.immutable.HashMap -import org.apache.spark.sql.functions.udf import org.apache.sysml.runtime.matrix.data.MatrixBlock import org.apache.sysml.runtime.DMLRuntimeException @@ -89,27 +88,8 @@ class LogisticRegression(override val uid: String, val sc: SparkContext) extends // Note: will update the y_mb as this will be called by Python mllearn def fit(X_mb: MatrixBlock, y_mb: MatrixBlock): LogisticRegressionModel = { val ml = new MLContext(sc) - val labelMapping = new java.util.HashMap[String, Int] val revLabelMapping = new java.util.HashMap[Int, String] - if(y_mb.getNumColumns != 1) { - throw new RuntimeException("Expected a column vector for y") - } - if(y_mb.isInSparseFormat()) { - throw new DMLRuntimeException("Sparse block is not implemented for fit") - } - else { - val denseBlock = y_mb.getDenseBlock() - var id:Int = 1 - for(i <- 0 until denseBlock.length) { - val v = denseBlock(i).toString() - if(!labelMapping.containsKey(v)) { - labelMapping.put(v, id) - revLabelMapping.put(id, v) - id += 1 - } - denseBlock.update(i, labelMapping.get(v)) - } - } + PredictionUtils.fillLabelMapping(y_mb, revLabelMapping) val mloutput = { ml.registerInput("X", X_mb); @@ -136,14 +116,8 @@ class LogisticRegression(override val uid: String, val sc: SparkContext) extends val ml = new MLContext(df.rdd.sparkContext) val mcXin = new MatrixCharacteristics() val Xin = RDDConverterUtils.vectorDataFrameToBinaryBlock(sc, df, mcXin, false, "features") - val temp = df.select("label").distinct.rdd.map(_.apply(0).toString).collect() - val labelMapping = new java.util.HashMap[String, Int] val revLabelMapping = new java.util.HashMap[Int, String] - for(i <- 0 
until temp.length) { - labelMapping.put(temp(i), i+1) - revLabelMapping.put(i+1, temp(i)) - } - val yin = df.select("label").rdd.map( x => labelMapping.get(x.apply(0).toString).toString ) + val yin = PredictionUtils.fillLabelMapping(df, revLabelMapping) val mloutput = { ml.registerInput("X", Xin, mcXin); ml.registerInput("Y_vec", yin, "csv"); @@ -157,31 +131,6 @@ object LogisticRegressionModel { final val scriptPath = "scripts" + File.separator + "algorithms" + File.separator + "GLM-predict.dml" } -class LogisticRegressionModelSerializableData(val labelMapping: java.util.HashMap[Int, String]) extends Serializable { - def mapLabelStr(x:Double):String = { - if(labelMapping.containsKey(x.toInt)) - labelMapping.get(x.toInt) - else - throw new RuntimeException("Incorrect label mapping") - } - def mapLabelDouble(x:Double):Double = { - if(labelMapping.containsKey(x.toInt)) - labelMapping.get(x.toInt).toDouble - else - throw new RuntimeException("Incorrect label mapping") - } - val mapLabel_udf = { - try { - val it = labelMapping.values().iterator() - while(it.hasNext()) { - it.next().toDouble - } - udf(mapLabelDouble _) - } catch { - case e: Exception => udf(mapLabelStr _) - } - } -} /** * Logistic Regression Scala API */ @@ -206,46 +155,21 @@ class LogisticRegressionModel( val isSingleNode = true val ret = PredictionUtils.computePredictedClassLabelsFromProbability( PredictionUtils.doGLMPredict(isSingleNode, null, X, sc, mloutput, "B_out", getPredictParams), - isSingleNode, sc).getMatrixBlock("Prediction"); + isSingleNode, sc, "means").getMatrixBlock("Prediction"); if(ret.getNumColumns != 1) { throw new RuntimeException("Expected predicted label to be a column vector") } - if(ret.isInSparseFormat()) { - throw new RuntimeException("Since predicted label is a column vector, expected it to be in dense format") - } - else { - updateLabels(true, null, ret, null) - } + PredictionUtils.updateLabels(true, null, ret, null, labelMapping) return ret } } - def updateLabels(isSingleNode:Boolean, df:DataFrame, X: MatrixBlock, labelColName:String): DataFrame = { - if(isSingleNode) { - for(i <- 0 until X.getNumRows) { - val v:Int = X.getValue(i, 0).toInt - if(labelMapping.containsKey(v)) { - X.setValue(i, 0, labelMapping.get(v).toDouble) - } - else { - throw new RuntimeException("No mapping found for " + v + " in " + labelMapping.toString()) - } - } - return null - } - else { - val serObj = new LogisticRegressionModelSerializableData(labelMapping) - return df.withColumn(labelColName, serObj.mapLabel_udf(df(labelColName))) - .withColumnRenamed(labelColName, "prediction") - } - } - override def transform(df: DataFrame): DataFrame = { val isSingleNode = false val glmPredOut = PredictionUtils.doGLMPredict(isSingleNode, df, null, sc, mloutput, "B_out", getPredictParams()) - val predLabelOut = PredictionUtils.computePredictedClassLabelsFromProbability(glmPredOut, isSingleNode, sc) - val predictedDF = updateLabels(isSingleNode, predLabelOut.getDF(df.sqlContext, "Prediction"), null, "C1").select("ID", "prediction") + val predLabelOut = PredictionUtils.computePredictedClassLabelsFromProbability(glmPredOut, isSingleNode, sc, "means") + val predictedDF = PredictionUtils.updateLabels(isSingleNode, predLabelOut.getDF(df.sqlContext, "Prediction"), null, "C1", labelMapping).select("ID", "prediction") val prob = glmPredOut.getDF(df.sqlContext, "means", true).withColumnRenamed("C1", "probability").select("ID", "probability") val dataset = RDDConverterUtils.addIDToDataFrame(df, df.sqlContext, "ID") diff --git 
a/src/main/scala/org/apache/sysml/api/ml/PredictionUtils.scala b/src/main/scala/org/apache/sysml/api/ml/PredictionUtils.scala index 60c5b8c0c2b..13494eedaf8 100644 --- a/src/main/scala/org/apache/sysml/api/ml/PredictionUtils.scala +++ b/src/main/scala/org/apache/sysml/api/ml/PredictionUtils.scala @@ -19,6 +19,8 @@ package org.apache.sysml.api.ml +import org.apache.spark.sql.functions.udf +import org.apache.spark.rdd.RDD import org.apache.sysml.api.{ MLContext, MLOutput } import org.apache.spark.sql.DataFrame import org.apache.spark.SparkContext @@ -45,18 +47,99 @@ object PredictionUtils { ml.executeScript(ScriptsUtils.getDMLScript(LogisticRegressionModel.scriptPath), paramsMap) } + def fillLabelMapping(df: DataFrame, revLabelMapping: java.util.HashMap[Int, String]): RDD[String] = { + val temp = df.select("label").distinct.rdd.map(_.apply(0).toString).collect() + val labelMapping = new java.util.HashMap[String, Int] + for(i <- 0 until temp.length) { + labelMapping.put(temp(i), i+1) + revLabelMapping.put(i+1, temp(i)) + } + df.select("label").rdd.map( x => labelMapping.get(x.apply(0).toString).toString ) + } + + def fillLabelMapping(y_mb: MatrixBlock, revLabelMapping: java.util.HashMap[Int, String]): Unit = { + val labelMapping = new java.util.HashMap[String, Int] + if(y_mb.getNumColumns != 1) { + throw new RuntimeException("Expected a column vector for y") + } + if(y_mb.isInSparseFormat()) { + throw new DMLRuntimeException("Sparse block is not implemented for fit") + } + else { + val denseBlock = y_mb.getDenseBlock() + var id:Int = 1 + for(i <- 0 until denseBlock.length) { + val v = denseBlock(i).toString() + if(!labelMapping.containsKey(v)) { + labelMapping.put(v, id) + revLabelMapping.put(id, v) + id += 1 + } + denseBlock.update(i, labelMapping.get(v)) + } + } + } + + class LabelMappingData(val labelMapping: java.util.HashMap[Int, String]) extends Serializable { + def mapLabelStr(x:Double):String = { + if(labelMapping.containsKey(x.toInt)) + labelMapping.get(x.toInt) + else + throw new RuntimeException("Incorrect label mapping") + } + def mapLabelDouble(x:Double):Double = { + if(labelMapping.containsKey(x.toInt)) + labelMapping.get(x.toInt).toDouble + else + throw new RuntimeException("Incorrect label mapping") + } + val mapLabel_udf = { + try { + val it = labelMapping.values().iterator() + while(it.hasNext()) { + it.next().toDouble + } + udf(mapLabelDouble _) + } catch { + case e: Exception => udf(mapLabelStr _) + } + } + } + def updateLabels(isSingleNode:Boolean, df:DataFrame, X: MatrixBlock, labelColName:String, labelMapping: java.util.HashMap[Int, String]): DataFrame = { + if(isSingleNode) { + if(X.isInSparseFormat()) { + throw new RuntimeException("Since predicted label is a column vector, expected it to be in dense format") + } + for(i <- 0 until X.getNumRows) { + val v:Int = X.getValue(i, 0).toInt + if(labelMapping.containsKey(v)) { + X.setValue(i, 0, labelMapping.get(v).toDouble) + } + else { + throw new RuntimeException("No mapping found for " + v + " in " + labelMapping.toString()) + } + } + return null + } + else { + val serObj = new LabelMappingData(labelMapping) + return df.withColumn(labelColName, serObj.mapLabel_udf(df(labelColName))) + .withColumnRenamed(labelColName, "prediction") + } + } + def joinUsingID(df1:DataFrame, df2:DataFrame):DataFrame = { val tempDF1 = df1.withColumnRenamed("ID", "ID1") tempDF1.join(df2, tempDF1.col("ID1").equalTo(df2.col("ID"))).drop("ID1") } - def computePredictedClassLabelsFromProbability(mlscoreoutput:MLOutput, isSingleNode:Boolean, 
sc:SparkContext): MLOutput = { + def computePredictedClassLabelsFromProbability(mlscoreoutput:MLOutput, isSingleNode:Boolean, sc:SparkContext, inProbVar:String): MLOutput = { val mlNew = new MLContext(sc) if(isSingleNode) { - mlNew.registerInput("Prob", mlscoreoutput.getMatrixBlock("means"), mlscoreoutput.getMatrixCharacteristics("means")); + mlNew.registerInput("Prob", mlscoreoutput.getMatrixBlock(inProbVar), mlscoreoutput.getMatrixCharacteristics(inProbVar)); } else { - mlNew.registerInput("Prob", mlscoreoutput.getBinaryBlockedRDD("means"), mlscoreoutput.getMatrixCharacteristics("means")); + mlNew.registerInput("Prob", mlscoreoutput.getBinaryBlockedRDD(inProbVar), mlscoreoutput.getMatrixCharacteristics(inProbVar)); } mlNew.registerOutput("Prediction") mlNew.executeScript( diff --git a/src/main/scala/org/apache/sysml/api/ml/SVM.scala b/src/main/scala/org/apache/sysml/api/ml/SVM.scala new file mode 100644 index 00000000000..7a48c1ded13 --- /dev/null +++ b/src/main/scala/org/apache/sysml/api/ml/SVM.scala @@ -0,0 +1,187 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.sysml.api.ml + +import java.io.File +import org.apache.spark.SparkContext +import org.apache.spark.ml.{ Model, Estimator } +import org.apache.spark.sql.DataFrame +import org.apache.spark.sql.types.StructType +import org.apache.spark.ml.param.ParamMap +import org.apache.sysml.api.{ MLContext, MLOutput } +import org.apache.sysml.runtime.matrix.MatrixCharacteristics +import org.apache.sysml.runtime.matrix.data.MatrixBlock +import org.apache.sysml.runtime.DMLRuntimeException +import org.apache.sysml.runtime.instructions.spark.utils.{ RDDConverterUtilsExt => RDDConverterUtils } + +object SVM { + final val scriptPathBinary = "scripts" + File.separator + "algorithms" + File.separator + "l2-svm.dml" + final val scriptPathMulticlass = "scripts" + File.separator + "algorithms" + File.separator + "m-svm.dml" +} + +class SVM (override val uid: String, val sc: SparkContext, val isMultiClass:Boolean=false) extends Estimator[SVMModel] with HasIcpt + with HasRegParam with HasTol with HasMaxOuterIter { + + def setIcpt(value: Int) = set(icpt, value) + def setMaxIter(value: Int) = set(maxOuterIter, value) + def setRegParam(value: Double) = set(regParam, value) + def setTol(value: Double) = set(tol, value) + + def setModelParams(m:SVMModel):SVMModel = { + m.setIcpt(this.getIcpt).setMaxIter(this.getMaxOuterIte).setRegParam(this.getRegParam).setTol(this.getTol) + } + + override def copy(extra: ParamMap): Estimator[SVMModel] = { + val that = new SVM(uid, sc, isMultiClass) + copyValues(that, extra) + } + def transformSchema(schema: StructType): StructType = schema + + def fit(X_mb: MatrixBlock, y_mb: MatrixBlock): SVMModel = { + val ml = new MLContext(sc) + val revLabelMapping = new java.util.HashMap[Int, String] + PredictionUtils.fillLabelMapping(y_mb, revLabelMapping) + if(y_mb.getNumColumns != 1) { + throw new RuntimeException("Expected a column vector for y") + } + val mloutput = { + ml.registerInput("X", X_mb); + ml.registerInput("Y", y_mb); + ml.registerOutput("w"); + if(isMultiClass) + ml.executeScript(ScriptsUtils.getDMLScript(SVM.scriptPathMulticlass), getParamMap()) + else { + ml.executeScript(ScriptsUtils.getDMLScript(SVM.scriptPathBinary), getParamMap()) + } + } + setModelParams(new SVMModel("svm")(mloutput, sc, isMultiClass, revLabelMapping)) + } + + def getParamMap(): Map[String, String] = { + Map( "icpt" -> this.getIcpt.toString(), + "reg" -> this.getRegParam.toString(), + "tol" -> this.getTol.toString, + "maxiter" -> this.getMaxOuterIte.toString, + "X" -> " ", + "Y" -> " ", + "model" -> " ", + "Log" -> " ") + } + + def fit(df: DataFrame): SVMModel = { + val ml = new MLContext(df.rdd.sparkContext) + val mcXin = new MatrixCharacteristics() + val Xin = RDDConverterUtils.vectorDataFrameToBinaryBlock(sc, df, mcXin, false, "features") + val revLabelMapping = new java.util.HashMap[Int, String] + val yin = PredictionUtils.fillLabelMapping(df, revLabelMapping) + val mloutput = { + ml.registerInput("X", Xin, mcXin); + ml.registerInput("Y", yin, "csv"); + ml.registerOutput("w"); + if(isMultiClass) + ml.executeScript(ScriptsUtils.getDMLScript(SVM.scriptPathMulticlass), getParamMap()) + else { + ml.executeScript(ScriptsUtils.getDMLScript(SVM.scriptPathBinary), getParamMap()) + } + } + setModelParams(new SVMModel("svm")(mloutput, sc, isMultiClass, revLabelMapping)) + } + +} + +object SVMModel { + final val predictionScriptPathBinary = "scripts" + File.separator + "algorithms" + File.separator + "l2-svm-predict.dml" + final val predictionScriptPathMulticlass = "scripts" + 
File.separator + "algorithms" + File.separator + "m-svm-predict.dml" +} + +class SVMModel (override val uid: String)(val mloutput: MLOutput, val sc: SparkContext, val isMultiClass:Boolean, val labelMapping: java.util.HashMap[Int, String]) extends Model[SVMModel] with HasIcpt + with HasRegParam with HasTol with HasMaxOuterIter { + override def copy(extra: ParamMap): SVMModel = { + val that = new SVMModel(uid)(mloutput, sc, isMultiClass, labelMapping) + copyValues(that, extra) + } + + def setIcpt(value: Int) = set(icpt, value) + def setMaxIter(value: Int) = set(maxOuterIter, value) + def setRegParam(value: Double) = set(regParam, value) + def setTol(value: Double) = set(tol, value) + + override def transformSchema(schema: StructType): StructType = schema + + def transform(df: DataFrame): DataFrame = { + val ml = new MLContext(sc) + val mcXin = new MatrixCharacteristics() + val Xin = RDDConverterUtils.vectorDataFrameToBinaryBlock(df.rdd.sparkContext, df, mcXin, false, "features") + ml.registerInput("X", Xin, mcXin); + ml.registerOutput("scores"); + val glmPredOut = { + if(isMultiClass) { + ml.registerInput("W", mloutput.getBinaryBlockedRDD("w"), mloutput.getMatrixCharacteristics("w")); + ml.executeScript(ScriptsUtils.getDMLScript(SVMModel.predictionScriptPathMulticlass), getPredictParams()) + } + else { + ml.registerInput("w", mloutput.getBinaryBlockedRDD("w"), mloutput.getMatrixCharacteristics("w")); + ml.executeScript(ScriptsUtils.getDMLScript(SVMModel.predictionScriptPathBinary), getPredictParams()) + } + } + val isSingleNode = false + val predLabelOut = PredictionUtils.computePredictedClassLabelsFromProbability(glmPredOut, isSingleNode, sc, "scores") + val predictedDF = PredictionUtils.updateLabels(isSingleNode, predLabelOut.getDF(df.sqlContext, "Prediction"), null, "C1", labelMapping).select("ID", "prediction") + val dataset = RDDConverterUtils.addIDToDataFrame(df, df.sqlContext, "ID") + return PredictionUtils.joinUsingID(dataset, predictedDF) + } + + def transform(X: MatrixBlock): MatrixBlock = { + val ml = new MLContext(sc) + ml.registerInput("X", X); + ml.registerInput("w", mloutput.getMatrixBlock("w"), mloutput.getMatrixCharacteristics("w")); + ml.registerOutput("scores"); + val glmPredOut = { + if(isMultiClass) { + ml.registerInput("W", mloutput.getMatrixBlock("w"), mloutput.getMatrixCharacteristics("w")); + ml.executeScript(ScriptsUtils.getDMLScript(SVMModel.predictionScriptPathMulticlass), getPredictParams()) + } + else { + ml.registerInput("w", mloutput.getMatrixBlock("w"), mloutput.getMatrixCharacteristics("w")); + ml.executeScript(ScriptsUtils.getDMLScript(SVMModel.predictionScriptPathBinary), getPredictParams()) + } + } + val isSingleNode = true + val ret = PredictionUtils.computePredictedClassLabelsFromProbability(glmPredOut, isSingleNode, sc, "scores").getMatrixBlock("Prediction"); + if(ret.getNumColumns != 1) { + throw new RuntimeException("Expected predicted label to be a column vector") + } + PredictionUtils.updateLabels(true, null, ret, null, labelMapping) + return ret + } + + + def getPredictParams(): Map[String, String] = { + Map( "icpt" -> this.getIcpt.toString(), + "reg" -> this.getRegParam.toString(), + "tol" -> this.getTol.toString, + "maxiter" -> this.getMaxOuterIte.toString, + "X" -> " ", + "Y" -> " ", + "model" -> " ", + "Log" -> " ") + } + +} \ No newline at end of file From d4aff09030624e0cee8aed269bd20d64f422b8a8 Mon Sep 17 00:00:00 2001 From: Niketan Pansare Date: Sun, 7 Aug 2016 17:15:43 -0700 Subject: [PATCH 09/14] [SYSTEMML-234] [SYSTEMML-208] Updating 
the documentation --- docs/algorithms-classification.md | 139 ++++++++++++++++++++++++++++++ docs/algorithms-regression.md | 62 +++++++++++++ 2 files changed, 201 insertions(+) diff --git a/docs/algorithms-classification.md b/docs/algorithms-classification.md index 2488a8c7f1a..339f2d882fe 100644 --- a/docs/algorithms-classification.md +++ b/docs/algorithms-classification.md @@ -127,6 +127,15 @@ Eqs. (1) and (2). ### Usage
+
+import SystemML as sml +# C = 1/reg +logistic = sml.mllearn.LogisticRegression(sqlCtx, fit_intercept=True, max_iter=100, max_inner_iter=0, tol=0.000001, C=1.0) +# X_train, y_train and X_test can be NumPy matrices or Pandas DataFrame +y_test = logistic.fit(X_train, y_train).predict(X_test) +# df_train is DataFrame that contains two columns: "features" (of type Vector) and "label". df_test is a DataFrame that contains the column "features" +y_test = logistic.fit(df_train).transform(df_test) +
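The `C = 1/reg` comment follows the scikit-learn convention: larger `C` means weaker regularization. A minimal sketch of the mapping the Python wrapper is assumed to apply before handing `reg` to the DML script (rejecting non-positive `C` is part of that assumption):

{% highlight python %}
# A minimal sketch (assumption): a scikit-learn style C is inverted into the DML "reg" value.
def c_to_reg(C):
    if C <= 0:
        raise ValueError('C has to be positive')
    return 1.0 / C

print(c_to_reg(1.0))    # reg = 1.0
print(c_to_reg(10.0))   # reg = 0.1, i.e. weaker regularization for larger C
{% endhighlight %}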
hadoop jar SystemML.jar -f MultiLogReg.dml -nvargs X= @@ -214,6 +223,56 @@ SystemML Language Reference for details. ### Examples
+
+# Scikit-learn way +from sklearn import datasets, neighbors +import SystemML as sml +from pyspark.sql import SQLContext +sqlCtx = SQLContext(sc) +digits = datasets.load_digits() +X_digits = digits.data +y_digits = digits.target + 1 +n_samples = len(X_digits) +X_train = X_digits[:.9 * n_samples] +y_train = y_digits[:.9 * n_samples] +X_test = X_digits[.9 * n_samples:] +y_test = y_digits[.9 * n_samples:] +logistic = sml.mllearn.LogisticRegression(sqlCtx) +print('LogisticRegression score: %f' % logistic.fit(X_train, y_train).score(X_test, y_test)) + +# MLPipeline way +from pyspark.ml import Pipeline +import SystemML as sml +from pyspark.ml.feature import HashingTF, Tokenizer +from pyspark.sql import SQLContext +sqlCtx = SQLContext(sc) +training = sqlCtx.createDataFrame([ + (0L, "a b c d e spark", 1.0), + (1L, "b d", 2.0), + (2L, "spark f g h", 1.0), + (3L, "hadoop mapreduce", 2.0), + (4L, "b spark who", 1.0), + (5L, "g d a y", 2.0), + (6L, "spark fly", 1.0), + (7L, "was mapreduce", 2.0), + (8L, "e spark program", 1.0), + (9L, "a e c l", 2.0), + (10L, "spark compile", 1.0), + (11L, "hadoop software", 2.0) +], ["id", "text", "label"]) +tokenizer = Tokenizer(inputCol="text", outputCol="words") +hashingTF = HashingTF(inputCol="words", outputCol="features", numFeatures=20) +lr = sml.mllearn.LogisticRegression(sqlCtx) +pipeline = Pipeline(stages=[tokenizer, hashingTF, lr]) +model = pipeline.fit(training) +test = sqlCtx.createDataFrame([ + (12L, "spark i j k"), + (13L, "l m n"), + (14L, "mapreduce spark"), + (15L, "apache hadoop")], ["id", "text"]) +prediction = model.transform(test) +prediction.show() +
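When the held-out rows also carry labels, the pipeline output can be scored with Spark's `MulticlassClassificationEvaluator`. A short sketch, assuming the fitted `model` and the `sqlCtx` from the example above:

{% highlight python %}
# A short sketch (assumes the fitted Pipeline `model` and `sqlCtx` from above).
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

labeled_test = sqlCtx.createDataFrame([
    (12L, "spark i j k", 1.0),
    (13L, "l m n", 2.0),
    (14L, "mapreduce spark", 1.0),
    (15L, "apache hadoop", 2.0)], ["id", "text", "label"])
predictionAndLabels = model.transform(labeled_test).select("prediction", "label")
evaluator = MulticlassClassificationEvaluator()
print("Pipeline F1 score: %f" % evaluator.evaluate(predictionAndLabels))
{% endhighlight %}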
hadoop jar SystemML.jar -f MultiLogReg.dml -nvargs X=/user/ml/X.mtx @@ -393,6 +452,15 @@ support vector machine (`y` with domain size `2`). **Binary-Class Support Vector Machines**:
+
+import SystemML as sml +# C = 1/reg +svm = sml.mllearn.SVM(sqlCtx, fit_intercept=True, max_iter=100, tol=0.000001, C=1.0, is_multi_class=False) +# X_train, y_train and X_test can be NumPy matrices or Pandas DataFrame +y_test = svm.fit(X_train, y_train) +# df_train is DataFrame that contains two columns: "features" (of type Vector) and "label". df_test is a DataFrame that contains the column "features" +y_test = svm.fit(df_train) +
hadoop jar SystemML.jar -f l2-svm.dml -nvargs X= @@ -428,6 +496,12 @@ support vector machine (`y` with domain size `2`). **Binary-Class Support Vector Machines Prediction**:
+
+# X_test can be NumPy matrices or Pandas DataFrame +y_test = svm.predict(X_test) +# df_test is a DataFrame that contains the column "features" of type Vector +y_test = svm.transform(df_test) +
hadoop jar SystemML.jar -f l2-svm-predict.dml -nvargs X= @@ -630,6 +704,15 @@ class labels. **Multi-Class Support Vector Machines**:
+
+import SystemML as sml +# C = 1/reg +svm = sml.mllearn.SVM(sqlCtx, fit_intercept=True, max_iter=100, tol=0.000001, C=1.0, is_multi_class=True) +# X_train, y_train and X_test can be NumPy matrices or Pandas DataFrame +y_test = svm.fit(X_train, y_train) +# df_train is DataFrame that contains two columns: "features" (of type Vector) and "label". df_test is a DataFrame that contains the column "features" +y_test = svm.fit(df_train) +
hadoop jar SystemML.jar -f m-svm.dml -nvargs X= @@ -665,6 +748,12 @@ class labels. **Multi-Class Support Vector Machines Prediction**:
+
+# X_test can be NumPy matrices or Pandas DataFrame +y_test = svm.predict(X_test) +# df_test is a DataFrame that contains the column "features" of type Vector +y_test = svm.transform(df_test) +
hadoop jar SystemML.jar -f m-svm-predict.dml -nvargs X= @@ -747,6 +836,56 @@ SystemML Language Reference for details. **Multi-Class Support Vector Machines**:
+
+# Scikit-learn way +from sklearn import datasets, neighbors +import SystemML as sml +from pyspark.sql import SQLContext +sqlCtx = SQLContext(sc) +digits = datasets.load_digits() +X_digits = digits.data +y_digits = digits.target +n_samples = len(X_digits) +X_train = X_digits[:.9 * n_samples] +y_train = y_digits[:.9 * n_samples] +X_test = X_digits[.9 * n_samples:] +y_test = y_digits[.9 * n_samples:] +svm = sml.mllearn.SVM(sqlCtx, is_multi_class=True) +print('SVM score: %f' % svm.fit(X_train, y_train).score(X_test, y_test)) + +# MLPipeline way +from pyspark.ml import Pipeline +import SystemML as sml +from pyspark.ml.feature import HashingTF, Tokenizer +from pyspark.sql import SQLContext +sqlCtx = SQLContext(sc) +training = sqlCtx.createDataFrame([ + (0L, "a b c d e spark", 1.0), + (1L, "b d", 2.0), + (2L, "spark f g h", 1.0), + (3L, "hadoop mapreduce", 2.0), + (4L, "b spark who", 1.0), + (5L, "g d a y", 2.0), + (6L, "spark fly", 1.0), + (7L, "was mapreduce", 2.0), + (8L, "e spark program", 1.0), + (9L, "a e c l", 2.0), + (10L, "spark compile", 1.0), + (11L, "hadoop software", 2.0) +], ["id", "text", "label"]) +tokenizer = Tokenizer(inputCol="text", outputCol="words") +hashingTF = HashingTF(inputCol="words", outputCol="features", numFeatures=20) +svm = sml.mllearn.SVM(sqlCtx, is_multi_class=True) +pipeline = Pipeline(stages=[tokenizer, hashingTF, svm]) +model = pipeline.fit(training) +test = sqlCtx.createDataFrame([ + (12L, "spark i j k"), + (13L, "l m n"), + (14L, "mapreduce spark"), + (15L, "apache hadoop")], ["id", "text"]) +prediction = model.transform(test) +prediction.show() +
hadoop jar SystemML.jar -f m-svm.dml -nvargs X=/user/ml/X.mtx diff --git a/docs/algorithms-regression.md b/docs/algorithms-regression.md index 6472c176dfb..2ec549ccc60 100644 --- a/docs/algorithms-regression.md +++ b/docs/algorithms-regression.md @@ -80,6 +80,15 @@ efficient when the number of features $m$ is relatively small **Linear Regression - Direct Solve**:
+
+import SystemML as sml +# C = 1/reg +lr = sml.mllearn.LinearRegression(sqlCtx, fit_intercept=True, max_iter=100, tol=0.000001, C=1.0, solver='direct-solve') +# X_train, y_train and X_test can be NumPy matrices or Pandas DataFrame +y_test = lr.fit(X_train, y_train) +# df_train is DataFrame that contains two columns: "features" (of type Vector) and "label". df_test is a DataFrame that contains the column "features" +y_test = lr.fit(df_train) +
hadoop jar SystemML.jar -f LinearRegDS.dml -nvargs X= @@ -111,6 +120,15 @@ efficient when the number of features $m$ is relatively small **Linear Regression - Conjugate Gradient**:
+
+import SystemML as sml +# C = 1/reg +lr = sml.mllearn.LinearRegression(sqlCtx, fit_intercept=True, max_iter=100, tol=0.000001, C=1.0, solver='newton-cg') +# X_train, y_train and X_test can be NumPy matrices or Pandas DataFrame +y_test = lr.fit(X_train, y_train) +# df_train is DataFrame that contains two columns: "features" (of type Vector) and "label". df_test is a DataFrame that contains the column "features" +y_test = lr.fit(df_train) +
hadoop jar SystemML.jar -f LinearRegCG.dml -nvargs X= @@ -196,6 +214,28 @@ SystemML Language Reference for details. **Linear Regression - Direct Solve**:
+
+import numpy as np +from sklearn import datasets +import SystemML as sml +from pyspark.sql import SQLContext +# Load the diabetes dataset +diabetes = datasets.load_diabetes() +# Use only one feature +diabetes_X = diabetes.data[:, np.newaxis, 2] +# Split the data into training/testing sets +diabetes_X_train = diabetes_X[:-20] +diabetes_X_test = diabetes_X[-20:] +# Split the targets into training/testing sets +diabetes_y_train = diabetes.target[:-20] +diabetes_y_test = diabetes.target[-20:] +# Create linear regression object +regr = sml.mllearn.LinearRegression(sqlCtx, solver='direct-solve') +# Train the model using the training sets +regr.fit(diabetes_X_train, diabetes_y_train) +# The mean square error +print("Residual sum of squares: %.2f" % np.mean((regr.predict(diabetes_X_test) - diabetes_y_test) ** 2)) +
hadoop jar SystemML.jar -f LinearRegDS.dml -nvargs X=/user/ml/X.mtx @@ -227,6 +267,28 @@ SystemML Language Reference for details. **Linear Regression - Conjugate Gradient**:
+
+import numpy as np +from sklearn import datasets +import SystemML as sml +from pyspark.sql import SQLContext +# Load the diabetes dataset +diabetes = datasets.load_diabetes() +# Use only one feature +diabetes_X = diabetes.data[:, np.newaxis, 2] +# Split the data into training/testing sets +diabetes_X_train = diabetes_X[:-20] +diabetes_X_test = diabetes_X[-20:] +# Split the targets into training/testing sets +diabetes_y_train = diabetes.target[:-20] +diabetes_y_test = diabetes.target[-20:] +# Create linear regression object +regr = sml.mllearn.LinearRegression(sqlCtx, solver='newton-cg') +# Train the model using the training sets +regr.fit(diabetes_X_train, diabetes_y_train) +# The mean square error +print("Residual sum of squares: %.2f" % np.mean((regr.predict(diabetes_X_test) - diabetes_y_test) ** 2)) +
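The residual sum of squares above can be complemented by the estimator's own `score` method, which is assumed to report a variance-weighted R2 as in scikit-learn. A one-line sketch using the fitted `regr` from the example:

{% highlight python %}
# A one-line sketch (assumes the fitted `regr` from the example above).
print("Variance-weighted R^2: %.2f" % regr.score(diabetes_X_test, diabetes_y_test))
{% endhighlight %}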
hadoop jar SystemML.jar -f LinearRegCG.dml -nvargs X=/user/ml/X.mtx From ca671346e4e16134e0485ebf37de6d79e1254d30 Mon Sep 17 00:00:00 2001 From: Niketan Pansare Date: Sun, 7 Aug 2016 17:31:16 -0700 Subject: [PATCH 10/14] Updated documentation --- docs/algorithms-classification.md | 14 ++++++++++++++ docs/algorithms-regression.md | 8 ++++++++ 2 files changed, 22 insertions(+) diff --git a/docs/algorithms-classification.md b/docs/algorithms-classification.md index 339f2d882fe..4797429122a 100644 --- a/docs/algorithms-classification.md +++ b/docs/algorithms-classification.md @@ -128,6 +128,7 @@ Eqs. (1) and (2).
+{% highlight python %} import SystemML as sml # C = 1/reg logistic = sml.mllearn.LogisticRegression(sqlCtx, fit_intercept=True, max_iter=100, max_inner_iter=0, tol=0.000001, C=1.0) @@ -135,6 +136,7 @@ logistic = sml.mllearn.LogisticRegression(sqlCtx, fit_intercept=True, max_iter=1 y_test = logistic.fit(X_train, y_train).predict(X_test) # df_train is DataFrame that contains two columns: "features" (of type Vector) and "label". df_test is a DataFrame that contains the column "features" y_test = logistic.fit(df_train).transform(df_test) +{% endhighlight %}
hadoop jar SystemML.jar -f MultiLogReg.dml @@ -224,6 +226,7 @@ SystemML Language Reference for details.
+{% highlight python %} # Scikit-learn way from sklearn import datasets, neighbors import SystemML as sml @@ -272,6 +275,7 @@ test = sqlCtx.createDataFrame([ (15L, "apache hadoop")], ["id", "text"]) prediction = model.transform(test) prediction.show() +{% endhighlight %}
hadoop jar SystemML.jar -f MultiLogReg.dml @@ -453,6 +457,7 @@ support vector machine (`y` with domain size `2`).
+{% highlight python %} import SystemML as sml # C = 1/reg svm = sml.mllearn.SVM(sqlCtx, fit_intercept=True, max_iter=100, tol=0.000001, C=1.0, is_multi_class=False) @@ -460,6 +465,7 @@ svm = sml.mllearn.SVM(sqlCtx, fit_intercept=True, max_iter=100, tol=0.000001, C= y_test = svm.fit(X_train, y_train) # df_train is DataFrame that contains two columns: "features" (of type Vector) and "label". df_test is a DataFrame that contains the column "features" y_test = svm.fit(df_train) +{% endhighlight %}
hadoop jar SystemML.jar -f l2-svm.dml @@ -497,10 +503,12 @@ y_test = svm.fit(df_train)
+{% highlight python %} # X_test can be NumPy matrices or Pandas DataFrame y_test = svm.predict(X_test) # df_test is a DataFrame that contains the column "features" of type Vector y_test = svm.transform(df_test) +{% endhighlight %}
hadoop jar SystemML.jar -f l2-svm-predict.dml @@ -705,6 +713,7 @@ class labels.
+{% highlight python %} import SystemML as sml # C = 1/reg svm = sml.mllearn.SVM(sqlCtx, fit_intercept=True, max_iter=100, tol=0.000001, C=1.0, is_multi_class=True) @@ -712,6 +721,7 @@ svm = sml.mllearn.SVM(sqlCtx, fit_intercept=True, max_iter=100, tol=0.000001, C= y_test = svm.fit(X_train, y_train) # df_train is DataFrame that contains two columns: "features" (of type Vector) and "label". df_test is a DataFrame that contains the column "features" y_test = svm.fit(df_train) +{% endhighlight %}
hadoop jar SystemML.jar -f m-svm.dml @@ -749,10 +759,12 @@ y_test = svm.fit(df_train)
+{% highlight python %} # X_test can be NumPy matrices or Pandas DataFrame y_test = svm.predict(X_test) # df_test is a DataFrame that contains the column "features" of type Vector y_test = svm.transform(df_test) +{% endhighlight %}
hadoop jar SystemML.jar -f m-svm-predict.dml @@ -837,6 +849,7 @@ SystemML Language Reference for details.
+{% highlight python %} # Scikit-learn way from sklearn import datasets, neighbors import SystemML as sml @@ -885,6 +898,7 @@ test = sqlCtx.createDataFrame([ (15L, "apache hadoop")], ["id", "text"]) prediction = model.transform(test) prediction.show() +{% endhighlight %}
hadoop jar SystemML.jar -f m-svm.dml diff --git a/docs/algorithms-regression.md b/docs/algorithms-regression.md index 2ec549ccc60..628abcea230 100644 --- a/docs/algorithms-regression.md +++ b/docs/algorithms-regression.md @@ -81,6 +81,7 @@ efficient when the number of features $m$ is relatively small
+{% highlight python %} import SystemML as sml # C = 1/reg lr = sml.mllearn.LinearRegression(sqlCtx, fit_intercept=True, max_iter=100, tol=0.000001, C=1.0, solver='direct-solve') @@ -88,6 +89,7 @@ lr = sml.mllearn.LinearRegression(sqlCtx, fit_intercept=True, max_iter=100, tol= y_test = lr.fit(X_train, y_train) # df_train is DataFrame that contains two columns: "features" (of type Vector) and "label". df_test is a DataFrame that contains the column "features" y_test = lr.fit(df_train) +{% endhighlight %}
hadoop jar SystemML.jar -f LinearRegDS.dml @@ -121,6 +123,7 @@ y_test = lr.fit(df_train)
+{% highlight python %} import SystemML as sml # C = 1/reg lr = sml.mllearn.LinearRegression(sqlCtx, fit_intercept=True, max_iter=100, tol=0.000001, C=1.0, solver='newton-cg') @@ -128,6 +131,7 @@ lr = sml.mllearn.LinearRegression(sqlCtx, fit_intercept=True, max_iter=100, tol= y_test = lr.fit(X_train, y_train) # df_train is DataFrame that contains two columns: "features" (of type Vector) and "label". df_test is a DataFrame that contains the column "features" y_test = lr.fit(df_train) +{% endhighlight %}
hadoop jar SystemML.jar -f LinearRegCG.dml @@ -215,6 +219,7 @@ SystemML Language Reference for details.
+{% highlight python %} import numpy as np from sklearn import datasets import SystemML as sml @@ -235,6 +240,7 @@ regr = sml.mllearn.LinearRegression(sqlCtx, solver='direct-solve') regr.fit(diabetes_X_train, diabetes_y_train) # The mean square error print("Residual sum of squares: %.2f" % np.mean((regr.predict(diabetes_X_test) - diabetes_y_test) ** 2)) +{% endhighlight %}
hadoop jar SystemML.jar -f LinearRegDS.dml @@ -268,6 +274,7 @@ print("Residual sum of squares: %.2f" % np.mean((regr.predict(diabetes_X_test) -
+{% highlight python %} import numpy as np from sklearn import datasets import SystemML as sml @@ -288,6 +295,7 @@ regr = sml.mllearn.LinearRegression(sqlCtx, solver='newton-cg') regr.fit(diabetes_X_train, diabetes_y_train) # The mean square error print("Residual sum of squares: %.2f" % np.mean((regr.predict(diabetes_X_test) - diabetes_y_test) ** 2)) +{% endhighlight %}
hadoop jar SystemML.jar -f LinearRegCG.dml From cfe6087b16dc0f4d401e68bb22b72dae3fa272b4 Mon Sep 17 00:00:00 2001 From: Niketan Pansare Date: Sun, 7 Aug 2016 17:51:57 -0700 Subject: [PATCH 11/14] Updating the LinRegDS documentation --- docs/algorithms-regression.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/algorithms-regression.md b/docs/algorithms-regression.md index 628abcea230..2153342d1aa 100644 --- a/docs/algorithms-regression.md +++ b/docs/algorithms-regression.md @@ -84,7 +84,7 @@ efficient when the number of features $m$ is relatively small {% highlight python %} import SystemML as sml # C = 1/reg -lr = sml.mllearn.LinearRegression(sqlCtx, fit_intercept=True, max_iter=100, tol=0.000001, C=1.0, solver='direct-solve') +lr = sml.mllearn.LinearRegression(sqlCtx, fit_intercept=True, C=1.0, solver='direct-solve') # X_train, y_train and X_test can be NumPy matrices or Pandas DataFrame y_test = lr.fit(X_train, y_train) # df_train is DataFrame that contains two columns: "features" (of type Vector) and "label". df_test is a DataFrame that contains the column "features" From 65eb8889b053b062ee91b96545d405fa6e82d9c9 Mon Sep 17 00:00:00 2001 From: Niketan Pansare Date: Sun, 7 Aug 2016 22:34:34 -0700 Subject: [PATCH 12/14] Added naive bayes and scipy sparse matrix --- docs/algorithms-classification.md | 49 +++++- docs/algorithms-regression.md | 4 +- scripts/algorithms/naive-bayes-predict.dml | 15 +- scripts/algorithms/naive-bayes.dml | 5 +- .../org/apache/sysml/api/python/SystemML.py | 35 +++- .../java/org/apache/sysml/api/python/test.py | 52 ++++++ .../spark/utils/RDDConverterUtilsExt.java | 18 ++ .../sysml/api/ml/LogisticRegression.scala | 2 +- .../org/apache/sysml/api/ml/NaiveBayes.scala | 156 ++++++++++++++++++ 9 files changed, 313 insertions(+), 23 deletions(-) create mode 100644 src/main/scala/org/apache/sysml/api/ml/NaiveBayes.scala diff --git a/docs/algorithms-classification.md b/docs/algorithms-classification.md index 4797429122a..f25d78ea459 100644 --- a/docs/algorithms-classification.md +++ b/docs/algorithms-classification.md @@ -132,7 +132,7 @@ Eqs. (1) and (2). import SystemML as sml # C = 1/reg logistic = sml.mllearn.LogisticRegression(sqlCtx, fit_intercept=True, max_iter=100, max_inner_iter=0, tol=0.000001, C=1.0) -# X_train, y_train and X_test can be NumPy matrices or Pandas DataFrame +# X_train, y_train and X_test can be NumPy matrices or Pandas DataFrame or SciPy Sparse Matrix y_test = logistic.fit(X_train, y_train).predict(X_test) # df_train is DataFrame that contains two columns: "features" (of type Vector) and "label". df_test is a DataFrame that contains the column "features" y_test = logistic.fit(df_train).transform(df_test) @@ -461,7 +461,7 @@ support vector machine (`y` with domain size `2`). import SystemML as sml # C = 1/reg svm = sml.mllearn.SVM(sqlCtx, fit_intercept=True, max_iter=100, tol=0.000001, C=1.0, is_multi_class=False) -# X_train, y_train and X_test can be NumPy matrices or Pandas DataFrame +# X_train, y_train and X_test can be NumPy matrices or Pandas DataFrame or SciPy Sparse Matrix y_test = svm.fit(X_train, y_train) # df_train is DataFrame that contains two columns: "features" (of type Vector) and "label". df_test is a DataFrame that contains the column "features" y_test = svm.fit(df_train) @@ -504,7 +504,7 @@ y_test = svm.fit(df_train)
{% highlight python %} -# X_test can be NumPy matrices or Pandas DataFrame +# X_test can be NumPy matrices or Pandas DataFrame or SciPy Sparse Matrix y_test = svm.predict(X_test) # df_test is a DataFrame that contains the column "features" of type Vector y_test = svm.transform(df_test) @@ -717,7 +717,7 @@ class labels. import SystemML as sml # C = 1/reg svm = sml.mllearn.SVM(sqlCtx, fit_intercept=True, max_iter=100, tol=0.000001, C=1.0, is_multi_class=True) -# X_train, y_train and X_test can be NumPy matrices or Pandas DataFrame +# X_train, y_train and X_test can be NumPy matrices or Pandas DataFrame or SciPy Sparse Matrix y_test = svm.fit(X_train, y_train) # df_train is DataFrame that contains two columns: "features" (of type Vector) and "label". df_test is a DataFrame that contains the column "features" y_test = svm.fit(df_train) @@ -760,7 +760,7 @@ y_test = svm.fit(df_train)
{% highlight python %} -# X_test can be NumPy matrices or Pandas DataFrame +# X_test can be NumPy matrices or Pandas DataFrame or SciPy Sparse Matrix y_test = svm.predict(X_test) # df_test is a DataFrame that contains the column "features" of type Vector y_test = svm.transform(df_test) @@ -1024,6 +1024,16 @@ applicable when all features are counts of categorical values. **Naive Bayes**:
+
+{% highlight python %} +import SystemML as sml +nb = sml.mllearn.NaiveBayes(sqlCtx, laplace=1.0) +# X_train, y_train and X_test can be NumPy matrices or Pandas DataFrame or SciPy Sparse Matrix +y_test = nb.fit(X_train, y_train) +# df_train is DataFrame that contains two columns: "features" (of type Vector) and "label". df_test is a DataFrame that contains the column "features" +y_test = nb.fit(df_train) +{% endhighlight %} +
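The `laplace` argument is the additive smoothing constant applied to the class-conditional counts. A quick worked sketch of why it matters (the exact DML formula is assumed here): a feature never observed with a class still receives a small non-zero probability.

{% highlight python %}
# A worked sketch of additive (Laplace) smoothing; the exact DML formula is assumed.
# count: occurrences of feature j in class c, total: all feature counts in class c.
def smoothed_prob(count, total, num_features, laplace=1.0):
    return (count + laplace) / (total + laplace * num_features)

print(smoothed_prob(0, 100, 20))    # unseen feature, still > 0
print(smoothed_prob(30, 100, 20))   # frequent feature
{% endhighlight %}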
hadoop jar SystemML.jar -f naive-bayes.dml -nvargs X= @@ -1055,6 +1065,14 @@ applicable when all features are counts of categorical values. **Naive Bayes Prediction**:
+
+{% highlight python %} +# X_test can be NumPy matrices or Pandas DataFrame or SciPy Sparse Matrix +y_test = nb.predict(X_test) +# df_test is a DataFrame that contains the column "features" of type Vector +y_test = nb.transform(df_test) +{% endhighlight %} +
hadoop jar SystemML.jar -f naive-bayes-predict.dml -nvargs X= @@ -1127,6 +1145,27 @@ SystemML Language Reference for details. **Naive Bayes**:
+
+{% highlight python %} +from sklearn.datasets import fetch_20newsgroups +from sklearn.feature_extraction.text import TfidfVectorizer +import SystemML as sml +from sklearn import metrics +from pyspark.sql import SQLContext +sqlCtx = SQLContext(sc) +categories = ['alt.atheism', 'talk.religion.misc', 'comp.graphics', 'sci.space'] +newsgroups_train = fetch_20newsgroups(subset='train', categories=categories) +newsgroups_test = fetch_20newsgroups(subset='test', categories=categories) +vectorizer = TfidfVectorizer() +# Both vectors and vectors_test are SciPy CSR matrix +vectors = vectorizer.fit_transform(newsgroups_train.data) +vectors_test = vectorizer.transform(newsgroups_test.data) +nb = sml.mllearn.NaiveBayes(sqlCtx) +nb.fit(vectors, newsgroups_train.target) +pred = nb.predict(vectors_test) +metrics.f1_score(newsgroups_test.target, pred, average='weighted') +{% endhighlight %} +
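The SciPy CSR matrices above are accepted directly because the Python wrapper is assumed to convert them to COO form and ship the value, row, and column arrays to the JVM-side converter. A small sketch of that conversion step, leaving the byte-buffer hand-off itself out:

{% highlight python %}
# A small sketch of the SciPy hand-off (assumes `vectors` is the CSR matrix from above);
# the actual transfer into convertSciPyCOOToMB happens on the JVM side.
import numpy as np
from scipy.sparse import coo_matrix

coo = coo_matrix(vectors, dtype=np.float64)
data = coo.data.astype(np.float64)   # non-zero values
rows = coo.row.astype(np.int32)      # their row indices
cols = coo.col.astype(np.int32)      # their column indices
print(coo.shape, len(data))          # matrix dimensions and number of non-zeros transferred
{% endhighlight %}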
hadoop jar SystemML.jar -f naive-bayes.dml -nvargs X=/user/ml/X.mtx diff --git a/docs/algorithms-regression.md b/docs/algorithms-regression.md index 2153342d1aa..5241f5f1d68 100644 --- a/docs/algorithms-regression.md +++ b/docs/algorithms-regression.md @@ -85,7 +85,7 @@ efficient when the number of features $m$ is relatively small import SystemML as sml # C = 1/reg lr = sml.mllearn.LinearRegression(sqlCtx, fit_intercept=True, C=1.0, solver='direct-solve') -# X_train, y_train and X_test can be NumPy matrices or Pandas DataFrame +# X_train, y_train and X_test can be NumPy matrices or Pandas DataFrame or SciPy Sparse Matrix y_test = lr.fit(X_train, y_train) # df_train is DataFrame that contains two columns: "features" (of type Vector) and "label". df_test is a DataFrame that contains the column "features" y_test = lr.fit(df_train) @@ -127,7 +127,7 @@ y_test = lr.fit(df_train) import SystemML as sml # C = 1/reg lr = sml.mllearn.LinearRegression(sqlCtx, fit_intercept=True, max_iter=100, tol=0.000001, C=1.0, solver='newton-cg') -# X_train, y_train and X_test can be NumPy matrices or Pandas DataFrame +# X_train, y_train and X_test can be NumPy matrices or Pandas DataFrames or SciPy Sparse matrices y_test = lr.fit(X_train, y_train) # df_train is DataFrame that contains two columns: "features" (of type Vector) and "label". df_test is a DataFrame that contains the column "features" y_test = lr.fit(df_train) diff --git a/scripts/algorithms/naive-bayes-predict.dml b/scripts/algorithms/naive-bayes-predict.dml index e6f8fa4a5e9..b687bfa77ab 100644 --- a/scripts/algorithms/naive-bayes-predict.dml +++ b/scripts/algorithms/naive-bayes-predict.dml @@ -28,7 +28,6 @@ cmdLine_Y = ifdef($Y, " ") cmdLine_accuracy = ifdef($accuracy, " ") cmdLine_confusion = ifdef($confusion, " ") -cmdLine_probabilities = ifdef($probabilities, " ") cmdLine_fmt = ifdef($fmt, "text") D = read($X) @@ -51,13 +50,13 @@ model = append(conditionals, prior) log_probs = D_w_ones %*% t(log(model)) -if(cmdLine_probabilities != " "){ - mx = rowMaxs(log_probs) - ones = matrix(1, rows=1, cols=nrow(prior)) - probs = log_probs - mx %*% ones - probs = exp(probs)/(rowSums(exp(probs)) %*% ones) - write(probs, cmdLine_probabilities, format=cmdLine_fmt) -} + +mx = rowMaxs(log_probs) +ones = matrix(1, rows=1, cols=nrow(prior)) +probs = log_probs - mx %*% ones +probs = exp(probs)/(rowSums(exp(probs)) %*% ones) +write(probs, $probabilities, format=cmdLine_fmt) + if(cmdLine_Y != " "){ C = read(cmdLine_Y) diff --git a/scripts/algorithms/naive-bayes.dml b/scripts/algorithms/naive-bayes.dml index a01a5fc0d4f..c1dc44c7c5f 100644 --- a/scripts/algorithms/naive-bayes.dml +++ b/scripts/algorithms/naive-bayes.dml @@ -74,7 +74,10 @@ acc = sum(rowIndexMax(logProbs) == C) / numRows * 100 acc_str = "Training Accuracy (%): " + acc print(acc_str) -write(acc, $accuracy) +accuracyFile = $accuracy +if(accuracyFile != " ") { + write(acc, accuracyFile) +} extraModelParams = as.matrix(numFeatures) classPrior = rbind(classPrior, extraModelParams) diff --git a/src/main/java/org/apache/sysml/api/python/SystemML.py b/src/main/java/org/apache/sysml/api/python/SystemML.py index 6f711f11767..bf25ef96e58 100644 --- a/src/main/java/org/apache/sysml/api/python/SystemML.py +++ b/src/main/java/org/apache/sysml/api/python/SystemML.py @@ -35,7 +35,8 @@ from pyspark.mllib.linalg import Vectors import sys from pyspark.ml import Estimator, Model - +from scipy.sparse import spmatrix +from scipy.sparse import coo_matrix class MLContext(object): @@ -269,7 +270,19 @@ def getNumCols(numPyArr): 
return numPyArr.shape[1] def convertToMatrixBlock(sc, src): - if isinstance(sc, SparkContext): + if isinstance(src, spmatrix): + src = coo_matrix(src, dtype=np.float64) + numRows = src.shape[0] + numCols = src.shape[1] + data = src.data.astype(np.float64) + row = src.row.astype(np.int32) + col = src.col.astype(np.int32) + nnz = len(src.col) + buf1 = bytearray(data.tostring()) + buf2 = bytearray(row.tostring()) + buf3 = bytearray(col.tostring()) + return sc._jvm.org.apache.sysml.runtime.instructions.spark.utils.RDDConverterUtilsExt.convertSciPyCOOToMB(buf1, buf2, buf3, numRows, numCols, nnz) + elif isinstance(sc, SparkContext): src = np.asarray(src) numCols = getNumCols(src) numRows = src.shape[0] @@ -319,7 +332,7 @@ def _fit(self, X): def fit(self, X, y=None, params=None): if y is None: return self._fit(X) - elif y is not None and (isinstance(X, np.ndarray) or isinstance(X, pd.core.frame.DataFrame)): + elif y is not None and (isinstance(X, np.ndarray) or isinstance(X, pd.core.frame.DataFrame) or isinstance(X, spmatrix)): if self.transferUsingDF: pdfX = convertToPandasDF(X) pdfY = convertToPandasDF(y) @@ -346,7 +359,7 @@ def transform(self, X): return self.predict(X) def predict(self, X): - if isinstance(X, np.ndarray) or isinstance(X, pd.core.frame.DataFrame): + if isinstance(X, np.ndarray) or isinstance(X, pd.core.frame.DataFrame) or isinstance(X, spmatrix): if self.transferUsingDF: pdfX = convertToPandasDF(X) df = assemble(self.sqlCtx, pdfX, pdfX.columns, 'features').select('features') @@ -442,5 +455,15 @@ def __init__(self, sqlCtx, fit_intercept=True, max_iter=100, tol=0.000001, C=1.0 self.estimator.setTol(tol) self.estimator.setIcpt(int(fit_intercept)) self.transferUsingDF = transferUsingDF - self.setOutputRawPredictionsToFalse = False - \ No newline at end of file + self.setOutputRawPredictionsToFalse = False + + class NaiveBayes(BaseSystemMLEstimator): + + def __init__(self, sqlCtx, laplace=1.0, transferUsingDF=False): + self.sqlCtx = sqlCtx + self.sc = sqlCtx._sc + self.uid = "nb" + self.estimator = self.sc._jvm.org.apache.sysml.api.ml.NaiveBayes(self.uid, self.sc._jsc.sc()) + self.estimator.setLaplace(laplace) + self.transferUsingDF = transferUsingDF + self.setOutputRawPredictionsToFalse = False \ No newline at end of file diff --git a/src/main/java/org/apache/sysml/api/python/test.py b/src/main/java/org/apache/sysml/api/python/test.py index 9a9ee055b2d..21a1f79fd5c 100644 --- a/src/main/java/org/apache/sysml/api/python/test.py +++ b/src/main/java/org/apache/sysml/api/python/test.py @@ -1,3 +1,24 @@ +#!/usr/bin/python +#------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+# +#------------------------------------------------------------- from sklearn import datasets, neighbors import SystemML as sml from pyspark.sql import SQLContext @@ -7,6 +28,9 @@ from pyspark.ml import Pipeline from pyspark.ml.feature import HashingTF, Tokenizer import numpy as np +from sklearn.datasets import fetch_20newsgroups +from sklearn.feature_extraction.text import TfidfVectorizer +from sklearn import metrics sc = SparkContext() sqlCtx = SQLContext(sc) @@ -122,5 +146,33 @@ def testSVMSK2(self): score = svm.fit(X_train, y_train).score(X_test, y_test) self.failUnless(score > 0.9) + def testNaiveBayesSK1(self): + digits = datasets.load_digits() + X_digits = digits.data + y_digits = digits.target + n_samples = len(X_digits) + X_train = X_digits[:.9 * n_samples] + y_train = y_digits[:.9 * n_samples] + X_test = X_digits[.9 * n_samples:] + y_test = y_digits[.9 * n_samples:] + nb = sml.mllearn.NaiveBayes(sqlCtx) + score = nb.fit(X_train, y_train).score(X_test, y_test) + self.failUnless(score > 0.85) + + def testNaiveBayesSK2(self): + categories = ['alt.atheism', 'talk.religion.misc', 'comp.graphics', 'sci.space'] + newsgroups_train = fetch_20newsgroups(subset='train', categories=categories) + newsgroups_test = fetch_20newsgroups(subset='test', categories=categories) + vectorizer = TfidfVectorizer() + # Both vectors and vectors_test are SciPy CSR matrix + vectors = vectorizer.fit_transform(newsgroups_train.data) + vectors_test = vectorizer.transform(newsgroups_test.data) + nb = sml.mllearn.NaiveBayes(sqlCtx) + nb.fit(vectors, newsgroups_train.target) + pred = nb.predict(vectors_test) + score = metrics.f1_score(newsgroups_test.target, pred, average='weighted') + self.failUnless(score > 0.8) + + if __name__ == '__main__': unittest.main() diff --git a/src/main/java/org/apache/sysml/runtime/instructions/spark/utils/RDDConverterUtilsExt.java b/src/main/java/org/apache/sysml/runtime/instructions/spark/utils/RDDConverterUtilsExt.java index 114e78fc3d3..72ab2303ecf 100644 --- a/src/main/java/org/apache/sysml/runtime/instructions/spark/utils/RDDConverterUtilsExt.java +++ b/src/main/java/org/apache/sysml/runtime/instructions/spark/utils/RDDConverterUtilsExt.java @@ -266,6 +266,24 @@ public static MatrixBlock convertPy4JArrayToMB(byte [] data, int rlen, int clen) return convertPy4JArrayToMB(data, rlen, clen, false); } + public static MatrixBlock convertSciPyCOOToMB(byte [] data, byte [] row, byte [] col, int rlen, int clen, int nnz) throws DMLRuntimeException { + MatrixBlock mb = new MatrixBlock(rlen, clen, true); + mb.allocateSparseRowsBlock(false); + ByteBuffer buf1 = ByteBuffer.wrap(data); + buf1.order(ByteOrder.nativeOrder()); + ByteBuffer buf2 = ByteBuffer.wrap(row); + buf2.order(ByteOrder.nativeOrder()); + ByteBuffer buf3 = ByteBuffer.wrap(col); + buf3.order(ByteOrder.nativeOrder()); + for(int i = 0; i < nnz; i++) { + double val = buf1.getDouble(); + int rowIndex = buf2.getInt(); + int colIndex = buf3.getInt(); + mb.setValue(rowIndex, colIndex, val); // TODO: Improve the performance + } + return mb; + } + public static MatrixBlock convertPy4JArrayToMB(byte [] data, int rlen, int clen, boolean isSparse) throws DMLRuntimeException { MatrixBlock mb = new MatrixBlock(rlen, clen, isSparse, -1); if(isSparse) { diff --git a/src/main/scala/org/apache/sysml/api/ml/LogisticRegression.scala b/src/main/scala/org/apache/sysml/api/ml/LogisticRegression.scala index f9ddf9c9964..7e6b9223dc6 100644 --- a/src/main/scala/org/apache/sysml/api/ml/LogisticRegression.scala +++ 
b/src/main/scala/org/apache/sysml/api/ml/LogisticRegression.scala @@ -159,7 +159,7 @@ class LogisticRegressionModel( if(ret.getNumColumns != 1) { throw new RuntimeException("Expected predicted label to be a column vector") } - PredictionUtils.updateLabels(true, null, ret, null, labelMapping) + PredictionUtils.updateLabels(isSingleNode, null, ret, null, labelMapping) return ret } } diff --git a/src/main/scala/org/apache/sysml/api/ml/NaiveBayes.scala b/src/main/scala/org/apache/sysml/api/ml/NaiveBayes.scala new file mode 100644 index 00000000000..a6fc367f41d --- /dev/null +++ b/src/main/scala/org/apache/sysml/api/ml/NaiveBayes.scala @@ -0,0 +1,156 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.sysml.api.ml + +import java.io.File +import org.apache.spark.SparkContext +import org.apache.spark.ml.{ Model, Estimator } +import org.apache.spark.sql.DataFrame +import org.apache.spark.sql.types.StructType +import org.apache.spark.ml.param.{ Params, Param, ParamMap, DoubleParam } +import org.apache.sysml.api.{ MLContext, MLOutput } +import org.apache.sysml.runtime.matrix.MatrixCharacteristics +import org.apache.sysml.runtime.matrix.data.MatrixBlock +import org.apache.sysml.runtime.DMLRuntimeException +import org.apache.sysml.runtime.instructions.spark.utils.{ RDDConverterUtilsExt => RDDConverterUtils } + +trait HasLaplace extends Params { + final val laplace: Param[Double] = new Param[Double](this, "laplace", "Laplace smoothing specified by the user to avoid creation of 0 probabilities.") + setDefault(laplace, 1.0) + final def getLaplace: Double = $(laplace) +} + +object NaiveBayes { + final val scriptPath = "scripts" + File.separator + "algorithms" + File.separator + "naive-bayes.dml" +} + +class NaiveBayes(override val uid: String, val sc: SparkContext) extends Estimator[NaiveBayesModel] with HasLaplace { + override def copy(extra: ParamMap): Estimator[NaiveBayesModel] = { + val that = new NaiveBayes(uid, sc) + copyValues(that, extra) + } + def setLaplace(value: Double) = set(laplace, value) + override def transformSchema(schema: StructType): StructType = schema + + // Note: will update the y_mb as this will be called by Python mllearn + def fit(X_mb: MatrixBlock, y_mb: MatrixBlock): NaiveBayesModel = { + val ml = new MLContext(sc) + val revLabelMapping = new java.util.HashMap[Int, String] + PredictionUtils.fillLabelMapping(y_mb, revLabelMapping) + + val mloutput = { + ml.registerInput("D", X_mb); + ml.registerInput("C", y_mb); + ml.registerOutput("classPrior"); + ml.registerOutput("classConditionals"); + ml.executeScript(ScriptsUtils.getDMLScript(NaiveBayes.scriptPath), getParamMap()) + } + new NaiveBayesModel("naivebayes")(mloutput, revLabelMapping, sc) + } + + def fit(df: DataFrame): NaiveBayesModel = { + val ml 
= new MLContext(df.rdd.sparkContext) + val mcXin = new MatrixCharacteristics() + val Xin = RDDConverterUtils.vectorDataFrameToBinaryBlock(sc, df, mcXin, false, "features") + val revLabelMapping = new java.util.HashMap[Int, String] + val yin = PredictionUtils.fillLabelMapping(df, revLabelMapping) + val mloutput = { + ml.registerInput("D", Xin, mcXin); + ml.registerInput("C", yin, "csv"); + ml.registerOutput("classPrior"); + ml.registerOutput("classConditionals"); + ml.executeScript(ScriptsUtils.getDMLScript(NaiveBayes.scriptPath), getParamMap()) + } + new NaiveBayesModel("naive")(mloutput, revLabelMapping, sc) + } + + def getParamMap(): Map[String, String] = { + Map("X" -> " ", + "Y" -> " ", + "prior" -> " ", + "conditionals" -> " ", + "accuracy" -> " ", + "laplace" -> getLaplace.toString()) + } +} + + +object NaiveBayesModel { + final val scriptPath = "scripts" + File.separator + "algorithms" + File.separator + "naive-bayes-predict.dml" +} + +class NaiveBayesModel( + override val uid: String)( + val mloutput: MLOutput, val labelMapping: java.util.HashMap[Int, String], val sc: SparkContext) extends Model[NaiveBayesModel] with HasLaplace { + override def copy(extra: ParamMap): NaiveBayesModel = { + val that = new NaiveBayesModel(uid)(mloutput, labelMapping, sc) + copyValues(that, extra) + } + + def transformSchema(schema: StructType): StructType = schema + + var priorMB: MatrixBlock = null + var conditionalMB: MatrixBlock = null + def setPriorAndConditional(prior:MatrixBlock, conditional:MatrixBlock) { + priorMB = prior + conditionalMB = conditional + } + + def transform(X: MatrixBlock): MatrixBlock = { + val isSingleNode = true + val ml = new MLContext(sc) + ml.registerInput("D", X) + ml.registerInput("prior", mloutput.getMatrixBlock("classPrior"), mloutput.getMatrixCharacteristics("classPrior")) + ml.registerInput("conditionals", mloutput.getMatrixBlock("classConditionals"), mloutput.getMatrixCharacteristics("classConditionals")) + ml.registerOutput("probs") + val nbPredict = ml.executeScript(ScriptsUtils.getDMLScript(NaiveBayesModel.scriptPath), getPredictParams()) + val ret = PredictionUtils.computePredictedClassLabelsFromProbability(nbPredict, isSingleNode, sc, "probs").getMatrixBlock("Prediction"); + if(ret.getNumColumns != 1) { + throw new RuntimeException("Expected predicted label to be a column vector") + } + PredictionUtils.updateLabels(isSingleNode, null, ret, null, labelMapping) + return ret + } + + def transform(df: org.apache.spark.sql.DataFrame): org.apache.spark.sql.DataFrame = { + val isSingleNode = false + val ml = new MLContext(sc) + val mcXin = new MatrixCharacteristics() + val Xin = RDDConverterUtils.vectorDataFrameToBinaryBlock(df.rdd.sparkContext, df, mcXin, false, "features") + ml.registerInput("D", Xin, mcXin); + ml.registerInput("prior", mloutput.getMatrixBlock("classPrior"), mloutput.getMatrixCharacteristics("classPrior")) + ml.registerInput("conditionals", mloutput.getMatrixBlock("classConditionals"), mloutput.getMatrixCharacteristics("classConditionals")) + ml.registerOutput("probs") + val nbPredict = ml.executeScript(ScriptsUtils.getDMLScript(NaiveBayesModel.scriptPath), getPredictParams()) + val predLabelOut = PredictionUtils.computePredictedClassLabelsFromProbability(nbPredict, isSingleNode, sc, "probs") + val predictedDF = PredictionUtils.updateLabels(isSingleNode, predLabelOut.getDF(df.sqlContext, "Prediction"), null, "C1", labelMapping).select("ID", "prediction") + val prob = nbPredict.getDF(df.sqlContext, "probs", true).withColumnRenamed("C1", 
"probability").select("ID", "probability") + val dataset = RDDConverterUtils.addIDToDataFrame(df, df.sqlContext, "ID") + return PredictionUtils.joinUsingID(dataset, PredictionUtils.joinUsingID(prob, predictedDF)) + } + + def getPredictParams(): Map[String, String] = { + Map("X" -> " ", + "prior" -> " ", + "conditionals" -> " ", + "probabilities" -> " ") + } + +} \ No newline at end of file From 21e91c7dc6bbe0ea7314e262c890b222c677935f Mon Sep 17 00:00:00 2001 From: Niketan Pansare Date: Mon, 8 Aug 2016 22:57:37 -0700 Subject: [PATCH 13/14] Added BaseSystemMLClassifier and updated the classifier to use new MLContext --- .../java/org/apache/sysml/api/MLContext.java | 142 +++++++++------- .../java/org/apache/sysml/api/MLOutput.java | 48 +++--- .../api/mlcontext/BinaryBlockMatrix.java | 9 + .../mlcontext/MLContextConversionUtil.java | 13 +- .../sysml/api/mlcontext/MLContextUtil.java | 29 +++- .../apache/sysml/api/mlcontext/MLResults.java | 9 +- .../apache/sysml/api/mlcontext/Matrix.java | 2 +- .../org/apache/sysml/api/python/SystemML.py | 17 +- .../sysml/api/ml/BaseSystemMLClassifier.scala | 157 ++++++++++++++++++ .../sysml/api/ml/LogisticRegression.scala | 154 +++++------------ .../org/apache/sysml/api/ml/NaiveBayes.scala | 131 +++++---------- .../apache/sysml/api/ml/PredictionUtils.scala | 36 ++++ .../scala/org/apache/sysml/api/ml/SVM.scala | 156 +++++------------ 13 files changed, 479 insertions(+), 424 deletions(-) create mode 100644 src/main/scala/org/apache/sysml/api/ml/BaseSystemMLClassifier.scala diff --git a/src/main/java/org/apache/sysml/api/MLContext.java b/src/main/java/org/apache/sysml/api/MLContext.java index 54f313e7026..d8a290d107a 100644 --- a/src/main/java/org/apache/sysml/api/MLContext.java +++ b/src/main/java/org/apache/sysml/api/MLContext.java @@ -65,6 +65,7 @@ import org.apache.sysml.runtime.controlprogram.caching.MatrixObject; import org.apache.sysml.runtime.controlprogram.context.ExecutionContext; import org.apache.sysml.runtime.controlprogram.context.ExecutionContextFactory; +import org.apache.sysml.runtime.controlprogram.context.SparkExecutionContext; import org.apache.sysml.runtime.instructions.Instruction; import org.apache.sysml.runtime.instructions.cp.Data; import org.apache.sysml.runtime.instructions.spark.data.RDDObject; @@ -476,25 +477,6 @@ public void registerInput(String varName, RDD rdd, String format, long r registerInput(varName, rdd.toJavaRDD().mapToPair(new ConvertStringToLongTextPair()), format, rlen, clen, nnz, null); } - public void registerInput(String varName, MatrixBlock mb) throws DMLRuntimeException { - MatrixCharacteristics mc = new MatrixCharacteristics(mb.getNumRows(), mb.getNumColumns(), OptimizerUtils.DEFAULT_BLOCKSIZE, OptimizerUtils.DEFAULT_BLOCKSIZE, mb.getNonZeros()); - registerInput(varName, mb, mc); - } - - public void registerInput(String varName, MatrixBlock mb, MatrixCharacteristics mc) throws DMLRuntimeException { - if(_variables == null) - _variables = new LocalVariableMap(); - if(_inVarnames == null) - _inVarnames = new ArrayList(); - - MatrixObject mo = new MatrixObject(ValueType.DOUBLE, "temp", new MatrixFormatMetaData(mc, OutputInfo.BinaryBlockOutputInfo, InputInfo.BinaryBlockInputInfo)); - mo.acquireModify(mb); - mo.release(); - _variables.put(varName, mo); - _inVarnames.add(varName); - checkIfRegisteringInputAllowed(); - } - // All CSV related methods call this ... It provides access to dimensions, nnz, file properties. 
private void registerInput(String varName, JavaPairRDD textOrCsv_rdd, String format, long rlen, long clen, long nnz, FileFormatProperties props) throws DMLRuntimeException { if(!(DMLScript.rtplatform == RUNTIME_PLATFORM.SPARK || DMLScript.rtplatform == RUNTIME_PLATFORM.HYBRID_SPARK)) { @@ -618,6 +600,24 @@ public void registerInput(String varName, JavaPairRDD checkIfRegisteringInputAllowed(); } + public void registerInput(String varName, MatrixBlock mb) throws DMLRuntimeException { + MatrixCharacteristics mc = new MatrixCharacteristics(mb.getNumRows(), mb.getNumColumns(), OptimizerUtils.DEFAULT_BLOCKSIZE, OptimizerUtils.DEFAULT_BLOCKSIZE, mb.getNonZeros()); + registerInput(varName, mb, mc); + } + + public void registerInput(String varName, MatrixBlock mb, MatrixCharacteristics mc) throws DMLRuntimeException { + if(_variables == null) + _variables = new LocalVariableMap(); + if(_inVarnames == null) + _inVarnames = new ArrayList(); + MatrixObject mo = new MatrixObject(ValueType.DOUBLE, "temp", new MatrixFormatMetaData(mc, OutputInfo.BinaryBlockOutputInfo, InputInfo.BinaryBlockInputInfo)); + mo.acquireModify(mb); + mo.release(); + _variables.put(varName, mo); + _inVarnames.add(varName); + checkIfRegisteringInputAllowed(); + } + // ============================================================================================= /** @@ -1240,56 +1240,80 @@ private MLOutput compileAndExecuteScript(String dmlScriptFilePath, String [] arg * @throws ParseException */ private synchronized MLOutput compileAndExecuteScript(String dmlScriptFilePath, String [] args, boolean isFile, boolean isNamedArgument, boolean isPyDML, String configFilePath) throws IOException, DMLException { - // Set active MLContext. - _activeMLContext = this; - - if(_monitorUtils != null) { - _monitorUtils.resetMonitoringData(); - } - - if(DMLScript.rtplatform == RUNTIME_PLATFORM.SPARK || DMLScript.rtplatform == RUNTIME_PLATFORM.HYBRID_SPARK) { - - // Depending on whether registerInput/registerOutput was called initialize the variables - String[] inputs; String[] outputs; - if(_inVarnames != null) { - inputs = _inVarnames.toArray(new String[0]); - } - else { - inputs = new String[0]; - } - if(_outVarnames != null) { - outputs = _outVarnames.toArray(new String[0]); - } - else { - outputs = new String[0]; + try { + if(getActiveMLContext() != null) { + throw new DMLRuntimeException("SystemML (and hence by definition MLContext) doesnot support parallel execute() calls from same or different MLContexts. " + + "As a temporary fix, please do explicit synchronization, i.e. synchronized(MLContext.class) { ml.execute(...) } "); } - Map outMetadata = new HashMap(); - Map argVals = DMLScript.createArgumentsMap(isNamedArgument, args); + // Set active MLContext. 
+ _activeMLContext = this; - // Run the DML script - ExecutionContext ec = executeUsingSimplifiedCompilationChain(dmlScriptFilePath, isFile, argVals, isPyDML, inputs, outputs, _variables, configFilePath); + if(_monitorUtils != null) { + _monitorUtils.resetMonitoringData(); + } - // Now collect the output - if(_outVarnames != null) { - if(_variables == null) { - throw new DMLRuntimeException("The symbol table returned after executing the script is empty"); + if(DMLScript.rtplatform == RUNTIME_PLATFORM.SPARK || DMLScript.rtplatform == RUNTIME_PLATFORM.HYBRID_SPARK) { + + Map> retVal = null; + + // Depending on whether registerInput/registerOutput was called initialize the variables + String[] inputs; String[] outputs; + if(_inVarnames != null) { + inputs = _inVarnames.toArray(new String[0]); + } + else { + inputs = new String[0]; + } + if(_outVarnames != null) { + outputs = _outVarnames.toArray(new String[0]); } + else { + outputs = new String[0]; + } + Map outMetadata = new HashMap(); + + Map argVals = DMLScript.createArgumentsMap(isNamedArgument, args); - for( String ovar : _outVarnames ) { - if( _variables.keySet().contains(ovar) ) { - outMetadata.put(ovar, ec.getMatrixCharacteristics(ovar)); // For converting output to dataframe + // Run the DML script + ExecutionContext ec = executeUsingSimplifiedCompilationChain(dmlScriptFilePath, isFile, argVals, isPyDML, inputs, outputs, _variables, configFilePath); + + // Now collect the output + if(_outVarnames != null) { + if(_variables == null) { + throw new DMLRuntimeException("The symbol table returned after executing the script is empty"); } - else { - throw new DMLException("Error: The variable " + ovar + " is not available as output after the execution of the DMLScript."); + + for( String ovar : _outVarnames ) { + if( _variables.keySet().contains(ovar) ) { + if(retVal == null) { + retVal = new HashMap>(); + } + retVal.put(ovar, ((SparkExecutionContext) ec).getBinaryBlockRDDHandleForVariable(ovar)); + outMetadata.put(ovar, ec.getMatrixCharacteristics(ovar)); // For converting output to dataframe + } + else { + throw new DMLException("Error: The variable " + ovar + " is not available as output after the execution of the DMLScript."); + } } } + + return new MLOutput(retVal, outMetadata); } - - return new MLOutput(_variables, ec, outMetadata); + else { + throw new DMLRuntimeException("Unsupported runtime:" + DMLScript.rtplatform.name()); + } + } - else { - throw new DMLRuntimeException("Unsupported runtime:" + DMLScript.rtplatform.name()); + finally { + // Remove global dml config and all thread-local configs + // TODO enable cleanup whenever invalid GNMF MLcontext is fixed + // (the test is invalid because it assumes that status of previous execute is kept) + //ConfigurationManager.setGlobalConfig(new DMLConfig()); + //ConfigurationManager.clearLocalConfigs(); + + // Reset active MLContext. 
+ _activeMLContext = null; } } @@ -1451,4 +1475,4 @@ public MLMatrix read(SQLContext sqlContext, String filePath, String format) thro // return MLMatrix.createMLMatrix(this, sqlContext, blocks, mc); // } -} +} \ No newline at end of file diff --git a/src/main/java/org/apache/sysml/api/MLOutput.java b/src/main/java/org/apache/sysml/api/MLOutput.java index 3ef68a9f151..55daf176510 100644 --- a/src/main/java/org/apache/sysml/api/MLOutput.java +++ b/src/main/java/org/apache/sysml/api/MLOutput.java @@ -39,8 +39,6 @@ import org.apache.spark.sql.types.StructField; import org.apache.spark.sql.types.StructType; import org.apache.sysml.runtime.DMLRuntimeException; -import org.apache.sysml.runtime.controlprogram.LocalVariableMap; -import org.apache.sysml.runtime.controlprogram.context.ExecutionContext; import org.apache.sysml.runtime.controlprogram.context.SparkExecutionContext; import org.apache.sysml.runtime.instructions.spark.functions.GetMLBlock; import org.apache.sysml.runtime.instructions.spark.utils.RDDConverterUtilsExt; @@ -48,7 +46,7 @@ import org.apache.sysml.runtime.matrix.data.MatrixBlock; import org.apache.sysml.runtime.matrix.data.MatrixIndexes; import org.apache.sysml.runtime.util.UtilFunctions; -import org.apache.sysml.runtime.controlprogram.caching.MatrixObject; + import scala.Tuple2; /** @@ -57,39 +55,31 @@ */ public class MLOutput { - private LocalVariableMap _variables; - private ExecutionContext _ec; + Map> _outputs; private Map _outMetadata = null; - public MLOutput(LocalVariableMap variables, ExecutionContext ec, Map outMetadata) { - this._variables = variables; - this._ec = ec; - this._outMetadata = outMetadata; - } - public MatrixBlock getMatrixBlock(String varName) throws DMLRuntimeException { - if( _variables.keySet().contains(varName) ) { - MatrixObject mo = _ec.getMatrixObject(varName); - MatrixBlock mb = mo.acquireRead(); - mo.release(); - return mb; - } - else { - throw new DMLRuntimeException("Variable " + varName + " not found in the output symbol table."); - } + MatrixCharacteristics mc = getMatrixCharacteristics(varName); + // The matrix block is always pushed to an RDD and then we do collect + // We can later avoid this by returning symbol table rather than "Map> _outputs" + MatrixBlock mb = SparkExecutionContext.toMatrixBlock(getBinaryBlockedRDD(varName), (int) mc.getRows(), (int) mc.getCols(), + mc.getRowsPerBlock(), mc.getColsPerBlock(), mc.getNonZeros()); + return mb; + } + public MLOutput(Map> outputs, Map outMetadata) { + this._outputs = outputs; + this._outMetadata = outMetadata; } public JavaPairRDD getBinaryBlockedRDD(String varName) throws DMLRuntimeException { - if( _variables.keySet().contains(varName) ) { - return ((SparkExecutionContext) _ec).getBinaryBlockRDDHandleForVariable(varName); - } - else { - throw new DMLRuntimeException("Variable " + varName + " not found in the output symbol table."); + if(_outputs.containsKey(varName)) { + return _outputs.get(varName); } + throw new DMLRuntimeException("Variable " + varName + " not found in the output symbol table."); } public MatrixCharacteristics getMatrixCharacteristics(String varName) throws DMLRuntimeException { - if(_outMetadata.containsKey(varName)) { + if(_outputs.containsKey(varName)) { return _outMetadata.get(varName); } throw new DMLRuntimeException("Variable " + varName + " not found in the output symbol table."); @@ -255,7 +245,7 @@ public Iterable>> call(Tuple2>> retVal = new ArrayList>>(); for(int i = 0; i < lrlen; i++) { @@ -263,7 +253,7 @@ public Iterable>> call(Tuple2>(startRowIndex + i 
+ 1, new Tuple2(kv._1.getColumnIndex(), partialRow))); + retVal.add(new Tuple2>(startRowIndex + i, new Tuple2(kv._1.getColumnIndex(), partialRow))); } return retVal; } @@ -427,4 +417,4 @@ public Row call(Tuple2>> arg0) return RowFactory.create(row); } } -} +} \ No newline at end of file diff --git a/src/main/java/org/apache/sysml/api/mlcontext/BinaryBlockMatrix.java b/src/main/java/org/apache/sysml/api/mlcontext/BinaryBlockMatrix.java index 8c9f923ad5d..ea6fcf0a55b 100644 --- a/src/main/java/org/apache/sysml/api/mlcontext/BinaryBlockMatrix.java +++ b/src/main/java/org/apache/sysml/api/mlcontext/BinaryBlockMatrix.java @@ -21,6 +21,8 @@ import org.apache.spark.api.java.JavaPairRDD; import org.apache.spark.sql.DataFrame; +import org.apache.sysml.runtime.DMLRuntimeException; +import org.apache.sysml.runtime.controlprogram.context.SparkExecutionContext; import org.apache.sysml.runtime.matrix.MatrixCharacteristics; import org.apache.sysml.runtime.matrix.data.MatrixBlock; import org.apache.sysml.runtime.matrix.data.MatrixIndexes; @@ -97,6 +99,13 @@ public BinaryBlockMatrix(JavaPairRDD binaryBlocks, public JavaPairRDD getBinaryBlocks() { return binaryBlocks; } + + public MatrixBlock getMatrixBlock() throws DMLRuntimeException { + MatrixCharacteristics mc = getMatrixCharacteristics(); + MatrixBlock mb = SparkExecutionContext.toMatrixBlock(binaryBlocks, (int) mc.getRows(), (int) mc.getCols(), + mc.getRowsPerBlock(), mc.getColsPerBlock(), mc.getNonZeros()); + return mb; + } /** * Obtain the SystemML binary-block matrix characteristics diff --git a/src/main/java/org/apache/sysml/api/mlcontext/MLContextConversionUtil.java b/src/main/java/org/apache/sysml/api/mlcontext/MLContextConversionUtil.java index 33226d2b87a..161ad174fd9 100644 --- a/src/main/java/org/apache/sysml/api/mlcontext/MLContextConversionUtil.java +++ b/src/main/java/org/apache/sysml/api/mlcontext/MLContextConversionUtil.java @@ -676,7 +676,7 @@ public static double[][] matrixObjectToDoubleMatrix(MatrixObject matrixObject) { * @return the {@code MatrixObject} converted to a {@code DataFrame} */ public static DataFrame matrixObjectToDataFrame(MatrixObject matrixObject, - SparkExecutionContext sparkExecutionContext) { + SparkExecutionContext sparkExecutionContext, boolean isVectorDF) { try { @SuppressWarnings("unchecked") JavaPairRDD binaryBlockMatrix = (JavaPairRDD) sparkExecutionContext @@ -686,8 +686,17 @@ public static DataFrame matrixObjectToDataFrame(MatrixObject matrixObject, MLContext activeMLContext = (MLContext) MLContextProxy.getActiveMLContext(); SparkContext sc = activeMLContext.getSparkContext(); SQLContext sqlContext = new SQLContext(sc); - DataFrame df = RDDConverterUtilsExt.binaryBlockToDataFrame(binaryBlockMatrix, matrixCharacteristics, + DataFrame df = null; + if(isVectorDF) { + df = RDDConverterUtilsExt.binaryBlockToVectorDataFrame(binaryBlockMatrix, matrixCharacteristics, + sqlContext); + } + else { + df = RDDConverterUtilsExt.binaryBlockToDataFrame(binaryBlockMatrix, matrixCharacteristics, sqlContext); + } + + return df; } catch (DMLRuntimeException e) { throw new MLContextException("DMLRuntimeException while converting matrix object to DataFrame", e); diff --git a/src/main/java/org/apache/sysml/api/mlcontext/MLContextUtil.java b/src/main/java/org/apache/sysml/api/mlcontext/MLContextUtil.java index feb616ecefd..fc942e98c52 100644 --- a/src/main/java/org/apache/sysml/api/mlcontext/MLContextUtil.java +++ b/src/main/java/org/apache/sysml/api/mlcontext/MLContextUtil.java @@ -44,7 +44,9 @@ import 
org.apache.sysml.conf.ConfigurationManager; import org.apache.sysml.conf.DMLConfig; import org.apache.sysml.parser.ParseException; +import org.apache.sysml.parser.Expression.ValueType; import org.apache.sysml.runtime.controlprogram.LocalVariableMap; +import org.apache.sysml.runtime.controlprogram.caching.CacheException; import org.apache.sysml.runtime.controlprogram.caching.FrameObject; import org.apache.sysml.runtime.controlprogram.caching.MatrixObject; import org.apache.sysml.runtime.instructions.cp.BooleanObject; @@ -52,8 +54,12 @@ import org.apache.sysml.runtime.instructions.cp.DoubleObject; import org.apache.sysml.runtime.instructions.cp.IntObject; import org.apache.sysml.runtime.instructions.cp.StringObject; +import org.apache.sysml.runtime.matrix.MatrixCharacteristics; +import org.apache.sysml.runtime.matrix.MatrixFormatMetaData; +import org.apache.sysml.runtime.matrix.data.InputInfo; import org.apache.sysml.runtime.matrix.data.MatrixBlock; import org.apache.sysml.runtime.matrix.data.MatrixIndexes; +import org.apache.sysml.runtime.matrix.data.OutputInfo; /** * Utility class containing methods for working with the MLContext API. @@ -72,7 +78,7 @@ public final class MLContextUtil { */ @SuppressWarnings("rawtypes") public static final Class[] COMPLEX_DATA_TYPES = { JavaRDD.class, RDD.class, DataFrame.class, - BinaryBlockMatrix.class, Matrix.class, (new double[][] {}).getClass() }; + BinaryBlockMatrix.class, Matrix.class, (new double[][] {}).getClass(), MatrixBlock.class }; /** * All data types supported by the MLContext API @@ -391,6 +397,8 @@ public static Map convertInputParametersForParser(Map getRDDStringIJV(String outputName) { */ public DataFrame getDataFrame(String outputName) { MatrixObject mo = getMatrixObject(outputName); - DataFrame df = MLContextConversionUtil.matrixObjectToDataFrame(mo, sparkExecutionContext); + DataFrame df = MLContextConversionUtil.matrixObjectToDataFrame(mo, sparkExecutionContext, false); + return df; + } + + public DataFrame getDataFrame(String outputName, boolean isVectorDF) { + MatrixObject mo = getMatrixObject(outputName); + DataFrame df = MLContextConversionUtil.matrixObjectToDataFrame(mo, sparkExecutionContext, isVectorDF); return df; } @@ -271,6 +277,7 @@ public Matrix getMatrix(String outputName) { Matrix matrix = new Matrix(mo, sparkExecutionContext); return matrix; } + /** * Obtain an output as a {@code BinaryBlockMatrix}. 
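A minimal Scala sketch of how the accessors added above might be exercised through the new MLContext API, assuming an existing SparkContext `sc`; the one-line DML string and the output name `B_out` are illustrative only:

    import org.apache.sysml.api.mlcontext._
    import org.apache.sysml.api.mlcontext.ScriptFactory._
    val ml = new MLContext(sc)
    // Any DML script with a registered output works here
    val script = dml("B_out = matrix(1, rows=3, cols=3)").out("B_out")
    val results = ml.execute(script)
    val df = results.getDataFrame("B_out", true)                      // DataFrame with a Vector column
    val mb = results.getBinaryBlockMatrix("B_out").getMatrixBlock     // local MatrixBlock
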
diff --git a/src/main/java/org/apache/sysml/api/mlcontext/Matrix.java b/src/main/java/org/apache/sysml/api/mlcontext/Matrix.java index 178a6e5a04d..3ee41b7fa85 100644 --- a/src/main/java/org/apache/sysml/api/mlcontext/Matrix.java +++ b/src/main/java/org/apache/sysml/api/mlcontext/Matrix.java @@ -108,7 +108,7 @@ public RDD asRDDStringIJV() { * @return the matrix as a {@code DataFrame} */ public DataFrame asDataFrame() { - DataFrame df = MLContextConversionUtil.matrixObjectToDataFrame(matrixObject, sparkExecutionContext); + DataFrame df = MLContextConversionUtil.matrixObjectToDataFrame(matrixObject, sparkExecutionContext, false); return df; } diff --git a/src/main/java/org/apache/sysml/api/python/SystemML.py b/src/main/java/org/apache/sysml/api/python/SystemML.py index bf25ef96e58..c03bd1bab49 100644 --- a/src/main/java/org/apache/sysml/api/python/SystemML.py +++ b/src/main/java/org/apache/sysml/api/python/SystemML.py @@ -38,6 +38,8 @@ from scipy.sparse import spmatrix from scipy.sparse import coo_matrix +SUPPORTED_TYPES = (np.ndarray, pd.DataFrame, spmatrix) + class MLContext(object): """ @@ -274,7 +276,7 @@ def convertToMatrixBlock(sc, src): src = coo_matrix(src, dtype=np.float64) numRows = src.shape[0] numCols = src.shape[1] - data = src.data.astype(np.float64) + data = src.data row = src.row.astype(np.int32) col = src.col.astype(np.int32) nnz = len(src.col) @@ -308,12 +310,7 @@ def convertToPandasDF(X): return X def tolist(inputCols): - if isinstance(inputCols, pd.indexes.base.Index): - return inputCols.get_values().tolist() - elif isinstance(inputCols, list): - return inputCols - else: - raise Exception('inputCols should be of type pandas.indexes.base.Index or list') + return list(inputCols) def assemble(sqlCtx, pdf, inputCols, outputCol): tmpDF = sqlCtx.createDataFrame(pdf, tolist(pdf.columns)) @@ -322,6 +319,8 @@ def assemble(sqlCtx, pdf, inputCols, outputCol): class mllearn: class BaseSystemMLEstimator(Estimator): + # TODO: Allow users to set featuresCol (with default 'features') and labelCol (with default 'label') + def _fit(self, X): if hasattr(X, '_jdf') and 'features' in X.columns and 'label' in X.columns: self.model = self.estimator.fit(X._jdf) @@ -332,7 +331,7 @@ def _fit(self, X): def fit(self, X, y=None, params=None): if y is None: return self._fit(X) - elif y is not None and (isinstance(X, np.ndarray) or isinstance(X, pd.core.frame.DataFrame) or isinstance(X, spmatrix)): + elif y is not None and isinstance(X, SUPPORTED_TYPES): if self.transferUsingDF: pdfX = convertToPandasDF(X) pdfY = convertToPandasDF(y) @@ -359,7 +358,7 @@ def transform(self, X): return self.predict(X) def predict(self, X): - if isinstance(X, np.ndarray) or isinstance(X, pd.core.frame.DataFrame) or isinstance(X, spmatrix): + if isinstance(X, SUPPORTED_TYPES): if self.transferUsingDF: pdfX = convertToPandasDF(X) df = assemble(self.sqlCtx, pdfX, pdfX.columns, 'features').select('features') diff --git a/src/main/scala/org/apache/sysml/api/ml/BaseSystemMLClassifier.scala b/src/main/scala/org/apache/sysml/api/ml/BaseSystemMLClassifier.scala new file mode 100644 index 00000000000..5174aabdb72 --- /dev/null +++ b/src/main/scala/org/apache/sysml/api/ml/BaseSystemMLClassifier.scala @@ -0,0 +1,157 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.sysml.api.ml + +import org.apache.spark.rdd.RDD +import java.io.File +import org.apache.spark.SparkContext +import org.apache.spark.ml.{ Model, Estimator } +import org.apache.spark.sql.DataFrame +import org.apache.spark.sql.types.StructType +import org.apache.spark.ml.param.{ Params, Param, ParamMap, DoubleParam } +import org.apache.sysml.runtime.matrix.MatrixCharacteristics +import org.apache.sysml.runtime.matrix.data.MatrixBlock +import org.apache.sysml.runtime.DMLRuntimeException +import org.apache.sysml.runtime.instructions.spark.utils.{ RDDConverterUtilsExt => RDDConverterUtils } +import org.apache.sysml.api.mlcontext._ +import org.apache.sysml.api.mlcontext.ScriptFactory._ + +trait HasLaplace extends Params { + final val laplace: Param[Double] = new Param[Double](this, "laplace", "Laplace smoothing specified by the user to avoid creation of 0 probabilities.") + setDefault(laplace, 1.0) + final def getLaplace: Double = $(laplace) +} +trait HasIcpt extends Params { + final val icpt: Param[Int] = new Param[Int](this, "icpt", "Intercept presence, shifting and rescaling X columns") + setDefault(icpt, 0) + final def getIcpt: Int = $(icpt) +} +trait HasMaxOuterIter extends Params { + final val maxOuterIter: Param[Int] = new Param[Int](this, "maxOuterIter", "max. number of outer (Newton) iterations") + setDefault(maxOuterIter, 100) + final def getMaxOuterIte: Int = $(maxOuterIter) +} +trait HasMaxInnerIter extends Params { + final val maxInnerIter: Param[Int] = new Param[Int](this, "maxInnerIter", "max. 
number of inner (conjugate gradient) iterations, 0 = no max") + setDefault(maxInnerIter, 0) + final def getMaxInnerIter: Int = $(maxInnerIter) +} +trait HasTol extends Params { + final val tol: DoubleParam = new DoubleParam(this, "tol", "the convergence tolerance for iterative algorithms") + setDefault(tol, 0.000001) + final def getTol: Double = $(tol) +} +trait HasRegParam extends Params { + final val regParam: DoubleParam = new DoubleParam(this, "tol", "the convergence tolerance for iterative algorithms") + setDefault(regParam, 0.000001) + final def getRegParam: Double = $(regParam) +} + + +trait BaseSystemMLClassifier { + def transformSchema(schema: StructType): StructType = schema + + // Returns the script and variables for X and y + def getTrainingScript(isSingleNode:Boolean):(Script, String, String) + + def fit(X_mb: MatrixBlock, y_mb: MatrixBlock, sc: SparkContext): (MLResults, java.util.HashMap[Int, String]) = { + val isSingleNode = true + val ml = new org.apache.sysml.api.mlcontext.MLContext(sc) + val revLabelMapping = new java.util.HashMap[Int, String] + PredictionUtils.fillLabelMapping(y_mb, revLabelMapping) + val ret = getTrainingScript(isSingleNode) + val script = ret._1.in(ret._2, X_mb).in(ret._3, y_mb) + (ml.execute(script), revLabelMapping) + } + + def fit(df: DataFrame, sc: SparkContext): (MLResults, java.util.HashMap[Int, String]) = { + val isSingleNode = false + val ml = new MLContext(df.rdd.sparkContext) + val mcXin = new MatrixCharacteristics() + val Xin = RDDConverterUtils.vectorDataFrameToBinaryBlock(sc, df, mcXin, false, "features") + val revLabelMapping = new java.util.HashMap[Int, String] + val yin = PredictionUtils.fillLabelMapping(df, revLabelMapping) + val ret = getTrainingScript(isSingleNode) + val Xbin = new BinaryBlockMatrix(Xin, mcXin) + val script = ret._1.in(ret._2, Xbin).in(ret._3, yin) + (ml.execute(script), revLabelMapping) + } + + def toDouble(i:Int): java.lang.Double = { + double2Double(i.toDouble) + } + def toDouble(d:Double): java.lang.Double = { + double2Double(d) + } + +} + +trait BaseSystemMLClassifierModel { + + def toDouble(i:Int): java.lang.Double = { + double2Double(i.toDouble) + } + def toDouble(d:Double): java.lang.Double = { + double2Double(d) + } + + def transformSchema(schema: StructType): StructType = schema + + // Returns the script and variable for X + def getPredictionScript(mloutput: MLResults, isSingleNode:Boolean): (Script, String) + + def transform(X: MatrixBlock, mloutput: MLResults, labelMapping: java.util.HashMap[Int, String], sc: SparkContext, probVar:String): MatrixBlock = { + val isSingleNode = true + val ml = new MLContext(sc) + val script = getPredictionScript(mloutput, isSingleNode) + val modelPredict = ml.execute(script._1.in(script._2, X)) + val ret = PredictionUtils.computePredictedClassLabelsFromProbability(modelPredict, isSingleNode, sc, probVar) + .getBinaryBlockMatrix("Prediction").getMatrixBlock + + if(ret.getNumColumns != 1) { + throw new RuntimeException("Expected predicted label to be a column vector") + } + PredictionUtils.updateLabels(isSingleNode, null, ret, null, labelMapping) + return ret + } + + def transform(df: DataFrame, mloutput: MLResults, labelMapping: java.util.HashMap[Int, String], sc: SparkContext, + probVar:String, outputProb:Boolean=true): DataFrame = { + val isSingleNode = false + val ml = new MLContext(sc) + val mcXin = new MatrixCharacteristics() + val Xin = RDDConverterUtils.vectorDataFrameToBinaryBlock(df.rdd.sparkContext, df, mcXin, false, "features") + val script = 
getPredictionScript(mloutput, isSingleNode) + val Xin_bin = new BinaryBlockMatrix(Xin, mcXin) + val modelPredict = ml.execute(script._1.in(script._2, Xin_bin)) + val predLabelOut = PredictionUtils.computePredictedClassLabelsFromProbability(modelPredict, isSingleNode, sc, probVar) + val predictedDF = PredictionUtils.updateLabels(isSingleNode, predLabelOut.getDataFrame("Prediction"), null, "C1", labelMapping).select("ID", "prediction") + if(outputProb) { + val prob = modelPredict.getDataFrame(probVar, true).withColumnRenamed("C1", "probability").select("ID", "probability") + val dataset = RDDConverterUtils.addIDToDataFrame(df, df.sqlContext, "ID") + return PredictionUtils.joinUsingID(dataset, PredictionUtils.joinUsingID(prob, predictedDF)) + } + else { + val dataset = RDDConverterUtils.addIDToDataFrame(df, df.sqlContext, "ID") + return PredictionUtils.joinUsingID(dataset, predictedDF) + } + + } +} \ No newline at end of file diff --git a/src/main/scala/org/apache/sysml/api/ml/LogisticRegression.scala b/src/main/scala/org/apache/sysml/api/ml/LogisticRegression.scala index 7e6b9223dc6..3098da9c21d 100644 --- a/src/main/scala/org/apache/sysml/api/ml/LogisticRegression.scala +++ b/src/main/scala/org/apache/sysml/api/ml/LogisticRegression.scala @@ -19,50 +19,20 @@ package org.apache.sysml.api.ml +import org.apache.spark.rdd.RDD import java.io.File -import org.apache.sysml.api.{ MLContext, MLOutput } -import org.apache.sysml.runtime.matrix.MatrixCharacteristics -import org.apache.sysml.runtime.instructions.spark.utils.{ RDDConverterUtilsExt => RDDConverterUtils } -import org.apache.spark.{ SparkContext } +import org.apache.spark.SparkContext +import org.apache.spark.ml.{ Model, Estimator } import org.apache.spark.sql.DataFrame import org.apache.spark.sql.types.StructType -import org.apache.spark.ml.{ Model, Estimator } -import org.apache.spark.ml.classification._ import org.apache.spark.ml.param.{ Params, Param, ParamMap, DoubleParam } -import org.apache.spark.ml.param.shared._ -import org.apache.spark.SparkConf -import org.apache.spark.mllib.linalg.Vectors -import org.apache.spark.mllib.regression.LabeledPoint -import scala.reflect.ClassTag -import scala.collection.immutable.HashMap +import org.apache.sysml.runtime.matrix.MatrixCharacteristics import org.apache.sysml.runtime.matrix.data.MatrixBlock import org.apache.sysml.runtime.DMLRuntimeException +import org.apache.sysml.runtime.instructions.spark.utils.{ RDDConverterUtilsExt => RDDConverterUtils } +import org.apache.sysml.api.mlcontext._ +import org.apache.sysml.api.mlcontext.ScriptFactory._ -trait HasIcpt extends Params { - final val icpt: Param[Int] = new Param[Int](this, "icpt", "Intercept presence, shifting and rescaling X columns") - setDefault(icpt, 0) - final def getIcpt: Int = $(icpt) -} -trait HasMaxOuterIter extends Params { - final val maxOuterIter: Param[Int] = new Param[Int](this, "maxOuterIter", "max. number of outer (Newton) iterations") - setDefault(maxOuterIter, 100) - final def getMaxOuterIte: Int = $(maxOuterIter) -} -trait HasMaxInnerIter extends Params { - final val maxInnerIter: Param[Int] = new Param[Int](this, "maxInnerIter", "max. 
number of inner (conjugate gradient) iterations, 0 = no max") - setDefault(maxInnerIter, 0) - final def getMaxInnerIter: Int = $(maxInnerIter) -} -trait HasTol extends Params { - final val tol: DoubleParam = new DoubleParam(this, "tol", "the convergence tolerance for iterative algorithms") - setDefault(tol, 0.000001) - final def getTol: Double = $(tol) -} -trait HasRegParam extends Params { - final val regParam: DoubleParam = new DoubleParam(this, "tol", "the convergence tolerance for iterative algorithms") - setDefault(regParam, 0.000001) - final def getRegParam: Double = $(regParam) -} object LogisticRegression { final val scriptPath = "scripts" + File.separator + "algorithms" + File.separator + "MultiLogReg.dml" } @@ -71,7 +41,7 @@ object LogisticRegression { * Logistic Regression Scala API */ class LogisticRegression(override val uid: String, val sc: SparkContext) extends Estimator[LogisticRegressionModel] with HasIcpt - with HasRegParam with HasTol with HasMaxOuterIter with HasMaxInnerIter { + with HasRegParam with HasTol with HasMaxOuterIter with HasMaxInnerIter with BaseSystemMLClassifier { def setIcpt(value: Int) = set(icpt, value) def setMaxOuterIter(value: Int) = set(maxOuterIter, value) @@ -83,48 +53,31 @@ class LogisticRegression(override val uid: String, val sc: SparkContext) extends val that = new LogisticRegression(uid, sc) copyValues(that, extra) } - override def transformSchema(schema: StructType): StructType = schema // Note: will update the y_mb as this will be called by Python mllearn def fit(X_mb: MatrixBlock, y_mb: MatrixBlock): LogisticRegressionModel = { - val ml = new MLContext(sc) - val revLabelMapping = new java.util.HashMap[Int, String] - PredictionUtils.fillLabelMapping(y_mb, revLabelMapping) - - val mloutput = { - ml.registerInput("X", X_mb); - ml.registerInput("Y_vec", y_mb); - ml.registerOutput("B_out"); - ml.executeScript(ScriptsUtils.getDMLScript(LogisticRegression.scriptPath), getParamMap()) - } - new LogisticRegressionModel("logisticRegression")(mloutput, revLabelMapping, sc) + val ret = fit(X_mb, y_mb, sc) + new LogisticRegressionModel("log")(ret._1, ret._2, sc) } - def getParamMap():Map[String, String] = { - Map( - "icpt" -> this.getIcpt.toString(), - "reg" -> this.getRegParam.toString(), - "tol" -> this.getTol.toString, - "moi" -> this.getMaxOuterIte.toString, - "mii" -> this.getMaxInnerIter.toString, - - "X" -> " ", - "Y" -> " ", - "B" -> " ") + def fit(df: DataFrame): LogisticRegressionModel = { + val ret = fit(df, sc) + new LogisticRegressionModel("log")(ret._1, ret._2, sc) } - override def fit(df: DataFrame): LogisticRegressionModel = { - val ml = new MLContext(df.rdd.sparkContext) - val mcXin = new MatrixCharacteristics() - val Xin = RDDConverterUtils.vectorDataFrameToBinaryBlock(sc, df, mcXin, false, "features") - val revLabelMapping = new java.util.HashMap[Int, String] - val yin = PredictionUtils.fillLabelMapping(df, revLabelMapping) - val mloutput = { - ml.registerInput("X", Xin, mcXin); - ml.registerInput("Y_vec", yin, "csv"); - ml.registerOutput("B_out"); - ml.executeScript(ScriptsUtils.getDMLScript(LogisticRegression.scriptPath), getParamMap()) - } - new LogisticRegressionModel("logisticRegression")(mloutput, revLabelMapping, sc) + + + def getTrainingScript(isSingleNode:Boolean):(Script, String, String) = { + val script = dml(ScriptsUtils.getDMLScript(LogisticRegression.scriptPath)) + .in("$X", " ") + .in("$Y", " ") + .in("$B", " ") + .in("$icpt", toDouble(getIcpt)) + .in("$reg", toDouble(getRegParam)) + .in("$tol", toDouble(getTol)) + 
.in("$moi", toDouble(getMaxOuterIte)) + .in("$mii", toDouble(getMaxInnerIter)) + .out("B_out") + (script, "X", "Y_vec") } } object LogisticRegressionModel { @@ -135,55 +88,22 @@ object LogisticRegressionModel { * Logistic Regression Scala API */ -class LogisticRegressionModel( - override val uid: String)( - val mloutput: MLOutput, val labelMapping: java.util.HashMap[Int, String], val sc: SparkContext) extends Model[LogisticRegressionModel] with HasIcpt - with HasRegParam with HasTol with HasMaxOuterIter with HasMaxInnerIter { +class LogisticRegressionModel(override val uid: String)( + val mloutput: MLResults, val labelMapping: java.util.HashMap[Int, String], val sc: SparkContext) + extends Model[LogisticRegressionModel] with HasIcpt + with HasRegParam with HasTol with HasMaxOuterIter with HasMaxInnerIter with BaseSystemMLClassifierModel { override def copy(extra: ParamMap): LogisticRegressionModel = { val that = new LogisticRegressionModel(uid)(mloutput, labelMapping, sc) copyValues(that, extra) } var outputRawPredictions = true def setOutputRawPredictions(outRawPred:Boolean): Unit = { outputRawPredictions = outRawPred } - override def transformSchema(schema: StructType): StructType = schema - - def transform(X: MatrixBlock): MatrixBlock = { - if(outputRawPredictions) { - throw new RuntimeException("Outputting raw prediction is not supported") - } - else { - val isSingleNode = true - val ret = PredictionUtils.computePredictedClassLabelsFromProbability( - PredictionUtils.doGLMPredict(isSingleNode, null, X, sc, mloutput, "B_out", getPredictParams), - isSingleNode, sc, "means").getMatrixBlock("Prediction"); - if(ret.getNumColumns != 1) { - throw new RuntimeException("Expected predicted label to be a column vector") - } - PredictionUtils.updateLabels(isSingleNode, null, ret, null, labelMapping) - return ret - } - } - - override def transform(df: DataFrame): DataFrame = { - val isSingleNode = false - val glmPredOut = PredictionUtils.doGLMPredict(isSingleNode, df, null, sc, mloutput, "B_out", getPredictParams()) - val predLabelOut = PredictionUtils.computePredictedClassLabelsFromProbability(glmPredOut, isSingleNode, sc, "means") - val predictedDF = PredictionUtils.updateLabels(isSingleNode, predLabelOut.getDF(df.sqlContext, "Prediction"), null, "C1", labelMapping).select("ID", "prediction") - val prob = glmPredOut.getDF(df.sqlContext, "means", true).withColumnRenamed("C1", "probability").select("ID", "probability") - val dataset = RDDConverterUtils.addIDToDataFrame(df, df.sqlContext, "ID") - - if(outputRawPredictions) { - // Not supported: rawPred = 1 / (1 + exp(- X * t(B_full)) ); - } - return PredictionUtils.joinUsingID(dataset, PredictionUtils.joinUsingID(prob, predictedDF)) - } - - def getPredictParams(): Map[String, String] = { - Map("X" -> " ", - "B" -> " ", - "dfam" -> "3") - } + def getPredictionScript(mloutput: MLResults, isSingleNode:Boolean): (Script, String) = + PredictionUtils.getGLMPredictionScript(mloutput.getBinaryBlockMatrix("B_out"), isSingleNode) + + def transform(X: MatrixBlock): MatrixBlock = transform(X, mloutput, labelMapping, sc, "means") + def transform(df: DataFrame): DataFrame = transform(df, mloutput, labelMapping, sc, "means") } /** @@ -210,7 +130,7 @@ object LogisticRegressionExample { LabeledPoint(1.0, Vectors.dense(1.0, 0.0, 2.3)))) val lr = new LogisticRegression("log", sc) val lrmodel = lr.fit(training.toDF) - lrmodel.mloutput.getDF(sqlContext, "B_out").show() + // lrmodel.mloutput.getDF(sqlContext, "B_out").show() val testing = sc.parallelize(Seq( 
LabeledPoint(1.0, Vectors.dense(1.0, 0.0, 3.0)), diff --git a/src/main/scala/org/apache/sysml/api/ml/NaiveBayes.scala b/src/main/scala/org/apache/sysml/api/ml/NaiveBayes.scala index a6fc367f41d..28836221582 100644 --- a/src/main/scala/org/apache/sysml/api/ml/NaiveBayes.scala +++ b/src/main/scala/org/apache/sysml/api/ml/NaiveBayes.scala @@ -19,75 +19,52 @@ package org.apache.sysml.api.ml +import org.apache.spark.rdd.RDD import java.io.File import org.apache.spark.SparkContext import org.apache.spark.ml.{ Model, Estimator } import org.apache.spark.sql.DataFrame import org.apache.spark.sql.types.StructType import org.apache.spark.ml.param.{ Params, Param, ParamMap, DoubleParam } -import org.apache.sysml.api.{ MLContext, MLOutput } import org.apache.sysml.runtime.matrix.MatrixCharacteristics import org.apache.sysml.runtime.matrix.data.MatrixBlock import org.apache.sysml.runtime.DMLRuntimeException import org.apache.sysml.runtime.instructions.spark.utils.{ RDDConverterUtilsExt => RDDConverterUtils } - -trait HasLaplace extends Params { - final val laplace: Param[Double] = new Param[Double](this, "laplace", "Laplace smoothing specified by the user to avoid creation of 0 probabilities.") - setDefault(laplace, 1.0) - final def getLaplace: Double = $(laplace) -} +import org.apache.sysml.api.mlcontext._ +import org.apache.sysml.api.mlcontext.ScriptFactory._ object NaiveBayes { final val scriptPath = "scripts" + File.separator + "algorithms" + File.separator + "naive-bayes.dml" } -class NaiveBayes(override val uid: String, val sc: SparkContext) extends Estimator[NaiveBayesModel] with HasLaplace { +class NaiveBayes(override val uid: String, val sc: SparkContext) extends Estimator[NaiveBayesModel] with HasLaplace with BaseSystemMLClassifier { override def copy(extra: ParamMap): Estimator[NaiveBayesModel] = { val that = new NaiveBayes(uid, sc) copyValues(that, extra) } def setLaplace(value: Double) = set(laplace, value) - override def transformSchema(schema: StructType): StructType = schema // Note: will update the y_mb as this will be called by Python mllearn def fit(X_mb: MatrixBlock, y_mb: MatrixBlock): NaiveBayesModel = { - val ml = new MLContext(sc) - val revLabelMapping = new java.util.HashMap[Int, String] - PredictionUtils.fillLabelMapping(y_mb, revLabelMapping) - - val mloutput = { - ml.registerInput("D", X_mb); - ml.registerInput("C", y_mb); - ml.registerOutput("classPrior"); - ml.registerOutput("classConditionals"); - ml.executeScript(ScriptsUtils.getDMLScript(NaiveBayes.scriptPath), getParamMap()) - } - new NaiveBayesModel("naivebayes")(mloutput, revLabelMapping, sc) + val ret = fit(X_mb, y_mb, sc) + new NaiveBayesModel("naive")(ret._1, ret._2, sc) } def fit(df: DataFrame): NaiveBayesModel = { - val ml = new MLContext(df.rdd.sparkContext) - val mcXin = new MatrixCharacteristics() - val Xin = RDDConverterUtils.vectorDataFrameToBinaryBlock(sc, df, mcXin, false, "features") - val revLabelMapping = new java.util.HashMap[Int, String] - val yin = PredictionUtils.fillLabelMapping(df, revLabelMapping) - val mloutput = { - ml.registerInput("D", Xin, mcXin); - ml.registerInput("C", yin, "csv"); - ml.registerOutput("classPrior"); - ml.registerOutput("classConditionals"); - ml.executeScript(ScriptsUtils.getDMLScript(NaiveBayes.scriptPath), getParamMap()) - } - new NaiveBayesModel("naive")(mloutput, revLabelMapping, sc) + val ret = fit(df, sc) + new NaiveBayesModel("naive")(ret._1, ret._2, sc) } - def getParamMap(): Map[String, String] = { - Map("X" -> " ", - "Y" -> " ", - "prior" -> " ", - "conditionals" 
-> " ", - "accuracy" -> " ", - "laplace" -> getLaplace.toString()) + def getTrainingScript(isSingleNode:Boolean):(Script, String, String) = { + val script = dml(ScriptsUtils.getDMLScript(NaiveBayes.scriptPath)) + .in("$X", " ") + .in("$Y", " ") + .in("$prior", " ") + .in("$conditionals", " ") + .in("$accuracy", " ") + .in("$laplace", toDouble(getLaplace)) + .out("classPrior", "classConditionals") + (script, "D", "C") } } @@ -96,61 +73,37 @@ object NaiveBayesModel { final val scriptPath = "scripts" + File.separator + "algorithms" + File.separator + "naive-bayes-predict.dml" } -class NaiveBayesModel( - override val uid: String)( - val mloutput: MLOutput, val labelMapping: java.util.HashMap[Int, String], val sc: SparkContext) extends Model[NaiveBayesModel] with HasLaplace { +class NaiveBayesModel(override val uid: String) + (val mloutput: MLResults, val labelMapping: java.util.HashMap[Int, String], val sc: SparkContext) + extends Model[NaiveBayesModel] with HasLaplace with BaseSystemMLClassifierModel { + override def copy(extra: ParamMap): NaiveBayesModel = { val that = new NaiveBayesModel(uid)(mloutput, labelMapping, sc) copyValues(that, extra) } - def transformSchema(schema: StructType): StructType = schema - - var priorMB: MatrixBlock = null - var conditionalMB: MatrixBlock = null - def setPriorAndConditional(prior:MatrixBlock, conditional:MatrixBlock) { - priorMB = prior - conditionalMB = conditional - } - - def transform(X: MatrixBlock): MatrixBlock = { - val isSingleNode = true - val ml = new MLContext(sc) - ml.registerInput("D", X) - ml.registerInput("prior", mloutput.getMatrixBlock("classPrior"), mloutput.getMatrixCharacteristics("classPrior")) - ml.registerInput("conditionals", mloutput.getMatrixBlock("classConditionals"), mloutput.getMatrixCharacteristics("classConditionals")) - ml.registerOutput("probs") - val nbPredict = ml.executeScript(ScriptsUtils.getDMLScript(NaiveBayesModel.scriptPath), getPredictParams()) - val ret = PredictionUtils.computePredictedClassLabelsFromProbability(nbPredict, isSingleNode, sc, "probs").getMatrixBlock("Prediction"); - if(ret.getNumColumns != 1) { - throw new RuntimeException("Expected predicted label to be a column vector") + def getPredictionScript(mloutput: MLResults, isSingleNode:Boolean): (Script, String) = { + val script = dml(ScriptsUtils.getDMLScript(NaiveBayesModel.scriptPath)) + .in("$X", " ") + .in("$prior", " ") + .in("$conditionals", " ") + .in("$probabilities", " ") + .out("probs") + + val classPrior = mloutput.getBinaryBlockMatrix("classPrior") + val classConditionals = mloutput.getBinaryBlockMatrix("classConditionals") + val ret = if(isSingleNode) { + script.in("prior", classPrior.getMatrixBlock, classPrior.getMatrixMetadata) + .in("conditionals", classConditionals.getMatrixBlock, classConditionals.getMatrixMetadata) + } + else { + script.in("prior", classPrior.getBinaryBlocks, classPrior.getMatrixMetadata) + .in("conditionals", classConditionals.getBinaryBlocks, classConditionals.getMatrixMetadata) } - PredictionUtils.updateLabels(isSingleNode, null, ret, null, labelMapping) - return ret + (ret, "D") } - def transform(df: org.apache.spark.sql.DataFrame): org.apache.spark.sql.DataFrame = { - val isSingleNode = false - val ml = new MLContext(sc) - val mcXin = new MatrixCharacteristics() - val Xin = RDDConverterUtils.vectorDataFrameToBinaryBlock(df.rdd.sparkContext, df, mcXin, false, "features") - ml.registerInput("D", Xin, mcXin); - ml.registerInput("prior", mloutput.getMatrixBlock("classPrior"), 
mloutput.getMatrixCharacteristics("classPrior")) - ml.registerInput("conditionals", mloutput.getMatrixBlock("classConditionals"), mloutput.getMatrixCharacteristics("classConditionals")) - ml.registerOutput("probs") - val nbPredict = ml.executeScript(ScriptsUtils.getDMLScript(NaiveBayesModel.scriptPath), getPredictParams()) - val predLabelOut = PredictionUtils.computePredictedClassLabelsFromProbability(nbPredict, isSingleNode, sc, "probs") - val predictedDF = PredictionUtils.updateLabels(isSingleNode, predLabelOut.getDF(df.sqlContext, "Prediction"), null, "C1", labelMapping).select("ID", "prediction") - val prob = nbPredict.getDF(df.sqlContext, "probs", true).withColumnRenamed("C1", "probability").select("ID", "probability") - val dataset = RDDConverterUtils.addIDToDataFrame(df, df.sqlContext, "ID") - return PredictionUtils.joinUsingID(dataset, PredictionUtils.joinUsingID(prob, predictedDF)) - } + def transform(X: MatrixBlock): MatrixBlock = transform(X, mloutput, labelMapping, sc, "probs") + def transform(df: DataFrame): DataFrame = transform(df, mloutput, labelMapping, sc, "probs") - def getPredictParams(): Map[String, String] = { - Map("X" -> " ", - "prior" -> " ", - "conditionals" -> " ", - "probabilities" -> " ") - } - } \ No newline at end of file diff --git a/src/main/scala/org/apache/sysml/api/ml/PredictionUtils.scala b/src/main/scala/org/apache/sysml/api/ml/PredictionUtils.scala index 13494eedaf8..f91a82cadc2 100644 --- a/src/main/scala/org/apache/sysml/api/ml/PredictionUtils.scala +++ b/src/main/scala/org/apache/sysml/api/ml/PredictionUtils.scala @@ -28,9 +28,28 @@ import org.apache.sysml.runtime.matrix.data.MatrixBlock import org.apache.sysml.runtime.DMLRuntimeException import org.apache.sysml.runtime.matrix.MatrixCharacteristics import org.apache.sysml.runtime.instructions.spark.utils.{ RDDConverterUtilsExt => RDDConverterUtils } +import org.apache.sysml.api.mlcontext.MLResults +import org.apache.sysml.api.mlcontext.ScriptFactory._ +import org.apache.sysml.api.mlcontext.Script +import org.apache.sysml.api.mlcontext.BinaryBlockMatrix object PredictionUtils { + def getGLMPredictionScript(B_full: BinaryBlockMatrix, isSingleNode:Boolean): (Script, String) = { + val script = dml(ScriptsUtils.getDMLScript(LogisticRegressionModel.scriptPath)) + .in("$X", " ") + .in("$B", " ") + .in("$dfam", "3") + .out("means") + val ret = if(isSingleNode) { + script.in("B_full", B_full.getMatrixBlock, B_full.getMatrixMetadata) + } + else { + script.in("B_full", B_full) + } + (ret, "X") + } + def doGLMPredict(isSingleNode:Boolean, df:DataFrame, X: MatrixBlock, sc:SparkContext, mloutput:MLOutput, B:String, paramsMap: Map[String, String]): MLOutput = { val ml = new MLContext(sc) if(isSingleNode) { @@ -149,4 +168,21 @@ object PredictionUtils { write(Prediction, "tempOut", "csv"); """) } + + def computePredictedClassLabelsFromProbability(mlscoreoutput:MLResults, isSingleNode:Boolean, sc:SparkContext, inProbVar:String): MLResults = { + val ml = new org.apache.sysml.api.mlcontext.MLContext(sc) + val script = dml( + """ + Prob = read("temp1"); + Prediction = rowIndexMax(Prob); # assuming one-based label mapping + write(Prediction, "tempOut", "csv"); + """).out("Prediction") + val probVar = mlscoreoutput.getBinaryBlockMatrix(inProbVar) + if(isSingleNode) { + ml.execute(script.in("Prob", probVar.getMatrixBlock, probVar.getMatrixMetadata)) + } + else { + ml.execute(script.in("Prob", probVar)) + } + } } \ No newline at end of file diff --git a/src/main/scala/org/apache/sysml/api/ml/SVM.scala 
b/src/main/scala/org/apache/sysml/api/ml/SVM.scala index 7a48c1ded13..93e91ec4660 100644 --- a/src/main/scala/org/apache/sysml/api/ml/SVM.scala +++ b/src/main/scala/org/apache/sysml/api/ml/SVM.scala @@ -19,17 +19,19 @@ package org.apache.sysml.api.ml +import org.apache.spark.rdd.RDD import java.io.File import org.apache.spark.SparkContext import org.apache.spark.ml.{ Model, Estimator } import org.apache.spark.sql.DataFrame import org.apache.spark.sql.types.StructType -import org.apache.spark.ml.param.ParamMap -import org.apache.sysml.api.{ MLContext, MLOutput } +import org.apache.spark.ml.param.{ Params, Param, ParamMap, DoubleParam } import org.apache.sysml.runtime.matrix.MatrixCharacteristics import org.apache.sysml.runtime.matrix.data.MatrixBlock import org.apache.sysml.runtime.DMLRuntimeException import org.apache.sysml.runtime.instructions.spark.utils.{ RDDConverterUtilsExt => RDDConverterUtils } +import org.apache.sysml.api.mlcontext._ +import org.apache.sysml.api.mlcontext.ScriptFactory._ object SVM { final val scriptPathBinary = "scripts" + File.separator + "algorithms" + File.separator + "l2-svm.dml" @@ -37,71 +39,41 @@ object SVM { } class SVM (override val uid: String, val sc: SparkContext, val isMultiClass:Boolean=false) extends Estimator[SVMModel] with HasIcpt - with HasRegParam with HasTol with HasMaxOuterIter { + with HasRegParam with HasTol with HasMaxOuterIter with BaseSystemMLClassifier { def setIcpt(value: Int) = set(icpt, value) def setMaxIter(value: Int) = set(maxOuterIter, value) def setRegParam(value: Double) = set(regParam, value) def setTol(value: Double) = set(tol, value) - def setModelParams(m:SVMModel):SVMModel = { - m.setIcpt(this.getIcpt).setMaxIter(this.getMaxOuterIte).setRegParam(this.getRegParam).setTol(this.getTol) - } - override def copy(extra: ParamMap): Estimator[SVMModel] = { val that = new SVM(uid, sc, isMultiClass) copyValues(that, extra) } - def transformSchema(schema: StructType): StructType = schema - def fit(X_mb: MatrixBlock, y_mb: MatrixBlock): SVMModel = { - val ml = new MLContext(sc) - val revLabelMapping = new java.util.HashMap[Int, String] - PredictionUtils.fillLabelMapping(y_mb, revLabelMapping) - if(y_mb.getNumColumns != 1) { - throw new RuntimeException("Expected a column vector for y") - } - val mloutput = { - ml.registerInput("X", X_mb); - ml.registerInput("Y", y_mb); - ml.registerOutput("w"); - if(isMultiClass) - ml.executeScript(ScriptsUtils.getDMLScript(SVM.scriptPathMulticlass), getParamMap()) - else { - ml.executeScript(ScriptsUtils.getDMLScript(SVM.scriptPathBinary), getParamMap()) - } - } - setModelParams(new SVMModel("svm")(mloutput, sc, isMultiClass, revLabelMapping)) + def getTrainingScript(isSingleNode:Boolean):(Script, String, String) = { + val script = dml(ScriptsUtils.getDMLScript(if(isMultiClass) SVM.scriptPathMulticlass else SVM.scriptPathBinary)) + .in("$X", " ") + .in("$Y", " ") + .in("$model", " ") + .in("$Log", " ") + .in("$icpt", toDouble(getIcpt)) + .in("$reg", toDouble(getRegParam)) + .in("$tol", toDouble(getTol)) + .in("$maxiter", toDouble(getMaxOuterIte)) + .out("w") + (script, "X", "Y") } - def getParamMap(): Map[String, String] = { - Map( "icpt" -> this.getIcpt.toString(), - "reg" -> this.getRegParam.toString(), - "tol" -> this.getTol.toString, - "maxiter" -> this.getMaxOuterIte.toString, - "X" -> " ", - "Y" -> " ", - "model" -> " ", - "Log" -> " ") + // Note: will update the y_mb as this will be called by Python mllearn + def fit(X_mb: MatrixBlock, y_mb: MatrixBlock): SVMModel = { + val ret = fit(X_mb, y_mb, 
sc) + new SVMModel("svm")(ret._1, sc, isMultiClass, ret._2) } def fit(df: DataFrame): SVMModel = { - val ml = new MLContext(df.rdd.sparkContext) - val mcXin = new MatrixCharacteristics() - val Xin = RDDConverterUtils.vectorDataFrameToBinaryBlock(sc, df, mcXin, false, "features") - val revLabelMapping = new java.util.HashMap[Int, String] - val yin = PredictionUtils.fillLabelMapping(df, revLabelMapping) - val mloutput = { - ml.registerInput("X", Xin, mcXin); - ml.registerInput("Y", yin, "csv"); - ml.registerOutput("w"); - if(isMultiClass) - ml.executeScript(ScriptsUtils.getDMLScript(SVM.scriptPathMulticlass), getParamMap()) - else { - ml.executeScript(ScriptsUtils.getDMLScript(SVM.scriptPathBinary), getParamMap()) - } - } - setModelParams(new SVMModel("svm")(mloutput, sc, isMultiClass, revLabelMapping)) + val ret = fit(df, sc) + new SVMModel("svm")(ret._1, sc, isMultiClass, ret._2) } } @@ -111,77 +83,31 @@ object SVMModel { final val predictionScriptPathMulticlass = "scripts" + File.separator + "algorithms" + File.separator + "m-svm-predict.dml" } -class SVMModel (override val uid: String)(val mloutput: MLOutput, val sc: SparkContext, val isMultiClass:Boolean, val labelMapping: java.util.HashMap[Int, String]) extends Model[SVMModel] with HasIcpt - with HasRegParam with HasTol with HasMaxOuterIter { +class SVMModel (override val uid: String)(val mloutput: MLResults, val sc: SparkContext, val isMultiClass:Boolean, + val labelMapping: java.util.HashMap[Int, String]) extends Model[SVMModel] with BaseSystemMLClassifierModel { override def copy(extra: ParamMap): SVMModel = { val that = new SVMModel(uid)(mloutput, sc, isMultiClass, labelMapping) copyValues(that, extra) } - def setIcpt(value: Int) = set(icpt, value) - def setMaxIter(value: Int) = set(maxOuterIter, value) - def setRegParam(value: Double) = set(regParam, value) - def setTol(value: Double) = set(tol, value) - - override def transformSchema(schema: StructType): StructType = schema - - def transform(df: DataFrame): DataFrame = { - val ml = new MLContext(sc) - val mcXin = new MatrixCharacteristics() - val Xin = RDDConverterUtils.vectorDataFrameToBinaryBlock(df.rdd.sparkContext, df, mcXin, false, "features") - ml.registerInput("X", Xin, mcXin); - ml.registerOutput("scores"); - val glmPredOut = { - if(isMultiClass) { - ml.registerInput("W", mloutput.getBinaryBlockedRDD("w"), mloutput.getMatrixCharacteristics("w")); - ml.executeScript(ScriptsUtils.getDMLScript(SVMModel.predictionScriptPathMulticlass), getPredictParams()) - } - else { - ml.registerInput("w", mloutput.getBinaryBlockedRDD("w"), mloutput.getMatrixCharacteristics("w")); - ml.executeScript(ScriptsUtils.getDMLScript(SVMModel.predictionScriptPathBinary), getPredictParams()) - } + def getPredictionScript(mloutput: MLResults, isSingleNode:Boolean): (Script, String) = { + val script = dml(ScriptsUtils.getDMLScript(if(isMultiClass) SVMModel.predictionScriptPathMulticlass else SVMModel.predictionScriptPathBinary)) + .in("$X", " ") + .in("$model", " ") + .out("scores") + + val w = mloutput.getBinaryBlockMatrix("w") + val wVar = if(isMultiClass) "W" else "w" + + val ret = if(isSingleNode) { + script.in(wVar, w.getMatrixBlock, w.getMatrixMetadata) } - val isSingleNode = false - val predLabelOut = PredictionUtils.computePredictedClassLabelsFromProbability(glmPredOut, isSingleNode, sc, "scores") - val predictedDF = PredictionUtils.updateLabels(isSingleNode, predLabelOut.getDF(df.sqlContext, "Prediction"), null, "C1", labelMapping).select("ID", "prediction") - val dataset = 
RDDConverterUtils.addIDToDataFrame(df, df.sqlContext, "ID") - return PredictionUtils.joinUsingID(dataset, predictedDF) - } - - def transform(X: MatrixBlock): MatrixBlock = { - val ml = new MLContext(sc) - ml.registerInput("X", X); - ml.registerInput("w", mloutput.getMatrixBlock("w"), mloutput.getMatrixCharacteristics("w")); - ml.registerOutput("scores"); - val glmPredOut = { - if(isMultiClass) { - ml.registerInput("W", mloutput.getMatrixBlock("w"), mloutput.getMatrixCharacteristics("w")); - ml.executeScript(ScriptsUtils.getDMLScript(SVMModel.predictionScriptPathMulticlass), getPredictParams()) - } - else { - ml.registerInput("w", mloutput.getMatrixBlock("w"), mloutput.getMatrixCharacteristics("w")); - ml.executeScript(ScriptsUtils.getDMLScript(SVMModel.predictionScriptPathBinary), getPredictParams()) - } + else { + script.in(wVar, w) } - val isSingleNode = true - val ret = PredictionUtils.computePredictedClassLabelsFromProbability(glmPredOut, isSingleNode, sc, "scores").getMatrixBlock("Prediction"); - if(ret.getNumColumns != 1) { - throw new RuntimeException("Expected predicted label to be a column vector") - } - PredictionUtils.updateLabels(true, null, ret, null, labelMapping) - return ret + (ret, "X") } - - def getPredictParams(): Map[String, String] = { - Map( "icpt" -> this.getIcpt.toString(), - "reg" -> this.getRegParam.toString(), - "tol" -> this.getTol.toString, - "maxiter" -> this.getMaxOuterIte.toString, - "X" -> " ", - "Y" -> " ", - "model" -> " ", - "Log" -> " ") - } - + def transform(X: MatrixBlock): MatrixBlock = transform(X, mloutput, labelMapping, sc, "scores") + def transform(df: DataFrame): DataFrame = transform(df, mloutput, labelMapping, sc, "scores") } \ No newline at end of file From 3a2a4cfb8a8f7e3b254e7ef7bfebe72c30013b0e Mon Sep 17 00:00:00 2001 From: Niketan Pansare Date: Tue, 9 Aug 2016 13:16:07 -0700 Subject: [PATCH 14/14] Modified Linear Regression to support new MLContext and added support for Spark 2.0 --- .../org/apache/sysml/api/python/SystemML.py | 34 +++-- .../sysml/api/ml/BaseSystemMLClassifier.scala | 71 ++++++----- .../sysml/api/ml/BaseSystemMLRegressor.scala | 86 +++++++++++++ .../sysml/api/ml/LinearRegression.scala | 118 ++++++------------ .../sysml/api/ml/LogisticRegression.scala | 6 +- .../org/apache/sysml/api/ml/NaiveBayes.scala | 4 +- .../apache/sysml/api/ml/PredictionUtils.scala | 40 +----- .../scala/org/apache/sysml/api/ml/SVM.scala | 4 +- .../apache/sysml/api/ml/ScriptsUtils.scala | 2 + 9 files changed, 195 insertions(+), 170 deletions(-) create mode 100644 src/main/scala/org/apache/sysml/api/ml/BaseSystemMLRegressor.scala diff --git a/src/main/java/org/apache/sysml/api/python/SystemML.py b/src/main/java/org/apache/sysml/api/python/SystemML.py index c03bd1bab49..689403ea883 100644 --- a/src/main/java/org/apache/sysml/api/python/SystemML.py +++ b/src/main/java/org/apache/sysml/api/python/SystemML.py @@ -321,13 +321,15 @@ class mllearn: class BaseSystemMLEstimator(Estimator): # TODO: Allow users to set featuresCol (with default 'features') and labelCol (with default 'label') + # Returns a model after calling fit(df) on Estimator object on JVM def _fit(self, X): if hasattr(X, '_jdf') and 'features' in X.columns and 'label' in X.columns: self.model = self.estimator.fit(X._jdf) return self else: raise Exception('Incorrect usage: Expected dataframe as input with features/label as columns') - + + # Returns a model after calling fit(X:MatrixBlock, y:MatrixBlock) on Estimator object on JVM def fit(self, X, y=None, params=None): if y is None: return 
self._fit(X) @@ -356,7 +358,8 @@ def fit(self, X, y=None, params=None): def transform(self, X): return self.predict(X) - + + # Returns either a DataFrame or MatrixBlock after calling transform(X:MatrixBlock, y:MatrixBlock) on Model object on JVM def predict(self, X): if isinstance(X, SUPPORTED_TYPES): if self.transferUsingDF: @@ -389,12 +392,23 @@ def predict(self, X): else: raise Exception('Unsupported input type') + class BaseSystemMLClassifier(BaseSystemMLEstimator): + + # Scores the predicted value with ground truth 'y' def score(self, X, y): return metrics.accuracy_score(y, self.predict(X)) + class BaseSystemMLRegressor(BaseSystemMLEstimator): + + # Scores the predicted value with ground truth 'y' + def score(self, X, y): + return metrics.r2_score(y, self.predict(X), multioutput='variance_weighted') + + # Or we can create new Python project with package structure - class LogisticRegression(BaseSystemMLEstimator): + class LogisticRegression(BaseSystemMLClassifier): + # See https://apache.github.io/incubator-systemml/algorithms-reference for usage def __init__(self, sqlCtx, penalty='l2', fit_intercept=True, max_iter=100, max_inner_iter=0, tol=0.000001, C=1.0, solver='newton-cg', transferUsingDF=False): self.sqlCtx = sqlCtx self.sc = sqlCtx._sc @@ -415,8 +429,9 @@ def __init__(self, sqlCtx, penalty='l2', fit_intercept=True, max_iter=100, max_i if solver != 'newton-cg': raise Exception('Only newton-cg solver supported') - class LinearRegression(BaseSystemMLEstimator): + class LinearRegression(BaseSystemMLRegressor): + # See https://apache.github.io/incubator-systemml/algorithms-reference for usage def __init__(self, sqlCtx, fit_intercept=True, max_iter=100, tol=0.000001, C=1.0, solver='newton-cg', transferUsingDF=False): self.sqlCtx = sqlCtx self.sc = sqlCtx._sc @@ -435,12 +450,10 @@ def __init__(self, sqlCtx, fit_intercept=True, max_iter=100, tol=0.000001, C=1.0 self.transferUsingDF = transferUsingDF self.setOutputRawPredictionsToFalse = False - def score(self, X, y): - return metrics.r2_score(y, self.predict(X), multioutput='variance_weighted') - - class SVM(BaseSystemMLEstimator): + class SVM(BaseSystemMLClassifier): + # See https://apache.github.io/incubator-systemml/algorithms-reference for usage def __init__(self, sqlCtx, fit_intercept=True, max_iter=100, tol=0.000001, C=1.0, is_multi_class=False, transferUsingDF=False): self.sqlCtx = sqlCtx self.sc = sqlCtx._sc @@ -456,8 +469,9 @@ def __init__(self, sqlCtx, fit_intercept=True, max_iter=100, tol=0.000001, C=1.0 self.transferUsingDF = transferUsingDF self.setOutputRawPredictionsToFalse = False - class NaiveBayes(BaseSystemMLEstimator): + class NaiveBayes(BaseSystemMLClassifier): + # See https://apache.github.io/incubator-systemml/algorithms-reference for usage def __init__(self, sqlCtx, laplace=1.0, transferUsingDF=False): self.sqlCtx = sqlCtx self.sc = sqlCtx._sc @@ -465,4 +479,4 @@ def __init__(self, sqlCtx, laplace=1.0, transferUsingDF=False): self.estimator = self.sc._jvm.org.apache.sysml.api.ml.NaiveBayes(self.uid, self.sc._jsc.sc()) self.estimator.setLaplace(laplace) self.transferUsingDF = transferUsingDF - self.setOutputRawPredictionsToFalse = False \ No newline at end of file + self.setOutputRawPredictionsToFalse = False \ No newline at end of file diff --git a/src/main/scala/org/apache/sysml/api/ml/BaseSystemMLClassifier.scala b/src/main/scala/org/apache/sysml/api/ml/BaseSystemMLClassifier.scala index 5174aabdb72..98def7c21bd 100644 --- a/src/main/scala/org/apache/sysml/api/ml/BaseSystemMLClassifier.scala +++ 
b/src/main/scala/org/apache/sysml/api/ml/BaseSystemMLClassifier.scala @@ -23,7 +23,6 @@ import org.apache.spark.rdd.RDD import java.io.File import org.apache.spark.SparkContext import org.apache.spark.ml.{ Model, Estimator } -import org.apache.spark.sql.DataFrame import org.apache.spark.sql.types.StructType import org.apache.spark.ml.param.{ Params, Param, ParamMap, DoubleParam } import org.apache.sysml.runtime.matrix.MatrixCharacteristics @@ -32,6 +31,7 @@ import org.apache.sysml.runtime.DMLRuntimeException import org.apache.sysml.runtime.instructions.spark.utils.{ RDDConverterUtilsExt => RDDConverterUtils } import org.apache.sysml.api.mlcontext._ import org.apache.sysml.api.mlcontext.ScriptFactory._ +import org.apache.spark.sql._ trait HasLaplace extends Params { final val laplace: Param[Double] = new Param[Double](this, "laplace", "Laplace smoothing specified by the user to avoid creation of 0 probabilities.") @@ -64,16 +64,41 @@ trait HasRegParam extends Params { final def getRegParam: Double = $(regParam) } - -trait BaseSystemMLClassifier { +trait BaseSystemMLEstimator { + def transformSchema(schema: StructType): StructType = schema // Returns the script and variables for X and y def getTrainingScript(isSingleNode:Boolean):(Script, String, String) + def toDouble(i:Int): java.lang.Double = { + double2Double(i.toDouble) + } + + def toDouble(d:Double): java.lang.Double = { + double2Double(d) + } +} + +trait BaseSystemMLEstimatorModel { + def toDouble(i:Int): java.lang.Double = { + double2Double(i.toDouble) + } + def toDouble(d:Double): java.lang.Double = { + double2Double(d) + } + + def transformSchema(schema: StructType): StructType = schema + + // Returns the script and variable for X + def getPredictionScript(mloutput: MLResults, isSingleNode:Boolean): (Script, String) +} + +trait BaseSystemMLClassifier extends BaseSystemMLEstimator { + def fit(X_mb: MatrixBlock, y_mb: MatrixBlock, sc: SparkContext): (MLResults, java.util.HashMap[Int, String]) = { val isSingleNode = true - val ml = new org.apache.sysml.api.mlcontext.MLContext(sc) + val ml = new MLContext(sc) val revLabelMapping = new java.util.HashMap[Int, String] PredictionUtils.fillLabelMapping(y_mb, revLabelMapping) val ret = getTrainingScript(isSingleNode) @@ -81,11 +106,11 @@ trait BaseSystemMLClassifier { (ml.execute(script), revLabelMapping) } - def fit(df: DataFrame, sc: SparkContext): (MLResults, java.util.HashMap[Int, String]) = { + def fit(df: ScriptsUtils.SparkDataType, sc: SparkContext): (MLResults, java.util.HashMap[Int, String]) = { val isSingleNode = false val ml = new MLContext(df.rdd.sparkContext) val mcXin = new MatrixCharacteristics() - val Xin = RDDConverterUtils.vectorDataFrameToBinaryBlock(sc, df, mcXin, false, "features") + val Xin = RDDConverterUtils.vectorDataFrameToBinaryBlock(sc, df.asInstanceOf[DataFrame], mcXin, false, "features") val revLabelMapping = new java.util.HashMap[Int, String] val yin = PredictionUtils.fillLabelMapping(df, revLabelMapping) val ret = getTrainingScript(isSingleNode) @@ -93,29 +118,9 @@ trait BaseSystemMLClassifier { val script = ret._1.in(ret._2, Xbin).in(ret._3, yin) (ml.execute(script), revLabelMapping) } - - def toDouble(i:Int): java.lang.Double = { - double2Double(i.toDouble) - } - def toDouble(d:Double): java.lang.Double = { - double2Double(d) - } - } -trait BaseSystemMLClassifierModel { - - def toDouble(i:Int): java.lang.Double = { - double2Double(i.toDouble) - } - def toDouble(d:Double): java.lang.Double = { - double2Double(d) - } - - def transformSchema(schema: 
StructType): StructType = schema - - // Returns the script and variable for X - def getPredictionScript(mloutput: MLResults, isSingleNode:Boolean): (Script, String) +trait BaseSystemMLClassifierModel extends BaseSystemMLEstimatorModel { def transform(X: MatrixBlock, mloutput: MLResults, labelMapping: java.util.HashMap[Int, String], sc: SparkContext, probVar:String): MatrixBlock = { val isSingleNode = true @@ -131,13 +136,13 @@ trait BaseSystemMLClassifierModel { PredictionUtils.updateLabels(isSingleNode, null, ret, null, labelMapping) return ret } - - def transform(df: DataFrame, mloutput: MLResults, labelMapping: java.util.HashMap[Int, String], sc: SparkContext, + + def transform(df: ScriptsUtils.SparkDataType, mloutput: MLResults, labelMapping: java.util.HashMap[Int, String], sc: SparkContext, probVar:String, outputProb:Boolean=true): DataFrame = { val isSingleNode = false val ml = new MLContext(sc) val mcXin = new MatrixCharacteristics() - val Xin = RDDConverterUtils.vectorDataFrameToBinaryBlock(df.rdd.sparkContext, df, mcXin, false, "features") + val Xin = RDDConverterUtils.vectorDataFrameToBinaryBlock(df.rdd.sparkContext, df.asInstanceOf[DataFrame], mcXin, false, "features") val script = getPredictionScript(mloutput, isSingleNode) val Xin_bin = new BinaryBlockMatrix(Xin, mcXin) val modelPredict = ml.execute(script._1.in(script._2, Xin_bin)) @@ -145,11 +150,11 @@ trait BaseSystemMLClassifierModel { val predictedDF = PredictionUtils.updateLabels(isSingleNode, predLabelOut.getDataFrame("Prediction"), null, "C1", labelMapping).select("ID", "prediction") if(outputProb) { val prob = modelPredict.getDataFrame(probVar, true).withColumnRenamed("C1", "probability").select("ID", "probability") - val dataset = RDDConverterUtils.addIDToDataFrame(df, df.sqlContext, "ID") - return PredictionUtils.joinUsingID(dataset, PredictionUtils.joinUsingID(prob, predictedDF)) + val dataset = RDDConverterUtils.addIDToDataFrame(df.asInstanceOf[DataFrame], df.sqlContext, "ID") + return PredictionUtils.joinUsingID(dataset, PredictionUtils.joinUsingID(prob, predictedDF)) } else { - val dataset = RDDConverterUtils.addIDToDataFrame(df, df.sqlContext, "ID") + val dataset = RDDConverterUtils.addIDToDataFrame(df.asInstanceOf[DataFrame], df.sqlContext, "ID") return PredictionUtils.joinUsingID(dataset, predictedDF) } diff --git a/src/main/scala/org/apache/sysml/api/ml/BaseSystemMLRegressor.scala b/src/main/scala/org/apache/sysml/api/ml/BaseSystemMLRegressor.scala new file mode 100644 index 00000000000..5bcde30a2ea --- /dev/null +++ b/src/main/scala/org/apache/sysml/api/ml/BaseSystemMLRegressor.scala @@ -0,0 +1,86 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.sysml.api.ml + +import org.apache.spark.rdd.RDD +import java.io.File +import org.apache.spark.SparkContext +import org.apache.spark.ml.{ Model, Estimator } +import org.apache.spark.sql.DataFrame +import org.apache.spark.sql.types.StructType +import org.apache.spark.ml.param.{ Params, Param, ParamMap, DoubleParam } +import org.apache.sysml.runtime.matrix.MatrixCharacteristics +import org.apache.sysml.runtime.matrix.data.MatrixBlock +import org.apache.sysml.runtime.DMLRuntimeException +import org.apache.sysml.runtime.instructions.spark.utils.{ RDDConverterUtilsExt => RDDConverterUtils } +import org.apache.sysml.api.mlcontext._ +import org.apache.sysml.api.mlcontext.ScriptFactory._ + +trait BaseSystemMLRegressor extends BaseSystemMLEstimator { + + def fit(X_mb: MatrixBlock, y_mb: MatrixBlock, sc: SparkContext): MLResults = { + val isSingleNode = true + val ml = new MLContext(sc) + val ret = getTrainingScript(isSingleNode) + val script = ret._1.in(ret._2, X_mb).in(ret._3, y_mb) + ml.execute(script) + } + + def fit(df: ScriptsUtils.SparkDataType, sc: SparkContext): MLResults = { + val isSingleNode = false + val ml = new MLContext(df.rdd.sparkContext) + val mcXin = new MatrixCharacteristics() + val Xin = RDDConverterUtils.vectorDataFrameToBinaryBlock(sc, df.asInstanceOf[DataFrame], mcXin, false, "features") + val yin = df.select("label") + val ret = getTrainingScript(isSingleNode) + val Xbin = new BinaryBlockMatrix(Xin, mcXin) + val script = ret._1.in(ret._2, Xbin).in(ret._3, yin) + ml.execute(script) + } +} + +trait BaseSystemMLRegressorModel extends BaseSystemMLEstimatorModel { + + def transform(X: MatrixBlock, mloutput: MLResults, sc: SparkContext, predictionVar:String): MatrixBlock = { + val isSingleNode = true + val ml = new MLContext(sc) + val script = getPredictionScript(mloutput, isSingleNode) + val modelPredict = ml.execute(script._1.in(script._2, X)) + val ret = modelPredict.getBinaryBlockMatrix(predictionVar).getMatrixBlock + + if(ret.getNumColumns != 1) { + throw new RuntimeException("Expected prediction to be a column vector") + } + return ret + } + + def transform(df: ScriptsUtils.SparkDataType, mloutput: MLResults, sc: SparkContext, predictionVar:String): DataFrame = { + val isSingleNode = false + val ml = new MLContext(sc) + val mcXin = new MatrixCharacteristics() + val Xin = RDDConverterUtils.vectorDataFrameToBinaryBlock(df.rdd.sparkContext, df.asInstanceOf[DataFrame], mcXin, false, "features") + val script = getPredictionScript(mloutput, isSingleNode) + val Xin_bin = new BinaryBlockMatrix(Xin, mcXin) + val modelPredict = ml.execute(script._1.in(script._2, Xin_bin)) + val predictedDF = modelPredict.getDataFrame(predictionVar).select("ID", "C1").withColumnRenamed("C1", "prediction") + val dataset = RDDConverterUtils.addIDToDataFrame(df.asInstanceOf[DataFrame], df.sqlContext, "ID") + return PredictionUtils.joinUsingID(dataset, predictedDF) + } +} \ No newline at end of file diff --git a/src/main/scala/org/apache/sysml/api/ml/LinearRegression.scala b/src/main/scala/org/apache/sysml/api/ml/LinearRegression.scala index 7f22f8f717b..cce646d76ea 100644 --- a/src/main/scala/org/apache/sysml/api/ml/LinearRegression.scala +++ b/src/main/scala/org/apache/sysml/api/ml/LinearRegression.scala @@ -19,17 +19,19 @@ package org.apache.sysml.api.ml +import org.apache.spark.rdd.RDD import java.io.File import org.apache.spark.SparkContext import org.apache.spark.ml.{ Model, Estimator } import org.apache.spark.sql.DataFrame import org.apache.spark.sql.types.StructType 
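// The hunk below rewrites LinearRegression against the new mlcontext API. As a hedged
// reference only (not part of this patch; `sc`, `X_mb` and `y_mb` are assumed to be an
// existing SparkContext and in-memory MatrixBlock inputs), the Script-based flow it
// adopts looks roughly like this, with parameter bindings abbreviated to mirror
// getTrainingScript below:
//
//   import org.apache.sysml.api.mlcontext._
//   import org.apache.sysml.api.mlcontext.ScriptFactory._
//
//   val ml = new MLContext(sc)
//   val script = dml(ScriptsUtils.getDMLScript(LinearRegression.scriptPathDS))
//     .in("$X", " ").in("$Y", " ").in("$B", " ").in("$fmt", "binary")
//     .in("X", X_mb).in("y", y_mb)      // bind in-memory inputs to the script variables
//     .out("beta_out")
//   val results: MLResults = ml.execute(script)
//   val beta: MatrixBlock = results.getBinaryBlockMatrix("beta_out").getMatrixBlock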
-import org.apache.spark.ml.param.ParamMap -import org.apache.sysml.api.{ MLContext, MLOutput } +import org.apache.spark.ml.param.{ Params, Param, ParamMap, DoubleParam } import org.apache.sysml.runtime.matrix.MatrixCharacteristics import org.apache.sysml.runtime.matrix.data.MatrixBlock import org.apache.sysml.runtime.DMLRuntimeException import org.apache.sysml.runtime.instructions.spark.utils.{ RDDConverterUtilsExt => RDDConverterUtils } +import org.apache.sysml.api.mlcontext._ +import org.apache.sysml.api.mlcontext.ScriptFactory._ object LinearRegression { final val scriptPathCG = "scripts" + File.separator + "algorithms" + File.separator + "LinearRegCG.dml" @@ -37,8 +39,9 @@ object LinearRegression { } // algorithm = "direct-solve", "conjugate-gradient" -class LinearRegression(override val uid: String, val sc: SparkContext, val solver:String="direct-solve") extends Estimator[LinearRegressionModel] with HasIcpt - with HasRegParam with HasTol with HasMaxOuterIter { +class LinearRegression(override val uid: String, val sc: SparkContext, val solver:String="direct-solve") + extends Estimator[LinearRegressionModel] with HasIcpt + with HasRegParam with HasTol with HasMaxOuterIter with BaseSystemMLRegressor { def setIcpt(value: Int) = set(icpt, value) def setMaxIter(value: Int) = set(maxOuterIter, value) @@ -49,97 +52,46 @@ class LinearRegression(override val uid: String, val sc: SparkContext, val solve val that = new LinearRegression(uid, sc, solver) copyValues(that, extra) } - def transformSchema(schema: StructType): StructType = schema - def fit(X_mb: MatrixBlock, y_mb: MatrixBlock): LinearRegressionModel = { - val ml = new MLContext(sc) - if(y_mb.getNumColumns != 1) { - throw new RuntimeException("Expected a column vector for y") - } - val mloutput = { - ml.registerInput("X", X_mb); - ml.registerInput("y", y_mb); - ml.registerOutput("beta_out"); - if(solver.compareTo("direct-solve") == 0) - ml.executeScript(ScriptsUtils.getDMLScript(LinearRegression.scriptPathDS), getParamMap()) - else if(solver.compareTo("newton-cg") == 0) { - ml.executeScript(ScriptsUtils.getDMLScript(LinearRegression.scriptPathCG), getParamMap()) - } - else { - throw new DMLRuntimeException("The algorithm should be direct-solve or conjugate-gradient") - } - } - new LinearRegressionModel("linearRegression")(mloutput, sc) + + def getTrainingScript(isSingleNode:Boolean):(Script, String, String) = { + val script = dml(ScriptsUtils.getDMLScript( + if(solver.compareTo("direct-solve") == 0) LinearRegression.scriptPathDS + else if(solver.compareTo("newton-cg") == 0) LinearRegression.scriptPathCG + else throw new DMLRuntimeException("The algorithm should be direct-solve or newton-cg"))) + .in("$X", " ") + .in("$Y", " ") + .in("$B", " ") + .in("$Log", " ") + .in("$fmt", "binary") + .in("$icpt", toDouble(getIcpt)) + .in("$reg", toDouble(getRegParam)) + .in("$tol", toDouble(getTol)) + .in("$maxi", toDouble(getMaxOuterIte)) + .out("beta_out") + (script, "X", "y") } - def getParamMap(): Map[String, String] = { - Map( "icpt" -> this.getIcpt.toString(), - "reg" -> this.getRegParam.toString(), - "tol" -> this.getTol.toString, - "maxi" -> this.getMaxOuterIte.toString, + def fit(X_mb: MatrixBlock, y_mb: MatrixBlock): LinearRegressionModel = + new LinearRegressionModel("lr")(fit(X_mb, y_mb, sc), sc) + + def fit(df: ScriptsUtils.SparkDataType): LinearRegressionModel = + new LinearRegressionModel("lr")(fit(df, sc), sc) - "X" -> " ", - "Y" -> " ", - "B" -> " ", - "O" -> " ", - "Log" -> " ", - "fmt" -> "binary") - } - - def fit(df: DataFrame): 
LinearRegressionModel = { - val ml = new MLContext(df.rdd.sparkContext) - val mcXin = new MatrixCharacteristics() - val Xin = RDDConverterUtils.vectorDataFrameToBinaryBlock(sc, df, mcXin, false, "features") - val yin = df.select("label") - val mloutput = { - ml.registerInput("X", Xin, mcXin); - ml.registerInput("y", yin); - ml.registerOutput("beta_out"); - if(solver.compareTo("direct-solve") == 0) - ml.executeScript(ScriptsUtils.getDMLScript(LinearRegression.scriptPathDS), getParamMap()) - else if(solver.compareTo("newton-cg") == 0) { - ml.executeScript(ScriptsUtils.getDMLScript(LinearRegression.scriptPathCG), getParamMap()) - } - else { - throw new DMLRuntimeException("The algorithm should be direct-solve or conjugate-gradient") - } - } - new LinearRegressionModel("linearRegression")(mloutput, sc) - } } -class LinearRegressionModel(override val uid: String)(val mloutput: MLOutput, val sc: SparkContext) extends Model[LinearRegressionModel] with HasIcpt - with HasRegParam with HasTol with HasMaxOuterIter { +class LinearRegressionModel(override val uid: String)(val mloutput: MLResults, val sc: SparkContext) extends Model[LinearRegressionModel] with HasIcpt + with HasRegParam with HasTol with HasMaxOuterIter with BaseSystemMLRegressorModel { override def copy(extra: ParamMap): LinearRegressionModel = { val that = new LinearRegressionModel(uid)(mloutput, sc) copyValues(that, extra) } - override def transformSchema(schema: StructType): StructType = schema + def getPredictionScript(mloutput: MLResults, isSingleNode:Boolean): (Script, String) = + PredictionUtils.getGLMPredictionScript(mloutput.getBinaryBlockMatrix("beta_out"), isSingleNode) - def transform(df: DataFrame): DataFrame = { - val isSingleNode = false - val glmPredOut = PredictionUtils.doGLMPredict(isSingleNode, df, null, sc, mloutput, "beta_out", getPredictParams()) - val predictedDF = glmPredOut.getDF(df.sqlContext, "means").withColumnRenamed("C1", "prediction") - val dataset = RDDConverterUtils.addIDToDataFrame(df, df.sqlContext, "ID") - return PredictionUtils.joinUsingID(dataset, predictedDF) - } - - def transform(X: MatrixBlock): MatrixBlock = { - val isSingleNode = true - return PredictionUtils.doGLMPredict(isSingleNode, null, X, sc, mloutput, "beta_out", getPredictParams()).getMatrixBlock("means") - } + def transform(df: ScriptsUtils.SparkDataType): DataFrame = transform(df, mloutput, sc, "means") + def transform(X: MatrixBlock): MatrixBlock = transform(X, mloutput, sc, "means") - def getPredictParams(): Map[String, String] = { - Map("X" -> " ", - "B" -> " ", - // Gaussian distribution - "dfam" -> "1", "vpow" -> "0.0", - // identity link function - "link" -> "1", "lpow" -> "1.0" -// // Dispersion value: TODO -// ,"disp" -> "5.0" - ) - } } \ No newline at end of file diff --git a/src/main/scala/org/apache/sysml/api/ml/LogisticRegression.scala b/src/main/scala/org/apache/sysml/api/ml/LogisticRegression.scala index 3098da9c21d..a9ca6ab188c 100644 --- a/src/main/scala/org/apache/sysml/api/ml/LogisticRegression.scala +++ b/src/main/scala/org/apache/sysml/api/ml/LogisticRegression.scala @@ -60,7 +60,7 @@ class LogisticRegression(override val uid: String, val sc: SparkContext) extends new LogisticRegressionModel("log")(ret._1, ret._2, sc) } - def fit(df: DataFrame): LogisticRegressionModel = { + def fit(df: ScriptsUtils.SparkDataType): LogisticRegressionModel = { val ret = fit(df, sc) new LogisticRegressionModel("log")(ret._1, ret._2, sc) } @@ -100,10 +100,10 @@ class LogisticRegressionModel(override val uid: String)( def 
setOutputRawPredictions(outRawPred:Boolean): Unit = { outputRawPredictions = outRawPred } def getPredictionScript(mloutput: MLResults, isSingleNode:Boolean): (Script, String) = - PredictionUtils.getGLMPredictionScript(mloutput.getBinaryBlockMatrix("B_out"), isSingleNode) + PredictionUtils.getGLMPredictionScript(mloutput.getBinaryBlockMatrix("B_out"), isSingleNode, 3) def transform(X: MatrixBlock): MatrixBlock = transform(X, mloutput, labelMapping, sc, "means") - def transform(df: DataFrame): DataFrame = transform(df, mloutput, labelMapping, sc, "means") + def transform(df: ScriptsUtils.SparkDataType): DataFrame = transform(df, mloutput, labelMapping, sc, "means") } /** diff --git a/src/main/scala/org/apache/sysml/api/ml/NaiveBayes.scala b/src/main/scala/org/apache/sysml/api/ml/NaiveBayes.scala index 28836221582..fd05f27828c 100644 --- a/src/main/scala/org/apache/sysml/api/ml/NaiveBayes.scala +++ b/src/main/scala/org/apache/sysml/api/ml/NaiveBayes.scala @@ -50,7 +50,7 @@ class NaiveBayes(override val uid: String, val sc: SparkContext) extends Estimat new NaiveBayesModel("naive")(ret._1, ret._2, sc) } - def fit(df: DataFrame): NaiveBayesModel = { + def fit(df: ScriptsUtils.SparkDataType): NaiveBayesModel = { val ret = fit(df, sc) new NaiveBayesModel("naive")(ret._1, ret._2, sc) } @@ -104,6 +104,6 @@ class NaiveBayesModel(override val uid: String) } def transform(X: MatrixBlock): MatrixBlock = transform(X, mloutput, labelMapping, sc, "probs") - def transform(df: DataFrame): DataFrame = transform(df, mloutput, labelMapping, sc, "probs") + def transform(df: ScriptsUtils.SparkDataType): DataFrame = transform(df, mloutput, labelMapping, sc, "probs") } \ No newline at end of file diff --git a/src/main/scala/org/apache/sysml/api/ml/PredictionUtils.scala b/src/main/scala/org/apache/sysml/api/ml/PredictionUtils.scala index f91a82cadc2..8e3893d578b 100644 --- a/src/main/scala/org/apache/sysml/api/ml/PredictionUtils.scala +++ b/src/main/scala/org/apache/sysml/api/ml/PredictionUtils.scala @@ -21,7 +21,6 @@ package org.apache.sysml.api.ml import org.apache.spark.sql.functions.udf import org.apache.spark.rdd.RDD -import org.apache.sysml.api.{ MLContext, MLOutput } import org.apache.spark.sql.DataFrame import org.apache.spark.SparkContext import org.apache.sysml.runtime.matrix.data.MatrixBlock @@ -35,11 +34,11 @@ import org.apache.sysml.api.mlcontext.BinaryBlockMatrix object PredictionUtils { - def getGLMPredictionScript(B_full: BinaryBlockMatrix, isSingleNode:Boolean): (Script, String) = { + def getGLMPredictionScript(B_full: BinaryBlockMatrix, isSingleNode:Boolean, dfam:java.lang.Integer=1): (Script, String) = { val script = dml(ScriptsUtils.getDMLScript(LogisticRegressionModel.scriptPath)) .in("$X", " ") .in("$B", " ") - .in("$dfam", "3") + .in("$dfam", dfam) .out("means") val ret = if(isSingleNode) { script.in("B_full", B_full.getMatrixBlock, B_full.getMatrixMetadata) @@ -50,23 +49,7 @@ object PredictionUtils { (ret, "X") } - def doGLMPredict(isSingleNode:Boolean, df:DataFrame, X: MatrixBlock, sc:SparkContext, mloutput:MLOutput, B:String, paramsMap: Map[String, String]): MLOutput = { - val ml = new MLContext(sc) - if(isSingleNode) { - ml.registerInput("X", X); - ml.registerInput("B_full", mloutput.getMatrixBlock(B), mloutput.getMatrixCharacteristics(B)); - } - else { - val mcXin = new MatrixCharacteristics() - val Xin = RDDConverterUtils.vectorDataFrameToBinaryBlock(df.rdd.sparkContext, df, mcXin, false, "features") - ml.registerInput("X", Xin, mcXin); - ml.registerInput("B_full", 
mloutput.getBinaryBlockedRDD(B), mloutput.getMatrixCharacteristics(B)); - } - ml.registerOutput("means"); - ml.executeScript(ScriptsUtils.getDMLScript(LogisticRegressionModel.scriptPath), paramsMap) - } - - def fillLabelMapping(df: DataFrame, revLabelMapping: java.util.HashMap[Int, String]): RDD[String] = { + def fillLabelMapping(df: ScriptsUtils.SparkDataType, revLabelMapping: java.util.HashMap[Int, String]): RDD[String] = { val temp = df.select("label").distinct.rdd.map(_.apply(0).toString).collect() val labelMapping = new java.util.HashMap[String, Int] for(i <- 0 until temp.length) { @@ -152,23 +135,6 @@ object PredictionUtils { tempDF1.join(df2, tempDF1.col("ID1").equalTo(df2.col("ID"))).drop("ID1") } - def computePredictedClassLabelsFromProbability(mlscoreoutput:MLOutput, isSingleNode:Boolean, sc:SparkContext, inProbVar:String): MLOutput = { - val mlNew = new MLContext(sc) - if(isSingleNode) { - mlNew.registerInput("Prob", mlscoreoutput.getMatrixBlock(inProbVar), mlscoreoutput.getMatrixCharacteristics(inProbVar)); - } - else { - mlNew.registerInput("Prob", mlscoreoutput.getBinaryBlockedRDD(inProbVar), mlscoreoutput.getMatrixCharacteristics(inProbVar)); - } - mlNew.registerOutput("Prediction") - mlNew.executeScript( - """ - Prob = read("temp1"); - Prediction = rowIndexMax(Prob); # assuming one-based label mapping - write(Prediction, "tempOut", "csv"); - """) - } - def computePredictedClassLabelsFromProbability(mlscoreoutput:MLResults, isSingleNode:Boolean, sc:SparkContext, inProbVar:String): MLResults = { val ml = new org.apache.sysml.api.mlcontext.MLContext(sc) val script = dml( diff --git a/src/main/scala/org/apache/sysml/api/ml/SVM.scala b/src/main/scala/org/apache/sysml/api/ml/SVM.scala index 93e91ec4660..07a7283a80d 100644 --- a/src/main/scala/org/apache/sysml/api/ml/SVM.scala +++ b/src/main/scala/org/apache/sysml/api/ml/SVM.scala @@ -71,7 +71,7 @@ class SVM (override val uid: String, val sc: SparkContext, val isMultiClass:Bool new SVMModel("svm")(ret._1, sc, isMultiClass, ret._2) } - def fit(df: DataFrame): SVMModel = { + def fit(df: ScriptsUtils.SparkDataType): SVMModel = { val ret = fit(df, sc) new SVMModel("svm")(ret._1, sc, isMultiClass, ret._2) } @@ -109,5 +109,5 @@ class SVMModel (override val uid: String)(val mloutput: MLResults, val sc: Spark } def transform(X: MatrixBlock): MatrixBlock = transform(X, mloutput, labelMapping, sc, "scores") - def transform(df: DataFrame): DataFrame = transform(df, mloutput, labelMapping, sc, "scores") + def transform(df: ScriptsUtils.SparkDataType): DataFrame = transform(df, mloutput, labelMapping, sc, "scores") } \ No newline at end of file diff --git a/src/main/scala/org/apache/sysml/api/ml/ScriptsUtils.scala b/src/main/scala/org/apache/sysml/api/ml/ScriptsUtils.scala index fdf682d2b7b..10f9d33bf32 100644 --- a/src/main/scala/org/apache/sysml/api/ml/ScriptsUtils.scala +++ b/src/main/scala/org/apache/sysml/api/ml/ScriptsUtils.scala @@ -26,6 +26,8 @@ import org.apache.sysml.runtime.DMLRuntimeException object ScriptsUtils { var systemmlHome = System.getenv("SYSTEMML_HOME") + + type SparkDataType = org.apache.spark.sql.DataFrame // org.apache.spark.sql.Dataset[_] /** * set SystemML home
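// End-to-end, the estimators migrated in this patch series are meant to be driven as
// sketched below. This is a hedged usage sketch, not part of the patch itself; `sc`,
// `X_mb`, `y_mb` and `X_test_mb` stand for an existing SparkContext and user-provided
// MatrixBlock inputs.
//
//   import org.apache.sysml.api.ml.{ NaiveBayes, NaiveBayesModel }
//   import org.apache.sysml.runtime.matrix.data.MatrixBlock
//
//   val nb = new NaiveBayes("naive-bayes", sc).setLaplace(1.0)
//   val model: NaiveBayesModel = nb.fit(X_mb, y_mb)           // single-node path (MatrixBlock in/out)
//   val predictions: MatrixBlock = model.transform(X_test_mb) // column vector of predicted labels
//
//   // DataFrame path: the input DataFrame must expose 'features' (Vector) and 'label'
//   // columns, mirroring the checks in the Python mllearn wrappers above:
//   // val model2 = nb.fit(df); val scored = model2.transform(dfTest)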