From c1bc4fd1639a75afd914f3bd7f280a3880548798 Mon Sep 17 00:00:00 2001 From: Niketan Pansare Date: Fri, 5 Aug 2016 08:17:47 -0700 Subject: [PATCH 01/14] [SYSTEMML-234] [SYSTEMML-208] Added mllearn library to support scikit-learn and MLPipeline --- .../java/org/apache/sysml/api/MLContext.java | 27 ++- .../java/org/apache/sysml/api/MLOutput.java | 39 +++- .../org/apache/sysml/api/python/SystemML.py | 174 +++++++++++++- .../spark/utils/RDDConverterUtilsExt.java | 43 ++++ .../sysml/api/ml/LogisticRegression.scala | 218 ++++++++++++++---- 5 files changed, 441 insertions(+), 60 deletions(-) diff --git a/src/main/java/org/apache/sysml/api/MLContext.java b/src/main/java/org/apache/sysml/api/MLContext.java index a03c8b7753e..32b0ce949e5 100644 --- a/src/main/java/org/apache/sysml/api/MLContext.java +++ b/src/main/java/org/apache/sysml/api/MLContext.java @@ -477,6 +477,25 @@ public void registerInput(String varName, RDD rdd, String format, long r registerInput(varName, rdd.toJavaRDD().mapToPair(new ConvertStringToLongTextPair()), format, rlen, clen, nnz, null); } + public void registerInput(String varName, MatrixBlock mb) throws DMLRuntimeException { + MatrixCharacteristics mc = new MatrixCharacteristics(mb.getNumRows(), mb.getNumColumns(), OptimizerUtils.DEFAULT_BLOCKSIZE, OptimizerUtils.DEFAULT_BLOCKSIZE, mb.getNonZeros()); + registerInput(varName, mb, mc); + } + + public void registerInput(String varName, MatrixBlock mb, MatrixCharacteristics mc) throws DMLRuntimeException { + if(_variables == null) + _variables = new LocalVariableMap(); + if(_inVarnames == null) + _inVarnames = new ArrayList(); + + MatrixObject mo = new MatrixObject(ValueType.DOUBLE, "temp", new MatrixFormatMetaData(mc, OutputInfo.BinaryBlockOutputInfo, InputInfo.BinaryBlockInputInfo)); + mo.acquireModify(mb); + mo.release(); + _variables.put(varName, mo); + _inVarnames.add(varName); + checkIfRegisteringInputAllowed(); + } + // All CSV related methods call this ... It provides access to dimensions, nnz, file properties. 
private void registerInput(String varName, JavaPairRDD textOrCsv_rdd, String format, long rlen, long clen, long nnz, FileFormatProperties props) throws DMLRuntimeException { if(!(DMLScript.rtplatform == RUNTIME_PLATFORM.SPARK || DMLScript.rtplatform == RUNTIME_PLATFORM.HYBRID_SPARK)) { @@ -1237,8 +1256,6 @@ private synchronized MLOutput compileAndExecuteScript(String dmlScriptFilePath, if(DMLScript.rtplatform == RUNTIME_PLATFORM.SPARK || DMLScript.rtplatform == RUNTIME_PLATFORM.HYBRID_SPARK) { - Map> retVal = null; - // Depending on whether registerInput/registerOutput was called initialize the variables String[] inputs; String[] outputs; if(_inVarnames != null) { @@ -1268,10 +1285,6 @@ private synchronized MLOutput compileAndExecuteScript(String dmlScriptFilePath, for( String ovar : _outVarnames ) { if( _variables.keySet().contains(ovar) ) { - if(retVal == null) { - retVal = new HashMap>(); - } - retVal.put(ovar, ((SparkExecutionContext) ec).getBinaryBlockRDDHandleForVariable(ovar)); outMetadata.put(ovar, ec.getMatrixCharacteristics(ovar)); // For converting output to dataframe } else { @@ -1280,7 +1293,7 @@ private synchronized MLOutput compileAndExecuteScript(String dmlScriptFilePath, } } - return new MLOutput(retVal, outMetadata); + return new MLOutput(_variables, ec, outMetadata); } else { throw new DMLRuntimeException("Unsupported runtime:" + DMLScript.rtplatform.name()); diff --git a/src/main/java/org/apache/sysml/api/MLOutput.java b/src/main/java/org/apache/sysml/api/MLOutput.java index a3e601910c7..3ef68a9f151 100644 --- a/src/main/java/org/apache/sysml/api/MLOutput.java +++ b/src/main/java/org/apache/sysml/api/MLOutput.java @@ -39,13 +39,16 @@ import org.apache.spark.sql.types.StructField; import org.apache.spark.sql.types.StructType; import org.apache.sysml.runtime.DMLRuntimeException; +import org.apache.sysml.runtime.controlprogram.LocalVariableMap; +import org.apache.sysml.runtime.controlprogram.context.ExecutionContext; +import org.apache.sysml.runtime.controlprogram.context.SparkExecutionContext; import org.apache.sysml.runtime.instructions.spark.functions.GetMLBlock; import org.apache.sysml.runtime.instructions.spark.utils.RDDConverterUtilsExt; import org.apache.sysml.runtime.matrix.MatrixCharacteristics; import org.apache.sysml.runtime.matrix.data.MatrixBlock; import org.apache.sysml.runtime.matrix.data.MatrixIndexes; import org.apache.sysml.runtime.util.UtilFunctions; - +import org.apache.sysml.runtime.controlprogram.caching.MatrixObject; import scala.Tuple2; /** @@ -54,25 +57,39 @@ */ public class MLOutput { - - - Map> _outputs; + private LocalVariableMap _variables; + private ExecutionContext _ec; private Map _outMetadata = null; - public MLOutput(Map> outputs, Map outMetadata) { - this._outputs = outputs; + public MLOutput(LocalVariableMap variables, ExecutionContext ec, Map outMetadata) { + this._variables = variables; + this._ec = ec; this._outMetadata = outMetadata; } + public MatrixBlock getMatrixBlock(String varName) throws DMLRuntimeException { + if( _variables.keySet().contains(varName) ) { + MatrixObject mo = _ec.getMatrixObject(varName); + MatrixBlock mb = mo.acquireRead(); + mo.release(); + return mb; + } + else { + throw new DMLRuntimeException("Variable " + varName + " not found in the output symbol table."); + } + } + public JavaPairRDD getBinaryBlockedRDD(String varName) throws DMLRuntimeException { - if(_outputs.containsKey(varName)) { - return _outputs.get(varName); + if( _variables.keySet().contains(varName) ) { + return ((SparkExecutionContext) 
_ec).getBinaryBlockRDDHandleForVariable(varName); + } + else { + throw new DMLRuntimeException("Variable " + varName + " not found in the output symbol table."); } - throw new DMLRuntimeException("Variable " + varName + " not found in the output symbol table."); } public MatrixCharacteristics getMatrixCharacteristics(String varName) throws DMLRuntimeException { - if(_outputs.containsKey(varName)) { + if(_outMetadata.containsKey(varName)) { return _outMetadata.get(varName); } throw new DMLRuntimeException("Variable " + varName + " not found in the output symbol table."); @@ -246,7 +263,7 @@ public Iterable>> call(Tuple2>(startRowIndex + i, new Tuple2(kv._1.getColumnIndex(), partialRow))); + retVal.add(new Tuple2>(startRowIndex + i + 1, new Tuple2(kv._1.getColumnIndex(), partialRow))); } return retVal; } diff --git a/src/main/java/org/apache/sysml/api/python/SystemML.py b/src/main/java/org/apache/sysml/api/python/SystemML.py index 8ad3117db65..3dfef67eb17 100644 --- a/src/main/java/org/apache/sysml/api/python/SystemML.py +++ b/src/main/java/org/apache/sysml/api/python/SystemML.py @@ -25,7 +25,13 @@ import os from pyspark.sql import DataFrame, SQLContext from pyspark.rdd import RDD - +import numpy as np +import pandas as pd +import sklearn as sk +from pyspark.ml.feature import VectorAssembler +from pyspark.mllib.linalg import Vectors +import sys +from pyspark.ml import Estimator, Model class MLContext(object): @@ -57,6 +63,7 @@ def __init__(self, sc, *args): setForcedSparkExecType = (args[1] if len(args) > 1 else False) self.sc = sc self.ml = sc._jvm.org.apache.sysml.api.MLContext(sc._jsc, monitorPerformance, setForcedSparkExecType) + self.sqlCtx = SQLContext(sc) except Py4JError: traceback.print_exc() @@ -171,7 +178,6 @@ def registerInput(self, varName, src, *args): else: raise TypeError('Arguments do not match MLContext-API') except Py4JJavaError: - traceback.print_exc() def registerOutput(self, varName): @@ -232,6 +238,10 @@ def getDF(self, sqlContext, varName): except Py4JJavaError: traceback.print_exc() + def getPandasDF(self, sqlContext, varName): + df = self.toDF(sqlContext, varName).sort('ID').drop('ID') + return df.toPandas() + def getMLMatrix(self, sqlContext, varName): raise Exception('Not supported in Python MLContext') #try: @@ -247,3 +257,163 @@ def getStringRDD(self, varName, format): # return rdd #except Py4JJavaError: # traceback.print_exc() + +def getNumCols(numPyArr): + if len(numPyArr.shape) == 1: + return 1 + else: + return numPyArr.shape[1] + +def convertToJavaMatrix(sc, src): + if isinstance(src, np.ndarray): + from array import array + numCols = getNumCols(src) + numRows = src.shape[0] + if src.dtype.type is np.float64: + arr = src.reshape(-1) + else: + arr = array('d', src.reshape(-1)) + buf = bytearray(arr.tostring()) + return sc._jvm.org.apache.sysml.runtime.instructions.spark.utils.RDDConverterUtilsExt.convertPy4JArrayToMB(buf, numRows, numCols) + else: + raise Exception('Type is not supported') + +def convertToNumpyArr(sc, mb): + numRows = mb.getNumRows() + numCols = mb.getNumColumns() + buf = sc._jvm.org.apache.sysml.runtime.instructions.spark.utils.RDDConverterUtilsExt.convertMBtoPy4JDenseArr(mb) + return np.frombuffer(buf, count=numRows*numCols, dtype=np.float64) + +class mllearn: + # Or we can create new Python project with package structure + class LogisticRegression(Estimator): + + def __init__(self, sqlCtx, penalty='l2', fit_intercept=True, max_iter=100, max_inner_iter=0, tol=0.000001, C=1.0, solver='newton-cg', transferUsingDF=False): + self.sqlCtx = 
sqlCtx + self.sc = sqlCtx._sc + self.log = self.sc._jvm.org.apache.sysml.api.ml.LogisticRegression("lr", self.sc._jsc.sc()) + self.transferUsingDF = transferUsingDF + if penalty != 'l2': + raise Exception('Only l2 penalty is supported') + if fit_intercept: + self.icpt = 1 + else: + self.icpt = 0 + self.max_iter = max_iter + self.max_inner_iter = max_inner_iter + self.tol = tol + if C == 0: + raise Exception('C cannot be 0') + reg = 1/C + self.reg = reg + self.updateLog() + if solver != 'newton-cg': + raise Exception('Only newton-cg solver supported') + + def updateLog(self): + self.log.setMaxOuterIter(self.max_iter) + self.log.setMaxInnerIter(self.max_inner_iter) + self.log.setRegParam(self.reg) + self.log.setTol(self.tol) + self.log.setIcpt(self.icpt) + + def convertToPDF(self, X): + if isinstance(X, np.ndarray): + colNames = [] + numCols = getNumCols(X) + for i in range(0, numCols): + colNames = colNames + [ str('C' + str(i))] + pdfX = pd.DataFrame(X, columns=colNames) + elif isinstance(X, pd.core.frame.DataFrame): + pdfX = X + else: + raise Exception('The input type not supported') + return pdfX + + def tolist(self, inputCols): + if isinstance(inputCols, pd.indexes.base.Index): + return inputCols.get_values().tolist() + elif isinstance(inputCols, list): + return inputCols + else: + raise Exception('inputCols should be of type pandas.indexes.base.Index or list') + + def assemble(self, pdf, inputCols, outputCol): + tmpDF = self.sqlCtx.createDataFrame(pdf, self.tolist(pdf.columns)) + assembler = VectorAssembler(inputCols=self.tolist(inputCols), outputCol=outputCol) + return assembler.transform(tmpDF) + + def _fit(self, X): + if hasattr(X, '_jdf') and 'features' in X.columns and 'label' in X.columns: + self.model = self.log.fit(X._jdf) + return self + else: + raise Exception('Incorrect usage: Expected dataframe as input with features/label as columns') + + # TOOD: Ignoring kwargs + def fit(self, X, *args, **kwargs): + self.updateLog() + numArgs = len(args) + 1 + if numArgs == 1: + return self._fit(X) + elif numArgs == 2 and (isinstance(X, np.ndarray) or isinstance(X, pd.core.frame.DataFrame)): + y = args[0] + if self.transferUsingDF: + pdfX = self.convertToPDF(X) + pdfY = self.convertToPDF(y) + if getNumCols(pdfY) != 1: + raise Exception('y should be a column vector') + if pdfX.shape[0] != pdfY.shape[0]: + raise Exception('Number of rows of X and y should match') + colNames = pdfX.columns + pdfX['label'] = pdfY[pdfY.columns[0]] + df = self.assemble(pdfX, colNames, 'features').select('features', 'label') + self.model = self.log.fit(df._jdf) + else: + numColsy = getNumCols(y) + if numColsy != 1: + raise Exception('Expected y to be a column vector') + self.model = self.log.fit(convertToJavaMatrix(self.sc, X), convertToJavaMatrix(self.sc, y)) + self.model.setOutputRawPredictions(False) + return self + else: + raise Exception('Unsupported input type') + + def transform(self, X): + return self.predict(X) + + def predict(self, X): + if isinstance(X, np.ndarray) or isinstance(X, pd.core.frame.DataFrame): + if self.transferUsingDF: + pdfX = self.convertToPDF(X) + df = self.assemble(pdfX, pdfX.columns, 'features').select('features') + retjDF = self.model.transform(df._jdf) + retDF = DataFrame(retjDF, self.sqlCtx) + retPDF = retDF.sort('ID').select('prediction').toPandas() + if isinstance(X, np.ndarray): + return retPDF.as_matrix().flatten() + else: + return retPDF + else: + retNumPy = convertToNumpyArr(self.sc, self.model.transform(convertToJavaMatrix(self.sc, X))) + if isinstance(X, np.ndarray): + 
return retNumPy + else: + return retNumPy # TODO: Convert to Pandas + elif hasattr(X, '_jdf'): + if 'features' in X.columns: + # No need to assemble as input DF is likely coming via MLPipeline + df = X + else: + assembler = VectorAssembler(inputCols=X.columns, outputCol='features') + df = assembler.transform(X) + retjDF = self.model.transform(df._jdf) + retDF = DataFrame(retjDF, self.sqlCtx) + # Return DF + return retDF.sort('ID') + else: + raise Exception('Unsupported input type') + + def score(self, X, y): + return sk.metrics.accuracy_score(y, self.predict(X)) + \ No newline at end of file diff --git a/src/main/java/org/apache/sysml/runtime/instructions/spark/utils/RDDConverterUtilsExt.java b/src/main/java/org/apache/sysml/runtime/instructions/spark/utils/RDDConverterUtilsExt.java index f022e40b947..114e78fc3d3 100644 --- a/src/main/java/org/apache/sysml/runtime/instructions/spark/utils/RDDConverterUtilsExt.java +++ b/src/main/java/org/apache/sysml/runtime/instructions/spark/utils/RDDConverterUtilsExt.java @@ -46,6 +46,8 @@ import org.apache.spark.sql.types.DataTypes; import org.apache.spark.sql.types.StructField; import org.apache.spark.sql.types.StructType; +import java.nio.ByteBuffer; +import java.nio.ByteOrder; import scala.Tuple2; @@ -260,6 +262,47 @@ public static JavaPairRDD dataFrameToBinaryBlock(Jav return dataFrameToBinaryBlock(sc, df, mcOut, false, columns); } + public static MatrixBlock convertPy4JArrayToMB(byte [] data, int rlen, int clen) throws DMLRuntimeException { + return convertPy4JArrayToMB(data, rlen, clen, false); + } + + public static MatrixBlock convertPy4JArrayToMB(byte [] data, int rlen, int clen, boolean isSparse) throws DMLRuntimeException { + MatrixBlock mb = new MatrixBlock(rlen, clen, isSparse, -1); + if(isSparse) { + throw new DMLRuntimeException("Convertion to sparse format not supported"); + } + else { + double [] denseBlock = new double[rlen*clen]; + ByteBuffer buf = ByteBuffer.wrap(data); + buf.order(ByteOrder.nativeOrder()); + for(int i = 0; i < rlen*clen; i++) { + denseBlock[i] = buf.getDouble(); + } + mb.init( denseBlock, rlen, clen ); + } + mb.examSparsity(); + return mb; + } + + public static byte [] convertMBtoPy4JDenseArr(MatrixBlock mb) throws DMLRuntimeException { + byte [] ret = null; + if(mb.isInSparseFormat()) { + throw new DMLRuntimeException("Sparse to dense conversion is not yet implemented"); + } + else { + double [] denseBlock = mb.getDenseBlock(); + if(denseBlock == null) { + throw new DMLRuntimeException("Sparse to dense conversion is not yet implemented"); + } + int times = Double.SIZE / Byte.SIZE; + ret = new byte[denseBlock.length * times]; + for(int i=0;i < denseBlock.length;i++){ + ByteBuffer.wrap(ret, i*times, times).order(ByteOrder.nativeOrder()).putDouble(denseBlock[i]); + } + } + return ret; + } + /** * Converts DataFrame into binary blocked RDD. * Note: mcOut will be set if you don't know the dimensions. 
diff --git a/src/main/scala/org/apache/sysml/api/ml/LogisticRegression.scala b/src/main/scala/org/apache/sysml/api/ml/LogisticRegression.scala index 2fabde1b14a..c679ff6d212 100644 --- a/src/main/scala/org/apache/sysml/api/ml/LogisticRegression.scala +++ b/src/main/scala/org/apache/sysml/api/ml/LogisticRegression.scala @@ -34,6 +34,10 @@ import org.apache.spark.SparkConf import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.regression.LabeledPoint import scala.reflect.ClassTag +import scala.collection.immutable.HashMap +import org.apache.spark.sql.functions.udf +import org.apache.sysml.runtime.matrix.data.MatrixBlock +import org.apache.sysml.runtime.DMLRuntimeException trait HasIcpt extends Params { final val icpt: Param[Int] = new Param[Int](this, "icpt", "Intercept presence, shifting and rescaling X columns") @@ -81,12 +85,62 @@ class LogisticRegression(override val uid: String, val sc: SparkContext) extends copyValues(that, extra) } override def transformSchema(schema: StructType): StructType = schema + + // Note: will update the y_mb as this will be called by Python mllearn + def fit(X_mb: MatrixBlock, y_mb: MatrixBlock): LogisticRegressionModel = { + val ml = new MLContext(sc) + val labelMapping = new java.util.HashMap[String, Int] + val revLabelMapping = new java.util.HashMap[Int, String] + if(y_mb.getNumColumns != 1) { + throw new RuntimeException("Expected a column vector for y") + } + if(y_mb.isInSparseFormat()) { + throw new DMLRuntimeException("Sparse block is not implemented for fit") + } + else { + val denseBlock = y_mb.getDenseBlock() + var id:Int = 1 + for(i <- 0 until denseBlock.length) { + val v = denseBlock(i).toString() + if(!labelMapping.containsKey(v)) { + labelMapping.put(v, id) + revLabelMapping.put(id, v) + id += 1 + } + denseBlock.update(i, labelMapping.get(v)) + } + } + + val mloutput = { + val paramsMap: Map[String, String] = Map( + "icpt" -> this.getIcpt.toString(), + "reg" -> this.getRegParam.toString(), + "tol" -> this.getTol.toString, + "moi" -> this.getMaxOuterIte.toString, + "mii" -> this.getMaxInnerIter.toString, + + "X" -> " ", + "Y" -> " ", + "B" -> " ") + ml.registerInput("X", X_mb); + ml.registerInput("Y_vec", y_mb); + ml.registerOutput("B_out"); + ml.executeScript(ScriptsUtils.getDMLScript(LogisticRegression.scriptPath), paramsMap) + } + new LogisticRegressionModel("logisticRegression")(mloutput, revLabelMapping, sc) + } override def fit(df: DataFrame): LogisticRegressionModel = { val ml = new MLContext(df.rdd.sparkContext) val mcXin = new MatrixCharacteristics() val Xin = RDDConverterUtils.vectorDataFrameToBinaryBlock(sc, df, mcXin, false, "features") - val yin = df.select("label").rdd.map { _.apply(0).toString() } - + val temp = df.select("label").distinct.rdd.map(_.apply(0).toString).collect() + val labelMapping = new java.util.HashMap[String, Int] + val revLabelMapping = new java.util.HashMap[Int, String] + for(i <- 0 until temp.length) { + labelMapping.put(temp(i), i+1) + revLabelMapping.put(i+1, temp(i)) + } + val yin = df.select("label").rdd.map( x => labelMapping.get(x.apply(0).toString).toString ) val mloutput = { val paramsMap: Map[String, String] = Map( "icpt" -> this.getIcpt.toString(), @@ -102,67 +156,151 @@ class LogisticRegression(override val uid: String, val sc: SparkContext) extends ml.registerInput("Y_vec", yin, "csv"); ml.registerOutput("B_out"); ml.executeScript(ScriptsUtils.getDMLScript(LogisticRegression.scriptPath), paramsMap) - //ml.execute(ScriptsUtils.resolvePath(LogisticRegression.scriptPath), 
paramsMap) } - new LogisticRegressionModel("logisticRegression")(mloutput) + new LogisticRegressionModel("logisticRegression")(mloutput, revLabelMapping, sc) } } object LogisticRegressionModel { final val scriptPath = "scripts" + File.separator + "algorithms" + File.separator + "GLM-predict.dml" } +class LogisticRegressionModelSerializableData(val labelMapping: java.util.HashMap[Int, String]) extends Serializable { + def mapLabelStr(x:Double):String = { + if(labelMapping.containsKey(x.toInt)) + labelMapping.get(x.toInt) + else + throw new RuntimeException("Incorrect label mapping") + } + def mapLabelDouble(x:Double):Double = { + if(labelMapping.containsKey(x.toInt)) + labelMapping.get(x.toInt).toDouble + else + throw new RuntimeException("Incorrect label mapping") + } + val mapLabel_udf = { + try { + val it = labelMapping.values().iterator() + while(it.hasNext()) { + it.next().toDouble + } + udf(mapLabelDouble _) + } catch { + case e: Exception => udf(mapLabelStr _) + } + } +} /** * Logistic Regression Scala API */ class LogisticRegressionModel( override val uid: String)( - val mloutput: MLOutput) extends Model[LogisticRegressionModel] with HasIcpt + val mloutput: MLOutput, val labelMapping: java.util.HashMap[Int, String], val sc: SparkContext) extends Model[LogisticRegressionModel] with HasIcpt with HasRegParam with HasTol with HasMaxOuterIter with HasMaxInnerIter { override def copy(extra: ParamMap): LogisticRegressionModel = { - val that = new LogisticRegressionModel(uid)(mloutput) + val that = new LogisticRegressionModel(uid)(mloutput, labelMapping, sc) copyValues(that, extra) } + var outputRawPredictions = true + def setOutputRawPredictions(outRawPred:Boolean): Unit = { outputRawPredictions = outRawPred } override def transformSchema(schema: StructType): StructType = schema - override def transform(df: DataFrame): DataFrame = { - val ml = new MLContext(df.rdd.sparkContext) - - val mcXin = new MatrixCharacteristics() - val Xin = RDDConverterUtils.vectorDataFrameToBinaryBlock(df.rdd.sparkContext, df, mcXin, false, "features") - - val mlscoreoutput = { - val paramsMap: Map[String, String] = Map( + + def transform(X: MatrixBlock): MatrixBlock = { + if(outputRawPredictions) { + throw new RuntimeException("Outputting raw prediction is not supported") + } + else { + val isSingleNode = true + val ret = computePredictedLabels(doGLMPredict(isSingleNode, null, X), isSingleNode).getMatrixBlock("Prediction"); + if(ret.getNumColumns != 1) { + throw new RuntimeException("Expected predicted label to be a column vector") + } + if(ret.isInSparseFormat()) { + throw new RuntimeException("Since predicted label is a column vector, expected it to be in dense format") + } + else { + updateLabels(true, null, ret, null) + } + return ret + } + } + + def updateLabels(isSingleNode:Boolean, df:DataFrame, X: MatrixBlock, labelColName:String): DataFrame = { + if(isSingleNode) { + for(i <- 0 until X.getNumRows) { + val v:Int = X.getValue(i, 0).toInt + if(labelMapping.containsKey(v)) { + X.setValue(i, 0, labelMapping.get(v).toDouble) + } + else { + throw new RuntimeException("No mapping found for " + v + " in " + labelMapping.toString()) + } + } + return null + } + else { + val serObj = new LogisticRegressionModelSerializableData(labelMapping) + return df.withColumn(labelColName, serObj.mapLabel_udf(df(labelColName))) + .withColumnRenamed(labelColName, "prediction") + } + } + + def doGLMPredict(isSingleNode:Boolean, df:DataFrame, X: MatrixBlock): MLOutput = { + val ml = new MLContext(sc) + val paramsMap: Map[String, 
String] = Map( "X" -> " ", - "B" -> " ") - ml.registerInput("X", Xin, mcXin); - ml.registerInput("B_full", mloutput.getBinaryBlockedRDD("B_out"), mloutput.getMatrixCharacteristics("B_out")); + "B" -> " ", + "dfam" -> "3") + if(isSingleNode) { + ml.registerInput("X", X); + ml.registerInput("B_full", mloutput.getMatrixBlock("B_out"), mloutput.getMatrixCharacteristics("B_out")); + } + else { + val mcXin = new MatrixCharacteristics() + val Xin = RDDConverterUtils.vectorDataFrameToBinaryBlock(df.rdd.sparkContext, df, mcXin, false, "features") + ml.registerInput("X", Xin, mcXin); + ml.registerInput("B_full", mloutput.getBinaryBlockedRDD("B_out"), mloutput.getMatrixCharacteristics("B_out")); + } ml.registerOutput("means"); ml.executeScript(ScriptsUtils.getDMLScript(LogisticRegressionModel.scriptPath), paramsMap) + } + + def computePredictedLabels(mlscoreoutput:MLOutput, isSingleNode:Boolean): MLOutput = { + val mlNew = new MLContext(sc) + if(isSingleNode) { + mlNew.registerInput("Prob", mlscoreoutput.getMatrixBlock("means"), mlscoreoutput.getMatrixCharacteristics("means")); + } + else { + mlNew.registerInput("Prob", mlscoreoutput.getBinaryBlockedRDD("means"), mlscoreoutput.getMatrixCharacteristics("means")); } + mlNew.registerOutput("Prediction") + mlNew.executeScript( + """ + Prob = read("temp1"); + Prediction = rowIndexMax(Prob); # assuming one-based label mapping + write(Prediction, "tempOut", "csv"); + """) + } + + def joinUsingID(df1:DataFrame, df2:DataFrame):DataFrame = { + val tempDF1 = df1.withColumnRenamed("ID", "ID1") + tempDF1.join(df2, tempDF1.col("ID1").equalTo(df2.col("ID"))).drop("ID1") + } + + override def transform(df: DataFrame): DataFrame = { + val ml = new MLContext(df.rdd.sparkContext) - val prob = mlscoreoutput.getDF(df.sqlContext, "means", true).withColumnRenamed("C1", "probability") - - val mlNew = new MLContext(df.rdd.sparkContext) - mlNew.registerInput("X", Xin, mcXin); - mlNew.registerInput("B_full", mloutput.getBinaryBlockedRDD("B_out"), mloutput.getMatrixCharacteristics("B_out")); - mlNew.registerInput("Prob", mlscoreoutput.getBinaryBlockedRDD("means"), mlscoreoutput.getMatrixCharacteristics("means")); - mlNew.registerOutput("Prediction"); - mlNew.registerOutput("rawPred"); - - val outNew = mlNew.executeScript("Prob = read(\"temp1\"); " - + "Prediction = rowIndexMax(Prob); " - + "write(Prediction, \"tempOut\", \"csv\")" - + "X = read(\"temp2\");" - + "B_full = read(\"temp3\");" - + "rawPred = 1 / (1 + exp(- X * t(B_full)) );" // Raw prediction logic: - + "write(rawPred, \"tempOut1\", \"csv\")"); - - val pred = outNew.getDF(df.sqlContext, "Prediction").withColumnRenamed("C1", "prediction").withColumnRenamed("ID", "ID1") - val rawPred = outNew.getDF(df.sqlContext, "rawPred", true).withColumnRenamed("C1", "rawPrediction").withColumnRenamed("ID", "ID2") - var predictionsNProb = prob.join(pred, prob.col("ID").equalTo(pred.col("ID1"))).select("ID", "probability", "prediction") - predictionsNProb = predictionsNProb.join(rawPred, predictionsNProb.col("ID").equalTo(rawPred.col("ID2"))).select("ID", "probability", "prediction", "rawPrediction") - val dataset1 = RDDConverterUtils.addIDToDataFrame(df, df.sqlContext, "ID") - dataset1.join(predictionsNProb, dataset1.col("ID").equalTo(predictionsNProb.col("ID"))) + val isSingleNode = false + val glmPredOut = doGLMPredict(isSingleNode, df, null) + val predLabelOut = computePredictedLabels(glmPredOut, isSingleNode) + val predictedDF = updateLabels(isSingleNode, predLabelOut.getDF(df.sqlContext, "Prediction"), null, "C1").select("ID", 
"prediction") + val prob = glmPredOut.getDF(df.sqlContext, "means", true).withColumnRenamed("C1", "probability").select("ID", "probability") + val dataset = RDDConverterUtils.addIDToDataFrame(df, df.sqlContext, "ID") + + if(outputRawPredictions) { + // Not supported: rawPred = 1 / (1 + exp(- X * t(B_full)) ); + } + return joinUsingID(dataset, joinUsingID(prob, predictedDF)) } } From c4ab1e6ebde04687e4b47cae64f45e7b4dc6511f Mon Sep 17 00:00:00 2001 From: Niketan Pansare Date: Fri, 5 Aug 2016 15:38:01 -0700 Subject: [PATCH 02/14] [SYSTEMML-234] [SYSTEMML-208] Updated the code as per Manoj's suggestion --- .../org/apache/sysml/api/python/SystemML.py | 21 +++++++------------ 1 file changed, 8 insertions(+), 13 deletions(-) diff --git a/src/main/java/org/apache/sysml/api/python/SystemML.py b/src/main/java/org/apache/sysml/api/python/SystemML.py index 3dfef67eb17..753527c8292 100644 --- a/src/main/java/org/apache/sysml/api/python/SystemML.py +++ b/src/main/java/org/apache/sysml/api/python/SystemML.py @@ -259,24 +259,19 @@ def getStringRDD(self, varName, format): # traceback.print_exc() def getNumCols(numPyArr): - if len(numPyArr.shape) == 1: + if numPyArr.ndim == 1: return 1 else: return numPyArr.shape[1] def convertToJavaMatrix(sc, src): - if isinstance(src, np.ndarray): - from array import array - numCols = getNumCols(src) - numRows = src.shape[0] - if src.dtype.type is np.float64: - arr = src.reshape(-1) - else: - arr = array('d', src.reshape(-1)) - buf = bytearray(arr.tostring()) - return sc._jvm.org.apache.sysml.runtime.instructions.spark.utils.RDDConverterUtilsExt.convertPy4JArrayToMB(buf, numRows, numCols) - else: - raise Exception('Type is not supported') + src = np.asarray(src) + numCols = getNumCols(src) + numRows = src.shape[0] + arr = src.ravel().astype(np.float64) + buf = bytearray(arr.tostring()) + return sc._jvm.org.apache.sysml.runtime.instructions.spark.utils.RDDConverterUtilsExt.convertPy4JArrayToMB(buf, numRows, numCols) + def convertToNumpyArr(sc, mb): numRows = mb.getNumRows() From 5f8c532742816e717d8951a8a8b298b8fb41dc31 Mon Sep 17 00:00:00 2001 From: Niketan Pansare Date: Fri, 5 Aug 2016 15:55:38 -0700 Subject: [PATCH 03/14] [SYSTEMML-234] [SYSTEMML-208] Minor code refactoring --- .../org/apache/sysml/api/python/SystemML.py | 60 ++++++++++--------- 1 file changed, 32 insertions(+), 28 deletions(-) diff --git a/src/main/java/org/apache/sysml/api/python/SystemML.py b/src/main/java/org/apache/sysml/api/python/SystemML.py index 753527c8292..bba4db8dfad 100644 --- a/src/main/java/org/apache/sysml/api/python/SystemML.py +++ b/src/main/java/org/apache/sysml/api/python/SystemML.py @@ -23,6 +23,7 @@ from py4j.protocol import Py4JJavaError, Py4JError import traceback import os +from pyspark.context import SparkContext from pyspark.sql import DataFrame, SQLContext from pyspark.rdd import RDD import numpy as np @@ -259,25 +260,31 @@ def getStringRDD(self, varName, format): # traceback.print_exc() def getNumCols(numPyArr): - if numPyArr.ndim == 1: - return 1 - else: - return numPyArr.shape[1] + if numPyArr.ndim == 1: + return 1 + else: + return numPyArr.shape[1] -def convertToJavaMatrix(sc, src): - src = np.asarray(src) - numCols = getNumCols(src) - numRows = src.shape[0] - arr = src.ravel().astype(np.float64) - buf = bytearray(arr.tostring()) - return sc._jvm.org.apache.sysml.runtime.instructions.spark.utils.RDDConverterUtilsExt.convertPy4JArrayToMB(buf, numRows, numCols) - +def convertToMatrixBlock(sc, src): + if isinstance(sc, SparkContext): + src = np.asarray(src) + numCols = 
getNumCols(src) + numRows = src.shape[0] + arr = src.ravel().astype(np.float64) + buf = bytearray(arr.tostring()) + return sc._jvm.org.apache.sysml.runtime.instructions.spark.utils.RDDConverterUtilsExt.convertPy4JArrayToMB(buf, numRows, numCols) + else: + raise TypeError('sc needs to be of type SparkContext') # TODO: We can generalize this by creating py4j gateway ourselves + def convertToNumpyArr(sc, mb): - numRows = mb.getNumRows() - numCols = mb.getNumColumns() - buf = sc._jvm.org.apache.sysml.runtime.instructions.spark.utils.RDDConverterUtilsExt.convertMBtoPy4JDenseArr(mb) - return np.frombuffer(buf, count=numRows*numCols, dtype=np.float64) + if isinstance(sc, SparkContext): + numRows = mb.getNumRows() + numCols = mb.getNumColumns() + buf = sc._jvm.org.apache.sysml.runtime.instructions.spark.utils.RDDConverterUtilsExt.convertMBtoPy4JDenseArr(mb) + return np.frombuffer(buf, count=numRows*numCols, dtype=np.float64) + else: + raise TypeError('sc needs to be of type SparkContext') # TODO: We can generalize this by creating py4j gateway ourselves class mllearn: # Or we can create new Python project with package structure @@ -290,15 +297,12 @@ def __init__(self, sqlCtx, penalty='l2', fit_intercept=True, max_iter=100, max_i self.transferUsingDF = transferUsingDF if penalty != 'l2': raise Exception('Only l2 penalty is supported') - if fit_intercept: - self.icpt = 1 - else: - self.icpt = 0 + self.icpt = int(fit_intercept) self.max_iter = max_iter self.max_inner_iter = max_inner_iter self.tol = tol - if C == 0: - raise Exception('C cannot be 0') + if C < 0: + raise Exception('C has to be positive') reg = 1/C self.reg = reg self.updateLog() @@ -312,7 +316,7 @@ def updateLog(self): self.log.setTol(self.tol) self.log.setIcpt(self.icpt) - def convertToPDF(self, X): + def convertToPandasDF(self, X): if isinstance(X, np.ndarray): colNames = [] numCols = getNumCols(X) @@ -354,8 +358,8 @@ def fit(self, X, *args, **kwargs): elif numArgs == 2 and (isinstance(X, np.ndarray) or isinstance(X, pd.core.frame.DataFrame)): y = args[0] if self.transferUsingDF: - pdfX = self.convertToPDF(X) - pdfY = self.convertToPDF(y) + pdfX = self.convertToPandasDF(X) + pdfY = self.convertToPandasDF(y) if getNumCols(pdfY) != 1: raise Exception('y should be a column vector') if pdfX.shape[0] != pdfY.shape[0]: @@ -368,7 +372,7 @@ def fit(self, X, *args, **kwargs): numColsy = getNumCols(y) if numColsy != 1: raise Exception('Expected y to be a column vector') - self.model = self.log.fit(convertToJavaMatrix(self.sc, X), convertToJavaMatrix(self.sc, y)) + self.model = self.log.fit(convertToMatrixBlock(self.sc, X), convertToMatrixBlock(self.sc, y)) self.model.setOutputRawPredictions(False) return self else: @@ -380,7 +384,7 @@ def transform(self, X): def predict(self, X): if isinstance(X, np.ndarray) or isinstance(X, pd.core.frame.DataFrame): if self.transferUsingDF: - pdfX = self.convertToPDF(X) + pdfX = self.convertToPandasDF(X) df = self.assemble(pdfX, pdfX.columns, 'features').select('features') retjDF = self.model.transform(df._jdf) retDF = DataFrame(retjDF, self.sqlCtx) @@ -390,7 +394,7 @@ def predict(self, X): else: return retPDF else: - retNumPy = convertToNumpyArr(self.sc, self.model.transform(convertToJavaMatrix(self.sc, X))) + retNumPy = convertToNumpyArr(self.sc, self.model.transform(convertToMatrixBlock(self.sc, X))) if isinstance(X, np.ndarray): return retNumPy else: From f223a0aadaffc2f78fd45444950025dbef1d0e38 Mon Sep 17 00:00:00 2001 From: Niketan Pansare Date: Fri, 5 Aug 2016 15:57:34 -0700 Subject: [PATCH 04/14] 
[SYSTEMML-234] [SYSTEMML-208] Taking care of division issue --- src/main/java/org/apache/sysml/api/python/SystemML.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/main/java/org/apache/sysml/api/python/SystemML.py b/src/main/java/org/apache/sysml/api/python/SystemML.py index bba4db8dfad..3f3f453afd4 100644 --- a/src/main/java/org/apache/sysml/api/python/SystemML.py +++ b/src/main/java/org/apache/sysml/api/python/SystemML.py @@ -33,6 +33,7 @@ from pyspark.mllib.linalg import Vectors import sys from pyspark.ml import Estimator, Model +from __future__ import division class MLContext(object): @@ -303,7 +304,7 @@ def __init__(self, sqlCtx, penalty='l2', fit_intercept=True, max_iter=100, max_i self.tol = tol if C < 0: raise Exception('C has to be positive') - reg = 1/C + reg = 1.0 / C self.reg = reg self.updateLog() if solver != 'newton-cg': From e7371aa0d617a25e94829ddfcc2e5dc11330bce4 Mon Sep 17 00:00:00 2001 From: Niketan Pansare Date: Fri, 5 Aug 2016 16:15:17 -0700 Subject: [PATCH 05/14] [SYSTEMML-234] [SYSTEMML-208] Avoid divide by zero --- src/main/java/org/apache/sysml/api/python/SystemML.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/main/java/org/apache/sysml/api/python/SystemML.py b/src/main/java/org/apache/sysml/api/python/SystemML.py index 3f3f453afd4..fca3f1e8cc4 100644 --- a/src/main/java/org/apache/sysml/api/python/SystemML.py +++ b/src/main/java/org/apache/sysml/api/python/SystemML.py @@ -302,7 +302,7 @@ def __init__(self, sqlCtx, penalty='l2', fit_intercept=True, max_iter=100, max_i self.max_iter = max_iter self.max_inner_iter = max_inner_iter self.tol = tol - if C < 0: + if C <= 0: raise Exception('C has to be positive') reg = 1.0 / C self.reg = reg From 9fff023802701cbf45b57bf2349a319bc68d0280 Mon Sep 17 00:00:00 2001 From: Niketan Pansare Date: Fri, 5 Aug 2016 17:01:12 -0700 Subject: [PATCH 06/14] [SYSTEMML-234] [SYSTEMML-208] Refactored convertToPandas --- .../org/apache/sysml/api/python/SystemML.py | 26 ++++++------------- 1 file changed, 8 insertions(+), 18 deletions(-) diff --git a/src/main/java/org/apache/sysml/api/python/SystemML.py b/src/main/java/org/apache/sysml/api/python/SystemML.py index fca3f1e8cc4..d2051aac8d7 100644 --- a/src/main/java/org/apache/sysml/api/python/SystemML.py +++ b/src/main/java/org/apache/sysml/api/python/SystemML.py @@ -20,6 +20,7 @@ # #------------------------------------------------------------- +from __future__ import division from py4j.protocol import Py4JJavaError, Py4JError import traceback import os @@ -33,7 +34,7 @@ from pyspark.mllib.linalg import Vectors import sys from pyspark.ml import Estimator, Model -from __future__ import division + class MLContext(object): @@ -318,17 +319,9 @@ def updateLog(self): self.log.setIcpt(self.icpt) def convertToPandasDF(self, X): - if isinstance(X, np.ndarray): - colNames = [] - numCols = getNumCols(X) - for i in range(0, numCols): - colNames = colNames + [ str('C' + str(i))] - pdfX = pd.DataFrame(X, columns=colNames) - elif isinstance(X, pd.core.frame.DataFrame): - pdfX = X - else: - raise Exception('The input type not supported') - return pdfX + if not instance(X, pd.DataFrame): + return pd.DataFrame(X, columns=['C' + str(i) for i in range(numCols)]) + return X def tolist(self, inputCols): if isinstance(inputCols, pd.indexes.base.Index): @@ -350,14 +343,11 @@ def _fit(self, X): else: raise Exception('Incorrect usage: Expected dataframe as input with features/label as columns') - # TOOD: Ignoring kwargs - def fit(self, X, *args, **kwargs): + def 
fit(self, X, y=None, params=None): self.updateLog() - numArgs = len(args) + 1 - if numArgs == 1: + if y is None: return self._fit(X) - elif numArgs == 2 and (isinstance(X, np.ndarray) or isinstance(X, pd.core.frame.DataFrame)): - y = args[0] + elif y is not None and (isinstance(X, np.ndarray) or isinstance(X, pd.core.frame.DataFrame)): if self.transferUsingDF: pdfX = self.convertToPandasDF(X) pdfY = self.convertToPandasDF(y) From 41f1668b0280b23cf4b0d5dc5e8c92d9922ea9c3 Mon Sep 17 00:00:00 2001 From: Niketan Pansare Date: Sat, 6 Aug 2016 18:33:23 -0700 Subject: [PATCH 07/14] [SYSTEMML-234] [SYSTEMML-208] Fixed bugs in MLContext and added LinearRegression Only scikit learn way of usage tested --- .../java/org/apache/sysml/api/MLContext.java | 96 +++++------- .../org/apache/sysml/api/python/SystemML.py | 132 +++++++++------- .../sysml/api/ml/LinearRegression.scala | 146 ++++++++++++++++++ .../sysml/api/ml/LogisticRegression.scala | 88 +++-------- .../apache/sysml/api/ml/PredictionUtils.scala | 69 +++++++++ 5 files changed, 351 insertions(+), 180 deletions(-) create mode 100644 src/main/scala/org/apache/sysml/api/ml/LinearRegression.scala create mode 100644 src/main/scala/org/apache/sysml/api/ml/PredictionUtils.scala diff --git a/src/main/java/org/apache/sysml/api/MLContext.java b/src/main/java/org/apache/sysml/api/MLContext.java index 32b0ce949e5..32dc544f8e2 100644 --- a/src/main/java/org/apache/sysml/api/MLContext.java +++ b/src/main/java/org/apache/sysml/api/MLContext.java @@ -1241,74 +1241,56 @@ private MLOutput compileAndExecuteScript(String dmlScriptFilePath, String [] arg * @throws ParseException */ private synchronized MLOutput compileAndExecuteScript(String dmlScriptFilePath, String [] args, boolean isFile, boolean isNamedArgument, boolean isPyDML, String configFilePath) throws IOException, DMLException { - try { - if(getActiveMLContext() != null) { - throw new DMLRuntimeException("SystemML (and hence by definition MLContext) doesnot support parallel execute() calls from same or different MLContexts. " - + "As a temporary fix, please do explicit synchronization, i.e. synchronized(MLContext.class) { ml.execute(...) } "); + // Set active MLContext. + _activeMLContext = this; + + if(_monitorUtils != null) { + _monitorUtils.resetMonitoringData(); + } + + if(DMLScript.rtplatform == RUNTIME_PLATFORM.SPARK || DMLScript.rtplatform == RUNTIME_PLATFORM.HYBRID_SPARK) { + + // Depending on whether registerInput/registerOutput was called initialize the variables + String[] inputs; String[] outputs; + if(_inVarnames != null) { + inputs = _inVarnames.toArray(new String[0]); + } + else { + inputs = new String[0]; + } + if(_outVarnames != null) { + outputs = _outVarnames.toArray(new String[0]); + } + else { + outputs = new String[0]; } + Map outMetadata = new HashMap(); - // Set active MLContext. 
- _activeMLContext = this; + Map argVals = DMLScript.createArgumentsMap(isNamedArgument, args); - if(_monitorUtils != null) { - _monitorUtils.resetMonitoringData(); - } + // Run the DML script + ExecutionContext ec = executeUsingSimplifiedCompilationChain(dmlScriptFilePath, isFile, argVals, isPyDML, inputs, outputs, _variables, configFilePath); - if(DMLScript.rtplatform == RUNTIME_PLATFORM.SPARK || DMLScript.rtplatform == RUNTIME_PLATFORM.HYBRID_SPARK) { - - // Depending on whether registerInput/registerOutput was called initialize the variables - String[] inputs; String[] outputs; - if(_inVarnames != null) { - inputs = _inVarnames.toArray(new String[0]); - } - else { - inputs = new String[0]; + // Now collect the output + if(_outVarnames != null) { + if(_variables == null) { + throw new DMLRuntimeException("The symbol table returned after executing the script is empty"); } - if(_outVarnames != null) { - outputs = _outVarnames.toArray(new String[0]); - } - else { - outputs = new String[0]; - } - Map outMetadata = new HashMap(); - - Map argVals = DMLScript.createArgumentsMap(isNamedArgument, args); - - // Run the DML script - ExecutionContext ec = executeUsingSimplifiedCompilationChain(dmlScriptFilePath, isFile, argVals, isPyDML, inputs, outputs, _variables, configFilePath); - // Now collect the output - if(_outVarnames != null) { - if(_variables == null) { - throw new DMLRuntimeException("The symbol table returned after executing the script is empty"); + for( String ovar : _outVarnames ) { + if( _variables.keySet().contains(ovar) ) { + outMetadata.put(ovar, ec.getMatrixCharacteristics(ovar)); // For converting output to dataframe } - - for( String ovar : _outVarnames ) { - if( _variables.keySet().contains(ovar) ) { - outMetadata.put(ovar, ec.getMatrixCharacteristics(ovar)); // For converting output to dataframe - } - else { - throw new DMLException("Error: The variable " + ovar + " is not available as output after the execution of the DMLScript."); - } + else { + throw new DMLException("Error: The variable " + ovar + " is not available as output after the execution of the DMLScript."); } } - - return new MLOutput(_variables, ec, outMetadata); } - else { - throw new DMLRuntimeException("Unsupported runtime:" + DMLScript.rtplatform.name()); - } - - } - finally { - // Remove global dml config and all thread-local configs - // TODO enable cleanup whenever invalid GNMF MLcontext is fixed - // (the test is invalid because it assumes that status of previous execute is kept) - //ConfigurationManager.setGlobalConfig(new DMLConfig()); - //ConfigurationManager.clearLocalConfigs(); - // Reset active MLContext. 
- _activeMLContext = null; + return new MLOutput(_variables, ec, outMetadata); + } + else { + throw new DMLRuntimeException("Unsupported runtime:" + DMLScript.rtplatform.name()); } } diff --git a/src/main/java/org/apache/sysml/api/python/SystemML.py b/src/main/java/org/apache/sysml/api/python/SystemML.py index d2051aac8d7..85cc4138285 100644 --- a/src/main/java/org/apache/sysml/api/python/SystemML.py +++ b/src/main/java/org/apache/sysml/api/python/SystemML.py @@ -288,83 +288,55 @@ def convertToNumpyArr(sc, mb): else: raise TypeError('sc needs to be of type SparkContext') # TODO: We can generalize this by creating py4j gateway ourselves -class mllearn: - # Or we can create new Python project with package structure - class LogisticRegression(Estimator): - - def __init__(self, sqlCtx, penalty='l2', fit_intercept=True, max_iter=100, max_inner_iter=0, tol=0.000001, C=1.0, solver='newton-cg', transferUsingDF=False): - self.sqlCtx = sqlCtx - self.sc = sqlCtx._sc - self.log = self.sc._jvm.org.apache.sysml.api.ml.LogisticRegression("lr", self.sc._jsc.sc()) - self.transferUsingDF = transferUsingDF - if penalty != 'l2': - raise Exception('Only l2 penalty is supported') - self.icpt = int(fit_intercept) - self.max_iter = max_iter - self.max_inner_iter = max_inner_iter - self.tol = tol - if C <= 0: - raise Exception('C has to be positive') - reg = 1.0 / C - self.reg = reg - self.updateLog() - if solver != 'newton-cg': - raise Exception('Only newton-cg solver supported') - - def updateLog(self): - self.log.setMaxOuterIter(self.max_iter) - self.log.setMaxInnerIter(self.max_inner_iter) - self.log.setRegParam(self.reg) - self.log.setTol(self.tol) - self.log.setIcpt(self.icpt) - - def convertToPandasDF(self, X): - if not instance(X, pd.DataFrame): - return pd.DataFrame(X, columns=['C' + str(i) for i in range(numCols)]) - return X - - def tolist(self, inputCols): - if isinstance(inputCols, pd.indexes.base.Index): - return inputCols.get_values().tolist() - elif isinstance(inputCols, list): - return inputCols - else: - raise Exception('inputCols should be of type pandas.indexes.base.Index or list') - - def assemble(self, pdf, inputCols, outputCol): - tmpDF = self.sqlCtx.createDataFrame(pdf, self.tolist(pdf.columns)) - assembler = VectorAssembler(inputCols=self.tolist(inputCols), outputCol=outputCol) - return assembler.transform(tmpDF) +def convertToPandasDF(X): + if not isinstance(X, pd.DataFrame): + return pd.DataFrame(X, columns=['C' + str(i) for i in range(getNumCols(X))]) + return X +def tolist(inputCols): + if isinstance(inputCols, pd.indexes.base.Index): + return inputCols.get_values().tolist() + elif isinstance(inputCols, list): + return inputCols + else: + raise Exception('inputCols should be of type pandas.indexes.base.Index or list') + +def assemble(sqlCtx, pdf, inputCols, outputCol): + tmpDF = sqlCtx.createDataFrame(pdf, tolist(pdf.columns)) + assembler = VectorAssembler(inputCols=tolist(inputCols), outputCol=outputCol) + return assembler.transform(tmpDF) + +class mllearn: + class BaseSystemMLEstimator(Estimator): def _fit(self, X): if hasattr(X, '_jdf') and 'features' in X.columns and 'label' in X.columns: - self.model = self.log.fit(X._jdf) + self.model = self.estimator.fit(X._jdf) return self else: raise Exception('Incorrect usage: Expected dataframe as input with features/label as columns') def fit(self, X, y=None, params=None): - self.updateLog() if y is None: return self._fit(X) elif y is not None and (isinstance(X, np.ndarray) or isinstance(X, pd.core.frame.DataFrame)): if 
self.transferUsingDF: - pdfX = self.convertToPandasDF(X) - pdfY = self.convertToPandasDF(y) + pdfX = convertToPandasDF(X) + pdfY = convertToPandasDF(y) if getNumCols(pdfY) != 1: raise Exception('y should be a column vector') if pdfX.shape[0] != pdfY.shape[0]: raise Exception('Number of rows of X and y should match') colNames = pdfX.columns pdfX['label'] = pdfY[pdfY.columns[0]] - df = self.assemble(pdfX, colNames, 'features').select('features', 'label') - self.model = self.log.fit(df._jdf) + df = assemble(self.sqlCtx, pdfX, colNames, 'features').select('features', 'label') + self.model = self.estimator.fit(df._jdf) else: numColsy = getNumCols(y) if numColsy != 1: raise Exception('Expected y to be a column vector') - self.model = self.log.fit(convertToMatrixBlock(self.sc, X), convertToMatrixBlock(self.sc, y)) - self.model.setOutputRawPredictions(False) + self.model = self.estimator.fit(convertToMatrixBlock(self.sc, X), convertToMatrixBlock(self.sc, y)) + if self.setOutputRawPredictionsToFalse: + self.model.setOutputRawPredictions(False) return self else: raise Exception('Unsupported input type') @@ -375,8 +347,8 @@ def transform(self, X): def predict(self, X): if isinstance(X, np.ndarray) or isinstance(X, pd.core.frame.DataFrame): if self.transferUsingDF: - pdfX = self.convertToPandasDF(X) - df = self.assemble(pdfX, pdfX.columns, 'features').select('features') + pdfX = convertToPandasDF(X) + df = assemble(self.sqlCtx, pdfX, pdfX.columns, 'features').select('features') retjDF = self.model.transform(df._jdf) retDF = DataFrame(retjDF, self.sqlCtx) retPDF = retDF.sort('ID').select('prediction').toPandas() @@ -405,5 +377,49 @@ def predict(self, X): raise Exception('Unsupported input type') def score(self, X, y): - return sk.metrics.accuracy_score(y, self.predict(X)) + return sk.metrics.accuracy_score(y, self.predict(X)) + + # Or we can create new Python project with package structure + class LogisticRegression(BaseSystemMLEstimator): + + def __init__(self, sqlCtx, penalty='l2', fit_intercept=True, max_iter=100, max_inner_iter=0, tol=0.000001, C=1.0, solver='newton-cg', transferUsingDF=False): + self.sqlCtx = sqlCtx + self.sc = sqlCtx._sc + self.uid = "logReg" + self.estimator = self.sc._jvm.org.apache.sysml.api.ml.LogisticRegression(self.uid, self.sc._jsc.sc()) + self.estimator.setMaxOuterIter(max_iter) + self.estimator.setMaxInnerIter(max_inner_iter) + if C <= 0: + raise Exception('C has to be positive') + reg = 1.0 / C + self.estimator.setRegParam(reg) + self.estimator.setTol(tol) + self.estimator.setIcpt(int(fit_intercept)) + self.transferUsingDF = transferUsingDF + self.setOutputRawPredictionsToFalse = True + if penalty != 'l2': + raise Exception('Only l2 penalty is supported') + if solver != 'newton-cg': + raise Exception('Only newton-cg solver supported') + + class LinearRegression(BaseSystemMLEstimator): + + def __init__(self, sqlCtx, fit_intercept=True, max_iter=100, tol=0.000001, C=1.0, solver='newton-cg', transferUsingDF=False): + self.sqlCtx = sqlCtx + self.sc = sqlCtx._sc + self.uid = "lr" + if solver == 'newton-cg' or solver == 'direct-solve': + self.estimator = self.sc._jvm.org.apache.sysml.api.ml.LinearRegression(self.uid, self.sc._jsc.sc(), solver) + else: + raise Exception('Only newton-cg solver supported') + self.estimator.setMaxIter(max_iter) + if C <= 0: + raise Exception('C has to be positive') + reg = 1.0 / C + self.estimator.setRegParam(reg) + self.estimator.setTol(tol) + self.estimator.setIcpt(int(fit_intercept)) + self.transferUsingDF = transferUsingDF + 
self.setOutputRawPredictionsToFalse = False + \ No newline at end of file diff --git a/src/main/scala/org/apache/sysml/api/ml/LinearRegression.scala b/src/main/scala/org/apache/sysml/api/ml/LinearRegression.scala new file mode 100644 index 00000000000..28d3dcceaab --- /dev/null +++ b/src/main/scala/org/apache/sysml/api/ml/LinearRegression.scala @@ -0,0 +1,146 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.sysml.api.ml + +import java.io.File +import org.apache.spark.SparkContext +import org.apache.spark.ml.{ Model, Estimator } +import org.apache.spark.sql.DataFrame +import org.apache.spark.sql.types.StructType +import org.apache.spark.ml.param.ParamMap +import org.apache.sysml.api.{ MLContext, MLOutput } +import org.apache.sysml.runtime.matrix.MatrixCharacteristics +import org.apache.sysml.runtime.matrix.data.MatrixBlock +import org.apache.sysml.runtime.DMLRuntimeException +import org.apache.sysml.runtime.instructions.spark.utils.{ RDDConverterUtilsExt => RDDConverterUtils } + +object LinearRegression { + final val scriptPathCG = "scripts" + File.separator + "algorithms" + File.separator + "LinearRegCG.dml" + final val scriptPathDS = "scripts" + File.separator + "algorithms" + File.separator + "LinearRegDS.dml" +} + +// algorithm = "direct-solve", "conjugate-gradient" +class LinearRegression(override val uid: String, val sc: SparkContext, val solver:String="direct-solve") extends Estimator[LinearRegressionModel] with HasIcpt + with HasRegParam with HasTol with HasMaxOuterIter { + + def setIcpt(value: Int) = set(icpt, value) + def setMaxIter(value: Int) = set(maxOuterIter, value) + def setRegParam(value: Double) = set(regParam, value) + def setTol(value: Double) = set(tol, value) + + override def copy(extra: ParamMap): Estimator[LinearRegressionModel] = { + val that = new LinearRegression(uid, sc, solver) + copyValues(that, extra) + } + def transformSchema(schema: StructType): StructType = schema + + // Note: will update the y_mb as this will be called by Python mllearn + def fit(X_mb: MatrixBlock, y_mb: MatrixBlock): LinearRegressionModel = { + val ml = new MLContext(sc) + if(y_mb.getNumColumns != 1) { + throw new RuntimeException("Expected a column vector for y") + } + val mloutput = { + ml.registerInput("X", X_mb); + ml.registerInput("y", y_mb); + ml.registerOutput("beta_out"); + if(solver.compareTo("direct-solve") == 0) + ml.executeScript(ScriptsUtils.getDMLScript(LinearRegression.scriptPathDS), getParamMap()) + else if(solver.compareTo("newton-cg") == 0) { + ml.executeScript(ScriptsUtils.getDMLScript(LinearRegression.scriptPathCG), getParamMap()) + } + else { + throw new DMLRuntimeException("The algorithm should be direct-solve or conjugate-gradient") + } + } + new 
LinearRegressionModel("linearRegression")(mloutput, sc) + } + + def getParamMap(): Map[String, String] = { + Map( "icpt" -> this.getIcpt.toString(), + "reg" -> this.getRegParam.toString(), + "tol" -> this.getTol.toString, + "maxi" -> this.getMaxOuterIte.toString, + + "X" -> " ", + "Y" -> " ", + "B" -> " ", + "O" -> " ", + "Log" -> " ", + "fmt" -> "binary") + } + + def fit(df: DataFrame): LinearRegressionModel = { + val ml = new MLContext(df.rdd.sparkContext) + val mcXin = new MatrixCharacteristics() + val Xin = RDDConverterUtils.vectorDataFrameToBinaryBlock(sc, df, mcXin, false, "features") + val yin = df.select("label") + val mloutput = { + ml.registerInput("X", Xin, mcXin); + ml.registerInput("y", yin); + ml.registerOutput("beta_out"); + if(solver.compareTo("direct-solve") == 0) + ml.executeScript(ScriptsUtils.getDMLScript(LinearRegression.scriptPathDS), getParamMap()) + else if(solver.compareTo("newton-cg") == 0) { + ml.executeScript(ScriptsUtils.getDMLScript(LinearRegression.scriptPathCG), getParamMap()) + } + else { + throw new DMLRuntimeException("The algorithm should be direct-solve or conjugate-gradient") + } + } + new LinearRegressionModel("linearRegression")(mloutput, sc) + } +} + +class LinearRegressionModel(override val uid: String)(val mloutput: MLOutput, val sc: SparkContext) extends Model[LinearRegressionModel] with HasIcpt + with HasRegParam with HasTol with HasMaxOuterIter { + override def copy(extra: ParamMap): LinearRegressionModel = { + val that = new LinearRegressionModel(uid)(mloutput, sc) + copyValues(that, extra) + } + + override def transformSchema(schema: StructType): StructType = schema + + def transform(df: DataFrame): DataFrame = { + val isSingleNode = false + val glmPredOut = PredictionUtils.doGLMPredict(isSingleNode, df, null, sc, mloutput, "beta_out", getPredictParams()) + val predictedDF = glmPredOut.getDF(df.sqlContext, "means").withColumnRenamed("C1", "prediction") + val dataset = RDDConverterUtils.addIDToDataFrame(df, df.sqlContext, "ID") + return PredictionUtils.joinUsingID(dataset, predictedDF) + } + + def transform(X: MatrixBlock): MatrixBlock = { + val isSingleNode = true + return PredictionUtils.doGLMPredict(isSingleNode, null, X, sc, mloutput, "beta_out", getPredictParams()).getMatrixBlock("means") + } + + + def getPredictParams(): Map[String, String] = { + Map("X" -> " ", + "B" -> " ", + // Gaussian distribution + "dfam" -> "1", "vpow" -> "0.0", + // identity link function + "link" -> "1", "lpow" -> "1.0" +// // Dispersion value: TODO +// ,"disp" -> "5.0" + ) + } +} \ No newline at end of file diff --git a/src/main/scala/org/apache/sysml/api/ml/LogisticRegression.scala b/src/main/scala/org/apache/sysml/api/ml/LogisticRegression.scala index c679ff6d212..69e4126ac7c 100644 --- a/src/main/scala/org/apache/sysml/api/ml/LogisticRegression.scala +++ b/src/main/scala/org/apache/sysml/api/ml/LogisticRegression.scala @@ -112,7 +112,16 @@ class LogisticRegression(override val uid: String, val sc: SparkContext) extends } val mloutput = { - val paramsMap: Map[String, String] = Map( + ml.registerInput("X", X_mb); + ml.registerInput("Y_vec", y_mb); + ml.registerOutput("B_out"); + ml.executeScript(ScriptsUtils.getDMLScript(LogisticRegression.scriptPath), getParamMap()) + } + new LogisticRegressionModel("logisticRegression")(mloutput, revLabelMapping, sc) + } + + def getParamMap():Map[String, String] = { + Map( "icpt" -> this.getIcpt.toString(), "reg" -> this.getRegParam.toString(), "tol" -> this.getTol.toString, @@ -122,12 +131,6 @@ class 
LogisticRegression(override val uid: String, val sc: SparkContext) extends "X" -> " ", "Y" -> " ", "B" -> " ") - ml.registerInput("X", X_mb); - ml.registerInput("Y_vec", y_mb); - ml.registerOutput("B_out"); - ml.executeScript(ScriptsUtils.getDMLScript(LogisticRegression.scriptPath), paramsMap) - } - new LogisticRegressionModel("logisticRegression")(mloutput, revLabelMapping, sc) } override def fit(df: DataFrame): LogisticRegressionModel = { val ml = new MLContext(df.rdd.sparkContext) @@ -142,20 +145,10 @@ class LogisticRegression(override val uid: String, val sc: SparkContext) extends } val yin = df.select("label").rdd.map( x => labelMapping.get(x.apply(0).toString).toString ) val mloutput = { - val paramsMap: Map[String, String] = Map( - "icpt" -> this.getIcpt.toString(), - "reg" -> this.getRegParam.toString(), - "tol" -> this.getTol.toString, - "moi" -> this.getMaxOuterIte.toString, - "mii" -> this.getMaxInnerIter.toString, - - "X" -> " ", - "Y" -> " ", - "B" -> " ") ml.registerInput("X", Xin, mcXin); ml.registerInput("Y_vec", yin, "csv"); ml.registerOutput("B_out"); - ml.executeScript(ScriptsUtils.getDMLScript(LogisticRegression.scriptPath), paramsMap) + ml.executeScript(ScriptsUtils.getDMLScript(LogisticRegression.scriptPath), getParamMap()) } new LogisticRegressionModel("logisticRegression")(mloutput, revLabelMapping, sc) } @@ -211,7 +204,9 @@ class LogisticRegressionModel( } else { val isSingleNode = true - val ret = computePredictedLabels(doGLMPredict(isSingleNode, null, X), isSingleNode).getMatrixBlock("Prediction"); + val ret = PredictionUtils.computePredictedClassLabelsFromProbability( + PredictionUtils.doGLMPredict(isSingleNode, null, X, sc, mloutput, "B_out", getPredictParams), + isSingleNode, sc).getMatrixBlock("Prediction"); if(ret.getNumColumns != 1) { throw new RuntimeException("Expected predicted label to be a column vector") } @@ -245,54 +240,11 @@ class LogisticRegressionModel( } } - def doGLMPredict(isSingleNode:Boolean, df:DataFrame, X: MatrixBlock): MLOutput = { - val ml = new MLContext(sc) - val paramsMap: Map[String, String] = Map( - "X" -> " ", - "B" -> " ", - "dfam" -> "3") - if(isSingleNode) { - ml.registerInput("X", X); - ml.registerInput("B_full", mloutput.getMatrixBlock("B_out"), mloutput.getMatrixCharacteristics("B_out")); - } - else { - val mcXin = new MatrixCharacteristics() - val Xin = RDDConverterUtils.vectorDataFrameToBinaryBlock(df.rdd.sparkContext, df, mcXin, false, "features") - ml.registerInput("X", Xin, mcXin); - ml.registerInput("B_full", mloutput.getBinaryBlockedRDD("B_out"), mloutput.getMatrixCharacteristics("B_out")); - } - ml.registerOutput("means"); - ml.executeScript(ScriptsUtils.getDMLScript(LogisticRegressionModel.scriptPath), paramsMap) - } - - def computePredictedLabels(mlscoreoutput:MLOutput, isSingleNode:Boolean): MLOutput = { - val mlNew = new MLContext(sc) - if(isSingleNode) { - mlNew.registerInput("Prob", mlscoreoutput.getMatrixBlock("means"), mlscoreoutput.getMatrixCharacteristics("means")); - } - else { - mlNew.registerInput("Prob", mlscoreoutput.getBinaryBlockedRDD("means"), mlscoreoutput.getMatrixCharacteristics("means")); - } - mlNew.registerOutput("Prediction") - mlNew.executeScript( - """ - Prob = read("temp1"); - Prediction = rowIndexMax(Prob); # assuming one-based label mapping - write(Prediction, "tempOut", "csv"); - """) - } - - def joinUsingID(df1:DataFrame, df2:DataFrame):DataFrame = { - val tempDF1 = df1.withColumnRenamed("ID", "ID1") - tempDF1.join(df2, tempDF1.col("ID1").equalTo(df2.col("ID"))).drop("ID1") - } 
override def transform(df: DataFrame): DataFrame = { - val ml = new MLContext(df.rdd.sparkContext) - val isSingleNode = false - val glmPredOut = doGLMPredict(isSingleNode, df, null) - val predLabelOut = computePredictedLabels(glmPredOut, isSingleNode) + val glmPredOut = PredictionUtils.doGLMPredict(isSingleNode, df, null, sc, mloutput, "B_out", getPredictParams()) + val predLabelOut = PredictionUtils.computePredictedClassLabelsFromProbability(glmPredOut, isSingleNode, sc) val predictedDF = updateLabels(isSingleNode, predLabelOut.getDF(df.sqlContext, "Prediction"), null, "C1").select("ID", "prediction") val prob = glmPredOut.getDF(df.sqlContext, "means", true).withColumnRenamed("C1", "probability").select("ID", "probability") val dataset = RDDConverterUtils.addIDToDataFrame(df, df.sqlContext, "ID") @@ -300,7 +252,13 @@ class LogisticRegressionModel( if(outputRawPredictions) { // Not supported: rawPred = 1 / (1 + exp(- X * t(B_full)) ); } - return joinUsingID(dataset, joinUsingID(prob, predictedDF)) + return PredictionUtils.joinUsingID(dataset, PredictionUtils.joinUsingID(prob, predictedDF)) + } + + def getPredictParams(): Map[String, String] = { + Map("X" -> " ", + "B" -> " ", + "dfam" -> "3") } } diff --git a/src/main/scala/org/apache/sysml/api/ml/PredictionUtils.scala b/src/main/scala/org/apache/sysml/api/ml/PredictionUtils.scala new file mode 100644 index 00000000000..60c5b8c0c2b --- /dev/null +++ b/src/main/scala/org/apache/sysml/api/ml/PredictionUtils.scala @@ -0,0 +1,69 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.sysml.api.ml + +import org.apache.sysml.api.{ MLContext, MLOutput } +import org.apache.spark.sql.DataFrame +import org.apache.spark.SparkContext +import org.apache.sysml.runtime.matrix.data.MatrixBlock +import org.apache.sysml.runtime.DMLRuntimeException +import org.apache.sysml.runtime.matrix.MatrixCharacteristics +import org.apache.sysml.runtime.instructions.spark.utils.{ RDDConverterUtilsExt => RDDConverterUtils } + +object PredictionUtils { + + def doGLMPredict(isSingleNode:Boolean, df:DataFrame, X: MatrixBlock, sc:SparkContext, mloutput:MLOutput, B:String, paramsMap: Map[String, String]): MLOutput = { + val ml = new MLContext(sc) + if(isSingleNode) { + ml.registerInput("X", X); + ml.registerInput("B_full", mloutput.getMatrixBlock(B), mloutput.getMatrixCharacteristics(B)); + } + else { + val mcXin = new MatrixCharacteristics() + val Xin = RDDConverterUtils.vectorDataFrameToBinaryBlock(df.rdd.sparkContext, df, mcXin, false, "features") + ml.registerInput("X", Xin, mcXin); + ml.registerInput("B_full", mloutput.getBinaryBlockedRDD(B), mloutput.getMatrixCharacteristics(B)); + } + ml.registerOutput("means"); + ml.executeScript(ScriptsUtils.getDMLScript(LogisticRegressionModel.scriptPath), paramsMap) + } + + def joinUsingID(df1:DataFrame, df2:DataFrame):DataFrame = { + val tempDF1 = df1.withColumnRenamed("ID", "ID1") + tempDF1.join(df2, tempDF1.col("ID1").equalTo(df2.col("ID"))).drop("ID1") + } + + def computePredictedClassLabelsFromProbability(mlscoreoutput:MLOutput, isSingleNode:Boolean, sc:SparkContext): MLOutput = { + val mlNew = new MLContext(sc) + if(isSingleNode) { + mlNew.registerInput("Prob", mlscoreoutput.getMatrixBlock("means"), mlscoreoutput.getMatrixCharacteristics("means")); + } + else { + mlNew.registerInput("Prob", mlscoreoutput.getBinaryBlockedRDD("means"), mlscoreoutput.getMatrixCharacteristics("means")); + } + mlNew.registerOutput("Prediction") + mlNew.executeScript( + """ + Prob = read("temp1"); + Prediction = rowIndexMax(Prob); # assuming one-based label mapping + write(Prediction, "tempOut", "csv"); + """) + } +} \ No newline at end of file From 397d7294b0e64184532d10d83b9216d8a6103d50 Mon Sep 17 00:00:00 2001 From: Niketan Pansare Date: Sun, 7 Aug 2016 15:58:06 -0700 Subject: [PATCH 08/14] [SYSTEMML-234] [SYSTEMML-208] Added SVM and python test cases --- scripts/algorithms/l2-svm.dml | 5 +- scripts/algorithms/m-svm.dml | 5 +- .../java/org/apache/sysml/api/MLContext.java | 1 - .../org/apache/sysml/api/python/SystemML.py | 25 ++- .../java/org/apache/sysml/api/python/test.py | 126 ++++++++++++ .../sysml/api/ml/LinearRegression.scala | 1 - .../sysml/api/ml/LogisticRegression.scala | 88 +-------- .../apache/sysml/api/ml/PredictionUtils.scala | 89 ++++++++- .../scala/org/apache/sysml/api/ml/SVM.scala | 187 ++++++++++++++++++ 9 files changed, 436 insertions(+), 91 deletions(-) create mode 100644 src/main/java/org/apache/sysml/api/python/test.py create mode 100644 src/main/scala/org/apache/sysml/api/ml/SVM.scala diff --git a/scripts/algorithms/l2-svm.dml b/scripts/algorithms/l2-svm.dml index fa404185729..1117c711caf 100644 --- a/scripts/algorithms/l2-svm.dml +++ b/scripts/algorithms/l2-svm.dml @@ -160,4 +160,7 @@ extra_model_params[4,1] = dimensions w = t(append(t(w), t(extra_model_params))) write(w, $model, format=cmdLine_fmt) -write(debug_str, $Log) +logFile = $Log +if(logFile != " ") { + write(debug_str, logFile) +} \ No newline at end of file diff --git a/scripts/algorithms/m-svm.dml b/scripts/algorithms/m-svm.dml index e4a7cadb695..04f8a768157 
100644 --- a/scripts/algorithms/m-svm.dml +++ b/scripts/algorithms/m-svm.dml @@ -175,4 +175,7 @@ for(iter_class in 1:ncol(debug_mat)){ debug_str = append(debug_str, iter_class + "," + iter + "," + obj) } } -write(debug_str, $Log) +logFile = $Log +if(logFile != " ") { + write(debug_str, logFile) +} \ No newline at end of file diff --git a/src/main/java/org/apache/sysml/api/MLContext.java b/src/main/java/org/apache/sysml/api/MLContext.java index 32dc544f8e2..54f313e7026 100644 --- a/src/main/java/org/apache/sysml/api/MLContext.java +++ b/src/main/java/org/apache/sysml/api/MLContext.java @@ -65,7 +65,6 @@ import org.apache.sysml.runtime.controlprogram.caching.MatrixObject; import org.apache.sysml.runtime.controlprogram.context.ExecutionContext; import org.apache.sysml.runtime.controlprogram.context.ExecutionContextFactory; -import org.apache.sysml.runtime.controlprogram.context.SparkExecutionContext; import org.apache.sysml.runtime.instructions.Instruction; import org.apache.sysml.runtime.instructions.cp.Data; import org.apache.sysml.runtime.instructions.spark.data.RDDObject; diff --git a/src/main/java/org/apache/sysml/api/python/SystemML.py b/src/main/java/org/apache/sysml/api/python/SystemML.py index 85cc4138285..6f711f11767 100644 --- a/src/main/java/org/apache/sysml/api/python/SystemML.py +++ b/src/main/java/org/apache/sysml/api/python/SystemML.py @@ -30,6 +30,7 @@ import numpy as np import pandas as pd import sklearn as sk +from sklearn import metrics from pyspark.ml.feature import VectorAssembler from pyspark.mllib.linalg import Vectors import sys @@ -377,7 +378,7 @@ def predict(self, X): raise Exception('Unsupported input type') def score(self, X, y): - return sk.metrics.accuracy_score(y, self.predict(X)) + return metrics.accuracy_score(y, self.predict(X)) # Or we can create new Python project with package structure class LogisticRegression(BaseSystemMLEstimator): @@ -421,5 +422,25 @@ def __init__(self, sqlCtx, fit_intercept=True, max_iter=100, tol=0.000001, C=1.0 self.estimator.setIcpt(int(fit_intercept)) self.transferUsingDF = transferUsingDF self.setOutputRawPredictionsToFalse = False - + + def score(self, X, y): + return metrics.r2_score(y, self.predict(X), multioutput='variance_weighted') + + + class SVM(BaseSystemMLEstimator): + + def __init__(self, sqlCtx, fit_intercept=True, max_iter=100, tol=0.000001, C=1.0, is_multi_class=False, transferUsingDF=False): + self.sqlCtx = sqlCtx + self.sc = sqlCtx._sc + self.uid = "svm" + self.estimator = self.sc._jvm.org.apache.sysml.api.ml.SVM(self.uid, self.sc._jsc.sc(), is_multi_class) + self.estimator.setMaxIter(max_iter) + if C <= 0: + raise Exception('C has to be positive') + reg = 1.0 / C + self.estimator.setRegParam(reg) + self.estimator.setTol(tol) + self.estimator.setIcpt(int(fit_intercept)) + self.transferUsingDF = transferUsingDF + self.setOutputRawPredictionsToFalse = False \ No newline at end of file diff --git a/src/main/java/org/apache/sysml/api/python/test.py b/src/main/java/org/apache/sysml/api/python/test.py new file mode 100644 index 00000000000..9a9ee055b2d --- /dev/null +++ b/src/main/java/org/apache/sysml/api/python/test.py @@ -0,0 +1,126 @@ +from sklearn import datasets, neighbors +import SystemML as sml +from pyspark.sql import SQLContext +from pyspark.context import SparkContext +import unittest +from pyspark.ml.evaluation import MulticlassClassificationEvaluator +from pyspark.ml import Pipeline +from pyspark.ml.feature import HashingTF, Tokenizer +import numpy as np + +sc = SparkContext() +sqlCtx = SQLContext(sc) + +# 
Currently not integrated with JUnit test +# ~/spark-1.6.1-scala-2.11/bin/spark-submit --master local[*] --driver-class-path SystemML.jar test.py +class TestMLLearn(unittest.TestCase): + def testLogisticSK1(self): + digits = datasets.load_digits() + X_digits = digits.data + y_digits = digits.target + n_samples = len(X_digits) + X_train = X_digits[:.9 * n_samples] + y_train = y_digits[:.9 * n_samples] + X_test = X_digits[.9 * n_samples:] + y_test = y_digits[.9 * n_samples:] + logistic = sml.mllearn.LogisticRegression(sqlCtx) + score = logistic.fit(X_train, y_train).score(X_test, y_test) + self.failUnless(score > 0.9) + + def testLogisticSK2(self): + digits = datasets.load_digits() + X_digits = digits.data + y_digits = digits.target + n_samples = len(X_digits) + X_train = X_digits[:.9 * n_samples] + y_train = y_digits[:.9 * n_samples] + X_test = X_digits[.9 * n_samples:] + y_test = y_digits[.9 * n_samples:] + # Convert to DataFrame for i/o: current way to transfer data + logistic = sml.mllearn.LogisticRegression(sqlCtx, transferUsingDF=True) + score = logistic.fit(X_train, y_train).score(X_test, y_test) + self.failUnless(score > 0.9) + + def testLogisticMLPipeline1(self): + training = sqlCtx.createDataFrame([ + (0L, "a b c d e spark", 1.0), + (1L, "b d", 2.0), + (2L, "spark f g h", 1.0), + (3L, "hadoop mapreduce", 2.0), + (4L, "b spark who", 1.0), + (5L, "g d a y", 2.0), + (6L, "spark fly", 1.0), + (7L, "was mapreduce", 2.0), + (8L, "e spark program", 1.0), + (9L, "a e c l", 2.0), + (10L, "spark compile", 1.0), + (11L, "hadoop software", 2.0) + ], ["id", "text", "label"]) + tokenizer = Tokenizer(inputCol="text", outputCol="words") + hashingTF = HashingTF(inputCol="words", outputCol="features", numFeatures=20) + lr = sml.mllearn.LogisticRegression(sqlCtx) + pipeline = Pipeline(stages=[tokenizer, hashingTF, lr]) + model = pipeline.fit(training) + test = sqlCtx.createDataFrame([ + (12L, "spark i j k", 1.0), + (13L, "l m n", 2.0), + (14L, "mapreduce spark", 1.0), + (15L, "apache hadoop", 2.0)], ["id", "text", "label"]) + result = model.transform(test) + predictionAndLabels = result.select("prediction", "label") + evaluator = MulticlassClassificationEvaluator() + score = evaluator.evaluate(predictionAndLabels) + self.failUnless(score == 1.0) + + def testLinearRegressionSK1(self): + diabetes = datasets.load_diabetes() + diabetes_X = diabetes.data[:, np.newaxis, 2] + diabetes_X_train = diabetes_X[:-20] + diabetes_X_test = diabetes_X[-20:] + diabetes_y_train = diabetes.target[:-20] + diabetes_y_test = diabetes.target[-20:] + regr = sml.mllearn.LinearRegression(sqlCtx) + regr.fit(diabetes_X_train, diabetes_y_train) + score = regr.score(diabetes_X_test, diabetes_y_test) + self.failUnless(score > 0.4) # TODO: Improve r2-score (may be I am using it incorrectly) + + def testLinearRegressionSK2(self): + diabetes = datasets.load_diabetes() + diabetes_X = diabetes.data[:, np.newaxis, 2] + diabetes_X_train = diabetes_X[:-20] + diabetes_X_test = diabetes_X[-20:] + diabetes_y_train = diabetes.target[:-20] + diabetes_y_test = diabetes.target[-20:] + regr = sml.mllearn.LinearRegression(sqlCtx, transferUsingDF=True) + regr.fit(diabetes_X_train, diabetes_y_train) + score = regr.score(diabetes_X_test, diabetes_y_test) + self.failUnless(score > 0.4) # TODO: Improve r2-score (may be I am using it incorrectly) + + def testSVMSK1(self): + digits = datasets.load_digits() + X_digits = digits.data + y_digits = digits.target + n_samples = len(X_digits) + X_train = X_digits[:.9 * n_samples] + y_train = y_digits[:.9 * 
n_samples] + X_test = X_digits[.9 * n_samples:] + y_test = y_digits[.9 * n_samples:] + svm = sml.mllearn.SVM(sqlCtx, is_multi_class=True) + score = svm.fit(X_train, y_train).score(X_test, y_test) + self.failUnless(score > 0.9) + + def testSVMSK2(self): + digits = datasets.load_digits() + X_digits = digits.data + y_digits = digits.target + n_samples = len(X_digits) + X_train = X_digits[:.9 * n_samples] + y_train = y_digits[:.9 * n_samples] + X_test = X_digits[.9 * n_samples:] + y_test = y_digits[.9 * n_samples:] + svm = sml.mllearn.SVM(sqlCtx, is_multi_class=True, transferUsingDF=True) + score = svm.fit(X_train, y_train).score(X_test, y_test) + self.failUnless(score > 0.9) + +if __name__ == '__main__': + unittest.main() diff --git a/src/main/scala/org/apache/sysml/api/ml/LinearRegression.scala b/src/main/scala/org/apache/sysml/api/ml/LinearRegression.scala index 28d3dcceaab..7f22f8f717b 100644 --- a/src/main/scala/org/apache/sysml/api/ml/LinearRegression.scala +++ b/src/main/scala/org/apache/sysml/api/ml/LinearRegression.scala @@ -51,7 +51,6 @@ class LinearRegression(override val uid: String, val sc: SparkContext, val solve } def transformSchema(schema: StructType): StructType = schema - // Note: will update the y_mb as this will be called by Python mllearn def fit(X_mb: MatrixBlock, y_mb: MatrixBlock): LinearRegressionModel = { val ml = new MLContext(sc) if(y_mb.getNumColumns != 1) { diff --git a/src/main/scala/org/apache/sysml/api/ml/LogisticRegression.scala b/src/main/scala/org/apache/sysml/api/ml/LogisticRegression.scala index 69e4126ac7c..f9ddf9c9964 100644 --- a/src/main/scala/org/apache/sysml/api/ml/LogisticRegression.scala +++ b/src/main/scala/org/apache/sysml/api/ml/LogisticRegression.scala @@ -35,7 +35,6 @@ import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.regression.LabeledPoint import scala.reflect.ClassTag import scala.collection.immutable.HashMap -import org.apache.spark.sql.functions.udf import org.apache.sysml.runtime.matrix.data.MatrixBlock import org.apache.sysml.runtime.DMLRuntimeException @@ -89,27 +88,8 @@ class LogisticRegression(override val uid: String, val sc: SparkContext) extends // Note: will update the y_mb as this will be called by Python mllearn def fit(X_mb: MatrixBlock, y_mb: MatrixBlock): LogisticRegressionModel = { val ml = new MLContext(sc) - val labelMapping = new java.util.HashMap[String, Int] val revLabelMapping = new java.util.HashMap[Int, String] - if(y_mb.getNumColumns != 1) { - throw new RuntimeException("Expected a column vector for y") - } - if(y_mb.isInSparseFormat()) { - throw new DMLRuntimeException("Sparse block is not implemented for fit") - } - else { - val denseBlock = y_mb.getDenseBlock() - var id:Int = 1 - for(i <- 0 until denseBlock.length) { - val v = denseBlock(i).toString() - if(!labelMapping.containsKey(v)) { - labelMapping.put(v, id) - revLabelMapping.put(id, v) - id += 1 - } - denseBlock.update(i, labelMapping.get(v)) - } - } + PredictionUtils.fillLabelMapping(y_mb, revLabelMapping) val mloutput = { ml.registerInput("X", X_mb); @@ -136,14 +116,8 @@ class LogisticRegression(override val uid: String, val sc: SparkContext) extends val ml = new MLContext(df.rdd.sparkContext) val mcXin = new MatrixCharacteristics() val Xin = RDDConverterUtils.vectorDataFrameToBinaryBlock(sc, df, mcXin, false, "features") - val temp = df.select("label").distinct.rdd.map(_.apply(0).toString).collect() - val labelMapping = new java.util.HashMap[String, Int] val revLabelMapping = new java.util.HashMap[Int, String] - for(i <- 0 
until temp.length) { - labelMapping.put(temp(i), i+1) - revLabelMapping.put(i+1, temp(i)) - } - val yin = df.select("label").rdd.map( x => labelMapping.get(x.apply(0).toString).toString ) + val yin = PredictionUtils.fillLabelMapping(df, revLabelMapping) val mloutput = { ml.registerInput("X", Xin, mcXin); ml.registerInput("Y_vec", yin, "csv"); @@ -157,31 +131,6 @@ object LogisticRegressionModel { final val scriptPath = "scripts" + File.separator + "algorithms" + File.separator + "GLM-predict.dml" } -class LogisticRegressionModelSerializableData(val labelMapping: java.util.HashMap[Int, String]) extends Serializable { - def mapLabelStr(x:Double):String = { - if(labelMapping.containsKey(x.toInt)) - labelMapping.get(x.toInt) - else - throw new RuntimeException("Incorrect label mapping") - } - def mapLabelDouble(x:Double):Double = { - if(labelMapping.containsKey(x.toInt)) - labelMapping.get(x.toInt).toDouble - else - throw new RuntimeException("Incorrect label mapping") - } - val mapLabel_udf = { - try { - val it = labelMapping.values().iterator() - while(it.hasNext()) { - it.next().toDouble - } - udf(mapLabelDouble _) - } catch { - case e: Exception => udf(mapLabelStr _) - } - } -} /** * Logistic Regression Scala API */ @@ -206,46 +155,21 @@ class LogisticRegressionModel( val isSingleNode = true val ret = PredictionUtils.computePredictedClassLabelsFromProbability( PredictionUtils.doGLMPredict(isSingleNode, null, X, sc, mloutput, "B_out", getPredictParams), - isSingleNode, sc).getMatrixBlock("Prediction"); + isSingleNode, sc, "means").getMatrixBlock("Prediction"); if(ret.getNumColumns != 1) { throw new RuntimeException("Expected predicted label to be a column vector") } - if(ret.isInSparseFormat()) { - throw new RuntimeException("Since predicted label is a column vector, expected it to be in dense format") - } - else { - updateLabels(true, null, ret, null) - } + PredictionUtils.updateLabels(true, null, ret, null, labelMapping) return ret } } - def updateLabels(isSingleNode:Boolean, df:DataFrame, X: MatrixBlock, labelColName:String): DataFrame = { - if(isSingleNode) { - for(i <- 0 until X.getNumRows) { - val v:Int = X.getValue(i, 0).toInt - if(labelMapping.containsKey(v)) { - X.setValue(i, 0, labelMapping.get(v).toDouble) - } - else { - throw new RuntimeException("No mapping found for " + v + " in " + labelMapping.toString()) - } - } - return null - } - else { - val serObj = new LogisticRegressionModelSerializableData(labelMapping) - return df.withColumn(labelColName, serObj.mapLabel_udf(df(labelColName))) - .withColumnRenamed(labelColName, "prediction") - } - } - override def transform(df: DataFrame): DataFrame = { val isSingleNode = false val glmPredOut = PredictionUtils.doGLMPredict(isSingleNode, df, null, sc, mloutput, "B_out", getPredictParams()) - val predLabelOut = PredictionUtils.computePredictedClassLabelsFromProbability(glmPredOut, isSingleNode, sc) - val predictedDF = updateLabels(isSingleNode, predLabelOut.getDF(df.sqlContext, "Prediction"), null, "C1").select("ID", "prediction") + val predLabelOut = PredictionUtils.computePredictedClassLabelsFromProbability(glmPredOut, isSingleNode, sc, "means") + val predictedDF = PredictionUtils.updateLabels(isSingleNode, predLabelOut.getDF(df.sqlContext, "Prediction"), null, "C1", labelMapping).select("ID", "prediction") val prob = glmPredOut.getDF(df.sqlContext, "means", true).withColumnRenamed("C1", "probability").select("ID", "probability") val dataset = RDDConverterUtils.addIDToDataFrame(df, df.sqlContext, "ID") diff --git 
a/src/main/scala/org/apache/sysml/api/ml/PredictionUtils.scala b/src/main/scala/org/apache/sysml/api/ml/PredictionUtils.scala index 60c5b8c0c2b..13494eedaf8 100644 --- a/src/main/scala/org/apache/sysml/api/ml/PredictionUtils.scala +++ b/src/main/scala/org/apache/sysml/api/ml/PredictionUtils.scala @@ -19,6 +19,8 @@ package org.apache.sysml.api.ml +import org.apache.spark.sql.functions.udf +import org.apache.spark.rdd.RDD import org.apache.sysml.api.{ MLContext, MLOutput } import org.apache.spark.sql.DataFrame import org.apache.spark.SparkContext @@ -45,18 +47,99 @@ object PredictionUtils { ml.executeScript(ScriptsUtils.getDMLScript(LogisticRegressionModel.scriptPath), paramsMap) } + def fillLabelMapping(df: DataFrame, revLabelMapping: java.util.HashMap[Int, String]): RDD[String] = { + val temp = df.select("label").distinct.rdd.map(_.apply(0).toString).collect() + val labelMapping = new java.util.HashMap[String, Int] + for(i <- 0 until temp.length) { + labelMapping.put(temp(i), i+1) + revLabelMapping.put(i+1, temp(i)) + } + df.select("label").rdd.map( x => labelMapping.get(x.apply(0).toString).toString ) + } + + def fillLabelMapping(y_mb: MatrixBlock, revLabelMapping: java.util.HashMap[Int, String]): Unit = { + val labelMapping = new java.util.HashMap[String, Int] + if(y_mb.getNumColumns != 1) { + throw new RuntimeException("Expected a column vector for y") + } + if(y_mb.isInSparseFormat()) { + throw new DMLRuntimeException("Sparse block is not implemented for fit") + } + else { + val denseBlock = y_mb.getDenseBlock() + var id:Int = 1 + for(i <- 0 until denseBlock.length) { + val v = denseBlock(i).toString() + if(!labelMapping.containsKey(v)) { + labelMapping.put(v, id) + revLabelMapping.put(id, v) + id += 1 + } + denseBlock.update(i, labelMapping.get(v)) + } + } + } + + class LabelMappingData(val labelMapping: java.util.HashMap[Int, String]) extends Serializable { + def mapLabelStr(x:Double):String = { + if(labelMapping.containsKey(x.toInt)) + labelMapping.get(x.toInt) + else + throw new RuntimeException("Incorrect label mapping") + } + def mapLabelDouble(x:Double):Double = { + if(labelMapping.containsKey(x.toInt)) + labelMapping.get(x.toInt).toDouble + else + throw new RuntimeException("Incorrect label mapping") + } + val mapLabel_udf = { + try { + val it = labelMapping.values().iterator() + while(it.hasNext()) { + it.next().toDouble + } + udf(mapLabelDouble _) + } catch { + case e: Exception => udf(mapLabelStr _) + } + } + } + def updateLabels(isSingleNode:Boolean, df:DataFrame, X: MatrixBlock, labelColName:String, labelMapping: java.util.HashMap[Int, String]): DataFrame = { + if(isSingleNode) { + if(X.isInSparseFormat()) { + throw new RuntimeException("Since predicted label is a column vector, expected it to be in dense format") + } + for(i <- 0 until X.getNumRows) { + val v:Int = X.getValue(i, 0).toInt + if(labelMapping.containsKey(v)) { + X.setValue(i, 0, labelMapping.get(v).toDouble) + } + else { + throw new RuntimeException("No mapping found for " + v + " in " + labelMapping.toString()) + } + } + return null + } + else { + val serObj = new LabelMappingData(labelMapping) + return df.withColumn(labelColName, serObj.mapLabel_udf(df(labelColName))) + .withColumnRenamed(labelColName, "prediction") + } + } + def joinUsingID(df1:DataFrame, df2:DataFrame):DataFrame = { val tempDF1 = df1.withColumnRenamed("ID", "ID1") tempDF1.join(df2, tempDF1.col("ID1").equalTo(df2.col("ID"))).drop("ID1") } - def computePredictedClassLabelsFromProbability(mlscoreoutput:MLOutput, isSingleNode:Boolean, 
sc:SparkContext): MLOutput = { + def computePredictedClassLabelsFromProbability(mlscoreoutput:MLOutput, isSingleNode:Boolean, sc:SparkContext, inProbVar:String): MLOutput = { val mlNew = new MLContext(sc) if(isSingleNode) { - mlNew.registerInput("Prob", mlscoreoutput.getMatrixBlock("means"), mlscoreoutput.getMatrixCharacteristics("means")); + mlNew.registerInput("Prob", mlscoreoutput.getMatrixBlock(inProbVar), mlscoreoutput.getMatrixCharacteristics(inProbVar)); } else { - mlNew.registerInput("Prob", mlscoreoutput.getBinaryBlockedRDD("means"), mlscoreoutput.getMatrixCharacteristics("means")); + mlNew.registerInput("Prob", mlscoreoutput.getBinaryBlockedRDD(inProbVar), mlscoreoutput.getMatrixCharacteristics(inProbVar)); } mlNew.registerOutput("Prediction") mlNew.executeScript( diff --git a/src/main/scala/org/apache/sysml/api/ml/SVM.scala b/src/main/scala/org/apache/sysml/api/ml/SVM.scala new file mode 100644 index 00000000000..7a48c1ded13 --- /dev/null +++ b/src/main/scala/org/apache/sysml/api/ml/SVM.scala @@ -0,0 +1,187 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.sysml.api.ml + +import java.io.File +import org.apache.spark.SparkContext +import org.apache.spark.ml.{ Model, Estimator } +import org.apache.spark.sql.DataFrame +import org.apache.spark.sql.types.StructType +import org.apache.spark.ml.param.ParamMap +import org.apache.sysml.api.{ MLContext, MLOutput } +import org.apache.sysml.runtime.matrix.MatrixCharacteristics +import org.apache.sysml.runtime.matrix.data.MatrixBlock +import org.apache.sysml.runtime.DMLRuntimeException +import org.apache.sysml.runtime.instructions.spark.utils.{ RDDConverterUtilsExt => RDDConverterUtils } + +object SVM { + final val scriptPathBinary = "scripts" + File.separator + "algorithms" + File.separator + "l2-svm.dml" + final val scriptPathMulticlass = "scripts" + File.separator + "algorithms" + File.separator + "m-svm.dml" +} + +class SVM (override val uid: String, val sc: SparkContext, val isMultiClass:Boolean=false) extends Estimator[SVMModel] with HasIcpt + with HasRegParam with HasTol with HasMaxOuterIter { + + def setIcpt(value: Int) = set(icpt, value) + def setMaxIter(value: Int) = set(maxOuterIter, value) + def setRegParam(value: Double) = set(regParam, value) + def setTol(value: Double) = set(tol, value) + + def setModelParams(m:SVMModel):SVMModel = { + m.setIcpt(this.getIcpt).setMaxIter(this.getMaxOuterIte).setRegParam(this.getRegParam).setTol(this.getTol) + } + + override def copy(extra: ParamMap): Estimator[SVMModel] = { + val that = new SVM(uid, sc, isMultiClass) + copyValues(that, extra) + } + def transformSchema(schema: StructType): StructType = schema + + def fit(X_mb: MatrixBlock, y_mb: MatrixBlock): SVMModel = { + val ml = new MLContext(sc) + val revLabelMapping = new java.util.HashMap[Int, String] + PredictionUtils.fillLabelMapping(y_mb, revLabelMapping) + if(y_mb.getNumColumns != 1) { + throw new RuntimeException("Expected a column vector for y") + } + val mloutput = { + ml.registerInput("X", X_mb); + ml.registerInput("Y", y_mb); + ml.registerOutput("w"); + if(isMultiClass) + ml.executeScript(ScriptsUtils.getDMLScript(SVM.scriptPathMulticlass), getParamMap()) + else { + ml.executeScript(ScriptsUtils.getDMLScript(SVM.scriptPathBinary), getParamMap()) + } + } + setModelParams(new SVMModel("svm")(mloutput, sc, isMultiClass, revLabelMapping)) + } + + def getParamMap(): Map[String, String] = { + Map( "icpt" -> this.getIcpt.toString(), + "reg" -> this.getRegParam.toString(), + "tol" -> this.getTol.toString, + "maxiter" -> this.getMaxOuterIte.toString, + "X" -> " ", + "Y" -> " ", + "model" -> " ", + "Log" -> " ") + } + + def fit(df: DataFrame): SVMModel = { + val ml = new MLContext(df.rdd.sparkContext) + val mcXin = new MatrixCharacteristics() + val Xin = RDDConverterUtils.vectorDataFrameToBinaryBlock(sc, df, mcXin, false, "features") + val revLabelMapping = new java.util.HashMap[Int, String] + val yin = PredictionUtils.fillLabelMapping(df, revLabelMapping) + val mloutput = { + ml.registerInput("X", Xin, mcXin); + ml.registerInput("Y", yin, "csv"); + ml.registerOutput("w"); + if(isMultiClass) + ml.executeScript(ScriptsUtils.getDMLScript(SVM.scriptPathMulticlass), getParamMap()) + else { + ml.executeScript(ScriptsUtils.getDMLScript(SVM.scriptPathBinary), getParamMap()) + } + } + setModelParams(new SVMModel("svm")(mloutput, sc, isMultiClass, revLabelMapping)) + } + +} + +object SVMModel { + final val predictionScriptPathBinary = "scripts" + File.separator + "algorithms" + File.separator + "l2-svm-predict.dml" + final val predictionScriptPathMulticlass = "scripts" + 
File.separator + "algorithms" + File.separator + "m-svm-predict.dml" +} + +class SVMModel (override val uid: String)(val mloutput: MLOutput, val sc: SparkContext, val isMultiClass:Boolean, val labelMapping: java.util.HashMap[Int, String]) extends Model[SVMModel] with HasIcpt + with HasRegParam with HasTol with HasMaxOuterIter { + override def copy(extra: ParamMap): SVMModel = { + val that = new SVMModel(uid)(mloutput, sc, isMultiClass, labelMapping) + copyValues(that, extra) + } + + def setIcpt(value: Int) = set(icpt, value) + def setMaxIter(value: Int) = set(maxOuterIter, value) + def setRegParam(value: Double) = set(regParam, value) + def setTol(value: Double) = set(tol, value) + + override def transformSchema(schema: StructType): StructType = schema + + def transform(df: DataFrame): DataFrame = { + val ml = new MLContext(sc) + val mcXin = new MatrixCharacteristics() + val Xin = RDDConverterUtils.vectorDataFrameToBinaryBlock(df.rdd.sparkContext, df, mcXin, false, "features") + ml.registerInput("X", Xin, mcXin); + ml.registerOutput("scores"); + val glmPredOut = { + if(isMultiClass) { + ml.registerInput("W", mloutput.getBinaryBlockedRDD("w"), mloutput.getMatrixCharacteristics("w")); + ml.executeScript(ScriptsUtils.getDMLScript(SVMModel.predictionScriptPathMulticlass), getPredictParams()) + } + else { + ml.registerInput("w", mloutput.getBinaryBlockedRDD("w"), mloutput.getMatrixCharacteristics("w")); + ml.executeScript(ScriptsUtils.getDMLScript(SVMModel.predictionScriptPathBinary), getPredictParams()) + } + } + val isSingleNode = false + val predLabelOut = PredictionUtils.computePredictedClassLabelsFromProbability(glmPredOut, isSingleNode, sc, "scores") + val predictedDF = PredictionUtils.updateLabels(isSingleNode, predLabelOut.getDF(df.sqlContext, "Prediction"), null, "C1", labelMapping).select("ID", "prediction") + val dataset = RDDConverterUtils.addIDToDataFrame(df, df.sqlContext, "ID") + return PredictionUtils.joinUsingID(dataset, predictedDF) + } + + def transform(X: MatrixBlock): MatrixBlock = { + val ml = new MLContext(sc) + ml.registerInput("X", X); + ml.registerInput("w", mloutput.getMatrixBlock("w"), mloutput.getMatrixCharacteristics("w")); + ml.registerOutput("scores"); + val glmPredOut = { + if(isMultiClass) { + ml.registerInput("W", mloutput.getMatrixBlock("w"), mloutput.getMatrixCharacteristics("w")); + ml.executeScript(ScriptsUtils.getDMLScript(SVMModel.predictionScriptPathMulticlass), getPredictParams()) + } + else { + ml.registerInput("w", mloutput.getMatrixBlock("w"), mloutput.getMatrixCharacteristics("w")); + ml.executeScript(ScriptsUtils.getDMLScript(SVMModel.predictionScriptPathBinary), getPredictParams()) + } + } + val isSingleNode = true + val ret = PredictionUtils.computePredictedClassLabelsFromProbability(glmPredOut, isSingleNode, sc, "scores").getMatrixBlock("Prediction"); + if(ret.getNumColumns != 1) { + throw new RuntimeException("Expected predicted label to be a column vector") + } + PredictionUtils.updateLabels(true, null, ret, null, labelMapping) + return ret + } + + + def getPredictParams(): Map[String, String] = { + Map( "icpt" -> this.getIcpt.toString(), + "reg" -> this.getRegParam.toString(), + "tol" -> this.getTol.toString, + "maxiter" -> this.getMaxOuterIte.toString, + "X" -> " ", + "Y" -> " ", + "model" -> " ", + "Log" -> " ") + } + +} \ No newline at end of file From d4aff09030624e0cee8aed269bd20d64f422b8a8 Mon Sep 17 00:00:00 2001 From: Niketan Pansare Date: Sun, 7 Aug 2016 17:15:43 -0700 Subject: [PATCH 09/14] [SYSTEMML-234] [SYSTEMML-208] Updating 
the documentation --- docs/algorithms-classification.md | 139 ++++++++++++++++++++++++++++++ docs/algorithms-regression.md | 62 +++++++++++++ 2 files changed, 201 insertions(+) diff --git a/docs/algorithms-classification.md b/docs/algorithms-classification.md index 2488a8c7f1a..339f2d882fe 100644 --- a/docs/algorithms-classification.md +++ b/docs/algorithms-classification.md @@ -127,6 +127,15 @@ Eqs. (1) and (2). ### Usage
+
+import SystemML as sml +# C = 1/reg +logistic = sml.mllearn.LogisticRegression(sqlCtx, fit_intercept=True, max_iter=100, max_inner_iter=0, tol=0.000001, C=1.0) +# X_train, y_train and X_test can be NumPy matrices or Pandas DataFrame +y_test = logistic.fit(X_train, y_train).predict(X_test) +# df_train is DataFrame that contains two columns: "features" (of type Vector) and "label". df_test is a DataFrame that contains the column "features" +y_test = logistic.fit(df_train).transform(df_test) +
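The `C = 1/reg` comment follows the scikit-learn convention: larger `C` means weaker regularization. A minimal sketch of the mapping the Python wrapper is assumed to apply before handing `reg` to the DML script (rejecting non-positive `C` is part of that assumption):

{% highlight python %}
# A minimal sketch (assumption): a scikit-learn style C is inverted into the DML "reg" value.
def c_to_reg(C):
    if C <= 0:
        raise ValueError('C has to be positive')
    return 1.0 / C

print(c_to_reg(1.0))    # reg = 1.0
print(c_to_reg(10.0))   # reg = 0.1, i.e. weaker regularization for larger C
{% endhighlight %}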
hadoop jar SystemML.jar -f MultiLogReg.dml -nvargs X= @@ -214,6 +223,56 @@ SystemML Language Reference for details. ### Examples
+
+# Scikit-learn way +from sklearn import datasets, neighbors +import SystemML as sml +from pyspark.sql import SQLContext +sqlCtx = SQLContext(sc) +digits = datasets.load_digits() +X_digits = digits.data +y_digits = digits.target + 1 +n_samples = len(X_digits) +X_train = X_digits[:.9 * n_samples] +y_train = y_digits[:.9 * n_samples] +X_test = X_digits[.9 * n_samples:] +y_test = y_digits[.9 * n_samples:] +logistic = sml.mllearn.LogisticRegression(sqlCtx) +print('LogisticRegression score: %f' % logistic.fit(X_train, y_train).score(X_test, y_test)) + +# MLPipeline way +from pyspark.ml import Pipeline +import SystemML as sml +from pyspark.ml.feature import HashingTF, Tokenizer +from pyspark.sql import SQLContext +sqlCtx = SQLContext(sc) +training = sqlCtx.createDataFrame([ + (0L, "a b c d e spark", 1.0), + (1L, "b d", 2.0), + (2L, "spark f g h", 1.0), + (3L, "hadoop mapreduce", 2.0), + (4L, "b spark who", 1.0), + (5L, "g d a y", 2.0), + (6L, "spark fly", 1.0), + (7L, "was mapreduce", 2.0), + (8L, "e spark program", 1.0), + (9L, "a e c l", 2.0), + (10L, "spark compile", 1.0), + (11L, "hadoop software", 2.0) +], ["id", "text", "label"]) +tokenizer = Tokenizer(inputCol="text", outputCol="words") +hashingTF = HashingTF(inputCol="words", outputCol="features", numFeatures=20) +lr = sml.mllearn.LogisticRegression(sqlCtx) +pipeline = Pipeline(stages=[tokenizer, hashingTF, lr]) +model = pipeline.fit(training) +test = sqlCtx.createDataFrame([ + (12L, "spark i j k"), + (13L, "l m n"), + (14L, "mapreduce spark"), + (15L, "apache hadoop")], ["id", "text"]) +prediction = model.transform(test) +prediction.show() +
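When the held-out rows also carry labels, the pipeline output can be scored with Spark's `MulticlassClassificationEvaluator`. A short sketch, assuming the fitted `model` and the `sqlCtx` from the example above:

{% highlight python %}
# A short sketch (assumes the fitted Pipeline `model` and `sqlCtx` from above).
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

labeled_test = sqlCtx.createDataFrame([
    (12L, "spark i j k", 1.0),
    (13L, "l m n", 2.0),
    (14L, "mapreduce spark", 1.0),
    (15L, "apache hadoop", 2.0)], ["id", "text", "label"])
predictionAndLabels = model.transform(labeled_test).select("prediction", "label")
evaluator = MulticlassClassificationEvaluator()
print("Pipeline F1 score: %f" % evaluator.evaluate(predictionAndLabels))
{% endhighlight %}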
hadoop jar SystemML.jar -f MultiLogReg.dml -nvargs X=/user/ml/X.mtx @@ -393,6 +452,15 @@ support vector machine (`y` with domain size `2`). **Binary-Class Support Vector Machines**:
+
+import SystemML as sml +# C = 1/reg +svm = sml.mllearn.SVM(sqlCtx, fit_intercept=True, max_iter=100, tol=0.000001, C=1.0, is_multi_class=False) +# X_train, y_train and X_test can be NumPy matrices or Pandas DataFrame +y_test = svm.fit(X_train, y_train) +# df_train is DataFrame that contains two columns: "features" (of type Vector) and "label". df_test is a DataFrame that contains the column "features" +y_test = svm.fit(df_train) +
hadoop jar SystemML.jar -f l2-svm.dml -nvargs X= @@ -428,6 +496,12 @@ support vector machine (`y` with domain size `2`). **Binary-Class Support Vector Machines Prediction**:
+
+# X_test can be NumPy matrices or Pandas DataFrame +y_test = svm.predict(X_test) +# df_test is a DataFrame that contains the column "features" of type Vector +y_test = svm.transform(df_test) +
hadoop jar SystemML.jar -f l2-svm-predict.dml -nvargs X= @@ -630,6 +704,15 @@ class labels. **Multi-Class Support Vector Machines**:
+
+import SystemML as sml +# C = 1/reg +svm = sml.mllearn.SVM(sqlCtx, fit_intercept=True, max_iter=100, tol=0.000001, C=1.0, is_multi_class=True) +# X_train, y_train and X_test can be NumPy matrices or Pandas DataFrame +y_test = svm.fit(X_train, y_train) +# df_train is DataFrame that contains two columns: "features" (of type Vector) and "label". df_test is a DataFrame that contains the column "features" +y_test = svm.fit(df_train) +
hadoop jar SystemML.jar -f m-svm.dml -nvargs X= @@ -665,6 +748,12 @@ class labels. **Multi-Class Support Vector Machines Prediction**:
+
+# X_test can be NumPy matrices or Pandas DataFrame +y_test = svm.predict(X_test) +# df_test is a DataFrame that contains the column "features" of type Vector +y_test = svm.transform(df_test) +
hadoop jar SystemML.jar -f m-svm-predict.dml -nvargs X= @@ -747,6 +836,56 @@ SystemML Language Reference for details. **Multi-Class Support Vector Machines**:
+
+# Scikit-learn way +from sklearn import datasets, neighbors +import SystemML as sml +from pyspark.sql import SQLContext +sqlCtx = SQLContext(sc) +digits = datasets.load_digits() +X_digits = digits.data +y_digits = digits.target +n_samples = len(X_digits) +X_train = X_digits[:.9 * n_samples] +y_train = y_digits[:.9 * n_samples] +X_test = X_digits[.9 * n_samples:] +y_test = y_digits[.9 * n_samples:] +svm = sml.mllearn.SVM(sqlCtx, is_multi_class=True) +print('SVM score: %f' % svm.fit(X_train, y_train).score(X_test, y_test)) + +# MLPipeline way +from pyspark.ml import Pipeline +import SystemML as sml +from pyspark.ml.feature import HashingTF, Tokenizer +from pyspark.sql import SQLContext +sqlCtx = SQLContext(sc) +training = sqlCtx.createDataFrame([ + (0L, "a b c d e spark", 1.0), + (1L, "b d", 2.0), + (2L, "spark f g h", 1.0), + (3L, "hadoop mapreduce", 2.0), + (4L, "b spark who", 1.0), + (5L, "g d a y", 2.0), + (6L, "spark fly", 1.0), + (7L, "was mapreduce", 2.0), + (8L, "e spark program", 1.0), + (9L, "a e c l", 2.0), + (10L, "spark compile", 1.0), + (11L, "hadoop software", 2.0) +], ["id", "text", "label"]) +tokenizer = Tokenizer(inputCol="text", outputCol="words") +hashingTF = HashingTF(inputCol="words", outputCol="features", numFeatures=20) +svm = sml.mllearn.SVM(sqlCtx, is_multi_class=True) +pipeline = Pipeline(stages=[tokenizer, hashingTF, svm]) +model = pipeline.fit(training) +test = sqlCtx.createDataFrame([ + (12L, "spark i j k"), + (13L, "l m n"), + (14L, "mapreduce spark"), + (15L, "apache hadoop")], ["id", "text"]) +prediction = model.transform(test) +prediction.show() +
hadoop jar SystemML.jar -f m-svm.dml -nvargs X=/user/ml/X.mtx diff --git a/docs/algorithms-regression.md b/docs/algorithms-regression.md index 6472c176dfb..2ec549ccc60 100644 --- a/docs/algorithms-regression.md +++ b/docs/algorithms-regression.md @@ -80,6 +80,15 @@ efficient when the number of features $m$ is relatively small **Linear Regression - Direct Solve**:
+
+import SystemML as sml +# C = 1/reg +lr = sml.mllearn.LinearRegression(sqlCtx, fit_intercept=True, max_iter=100, tol=0.000001, C=1.0, solver='direct-solve') +# X_train, y_train and X_test can be NumPy matrices or Pandas DataFrame +y_test = lr.fit(X_train, y_train) +# df_train is DataFrame that contains two columns: "features" (of type Vector) and "label". df_test is a DataFrame that contains the column "features" +y_test = lr.fit(df_train) +
hadoop jar SystemML.jar -f LinearRegDS.dml -nvargs X= @@ -111,6 +120,15 @@ efficient when the number of features $m$ is relatively small **Linear Regression - Conjugate Gradient**:
+
+import SystemML as sml +# C = 1/reg +lr = sml.mllearn.LinearRegression(sqlCtx, fit_intercept=True, max_iter=100, tol=0.000001, C=1.0, solver='newton-cg') +# X_train, y_train and X_test can be NumPy matrices or Pandas DataFrame +y_test = lr.fit(X_train, y_train) +# df_train is DataFrame that contains two columns: "features" (of type Vector) and "label". df_test is a DataFrame that contains the column "features" +y_test = lr.fit(df_train) +
hadoop jar SystemML.jar -f LinearRegCG.dml -nvargs X= @@ -196,6 +214,28 @@ SystemML Language Reference for details. **Linear Regression - Direct Solve**:
+
+import numpy as np +from sklearn import datasets +import SystemML as sml +from pyspark.sql import SQLContext +# Load the diabetes dataset +diabetes = datasets.load_diabetes() +# Use only one feature +diabetes_X = diabetes.data[:, np.newaxis, 2] +# Split the data into training/testing sets +diabetes_X_train = diabetes_X[:-20] +diabetes_X_test = diabetes_X[-20:] +# Split the targets into training/testing sets +diabetes_y_train = diabetes.target[:-20] +diabetes_y_test = diabetes.target[-20:] +# Create linear regression object +regr = sml.mllearn.LinearRegression(sqlCtx, solver='direct-solve') +# Train the model using the training sets +regr.fit(diabetes_X_train, diabetes_y_train) +# The mean square error +print("Residual sum of squares: %.2f" % np.mean((regr.predict(diabetes_X_test) - diabetes_y_test) ** 2)) +
hadoop jar SystemML.jar -f LinearRegDS.dml -nvargs X=/user/ml/X.mtx @@ -227,6 +267,28 @@ SystemML Language Reference for details. **Linear Regression - Conjugate Gradient**:
+
+import numpy as np +from sklearn import datasets +import SystemML as sml +from pyspark.sql import SQLContext +# Load the diabetes dataset +diabetes = datasets.load_diabetes() +# Use only one feature +diabetes_X = diabetes.data[:, np.newaxis, 2] +# Split the data into training/testing sets +diabetes_X_train = diabetes_X[:-20] +diabetes_X_test = diabetes_X[-20:] +# Split the targets into training/testing sets +diabetes_y_train = diabetes.target[:-20] +diabetes_y_test = diabetes.target[-20:] +# Create linear regression object +regr = sml.mllearn.LinearRegression(sqlCtx, solver='newton-cg') +# Train the model using the training sets +regr.fit(diabetes_X_train, diabetes_y_train) +# The mean square error +print("Residual sum of squares: %.2f" % np.mean((regr.predict(diabetes_X_test) - diabetes_y_test) ** 2)) +
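The residual sum of squares above can be complemented by the estimator's own `score` method, which is assumed to report a variance-weighted R2 as in scikit-learn. A one-line sketch using the fitted `regr` from the example:

{% highlight python %}
# A one-line sketch (assumes the fitted `regr` from the example above).
print("Variance-weighted R^2: %.2f" % regr.score(diabetes_X_test, diabetes_y_test))
{% endhighlight %}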
hadoop jar SystemML.jar -f LinearRegCG.dml -nvargs X=/user/ml/X.mtx From ca671346e4e16134e0485ebf37de6d79e1254d30 Mon Sep 17 00:00:00 2001 From: Niketan Pansare Date: Sun, 7 Aug 2016 17:31:16 -0700 Subject: [PATCH 10/14] Updated documentation --- docs/algorithms-classification.md | 14 ++++++++++++++ docs/algorithms-regression.md | 8 ++++++++ 2 files changed, 22 insertions(+) diff --git a/docs/algorithms-classification.md b/docs/algorithms-classification.md index 339f2d882fe..4797429122a 100644 --- a/docs/algorithms-classification.md +++ b/docs/algorithms-classification.md @@ -128,6 +128,7 @@ Eqs. (1) and (2).
+{% highlight python %} import SystemML as sml # C = 1/reg logistic = sml.mllearn.LogisticRegression(sqlCtx, fit_intercept=True, max_iter=100, max_inner_iter=0, tol=0.000001, C=1.0) @@ -135,6 +136,7 @@ logistic = sml.mllearn.LogisticRegression(sqlCtx, fit_intercept=True, max_iter=1 y_test = logistic.fit(X_train, y_train).predict(X_test) # df_train is DataFrame that contains two columns: "features" (of type Vector) and "label". df_test is a DataFrame that contains the column "features" y_test = logistic.fit(df_train).transform(df_test) +{% endhighlight %}
hadoop jar SystemML.jar -f MultiLogReg.dml @@ -224,6 +226,7 @@ SystemML Language Reference for details.
+{% highlight python %} # Scikit-learn way from sklearn import datasets, neighbors import SystemML as sml @@ -272,6 +275,7 @@ test = sqlCtx.createDataFrame([ (15L, "apache hadoop")], ["id", "text"]) prediction = model.transform(test) prediction.show() +{% endhighlight %}
hadoop jar SystemML.jar -f MultiLogReg.dml @@ -453,6 +457,7 @@ support vector machine (`y` with domain size `2`).
+{% highlight python %} import SystemML as sml # C = 1/reg svm = sml.mllearn.SVM(sqlCtx, fit_intercept=True, max_iter=100, tol=0.000001, C=1.0, is_multi_class=False) @@ -460,6 +465,7 @@ svm = sml.mllearn.SVM(sqlCtx, fit_intercept=True, max_iter=100, tol=0.000001, C= y_test = svm.fit(X_train, y_train) # df_train is DataFrame that contains two columns: "features" (of type Vector) and "label". df_test is a DataFrame that contains the column "features" y_test = svm.fit(df_train) +{% endhighlight %}
hadoop jar SystemML.jar -f l2-svm.dml @@ -497,10 +503,12 @@ y_test = svm.fit(df_train)
+{% highlight python %} # X_test can be NumPy matrices or Pandas DataFrame y_test = svm.predict(X_test) # df_test is a DataFrame that contains the column "features" of type Vector y_test = svm.transform(df_test) +{% endhighlight %}
hadoop jar SystemML.jar -f l2-svm-predict.dml @@ -705,6 +713,7 @@ class labels.
+{% highlight python %} import SystemML as sml # C = 1/reg svm = sml.mllearn.SVM(sqlCtx, fit_intercept=True, max_iter=100, tol=0.000001, C=1.0, is_multi_class=True) @@ -712,6 +721,7 @@ svm = sml.mllearn.SVM(sqlCtx, fit_intercept=True, max_iter=100, tol=0.000001, C= y_test = svm.fit(X_train, y_train) # df_train is DataFrame that contains two columns: "features" (of type Vector) and "label". df_test is a DataFrame that contains the column "features" y_test = svm.fit(df_train) +{% endhighlight %}
hadoop jar SystemML.jar -f m-svm.dml @@ -749,10 +759,12 @@ y_test = svm.fit(df_train)
+{% highlight python %} # X_test can be NumPy matrices or Pandas DataFrame y_test = svm.predict(X_test) # df_test is a DataFrame that contains the column "features" of type Vector y_test = svm.transform(df_test) +{% endhighlight %}
hadoop jar SystemML.jar -f m-svm-predict.dml @@ -837,6 +849,7 @@ SystemML Language Reference for details.
+{% highlight python %} # Scikit-learn way from sklearn import datasets, neighbors import SystemML as sml @@ -885,6 +898,7 @@ test = sqlCtx.createDataFrame([ (15L, "apache hadoop")], ["id", "text"]) prediction = model.transform(test) prediction.show() +{% endhighlight %}
hadoop jar SystemML.jar -f m-svm.dml diff --git a/docs/algorithms-regression.md b/docs/algorithms-regression.md index 2ec549ccc60..628abcea230 100644 --- a/docs/algorithms-regression.md +++ b/docs/algorithms-regression.md @@ -81,6 +81,7 @@ efficient when the number of features $m$ is relatively small
+{% highlight python %} import SystemML as sml # C = 1/reg lr = sml.mllearn.LinearRegression(sqlCtx, fit_intercept=True, max_iter=100, tol=0.000001, C=1.0, solver='direct-solve') @@ -88,6 +89,7 @@ lr = sml.mllearn.LinearRegression(sqlCtx, fit_intercept=True, max_iter=100, tol= y_test = lr.fit(X_train, y_train) # df_train is DataFrame that contains two columns: "features" (of type Vector) and "label". df_test is a DataFrame that contains the column "features" y_test = lr.fit(df_train) +{% endhighlight %}
hadoop jar SystemML.jar -f LinearRegDS.dml @@ -121,6 +123,7 @@ y_test = lr.fit(df_train)
+{% highlight python %} import SystemML as sml # C = 1/reg lr = sml.mllearn.LinearRegression(sqlCtx, fit_intercept=True, max_iter=100, tol=0.000001, C=1.0, solver='newton-cg') @@ -128,6 +131,7 @@ lr = sml.mllearn.LinearRegression(sqlCtx, fit_intercept=True, max_iter=100, tol= y_test = lr.fit(X_train, y_train) # df_train is DataFrame that contains two columns: "features" (of type Vector) and "label". df_test is a DataFrame that contains the column "features" y_test = lr.fit(df_train) +{% endhighlight %}
hadoop jar SystemML.jar -f LinearRegCG.dml @@ -215,6 +219,7 @@ SystemML Language Reference for details.
+{% highlight python %} import numpy as np from sklearn import datasets import SystemML as sml @@ -235,6 +240,7 @@ regr = sml.mllearn.LinearRegression(sqlCtx, solver='direct-solve') regr.fit(diabetes_X_train, diabetes_y_train) # The mean square error print("Residual sum of squares: %.2f" % np.mean((regr.predict(diabetes_X_test) - diabetes_y_test) ** 2)) +{% endhighlight %}
hadoop jar SystemML.jar -f LinearRegDS.dml @@ -268,6 +274,7 @@ print("Residual sum of squares: %.2f" % np.mean((regr.predict(diabetes_X_test) -
+{% highlight python %} import numpy as np from sklearn import datasets import SystemML as sml @@ -288,6 +295,7 @@ regr = sml.mllearn.LinearRegression(sqlCtx, solver='newton-cg') regr.fit(diabetes_X_train, diabetes_y_train) # The mean square error print("Residual sum of squares: %.2f" % np.mean((regr.predict(diabetes_X_test) - diabetes_y_test) ** 2)) +{% endhighlight %}
hadoop jar SystemML.jar -f LinearRegCG.dml From cfe6087b16dc0f4d401e68bb22b72dae3fa272b4 Mon Sep 17 00:00:00 2001 From: Niketan Pansare Date: Sun, 7 Aug 2016 17:51:57 -0700 Subject: [PATCH 11/14] Updating the LinRegDS documentation --- docs/algorithms-regression.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/algorithms-regression.md b/docs/algorithms-regression.md index 628abcea230..2153342d1aa 100644 --- a/docs/algorithms-regression.md +++ b/docs/algorithms-regression.md @@ -84,7 +84,7 @@ efficient when the number of features $m$ is relatively small {% highlight python %} import SystemML as sml # C = 1/reg -lr = sml.mllearn.LinearRegression(sqlCtx, fit_intercept=True, max_iter=100, tol=0.000001, C=1.0, solver='direct-solve') +lr = sml.mllearn.LinearRegression(sqlCtx, fit_intercept=True, C=1.0, solver='direct-solve') # X_train, y_train and X_test can be NumPy matrices or Pandas DataFrame y_test = lr.fit(X_train, y_train) # df_train is DataFrame that contains two columns: "features" (of type Vector) and "label". df_test is a DataFrame that contains the column "features" From 65eb8889b053b062ee91b96545d405fa6e82d9c9 Mon Sep 17 00:00:00 2001 From: Niketan Pansare Date: Sun, 7 Aug 2016 22:34:34 -0700 Subject: [PATCH 12/14] Added naive bayes and scipy sparse matrix --- docs/algorithms-classification.md | 49 +++++- docs/algorithms-regression.md | 4 +- scripts/algorithms/naive-bayes-predict.dml | 15 +- scripts/algorithms/naive-bayes.dml | 5 +- .../org/apache/sysml/api/python/SystemML.py | 35 +++- .../java/org/apache/sysml/api/python/test.py | 52 ++++++ .../spark/utils/RDDConverterUtilsExt.java | 18 ++ .../sysml/api/ml/LogisticRegression.scala | 2 +- .../org/apache/sysml/api/ml/NaiveBayes.scala | 156 ++++++++++++++++++ 9 files changed, 313 insertions(+), 23 deletions(-) create mode 100644 src/main/scala/org/apache/sysml/api/ml/NaiveBayes.scala diff --git a/docs/algorithms-classification.md b/docs/algorithms-classification.md index 4797429122a..f25d78ea459 100644 --- a/docs/algorithms-classification.md +++ b/docs/algorithms-classification.md @@ -132,7 +132,7 @@ Eqs. (1) and (2). import SystemML as sml # C = 1/reg logistic = sml.mllearn.LogisticRegression(sqlCtx, fit_intercept=True, max_iter=100, max_inner_iter=0, tol=0.000001, C=1.0) -# X_train, y_train and X_test can be NumPy matrices or Pandas DataFrame +# X_train, y_train and X_test can be NumPy matrices or Pandas DataFrame or SciPy Sparse Matrix y_test = logistic.fit(X_train, y_train).predict(X_test) # df_train is DataFrame that contains two columns: "features" (of type Vector) and "label". df_test is a DataFrame that contains the column "features" y_test = logistic.fit(df_train).transform(df_test) @@ -461,7 +461,7 @@ support vector machine (`y` with domain size `2`). import SystemML as sml # C = 1/reg svm = sml.mllearn.SVM(sqlCtx, fit_intercept=True, max_iter=100, tol=0.000001, C=1.0, is_multi_class=False) -# X_train, y_train and X_test can be NumPy matrices or Pandas DataFrame +# X_train, y_train and X_test can be NumPy matrices or Pandas DataFrame or SciPy Sparse Matrix y_test = svm.fit(X_train, y_train) # df_train is DataFrame that contains two columns: "features" (of type Vector) and "label". df_test is a DataFrame that contains the column "features" y_test = svm.fit(df_train) @@ -504,7 +504,7 @@ y_test = svm.fit(df_train)
{% highlight python %} -# X_test can be NumPy matrices or Pandas DataFrame +# X_test can be NumPy matrices or Pandas DataFrame or SciPy Sparse Matrix y_test = svm.predict(X_test) # df_test is a DataFrame that contains the column "features" of type Vector y_test = svm.transform(df_test) @@ -717,7 +717,7 @@ class labels. import SystemML as sml # C = 1/reg svm = sml.mllearn.SVM(sqlCtx, fit_intercept=True, max_iter=100, tol=0.000001, C=1.0, is_multi_class=True) -# X_train, y_train and X_test can be NumPy matrices or Pandas DataFrame +# X_train, y_train and X_test can be NumPy matrices or Pandas DataFrame or SciPy Sparse Matrix y_test = svm.fit(X_train, y_train) # df_train is DataFrame that contains two columns: "features" (of type Vector) and "label". df_test is a DataFrame that contains the column "features" y_test = svm.fit(df_train) @@ -760,7 +760,7 @@ y_test = svm.fit(df_train)
{% highlight python %} -# X_test can be NumPy matrices or Pandas DataFrame +# X_test can be NumPy matrices or Pandas DataFrame or SciPy Sparse Matrix y_test = svm.predict(X_test) # df_test is a DataFrame that contains the column "features" of type Vector y_test = svm.transform(df_test) @@ -1024,6 +1024,16 @@ applicable when all features are counts of categorical values. **Naive Bayes**:
+
+{% highlight python %} +import SystemML as sml +nb = sml.mllearn.NaiveBayes(sqlCtx, laplace=1.0) +# X_train, y_train and X_test can be NumPy matrices or Pandas DataFrame or SciPy Sparse Matrix +y_test = nb.fit(X_train, y_train) +# df_train is DataFrame that contains two columns: "features" (of type Vector) and "label". df_test is a DataFrame that contains the column "features" +y_test = nb.fit(df_train) +{% endhighlight %} +
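The `laplace` argument is the additive smoothing constant applied to the class-conditional counts. A quick worked sketch of why it matters (the exact DML formula is assumed here): a feature never observed with a class still receives a small non-zero probability.

{% highlight python %}
# A worked sketch of additive (Laplace) smoothing; the exact DML formula is assumed.
# count: occurrences of feature j in class c, total: all feature counts in class c.
def smoothed_prob(count, total, num_features, laplace=1.0):
    return (count + laplace) / (total + laplace * num_features)

print(smoothed_prob(0, 100, 20))    # unseen feature, still > 0
print(smoothed_prob(30, 100, 20))   # frequent feature
{% endhighlight %}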
hadoop jar SystemML.jar -f naive-bayes.dml -nvargs X= @@ -1055,6 +1065,14 @@ applicable when all features are counts of categorical values. **Naive Bayes Prediction**:
+
+{% highlight python %} +# X_test can be NumPy matrices or Pandas DataFrame or SciPy Sparse Matrix +y_test = nb.predict(X_test) +# df_test is a DataFrame that contains the column "features" of type Vector +y_test = nb.transform(df_test) +{% endhighlight %} +
hadoop jar SystemML.jar -f naive-bayes-predict.dml -nvargs X= @@ -1127,6 +1145,27 @@ SystemML Language Reference for details. **Naive Bayes**:
+
+{% highlight python %} +from sklearn.datasets import fetch_20newsgroups +from sklearn.feature_extraction.text import TfidfVectorizer +import SystemML as sml +from sklearn import metrics +from pyspark.sql import SQLContext +sqlCtx = SQLContext(sc) +categories = ['alt.atheism', 'talk.religion.misc', 'comp.graphics', 'sci.space'] +newsgroups_train = fetch_20newsgroups(subset='train', categories=categories) +newsgroups_test = fetch_20newsgroups(subset='test', categories=categories) +vectorizer = TfidfVectorizer() +# Both vectors and vectors_test are SciPy CSR matrix +vectors = vectorizer.fit_transform(newsgroups_train.data) +vectors_test = vectorizer.transform(newsgroups_test.data) +nb = sml.mllearn.NaiveBayes(sqlCtx) +nb.fit(vectors, newsgroups_train.target) +pred = nb.predict(vectors_test) +metrics.f1_score(newsgroups_test.target, pred, average='weighted') +{% endhighlight %} +
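The SciPy CSR matrices above are accepted directly because the Python wrapper is assumed to convert them to COO form and ship the value, row, and column arrays to the JVM-side converter. A small sketch of that conversion step, leaving the byte-buffer hand-off itself out:

{% highlight python %}
# A small sketch of the SciPy hand-off (assumes `vectors` is the CSR matrix from above);
# the actual transfer into convertSciPyCOOToMB happens on the JVM side.
import numpy as np
from scipy.sparse import coo_matrix

coo = coo_matrix(vectors, dtype=np.float64)
data = coo.data.astype(np.float64)   # non-zero values
rows = coo.row.astype(np.int32)      # their row indices
cols = coo.col.astype(np.int32)      # their column indices
print(coo.shape, len(data))          # matrix dimensions and number of non-zeros transferred
{% endhighlight %}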
hadoop jar SystemML.jar -f naive-bayes.dml -nvargs X=/user/ml/X.mtx diff --git a/docs/algorithms-regression.md b/docs/algorithms-regression.md index 2153342d1aa..5241f5f1d68 100644 --- a/docs/algorithms-regression.md +++ b/docs/algorithms-regression.md @@ -85,7 +85,7 @@ efficient when the number of features $m$ is relatively small import SystemML as sml # C = 1/reg lr = sml.mllearn.LinearRegression(sqlCtx, fit_intercept=True, C=1.0, solver='direct-solve') -# X_train, y_train and X_test can be NumPy matrices or Pandas DataFrame +# X_train, y_train and X_test can be NumPy matrices or Pandas DataFrame or SciPy Sparse Matrix y_test = lr.fit(X_train, y_train) # df_train is DataFrame that contains two columns: "features" (of type Vector) and "label". df_test is a DataFrame that contains the column "features" y_test = lr.fit(df_train) @@ -127,7 +127,7 @@ y_test = lr.fit(df_train) import SystemML as sml # C = 1/reg lr = sml.mllearn.LinearRegression(sqlCtx, fit_intercept=True, max_iter=100, tol=0.000001, C=1.0, solver='newton-cg') -# X_train, y_train and X_test can be NumPy matrices or Pandas DataFrame +# X_train, y_train and X_test can be NumPy matrices or Pandas DataFrames or SciPy Sparse matrices y_test = lr.fit(X_train, y_train) # df_train is DataFrame that contains two columns: "features" (of type Vector) and "label". df_test is a DataFrame that contains the column "features" y_test = lr.fit(df_train) diff --git a/scripts/algorithms/naive-bayes-predict.dml b/scripts/algorithms/naive-bayes-predict.dml index e6f8fa4a5e9..b687bfa77ab 100644 --- a/scripts/algorithms/naive-bayes-predict.dml +++ b/scripts/algorithms/naive-bayes-predict.dml @@ -28,7 +28,6 @@ cmdLine_Y = ifdef($Y, " ") cmdLine_accuracy = ifdef($accuracy, " ") cmdLine_confusion = ifdef($confusion, " ") -cmdLine_probabilities = ifdef($probabilities, " ") cmdLine_fmt = ifdef($fmt, "text") D = read($X) @@ -51,13 +50,13 @@ model = append(conditionals, prior) log_probs = D_w_ones %*% t(log(model)) -if(cmdLine_probabilities != " "){ - mx = rowMaxs(log_probs) - ones = matrix(1, rows=1, cols=nrow(prior)) - probs = log_probs - mx %*% ones - probs = exp(probs)/(rowSums(exp(probs)) %*% ones) - write(probs, cmdLine_probabilities, format=cmdLine_fmt) -} + +mx = rowMaxs(log_probs) +ones = matrix(1, rows=1, cols=nrow(prior)) +probs = log_probs - mx %*% ones +probs = exp(probs)/(rowSums(exp(probs)) %*% ones) +write(probs, $probabilities, format=cmdLine_fmt) + if(cmdLine_Y != " "){ C = read(cmdLine_Y) diff --git a/scripts/algorithms/naive-bayes.dml b/scripts/algorithms/naive-bayes.dml index a01a5fc0d4f..c1dc44c7c5f 100644 --- a/scripts/algorithms/naive-bayes.dml +++ b/scripts/algorithms/naive-bayes.dml @@ -74,7 +74,10 @@ acc = sum(rowIndexMax(logProbs) == C) / numRows * 100 acc_str = "Training Accuracy (%): " + acc print(acc_str) -write(acc, $accuracy) +accuracyFile = $accuracy +if(accuracyFile != " ") { + write(acc, accuracyFile) +} extraModelParams = as.matrix(numFeatures) classPrior = rbind(classPrior, extraModelParams) diff --git a/src/main/java/org/apache/sysml/api/python/SystemML.py b/src/main/java/org/apache/sysml/api/python/SystemML.py index 6f711f11767..bf25ef96e58 100644 --- a/src/main/java/org/apache/sysml/api/python/SystemML.py +++ b/src/main/java/org/apache/sysml/api/python/SystemML.py @@ -35,7 +35,8 @@ from pyspark.mllib.linalg import Vectors import sys from pyspark.ml import Estimator, Model - +from scipy.sparse import spmatrix +from scipy.sparse import coo_matrix class MLContext(object): @@ -269,7 +270,19 @@ def getNumCols(numPyArr): 
return numPyArr.shape[1] def convertToMatrixBlock(sc, src): - if isinstance(sc, SparkContext): + if isinstance(src, spmatrix): + src = coo_matrix(src, dtype=np.float64) + numRows = src.shape[0] + numCols = src.shape[1] + data = src.data.astype(np.float64) + row = src.row.astype(np.int32) + col = src.col.astype(np.int32) + nnz = len(src.col) + buf1 = bytearray(data.tostring()) + buf2 = bytearray(row.tostring()) + buf3 = bytearray(col.tostring()) + return sc._jvm.org.apache.sysml.runtime.instructions.spark.utils.RDDConverterUtilsExt.convertSciPyCOOToMB(buf1, buf2, buf3, numRows, numCols, nnz) + elif isinstance(sc, SparkContext): src = np.asarray(src) numCols = getNumCols(src) numRows = src.shape[0] @@ -319,7 +332,7 @@ def _fit(self, X): def fit(self, X, y=None, params=None): if y is None: return self._fit(X) - elif y is not None and (isinstance(X, np.ndarray) or isinstance(X, pd.core.frame.DataFrame)): + elif y is not None and (isinstance(X, np.ndarray) or isinstance(X, pd.core.frame.DataFrame) or isinstance(X, spmatrix)): if self.transferUsingDF: pdfX = convertToPandasDF(X) pdfY = convertToPandasDF(y) @@ -346,7 +359,7 @@ def transform(self, X): return self.predict(X) def predict(self, X): - if isinstance(X, np.ndarray) or isinstance(X, pd.core.frame.DataFrame): + if isinstance(X, np.ndarray) or isinstance(X, pd.core.frame.DataFrame) or isinstance(X, spmatrix): if self.transferUsingDF: pdfX = convertToPandasDF(X) df = assemble(self.sqlCtx, pdfX, pdfX.columns, 'features').select('features') @@ -442,5 +455,15 @@ def __init__(self, sqlCtx, fit_intercept=True, max_iter=100, tol=0.000001, C=1.0 self.estimator.setTol(tol) self.estimator.setIcpt(int(fit_intercept)) self.transferUsingDF = transferUsingDF - self.setOutputRawPredictionsToFalse = False - \ No newline at end of file + self.setOutputRawPredictionsToFalse = False + + class NaiveBayes(BaseSystemMLEstimator): + + def __init__(self, sqlCtx, laplace=1.0, transferUsingDF=False): + self.sqlCtx = sqlCtx + self.sc = sqlCtx._sc + self.uid = "nb" + self.estimator = self.sc._jvm.org.apache.sysml.api.ml.NaiveBayes(self.uid, self.sc._jsc.sc()) + self.estimator.setLaplace(laplace) + self.transferUsingDF = transferUsingDF + self.setOutputRawPredictionsToFalse = False \ No newline at end of file diff --git a/src/main/java/org/apache/sysml/api/python/test.py b/src/main/java/org/apache/sysml/api/python/test.py index 9a9ee055b2d..21a1f79fd5c 100644 --- a/src/main/java/org/apache/sysml/api/python/test.py +++ b/src/main/java/org/apache/sysml/api/python/test.py @@ -1,3 +1,24 @@ +#!/usr/bin/python +#------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+# +#------------------------------------------------------------- from sklearn import datasets, neighbors import SystemML as sml from pyspark.sql import SQLContext @@ -7,6 +28,9 @@ from pyspark.ml import Pipeline from pyspark.ml.feature import HashingTF, Tokenizer import numpy as np +from sklearn.datasets import fetch_20newsgroups +from sklearn.feature_extraction.text import TfidfVectorizer +from sklearn import metrics sc = SparkContext() sqlCtx = SQLContext(sc) @@ -122,5 +146,33 @@ def testSVMSK2(self): score = svm.fit(X_train, y_train).score(X_test, y_test) self.failUnless(score > 0.9) + def testNaiveBayesSK1(self): + digits = datasets.load_digits() + X_digits = digits.data + y_digits = digits.target + n_samples = len(X_digits) + X_train = X_digits[:.9 * n_samples] + y_train = y_digits[:.9 * n_samples] + X_test = X_digits[.9 * n_samples:] + y_test = y_digits[.9 * n_samples:] + nb = sml.mllearn.NaiveBayes(sqlCtx) + score = nb.fit(X_train, y_train).score(X_test, y_test) + self.failUnless(score > 0.85) + + def testNaiveBayesSK2(self): + categories = ['alt.atheism', 'talk.religion.misc', 'comp.graphics', 'sci.space'] + newsgroups_train = fetch_20newsgroups(subset='train', categories=categories) + newsgroups_test = fetch_20newsgroups(subset='test', categories=categories) + vectorizer = TfidfVectorizer() + # Both vectors and vectors_test are SciPy CSR matrix + vectors = vectorizer.fit_transform(newsgroups_train.data) + vectors_test = vectorizer.transform(newsgroups_test.data) + nb = sml.mllearn.NaiveBayes(sqlCtx) + nb.fit(vectors, newsgroups_train.target) + pred = nb.predict(vectors_test) + score = metrics.f1_score(newsgroups_test.target, pred, average='weighted') + self.failUnless(score > 0.8) + + if __name__ == '__main__': unittest.main() diff --git a/src/main/java/org/apache/sysml/runtime/instructions/spark/utils/RDDConverterUtilsExt.java b/src/main/java/org/apache/sysml/runtime/instructions/spark/utils/RDDConverterUtilsExt.java index 114e78fc3d3..72ab2303ecf 100644 --- a/src/main/java/org/apache/sysml/runtime/instructions/spark/utils/RDDConverterUtilsExt.java +++ b/src/main/java/org/apache/sysml/runtime/instructions/spark/utils/RDDConverterUtilsExt.java @@ -266,6 +266,24 @@ public static MatrixBlock convertPy4JArrayToMB(byte [] data, int rlen, int clen) return convertPy4JArrayToMB(data, rlen, clen, false); } + public static MatrixBlock convertSciPyCOOToMB(byte [] data, byte [] row, byte [] col, int rlen, int clen, int nnz) throws DMLRuntimeException { + MatrixBlock mb = new MatrixBlock(rlen, clen, true); + mb.allocateSparseRowsBlock(false); + ByteBuffer buf1 = ByteBuffer.wrap(data); + buf1.order(ByteOrder.nativeOrder()); + ByteBuffer buf2 = ByteBuffer.wrap(row); + buf2.order(ByteOrder.nativeOrder()); + ByteBuffer buf3 = ByteBuffer.wrap(col); + buf3.order(ByteOrder.nativeOrder()); + for(int i = 0; i < nnz; i++) { + double val = buf1.getDouble(); + int rowIndex = buf2.getInt(); + int colIndex = buf3.getInt(); + mb.setValue(rowIndex, colIndex, val); // TODO: Improve the performance + } + return mb; + } + public static MatrixBlock convertPy4JArrayToMB(byte [] data, int rlen, int clen, boolean isSparse) throws DMLRuntimeException { MatrixBlock mb = new MatrixBlock(rlen, clen, isSparse, -1); if(isSparse) { diff --git a/src/main/scala/org/apache/sysml/api/ml/LogisticRegression.scala b/src/main/scala/org/apache/sysml/api/ml/LogisticRegression.scala index f9ddf9c9964..7e6b9223dc6 100644 --- a/src/main/scala/org/apache/sysml/api/ml/LogisticRegression.scala +++ 
b/src/main/scala/org/apache/sysml/api/ml/LogisticRegression.scala @@ -159,7 +159,7 @@ class LogisticRegressionModel( if(ret.getNumColumns != 1) { throw new RuntimeException("Expected predicted label to be a column vector") } - PredictionUtils.updateLabels(true, null, ret, null, labelMapping) + PredictionUtils.updateLabels(isSingleNode, null, ret, null, labelMapping) return ret } } diff --git a/src/main/scala/org/apache/sysml/api/ml/NaiveBayes.scala b/src/main/scala/org/apache/sysml/api/ml/NaiveBayes.scala new file mode 100644 index 00000000000..a6fc367f41d --- /dev/null +++ b/src/main/scala/org/apache/sysml/api/ml/NaiveBayes.scala @@ -0,0 +1,156 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.sysml.api.ml + +import java.io.File +import org.apache.spark.SparkContext +import org.apache.spark.ml.{ Model, Estimator } +import org.apache.spark.sql.DataFrame +import org.apache.spark.sql.types.StructType +import org.apache.spark.ml.param.{ Params, Param, ParamMap, DoubleParam } +import org.apache.sysml.api.{ MLContext, MLOutput } +import org.apache.sysml.runtime.matrix.MatrixCharacteristics +import org.apache.sysml.runtime.matrix.data.MatrixBlock +import org.apache.sysml.runtime.DMLRuntimeException +import org.apache.sysml.runtime.instructions.spark.utils.{ RDDConverterUtilsExt => RDDConverterUtils } + +trait HasLaplace extends Params { + final val laplace: Param[Double] = new Param[Double](this, "laplace", "Laplace smoothing specified by the user to avoid creation of 0 probabilities.") + setDefault(laplace, 1.0) + final def getLaplace: Double = $(laplace) +} + +object NaiveBayes { + final val scriptPath = "scripts" + File.separator + "algorithms" + File.separator + "naive-bayes.dml" +} + +class NaiveBayes(override val uid: String, val sc: SparkContext) extends Estimator[NaiveBayesModel] with HasLaplace { + override def copy(extra: ParamMap): Estimator[NaiveBayesModel] = { + val that = new NaiveBayes(uid, sc) + copyValues(that, extra) + } + def setLaplace(value: Double) = set(laplace, value) + override def transformSchema(schema: StructType): StructType = schema + + // Note: will update the y_mb as this will be called by Python mllearn + def fit(X_mb: MatrixBlock, y_mb: MatrixBlock): NaiveBayesModel = { + val ml = new MLContext(sc) + val revLabelMapping = new java.util.HashMap[Int, String] + PredictionUtils.fillLabelMapping(y_mb, revLabelMapping) + + val mloutput = { + ml.registerInput("D", X_mb); + ml.registerInput("C", y_mb); + ml.registerOutput("classPrior"); + ml.registerOutput("classConditionals"); + ml.executeScript(ScriptsUtils.getDMLScript(NaiveBayes.scriptPath), getParamMap()) + } + new NaiveBayesModel("naivebayes")(mloutput, revLabelMapping, sc) + } + + def fit(df: DataFrame): NaiveBayesModel = { + val ml 
= new MLContext(df.rdd.sparkContext) + val mcXin = new MatrixCharacteristics() + val Xin = RDDConverterUtils.vectorDataFrameToBinaryBlock(sc, df, mcXin, false, "features") + val revLabelMapping = new java.util.HashMap[Int, String] + val yin = PredictionUtils.fillLabelMapping(df, revLabelMapping) + val mloutput = { + ml.registerInput("D", Xin, mcXin); + ml.registerInput("C", yin, "csv"); + ml.registerOutput("classPrior"); + ml.registerOutput("classConditionals"); + ml.executeScript(ScriptsUtils.getDMLScript(NaiveBayes.scriptPath), getParamMap()) + } + new NaiveBayesModel("naive")(mloutput, revLabelMapping, sc) + } + + def getParamMap(): Map[String, String] = { + Map("X" -> " ", + "Y" -> " ", + "prior" -> " ", + "conditionals" -> " ", + "accuracy" -> " ", + "laplace" -> getLaplace.toString()) + } +} + + +object NaiveBayesModel { + final val scriptPath = "scripts" + File.separator + "algorithms" + File.separator + "naive-bayes-predict.dml" +} + +class NaiveBayesModel( + override val uid: String)( + val mloutput: MLOutput, val labelMapping: java.util.HashMap[Int, String], val sc: SparkContext) extends Model[NaiveBayesModel] with HasLaplace { + override def copy(extra: ParamMap): NaiveBayesModel = { + val that = new NaiveBayesModel(uid)(mloutput, labelMapping, sc) + copyValues(that, extra) + } + + def transformSchema(schema: StructType): StructType = schema + + var priorMB: MatrixBlock = null + var conditionalMB: MatrixBlock = null + def setPriorAndConditional(prior:MatrixBlock, conditional:MatrixBlock) { + priorMB = prior + conditionalMB = conditional + } + + def transform(X: MatrixBlock): MatrixBlock = { + val isSingleNode = true + val ml = new MLContext(sc) + ml.registerInput("D", X) + ml.registerInput("prior", mloutput.getMatrixBlock("classPrior"), mloutput.getMatrixCharacteristics("classPrior")) + ml.registerInput("conditionals", mloutput.getMatrixBlock("classConditionals"), mloutput.getMatrixCharacteristics("classConditionals")) + ml.registerOutput("probs") + val nbPredict = ml.executeScript(ScriptsUtils.getDMLScript(NaiveBayesModel.scriptPath), getPredictParams()) + val ret = PredictionUtils.computePredictedClassLabelsFromProbability(nbPredict, isSingleNode, sc, "probs").getMatrixBlock("Prediction"); + if(ret.getNumColumns != 1) { + throw new RuntimeException("Expected predicted label to be a column vector") + } + PredictionUtils.updateLabels(isSingleNode, null, ret, null, labelMapping) + return ret + } + + def transform(df: org.apache.spark.sql.DataFrame): org.apache.spark.sql.DataFrame = { + val isSingleNode = false + val ml = new MLContext(sc) + val mcXin = new MatrixCharacteristics() + val Xin = RDDConverterUtils.vectorDataFrameToBinaryBlock(df.rdd.sparkContext, df, mcXin, false, "features") + ml.registerInput("D", Xin, mcXin); + ml.registerInput("prior", mloutput.getMatrixBlock("classPrior"), mloutput.getMatrixCharacteristics("classPrior")) + ml.registerInput("conditionals", mloutput.getMatrixBlock("classConditionals"), mloutput.getMatrixCharacteristics("classConditionals")) + ml.registerOutput("probs") + val nbPredict = ml.executeScript(ScriptsUtils.getDMLScript(NaiveBayesModel.scriptPath), getPredictParams()) + val predLabelOut = PredictionUtils.computePredictedClassLabelsFromProbability(nbPredict, isSingleNode, sc, "probs") + val predictedDF = PredictionUtils.updateLabels(isSingleNode, predLabelOut.getDF(df.sqlContext, "Prediction"), null, "C1", labelMapping).select("ID", "prediction") + val prob = nbPredict.getDF(df.sqlContext, "probs", true).withColumnRenamed("C1", 
"probability").select("ID", "probability") + val dataset = RDDConverterUtils.addIDToDataFrame(df, df.sqlContext, "ID") + return PredictionUtils.joinUsingID(dataset, PredictionUtils.joinUsingID(prob, predictedDF)) + } + + def getPredictParams(): Map[String, String] = { + Map("X" -> " ", + "prior" -> " ", + "conditionals" -> " ", + "probabilities" -> " ") + } + +} \ No newline at end of file From 21e91c7dc6bbe0ea7314e262c890b222c677935f Mon Sep 17 00:00:00 2001 From: Niketan Pansare Date: Mon, 8 Aug 2016 22:57:37 -0700 Subject: [PATCH 13/14] Added BaseSystemMLClassifier and updated the classifier to use new MLContext --- .../java/org/apache/sysml/api/MLContext.java | 142 +++++++++------- .../java/org/apache/sysml/api/MLOutput.java | 48 +++--- .../api/mlcontext/BinaryBlockMatrix.java | 9 + .../mlcontext/MLContextConversionUtil.java | 13 +- .../sysml/api/mlcontext/MLContextUtil.java | 29 +++- .../apache/sysml/api/mlcontext/MLResults.java | 9 +- .../apache/sysml/api/mlcontext/Matrix.java | 2 +- .../org/apache/sysml/api/python/SystemML.py | 17 +- .../sysml/api/ml/BaseSystemMLClassifier.scala | 157 ++++++++++++++++++ .../sysml/api/ml/LogisticRegression.scala | 154 +++++------------ .../org/apache/sysml/api/ml/NaiveBayes.scala | 131 +++++---------- .../apache/sysml/api/ml/PredictionUtils.scala | 36 ++++ .../scala/org/apache/sysml/api/ml/SVM.scala | 156 +++++------------ 13 files changed, 479 insertions(+), 424 deletions(-) create mode 100644 src/main/scala/org/apache/sysml/api/ml/BaseSystemMLClassifier.scala diff --git a/src/main/java/org/apache/sysml/api/MLContext.java b/src/main/java/org/apache/sysml/api/MLContext.java index 54f313e7026..d8a290d107a 100644 --- a/src/main/java/org/apache/sysml/api/MLContext.java +++ b/src/main/java/org/apache/sysml/api/MLContext.java @@ -65,6 +65,7 @@ import org.apache.sysml.runtime.controlprogram.caching.MatrixObject; import org.apache.sysml.runtime.controlprogram.context.ExecutionContext; import org.apache.sysml.runtime.controlprogram.context.ExecutionContextFactory; +import org.apache.sysml.runtime.controlprogram.context.SparkExecutionContext; import org.apache.sysml.runtime.instructions.Instruction; import org.apache.sysml.runtime.instructions.cp.Data; import org.apache.sysml.runtime.instructions.spark.data.RDDObject; @@ -476,25 +477,6 @@ public void registerInput(String varName, RDD rdd, String format, long r registerInput(varName, rdd.toJavaRDD().mapToPair(new ConvertStringToLongTextPair()), format, rlen, clen, nnz, null); } - public void registerInput(String varName, MatrixBlock mb) throws DMLRuntimeException { - MatrixCharacteristics mc = new MatrixCharacteristics(mb.getNumRows(), mb.getNumColumns(), OptimizerUtils.DEFAULT_BLOCKSIZE, OptimizerUtils.DEFAULT_BLOCKSIZE, mb.getNonZeros()); - registerInput(varName, mb, mc); - } - - public void registerInput(String varName, MatrixBlock mb, MatrixCharacteristics mc) throws DMLRuntimeException { - if(_variables == null) - _variables = new LocalVariableMap(); - if(_inVarnames == null) - _inVarnames = new ArrayList(); - - MatrixObject mo = new MatrixObject(ValueType.DOUBLE, "temp", new MatrixFormatMetaData(mc, OutputInfo.BinaryBlockOutputInfo, InputInfo.BinaryBlockInputInfo)); - mo.acquireModify(mb); - mo.release(); - _variables.put(varName, mo); - _inVarnames.add(varName); - checkIfRegisteringInputAllowed(); - } - // All CSV related methods call this ... It provides access to dimensions, nnz, file properties. 
private void registerInput(String varName, JavaPairRDD textOrCsv_rdd, String format, long rlen, long clen, long nnz, FileFormatProperties props) throws DMLRuntimeException { if(!(DMLScript.rtplatform == RUNTIME_PLATFORM.SPARK || DMLScript.rtplatform == RUNTIME_PLATFORM.HYBRID_SPARK)) { @@ -618,6 +600,24 @@ public void registerInput(String varName, JavaPairRDD checkIfRegisteringInputAllowed(); } + public void registerInput(String varName, MatrixBlock mb) throws DMLRuntimeException { + MatrixCharacteristics mc = new MatrixCharacteristics(mb.getNumRows(), mb.getNumColumns(), OptimizerUtils.DEFAULT_BLOCKSIZE, OptimizerUtils.DEFAULT_BLOCKSIZE, mb.getNonZeros()); + registerInput(varName, mb, mc); + } + + public void registerInput(String varName, MatrixBlock mb, MatrixCharacteristics mc) throws DMLRuntimeException { + if(_variables == null) + _variables = new LocalVariableMap(); + if(_inVarnames == null) + _inVarnames = new ArrayList(); + MatrixObject mo = new MatrixObject(ValueType.DOUBLE, "temp", new MatrixFormatMetaData(mc, OutputInfo.BinaryBlockOutputInfo, InputInfo.BinaryBlockInputInfo)); + mo.acquireModify(mb); + mo.release(); + _variables.put(varName, mo); + _inVarnames.add(varName); + checkIfRegisteringInputAllowed(); + } + // ============================================================================================= /** @@ -1240,56 +1240,80 @@ private MLOutput compileAndExecuteScript(String dmlScriptFilePath, String [] arg * @throws ParseException */ private synchronized MLOutput compileAndExecuteScript(String dmlScriptFilePath, String [] args, boolean isFile, boolean isNamedArgument, boolean isPyDML, String configFilePath) throws IOException, DMLException { - // Set active MLContext. - _activeMLContext = this; - - if(_monitorUtils != null) { - _monitorUtils.resetMonitoringData(); - } - - if(DMLScript.rtplatform == RUNTIME_PLATFORM.SPARK || DMLScript.rtplatform == RUNTIME_PLATFORM.HYBRID_SPARK) { - - // Depending on whether registerInput/registerOutput was called initialize the variables - String[] inputs; String[] outputs; - if(_inVarnames != null) { - inputs = _inVarnames.toArray(new String[0]); - } - else { - inputs = new String[0]; - } - if(_outVarnames != null) { - outputs = _outVarnames.toArray(new String[0]); - } - else { - outputs = new String[0]; + try { + if(getActiveMLContext() != null) { + throw new DMLRuntimeException("SystemML (and hence by definition MLContext) doesnot support parallel execute() calls from same or different MLContexts. " + + "As a temporary fix, please do explicit synchronization, i.e. synchronized(MLContext.class) { ml.execute(...) } "); } - Map outMetadata = new HashMap(); - Map argVals = DMLScript.createArgumentsMap(isNamedArgument, args); + // Set active MLContext. 
+ _activeMLContext = this; - // Run the DML script - ExecutionContext ec = executeUsingSimplifiedCompilationChain(dmlScriptFilePath, isFile, argVals, isPyDML, inputs, outputs, _variables, configFilePath); + if(_monitorUtils != null) { + _monitorUtils.resetMonitoringData(); + } - // Now collect the output - if(_outVarnames != null) { - if(_variables == null) { - throw new DMLRuntimeException("The symbol table returned after executing the script is empty"); + if(DMLScript.rtplatform == RUNTIME_PLATFORM.SPARK || DMLScript.rtplatform == RUNTIME_PLATFORM.HYBRID_SPARK) { + + Map> retVal = null; + + // Depending on whether registerInput/registerOutput was called initialize the variables + String[] inputs; String[] outputs; + if(_inVarnames != null) { + inputs = _inVarnames.toArray(new String[0]); + } + else { + inputs = new String[0]; + } + if(_outVarnames != null) { + outputs = _outVarnames.toArray(new String[0]); } + else { + outputs = new String[0]; + } + Map outMetadata = new HashMap(); + + Map argVals = DMLScript.createArgumentsMap(isNamedArgument, args); - for( String ovar : _outVarnames ) { - if( _variables.keySet().contains(ovar) ) { - outMetadata.put(ovar, ec.getMatrixCharacteristics(ovar)); // For converting output to dataframe + // Run the DML script + ExecutionContext ec = executeUsingSimplifiedCompilationChain(dmlScriptFilePath, isFile, argVals, isPyDML, inputs, outputs, _variables, configFilePath); + + // Now collect the output + if(_outVarnames != null) { + if(_variables == null) { + throw new DMLRuntimeException("The symbol table returned after executing the script is empty"); } - else { - throw new DMLException("Error: The variable " + ovar + " is not available as output after the execution of the DMLScript."); + + for( String ovar : _outVarnames ) { + if( _variables.keySet().contains(ovar) ) { + if(retVal == null) { + retVal = new HashMap>(); + } + retVal.put(ovar, ((SparkExecutionContext) ec).getBinaryBlockRDDHandleForVariable(ovar)); + outMetadata.put(ovar, ec.getMatrixCharacteristics(ovar)); // For converting output to dataframe + } + else { + throw new DMLException("Error: The variable " + ovar + " is not available as output after the execution of the DMLScript."); + } } } + + return new MLOutput(retVal, outMetadata); } - - return new MLOutput(_variables, ec, outMetadata); + else { + throw new DMLRuntimeException("Unsupported runtime:" + DMLScript.rtplatform.name()); + } + } - else { - throw new DMLRuntimeException("Unsupported runtime:" + DMLScript.rtplatform.name()); + finally { + // Remove global dml config and all thread-local configs + // TODO enable cleanup whenever invalid GNMF MLcontext is fixed + // (the test is invalid because it assumes that status of previous execute is kept) + //ConfigurationManager.setGlobalConfig(new DMLConfig()); + //ConfigurationManager.clearLocalConfigs(); + + // Reset active MLContext. 
+ _activeMLContext = null; } } @@ -1451,4 +1475,4 @@ public MLMatrix read(SQLContext sqlContext, String filePath, String format) thro // return MLMatrix.createMLMatrix(this, sqlContext, blocks, mc); // } -} +} \ No newline at end of file diff --git a/src/main/java/org/apache/sysml/api/MLOutput.java b/src/main/java/org/apache/sysml/api/MLOutput.java index 3ef68a9f151..55daf176510 100644 --- a/src/main/java/org/apache/sysml/api/MLOutput.java +++ b/src/main/java/org/apache/sysml/api/MLOutput.java @@ -39,8 +39,6 @@ import org.apache.spark.sql.types.StructField; import org.apache.spark.sql.types.StructType; import org.apache.sysml.runtime.DMLRuntimeException; -import org.apache.sysml.runtime.controlprogram.LocalVariableMap; -import org.apache.sysml.runtime.controlprogram.context.ExecutionContext; import org.apache.sysml.runtime.controlprogram.context.SparkExecutionContext; import org.apache.sysml.runtime.instructions.spark.functions.GetMLBlock; import org.apache.sysml.runtime.instructions.spark.utils.RDDConverterUtilsExt; @@ -48,7 +46,7 @@ import org.apache.sysml.runtime.matrix.data.MatrixBlock; import org.apache.sysml.runtime.matrix.data.MatrixIndexes; import org.apache.sysml.runtime.util.UtilFunctions; -import org.apache.sysml.runtime.controlprogram.caching.MatrixObject; + import scala.Tuple2; /** @@ -57,39 +55,31 @@ */ public class MLOutput { - private LocalVariableMap _variables; - private ExecutionContext _ec; + Map> _outputs; private Map _outMetadata = null; - public MLOutput(LocalVariableMap variables, ExecutionContext ec, Map outMetadata) { - this._variables = variables; - this._ec = ec; - this._outMetadata = outMetadata; - } - public MatrixBlock getMatrixBlock(String varName) throws DMLRuntimeException { - if( _variables.keySet().contains(varName) ) { - MatrixObject mo = _ec.getMatrixObject(varName); - MatrixBlock mb = mo.acquireRead(); - mo.release(); - return mb; - } - else { - throw new DMLRuntimeException("Variable " + varName + " not found in the output symbol table."); - } + MatrixCharacteristics mc = getMatrixCharacteristics(varName); + // The matrix block is always pushed to an RDD and then we do collect + // We can later avoid this by returning symbol table rather than "Map> _outputs" + MatrixBlock mb = SparkExecutionContext.toMatrixBlock(getBinaryBlockedRDD(varName), (int) mc.getRows(), (int) mc.getCols(), + mc.getRowsPerBlock(), mc.getColsPerBlock(), mc.getNonZeros()); + return mb; + } + public MLOutput(Map> outputs, Map outMetadata) { + this._outputs = outputs; + this._outMetadata = outMetadata; } public JavaPairRDD getBinaryBlockedRDD(String varName) throws DMLRuntimeException { - if( _variables.keySet().contains(varName) ) { - return ((SparkExecutionContext) _ec).getBinaryBlockRDDHandleForVariable(varName); - } - else { - throw new DMLRuntimeException("Variable " + varName + " not found in the output symbol table."); + if(_outputs.containsKey(varName)) { + return _outputs.get(varName); } + throw new DMLRuntimeException("Variable " + varName + " not found in the output symbol table."); } public MatrixCharacteristics getMatrixCharacteristics(String varName) throws DMLRuntimeException { - if(_outMetadata.containsKey(varName)) { + if(_outputs.containsKey(varName)) { return _outMetadata.get(varName); } throw new DMLRuntimeException("Variable " + varName + " not found in the output symbol table."); @@ -255,7 +245,7 @@ public Iterable>> call(Tuple2>> retVal = new ArrayList>>(); for(int i = 0; i < lrlen; i++) { @@ -263,7 +253,7 @@ public Iterable>> call(Tuple2>(startRowIndex + i 
+ 1, new Tuple2(kv._1.getColumnIndex(), partialRow))); + retVal.add(new Tuple2>(startRowIndex + i, new Tuple2(kv._1.getColumnIndex(), partialRow))); } return retVal; } @@ -427,4 +417,4 @@ public Row call(Tuple2>> arg0) return RowFactory.create(row); } } -} +} \ No newline at end of file diff --git a/src/main/java/org/apache/sysml/api/mlcontext/BinaryBlockMatrix.java b/src/main/java/org/apache/sysml/api/mlcontext/BinaryBlockMatrix.java index 8c9f923ad5d..ea6fcf0a55b 100644 --- a/src/main/java/org/apache/sysml/api/mlcontext/BinaryBlockMatrix.java +++ b/src/main/java/org/apache/sysml/api/mlcontext/BinaryBlockMatrix.java @@ -21,6 +21,8 @@ import org.apache.spark.api.java.JavaPairRDD; import org.apache.spark.sql.DataFrame; +import org.apache.sysml.runtime.DMLRuntimeException; +import org.apache.sysml.runtime.controlprogram.context.SparkExecutionContext; import org.apache.sysml.runtime.matrix.MatrixCharacteristics; import org.apache.sysml.runtime.matrix.data.MatrixBlock; import org.apache.sysml.runtime.matrix.data.MatrixIndexes; @@ -97,6 +99,13 @@ public BinaryBlockMatrix(JavaPairRDD binaryBlocks, public JavaPairRDD getBinaryBlocks() { return binaryBlocks; } + + public MatrixBlock getMatrixBlock() throws DMLRuntimeException { + MatrixCharacteristics mc = getMatrixCharacteristics(); + MatrixBlock mb = SparkExecutionContext.toMatrixBlock(binaryBlocks, (int) mc.getRows(), (int) mc.getCols(), + mc.getRowsPerBlock(), mc.getColsPerBlock(), mc.getNonZeros()); + return mb; + } /** * Obtain the SystemML binary-block matrix characteristics diff --git a/src/main/java/org/apache/sysml/api/mlcontext/MLContextConversionUtil.java b/src/main/java/org/apache/sysml/api/mlcontext/MLContextConversionUtil.java index 33226d2b87a..161ad174fd9 100644 --- a/src/main/java/org/apache/sysml/api/mlcontext/MLContextConversionUtil.java +++ b/src/main/java/org/apache/sysml/api/mlcontext/MLContextConversionUtil.java @@ -676,7 +676,7 @@ public static double[][] matrixObjectToDoubleMatrix(MatrixObject matrixObject) { * @return the {@code MatrixObject} converted to a {@code DataFrame} */ public static DataFrame matrixObjectToDataFrame(MatrixObject matrixObject, - SparkExecutionContext sparkExecutionContext) { + SparkExecutionContext sparkExecutionContext, boolean isVectorDF) { try { @SuppressWarnings("unchecked") JavaPairRDD binaryBlockMatrix = (JavaPairRDD) sparkExecutionContext @@ -686,8 +686,17 @@ public static DataFrame matrixObjectToDataFrame(MatrixObject matrixObject, MLContext activeMLContext = (MLContext) MLContextProxy.getActiveMLContext(); SparkContext sc = activeMLContext.getSparkContext(); SQLContext sqlContext = new SQLContext(sc); - DataFrame df = RDDConverterUtilsExt.binaryBlockToDataFrame(binaryBlockMatrix, matrixCharacteristics, + DataFrame df = null; + if(isVectorDF) { + df = RDDConverterUtilsExt.binaryBlockToVectorDataFrame(binaryBlockMatrix, matrixCharacteristics, + sqlContext); + } + else { + df = RDDConverterUtilsExt.binaryBlockToDataFrame(binaryBlockMatrix, matrixCharacteristics, sqlContext); + } + + return df; } catch (DMLRuntimeException e) { throw new MLContextException("DMLRuntimeException while converting matrix object to DataFrame", e); diff --git a/src/main/java/org/apache/sysml/api/mlcontext/MLContextUtil.java b/src/main/java/org/apache/sysml/api/mlcontext/MLContextUtil.java index feb616ecefd..fc942e98c52 100644 --- a/src/main/java/org/apache/sysml/api/mlcontext/MLContextUtil.java +++ b/src/main/java/org/apache/sysml/api/mlcontext/MLContextUtil.java @@ -44,7 +44,9 @@ import 
org.apache.sysml.conf.ConfigurationManager; import org.apache.sysml.conf.DMLConfig; import org.apache.sysml.parser.ParseException; +import org.apache.sysml.parser.Expression.ValueType; import org.apache.sysml.runtime.controlprogram.LocalVariableMap; +import org.apache.sysml.runtime.controlprogram.caching.CacheException; import org.apache.sysml.runtime.controlprogram.caching.FrameObject; import org.apache.sysml.runtime.controlprogram.caching.MatrixObject; import org.apache.sysml.runtime.instructions.cp.BooleanObject; @@ -52,8 +54,12 @@ import org.apache.sysml.runtime.instructions.cp.DoubleObject; import org.apache.sysml.runtime.instructions.cp.IntObject; import org.apache.sysml.runtime.instructions.cp.StringObject; +import org.apache.sysml.runtime.matrix.MatrixCharacteristics; +import org.apache.sysml.runtime.matrix.MatrixFormatMetaData; +import org.apache.sysml.runtime.matrix.data.InputInfo; import org.apache.sysml.runtime.matrix.data.MatrixBlock; import org.apache.sysml.runtime.matrix.data.MatrixIndexes; +import org.apache.sysml.runtime.matrix.data.OutputInfo; /** * Utility class containing methods for working with the MLContext API. @@ -72,7 +78,7 @@ public final class MLContextUtil { */ @SuppressWarnings("rawtypes") public static final Class[] COMPLEX_DATA_TYPES = { JavaRDD.class, RDD.class, DataFrame.class, - BinaryBlockMatrix.class, Matrix.class, (new double[][] {}).getClass() }; + BinaryBlockMatrix.class, Matrix.class, (new double[][] {}).getClass(), MatrixBlock.class }; /** * All data types supported by the MLContext API @@ -391,6 +397,8 @@ public static Map convertInputParametersForParser(Map getRDDStringIJV(String outputName) { */ public DataFrame getDataFrame(String outputName) { MatrixObject mo = getMatrixObject(outputName); - DataFrame df = MLContextConversionUtil.matrixObjectToDataFrame(mo, sparkExecutionContext); + DataFrame df = MLContextConversionUtil.matrixObjectToDataFrame(mo, sparkExecutionContext, false); + return df; + } + + public DataFrame getDataFrame(String outputName, boolean isVectorDF) { + MatrixObject mo = getMatrixObject(outputName); + DataFrame df = MLContextConversionUtil.matrixObjectToDataFrame(mo, sparkExecutionContext, isVectorDF); return df; } @@ -271,6 +277,7 @@ public Matrix getMatrix(String outputName) { Matrix matrix = new Matrix(mo, sparkExecutionContext); return matrix; } + /** * Obtain an output as a {@code BinaryBlockMatrix}. 
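A minimal Scala sketch of how the accessors added above might be exercised through the new MLContext API, assuming an existing SparkContext `sc`; the one-line DML string and the output name `B_out` are illustrative only:

    import org.apache.sysml.api.mlcontext._
    import org.apache.sysml.api.mlcontext.ScriptFactory._
    val ml = new MLContext(sc)
    // Any DML script with a registered output works here
    val script = dml("B_out = matrix(1, rows=3, cols=3)").out("B_out")
    val results = ml.execute(script)
    val df = results.getDataFrame("B_out", true)                      // DataFrame with a Vector column
    val mb = results.getBinaryBlockMatrix("B_out").getMatrixBlock     // local MatrixBlock
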
diff --git a/src/main/java/org/apache/sysml/api/mlcontext/Matrix.java b/src/main/java/org/apache/sysml/api/mlcontext/Matrix.java index 178a6e5a04d..3ee41b7fa85 100644 --- a/src/main/java/org/apache/sysml/api/mlcontext/Matrix.java +++ b/src/main/java/org/apache/sysml/api/mlcontext/Matrix.java @@ -108,7 +108,7 @@ public RDD asRDDStringIJV() { * @return the matrix as a {@code DataFrame} */ public DataFrame asDataFrame() { - DataFrame df = MLContextConversionUtil.matrixObjectToDataFrame(matrixObject, sparkExecutionContext); + DataFrame df = MLContextConversionUtil.matrixObjectToDataFrame(matrixObject, sparkExecutionContext, false); return df; } diff --git a/src/main/java/org/apache/sysml/api/python/SystemML.py b/src/main/java/org/apache/sysml/api/python/SystemML.py index bf25ef96e58..c03bd1bab49 100644 --- a/src/main/java/org/apache/sysml/api/python/SystemML.py +++ b/src/main/java/org/apache/sysml/api/python/SystemML.py @@ -38,6 +38,8 @@ from scipy.sparse import spmatrix from scipy.sparse import coo_matrix +SUPPORTED_TYPES = (np.ndarray, pd.DataFrame, spmatrix) + class MLContext(object): """ @@ -274,7 +276,7 @@ def convertToMatrixBlock(sc, src): src = coo_matrix(src, dtype=np.float64) numRows = src.shape[0] numCols = src.shape[1] - data = src.data.astype(np.float64) + data = src.data row = src.row.astype(np.int32) col = src.col.astype(np.int32) nnz = len(src.col) @@ -308,12 +310,7 @@ def convertToPandasDF(X): return X def tolist(inputCols): - if isinstance(inputCols, pd.indexes.base.Index): - return inputCols.get_values().tolist() - elif isinstance(inputCols, list): - return inputCols - else: - raise Exception('inputCols should be of type pandas.indexes.base.Index or list') + return list(inputCols) def assemble(sqlCtx, pdf, inputCols, outputCol): tmpDF = sqlCtx.createDataFrame(pdf, tolist(pdf.columns)) @@ -322,6 +319,8 @@ def assemble(sqlCtx, pdf, inputCols, outputCol): class mllearn: class BaseSystemMLEstimator(Estimator): + # TODO: Allow users to set featuresCol (with default 'features') and labelCol (with default 'label') + def _fit(self, X): if hasattr(X, '_jdf') and 'features' in X.columns and 'label' in X.columns: self.model = self.estimator.fit(X._jdf) @@ -332,7 +331,7 @@ def _fit(self, X): def fit(self, X, y=None, params=None): if y is None: return self._fit(X) - elif y is not None and (isinstance(X, np.ndarray) or isinstance(X, pd.core.frame.DataFrame) or isinstance(X, spmatrix)): + elif y is not None and isinstance(X, SUPPORTED_TYPES): if self.transferUsingDF: pdfX = convertToPandasDF(X) pdfY = convertToPandasDF(y) @@ -359,7 +358,7 @@ def transform(self, X): return self.predict(X) def predict(self, X): - if isinstance(X, np.ndarray) or isinstance(X, pd.core.frame.DataFrame) or isinstance(X, spmatrix): + if isinstance(X, SUPPORTED_TYPES): if self.transferUsingDF: pdfX = convertToPandasDF(X) df = assemble(self.sqlCtx, pdfX, pdfX.columns, 'features').select('features') diff --git a/src/main/scala/org/apache/sysml/api/ml/BaseSystemMLClassifier.scala b/src/main/scala/org/apache/sysml/api/ml/BaseSystemMLClassifier.scala new file mode 100644 index 00000000000..5174aabdb72 --- /dev/null +++ b/src/main/scala/org/apache/sysml/api/ml/BaseSystemMLClassifier.scala @@ -0,0 +1,157 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.sysml.api.ml + +import org.apache.spark.rdd.RDD +import java.io.File +import org.apache.spark.SparkContext +import org.apache.spark.ml.{ Model, Estimator } +import org.apache.spark.sql.DataFrame +import org.apache.spark.sql.types.StructType +import org.apache.spark.ml.param.{ Params, Param, ParamMap, DoubleParam } +import org.apache.sysml.runtime.matrix.MatrixCharacteristics +import org.apache.sysml.runtime.matrix.data.MatrixBlock +import org.apache.sysml.runtime.DMLRuntimeException +import org.apache.sysml.runtime.instructions.spark.utils.{ RDDConverterUtilsExt => RDDConverterUtils } +import org.apache.sysml.api.mlcontext._ +import org.apache.sysml.api.mlcontext.ScriptFactory._ + +trait HasLaplace extends Params { + final val laplace: Param[Double] = new Param[Double](this, "laplace", "Laplace smoothing specified by the user to avoid creation of 0 probabilities.") + setDefault(laplace, 1.0) + final def getLaplace: Double = $(laplace) +} +trait HasIcpt extends Params { + final val icpt: Param[Int] = new Param[Int](this, "icpt", "Intercept presence, shifting and rescaling X columns") + setDefault(icpt, 0) + final def getIcpt: Int = $(icpt) +} +trait HasMaxOuterIter extends Params { + final val maxOuterIter: Param[Int] = new Param[Int](this, "maxOuterIter", "max. number of outer (Newton) iterations") + setDefault(maxOuterIter, 100) + final def getMaxOuterIte: Int = $(maxOuterIter) +} +trait HasMaxInnerIter extends Params { + final val maxInnerIter: Param[Int] = new Param[Int](this, "maxInnerIter", "max. 
number of inner (conjugate gradient) iterations, 0 = no max") + setDefault(maxInnerIter, 0) + final def getMaxInnerIter: Int = $(maxInnerIter) +} +trait HasTol extends Params { + final val tol: DoubleParam = new DoubleParam(this, "tol", "the convergence tolerance for iterative algorithms") + setDefault(tol, 0.000001) + final def getTol: Double = $(tol) +} +trait HasRegParam extends Params { + final val regParam: DoubleParam = new DoubleParam(this, "tol", "the convergence tolerance for iterative algorithms") + setDefault(regParam, 0.000001) + final def getRegParam: Double = $(regParam) +} + + +trait BaseSystemMLClassifier { + def transformSchema(schema: StructType): StructType = schema + + // Returns the script and variables for X and y + def getTrainingScript(isSingleNode:Boolean):(Script, String, String) + + def fit(X_mb: MatrixBlock, y_mb: MatrixBlock, sc: SparkContext): (MLResults, java.util.HashMap[Int, String]) = { + val isSingleNode = true + val ml = new org.apache.sysml.api.mlcontext.MLContext(sc) + val revLabelMapping = new java.util.HashMap[Int, String] + PredictionUtils.fillLabelMapping(y_mb, revLabelMapping) + val ret = getTrainingScript(isSingleNode) + val script = ret._1.in(ret._2, X_mb).in(ret._3, y_mb) + (ml.execute(script), revLabelMapping) + } + + def fit(df: DataFrame, sc: SparkContext): (MLResults, java.util.HashMap[Int, String]) = { + val isSingleNode = false + val ml = new MLContext(df.rdd.sparkContext) + val mcXin = new MatrixCharacteristics() + val Xin = RDDConverterUtils.vectorDataFrameToBinaryBlock(sc, df, mcXin, false, "features") + val revLabelMapping = new java.util.HashMap[Int, String] + val yin = PredictionUtils.fillLabelMapping(df, revLabelMapping) + val ret = getTrainingScript(isSingleNode) + val Xbin = new BinaryBlockMatrix(Xin, mcXin) + val script = ret._1.in(ret._2, Xbin).in(ret._3, yin) + (ml.execute(script), revLabelMapping) + } + + def toDouble(i:Int): java.lang.Double = { + double2Double(i.toDouble) + } + def toDouble(d:Double): java.lang.Double = { + double2Double(d) + } + +} + +trait BaseSystemMLClassifierModel { + + def toDouble(i:Int): java.lang.Double = { + double2Double(i.toDouble) + } + def toDouble(d:Double): java.lang.Double = { + double2Double(d) + } + + def transformSchema(schema: StructType): StructType = schema + + // Returns the script and variable for X + def getPredictionScript(mloutput: MLResults, isSingleNode:Boolean): (Script, String) + + def transform(X: MatrixBlock, mloutput: MLResults, labelMapping: java.util.HashMap[Int, String], sc: SparkContext, probVar:String): MatrixBlock = { + val isSingleNode = true + val ml = new MLContext(sc) + val script = getPredictionScript(mloutput, isSingleNode) + val modelPredict = ml.execute(script._1.in(script._2, X)) + val ret = PredictionUtils.computePredictedClassLabelsFromProbability(modelPredict, isSingleNode, sc, probVar) + .getBinaryBlockMatrix("Prediction").getMatrixBlock + + if(ret.getNumColumns != 1) { + throw new RuntimeException("Expected predicted label to be a column vector") + } + PredictionUtils.updateLabels(isSingleNode, null, ret, null, labelMapping) + return ret + } + + def transform(df: DataFrame, mloutput: MLResults, labelMapping: java.util.HashMap[Int, String], sc: SparkContext, + probVar:String, outputProb:Boolean=true): DataFrame = { + val isSingleNode = false + val ml = new MLContext(sc) + val mcXin = new MatrixCharacteristics() + val Xin = RDDConverterUtils.vectorDataFrameToBinaryBlock(df.rdd.sparkContext, df, mcXin, false, "features") + val script = 
getPredictionScript(mloutput, isSingleNode) + val Xin_bin = new BinaryBlockMatrix(Xin, mcXin) + val modelPredict = ml.execute(script._1.in(script._2, Xin_bin)) + val predLabelOut = PredictionUtils.computePredictedClassLabelsFromProbability(modelPredict, isSingleNode, sc, probVar) + val predictedDF = PredictionUtils.updateLabels(isSingleNode, predLabelOut.getDataFrame("Prediction"), null, "C1", labelMapping).select("ID", "prediction") + if(outputProb) { + val prob = modelPredict.getDataFrame(probVar, true).withColumnRenamed("C1", "probability").select("ID", "probability") + val dataset = RDDConverterUtils.addIDToDataFrame(df, df.sqlContext, "ID") + return PredictionUtils.joinUsingID(dataset, PredictionUtils.joinUsingID(prob, predictedDF)) + } + else { + val dataset = RDDConverterUtils.addIDToDataFrame(df, df.sqlContext, "ID") + return PredictionUtils.joinUsingID(dataset, predictedDF) + } + + } +} \ No newline at end of file diff --git a/src/main/scala/org/apache/sysml/api/ml/LogisticRegression.scala b/src/main/scala/org/apache/sysml/api/ml/LogisticRegression.scala index 7e6b9223dc6..3098da9c21d 100644 --- a/src/main/scala/org/apache/sysml/api/ml/LogisticRegression.scala +++ b/src/main/scala/org/apache/sysml/api/ml/LogisticRegression.scala @@ -19,50 +19,20 @@ package org.apache.sysml.api.ml +import org.apache.spark.rdd.RDD import java.io.File -import org.apache.sysml.api.{ MLContext, MLOutput } -import org.apache.sysml.runtime.matrix.MatrixCharacteristics -import org.apache.sysml.runtime.instructions.spark.utils.{ RDDConverterUtilsExt => RDDConverterUtils } -import org.apache.spark.{ SparkContext } +import org.apache.spark.SparkContext +import org.apache.spark.ml.{ Model, Estimator } import org.apache.spark.sql.DataFrame import org.apache.spark.sql.types.StructType -import org.apache.spark.ml.{ Model, Estimator } -import org.apache.spark.ml.classification._ import org.apache.spark.ml.param.{ Params, Param, ParamMap, DoubleParam } -import org.apache.spark.ml.param.shared._ -import org.apache.spark.SparkConf -import org.apache.spark.mllib.linalg.Vectors -import org.apache.spark.mllib.regression.LabeledPoint -import scala.reflect.ClassTag -import scala.collection.immutable.HashMap +import org.apache.sysml.runtime.matrix.MatrixCharacteristics import org.apache.sysml.runtime.matrix.data.MatrixBlock import org.apache.sysml.runtime.DMLRuntimeException +import org.apache.sysml.runtime.instructions.spark.utils.{ RDDConverterUtilsExt => RDDConverterUtils } +import org.apache.sysml.api.mlcontext._ +import org.apache.sysml.api.mlcontext.ScriptFactory._ -trait HasIcpt extends Params { - final val icpt: Param[Int] = new Param[Int](this, "icpt", "Intercept presence, shifting and rescaling X columns") - setDefault(icpt, 0) - final def getIcpt: Int = $(icpt) -} -trait HasMaxOuterIter extends Params { - final val maxOuterIter: Param[Int] = new Param[Int](this, "maxOuterIter", "max. number of outer (Newton) iterations") - setDefault(maxOuterIter, 100) - final def getMaxOuterIte: Int = $(maxOuterIter) -} -trait HasMaxInnerIter extends Params { - final val maxInnerIter: Param[Int] = new Param[Int](this, "maxInnerIter", "max. 
number of inner (conjugate gradient) iterations, 0 = no max") - setDefault(maxInnerIter, 0) - final def getMaxInnerIter: Int = $(maxInnerIter) -} -trait HasTol extends Params { - final val tol: DoubleParam = new DoubleParam(this, "tol", "the convergence tolerance for iterative algorithms") - setDefault(tol, 0.000001) - final def getTol: Double = $(tol) -} -trait HasRegParam extends Params { - final val regParam: DoubleParam = new DoubleParam(this, "tol", "the convergence tolerance for iterative algorithms") - setDefault(regParam, 0.000001) - final def getRegParam: Double = $(regParam) -} object LogisticRegression { final val scriptPath = "scripts" + File.separator + "algorithms" + File.separator + "MultiLogReg.dml" } @@ -71,7 +41,7 @@ object LogisticRegression { * Logistic Regression Scala API */ class LogisticRegression(override val uid: String, val sc: SparkContext) extends Estimator[LogisticRegressionModel] with HasIcpt - with HasRegParam with HasTol with HasMaxOuterIter with HasMaxInnerIter { + with HasRegParam with HasTol with HasMaxOuterIter with HasMaxInnerIter with BaseSystemMLClassifier { def setIcpt(value: Int) = set(icpt, value) def setMaxOuterIter(value: Int) = set(maxOuterIter, value) @@ -83,48 +53,31 @@ class LogisticRegression(override val uid: String, val sc: SparkContext) extends val that = new LogisticRegression(uid, sc) copyValues(that, extra) } - override def transformSchema(schema: StructType): StructType = schema // Note: will update the y_mb as this will be called by Python mllearn def fit(X_mb: MatrixBlock, y_mb: MatrixBlock): LogisticRegressionModel = { - val ml = new MLContext(sc) - val revLabelMapping = new java.util.HashMap[Int, String] - PredictionUtils.fillLabelMapping(y_mb, revLabelMapping) - - val mloutput = { - ml.registerInput("X", X_mb); - ml.registerInput("Y_vec", y_mb); - ml.registerOutput("B_out"); - ml.executeScript(ScriptsUtils.getDMLScript(LogisticRegression.scriptPath), getParamMap()) - } - new LogisticRegressionModel("logisticRegression")(mloutput, revLabelMapping, sc) + val ret = fit(X_mb, y_mb, sc) + new LogisticRegressionModel("log")(ret._1, ret._2, sc) } - def getParamMap():Map[String, String] = { - Map( - "icpt" -> this.getIcpt.toString(), - "reg" -> this.getRegParam.toString(), - "tol" -> this.getTol.toString, - "moi" -> this.getMaxOuterIte.toString, - "mii" -> this.getMaxInnerIter.toString, - - "X" -> " ", - "Y" -> " ", - "B" -> " ") + def fit(df: DataFrame): LogisticRegressionModel = { + val ret = fit(df, sc) + new LogisticRegressionModel("log")(ret._1, ret._2, sc) } - override def fit(df: DataFrame): LogisticRegressionModel = { - val ml = new MLContext(df.rdd.sparkContext) - val mcXin = new MatrixCharacteristics() - val Xin = RDDConverterUtils.vectorDataFrameToBinaryBlock(sc, df, mcXin, false, "features") - val revLabelMapping = new java.util.HashMap[Int, String] - val yin = PredictionUtils.fillLabelMapping(df, revLabelMapping) - val mloutput = { - ml.registerInput("X", Xin, mcXin); - ml.registerInput("Y_vec", yin, "csv"); - ml.registerOutput("B_out"); - ml.executeScript(ScriptsUtils.getDMLScript(LogisticRegression.scriptPath), getParamMap()) - } - new LogisticRegressionModel("logisticRegression")(mloutput, revLabelMapping, sc) + + + def getTrainingScript(isSingleNode:Boolean):(Script, String, String) = { + val script = dml(ScriptsUtils.getDMLScript(LogisticRegression.scriptPath)) + .in("$X", " ") + .in("$Y", " ") + .in("$B", " ") + .in("$icpt", toDouble(getIcpt)) + .in("$reg", toDouble(getRegParam)) + .in("$tol", toDouble(getTol)) + 
.in("$moi", toDouble(getMaxOuterIte)) + .in("$mii", toDouble(getMaxInnerIter)) + .out("B_out") + (script, "X", "Y_vec") } } object LogisticRegressionModel { @@ -135,55 +88,22 @@ object LogisticRegressionModel { * Logistic Regression Scala API */ -class LogisticRegressionModel( - override val uid: String)( - val mloutput: MLOutput, val labelMapping: java.util.HashMap[Int, String], val sc: SparkContext) extends Model[LogisticRegressionModel] with HasIcpt - with HasRegParam with HasTol with HasMaxOuterIter with HasMaxInnerIter { +class LogisticRegressionModel(override val uid: String)( + val mloutput: MLResults, val labelMapping: java.util.HashMap[Int, String], val sc: SparkContext) + extends Model[LogisticRegressionModel] with HasIcpt + with HasRegParam with HasTol with HasMaxOuterIter with HasMaxInnerIter with BaseSystemMLClassifierModel { override def copy(extra: ParamMap): LogisticRegressionModel = { val that = new LogisticRegressionModel(uid)(mloutput, labelMapping, sc) copyValues(that, extra) } var outputRawPredictions = true def setOutputRawPredictions(outRawPred:Boolean): Unit = { outputRawPredictions = outRawPred } - override def transformSchema(schema: StructType): StructType = schema - - def transform(X: MatrixBlock): MatrixBlock = { - if(outputRawPredictions) { - throw new RuntimeException("Outputting raw prediction is not supported") - } - else { - val isSingleNode = true - val ret = PredictionUtils.computePredictedClassLabelsFromProbability( - PredictionUtils.doGLMPredict(isSingleNode, null, X, sc, mloutput, "B_out", getPredictParams), - isSingleNode, sc, "means").getMatrixBlock("Prediction"); - if(ret.getNumColumns != 1) { - throw new RuntimeException("Expected predicted label to be a column vector") - } - PredictionUtils.updateLabels(isSingleNode, null, ret, null, labelMapping) - return ret - } - } - - override def transform(df: DataFrame): DataFrame = { - val isSingleNode = false - val glmPredOut = PredictionUtils.doGLMPredict(isSingleNode, df, null, sc, mloutput, "B_out", getPredictParams()) - val predLabelOut = PredictionUtils.computePredictedClassLabelsFromProbability(glmPredOut, isSingleNode, sc, "means") - val predictedDF = PredictionUtils.updateLabels(isSingleNode, predLabelOut.getDF(df.sqlContext, "Prediction"), null, "C1", labelMapping).select("ID", "prediction") - val prob = glmPredOut.getDF(df.sqlContext, "means", true).withColumnRenamed("C1", "probability").select("ID", "probability") - val dataset = RDDConverterUtils.addIDToDataFrame(df, df.sqlContext, "ID") - - if(outputRawPredictions) { - // Not supported: rawPred = 1 / (1 + exp(- X * t(B_full)) ); - } - return PredictionUtils.joinUsingID(dataset, PredictionUtils.joinUsingID(prob, predictedDF)) - } - - def getPredictParams(): Map[String, String] = { - Map("X" -> " ", - "B" -> " ", - "dfam" -> "3") - } + def getPredictionScript(mloutput: MLResults, isSingleNode:Boolean): (Script, String) = + PredictionUtils.getGLMPredictionScript(mloutput.getBinaryBlockMatrix("B_out"), isSingleNode) + + def transform(X: MatrixBlock): MatrixBlock = transform(X, mloutput, labelMapping, sc, "means") + def transform(df: DataFrame): DataFrame = transform(df, mloutput, labelMapping, sc, "means") } /** @@ -210,7 +130,7 @@ object LogisticRegressionExample { LabeledPoint(1.0, Vectors.dense(1.0, 0.0, 2.3)))) val lr = new LogisticRegression("log", sc) val lrmodel = lr.fit(training.toDF) - lrmodel.mloutput.getDF(sqlContext, "B_out").show() + // lrmodel.mloutput.getDF(sqlContext, "B_out").show() val testing = sc.parallelize(Seq( 
LabeledPoint(1.0, Vectors.dense(1.0, 0.0, 3.0)), diff --git a/src/main/scala/org/apache/sysml/api/ml/NaiveBayes.scala b/src/main/scala/org/apache/sysml/api/ml/NaiveBayes.scala index a6fc367f41d..28836221582 100644 --- a/src/main/scala/org/apache/sysml/api/ml/NaiveBayes.scala +++ b/src/main/scala/org/apache/sysml/api/ml/NaiveBayes.scala @@ -19,75 +19,52 @@ package org.apache.sysml.api.ml +import org.apache.spark.rdd.RDD import java.io.File import org.apache.spark.SparkContext import org.apache.spark.ml.{ Model, Estimator } import org.apache.spark.sql.DataFrame import org.apache.spark.sql.types.StructType import org.apache.spark.ml.param.{ Params, Param, ParamMap, DoubleParam } -import org.apache.sysml.api.{ MLContext, MLOutput } import org.apache.sysml.runtime.matrix.MatrixCharacteristics import org.apache.sysml.runtime.matrix.data.MatrixBlock import org.apache.sysml.runtime.DMLRuntimeException import org.apache.sysml.runtime.instructions.spark.utils.{ RDDConverterUtilsExt => RDDConverterUtils } - -trait HasLaplace extends Params { - final val laplace: Param[Double] = new Param[Double](this, "laplace", "Laplace smoothing specified by the user to avoid creation of 0 probabilities.") - setDefault(laplace, 1.0) - final def getLaplace: Double = $(laplace) -} +import org.apache.sysml.api.mlcontext._ +import org.apache.sysml.api.mlcontext.ScriptFactory._ object NaiveBayes { final val scriptPath = "scripts" + File.separator + "algorithms" + File.separator + "naive-bayes.dml" } -class NaiveBayes(override val uid: String, val sc: SparkContext) extends Estimator[NaiveBayesModel] with HasLaplace { +class NaiveBayes(override val uid: String, val sc: SparkContext) extends Estimator[NaiveBayesModel] with HasLaplace with BaseSystemMLClassifier { override def copy(extra: ParamMap): Estimator[NaiveBayesModel] = { val that = new NaiveBayes(uid, sc) copyValues(that, extra) } def setLaplace(value: Double) = set(laplace, value) - override def transformSchema(schema: StructType): StructType = schema // Note: will update the y_mb as this will be called by Python mllearn def fit(X_mb: MatrixBlock, y_mb: MatrixBlock): NaiveBayesModel = { - val ml = new MLContext(sc) - val revLabelMapping = new java.util.HashMap[Int, String] - PredictionUtils.fillLabelMapping(y_mb, revLabelMapping) - - val mloutput = { - ml.registerInput("D", X_mb); - ml.registerInput("C", y_mb); - ml.registerOutput("classPrior"); - ml.registerOutput("classConditionals"); - ml.executeScript(ScriptsUtils.getDMLScript(NaiveBayes.scriptPath), getParamMap()) - } - new NaiveBayesModel("naivebayes")(mloutput, revLabelMapping, sc) + val ret = fit(X_mb, y_mb, sc) + new NaiveBayesModel("naive")(ret._1, ret._2, sc) } def fit(df: DataFrame): NaiveBayesModel = { - val ml = new MLContext(df.rdd.sparkContext) - val mcXin = new MatrixCharacteristics() - val Xin = RDDConverterUtils.vectorDataFrameToBinaryBlock(sc, df, mcXin, false, "features") - val revLabelMapping = new java.util.HashMap[Int, String] - val yin = PredictionUtils.fillLabelMapping(df, revLabelMapping) - val mloutput = { - ml.registerInput("D", Xin, mcXin); - ml.registerInput("C", yin, "csv"); - ml.registerOutput("classPrior"); - ml.registerOutput("classConditionals"); - ml.executeScript(ScriptsUtils.getDMLScript(NaiveBayes.scriptPath), getParamMap()) - } - new NaiveBayesModel("naive")(mloutput, revLabelMapping, sc) + val ret = fit(df, sc) + new NaiveBayesModel("naive")(ret._1, ret._2, sc) } - def getParamMap(): Map[String, String] = { - Map("X" -> " ", - "Y" -> " ", - "prior" -> " ", - "conditionals" 
-> " ", - "accuracy" -> " ", - "laplace" -> getLaplace.toString()) + def getTrainingScript(isSingleNode:Boolean):(Script, String, String) = { + val script = dml(ScriptsUtils.getDMLScript(NaiveBayes.scriptPath)) + .in("$X", " ") + .in("$Y", " ") + .in("$prior", " ") + .in("$conditionals", " ") + .in("$accuracy", " ") + .in("$laplace", toDouble(getLaplace)) + .out("classPrior", "classConditionals") + (script, "D", "C") } } @@ -96,61 +73,37 @@ object NaiveBayesModel { final val scriptPath = "scripts" + File.separator + "algorithms" + File.separator + "naive-bayes-predict.dml" } -class NaiveBayesModel( - override val uid: String)( - val mloutput: MLOutput, val labelMapping: java.util.HashMap[Int, String], val sc: SparkContext) extends Model[NaiveBayesModel] with HasLaplace { +class NaiveBayesModel(override val uid: String) + (val mloutput: MLResults, val labelMapping: java.util.HashMap[Int, String], val sc: SparkContext) + extends Model[NaiveBayesModel] with HasLaplace with BaseSystemMLClassifierModel { + override def copy(extra: ParamMap): NaiveBayesModel = { val that = new NaiveBayesModel(uid)(mloutput, labelMapping, sc) copyValues(that, extra) } - def transformSchema(schema: StructType): StructType = schema - - var priorMB: MatrixBlock = null - var conditionalMB: MatrixBlock = null - def setPriorAndConditional(prior:MatrixBlock, conditional:MatrixBlock) { - priorMB = prior - conditionalMB = conditional - } - - def transform(X: MatrixBlock): MatrixBlock = { - val isSingleNode = true - val ml = new MLContext(sc) - ml.registerInput("D", X) - ml.registerInput("prior", mloutput.getMatrixBlock("classPrior"), mloutput.getMatrixCharacteristics("classPrior")) - ml.registerInput("conditionals", mloutput.getMatrixBlock("classConditionals"), mloutput.getMatrixCharacteristics("classConditionals")) - ml.registerOutput("probs") - val nbPredict = ml.executeScript(ScriptsUtils.getDMLScript(NaiveBayesModel.scriptPath), getPredictParams()) - val ret = PredictionUtils.computePredictedClassLabelsFromProbability(nbPredict, isSingleNode, sc, "probs").getMatrixBlock("Prediction"); - if(ret.getNumColumns != 1) { - throw new RuntimeException("Expected predicted label to be a column vector") + def getPredictionScript(mloutput: MLResults, isSingleNode:Boolean): (Script, String) = { + val script = dml(ScriptsUtils.getDMLScript(NaiveBayesModel.scriptPath)) + .in("$X", " ") + .in("$prior", " ") + .in("$conditionals", " ") + .in("$probabilities", " ") + .out("probs") + + val classPrior = mloutput.getBinaryBlockMatrix("classPrior") + val classConditionals = mloutput.getBinaryBlockMatrix("classConditionals") + val ret = if(isSingleNode) { + script.in("prior", classPrior.getMatrixBlock, classPrior.getMatrixMetadata) + .in("conditionals", classConditionals.getMatrixBlock, classConditionals.getMatrixMetadata) + } + else { + script.in("prior", classPrior.getBinaryBlocks, classPrior.getMatrixMetadata) + .in("conditionals", classConditionals.getBinaryBlocks, classConditionals.getMatrixMetadata) } - PredictionUtils.updateLabels(isSingleNode, null, ret, null, labelMapping) - return ret + (ret, "D") } - def transform(df: org.apache.spark.sql.DataFrame): org.apache.spark.sql.DataFrame = { - val isSingleNode = false - val ml = new MLContext(sc) - val mcXin = new MatrixCharacteristics() - val Xin = RDDConverterUtils.vectorDataFrameToBinaryBlock(df.rdd.sparkContext, df, mcXin, false, "features") - ml.registerInput("D", Xin, mcXin); - ml.registerInput("prior", mloutput.getMatrixBlock("classPrior"), 
mloutput.getMatrixCharacteristics("classPrior")) - ml.registerInput("conditionals", mloutput.getMatrixBlock("classConditionals"), mloutput.getMatrixCharacteristics("classConditionals")) - ml.registerOutput("probs") - val nbPredict = ml.executeScript(ScriptsUtils.getDMLScript(NaiveBayesModel.scriptPath), getPredictParams()) - val predLabelOut = PredictionUtils.computePredictedClassLabelsFromProbability(nbPredict, isSingleNode, sc, "probs") - val predictedDF = PredictionUtils.updateLabels(isSingleNode, predLabelOut.getDF(df.sqlContext, "Prediction"), null, "C1", labelMapping).select("ID", "prediction") - val prob = nbPredict.getDF(df.sqlContext, "probs", true).withColumnRenamed("C1", "probability").select("ID", "probability") - val dataset = RDDConverterUtils.addIDToDataFrame(df, df.sqlContext, "ID") - return PredictionUtils.joinUsingID(dataset, PredictionUtils.joinUsingID(prob, predictedDF)) - } + def transform(X: MatrixBlock): MatrixBlock = transform(X, mloutput, labelMapping, sc, "probs") + def transform(df: DataFrame): DataFrame = transform(df, mloutput, labelMapping, sc, "probs") - def getPredictParams(): Map[String, String] = { - Map("X" -> " ", - "prior" -> " ", - "conditionals" -> " ", - "probabilities" -> " ") - } - } \ No newline at end of file diff --git a/src/main/scala/org/apache/sysml/api/ml/PredictionUtils.scala b/src/main/scala/org/apache/sysml/api/ml/PredictionUtils.scala index 13494eedaf8..f91a82cadc2 100644 --- a/src/main/scala/org/apache/sysml/api/ml/PredictionUtils.scala +++ b/src/main/scala/org/apache/sysml/api/ml/PredictionUtils.scala @@ -28,9 +28,28 @@ import org.apache.sysml.runtime.matrix.data.MatrixBlock import org.apache.sysml.runtime.DMLRuntimeException import org.apache.sysml.runtime.matrix.MatrixCharacteristics import org.apache.sysml.runtime.instructions.spark.utils.{ RDDConverterUtilsExt => RDDConverterUtils } +import org.apache.sysml.api.mlcontext.MLResults +import org.apache.sysml.api.mlcontext.ScriptFactory._ +import org.apache.sysml.api.mlcontext.Script +import org.apache.sysml.api.mlcontext.BinaryBlockMatrix object PredictionUtils { + def getGLMPredictionScript(B_full: BinaryBlockMatrix, isSingleNode:Boolean): (Script, String) = { + val script = dml(ScriptsUtils.getDMLScript(LogisticRegressionModel.scriptPath)) + .in("$X", " ") + .in("$B", " ") + .in("$dfam", "3") + .out("means") + val ret = if(isSingleNode) { + script.in("B_full", B_full.getMatrixBlock, B_full.getMatrixMetadata) + } + else { + script.in("B_full", B_full) + } + (ret, "X") + } + def doGLMPredict(isSingleNode:Boolean, df:DataFrame, X: MatrixBlock, sc:SparkContext, mloutput:MLOutput, B:String, paramsMap: Map[String, String]): MLOutput = { val ml = new MLContext(sc) if(isSingleNode) { @@ -149,4 +168,21 @@ object PredictionUtils { write(Prediction, "tempOut", "csv"); """) } + + def computePredictedClassLabelsFromProbability(mlscoreoutput:MLResults, isSingleNode:Boolean, sc:SparkContext, inProbVar:String): MLResults = { + val ml = new org.apache.sysml.api.mlcontext.MLContext(sc) + val script = dml( + """ + Prob = read("temp1"); + Prediction = rowIndexMax(Prob); # assuming one-based label mapping + write(Prediction, "tempOut", "csv"); + """).out("Prediction") + val probVar = mlscoreoutput.getBinaryBlockMatrix(inProbVar) + if(isSingleNode) { + ml.execute(script.in("Prob", probVar.getMatrixBlock, probVar.getMatrixMetadata)) + } + else { + ml.execute(script.in("Prob", probVar)) + } + } } \ No newline at end of file diff --git a/src/main/scala/org/apache/sysml/api/ml/SVM.scala 
b/src/main/scala/org/apache/sysml/api/ml/SVM.scala index 7a48c1ded13..93e91ec4660 100644 --- a/src/main/scala/org/apache/sysml/api/ml/SVM.scala +++ b/src/main/scala/org/apache/sysml/api/ml/SVM.scala @@ -19,17 +19,19 @@ package org.apache.sysml.api.ml +import org.apache.spark.rdd.RDD import java.io.File import org.apache.spark.SparkContext import org.apache.spark.ml.{ Model, Estimator } import org.apache.spark.sql.DataFrame import org.apache.spark.sql.types.StructType -import org.apache.spark.ml.param.ParamMap -import org.apache.sysml.api.{ MLContext, MLOutput } +import org.apache.spark.ml.param.{ Params, Param, ParamMap, DoubleParam } import org.apache.sysml.runtime.matrix.MatrixCharacteristics import org.apache.sysml.runtime.matrix.data.MatrixBlock import org.apache.sysml.runtime.DMLRuntimeException import org.apache.sysml.runtime.instructions.spark.utils.{ RDDConverterUtilsExt => RDDConverterUtils } +import org.apache.sysml.api.mlcontext._ +import org.apache.sysml.api.mlcontext.ScriptFactory._ object SVM { final val scriptPathBinary = "scripts" + File.separator + "algorithms" + File.separator + "l2-svm.dml" @@ -37,71 +39,41 @@ object SVM { } class SVM (override val uid: String, val sc: SparkContext, val isMultiClass:Boolean=false) extends Estimator[SVMModel] with HasIcpt - with HasRegParam with HasTol with HasMaxOuterIter { + with HasRegParam with HasTol with HasMaxOuterIter with BaseSystemMLClassifier { def setIcpt(value: Int) = set(icpt, value) def setMaxIter(value: Int) = set(maxOuterIter, value) def setRegParam(value: Double) = set(regParam, value) def setTol(value: Double) = set(tol, value) - def setModelParams(m:SVMModel):SVMModel = { - m.setIcpt(this.getIcpt).setMaxIter(this.getMaxOuterIte).setRegParam(this.getRegParam).setTol(this.getTol) - } - override def copy(extra: ParamMap): Estimator[SVMModel] = { val that = new SVM(uid, sc, isMultiClass) copyValues(that, extra) } - def transformSchema(schema: StructType): StructType = schema - def fit(X_mb: MatrixBlock, y_mb: MatrixBlock): SVMModel = { - val ml = new MLContext(sc) - val revLabelMapping = new java.util.HashMap[Int, String] - PredictionUtils.fillLabelMapping(y_mb, revLabelMapping) - if(y_mb.getNumColumns != 1) { - throw new RuntimeException("Expected a column vector for y") - } - val mloutput = { - ml.registerInput("X", X_mb); - ml.registerInput("Y", y_mb); - ml.registerOutput("w"); - if(isMultiClass) - ml.executeScript(ScriptsUtils.getDMLScript(SVM.scriptPathMulticlass), getParamMap()) - else { - ml.executeScript(ScriptsUtils.getDMLScript(SVM.scriptPathBinary), getParamMap()) - } - } - setModelParams(new SVMModel("svm")(mloutput, sc, isMultiClass, revLabelMapping)) + def getTrainingScript(isSingleNode:Boolean):(Script, String, String) = { + val script = dml(ScriptsUtils.getDMLScript(if(isMultiClass) SVM.scriptPathMulticlass else SVM.scriptPathBinary)) + .in("$X", " ") + .in("$Y", " ") + .in("$model", " ") + .in("$Log", " ") + .in("$icpt", toDouble(getIcpt)) + .in("$reg", toDouble(getRegParam)) + .in("$tol", toDouble(getTol)) + .in("$maxiter", toDouble(getMaxOuterIte)) + .out("w") + (script, "X", "Y") } - def getParamMap(): Map[String, String] = { - Map( "icpt" -> this.getIcpt.toString(), - "reg" -> this.getRegParam.toString(), - "tol" -> this.getTol.toString, - "maxiter" -> this.getMaxOuterIte.toString, - "X" -> " ", - "Y" -> " ", - "model" -> " ", - "Log" -> " ") + // Note: will update the y_mb as this will be called by Python mllearn + def fit(X_mb: MatrixBlock, y_mb: MatrixBlock): SVMModel = { + val ret = fit(X_mb, y_mb, 
sc) + new SVMModel("svm")(ret._1, sc, isMultiClass, ret._2) } def fit(df: DataFrame): SVMModel = { - val ml = new MLContext(df.rdd.sparkContext) - val mcXin = new MatrixCharacteristics() - val Xin = RDDConverterUtils.vectorDataFrameToBinaryBlock(sc, df, mcXin, false, "features") - val revLabelMapping = new java.util.HashMap[Int, String] - val yin = PredictionUtils.fillLabelMapping(df, revLabelMapping) - val mloutput = { - ml.registerInput("X", Xin, mcXin); - ml.registerInput("Y", yin, "csv"); - ml.registerOutput("w"); - if(isMultiClass) - ml.executeScript(ScriptsUtils.getDMLScript(SVM.scriptPathMulticlass), getParamMap()) - else { - ml.executeScript(ScriptsUtils.getDMLScript(SVM.scriptPathBinary), getParamMap()) - } - } - setModelParams(new SVMModel("svm")(mloutput, sc, isMultiClass, revLabelMapping)) + val ret = fit(df, sc) + new SVMModel("svm")(ret._1, sc, isMultiClass, ret._2) } } @@ -111,77 +83,31 @@ object SVMModel { final val predictionScriptPathMulticlass = "scripts" + File.separator + "algorithms" + File.separator + "m-svm-predict.dml" } -class SVMModel (override val uid: String)(val mloutput: MLOutput, val sc: SparkContext, val isMultiClass:Boolean, val labelMapping: java.util.HashMap[Int, String]) extends Model[SVMModel] with HasIcpt - with HasRegParam with HasTol with HasMaxOuterIter { +class SVMModel (override val uid: String)(val mloutput: MLResults, val sc: SparkContext, val isMultiClass:Boolean, + val labelMapping: java.util.HashMap[Int, String]) extends Model[SVMModel] with BaseSystemMLClassifierModel { override def copy(extra: ParamMap): SVMModel = { val that = new SVMModel(uid)(mloutput, sc, isMultiClass, labelMapping) copyValues(that, extra) } - def setIcpt(value: Int) = set(icpt, value) - def setMaxIter(value: Int) = set(maxOuterIter, value) - def setRegParam(value: Double) = set(regParam, value) - def setTol(value: Double) = set(tol, value) - - override def transformSchema(schema: StructType): StructType = schema - - def transform(df: DataFrame): DataFrame = { - val ml = new MLContext(sc) - val mcXin = new MatrixCharacteristics() - val Xin = RDDConverterUtils.vectorDataFrameToBinaryBlock(df.rdd.sparkContext, df, mcXin, false, "features") - ml.registerInput("X", Xin, mcXin); - ml.registerOutput("scores"); - val glmPredOut = { - if(isMultiClass) { - ml.registerInput("W", mloutput.getBinaryBlockedRDD("w"), mloutput.getMatrixCharacteristics("w")); - ml.executeScript(ScriptsUtils.getDMLScript(SVMModel.predictionScriptPathMulticlass), getPredictParams()) - } - else { - ml.registerInput("w", mloutput.getBinaryBlockedRDD("w"), mloutput.getMatrixCharacteristics("w")); - ml.executeScript(ScriptsUtils.getDMLScript(SVMModel.predictionScriptPathBinary), getPredictParams()) - } + def getPredictionScript(mloutput: MLResults, isSingleNode:Boolean): (Script, String) = { + val script = dml(ScriptsUtils.getDMLScript(if(isMultiClass) SVMModel.predictionScriptPathMulticlass else SVMModel.predictionScriptPathBinary)) + .in("$X", " ") + .in("$model", " ") + .out("scores") + + val w = mloutput.getBinaryBlockMatrix("w") + val wVar = if(isMultiClass) "W" else "w" + + val ret = if(isSingleNode) { + script.in(wVar, w.getMatrixBlock, w.getMatrixMetadata) } - val isSingleNode = false - val predLabelOut = PredictionUtils.computePredictedClassLabelsFromProbability(glmPredOut, isSingleNode, sc, "scores") - val predictedDF = PredictionUtils.updateLabels(isSingleNode, predLabelOut.getDF(df.sqlContext, "Prediction"), null, "C1", labelMapping).select("ID", "prediction") - val dataset = 
RDDConverterUtils.addIDToDataFrame(df, df.sqlContext, "ID") - return PredictionUtils.joinUsingID(dataset, predictedDF) - } - - def transform(X: MatrixBlock): MatrixBlock = { - val ml = new MLContext(sc) - ml.registerInput("X", X); - ml.registerInput("w", mloutput.getMatrixBlock("w"), mloutput.getMatrixCharacteristics("w")); - ml.registerOutput("scores"); - val glmPredOut = { - if(isMultiClass) { - ml.registerInput("W", mloutput.getMatrixBlock("w"), mloutput.getMatrixCharacteristics("w")); - ml.executeScript(ScriptsUtils.getDMLScript(SVMModel.predictionScriptPathMulticlass), getPredictParams()) - } - else { - ml.registerInput("w", mloutput.getMatrixBlock("w"), mloutput.getMatrixCharacteristics("w")); - ml.executeScript(ScriptsUtils.getDMLScript(SVMModel.predictionScriptPathBinary), getPredictParams()) - } + else { + script.in(wVar, w) } - val isSingleNode = true - val ret = PredictionUtils.computePredictedClassLabelsFromProbability(glmPredOut, isSingleNode, sc, "scores").getMatrixBlock("Prediction"); - if(ret.getNumColumns != 1) { - throw new RuntimeException("Expected predicted label to be a column vector") - } - PredictionUtils.updateLabels(true, null, ret, null, labelMapping) - return ret + (ret, "X") } - - def getPredictParams(): Map[String, String] = { - Map( "icpt" -> this.getIcpt.toString(), - "reg" -> this.getRegParam.toString(), - "tol" -> this.getTol.toString, - "maxiter" -> this.getMaxOuterIte.toString, - "X" -> " ", - "Y" -> " ", - "model" -> " ", - "Log" -> " ") - } - + def transform(X: MatrixBlock): MatrixBlock = transform(X, mloutput, labelMapping, sc, "scores") + def transform(df: DataFrame): DataFrame = transform(df, mloutput, labelMapping, sc, "scores") } \ No newline at end of file From 3a2a4cfb8a8f7e3b254e7ef7bfebe72c30013b0e Mon Sep 17 00:00:00 2001 From: Niketan Pansare Date: Tue, 9 Aug 2016 13:16:07 -0700 Subject: [PATCH 14/14] Modified Linear Regression to support new MLContext and added support for Spark 2.0 --- .../org/apache/sysml/api/python/SystemML.py | 34 +++-- .../sysml/api/ml/BaseSystemMLClassifier.scala | 71 ++++++----- .../sysml/api/ml/BaseSystemMLRegressor.scala | 86 +++++++++++++ .../sysml/api/ml/LinearRegression.scala | 118 ++++++------------ .../sysml/api/ml/LogisticRegression.scala | 6 +- .../org/apache/sysml/api/ml/NaiveBayes.scala | 4 +- .../apache/sysml/api/ml/PredictionUtils.scala | 40 +----- .../scala/org/apache/sysml/api/ml/SVM.scala | 4 +- .../apache/sysml/api/ml/ScriptsUtils.scala | 2 + 9 files changed, 195 insertions(+), 170 deletions(-) create mode 100644 src/main/scala/org/apache/sysml/api/ml/BaseSystemMLRegressor.scala diff --git a/src/main/java/org/apache/sysml/api/python/SystemML.py b/src/main/java/org/apache/sysml/api/python/SystemML.py index c03bd1bab49..689403ea883 100644 --- a/src/main/java/org/apache/sysml/api/python/SystemML.py +++ b/src/main/java/org/apache/sysml/api/python/SystemML.py @@ -321,13 +321,15 @@ class mllearn: class BaseSystemMLEstimator(Estimator): # TODO: Allow users to set featuresCol (with default 'features') and labelCol (with default 'label') + # Returns a model after calling fit(df) on Estimator object on JVM def _fit(self, X): if hasattr(X, '_jdf') and 'features' in X.columns and 'label' in X.columns: self.model = self.estimator.fit(X._jdf) return self else: raise Exception('Incorrect usage: Expected dataframe as input with features/label as columns') - + + # Returns a model after calling fit(X:MatrixBlock, y:MatrixBlock) on Estimator object on JVM def fit(self, X, y=None, params=None): if y is None: return 
self._fit(X) @@ -356,7 +358,8 @@ def fit(self, X, y=None, params=None): def transform(self, X): return self.predict(X) - + + # Returns either a DataFrame or MatrixBlock after calling transform(X:MatrixBlock, y:MatrixBlock) on Model object on JVM def predict(self, X): if isinstance(X, SUPPORTED_TYPES): if self.transferUsingDF: @@ -389,12 +392,23 @@ def predict(self, X): else: raise Exception('Unsupported input type') + class BaseSystemMLClassifier(BaseSystemMLEstimator): + + # Scores the predicted value with ground truth 'y' def score(self, X, y): return metrics.accuracy_score(y, self.predict(X)) + class BaseSystemMLRegressor(BaseSystemMLEstimator): + + # Scores the predicted value with ground truth 'y' + def score(self, X, y): + return metrics.r2_score(y, self.predict(X), multioutput='variance_weighted') + + # Or we can create new Python project with package structure - class LogisticRegression(BaseSystemMLEstimator): + class LogisticRegression(BaseSystemMLClassifier): + # See https://apache.github.io/incubator-systemml/algorithms-reference for usage def __init__(self, sqlCtx, penalty='l2', fit_intercept=True, max_iter=100, max_inner_iter=0, tol=0.000001, C=1.0, solver='newton-cg', transferUsingDF=False): self.sqlCtx = sqlCtx self.sc = sqlCtx._sc @@ -415,8 +429,9 @@ def __init__(self, sqlCtx, penalty='l2', fit_intercept=True, max_iter=100, max_i if solver != 'newton-cg': raise Exception('Only newton-cg solver supported') - class LinearRegression(BaseSystemMLEstimator): + class LinearRegression(BaseSystemMLRegressor): + # See https://apache.github.io/incubator-systemml/algorithms-reference for usage def __init__(self, sqlCtx, fit_intercept=True, max_iter=100, tol=0.000001, C=1.0, solver='newton-cg', transferUsingDF=False): self.sqlCtx = sqlCtx self.sc = sqlCtx._sc @@ -435,12 +450,10 @@ def __init__(self, sqlCtx, fit_intercept=True, max_iter=100, tol=0.000001, C=1.0 self.transferUsingDF = transferUsingDF self.setOutputRawPredictionsToFalse = False - def score(self, X, y): - return metrics.r2_score(y, self.predict(X), multioutput='variance_weighted') - - class SVM(BaseSystemMLEstimator): + class SVM(BaseSystemMLClassifier): + # See https://apache.github.io/incubator-systemml/algorithms-reference for usage def __init__(self, sqlCtx, fit_intercept=True, max_iter=100, tol=0.000001, C=1.0, is_multi_class=False, transferUsingDF=False): self.sqlCtx = sqlCtx self.sc = sqlCtx._sc @@ -456,8 +469,9 @@ def __init__(self, sqlCtx, fit_intercept=True, max_iter=100, tol=0.000001, C=1.0 self.transferUsingDF = transferUsingDF self.setOutputRawPredictionsToFalse = False - class NaiveBayes(BaseSystemMLEstimator): + class NaiveBayes(BaseSystemMLClassifier): + # See https://apache.github.io/incubator-systemml/algorithms-reference for usage def __init__(self, sqlCtx, laplace=1.0, transferUsingDF=False): self.sqlCtx = sqlCtx self.sc = sqlCtx._sc @@ -465,4 +479,4 @@ def __init__(self, sqlCtx, laplace=1.0, transferUsingDF=False): self.estimator = self.sc._jvm.org.apache.sysml.api.ml.NaiveBayes(self.uid, self.sc._jsc.sc()) self.estimator.setLaplace(laplace) self.transferUsingDF = transferUsingDF - self.setOutputRawPredictionsToFalse = False \ No newline at end of file + self.setOutputRawPredictionsToFalse = False \ No newline at end of file diff --git a/src/main/scala/org/apache/sysml/api/ml/BaseSystemMLClassifier.scala b/src/main/scala/org/apache/sysml/api/ml/BaseSystemMLClassifier.scala index 5174aabdb72..98def7c21bd 100644 --- a/src/main/scala/org/apache/sysml/api/ml/BaseSystemMLClassifier.scala +++ 
b/src/main/scala/org/apache/sysml/api/ml/BaseSystemMLClassifier.scala @@ -23,7 +23,6 @@ import org.apache.spark.rdd.RDD import java.io.File import org.apache.spark.SparkContext import org.apache.spark.ml.{ Model, Estimator } -import org.apache.spark.sql.DataFrame import org.apache.spark.sql.types.StructType import org.apache.spark.ml.param.{ Params, Param, ParamMap, DoubleParam } import org.apache.sysml.runtime.matrix.MatrixCharacteristics @@ -32,6 +31,7 @@ import org.apache.sysml.runtime.DMLRuntimeException import org.apache.sysml.runtime.instructions.spark.utils.{ RDDConverterUtilsExt => RDDConverterUtils } import org.apache.sysml.api.mlcontext._ import org.apache.sysml.api.mlcontext.ScriptFactory._ +import org.apache.spark.sql._ trait HasLaplace extends Params { final val laplace: Param[Double] = new Param[Double](this, "laplace", "Laplace smoothing specified by the user to avoid creation of 0 probabilities.") @@ -64,16 +64,41 @@ trait HasRegParam extends Params { final def getRegParam: Double = $(regParam) } - -trait BaseSystemMLClassifier { +trait BaseSystemMLEstimator { + def transformSchema(schema: StructType): StructType = schema // Returns the script and variables for X and y def getTrainingScript(isSingleNode:Boolean):(Script, String, String) + def toDouble(i:Int): java.lang.Double = { + double2Double(i.toDouble) + } + + def toDouble(d:Double): java.lang.Double = { + double2Double(d) + } +} + +trait BaseSystemMLEstimatorModel { + def toDouble(i:Int): java.lang.Double = { + double2Double(i.toDouble) + } + def toDouble(d:Double): java.lang.Double = { + double2Double(d) + } + + def transformSchema(schema: StructType): StructType = schema + + // Returns the script and variable for X + def getPredictionScript(mloutput: MLResults, isSingleNode:Boolean): (Script, String) +} + +trait BaseSystemMLClassifier extends BaseSystemMLEstimator { + def fit(X_mb: MatrixBlock, y_mb: MatrixBlock, sc: SparkContext): (MLResults, java.util.HashMap[Int, String]) = { val isSingleNode = true - val ml = new org.apache.sysml.api.mlcontext.MLContext(sc) + val ml = new MLContext(sc) val revLabelMapping = new java.util.HashMap[Int, String] PredictionUtils.fillLabelMapping(y_mb, revLabelMapping) val ret = getTrainingScript(isSingleNode) @@ -81,11 +106,11 @@ trait BaseSystemMLClassifier { (ml.execute(script), revLabelMapping) } - def fit(df: DataFrame, sc: SparkContext): (MLResults, java.util.HashMap[Int, String]) = { + def fit(df: ScriptsUtils.SparkDataType, sc: SparkContext): (MLResults, java.util.HashMap[Int, String]) = { val isSingleNode = false val ml = new MLContext(df.rdd.sparkContext) val mcXin = new MatrixCharacteristics() - val Xin = RDDConverterUtils.vectorDataFrameToBinaryBlock(sc, df, mcXin, false, "features") + val Xin = RDDConverterUtils.vectorDataFrameToBinaryBlock(sc, df.asInstanceOf[DataFrame], mcXin, false, "features") val revLabelMapping = new java.util.HashMap[Int, String] val yin = PredictionUtils.fillLabelMapping(df, revLabelMapping) val ret = getTrainingScript(isSingleNode) @@ -93,29 +118,9 @@ trait BaseSystemMLClassifier { val script = ret._1.in(ret._2, Xbin).in(ret._3, yin) (ml.execute(script), revLabelMapping) } - - def toDouble(i:Int): java.lang.Double = { - double2Double(i.toDouble) - } - def toDouble(d:Double): java.lang.Double = { - double2Double(d) - } - } -trait BaseSystemMLClassifierModel { - - def toDouble(i:Int): java.lang.Double = { - double2Double(i.toDouble) - } - def toDouble(d:Double): java.lang.Double = { - double2Double(d) - } - - def transformSchema(schema: 
StructType): StructType = schema - - // Returns the script and variable for X - def getPredictionScript(mloutput: MLResults, isSingleNode:Boolean): (Script, String) +trait BaseSystemMLClassifierModel extends BaseSystemMLEstimatorModel { def transform(X: MatrixBlock, mloutput: MLResults, labelMapping: java.util.HashMap[Int, String], sc: SparkContext, probVar:String): MatrixBlock = { val isSingleNode = true @@ -131,13 +136,13 @@ trait BaseSystemMLClassifierModel { PredictionUtils.updateLabels(isSingleNode, null, ret, null, labelMapping) return ret } - - def transform(df: DataFrame, mloutput: MLResults, labelMapping: java.util.HashMap[Int, String], sc: SparkContext, + + def transform(df: ScriptsUtils.SparkDataType, mloutput: MLResults, labelMapping: java.util.HashMap[Int, String], sc: SparkContext, probVar:String, outputProb:Boolean=true): DataFrame = { val isSingleNode = false val ml = new MLContext(sc) val mcXin = new MatrixCharacteristics() - val Xin = RDDConverterUtils.vectorDataFrameToBinaryBlock(df.rdd.sparkContext, df, mcXin, false, "features") + val Xin = RDDConverterUtils.vectorDataFrameToBinaryBlock(df.rdd.sparkContext, df.asInstanceOf[DataFrame], mcXin, false, "features") val script = getPredictionScript(mloutput, isSingleNode) val Xin_bin = new BinaryBlockMatrix(Xin, mcXin) val modelPredict = ml.execute(script._1.in(script._2, Xin_bin)) @@ -145,11 +150,11 @@ trait BaseSystemMLClassifierModel { val predictedDF = PredictionUtils.updateLabels(isSingleNode, predLabelOut.getDataFrame("Prediction"), null, "C1", labelMapping).select("ID", "prediction") if(outputProb) { val prob = modelPredict.getDataFrame(probVar, true).withColumnRenamed("C1", "probability").select("ID", "probability") - val dataset = RDDConverterUtils.addIDToDataFrame(df, df.sqlContext, "ID") - return PredictionUtils.joinUsingID(dataset, PredictionUtils.joinUsingID(prob, predictedDF)) + val dataset = RDDConverterUtils.addIDToDataFrame(df.asInstanceOf[DataFrame], df.sqlContext, "ID") + return PredictionUtils.joinUsingID(dataset, PredictionUtils.joinUsingID(prob, predictedDF)) } else { - val dataset = RDDConverterUtils.addIDToDataFrame(df, df.sqlContext, "ID") + val dataset = RDDConverterUtils.addIDToDataFrame(df.asInstanceOf[DataFrame], df.sqlContext, "ID") return PredictionUtils.joinUsingID(dataset, predictedDF) } diff --git a/src/main/scala/org/apache/sysml/api/ml/BaseSystemMLRegressor.scala b/src/main/scala/org/apache/sysml/api/ml/BaseSystemMLRegressor.scala new file mode 100644 index 00000000000..5bcde30a2ea --- /dev/null +++ b/src/main/scala/org/apache/sysml/api/ml/BaseSystemMLRegressor.scala @@ -0,0 +1,86 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.sysml.api.ml + +import org.apache.spark.rdd.RDD +import java.io.File +import org.apache.spark.SparkContext +import org.apache.spark.ml.{ Model, Estimator } +import org.apache.spark.sql.DataFrame +import org.apache.spark.sql.types.StructType +import org.apache.spark.ml.param.{ Params, Param, ParamMap, DoubleParam } +import org.apache.sysml.runtime.matrix.MatrixCharacteristics +import org.apache.sysml.runtime.matrix.data.MatrixBlock +import org.apache.sysml.runtime.DMLRuntimeException +import org.apache.sysml.runtime.instructions.spark.utils.{ RDDConverterUtilsExt => RDDConverterUtils } +import org.apache.sysml.api.mlcontext._ +import org.apache.sysml.api.mlcontext.ScriptFactory._ + +trait BaseSystemMLRegressor extends BaseSystemMLEstimator { + + def fit(X_mb: MatrixBlock, y_mb: MatrixBlock, sc: SparkContext): MLResults = { + val isSingleNode = true + val ml = new MLContext(sc) + val ret = getTrainingScript(isSingleNode) + val script = ret._1.in(ret._2, X_mb).in(ret._3, y_mb) + ml.execute(script) + } + + def fit(df: ScriptsUtils.SparkDataType, sc: SparkContext): MLResults = { + val isSingleNode = false + val ml = new MLContext(df.rdd.sparkContext) + val mcXin = new MatrixCharacteristics() + val Xin = RDDConverterUtils.vectorDataFrameToBinaryBlock(sc, df.asInstanceOf[DataFrame], mcXin, false, "features") + val yin = df.select("label") + val ret = getTrainingScript(isSingleNode) + val Xbin = new BinaryBlockMatrix(Xin, mcXin) + val script = ret._1.in(ret._2, Xbin).in(ret._3, yin) + ml.execute(script) + } +} + +trait BaseSystemMLRegressorModel extends BaseSystemMLEstimatorModel { + + def transform(X: MatrixBlock, mloutput: MLResults, sc: SparkContext, predictionVar:String): MatrixBlock = { + val isSingleNode = true + val ml = new MLContext(sc) + val script = getPredictionScript(mloutput, isSingleNode) + val modelPredict = ml.execute(script._1.in(script._2, X)) + val ret = modelPredict.getBinaryBlockMatrix(predictionVar).getMatrixBlock + + if(ret.getNumColumns != 1) { + throw new RuntimeException("Expected prediction to be a column vector") + } + return ret + } + + def transform(df: ScriptsUtils.SparkDataType, mloutput: MLResults, sc: SparkContext, predictionVar:String): DataFrame = { + val isSingleNode = false + val ml = new MLContext(sc) + val mcXin = new MatrixCharacteristics() + val Xin = RDDConverterUtils.vectorDataFrameToBinaryBlock(df.rdd.sparkContext, df.asInstanceOf[DataFrame], mcXin, false, "features") + val script = getPredictionScript(mloutput, isSingleNode) + val Xin_bin = new BinaryBlockMatrix(Xin, mcXin) + val modelPredict = ml.execute(script._1.in(script._2, Xin_bin)) + val predictedDF = modelPredict.getDataFrame(predictionVar).select("ID", "C1").withColumnRenamed("C1", "prediction") + val dataset = RDDConverterUtils.addIDToDataFrame(df.asInstanceOf[DataFrame], df.sqlContext, "ID") + return PredictionUtils.joinUsingID(dataset, predictedDF) + } +} \ No newline at end of file diff --git a/src/main/scala/org/apache/sysml/api/ml/LinearRegression.scala b/src/main/scala/org/apache/sysml/api/ml/LinearRegression.scala index 7f22f8f717b..cce646d76ea 100644 --- a/src/main/scala/org/apache/sysml/api/ml/LinearRegression.scala +++ b/src/main/scala/org/apache/sysml/api/ml/LinearRegression.scala @@ -19,17 +19,19 @@ package org.apache.sysml.api.ml +import org.apache.spark.rdd.RDD import java.io.File import org.apache.spark.SparkContext import org.apache.spark.ml.{ Model, Estimator } import org.apache.spark.sql.DataFrame import org.apache.spark.sql.types.StructType 
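// The hunk below rewrites LinearRegression against the new mlcontext API. As a hedged
// reference only (not part of this patch; `sc`, `X_mb` and `y_mb` are assumed to be an
// existing SparkContext and in-memory MatrixBlock inputs), the Script-based flow it
// adopts looks roughly like this, with parameter bindings abbreviated to mirror
// getTrainingScript below:
//
//   import org.apache.sysml.api.mlcontext._
//   import org.apache.sysml.api.mlcontext.ScriptFactory._
//
//   val ml = new MLContext(sc)
//   val script = dml(ScriptsUtils.getDMLScript(LinearRegression.scriptPathDS))
//     .in("$X", " ").in("$Y", " ").in("$B", " ").in("$fmt", "binary")
//     .in("X", X_mb).in("y", y_mb)      // bind in-memory inputs to the script variables
//     .out("beta_out")
//   val results: MLResults = ml.execute(script)
//   val beta: MatrixBlock = results.getBinaryBlockMatrix("beta_out").getMatrixBlock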
-import org.apache.spark.ml.param.ParamMap -import org.apache.sysml.api.{ MLContext, MLOutput } +import org.apache.spark.ml.param.{ Params, Param, ParamMap, DoubleParam } import org.apache.sysml.runtime.matrix.MatrixCharacteristics import org.apache.sysml.runtime.matrix.data.MatrixBlock import org.apache.sysml.runtime.DMLRuntimeException import org.apache.sysml.runtime.instructions.spark.utils.{ RDDConverterUtilsExt => RDDConverterUtils } +import org.apache.sysml.api.mlcontext._ +import org.apache.sysml.api.mlcontext.ScriptFactory._ object LinearRegression { final val scriptPathCG = "scripts" + File.separator + "algorithms" + File.separator + "LinearRegCG.dml" @@ -37,8 +39,9 @@ object LinearRegression { } // algorithm = "direct-solve", "conjugate-gradient" -class LinearRegression(override val uid: String, val sc: SparkContext, val solver:String="direct-solve") extends Estimator[LinearRegressionModel] with HasIcpt - with HasRegParam with HasTol with HasMaxOuterIter { +class LinearRegression(override val uid: String, val sc: SparkContext, val solver:String="direct-solve") + extends Estimator[LinearRegressionModel] with HasIcpt + with HasRegParam with HasTol with HasMaxOuterIter with BaseSystemMLRegressor { def setIcpt(value: Int) = set(icpt, value) def setMaxIter(value: Int) = set(maxOuterIter, value) @@ -49,97 +52,46 @@ class LinearRegression(override val uid: String, val sc: SparkContext, val solve val that = new LinearRegression(uid, sc, solver) copyValues(that, extra) } - def transformSchema(schema: StructType): StructType = schema - def fit(X_mb: MatrixBlock, y_mb: MatrixBlock): LinearRegressionModel = { - val ml = new MLContext(sc) - if(y_mb.getNumColumns != 1) { - throw new RuntimeException("Expected a column vector for y") - } - val mloutput = { - ml.registerInput("X", X_mb); - ml.registerInput("y", y_mb); - ml.registerOutput("beta_out"); - if(solver.compareTo("direct-solve") == 0) - ml.executeScript(ScriptsUtils.getDMLScript(LinearRegression.scriptPathDS), getParamMap()) - else if(solver.compareTo("newton-cg") == 0) { - ml.executeScript(ScriptsUtils.getDMLScript(LinearRegression.scriptPathCG), getParamMap()) - } - else { - throw new DMLRuntimeException("The algorithm should be direct-solve or conjugate-gradient") - } - } - new LinearRegressionModel("linearRegression")(mloutput, sc) + + def getTrainingScript(isSingleNode:Boolean):(Script, String, String) = { + val script = dml(ScriptsUtils.getDMLScript( + if(solver.compareTo("direct-solve") == 0) LinearRegression.scriptPathDS + else if(solver.compareTo("newton-cg") == 0) LinearRegression.scriptPathCG + else throw new DMLRuntimeException("The algorithm should be direct-solve or newton-cg"))) + .in("$X", " ") + .in("$Y", " ") + .in("$B", " ") + .in("$Log", " ") + .in("$fmt", "binary") + .in("$icpt", toDouble(getIcpt)) + .in("$reg", toDouble(getRegParam)) + .in("$tol", toDouble(getTol)) + .in("$maxi", toDouble(getMaxOuterIte)) + .out("beta_out") + (script, "X", "y") } - def getParamMap(): Map[String, String] = { - Map( "icpt" -> this.getIcpt.toString(), - "reg" -> this.getRegParam.toString(), - "tol" -> this.getTol.toString, - "maxi" -> this.getMaxOuterIte.toString, + def fit(X_mb: MatrixBlock, y_mb: MatrixBlock): LinearRegressionModel = + new LinearRegressionModel("lr")(fit(X_mb, y_mb, sc), sc) + + def fit(df: ScriptsUtils.SparkDataType): LinearRegressionModel = + new LinearRegressionModel("lr")(fit(df, sc), sc) - "X" -> " ", - "Y" -> " ", - "B" -> " ", - "O" -> " ", - "Log" -> " ", - "fmt" -> "binary") - } - - def fit(df: DataFrame): 
LinearRegressionModel = { - val ml = new MLContext(df.rdd.sparkContext) - val mcXin = new MatrixCharacteristics() - val Xin = RDDConverterUtils.vectorDataFrameToBinaryBlock(sc, df, mcXin, false, "features") - val yin = df.select("label") - val mloutput = { - ml.registerInput("X", Xin, mcXin); - ml.registerInput("y", yin); - ml.registerOutput("beta_out"); - if(solver.compareTo("direct-solve") == 0) - ml.executeScript(ScriptsUtils.getDMLScript(LinearRegression.scriptPathDS), getParamMap()) - else if(solver.compareTo("newton-cg") == 0) { - ml.executeScript(ScriptsUtils.getDMLScript(LinearRegression.scriptPathCG), getParamMap()) - } - else { - throw new DMLRuntimeException("The algorithm should be direct-solve or conjugate-gradient") - } - } - new LinearRegressionModel("linearRegression")(mloutput, sc) - } } -class LinearRegressionModel(override val uid: String)(val mloutput: MLOutput, val sc: SparkContext) extends Model[LinearRegressionModel] with HasIcpt - with HasRegParam with HasTol with HasMaxOuterIter { +class LinearRegressionModel(override val uid: String)(val mloutput: MLResults, val sc: SparkContext) extends Model[LinearRegressionModel] with HasIcpt + with HasRegParam with HasTol with HasMaxOuterIter with BaseSystemMLRegressorModel { override def copy(extra: ParamMap): LinearRegressionModel = { val that = new LinearRegressionModel(uid)(mloutput, sc) copyValues(that, extra) } - override def transformSchema(schema: StructType): StructType = schema + def getPredictionScript(mloutput: MLResults, isSingleNode:Boolean): (Script, String) = + PredictionUtils.getGLMPredictionScript(mloutput.getBinaryBlockMatrix("beta_out"), isSingleNode) - def transform(df: DataFrame): DataFrame = { - val isSingleNode = false - val glmPredOut = PredictionUtils.doGLMPredict(isSingleNode, df, null, sc, mloutput, "beta_out", getPredictParams()) - val predictedDF = glmPredOut.getDF(df.sqlContext, "means").withColumnRenamed("C1", "prediction") - val dataset = RDDConverterUtils.addIDToDataFrame(df, df.sqlContext, "ID") - return PredictionUtils.joinUsingID(dataset, predictedDF) - } - - def transform(X: MatrixBlock): MatrixBlock = { - val isSingleNode = true - return PredictionUtils.doGLMPredict(isSingleNode, null, X, sc, mloutput, "beta_out", getPredictParams()).getMatrixBlock("means") - } + def transform(df: ScriptsUtils.SparkDataType): DataFrame = transform(df, mloutput, sc, "means") + def transform(X: MatrixBlock): MatrixBlock = transform(X, mloutput, sc, "means") - def getPredictParams(): Map[String, String] = { - Map("X" -> " ", - "B" -> " ", - // Gaussian distribution - "dfam" -> "1", "vpow" -> "0.0", - // identity link function - "link" -> "1", "lpow" -> "1.0" -// // Dispersion value: TODO -// ,"disp" -> "5.0" - ) - } } \ No newline at end of file diff --git a/src/main/scala/org/apache/sysml/api/ml/LogisticRegression.scala b/src/main/scala/org/apache/sysml/api/ml/LogisticRegression.scala index 3098da9c21d..a9ca6ab188c 100644 --- a/src/main/scala/org/apache/sysml/api/ml/LogisticRegression.scala +++ b/src/main/scala/org/apache/sysml/api/ml/LogisticRegression.scala @@ -60,7 +60,7 @@ class LogisticRegression(override val uid: String, val sc: SparkContext) extends new LogisticRegressionModel("log")(ret._1, ret._2, sc) } - def fit(df: DataFrame): LogisticRegressionModel = { + def fit(df: ScriptsUtils.SparkDataType): LogisticRegressionModel = { val ret = fit(df, sc) new LogisticRegressionModel("log")(ret._1, ret._2, sc) } @@ -100,10 +100,10 @@ class LogisticRegressionModel(override val uid: String)( def 
setOutputRawPredictions(outRawPred:Boolean): Unit = { outputRawPredictions = outRawPred } def getPredictionScript(mloutput: MLResults, isSingleNode:Boolean): (Script, String) = - PredictionUtils.getGLMPredictionScript(mloutput.getBinaryBlockMatrix("B_out"), isSingleNode) + PredictionUtils.getGLMPredictionScript(mloutput.getBinaryBlockMatrix("B_out"), isSingleNode, 3) def transform(X: MatrixBlock): MatrixBlock = transform(X, mloutput, labelMapping, sc, "means") - def transform(df: DataFrame): DataFrame = transform(df, mloutput, labelMapping, sc, "means") + def transform(df: ScriptsUtils.SparkDataType): DataFrame = transform(df, mloutput, labelMapping, sc, "means") } /** diff --git a/src/main/scala/org/apache/sysml/api/ml/NaiveBayes.scala b/src/main/scala/org/apache/sysml/api/ml/NaiveBayes.scala index 28836221582..fd05f27828c 100644 --- a/src/main/scala/org/apache/sysml/api/ml/NaiveBayes.scala +++ b/src/main/scala/org/apache/sysml/api/ml/NaiveBayes.scala @@ -50,7 +50,7 @@ class NaiveBayes(override val uid: String, val sc: SparkContext) extends Estimat new NaiveBayesModel("naive")(ret._1, ret._2, sc) } - def fit(df: DataFrame): NaiveBayesModel = { + def fit(df: ScriptsUtils.SparkDataType): NaiveBayesModel = { val ret = fit(df, sc) new NaiveBayesModel("naive")(ret._1, ret._2, sc) } @@ -104,6 +104,6 @@ class NaiveBayesModel(override val uid: String) } def transform(X: MatrixBlock): MatrixBlock = transform(X, mloutput, labelMapping, sc, "probs") - def transform(df: DataFrame): DataFrame = transform(df, mloutput, labelMapping, sc, "probs") + def transform(df: ScriptsUtils.SparkDataType): DataFrame = transform(df, mloutput, labelMapping, sc, "probs") } \ No newline at end of file diff --git a/src/main/scala/org/apache/sysml/api/ml/PredictionUtils.scala b/src/main/scala/org/apache/sysml/api/ml/PredictionUtils.scala index f91a82cadc2..8e3893d578b 100644 --- a/src/main/scala/org/apache/sysml/api/ml/PredictionUtils.scala +++ b/src/main/scala/org/apache/sysml/api/ml/PredictionUtils.scala @@ -21,7 +21,6 @@ package org.apache.sysml.api.ml import org.apache.spark.sql.functions.udf import org.apache.spark.rdd.RDD -import org.apache.sysml.api.{ MLContext, MLOutput } import org.apache.spark.sql.DataFrame import org.apache.spark.SparkContext import org.apache.sysml.runtime.matrix.data.MatrixBlock @@ -35,11 +34,11 @@ import org.apache.sysml.api.mlcontext.BinaryBlockMatrix object PredictionUtils { - def getGLMPredictionScript(B_full: BinaryBlockMatrix, isSingleNode:Boolean): (Script, String) = { + def getGLMPredictionScript(B_full: BinaryBlockMatrix, isSingleNode:Boolean, dfam:java.lang.Integer=1): (Script, String) = { val script = dml(ScriptsUtils.getDMLScript(LogisticRegressionModel.scriptPath)) .in("$X", " ") .in("$B", " ") - .in("$dfam", "3") + .in("$dfam", dfam) .out("means") val ret = if(isSingleNode) { script.in("B_full", B_full.getMatrixBlock, B_full.getMatrixMetadata) @@ -50,23 +49,7 @@ object PredictionUtils { (ret, "X") } - def doGLMPredict(isSingleNode:Boolean, df:DataFrame, X: MatrixBlock, sc:SparkContext, mloutput:MLOutput, B:String, paramsMap: Map[String, String]): MLOutput = { - val ml = new MLContext(sc) - if(isSingleNode) { - ml.registerInput("X", X); - ml.registerInput("B_full", mloutput.getMatrixBlock(B), mloutput.getMatrixCharacteristics(B)); - } - else { - val mcXin = new MatrixCharacteristics() - val Xin = RDDConverterUtils.vectorDataFrameToBinaryBlock(df.rdd.sparkContext, df, mcXin, false, "features") - ml.registerInput("X", Xin, mcXin); - ml.registerInput("B_full", 
mloutput.getBinaryBlockedRDD(B), mloutput.getMatrixCharacteristics(B)); - } - ml.registerOutput("means"); - ml.executeScript(ScriptsUtils.getDMLScript(LogisticRegressionModel.scriptPath), paramsMap) - } - - def fillLabelMapping(df: DataFrame, revLabelMapping: java.util.HashMap[Int, String]): RDD[String] = { + def fillLabelMapping(df: ScriptsUtils.SparkDataType, revLabelMapping: java.util.HashMap[Int, String]): RDD[String] = { val temp = df.select("label").distinct.rdd.map(_.apply(0).toString).collect() val labelMapping = new java.util.HashMap[String, Int] for(i <- 0 until temp.length) { @@ -152,23 +135,6 @@ object PredictionUtils { tempDF1.join(df2, tempDF1.col("ID1").equalTo(df2.col("ID"))).drop("ID1") } - def computePredictedClassLabelsFromProbability(mlscoreoutput:MLOutput, isSingleNode:Boolean, sc:SparkContext, inProbVar:String): MLOutput = { - val mlNew = new MLContext(sc) - if(isSingleNode) { - mlNew.registerInput("Prob", mlscoreoutput.getMatrixBlock(inProbVar), mlscoreoutput.getMatrixCharacteristics(inProbVar)); - } - else { - mlNew.registerInput("Prob", mlscoreoutput.getBinaryBlockedRDD(inProbVar), mlscoreoutput.getMatrixCharacteristics(inProbVar)); - } - mlNew.registerOutput("Prediction") - mlNew.executeScript( - """ - Prob = read("temp1"); - Prediction = rowIndexMax(Prob); # assuming one-based label mapping - write(Prediction, "tempOut", "csv"); - """) - } - def computePredictedClassLabelsFromProbability(mlscoreoutput:MLResults, isSingleNode:Boolean, sc:SparkContext, inProbVar:String): MLResults = { val ml = new org.apache.sysml.api.mlcontext.MLContext(sc) val script = dml( diff --git a/src/main/scala/org/apache/sysml/api/ml/SVM.scala b/src/main/scala/org/apache/sysml/api/ml/SVM.scala index 93e91ec4660..07a7283a80d 100644 --- a/src/main/scala/org/apache/sysml/api/ml/SVM.scala +++ b/src/main/scala/org/apache/sysml/api/ml/SVM.scala @@ -71,7 +71,7 @@ class SVM (override val uid: String, val sc: SparkContext, val isMultiClass:Bool new SVMModel("svm")(ret._1, sc, isMultiClass, ret._2) } - def fit(df: DataFrame): SVMModel = { + def fit(df: ScriptsUtils.SparkDataType): SVMModel = { val ret = fit(df, sc) new SVMModel("svm")(ret._1, sc, isMultiClass, ret._2) } @@ -109,5 +109,5 @@ class SVMModel (override val uid: String)(val mloutput: MLResults, val sc: Spark } def transform(X: MatrixBlock): MatrixBlock = transform(X, mloutput, labelMapping, sc, "scores") - def transform(df: DataFrame): DataFrame = transform(df, mloutput, labelMapping, sc, "scores") + def transform(df: ScriptsUtils.SparkDataType): DataFrame = transform(df, mloutput, labelMapping, sc, "scores") } \ No newline at end of file diff --git a/src/main/scala/org/apache/sysml/api/ml/ScriptsUtils.scala b/src/main/scala/org/apache/sysml/api/ml/ScriptsUtils.scala index fdf682d2b7b..10f9d33bf32 100644 --- a/src/main/scala/org/apache/sysml/api/ml/ScriptsUtils.scala +++ b/src/main/scala/org/apache/sysml/api/ml/ScriptsUtils.scala @@ -26,6 +26,8 @@ import org.apache.sysml.runtime.DMLRuntimeException object ScriptsUtils { var systemmlHome = System.getenv("SYSTEMML_HOME") + + type SparkDataType = org.apache.spark.sql.DataFrame // org.apache.spark.sql.Dataset[_] /** * set SystemML home
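// End-to-end, the estimators migrated in this patch series are meant to be driven as
// sketched below. This is a hedged usage sketch, not part of the patch itself; `sc`,
// `X_mb`, `y_mb` and `X_test_mb` stand for an existing SparkContext and user-provided
// MatrixBlock inputs.
//
//   import org.apache.sysml.api.ml.{ NaiveBayes, NaiveBayesModel }
//   import org.apache.sysml.runtime.matrix.data.MatrixBlock
//
//   val nb = new NaiveBayes("naive-bayes", sc).setLaplace(1.0)
//   val model: NaiveBayesModel = nb.fit(X_mb, y_mb)           // single-node path (MatrixBlock in/out)
//   val predictions: MatrixBlock = model.transform(X_test_mb) // column vector of predicted labels
//
//   // DataFrame path: the input DataFrame must expose 'features' (Vector) and 'label'
//   // columns, mirroring the checks in the Python mllearn wrappers above:
//   // val model2 = nb.fit(df); val scored = model2.transform(dfTest)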