From f503890330687bbb2207ab92517eec3a35ab7c6d Mon Sep 17 00:00:00 2001
From: Mike Dusenberry
Date: Fri, 30 Oct 2015 14:35:15 -0700
Subject: [PATCH 1/8] Adding remaining methods to PySpark BlockMatrix: cache, persist, validate, transpose.

---
 python/pyspark/mllib/linalg/distributed.py | 41 ++++++++++++++++++++++
 1 file changed, 41 insertions(+)

diff --git a/python/pyspark/mllib/linalg/distributed.py b/python/pyspark/mllib/linalg/distributed.py
index 43cb0beef1bd3..de34cadb4084e 100644
--- a/python/pyspark/mllib/linalg/distributed.py
+++ b/python/pyspark/mllib/linalg/distributed.py
@@ -29,6 +29,7 @@
 from pyspark import RDD
 from pyspark.mllib.common import callMLlibFunc, JavaModelWrapper
 from pyspark.mllib.linalg import _convert_to_vector, Matrix
+from pyspark.storagelevel import StorageLevel
 
 __all__ = ['DistributedMatrix', 'RowMatrix', 'IndexedRow',
@@ -789,6 +790,30 @@ def numCols(self):
         """
         return self._java_matrix_wrapper.call("numCols")
 
+    def cache(self):
+        """
+        Caches the underlying RDD.
+        """
+        self._java_matrix_wrapper.call("cache")
+        return self
+
+    def persist(self, storageLevel):
+        """
+        Persists the underlying RDD with the specified storage level.
+        """
+        if not isinstance(storageLevel, StorageLevel):
+            raise TypeError("`storageLevel` should be a StorageLevel, got %s" % type(storageLevel))
+        javaStorageLevel = self._java_matrix_wrapper._sc._getJavaStorageLevel(storageLevel)
+        self._java_matrix_wrapper.call("persist", javaStorageLevel)
+        return self
+
+    def validate(self):
+        """
+        Validates the block matrix info against the matrix data (`blocks`)
+        and throws an exception if any error is found.
+        """
+        self._java_matrix_wrapper.call("validate")
+
     def add(self, other):
         """
         Adds two block matrices together. The matrices must have the
@@ -857,6 +882,22 @@ def multiply(self, other):
         java_block_matrix = self._java_matrix_wrapper.call("multiply", other_java_block_matrix)
         return BlockMatrix(java_block_matrix, self.rowsPerBlock, self.colsPerBlock)
 
+    def transpose(self):
+        """
+        Transpose this BlockMatrix. Returns a new BlockMatrix
+        instance sharing the same underlying data. This is a lazy operation.
+
+        >>> blocks = sc.parallelize([((0, 0), Matrices.dense(3, 2, [1, 2, 3, 4, 5, 6])),
+        ...                          ((1, 0), Matrices.dense(3, 2, [7, 8, 9, 10, 11, 12]))])
+        >>> mat = BlockMatrix(blocks, 3, 2)
+
+        >>> mat_transposed = mat.transpose()
+        >>> mat_transposed.toLocalMatrix()
+        DenseMatrix(2, 6, [1.0, 4.0, 2.0, 5.0, 3.0, 6.0, 7.0, 10.0, 8.0, 11.0, 9.0, 12.0], 0)
+        """
+        java_transposed_matrix = self._java_matrix_wrapper.call("transpose")
+        return BlockMatrix(java_transposed_matrix, self.colsPerBlock, self.rowsPerBlock)
+
     def toLocalMatrix(self):
         """
         Collect the distributed matrix on the driver as a DenseMatrix.
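A minimal usage sketch for the BlockMatrix methods added in the patch above (illustrative only, not part of the patch; it assumes a live SparkContext `sc`, and the matrix contents are arbitrary):

    from pyspark.mllib.linalg import Matrices
    from pyspark.mllib.linalg.distributed import BlockMatrix
    from pyspark.storagelevel import StorageLevel

    blocks = sc.parallelize([((0, 0), Matrices.dense(3, 2, [1, 2, 3, 4, 5, 6])),
                             ((1, 0), Matrices.dense(3, 2, [7, 8, 9, 10, 11, 12]))])
    mat = BlockMatrix(blocks, 3, 2)

    mat.validate()                             # raises an exception if block info is inconsistent
    mat.persist(StorageLevel.MEMORY_AND_DISK)  # cache()/persist() return self, so calls chain
    transposed = mat.transpose()               # lazy: no job runs until an action is triggered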
From edf3e45ea0492b29a4d1aeef452cc87c00067725 Mon Sep 17 00:00:00 2001
From: Mike Dusenberry
Date: Fri, 30 Oct 2015 14:56:19 -0700
Subject: [PATCH 2/8] Adding remaining method to PySpark CoordinateMatrix: transpose.

---
 python/pyspark/mllib/linalg/distributed.py | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)

diff --git a/python/pyspark/mllib/linalg/distributed.py b/python/pyspark/mllib/linalg/distributed.py
index de34cadb4084e..e2883367fefe0 100644
--- a/python/pyspark/mllib/linalg/distributed.py
+++ b/python/pyspark/mllib/linalg/distributed.py
@@ -515,6 +515,25 @@ def numCols(self):
         """
         return self._java_matrix_wrapper.call("numCols")
 
+    def transpose(self):
+        """
+        Transpose this CoordinateMatrix.
+
+        >>> entries = sc.parallelize([MatrixEntry(0, 0, 1.2),
+        ...                           MatrixEntry(1, 0, 2),
+        ...                           MatrixEntry(2, 1, 3.7)])
+        >>> mat = CoordinateMatrix(entries)
+        >>> mat_transposed = mat.transpose()
+
+        >>> print(mat_transposed.numRows())
+        2
+
+        >>> print(mat_transposed.numCols())
+        3
+        """
+        java_transposed_matrix = self._java_matrix_wrapper.call("transpose")
+        return CoordinateMatrix(java_transposed_matrix)
+
     def toRowMatrix(self):
         """
         Convert this matrix to a RowMatrix.

From e1532790e80a6b1032c7df7548c6db95167d7f56 Mon Sep 17 00:00:00 2001
From: Mike Dusenberry
Date: Fri, 30 Oct 2015 15:27:37 -0700
Subject: [PATCH 3/8] Adding remaining method to PySpark IndexedRowMatrix: computeGramianMatrix. Note that 'multiply' and 'computeSVD' are part of the SPARK-6227 PR.

---
 python/pyspark/mllib/linalg/distributed.py | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/python/pyspark/mllib/linalg/distributed.py b/python/pyspark/mllib/linalg/distributed.py
index e2883367fefe0..78f7e739b343f 100644
--- a/python/pyspark/mllib/linalg/distributed.py
+++ b/python/pyspark/mllib/linalg/distributed.py
@@ -312,6 +312,20 @@ def columnSimilarities(self):
         java_coordinate_matrix = self._java_matrix_wrapper.call("columnSimilarities")
         return CoordinateMatrix(java_coordinate_matrix)
 
+    def computeGramianMatrix(self):
+        """
+        Computes the Gramian matrix `A^T A`. Note that this cannot be
+        computed on matrices with more than 65535 columns.
+
+        >>> rows = sc.parallelize([IndexedRow(0, [1, 2, 3]),
+        ...                        IndexedRow(1, [4, 5, 6])])
+        >>> mat = IndexedRowMatrix(rows)
+
+        >>> mat.computeGramianMatrix()
+        DenseMatrix(3, 3, [17.0, 22.0, 27.0, 22.0, 29.0, 36.0, 27.0, 36.0, 45.0], 0)
+        """
+        return self._java_matrix_wrapper.call("computeGramianMatrix")
+
     def toRowMatrix(self):
         """
         Convert this matrix to a RowMatrix.
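As a sanity check on the computeGramianMatrix doctest above, the Gramian `A^T A` of the two rows can be reproduced locally with NumPy (a sketch for small matrices only; the distributed method exists precisely so `A` never has to be collected):

    import numpy as np

    A = np.array([[1.0, 2.0, 3.0],
                  [4.0, 5.0, 6.0]])
    print(A.T.dot(A))
    # [[ 17.  22.  27.]
    #  [ 22.  29.  36.]
    #  [ 27.  36.  45.]]

Since the Gramian is symmetric, its column-major flattening matches the values in the DenseMatrix doctest output.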
From 587bea5e5550fa4bcaee61c06dfdad9a935fb092 Mon Sep 17 00:00:00 2001
From: Mike Dusenberry
Date: Mon, 2 Nov 2015 13:53:15 -0800
Subject: [PATCH 4/8] Adding remaining methods to PySpark RowMatrix: computeGramianMatrix, computeCovariance, computeColumnSummaryStatistics, columnSimilarities, tallSkinnyQR.

---
 python/pyspark/mllib/linalg/__init__.py    |  24 ++++
 python/pyspark/mllib/linalg/distributed.py | 145 ++++++++++++++++++++-
 2 files changed, 168 insertions(+), 1 deletion(-)

diff --git a/python/pyspark/mllib/linalg/__init__.py b/python/pyspark/mllib/linalg/__init__.py
index 4cd7306edb11b..610afc5e052e7 100644
--- a/python/pyspark/mllib/linalg/__init__.py
+++ b/python/pyspark/mllib/linalg/__init__.py
@@ -1235,6 +1235,30 @@ def sparse(numRows, numCols, colPtrs, rowIndices, values):
         return SparseMatrix(numRows, numCols, colPtrs, rowIndices, values)
 
 
+class QRDecomposition(object):
+    """
+    Represents QR factors.
+    """
+    def __init__(self, Q, R):
+        self._Q = Q
+        self._R = R
+
+    @property
+    def Q(self):
+        """
+        An orthogonal matrix Q in a QR decomposition.
+        May be None if not computed.
+        """
+        return self._Q
+
+    @property
+    def R(self):
+        """
+        An upper triangular matrix R in a QR decomposition.
+        """
+        return self._R
+
+
 def _test():
     import doctest
     (failure_count, test_count) = doctest.testmod(optionflags=doctest.ELLIPSIS)
diff --git a/python/pyspark/mllib/linalg/distributed.py b/python/pyspark/mllib/linalg/distributed.py
index 78f7e739b343f..0db2682a6ba28 100644
--- a/python/pyspark/mllib/linalg/distributed.py
+++ b/python/pyspark/mllib/linalg/distributed.py
@@ -28,7 +28,8 @@
 
 from pyspark import RDD
 from pyspark.mllib.common import callMLlibFunc, JavaModelWrapper
-from pyspark.mllib.linalg import _convert_to_vector, Matrix
+from pyspark.mllib.linalg import _convert_to_vector, Matrix, QRDecomposition
+from pyspark.mllib.stat import MultivariateStatisticalSummary
 from pyspark.storagelevel import StorageLevel
 
 
@@ -152,6 +153,148 @@ def numCols(self):
         """
         return self._java_matrix_wrapper.call("numCols")
 
+    def computeColumnSummaryStatistics(self):
+        """
+        Computes column-wise summary statistics.
+
+        :return: :class:`MultivariateStatisticalSummary` object
+                 containing column-wise summary statistics.
+
+        >>> rows = sc.parallelize([[1, 2, 3], [4, 5, 6]])
+        >>> mat = RowMatrix(rows)
+
+        >>> colStats = mat.computeColumnSummaryStatistics()
+        >>> colStats.mean()
+        array([ 2.5, 3.5, 4.5])
+        """
+        java_col_stats = self._java_matrix_wrapper.call("computeColumnSummaryStatistics")
+        return MultivariateStatisticalSummary(java_col_stats)
+
+    def computeCovariance(self):
+        """
+        Computes the covariance matrix, treating each row as an
+        observation. Note that this cannot be computed on matrices
+        with more than 65535 columns.
+
+        >>> rows = sc.parallelize([[1, 2], [2, 1]])
+        >>> mat = RowMatrix(rows)
+
+        >>> mat.computeCovariance()
+        DenseMatrix(2, 2, [0.5, -0.5, -0.5, 0.5], 0)
+        """
+        return self._java_matrix_wrapper.call("computeCovariance")
+
+    def computeGramianMatrix(self):
+        """
+        Computes the Gramian matrix `A^T A`. Note that this cannot be
+        computed on matrices with more than 65535 columns.
+
+        >>> rows = sc.parallelize([[1, 2, 3], [4, 5, 6]])
+        >>> mat = RowMatrix(rows)
+
+        >>> mat.computeGramianMatrix()
+        DenseMatrix(3, 3, [17.0, 22.0, 27.0, 22.0, 29.0, 36.0, 27.0, 36.0, 45.0], 0)
+        """
+        return self._java_matrix_wrapper.call("computeGramianMatrix")
+
+    def columnSimilarities(self, threshold=0.0):
+        """
+        Compute similarities between columns of this matrix.
+
+        The threshold parameter is a trade-off knob between estimate
+        quality and computational cost.
+
+        The default threshold setting of 0 guarantees deterministically
+        correct results, but uses the brute-force approach of computing
+        normalized dot products.
+
+        Setting the threshold to positive values uses a sampling
+        approach and incurs strictly less computational cost than the
+        brute-force approach. However, the similarities computed will
+        be estimates.
+
+        The sampling guarantees relative-error correctness for those
+        pairs of columns that have similarity greater than the given
+        similarity threshold.
+
+        To describe the guarantee, we set some notation:
+        * Let A be the smallest in magnitude non-zero element of
+          this matrix.
+        * Let B be the largest in magnitude non-zero element of
+          this matrix.
+        * Let L be the maximum number of non-zeros per row.
+
+        For example, for {0,1} matrices: A=B=1.
+        Another example, for the Netflix matrix: A=1, B=5.
+
+        For those column pairs that are above the threshold, the
+        computed similarity is correct to within 20% relative error
+        with probability at least 1 - (0.981)^(10/B).
+
+        The shuffle size is bounded by the *smaller* of the following
+        two expressions:
+
+        * O(n log(n) L / (threshold * A))
+        * O(m L^2)
+
+        The latter is the cost of the brute-force approach, so for
+        non-zero thresholds, the cost is always cheaper than the
+        brute-force approach.
+
+        :param threshold: Set to 0 for deterministic guaranteed
+                          correctness. Similarities above this
+                          threshold are estimated with the cost vs
+                          estimate quality trade-off described above.
+        :return: An n x n sparse upper-triangular CoordinateMatrix of
+                 cosine similarities between columns of this matrix.
+
+        >>> rows = sc.parallelize([[1, 2], [1, 5]])
+        >>> mat = RowMatrix(rows)
+
+        >>> sims = mat.columnSimilarities()
+        >>> round(sims.entries.first().value, 12)
+        0.919145030018
+        """
+        java_sims_mat = self._java_matrix_wrapper.call("columnSimilarities", float(threshold))
+        return CoordinateMatrix(java_sims_mat)
+
+    def tallSkinnyQR(self, computeQ=False):
+        """
+        Compute the QR decomposition of this RowMatrix.
+
+        The implementation is designed to optimize the QR decomposition
+        (factorization) for the RowMatrix of a tall and skinny shape.
+
+        Reference:
+        Paul G. Constantine, David F. Gleich. "Tall and skinny QR
+        factorizations in MapReduce architectures"
+        (http://dx.doi.org/10.1145/1996092.1996103)
+
+        :param computeQ: whether to compute Q
+        :return: QRDecomposition(Q: RowMatrix, R: Matrix), where
+                 Q = None if computeQ is False.
+
+        >>> rows = sc.parallelize([[3, -6], [4, -8], [0, 1]])
+        >>> mat = RowMatrix(rows)
+        >>> decomp = mat.tallSkinnyQR(True)
+
+        >>> # Test with absolute values
+        >>> decomp.Q.rows.map(lambda row: abs(row.toArray()).tolist()).collect()
+        [[0.6..., 0.0], [0.8..., 0.0], [0.0, 1.0]]
+
+        >>> # Test with absolute values
+        >>> abs(decomp.R.toArray()).tolist()
+        [[5.0, 10.0], [0.0, 1.0]]
+        """
+        decomp = JavaModelWrapper(self._java_matrix_wrapper.call("tallSkinnyQR", computeQ))
+        if computeQ:
+            java_Q = decomp.call("Q")
+            Q = RowMatrix(java_Q)
+        else:
+            Q = None
+        R = decomp.call("R")
+        return QRDecomposition(Q, R)
+
 
 class IndexedRow(object):
     """
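For reference, the similarity value in the columnSimilarities doctest above is just the cosine of the two columns [1, 1] and [2, 5] of the input; a local NumPy sketch:

    import numpy as np

    c1 = np.array([1.0, 1.0])  # first column of [[1, 2], [1, 5]]
    c2 = np.array([2.0, 5.0])  # second column
    print(c1.dot(c2) / (np.linalg.norm(c1) * np.linalg.norm(c2)))
    # 0.9191450300180578, i.e. 7 / sqrt(58)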
""" java_sims_mat = self._java_matrix_wrapper.call("columnSimilarities", float(threshold)) return CoordinateMatrix(java_sims_mat) @@ -277,13 +277,16 @@ def tallSkinnyQR(self, computeQ=False): >>> rows = sc.parallelize([[3, -6], [4, -8], [0, 1]]) >>> mat = RowMatrix(rows) >>> decomp = mat.tallSkinnyQR(True) + >>> Q = decomp.Q + >>> R = decomp.R >>> # Test with absolute values - >>> decomp.Q.rows.map(lambda row: abs(row.toArray()).tolist()).collect() + >>> absQRows = Q.rows.map(lambda row: abs(row.toArray()).tolist()) + >>> absQRows.collect() [[0.6..., 0.0], [0.8..., 0.0], [0.0, 1.0]] >>> # Test with absolute values - >>> abs(decomp.R.toArray()).tolist() + >>> abs(R.toArray()).tolist() [[5.0, 10.0], [0.0, 1.0]] """ decomp = JavaModelWrapper(self._java_matrix_wrapper.call("tallSkinnyQR", computeQ)) From fca41ca4df8f04389dc9ce87de7a2af591476e2e Mon Sep 17 00:00:00 2001 From: Mike Dusenberry Date: Tue, 3 Nov 2015 12:20:28 -0800 Subject: [PATCH 6/8] Adding experimental tag to QRDecomposition. --- python/pyspark/mllib/linalg/__init__.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/python/pyspark/mllib/linalg/__init__.py b/python/pyspark/mllib/linalg/__init__.py index 610afc5e052e7..b724ed88f3d9e 100644 --- a/python/pyspark/mllib/linalg/__init__.py +++ b/python/pyspark/mllib/linalg/__init__.py @@ -43,7 +43,8 @@ __all__ = ['Vector', 'DenseVector', 'SparseVector', 'Vectors', - 'Matrix', 'DenseMatrix', 'SparseMatrix', 'Matrices'] + 'Matrix', 'DenseMatrix', 'SparseMatrix', 'Matrices', + 'QRDecomposition'] if sys.version_info[:2] == (2, 7): @@ -1237,6 +1238,8 @@ def sparse(numRows, numCols, colPtrs, rowIndices, values): class QRDecomposition(object): """ + .. note:: Experimental + Represents QR factors. """ def __init__(self, Q, R): From f1410bf63f99bbb3b46ec8da03406b3bd1768783 Mon Sep 17 00:00:00 2001 From: Mike Dusenberry Date: Thu, 21 Apr 2016 11:02:46 -0700 Subject: [PATCH 7/8] Adding @since annotations, and adding a comment to Scala. --- .../mllib/linalg/distributed/IndexedRowMatrix.scala | 3 ++- python/pyspark/mllib/linalg/__init__.py | 3 +++ python/pyspark/mllib/linalg/distributed.py | 13 ++++++++++++- 3 files changed, 17 insertions(+), 2 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/IndexedRowMatrix.scala b/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/IndexedRowMatrix.scala index 06b9c4ac67bb0..b03b3ecde94f4 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/IndexedRowMatrix.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/IndexedRowMatrix.scala @@ -188,7 +188,8 @@ class IndexedRowMatrix @Since("1.0.0") ( } /** - * Computes the Gramian matrix `A^T A`. + * Computes the Gramian matrix `A^T A`. Note that this cannot be + * computed on matrices with more than 65535 columns. */ @Since("1.0.0") def computeGramianMatrix(): Matrix = { diff --git a/python/pyspark/mllib/linalg/__init__.py b/python/pyspark/mllib/linalg/__init__.py index b724ed88f3d9e..70509a6d9bece 100644 --- a/python/pyspark/mllib/linalg/__init__.py +++ b/python/pyspark/mllib/linalg/__init__.py @@ -38,6 +38,7 @@ import numpy as np +from pyspark import since from pyspark.sql.types import UserDefinedType, StructField, StructType, ArrayType, DoubleType, \ IntegerType, ByteType, BooleanType @@ -1247,6 +1248,7 @@ def __init__(self, Q, R): self._R = R @property + @since('2.0.0') def Q(self): """ An orthogonal matrix Q in a QR decomposition. 
From f1410bf63f99bbb3b46ec8da03406b3bd1768783 Mon Sep 17 00:00:00 2001
From: Mike Dusenberry
Date: Thu, 21 Apr 2016 11:02:46 -0700
Subject: [PATCH 7/8] Adding @since annotations, and adding a comment to Scala.

---
 .../mllib/linalg/distributed/IndexedRowMatrix.scala |  3 ++-
 python/pyspark/mllib/linalg/__init__.py             |  3 +++
 python/pyspark/mllib/linalg/distributed.py          | 13 ++++++++++++-
 3 files changed, 17 insertions(+), 2 deletions(-)

diff --git a/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/IndexedRowMatrix.scala b/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/IndexedRowMatrix.scala
index 06b9c4ac67bb0..b03b3ecde94f4 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/IndexedRowMatrix.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/IndexedRowMatrix.scala
@@ -188,7 +188,8 @@ class IndexedRowMatrix @Since("1.0.0") (
   }
 
   /**
-   * Computes the Gramian matrix `A^T A`.
+   * Computes the Gramian matrix `A^T A`. Note that this cannot be
+   * computed on matrices with more than 65535 columns.
    */
   @Since("1.0.0")
   def computeGramianMatrix(): Matrix = {
diff --git a/python/pyspark/mllib/linalg/__init__.py b/python/pyspark/mllib/linalg/__init__.py
index b724ed88f3d9e..70509a6d9bece 100644
--- a/python/pyspark/mllib/linalg/__init__.py
+++ b/python/pyspark/mllib/linalg/__init__.py
@@ -38,6 +38,7 @@
 
 import numpy as np
 
+from pyspark import since
 from pyspark.sql.types import UserDefinedType, StructField, StructType, ArrayType, DoubleType, \
     IntegerType, ByteType, BooleanType
 
@@ -1247,6 +1248,7 @@ def __init__(self, Q, R):
         self._R = R
 
     @property
+    @since('2.0.0')
     def Q(self):
         """
         An orthogonal matrix Q in a QR decomposition.
@@ -1255,6 +1257,7 @@ def Q(self):
         return self._Q
 
     @property
+    @since('2.0.0')
     def R(self):
         """
         An upper triangular matrix R in a QR decomposition.
diff --git a/python/pyspark/mllib/linalg/distributed.py b/python/pyspark/mllib/linalg/distributed.py
index f2eee49c6f928..136dd3319c9cc 100644
--- a/python/pyspark/mllib/linalg/distributed.py
+++ b/python/pyspark/mllib/linalg/distributed.py
@@ -26,7 +26,7 @@
 
 from py4j.java_gateway import JavaObject
 
-from pyspark import RDD
+from pyspark import RDD, since
 from pyspark.mllib.common import callMLlibFunc, JavaModelWrapper
 from pyspark.mllib.linalg import _convert_to_vector, Matrix, QRDecomposition
 from pyspark.mllib.stat import MultivariateStatisticalSummary
@@ -153,6 +153,7 @@ def numCols(self):
         """
         return self._java_matrix_wrapper.call("numCols")
 
+    @since('2.0.0')
     def computeColumnSummaryStatistics(self):
         """
         Computes column-wise summary statistics.
@@ -170,6 +171,7 @@ def computeColumnSummaryStatistics(self):
         java_col_stats = self._java_matrix_wrapper.call("computeColumnSummaryStatistics")
         return MultivariateStatisticalSummary(java_col_stats)
 
+    @since('2.0.0')
     def computeCovariance(self):
         """
         Computes the covariance matrix, treating each row as an
@@ -184,6 +186,7 @@ def computeCovariance(self):
         """
         return self._java_matrix_wrapper.call("computeCovariance")
 
+    @since('2.0.0')
     def computeGramianMatrix(self):
         """
         Computes the Gramian matrix `A^T A`. Note that this cannot be
@@ -197,6 +200,7 @@ def computeGramianMatrix(self):
         """
         return self._java_matrix_wrapper.call("computeGramianMatrix")
 
+    @since('2.0.0')
     def columnSimilarities(self, threshold=0.0):
         """
         Compute similarities between columns of this matrix.
@@ -258,6 +262,7 @@ def columnSimilarities(self, threshold=0.0):
         java_sims_mat = self._java_matrix_wrapper.call("columnSimilarities", float(threshold))
         return CoordinateMatrix(java_sims_mat)
 
+    @since('2.0.0')
     def tallSkinnyQR(self, computeQ=False):
         """
         Compute the QR decomposition of this RowMatrix.
@@ -458,6 +463,7 @@ def columnSimilarities(self):
         java_coordinate_matrix = self._java_matrix_wrapper.call("columnSimilarities")
         return CoordinateMatrix(java_coordinate_matrix)
 
+    @since('2.0.0')
     def computeGramianMatrix(self):
         """
         Computes the Gramian matrix `A^T A`. Note that this cannot be
@@ -675,6 +681,7 @@ def numCols(self):
         """
         return self._java_matrix_wrapper.call("numCols")
 
+    @since('2.0.0')
     def transpose(self):
         """
         Transpose this CoordinateMatrix.
@@ -969,6 +976,7 @@ def numCols(self):
         """
         return self._java_matrix_wrapper.call("numCols")
 
+    @since('2.0.0')
     def cache(self):
         """
         Caches the underlying RDD.
@@ -976,6 +984,7 @@ def cache(self):
         self._java_matrix_wrapper.call("cache")
         return self
 
+    @since('2.0.0')
     def persist(self, storageLevel):
         """
         Persists the underlying RDD with the specified storage level.
@@ -986,6 +995,7 @@ def persist(self, storageLevel):
         javaStorageLevel = self._java_matrix_wrapper._sc._getJavaStorageLevel(storageLevel)
         self._java_matrix_wrapper.call("persist", javaStorageLevel)
         return self
 
+    @since('2.0.0')
     def validate(self):
         """
         Validates the block matrix info against the matrix data (`blocks`)
@@ -1061,6 +1071,7 @@ def multiply(self, other):
         java_block_matrix = self._java_matrix_wrapper.call("multiply", other_java_block_matrix)
         return BlockMatrix(java_block_matrix, self.rowsPerBlock, self.colsPerBlock)
 
+    @since('2.0.0')
     def transpose(self):
         """
         Transpose this BlockMatrix. Returns a new BlockMatrix
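For reference, the `since` decorator applied throughout the patch above appends a Sphinx ``.. versionadded::`` note to the wrapped function's docstring. A minimal sketch of how it is used (the class and method names here are hypothetical):

    from pyspark import since

    class Example(object):
        @since('2.0.0')
        def f(self):
            """Does something."""
            return 42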
From c0c9565706c148cb7dd64250630931ab41838d3b Mon Sep 17 00:00:00 2001
From: Mike Dusenberry
Date: Thu, 21 Apr 2016 14:44:26 -0700
Subject: [PATCH 8/8] Adding the subtract method.

---
 python/pyspark/mllib/linalg/distributed.py | 35 ++++++++++++++++++++++
 1 file changed, 35 insertions(+)

diff --git a/python/pyspark/mllib/linalg/distributed.py b/python/pyspark/mllib/linalg/distributed.py
index 136dd3319c9cc..af34ce346b0ca 100644
--- a/python/pyspark/mllib/linalg/distributed.py
+++ b/python/pyspark/mllib/linalg/distributed.py
@@ -1036,6 +1036,41 @@ def add(self, other):
         java_block_matrix = self._java_matrix_wrapper.call("add", other_java_block_matrix)
         return BlockMatrix(java_block_matrix, self.rowsPerBlock, self.colsPerBlock)
 
+    @since('2.0.0')
+    def subtract(self, other):
+        """
+        Subtracts the given block matrix `other` from this block matrix:
+        `this - other`. The matrices must have the same size and
+        matching `rowsPerBlock` and `colsPerBlock` values. If one of
+        the sub matrix blocks being subtracted is a
+        SparseMatrix, the resulting sub matrix block will also be a
+        SparseMatrix, even if it is being subtracted from a DenseMatrix.
+        If two dense sub matrix blocks are subtracted, the output block
+        will also be a DenseMatrix.
+
+        >>> dm1 = Matrices.dense(3, 2, [3, 1, 5, 4, 6, 2])
+        >>> dm2 = Matrices.dense(3, 2, [7, 8, 9, 10, 11, 12])
+        >>> sm = Matrices.sparse(3, 2, [0, 1, 3], [0, 1, 2], [1, 2, 3])
+        >>> blocks1 = sc.parallelize([((0, 0), dm1), ((1, 0), dm2)])
+        >>> blocks2 = sc.parallelize([((0, 0), dm2), ((1, 0), dm1)])
+        >>> blocks3 = sc.parallelize([((0, 0), sm), ((1, 0), dm2)])
+        >>> mat1 = BlockMatrix(blocks1, 3, 2)
+        >>> mat2 = BlockMatrix(blocks2, 3, 2)
+        >>> mat3 = BlockMatrix(blocks3, 3, 2)
+
+        >>> mat1.subtract(mat2).toLocalMatrix()
+        DenseMatrix(6, 2, [-4.0, -7.0, -4.0, 4.0, 7.0, 4.0, -6.0, -5.0, -10.0, 6.0, 5.0, 10.0], 0)
+
+        >>> mat2.subtract(mat3).toLocalMatrix()
+        DenseMatrix(6, 2, [6.0, 8.0, 9.0, -4.0, -7.0, -4.0, 10.0, 9.0, 9.0, -6.0, -5.0, -10.0], 0)
+        """
+        if not isinstance(other, BlockMatrix):
+            raise TypeError("`other` should be a BlockMatrix, got %s" % type(other))
+
+        other_java_block_matrix = other._java_matrix_wrapper._java_model
+        java_block_matrix = self._java_matrix_wrapper.call("subtract", other_java_block_matrix)
+        return BlockMatrix(java_block_matrix, self.rowsPerBlock, self.colsPerBlock)
+
     def multiply(self, other):
         """
         Left multiplies this BlockMatrix by `other`, another