From d4f9fe5eff6865a77882f4762ebb575bcc985023 Mon Sep 17 00:00:00 2001
From: Hu Shiwen
Date: Fri, 25 Aug 2017 14:39:35 +0800
Subject: [PATCH 1/4] fix linalg_impl

---
 Jenkinsfile                | 11 ++++++++++-
 src/operator/linalg_impl.h | 30 ++++++++++++++++++++++++++++--
 2 files changed, 38 insertions(+), 3 deletions(-)

diff --git a/Jenkinsfile b/Jenkinsfile
index 2d4cc017c865..2dfc57c9a265 100644
--- a/Jenkinsfile
+++ b/Jenkinsfile
@@ -150,7 +150,7 @@ try {
       }
     }
   },
-  'Amalgamation': {
+  'Amalgamation MIN': {
     node('mxnetlinux') {
       ws('workspace/amalgamation') {
         init_git()
@@ -159,6 +159,15 @@ try {
       }
     }
   },
+  'Amalgamation': {
+    node('mxnetlinux') {
+      ws('workspace/amalgamation') {
+        init_git()
+        make('cpu', '-C amalgamation/ clean')
+        make('cpu', '-C amalgamation/ USE_BLAS=openblas')
+      }
+    }
+  },
   'GPU: MKLML': {
     node('mxnetlinux') {
       ws('workspace/build-mklml') {
diff --git a/src/operator/linalg_impl.h b/src/operator/linalg_impl.h
index c1e813614c72..5609e0bed555 100644
--- a/src/operator/linalg_impl.h
+++ b/src/operator/linalg_impl.h
@@ -342,6 +342,8 @@ inline void linalg_gemm(const Tensor<xpu, 2, DType>& A,
     case kWriteTo:
     case kWriteInplace:
       linalg_gemm(A, B, C, DType(1.0), DType(0.0), tA, tB, s);
+
+      break;
     case kAddTo:
       linalg_gemm(A, B, C, DType(1.0), DType(1.0), tA, tB, s);
       break;
@@ -365,10 +367,34 @@ inline void linalg_gemm(const Tensor<xpu, 2, DType>& A,
       break;
     case kWriteTo:
     case kWriteInplace:
-      C = dot(tA ? A.T() : A, tB ? B.T() : B);
+        if (tA) {
+          if (tB) {
+              const_cast<Tensor<cpu, 2, DType>&>(C) = dot(A.T(), B.T());
+          } else {
+              const_cast<Tensor<cpu, 2, DType>&>(C) = dot(A.T(), B);
+          }
+        } else {
+          if (tB) {
+              const_cast<Tensor<cpu, 2, DType>&>(C) = dot(A, B.T());
+          } else {
+              const_cast<Tensor<cpu, 2, DType>&>(C) = dot(A, B);
+          }
+        }
       break;
     case kAddTo:
-      C += dot(tA ? A.T() : A, tB ? B.T() : B);
+      if (tA) {
+        if (tB) {
+            const_cast<Tensor<cpu, 2, DType>&>(C) += dot(A.T(), B.T());
+        } else {
+            const_cast<Tensor<cpu, 2, DType>&>(C) += dot(A.T(), B);
+        }
+      } else {
+        if (tB) {
+            const_cast<Tensor<cpu, 2, DType>&>(C) += dot(A, B.T());
+        } else {
+            const_cast<Tensor<cpu, 2, DType>&>(C) += dot(A, B);
+        }
+      }
       break;
     default:
       LOG(FATAL) << "not reached";

From 871a4eac482508a09f2c366dda69afa822d7c6e4 Mon Sep 17 00:00:00 2001
From: Hu Shiwen
Date: Fri, 25 Aug 2017 14:42:45 +0800
Subject: [PATCH 2/4] fix

---
 src/operator/linalg_impl.h | 18 ++++++++----------
 1 file changed, 8 insertions(+), 10 deletions(-)

diff --git a/src/operator/linalg_impl.h b/src/operator/linalg_impl.h
index 5609e0bed555..bad37d6250fc 100644
--- a/src/operator/linalg_impl.h
+++ b/src/operator/linalg_impl.h
@@ -342,8 +342,6 @@ inline void linalg_gemm(const Tensor<xpu, 2, DType>& A,
     case kWriteTo:
     case kWriteInplace:
       linalg_gemm(A, B, C, DType(1.0), DType(0.0), tA, tB, s);
-
-      break;
     case kAddTo:
       linalg_gemm(A, B, C, DType(1.0), DType(1.0), tA, tB, s);
       break;
@@ -369,30 +367,30 @@ inline void linalg_gemm(const Tensor<xpu, 2, DType>& A,
     case kWriteTo:
     case kWriteInplace:
         if (tA) {
          if (tB) {
-              const_cast<Tensor<cpu, 2, DType>&>(C) = dot(A.T(), B.T());
+            const_cast<Tensor<cpu, 2, DType>&>(C) = dot(A.T(), B.T());
          } else {
-              const_cast<Tensor<cpu, 2, DType>&>(C) = dot(A.T(), B);
+            const_cast<Tensor<cpu, 2, DType>&>(C) = dot(A.T(), B);
          }
        } else {
          if (tB) {
-              const_cast<Tensor<cpu, 2, DType>&>(C) = dot(A, B.T());
+            const_cast<Tensor<cpu, 2, DType>&>(C) = dot(A, B.T());
          } else {
-              const_cast<Tensor<cpu, 2, DType>&>(C) = dot(A, B);
+            const_cast<Tensor<cpu, 2, DType>&>(C) = dot(A, B);
          }
        }
       break;
     case kAddTo:
       if (tA) {
         if (tB) {
-            const_cast<Tensor<cpu, 2, DType>&>(C) += dot(A.T(), B.T());
+          const_cast<Tensor<cpu, 2, DType>&>(C) += dot(A.T(), B.T());
         } else {
-            const_cast<Tensor<cpu, 2, DType>&>(C) += dot(A.T(), B);
+          const_cast<Tensor<cpu, 2, DType>&>(C) += dot(A.T(), B);
         }
       } else {
         if (tB) {
-            const_cast<Tensor<cpu, 2, DType>&>(C) += dot(A, B.T());
+          const_cast<Tensor<cpu, 2, DType>&>(C) += dot(A, B.T());
         } else {
-            const_cast<Tensor<cpu, 2, DType>&>(C) += dot(A, B);
+          const_cast<Tensor<cpu, 2, DType>&>(C) += dot(A, B);
         }
       }
       break;
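Note on the linalg_impl.h change above: mshadow's `A.T()` returns a transpose expression object rather than a `Tensor`, so the two arms of `tA ? A.T() : A` have no common type and the old `C = dot(tA ? A.T() : A, tB ? B.T() : B);` fails to compile when the no-CBLAS fallback path is actually built. PATCH 1/4 therefore branches explicitly on `tA` and `tB`, and, because `C` arrives as a `const Tensor<cpu, 2, DType>&` even though it is the output operand, each assignment goes through a `const_cast`. The stand-alone sketch below reproduces the situation with hypothetical `MiniTensor` and `Transposed` types (not mshadow's real classes); only the branching pattern and the `const_cast` mirror the patch.

#include <cassert>
#include <vector>

struct MiniTensor;

// Lightweight "expression" produced by MiniTensor::T(), playing the role of
// the transpose expression that mshadow's A.T() returns.
struct Transposed {
  const MiniTensor* src;
};

struct MiniTensor {
  int rows, cols;
  std::vector<double> data;  // row-major storage
  MiniTensor(int r, int c) : rows(r), cols(c), data(r * c, 0.0) {}
  double at(int r, int c) const { return data[r * cols + c]; }
  Transposed T() const { return Transposed{this}; }
};

// Element access for either a plain tensor or a transpose expression.
double get(const MiniTensor& t, int r, int c) { return t.at(r, c); }
double get(const Transposed& t, int r, int c) { return t.src->at(c, r); }

// One dot() per operand combination, mirroring the patch's four branches.
template <typename LHS, typename RHS>
void dot_into(MiniTensor& C, const LHS& a, const RHS& b, int m, int k, int n) {
  for (int i = 0; i < m; ++i)
    for (int j = 0; j < n; ++j) {
      double s = 0.0;
      for (int p = 0; p < k; ++p) s += get(a, i, p) * get(b, p, j);
      C.data[i * C.cols + j] = s;
    }
}

void gemm(const MiniTensor& A, const MiniTensor& B, const MiniTensor& C,
          bool tA, bool tB) {
  // C is taken by const reference, as in linalg_gemm, so writing the result
  // requires a const_cast exactly like the patch.
  MiniTensor& out = const_cast<MiniTensor&>(C);
  const int m = tA ? A.cols : A.rows;   // rows of op(A)
  const int k = tA ? A.rows : A.cols;   // cols of op(A) == rows of op(B)
  const int n = tB ? B.rows : B.cols;   // cols of op(B)
  // `dot_into(out, tA ? A.T() : A, ...)` would not compile: Transposed and
  // MiniTensor are distinct types. Branch explicitly, as the patch does.
  if (tA) {
    if (tB) dot_into(out, A.T(), B.T(), m, k, n);
    else    dot_into(out, A.T(), B, m, k, n);
  } else {
    if (tB) dot_into(out, A, B.T(), m, k, n);
    else    dot_into(out, A, B, m, k, n);
  }
}

int main() {
  MiniTensor A(2, 3), B(2, 3), C(2, 2);
  A.data = {1, 2, 3, 4, 5, 6};
  B.data = {1, 0, 0, 0, 1, 0};
  gemm(A, B, C, false, true);  // C = A * B.T
  assert(C.at(0, 0) == 1 && C.at(0, 1) == 2);
  assert(C.at(1, 0) == 4 && C.at(1, 1) == 5);
  return 0;
}

Taking `C` by non-const reference would avoid the casts, but would ripple through every `linalg_gemm` caller; branching locally keeps the fix contained to this fallback.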
From 41c788deb383a989cd2e8d8fcb6b09b7b70e4f2a Mon Sep 17 00:00:00 2001
From: Hu Shiwen
Date: Fri, 25 Aug 2017 14:45:56 +0800
Subject: [PATCH 3/4] fix

---
 src/operator/linalg_impl.h | 26 +++++++++++++-------------
 1 file changed, 13 insertions(+), 13 deletions(-)

diff --git a/src/operator/linalg_impl.h b/src/operator/linalg_impl.h
index bad37d6250fc..ff174b33b2b7 100644
--- a/src/operator/linalg_impl.h
+++ b/src/operator/linalg_impl.h
@@ -365,19 +365,19 @@ inline void linalg_gemm(const Tensor<xpu, 2, DType>& A,
       break;
     case kWriteTo:
     case kWriteInplace:
-        if (tA) {
-          if (tB) {
-            const_cast<Tensor<cpu, 2, DType>&>(C) = dot(A.T(), B.T());
-          } else {
-            const_cast<Tensor<cpu, 2, DType>&>(C) = dot(A.T(), B);
-          }
-        } else {
-          if (tB) {
-            const_cast<Tensor<cpu, 2, DType>&>(C) = dot(A, B.T());
-          } else {
-            const_cast<Tensor<cpu, 2, DType>&>(C) = dot(A, B);
-          }
-        }
+      if (tA) {
+        if (tB) {
+          const_cast<Tensor<cpu, 2, DType>&>(C) = dot(A.T(), B.T());
+        } else {
+          const_cast<Tensor<cpu, 2, DType>&>(C) = dot(A.T(), B);
+        }
+      } else {
+        if (tB) {
+          const_cast<Tensor<cpu, 2, DType>&>(C) = dot(A, B.T());
+        } else {
+          const_cast<Tensor<cpu, 2, DType>&>(C) = dot(A, B);
+        }
+      }
       break;
     case kAddTo:
       if (tA) {

From fc3b4b4055deed7ff14c8621117a42cca613f1e1 Mon Sep 17 00:00:00 2001
From: Hu Shiwen
Date: Fri, 25 Aug 2017 15:31:00 +0800
Subject: [PATCH 4/4] fix

---
 src/operator/linalg_impl.h | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/src/operator/linalg_impl.h b/src/operator/linalg_impl.h
index ff174b33b2b7..e95eff0cc407 100644
--- a/src/operator/linalg_impl.h
+++ b/src/operator/linalg_impl.h
@@ -324,9 +324,9 @@ LINALG_GPU_BATCH_TRSM(DtrsmBatched, double)
  * \param A the first operand of the gemm
  * \param B the second operand of the gemm
  * \param C the data to be assigned
- * \tparam tA whether the `A` operand should be transposed first.
- * \tparam tB whether the `B` operand should be transposed first.
- * \tparam s the stream to perform the operation
+ * \param tA whether the `A` operand should be transposed first.
+ * \param tB whether the `B` operand should be transposed first.
+ * \param s the stream to perform the operation
  * \param req the assignment request
  */
 template<typename xpu, typename DType>
@@ -353,8 +353,8 @@ inline void linalg_gemm(const Tensor<xpu, 2, DType>& A,
 
 // A cpu specialization for linalg_gemm that uses mshadow::dot(), if no cblas.
 #if (MSHADOW_USE_CBLAS == 0)
-template<typename xpu, typename DType>
-inline void linalg_gemm(const Tensor<xpu, 2, DType>& A,
+template<typename DType>
+inline void linalg_gemm(const Tensor<cpu, 2, DType>& A,
                         const Tensor<cpu, 2, DType>& B,
                         const Tensor<cpu, 2, DType>& C,
                         bool tA, bool tB, Stream<cpu> *s,
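Note on the request dispatch these patches touch: in the CBLAS path, `kWriteTo` and `kWriteInplace` call the seven-argument `linalg_gemm` with `(alpha, beta) = (1.0, 0.0)`, i.e. `C = op(A) * op(B)` for the optionally transposed operands, while `kAddTo` uses `(1.0, 1.0)`, i.e. `C += op(A) * op(B)`; without a `break` between those two cases, control falls through and the product is accumulated a second time. A compact sketch of the same `req` to `(alpha, beta)` mapping, assuming plain row-major `double` arrays instead of mshadow tensors (`gemm` and `gemm_with_req` are illustrative helpers, not the MXNet API):

#include <cstdio>

enum OpReqType { kNullOp, kWriteTo, kWriteInplace, kAddTo };

// C (m x n) = alpha * A (m x k) * B (k x n) + beta * C; row-major, no transpose.
void gemm(int m, int n, int k, double alpha, const double* A, const double* B,
          double beta, double* C) {
  for (int i = 0; i < m; ++i)
    for (int j = 0; j < n; ++j) {
      double s = 0.0;
      for (int p = 0; p < k; ++p) s += A[i * k + p] * B[p * n + j];
      C[i * n + j] = alpha * s + beta * C[i * n + j];
    }
}

// Dispatch on the request type the way the CBLAS branch does.
void gemm_with_req(int m, int n, int k, const double* A, const double* B,
                   double* C, OpReqType req) {
  switch (req) {
    case kNullOp:
      break;                             // leave C untouched
    case kWriteTo:
    case kWriteInplace:
      gemm(m, n, k, 1.0, A, B, 0.0, C);  // overwrite: beta = 0
      break;                             // omitting this would fall into kAddTo
    case kAddTo:
      gemm(m, n, k, 1.0, A, B, 1.0, C);  // accumulate: beta = 1
      break;
  }
}

int main() {
  const double A[4] = {1, 2, 3, 4};                      // 2 x 2
  const double B[4] = {1, 0, 0, 1};                      // identity
  double C[4] = {5, 5, 5, 5};
  gemm_with_req(2, 2, 2, A, B, C, kWriteTo);             // C = A*B
  std::printf("%g %g %g %g\n", C[0], C[1], C[2], C[3]);  // 1 2 3 4
  gemm_with_req(2, 2, 2, A, B, C, kAddTo);               // C += A*B
  std::printf("%g %g %g %g\n", C[0], C[1], C[2], C[3]);  // 2 4 6 8
  return 0;
}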