From 1320de37c879bee3932811076dfb464faf10337f Mon Sep 17 00:00:00 2001 From: Kenneth Heafield Date: Thu, 28 Nov 2019 15:41:25 +0000 Subject: [PATCH 01/65] Add intgemm as a submodule --- .gitmodules | 3 +++ 3rdparty/intgemm | 1 + 2 files changed, 4 insertions(+) create mode 160000 3rdparty/intgemm diff --git a/.gitmodules b/.gitmodules index 1900820d4c86..f6f939b8f4be 100644 --- a/.gitmodules +++ b/.gitmodules @@ -26,3 +26,6 @@ [submodule "3rdparty/nvidia_cub"] path = 3rdparty/nvidia_cub url = https://github.com/NVlabs/cub.git +[submodule "3rdparty/intgemm"] + path = 3rdparty/intgemm + url = https://github.com/kpu/intgemm diff --git a/3rdparty/intgemm b/3rdparty/intgemm new file mode 160000 index 000000000000..1a14fdae0fc2 --- /dev/null +++ b/3rdparty/intgemm @@ -0,0 +1 @@ +Subproject commit 1a14fdae0fc262d37b73cacc3ba535cc8b945deb From 0c68e33aa3a9cee6e9df2fe46bdec500aa5c9c95 Mon Sep 17 00:00:00 2001 From: Kenneth Heafield Date: Thu, 28 Nov 2019 16:22:41 +0000 Subject: [PATCH 02/65] Update to remove DEFAULT macro --- 3rdparty/intgemm | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/3rdparty/intgemm b/3rdparty/intgemm index 1a14fdae0fc2..37ba078bcab4 160000 --- a/3rdparty/intgemm +++ b/3rdparty/intgemm @@ -1 +1 @@ -Subproject commit 1a14fdae0fc262d37b73cacc3ba535cc8b945deb +Subproject commit 37ba078bcab455081ecf8f218d953405aa137e19 From 3bf28e5750b957e62129be53f26944d86b3a5449 Mon Sep 17 00:00:00 2001 From: Kenneth Heafield Date: Thu, 28 Nov 2019 16:23:02 +0000 Subject: [PATCH 03/65] Add intgemm to CMake --- CMakeLists.txt | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index 2bb1f225ad13..a6512991703d 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -30,6 +30,7 @@ mxnet_option(USE_F16C "Build with x86 F16C instruction support" ON) mxnet_option(USE_LAPACK "Build with lapack support" ON) mxnet_option(USE_MKL_IF_AVAILABLE "Use MKL if found" ON) mxnet_option(USE_MKLDNN "Build with MKL-DNN support" ON IF USE_MKL_IF_AVAILABLE AND (NOT APPLE) AND (NOT MSVC) AND (CMAKE_HOST_SYSTEM_PROCESSOR STREQUAL "x86_64") AND (NOT CMAKE_CROSSCOMPILING)) +mxnet_option(USE_INTGEMM "Build with x86 intgemm library for low-precision multiplication" ON IF (CMAKE_HOST_SYSTEM_PROCESSOR STREQUAL "x86_64") AND (NOT CMAKE_CROSSCOMPILING)) mxnet_option(USE_OPERATOR_TUNING "Enable auto-tuning of operators" ON IF NOT MSVC) mxnet_option(USE_GPERFTOOLS "Build with GPerfTools support" OFF) mxnet_option(USE_JEMALLOC "Build with Jemalloc support" ON) @@ -276,6 +277,14 @@ if(USE_MKLDNN) list(APPEND mxnet_LINKER_LIBS dnnl) endif() +if(USE_INTGEMM) + message(STATUS "Using intgemm") + add_subdirectory(3rdparty/intgemm) + include_directories(3rdparty/intgemm) + #intgemm generates a config header based on AVX512 support in the compiler. 
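+  #The generated header lands in the build tree rather than the source tree, so the
+  #next line also puts the corresponding binary directory on the include path.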
+ include_directories(${CMAKE_CURRENT_BINARY_DIR}/3rdparty/intgemm) +endif() + # Allow Cuda compiles outside of src tree to find things in 'src' and 'include' include_directories(${CMAKE_CURRENT_SOURCE_DIR}/include) include_directories(${CMAKE_CURRENT_SOURCE_DIR}/src) @@ -522,6 +531,10 @@ endif() FILE(GLOB_RECURSE SOURCE "src/*.cc" "src/*.h" "include/*.h") FILE(GLOB_RECURSE CUDA "src/*.cu" "src/*.cuh") +if (USE_INTGEMM) + list(APPEND SOURCE "3rdparty/intgemm/intgemm.cc") +endif() + # add nnvm to source FILE(GLOB_RECURSE NNVMSOURCE 3rdparty/tvm/nnvm/src/c_api/*.cc From 5b01d0b7b4ebba5e80a0e257d2b189bf25884d89 Mon Sep 17 00:00:00 2001 From: Kenneth Heafield Date: Thu, 28 Nov 2019 17:30:27 +0000 Subject: [PATCH 04/65] Operator working for PrepareB --- .../contrib/intgemm/prepare_b_op-inl.h | 91 ++++++++++++++ src/operator/contrib/intgemm/prepare_b_op.cc | 117 ++++++++++++++++++ 2 files changed, 208 insertions(+) create mode 100644 src/operator/contrib/intgemm/prepare_b_op-inl.h create mode 100644 src/operator/contrib/intgemm/prepare_b_op.cc diff --git a/src/operator/contrib/intgemm/prepare_b_op-inl.h b/src/operator/contrib/intgemm/prepare_b_op-inl.h new file mode 100644 index 000000000000..d102ef08ddd8 --- /dev/null +++ b/src/operator/contrib/intgemm/prepare_b_op-inl.h @@ -0,0 +1,91 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file prepare_b_op-inl.h + * \brief Quantize B to int8 and permute to a CPU-dependent format in preparation for multiplication. + * This + */ +#ifndef MXNET_OPERATOR_CONTRIB_INTGEMM_PREPARE_B_OP_INL_H_ +#define MXNET_OPERATOR_CONTRIB_INTGEMM_PREPARE_B_OP_INL_H_ + +#include +#include +#include "../../mshadow_op.h" +#include "../../mxnet_op.h" +#include "../../operator_common.h" +#include "../../elemwise_op_common.h" +#include "../../tensor/init_op.h" + +namespace mxnet { +namespace op { + +struct PrepareBParam : public dmlc::Parameter { + float multiplier; + DMLC_DECLARE_PARAMETER(PrepareBParam) { + DMLC_DECLARE_FIELD(multiplier) + .describe("Multiply floats by this constant before casting to int8. Typically you would set this to 127.0 / max absolute value in B."); + } +}; + +inline bool PrepareBOpShape(const nnvm::NodeAttrs& attrs, + mxnet::ShapeVector* in_attrs, + mxnet::ShapeVector* out_attrs) { + // One in, one out. 
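+  // (The int8 output has exactly the input's shape; only the element type and
+  //  the internal layout change.)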
+ CHECK_EQ(in_attrs->size(), 1U); + CHECK_EQ(out_attrs->size(), 1U); + + SHAPE_ASSIGN_CHECK(*out_attrs, 0, in_attrs->at(0)); + SHAPE_ASSIGN_CHECK(*in_attrs, 0, out_attrs->at(0)); + + const mxnet::TShape &shape = in_attrs->front(); + if (mxnet::ndim_is_known(shape)) { + CHECK_GE(shape.ndim(), 2) << "Matrices have at least two dimensions."; + } + return !mxnet::op::shape_is_none(out_attrs->at(0)); +} + +inline bool PrepareBOpType(const nnvm::NodeAttrs& attrs, + std::vector* in_attrs, + std::vector* out_attrs) { + CHECK_EQ(in_attrs->size(), 1U); + CHECK_EQ(out_attrs->size(), 1U); + + // This routine converts from float to int8. + TYPE_ASSIGN_CHECK(*out_attrs, 0, mshadow::kInt8); + TYPE_ASSIGN_CHECK(*in_attrs, 0, mshadow::kFloat32); + return true; +} + +inline bool PrepareBOpStorageType(const nnvm::NodeAttrs& attrs, + const int dev_mask, + DispatchMode* dispatch_mode, + std::vector* in_attrs, + std::vector* out_attrs) { + CHECK_EQ(in_attrs->size(), 1U); + CHECK_EQ(out_attrs->size(), 1U); + // Dense storage only. + return storage_type_assign(&out_attrs->front(), kDefaultStorage, dispatch_mode, DispatchMode::kFCompute) && + storage_type_assign(&in_attrs->front(), kDefaultStorage, dispatch_mode, DispatchMode::kFCompute); +} + +} // namespace op +} // namespace mxnet + +#endif // MXNET_OPERATOR_CONTRIB_INTGEMM_PREPARE_B_INL_H_ diff --git a/src/operator/contrib/intgemm/prepare_b_op.cc b/src/operator/contrib/intgemm/prepare_b_op.cc new file mode 100644 index 000000000000..32424e515b9e --- /dev/null +++ b/src/operator/contrib/intgemm/prepare_b_op.cc @@ -0,0 +1,117 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file prepare_b_op.cc + * \brief Converts B matrices to intgemm's representation. 
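+ * The conversion quantizes with the user-supplied multiplier and permutes the
+ * result into the CPU-dependent tiled layout used by intgemm's multiply kernels.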
+ */
+#include "./prepare_b_op-inl.h"
+#include "../../../../3rdparty/intgemm/aligned.h"
+#include "../../../../3rdparty/intgemm/intgemm.h"
+
+namespace mxnet {
+namespace op {
+
+DMLC_REGISTER_PARAMETER(PrepareBParam);
+
+namespace {
+void PrepareBOpForward(const nnvm::NodeAttrs& attrs,
+                       const OpContext& ctx,
+                       const TBlob& in,
+                       const TBlob& out) {
+  CHECK_EQ(in.type_flag_, mshadow::kFloat32);
+  CHECK_EQ(out.type_flag_, mshadow::kInt8);
+  CHECK(in.CheckContiguous());
+  CHECK(out.CheckContiguous());
+  int B_cols = 1;
+  for (int s = 0; s < in.shape_.ndim() - 1; ++s) {
+    B_cols *= in.shape_[s];
+  }
+  int inner = in.shape_[in.shape_.ndim() - 1];
+  CHECK_EQ(inner % ::intgemm::Int8::kBTileRow, 0) << "intgemm requires the inner dimension be a multiple of " << ::intgemm::Int8::kBTileRow;
+  CHECK_EQ(B_cols % ::intgemm::Int8::kBTileCol, 0) << "intgemm requires B have a multiple of " << ::intgemm::Int8::kBTileCol << " columns in the equation C = AB.";
+
+  const float *B = in.dptr<float>();
+  int8_t *quantB = out.dptr<int8_t>();
+  const PrepareBParam& param = nnvm::get<PrepareBParam>(attrs.parsed);
+  // TODO: eliminate transpose here by making a PrepareBColumnMajor.
+  intgemm::AlignedVector<float> B_transpose(inner * B_cols);
+  for (int i = 0; i < inner; ++i) {
+    for (int j = 0; j < B_cols; ++j) {
+      B_transpose[i * B_cols + j] = B[i + inner * j];
+    }
+  }
+  ::intgemm::Int8::PrepareB(B_transpose.begin(), quantB, param.multiplier, inner, B_cols);
+}
+}  // namespace
+
+void PrepareBOpForwardExCPU(const nnvm::NodeAttrs& attrs,
+                            const OpContext& ctx,
+                            const std::vector<NDArray>& inputs,
+                            const std::vector<OpReqType>& req,
+                            const std::vector<NDArray>& outputs) {
+  CHECK_EQ(inputs.size(), 1U);
+  CHECK_EQ(outputs.size(), 1U);
+  CHECK_EQ(req.size(), 1U);
+  CHECK_EQ(req[0], kWriteTo) << "intgemm only overwrites";
+  CHECK_EQ(inputs[0].storage_type(), kDefaultStorage);
+  CHECK_EQ(outputs[0].storage_type(), kDefaultStorage);
+  PrepareBOpForward(attrs, ctx, inputs[0].data(), outputs[0].data());
+}
+
+void PrepareBOpForwardCPU(const nnvm::NodeAttrs& attrs,
+                          const OpContext& ctx,
+                          const std::vector<TBlob>& inputs,
+                          const std::vector<OpReqType>& req,
+                          const std::vector<TBlob>& outputs) {
+  CHECK_EQ(inputs.size(), 1U);
+  CHECK_EQ(outputs.size(), 1U);
+  CHECK_EQ(req.size(), 1U);
+  CHECK_EQ(req[0], kWriteTo) << "intgemm only overwrites";
+  PrepareBOpForward(attrs, ctx, inputs[0], outputs[0]);
+}
+
+NNVM_REGISTER_OP(_contrib_intgemm_prepareb)
+.describe(R"code(This operator converts a float32 matrix to intgemm's internal representation of B in preparation for the operation C = AB. B should be provided in column-major order i.e. the last dimension of shape is the number of rows of B. This operator is not meant to be fast; it is meant to be run offline to quantize a model.
+
+The float32 values are multiplied by the provided multiplier before casting to int8.
+
+The internal representation of B is CPU dependent: AVX512BW, AVX2, and SSSE3 have different formats.
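+
+A sketch of typical offline usage together with the companion intgemm_maxabsolute operator (the 8x64 shape is illustrative only):
+
+  b = mx.nd.random_uniform(low=-1.0, high=1.0, shape=[8, 64])
+  b_scale = 127.0 / mx.nd.contrib.intgemm_maxabsolute(b).asscalar()
+  b_prepared = mx.nd.contrib.intgemm_prepareb(b, multiplier=b_scale)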
+)code" ADD_FILELINE) +.set_attr_parser(ParamParser) +.set_num_inputs(1) +.set_num_outputs(1) +.set_attr("FListInputNames", + [](const NodeAttrs& attrs) { + return std::vector{"B"}; + }) +.set_attr("FInferShape", PrepareBOpShape) +.set_attr("FInferType", PrepareBOpType) +.set_attr("FInferStorageType", PrepareBOpStorageType) +.set_attr("FCompute", PrepareBOpForwardCPU) +.set_attr("FComputeEx", PrepareBOpForwardExCPU) +.set_attr("FInplaceOption", + [](const NodeAttrs& attrs) { + return std::vector >{{0, 0}}; + }) +.add_argument("B", "NDArray-or-Symbol", "Parameter matrix to be prepared for multiplication.") +.add_arguments(PrepareBParam::__FIELDS__()); + +} // namespace op +} // namespace mxnet From 0d7b54aa7a5e6b3d19d288cce26a9a3e5e336227 Mon Sep 17 00:00:00 2001 From: Kenneth Heafield Date: Fri, 29 Nov 2019 13:02:13 +0000 Subject: [PATCH 05/65] Consolidate CPU inline into cc since there's only one dispatch --- .../contrib/intgemm/prepare_b_op-inl.h | 91 ------------------- src/operator/contrib/intgemm/prepare_b_op.cc | 71 +++++++++++++-- 2 files changed, 63 insertions(+), 99 deletions(-) delete mode 100644 src/operator/contrib/intgemm/prepare_b_op-inl.h diff --git a/src/operator/contrib/intgemm/prepare_b_op-inl.h b/src/operator/contrib/intgemm/prepare_b_op-inl.h deleted file mode 100644 index d102ef08ddd8..000000000000 --- a/src/operator/contrib/intgemm/prepare_b_op-inl.h +++ /dev/null @@ -1,91 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -/*! - * \file prepare_b_op-inl.h - * \brief Quantize B to int8 and permute to a CPU-dependent format in preparation for multiplication. - * This - */ -#ifndef MXNET_OPERATOR_CONTRIB_INTGEMM_PREPARE_B_OP_INL_H_ -#define MXNET_OPERATOR_CONTRIB_INTGEMM_PREPARE_B_OP_INL_H_ - -#include -#include -#include "../../mshadow_op.h" -#include "../../mxnet_op.h" -#include "../../operator_common.h" -#include "../../elemwise_op_common.h" -#include "../../tensor/init_op.h" - -namespace mxnet { -namespace op { - -struct PrepareBParam : public dmlc::Parameter { - float multiplier; - DMLC_DECLARE_PARAMETER(PrepareBParam) { - DMLC_DECLARE_FIELD(multiplier) - .describe("Multiply floats by this constant before casting to int8. Typically you would set this to 127.0 / max absolute value in B."); - } -}; - -inline bool PrepareBOpShape(const nnvm::NodeAttrs& attrs, - mxnet::ShapeVector* in_attrs, - mxnet::ShapeVector* out_attrs) { - // One in, one out. 
- CHECK_EQ(in_attrs->size(), 1U); - CHECK_EQ(out_attrs->size(), 1U); - - SHAPE_ASSIGN_CHECK(*out_attrs, 0, in_attrs->at(0)); - SHAPE_ASSIGN_CHECK(*in_attrs, 0, out_attrs->at(0)); - - const mxnet::TShape &shape = in_attrs->front(); - if (mxnet::ndim_is_known(shape)) { - CHECK_GE(shape.ndim(), 2) << "Matrices have at least two dimensions."; - } - return !mxnet::op::shape_is_none(out_attrs->at(0)); -} - -inline bool PrepareBOpType(const nnvm::NodeAttrs& attrs, - std::vector* in_attrs, - std::vector* out_attrs) { - CHECK_EQ(in_attrs->size(), 1U); - CHECK_EQ(out_attrs->size(), 1U); - - // This routine converts from float to int8. - TYPE_ASSIGN_CHECK(*out_attrs, 0, mshadow::kInt8); - TYPE_ASSIGN_CHECK(*in_attrs, 0, mshadow::kFloat32); - return true; -} - -inline bool PrepareBOpStorageType(const nnvm::NodeAttrs& attrs, - const int dev_mask, - DispatchMode* dispatch_mode, - std::vector* in_attrs, - std::vector* out_attrs) { - CHECK_EQ(in_attrs->size(), 1U); - CHECK_EQ(out_attrs->size(), 1U); - // Dense storage only. - return storage_type_assign(&out_attrs->front(), kDefaultStorage, dispatch_mode, DispatchMode::kFCompute) && - storage_type_assign(&in_attrs->front(), kDefaultStorage, dispatch_mode, DispatchMode::kFCompute); -} - -} // namespace op -} // namespace mxnet - -#endif // MXNET_OPERATOR_CONTRIB_INTGEMM_PREPARE_B_INL_H_ diff --git a/src/operator/contrib/intgemm/prepare_b_op.cc b/src/operator/contrib/intgemm/prepare_b_op.cc index 32424e515b9e..bc0bfa15720a 100644 --- a/src/operator/contrib/intgemm/prepare_b_op.cc +++ b/src/operator/contrib/intgemm/prepare_b_op.cc @@ -21,13 +21,71 @@ * \file prepare_b_op.cc * \brief Converts B matrices to intgemm's representation. */ -#include "./prepare_b_op-inl.h" + +#include +#include +#include "../../mshadow_op.h" +#include "../../mxnet_op.h" +#include "../../operator_common.h" +#include "../../tensor/init_op.h" + #include "../../../../3rdparty/intgemm/aligned.h" #include "../../../../3rdparty/intgemm/intgemm.h" namespace mxnet { namespace op { +struct PrepareBParam : public dmlc::Parameter { + float multiplier; + DMLC_DECLARE_PARAMETER(PrepareBParam) { + DMLC_DECLARE_FIELD(multiplier) + .describe("Multiply floats by this constant before casting to int8. Typically you would set this to 127.0 / max absolute value in B."); + } +}; + +inline bool PrepareBOpShape(const nnvm::NodeAttrs& attrs, + mxnet::ShapeVector* in_attrs, + mxnet::ShapeVector* out_attrs) { + // One in, one out. + CHECK_EQ(in_attrs->size(), 1U); + CHECK_EQ(out_attrs->size(), 1U); + + SHAPE_ASSIGN_CHECK(*out_attrs, 0, in_attrs->at(0)); + SHAPE_ASSIGN_CHECK(*in_attrs, 0, out_attrs->at(0)); + + const mxnet::TShape &shape = in_attrs->front(); + if (mxnet::ndim_is_known(shape)) { + CHECK_GE(shape.ndim(), 2) << "Matrices have at least two dimensions."; + } + return !mxnet::op::shape_is_none(out_attrs->at(0)); +} + +inline bool PrepareBOpType(const nnvm::NodeAttrs& attrs, + std::vector* in_attrs, + std::vector* out_attrs) { + CHECK_EQ(in_attrs->size(), 1U); + CHECK_EQ(out_attrs->size(), 1U); + + // This routine converts from float to int8. + TYPE_ASSIGN_CHECK(*out_attrs, 0, mshadow::kInt8); + TYPE_ASSIGN_CHECK(*in_attrs, 0, mshadow::kFloat32); + return true; +} + +inline bool PrepareBOpStorageType(const nnvm::NodeAttrs& attrs, + const int dev_mask, + DispatchMode* dispatch_mode, + std::vector* in_attrs, + std::vector* out_attrs) { + CHECK_EQ(in_attrs->size(), 1U); + CHECK_EQ(out_attrs->size(), 1U); + // Dense storage only. 
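+  // (intgemm reads and writes plain contiguous buffers, so only the default
+  //  dense storage type is dispatched.)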
+ return storage_type_assign(&out_attrs->front(), kDefaultStorage, dispatch_mode, DispatchMode::kFCompute) && + storage_type_assign(&in_attrs->front(), kDefaultStorage, dispatch_mode, DispatchMode::kFCompute); +} + + + DMLC_REGISTER_PARAMETER(PrepareBParam); namespace { @@ -39,11 +97,8 @@ void PrepareBOpForward(const nnvm::NodeAttrs& attrs, CHECK_EQ(out.type_flag_, mshadow::kInt8); CHECK(in.CheckContiguous()); CHECK(out.CheckContiguous()); - int B_cols = 1; - for (int s = 0; s < in.shape_.ndim() - 1; ++s) { - B_cols *= in.shape_[s]; - } - int inner = in.shape_[in.shape_.ndim() - 1]; + size_t B_cols = in.shape_.ProdShape(0, in.shape_.ndim() - 1); + size_t inner = in.shape_[in.shape_.ndim() - 1]; CHECK_EQ(inner % ::intgemm::Int8::kBTileRow, 0) << "intgemm requires the inner dimension be a multiple of " << ::intgemm::Int8::kBTileRow; CHECK_EQ(B_cols % ::intgemm::Int8::kBTileCol, 0) << "intgemm requires B have a multiple of " << ::intgemm::Int8::kBTileCol << " columns inthe equation C = AB."; @@ -52,8 +107,8 @@ void PrepareBOpForward(const nnvm::NodeAttrs& attrs, const PrepareBParam& param = nnvm::get(attrs.parsed); // TODO: eliminate transpose here by making a PrepareBColumnMajor. intgemm::AlignedVector B_transpose(inner * B_cols); - for (int i = 0; i < inner; ++i) { - for (int j = 0; j < B_cols; ++j) { + for (size_t i = 0; i < inner; ++i) { + for (size_t j = 0; j < B_cols; ++j) { B_transpose[i * B_cols + j] = B[i + inner * j]; } } From 88fb3a5165594791aeed9de1e4ebe543a3f5d29c Mon Sep 17 00:00:00 2001 From: Kenneth Heafield Date: Fri, 29 Nov 2019 13:02:36 +0000 Subject: [PATCH 06/65] intgemm MaxAbsolute operator --- .../contrib/intgemm/max_absolute_op.cc | 119 ++++++++++++++++++ 1 file changed, 119 insertions(+) create mode 100644 src/operator/contrib/intgemm/max_absolute_op.cc diff --git a/src/operator/contrib/intgemm/max_absolute_op.cc b/src/operator/contrib/intgemm/max_absolute_op.cc new file mode 100644 index 000000000000..7c2786aebaa6 --- /dev/null +++ b/src/operator/contrib/intgemm/max_absolute_op.cc @@ -0,0 +1,119 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file max_absolute_op.cc + * \brief Computes maximum absolute value of a tensor using intgemm + */ + +#include +#include +#include "../../mshadow_op.h" +#include "../../mxnet_op.h" +#include "../../operator_common.h" +#include "../../tensor/init_op.h" + +#include "../../../../3rdparty/intgemm/intgemm.h" + +namespace mxnet { +namespace op { + +/*struct MaxAbsoluteParam : public dmlc::Parameter { +}; +DMLC_REGISTER_PARAMETER(MaxAbsoluteParam);*/ + +inline bool MaxAbsoluteOpShape(const nnvm::NodeAttrs& attrs, + mxnet::ShapeVector* in_attrs, + mxnet::ShapeVector* out_attrs) { + // One in, one out. 
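+  // (The output is a single scalar; shape (1,) is assigned below.)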
+ CHECK_EQ(in_attrs->size(), 1U); + CHECK_EQ(out_attrs->size(), 1U); + + SHAPE_ASSIGN_CHECK(*out_attrs, 0, mxnet::TShape(1, 1)); + return shape_is_known(in_attrs->at(0)); +} + +inline bool MaxAbsoluteOpType(const nnvm::NodeAttrs& attrs, + std::vector* in_attrs, + std::vector* out_attrs) { + CHECK_EQ(in_attrs->size(), 1U); + CHECK_EQ(out_attrs->size(), 1U); + + TYPE_ASSIGN_CHECK(*out_attrs, 0, mshadow::kFloat32); + TYPE_ASSIGN_CHECK(*in_attrs, 0, mshadow::kFloat32); + return true; +} + +inline bool MaxAbsoluteOpStorageType(const nnvm::NodeAttrs& attrs, + const int dev_mask, + DispatchMode* dispatch_mode, + std::vector* in_attrs, + std::vector* out_attrs) { + *dispatch_mode = DispatchMode::kFCompute; + CHECK_EQ(in_attrs->size(), 1U); + CHECK_EQ(out_attrs->size(), 1U); + (*out_attrs)[0] = kDefaultStorage; + return true; +} + +void MaxAbsoluteOpForwardCPU(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { + CHECK_EQ(inputs.size(), 1U); + CHECK_EQ(outputs.size(), 1U); + CHECK_EQ(req.size(), 1U); + CHECK_EQ(req[0], kWriteTo) << "TODO request types other than write"; + const TBlob &in = inputs.front(), &out = outputs.front(); + CHECK_EQ(in.type_flag_, mshadow::kFloat32); + CHECK_EQ(out.type_flag_, mshadow::kFloat32); + CHECK(in.CheckContiguous()); + CHECK(out.CheckContiguous()); + size_t size = in.shape_.Size(); + CHECK_EQ(size % (512 / 8 / sizeof(float)), 0) << "The total size of the input must be a multiple of 16."; + + const float *data = in.dptr(); + *out.dptr() = ::intgemm::MaxAbsolute(data, data + size); +} + +NNVM_REGISTER_OP(_contrib_intgemm_maxabsolute) +.describe(R"code(Compute the maximum absolute value in a tensor of float32 fast on a CPU. The tensor's total size must be a multiple of 16 and aligned to a multiple of 64 bytes. 
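+
+For example, computing a quantization multiplier (a sketch; the 4x16 shape is illustrative and satisfies the size requirement):
+
+  arr = mx.nd.random_uniform(low=-1.0, high=1.0, shape=[4, 16])
+  multiplier = 127.0 / mx.nd.contrib.intgemm_maxabsolute(arr).asscalar()
+
+In general,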
+mxnet.nd.contrib.intgemm_maxabsolute(arr) == arr.abs().max() +)code" ADD_FILELINE) +//.set_attr_parser(ParamParser) +.set_num_inputs(1) +.set_num_outputs(1) +.set_attr("FListInputNames", + [](const NodeAttrs& attrs) { + return std::vector{"data"}; + }) +.set_attr("FInferShape", MaxAbsoluteOpShape) +.set_attr("FInferType", MaxAbsoluteOpType) +.set_attr("FInferStorageType", MaxAbsoluteOpStorageType) +.set_attr("FCompute", MaxAbsoluteOpForwardCPU) +.set_attr("FInplaceOption", + [](const NodeAttrs& attrs) { + return std::vector >{{0, 0}}; + }) +.add_argument("data", "NDArray-or-Symbol", "Tensor to compute maximum absolute value of"); +//.add_arguments(MaxAbsoluteParam::__FIELDS__()); + +} // namespace op +} // namespace mxnet From 9e5d7d5ada88ec904003653285c5648e61f7726c Mon Sep 17 00:00:00 2001 From: Kenneth Heafield Date: Mon, 2 Dec 2019 10:28:16 +0000 Subject: [PATCH 07/65] intgemm fully_connected operator import mxnet as mx a = mx.nd.random_uniform(low=-1.0, high=1.0, shape=[5, 64]) b = mx.nd.random_uniform(low=-1.0, high=1.0, shape=[8, 64]) b_scale = 127.0 / mx.nd.contrib.intgemm_maxabsolute(b).asscalar() b_prepared = mx.nd.contrib.intgemm_prepareb(b, multiplier = b_scale) mx.nd.FullyConnected(a, b, num_hidden=8, no_bias=True, flatten=False) mx.nd.contrib.intgemm_fully_connected(a, b_prepared, out_float_multiplier=1.0/b_scale, num_hidden=8, no_bias=True, flatten=False) --- 3rdparty/intgemm | 2 +- .../intgemm/intgemm_fully_connected_op.cc | 270 ++++++++++++++++++ src/operator/contrib/intgemm/prepare_a_op.cc | 136 +++++++++ src/operator/contrib/intgemm/prepare_b_op.cc | 5 +- 4 files changed, 408 insertions(+), 5 deletions(-) create mode 100644 src/operator/contrib/intgemm/intgemm_fully_connected_op.cc create mode 100644 src/operator/contrib/intgemm/prepare_a_op.cc diff --git a/3rdparty/intgemm b/3rdparty/intgemm index 37ba078bcab4..faa4a3aef206 160000 --- a/3rdparty/intgemm +++ b/3rdparty/intgemm @@ -1 +1 @@ -Subproject commit 37ba078bcab455081ecf8f218d953405aa137e19 +Subproject commit faa4a3aef2062d1dd2da51730caf80b5875a9f10 diff --git a/src/operator/contrib/intgemm/intgemm_fully_connected_op.cc b/src/operator/contrib/intgemm/intgemm_fully_connected_op.cc new file mode 100644 index 000000000000..6b3b3ebefa65 --- /dev/null +++ b/src/operator/contrib/intgemm/intgemm_fully_connected_op.cc @@ -0,0 +1,270 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! 
+ * \file intgemm_fully_connected_op.cc
+ * \brief Operator wrapping intgemm's Multiply routine
+ */
+
+#include <mxnet/operator_util.h>
+#include <vector>
+#include <cstdlib>
+#include "../../mshadow_op.h"
+#include "../../mxnet_op.h"
+#include "../../operator_common.h"
+#include "../../tensor/init_op.h"
+
+#include "../../../../3rdparty/intgemm/aligned.h"
+#include "../../../../3rdparty/intgemm/intgemm.h"
+
+namespace mxnet {
+namespace op {
+
+struct IntgemmFullyConnectedParam : public dmlc::Parameter<IntgemmFullyConnectedParam> {
+  float out_float_multiplier;
+  int out_type;
+  int num_hidden;
+  bool no_bias;
+  bool flatten;
+  DMLC_DECLARE_PARAMETER(IntgemmFullyConnectedParam) {
+    // This part is a copy of the FullyConnected parameters.
+    DMLC_DECLARE_FIELD(num_hidden).set_lower_bound(1)
+    .describe("Number of hidden nodes of the output.");
+    DMLC_DECLARE_FIELD(no_bias).set_default(false)
+    .describe("Whether to disable bias parameter.");
+    DMLC_DECLARE_FIELD(flatten).set_default(true)
+    .describe("Whether to collapse all but the first axis of the input data tensor.");
+
+    DMLC_DECLARE_FIELD(out_type)
+    .add_enum("float32", mshadow::kFloat32)
+    .add_enum("int32", mshadow::kInt32)
+    .set_default(mshadow::kFloat32)
+    .describe("Output data type.");
+    DMLC_DECLARE_FIELD(out_float_multiplier)
+    .describe("If the out_type is float32, unquantize by multiplying by this number. Typically 1.0/preparea.multiplier/prepareb.multiplier. If you pass A in as float32, then A will be quantized using preparea.multiplier = 127.0/max(abs(A)) and out_float_multiplier will be adjusted accordingly.");
+  }
+};
+DMLC_REGISTER_PARAMETER(IntgemmFullyConnectedParam);
+
+namespace {
+template <class T> void IntgemmFullyConnectedSanity(const nnvm::NodeAttrs& attrs, T* in, T* out) {
+  // 2-3 parameters: A, B, and optional bias
+  const IntgemmFullyConnectedParam& param = nnvm::get<IntgemmFullyConnectedParam>(attrs.parsed);
+  CHECK_EQ(in->size(), param.no_bias ? 2U : 3U);
+  CHECK_EQ(out->size(), 1U);
+}
+}  // namespace
+
+inline bool IntgemmFullyConnectedOpShape(const nnvm::NodeAttrs& attrs,
+                                         mxnet::ShapeVector* in_shape,
+                                         mxnet::ShapeVector* out_shape) {
+  IntgemmFullyConnectedSanity(attrs, in_shape, out_shape);
+  // This follows FullyConnectedShape, except that the weight has already been prepared by intgemm_prepareb.
+  const IntgemmFullyConnectedParam& param = nnvm::get<IntgemmFullyConnectedParam>(attrs.parsed);
+
+  // The rest is copied from FullyConnected.
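+  // (Shapes: data is (..., num_input) when flatten is false, weight is
+  //  (num_hidden, num_input), and the output replaces num_input with num_hidden.)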
+ using namespace mshadow; + if (!param.no_bias) { + CHECK_EQ(in_shape->size(), 3U) << "Input:[data, weight, bias]"; + } else { + CHECK_EQ(in_shape->size(), 2U) << "Input:[data, weight]"; + } + CHECK_EQ(out_shape->size(), 1U); + mxnet::TShape dshape = (*in_shape)[0]; + mxnet::TShape oshape = (*out_shape)[0]; + // require data to be known + if (!mxnet::ndim_is_known(dshape)) return false; + + index_t num_input; + if (!param.flatten) { + num_input = dshape[dshape.ndim()-1]; + } else { + num_input = dshape.ProdShape(1, dshape.ndim()); + } + SHAPE_ASSIGN_CHECK(*in_shape, 1, Shape2(param.num_hidden, num_input)); + if (!param.no_bias) { + if (!shape_assign(&(*in_shape)[2], Shape1(param.num_hidden)) && + !shape_assign(&(*in_shape)[2], Shape2(param.num_hidden, 1))) { + LOG(FATAL) << "Unexpected shape for bias " << (*in_shape)[2]; + } + } + + if (!param.flatten) { + mxnet::TShape result_shape(dshape); + result_shape[dshape.ndim()-1] = param.num_hidden; + SHAPE_ASSIGN_CHECK(*out_shape, 0, result_shape); + } else { + SHAPE_ASSIGN_CHECK(*out_shape, 0, Shape2(dshape[0], param.num_hidden)); + } + if (oshape.ndim() > 0) { + dshape[0] = oshape[0]; + SHAPE_ASSIGN_CHECK(*in_shape, 0, dshape); + } + return true; +} + +bool IntgemmFullyConnectedOpType(const nnvm::NodeAttrs& attrs, + std::vector* in_attrs, + std::vector* out_attrs) { + IntgemmFullyConnectedSanity(attrs, in_attrs, out_attrs); + const IntgemmFullyConnectedParam& param = nnvm::get(attrs.parsed); + + // Match the configuration for output. + TYPE_ASSIGN_CHECK(*out_attrs, 0, param.out_type); + if (in_attrs->size() == 3) { + // Bias has same type as output. + TYPE_ASSIGN_CHECK(*in_attrs, 2, (*out_attrs)[0]); + TYPE_ASSIGN_CHECK(*out_attrs, 0, (*in_attrs)[2]); + } + // Users have to prepare B. + TYPE_ASSIGN_CHECK(*in_attrs, 1, mshadow::kInt8); + // A can be a float (in which case it is automatically quantized) or int8. + if (type_is_none((*in_attrs)[0])) { + return false; + } + return ((*in_attrs)[0] == mshadow::kInt8 || (*in_attrs)[0] == mshadow::kFloat32); +} + +namespace { + +// TODO: amend AlignedVector to allow a reset. 
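+// Scoped holder that frees an aligned temporary buffer (used below for the
+// on-the-fly quantized copy of A) when the operator returns.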
+class FreeMe {
+ public:
+  FreeMe() : mem_(nullptr) {}
+  ~FreeMe() { std::free(mem_); }
+  void Reset(int8_t *with) {
+    std::free(mem_);
+    mem_ = with;
+  }
+ private:
+  int8_t *mem_;
+};
+
+}  // namespace
+
+void IntgemmFullyConnectedOpForwardCPU(const nnvm::NodeAttrs& attrs,
+                                       const OpContext& ctx,
+                                       const std::vector<TBlob>& inputs,
+                                       const std::vector<OpReqType>& req,
+                                       const std::vector<TBlob>& outputs) {
+  IntgemmFullyConnectedSanity(attrs, &inputs, &outputs);
+  const IntgemmFullyConnectedParam& param = nnvm::get<IntgemmFullyConnectedParam>(attrs.parsed);
+  CHECK_EQ(req.size(), 1U);
+  CHECK_EQ(req[0], kWriteTo) << "intgemm only overwrites";
+
+  const TBlob &A = inputs[0], &B = inputs[1], &C = outputs[0];
+
+  CHECK(A.type_flag_ == mshadow::kInt8 || A.type_flag_ == mshadow::kFloat32);
+  CHECK_EQ(B.type_flag_, mshadow::kInt8);
+  CHECK(C.type_flag_ == mshadow::kInt32 || C.type_flag_ == mshadow::kFloat32);
+  CHECK(A.CheckContiguous());
+  CHECK(B.CheckContiguous());
+  CHECK(C.CheckContiguous());
+  CHECK_GE(A.shape_.ndim(), 1);
+  CHECK_GE(B.shape_.ndim(), 2);
+  size_t A_rows = A.shape_.ProdShape(0, A.shape_.ndim() - 1);
+  size_t inner = A.shape_[A.shape_.ndim() - 1];
+  CHECK_EQ(B.shape_[B.shape_.ndim() - 1], inner);
+  size_t B_cols = B.shape_.ProdShape(0, B.shape_.ndim() - 1);
+
+  CHECK_EQ(C.shape_.Size(), A_rows * B_cols);
+
+  bool bias = (inputs.size() == 3);
+  if (bias) {
+    CHECK_EQ(inputs[2].type_flag_, mshadow::kFloat32);
+    CHECK_EQ(C.type_flag_, mshadow::kFloat32);
+    CHECK_EQ(inputs[2].shape_.Size(), param.num_hidden);
+  }
+  CHECK_EQ(inner % ::intgemm::Int8::kBTileRow, 0) << "intgemm requires the inner dimension be a multiple of " << ::intgemm::Int8::kBTileRow;
+  CHECK_EQ(B_cols % ::intgemm::Int8::kBTileCol, 0) << "intgemm requires B have a multiple of " << ::intgemm::Int8::kBTileCol << " columns in the equation C = AB.";
+
+  float out_float_multiplier = param.out_float_multiplier;
+
+  int8_t *A_quant;
+  // TODO report this memory consumption?
+  FreeMe A_quant_store;
+  if (A.type_flag_ == mshadow::kFloat32) {
+    const float *A_raw = A.dptr<float>();
+    // Quantize A for the user. TODO: allow scale to be passed in. Should the induced scale be an output?
+    float scale = 127.0 / ::intgemm::MaxAbsolute(A_raw, A_raw + A.shape_.Size());
+    out_float_multiplier /= scale;
+    // TODO report this memory consumption to mxnet?
+    A_quant = (int8_t*)aligned_alloc(64, A.shape_.Size());
+    CHECK(A_quant);
+    A_quant_store.Reset(A_quant);
+    ::intgemm::Int8::PrepareA(A_raw, A_quant, scale, A_rows, inner);
+  } else {
+    CHECK_EQ(A.type_flag_, mshadow::kInt8);
+    A_quant = A.dptr<int8_t>();
+  }
+  const int8_t *B_quant = B.dptr<int8_t>();
+
+  if (bias) {
+    if (C.type_flag_ == mshadow::kFloat32) {
+      ::intgemm::callbacks::UnquantizeAndAddBiasAndWrite cb(out_float_multiplier, inputs[2].dptr<float>(), C.dptr<float>());
+      ::intgemm::Int8::Multiply(A_quant, B_quant, A_rows, inner, B_cols, cb);
+    } else {
+      // int32
+      ::intgemm::callbacks::AddBiasAndWrite cb(inputs[2].dptr<int32_t>(), C.dptr<int32_t>());
+      ::intgemm::Int8::Multiply(A_quant, B_quant, A_rows, inner, B_cols, cb);
+    }
+  } else {
+    if (C.type_flag_ == mshadow::kFloat32) {
+      ::intgemm::callbacks::UnquantizeAndWrite cb(out_float_multiplier, C.dptr<float>());
+      ::intgemm::Int8::Multiply(A_quant, B_quant, A_rows, inner, B_cols, cb);
+    } else {
+      // int32
+      ::intgemm::callbacks::Write cb(C.dptr<int32_t>());
+      ::intgemm::Int8::Multiply(A_quant, B_quant, A_rows, inner, B_cols, cb);
+    }
+  }
+}
+
+NNVM_REGISTER_OP(_contrib_intgemm_fully_connected)
+.describe(R"code(Multiply the matrix A by the matrix B and output C = AB, wrapping intgemm's int8 Multiply routine.
+
+A may be float32, in which case it is quantized on the fly using 127.0 / max(abs(A)), or int8 as produced by intgemm_preparea. B must be int8 as produced by intgemm_prepareb. The output is int32, or float32 after multiplying by out_float_multiplier and optionally adding a bias.
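+
+Example usage, repeating the sketch from the commit message above (shapes illustrative):
+
+  a = mx.nd.random_uniform(low=-1.0, high=1.0, shape=[5, 64])
+  b = mx.nd.random_uniform(low=-1.0, high=1.0, shape=[8, 64])
+  b_scale = 127.0 / mx.nd.contrib.intgemm_maxabsolute(b).asscalar()
+  b_prepared = mx.nd.contrib.intgemm_prepareb(b, multiplier=b_scale)
+  c = mx.nd.contrib.intgemm_fully_connected(a, b_prepared,
+      out_float_multiplier=1.0/b_scale, num_hidden=8, no_bias=True, flatten=False)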
+)code" ADD_FILELINE)
+.set_attr_parser(ParamParser<IntgemmFullyConnectedParam>)
+.set_num_inputs([](const NodeAttrs& attrs) {
+  const IntgemmFullyConnectedParam& params = nnvm::get<IntgemmFullyConnectedParam>(attrs.parsed);
+  return params.no_bias ? 2 : 3;
+})
+.set_num_outputs(1)
+.set_attr<nnvm::FListInputNames>("FListInputNames",
+  [](const NodeAttrs& attrs) {
+    return std::vector<std::string>{"A", "B"};
+  })
+.set_attr<mxnet::FInferShape>("FInferShape", IntgemmFullyConnectedOpShape)
+.set_attr<nnvm::FInferType>("FInferType", IntgemmFullyConnectedOpType)
+//.set_attr<FInferStorageType>("FInferStorageType", IntgemmFullyConnectedOpStorageType)
+.set_attr<FCompute>("FCompute<cpu>", IntgemmFullyConnectedOpForwardCPU)
+.set_attr<nnvm::FInplaceOption>("FInplaceOption",
+  [](const NodeAttrs& attrs) {
+    return std::vector<std::pair<int, int> >{{0, 0}};
+  })
+.add_argument("data", "NDArray-or-Symbol", "First (A) argument to multiplication. Tensor of float32 (quantized on the fly) or int8 from intgemm_preparea. If you use a different quantizer, be sure to ban -128. The last dimension must be a multiple of 64.")
+.add_argument("weight", "NDArray-or-Symbol", "Second (B) argument to multiplication. Tensor of int8 from intgemm_prepareb. The last dimension must be a multiple of 64. The product of non-last dimensions must be a multiple of 8.")
+.add_argument("bias", "NDArray-or-Symbol", "Bias term.")
+.add_arguments(IntgemmFullyConnectedParam::__FIELDS__());
+
+}  // namespace op
+}  // namespace mxnet
diff --git a/src/operator/contrib/intgemm/prepare_a_op.cc b/src/operator/contrib/intgemm/prepare_a_op.cc
new file mode 100644
index 000000000000..2d3fb61df7f1
--- /dev/null
+++ b/src/operator/contrib/intgemm/prepare_a_op.cc
@@ -0,0 +1,136 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * \file prepare_a_op.cc
+ * \brief Converts A matrices (typically activations) to intgemm's
+ * representation for A in C=AB. This just quantizes to int8 and bans -128.
+ * The only difference from Quantize/QuantizeV2 is that it bans -128.
+ */
+
+#include <mxnet/operator_util.h>
+#include <vector>
+#include "../../mshadow_op.h"
+#include "../../mxnet_op.h"
+#include "../../operator_common.h"
+#include "../../tensor/init_op.h"
+
+#include "../../../../3rdparty/intgemm/intgemm.h"
+
+namespace mxnet {
+namespace op {
+
+struct PrepareAParam : public dmlc::Parameter<PrepareAParam> {
+  float multiplier;
+  DMLC_DECLARE_PARAMETER(PrepareAParam) {
+    DMLC_DECLARE_FIELD(multiplier)
+      .describe("Multiply floats by this constant before casting to int8. Typically you would set this to 127.0 / max absolute value in A.");
+  }
+};
+DMLC_REGISTER_PARAMETER(PrepareAParam);
+
+inline bool PrepareAOpShape(const nnvm::NodeAttrs& attrs,
+                            mxnet::ShapeVector* in_attrs,
+                            mxnet::ShapeVector* out_attrs) {
+  // One in, one out.
+  CHECK_EQ(in_attrs->size(), 1U);
+  CHECK_EQ(out_attrs->size(), 1U);
+
+  SHAPE_ASSIGN_CHECK(*out_attrs, 0, in_attrs->at(0));
+  SHAPE_ASSIGN_CHECK(*in_attrs, 0, out_attrs->at(0));
+
+  return shape_is_known(out_attrs->at(0));
+}
+
+inline bool PrepareAOpType(const nnvm::NodeAttrs& attrs,
+                           std::vector<int>* in_attrs,
+                           std::vector<int>* out_attrs) {
+  CHECK_EQ(in_attrs->size(), 1U);
+  CHECK_EQ(out_attrs->size(), 1U);
+
+  // This routine converts from float to int8.
+  TYPE_ASSIGN_CHECK(*out_attrs, 0, mshadow::kInt8);
+  TYPE_ASSIGN_CHECK(*in_attrs, 0, mshadow::kFloat32);
+  return true;
+}
+
+inline bool PrepareAOpStorageType(const nnvm::NodeAttrs& attrs,
+                                  const int dev_mask,
+                                  DispatchMode* dispatch_mode,
+                                  std::vector<int>* in_attrs,
+                                  std::vector<int>* out_attrs) {
+  CHECK_EQ(in_attrs->size(), 1U);
+  CHECK_EQ(out_attrs->size(), 1U);
+  // Dense storage only.
+  return storage_type_assign(&out_attrs->front(), kDefaultStorage, dispatch_mode, DispatchMode::kFCompute) &&
+         storage_type_assign(&in_attrs->front(), kDefaultStorage, dispatch_mode, DispatchMode::kFCompute);
+}
+
+void PrepareAOpForwardCPU(const nnvm::NodeAttrs& attrs,
+                          const OpContext& ctx,
+                          const std::vector<TBlob>& inputs,
+                          const std::vector<OpReqType>& req,
+                          const std::vector<TBlob>& outputs) {
+  CHECK_EQ(inputs.size(), 1U);
+  CHECK_EQ(outputs.size(), 1U);
+  CHECK_EQ(req.size(), 1U);
+  CHECK_EQ(req[0], kWriteTo) << "intgemm only overwrites";
+  const TBlob &in = inputs[0], &out = outputs[0];
+
+  CHECK_EQ(in.type_flag_, mshadow::kFloat32);
+  CHECK_EQ(out.type_flag_, mshadow::kInt8);
+  CHECK(in.CheckContiguous());
+  CHECK(out.CheckContiguous());
+  size_t size = in.shape_.Size();
+  CHECK_EQ(size % 16, 0) << "intgemm PrepareA requires the size be a multiple of 16.";
+
+  const float *A = in.dptr<float>();
+  int8_t *quantA = out.dptr<int8_t>();
+  const PrepareAParam& param = nnvm::get<PrepareAParam>(attrs.parsed);
+  ::intgemm::Int8::Quantize(A, quantA, param.multiplier, size);
+}
+
+NNVM_REGISTER_OP(_contrib_intgemm_preparea)
+.describe(R"code(This operator quantizes float32 to int8 while also banning -128.
+
+It is suitable for preparing an A matrix for use by intgemm's C=AB operation.
+
+The float32 values are multiplied by the provided multiplier before casting to int8. Typically this is 127.0 / maxabsolute(A).
+)code" ADD_FILELINE)
+.set_attr_parser(ParamParser<PrepareAParam>)
+.set_num_inputs(1)
+.set_num_outputs(1)
+.set_attr<nnvm::FListInputNames>("FListInputNames",
+  [](const NodeAttrs& attrs) {
+    return std::vector<std::string>{"A"};
+  })
+.set_attr<mxnet::FInferShape>("FInferShape", PrepareAOpShape)
+.set_attr<nnvm::FInferType>("FInferType", PrepareAOpType)
+.set_attr<FInferStorageType>("FInferStorageType", PrepareAOpStorageType)
+.set_attr<FCompute>("FCompute<cpu>", PrepareAOpForwardCPU)
+.set_attr<nnvm::FInplaceOption>("FInplaceOption",
+  [](const NodeAttrs& attrs) {
+    return std::vector<std::pair<int, int> >{{0, 0}};
+  })
+.add_argument("A", "NDArray-or-Symbol", "Activation matrix to be prepared for multiplication.")
+.add_arguments(PrepareAParam::__FIELDS__());
+
+}  // namespace op
+}  // namespace mxnet
diff --git a/src/operator/contrib/intgemm/prepare_b_op.cc b/src/operator/contrib/intgemm/prepare_b_op.cc
index 32424e515b9e..aeeb8f313eaf 100644
--- a/src/operator/contrib/intgemm/prepare_b_op.cc
+++ b/src/operator/contrib/intgemm/prepare_b_op.cc
@@ -42,6 +42,7 @@ struct PrepareBParam : public dmlc::Parameter<PrepareBParam> {
     .describe("Multiply floats by this constant before casting to int8.
Typically you would set this to 127.0 / max absolute value in B."); } }; +DMLC_REGISTER_PARAMETER(PrepareBParam); inline bool PrepareBOpShape(const nnvm::NodeAttrs& attrs, mxnet::ShapeVector* in_attrs, @@ -84,10 +85,6 @@ inline bool PrepareBOpStorageType(const nnvm::NodeAttrs& attrs, storage_type_assign(&in_attrs->front(), kDefaultStorage, dispatch_mode, DispatchMode::kFCompute); } - - -DMLC_REGISTER_PARAMETER(PrepareBParam); - namespace { void PrepareBOpForward(const nnvm::NodeAttrs& attrs, const OpContext& ctx, From 897bf6e9fa94aff978c0e9ea5cc11a51c2aadf3f Mon Sep 17 00:00:00 2001 From: Kenneth Heafield Date: Mon, 2 Dec 2019 10:36:34 +0000 Subject: [PATCH 08/65] Update to latest intgemm --- 3rdparty/intgemm | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/3rdparty/intgemm b/3rdparty/intgemm index faa4a3aef206..12daa143f40f 160000 --- a/3rdparty/intgemm +++ b/3rdparty/intgemm @@ -1 +1 @@ -Subproject commit faa4a3aef2062d1dd2da51730caf80b5875a9f10 +Subproject commit 12daa143f40f99861c20564e9c6ad50f947c0ca5 From b65e33f35b0118c3b26214c3e1bf838f764d7ed0 Mon Sep 17 00:00:00 2001 From: Kenneth Heafield Date: Mon, 2 Dec 2019 10:43:05 +0000 Subject: [PATCH 09/65] Remove trailing whitespace --- src/operator/contrib/intgemm/intgemm_fully_connected_op.cc | 2 +- src/operator/contrib/intgemm/max_absolute_op.cc | 2 +- src/operator/contrib/intgemm/prepare_a_op.cc | 6 +++--- src/operator/contrib/intgemm/prepare_b_op.cc | 2 +- 4 files changed, 6 insertions(+), 6 deletions(-) diff --git a/src/operator/contrib/intgemm/intgemm_fully_connected_op.cc b/src/operator/contrib/intgemm/intgemm_fully_connected_op.cc index 6b3b3ebefa65..b7a67279ed0b 100644 --- a/src/operator/contrib/intgemm/intgemm_fully_connected_op.cc +++ b/src/operator/contrib/intgemm/intgemm_fully_connected_op.cc @@ -214,7 +214,7 @@ void IntgemmFullyConnectedOpForwardCPU(const nnvm::NodeAttrs& attrs, A_quant = A.dptr(); } const int8_t *B_quant = B.dptr(); - + if (bias) { if (C.type_flag_ == mshadow::kFloat32) { ::intgemm::callbacks::UnquantizeAndAddBiasAndWrite cb(out_float_multiplier, inputs[2].dptr(), C.dptr()); diff --git a/src/operator/contrib/intgemm/max_absolute_op.cc b/src/operator/contrib/intgemm/max_absolute_op.cc index 7c2786aebaa6..488d3f48c2da 100644 --- a/src/operator/contrib/intgemm/max_absolute_op.cc +++ b/src/operator/contrib/intgemm/max_absolute_op.cc @@ -88,7 +88,7 @@ void MaxAbsoluteOpForwardCPU(const nnvm::NodeAttrs& attrs, CHECK(out.CheckContiguous()); size_t size = in.shape_.Size(); CHECK_EQ(size % (512 / 8 / sizeof(float)), 0) << "The total size of the input must be a multiple of 16."; - + const float *data = in.dptr(); *out.dptr() = ::intgemm::MaxAbsolute(data, data + size); } diff --git a/src/operator/contrib/intgemm/prepare_a_op.cc b/src/operator/contrib/intgemm/prepare_a_op.cc index 2d3fb61df7f1..e5a1d4420416 100644 --- a/src/operator/contrib/intgemm/prepare_a_op.cc +++ b/src/operator/contrib/intgemm/prepare_a_op.cc @@ -19,8 +19,8 @@ /*! * \file prepare_a_op.cc - * \brief Converts A matrices (typically activations) to intgemm's - * representation for A in C=AB. This just quantizes to int8 and bans -128. + * \brief Converts A matrices (typically activations) to intgemm's + * representation for A in C=AB. This just quantizes to int8 and bans -128. * The only difference from Quantize/QuantizeV2 is that it bans -128. 
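 * (-128 is excluded because it has no positive counterpart in int8, which
 * intgemm's saturating sign trick relies on; see intgemm's documentation.)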
*/ @@ -100,7 +100,7 @@ void PrepareAOpForwardCPU(const nnvm::NodeAttrs& attrs, CHECK(out.CheckContiguous()); size_t size = in.shape_.Size(); CHECK_EQ(size % 16, 0) << "intgemm PrepareA requires the size be a multiple of 16."; - + const float *A = in.dptr(); int8_t *quantA = out.dptr(); const PrepareAParam& param = nnvm::get(attrs.parsed); diff --git a/src/operator/contrib/intgemm/prepare_b_op.cc b/src/operator/contrib/intgemm/prepare_b_op.cc index aeeb8f313eaf..d33adaf82806 100644 --- a/src/operator/contrib/intgemm/prepare_b_op.cc +++ b/src/operator/contrib/intgemm/prepare_b_op.cc @@ -98,7 +98,7 @@ void PrepareBOpForward(const nnvm::NodeAttrs& attrs, size_t inner = in.shape_[in.shape_.ndim() - 1]; CHECK_EQ(inner % ::intgemm::Int8::kBTileRow, 0) << "intgemm requires the inner dimension be a multiple of " << ::intgemm::Int8::kBTileRow; CHECK_EQ(B_cols % ::intgemm::Int8::kBTileCol, 0) << "intgemm requires B have a multiple of " << ::intgemm::Int8::kBTileCol << " columns inthe equation C = AB."; - + const float *B = in.dptr(); int8_t *quantB = out.dptr(); const PrepareBParam& param = nnvm::get(attrs.parsed); From b615ee8d694d8463be55563d72386d3ee13cb468 Mon Sep 17 00:00:00 2001 From: Kenneth Heafield Date: Mon, 2 Dec 2019 13:01:40 +0000 Subject: [PATCH 10/65] Extract common code from Prepare* operations --- src/operator/contrib/intgemm/prepare_a_op.cc | 60 ++-------- src/operator/contrib/intgemm/prepare_b_op.cc | 106 +++--------------- .../contrib/intgemm/prepare_op-common.cc | 78 +++++++++++++ .../contrib/intgemm/prepare_op-common.h | 60 ++++++++++ 4 files changed, 163 insertions(+), 141 deletions(-) create mode 100644 src/operator/contrib/intgemm/prepare_op-common.cc create mode 100644 src/operator/contrib/intgemm/prepare_op-common.h diff --git a/src/operator/contrib/intgemm/prepare_a_op.cc b/src/operator/contrib/intgemm/prepare_a_op.cc index e5a1d4420416..6dab1d68ea10 100644 --- a/src/operator/contrib/intgemm/prepare_a_op.cc +++ b/src/operator/contrib/intgemm/prepare_a_op.cc @@ -24,6 +24,7 @@ * The only difference from Quantize/QuantizeV2 is that it bans -128. */ +#include "prepare_op-common.h" #include #include #include "../../mshadow_op.h" @@ -36,53 +37,6 @@ namespace mxnet { namespace op { -struct PrepareAParam : public dmlc::Parameter { - float multiplier; - DMLC_DECLARE_PARAMETER(PrepareAParam) { - DMLC_DECLARE_FIELD(multiplier) - .describe("Multiply floats by this constant before casting to int8. Typically you would set this to 127.0 / max absolute value in A."); - } -}; -DMLC_REGISTER_PARAMETER(PrepareAParam); - -inline bool PrepareAOpShape(const nnvm::NodeAttrs& attrs, - mxnet::ShapeVector* in_attrs, - mxnet::ShapeVector* out_attrs) { - // One in, one out. - CHECK_EQ(in_attrs->size(), 1U); - CHECK_EQ(out_attrs->size(), 1U); - - SHAPE_ASSIGN_CHECK(*out_attrs, 0, in_attrs->at(0)); - SHAPE_ASSIGN_CHECK(*in_attrs, 0, out_attrs->at(0)); - - return shape_is_known(out_attrs->at(0)); -} - -inline bool PrepareAOpType(const nnvm::NodeAttrs& attrs, - std::vector* in_attrs, - std::vector* out_attrs) { - CHECK_EQ(in_attrs->size(), 1U); - CHECK_EQ(out_attrs->size(), 1U); - - // This routine converts from float to int8. 
- TYPE_ASSIGN_CHECK(*out_attrs, 0, mshadow::kInt8); - TYPE_ASSIGN_CHECK(*in_attrs, 0, mshadow::kFloat32); - return true; -} - -inline bool PrepareAOpStorageType(const nnvm::NodeAttrs& attrs, - const int dev_mask, - DispatchMode* dispatch_mode, - std::vector* in_attrs, - std::vector* out_attrs) { - CHECK_EQ(in_attrs->size(), 1U); - CHECK_EQ(out_attrs->size(), 1U); - // Dense storage only. - return storage_type_assign(&out_attrs->front(), kDefaultStorage, dispatch_mode, DispatchMode::kFCompute) && - storage_type_assign(&in_attrs->front(), kDefaultStorage, dispatch_mode, DispatchMode::kFCompute); -} - - void PrepareAOpForwardCPU(const nnvm::NodeAttrs& attrs, const OpContext& ctx, const std::vector& inputs, @@ -103,7 +57,7 @@ void PrepareAOpForwardCPU(const nnvm::NodeAttrs& attrs, const float *A = in.dptr(); int8_t *quantA = out.dptr(); - const PrepareAParam& param = nnvm::get(attrs.parsed); + const PrepareParam& param = nnvm::get(attrs.parsed); ::intgemm::Int8::Quantize(A, quantA, param.multiplier, size); } @@ -114,23 +68,23 @@ It it suitable for preparing an A matrix for use by intgemm's C=AB operation. The float32 values are multiplied by the provided multiplier before casting to int8. Typically this is 127.0 / maxabsolute(A). )code" ADD_FILELINE) -.set_attr_parser(ParamParser) +.set_attr_parser(ParamParser) .set_num_inputs(1) .set_num_outputs(1) .set_attr("FListInputNames", [](const NodeAttrs& attrs) { return std::vector{"A"}; }) -.set_attr("FInferShape", PrepareAOpShape) -.set_attr("FInferType", PrepareAOpType) -.set_attr("FInferStorageType", PrepareAOpStorageType) +.set_attr("FInferShape", PrepareOpShape) +.set_attr("FInferType", PrepareOpType) +.set_attr("FInferStorageType", PrepareOpStorageType) .set_attr("FCompute", PrepareAOpForwardCPU) .set_attr("FInplaceOption", [](const NodeAttrs& attrs) { return std::vector >{{0, 0}}; }) .add_argument("A", "NDArray-or-Symbol", "Activation matrix to be prepared for multiplication.") -.add_arguments(PrepareAParam::__FIELDS__()); +.add_arguments(PrepareParam::__FIELDS__()); } // namespace op } // namespace mxnet diff --git a/src/operator/contrib/intgemm/prepare_b_op.cc b/src/operator/contrib/intgemm/prepare_b_op.cc index d33adaf82806..ba0752b978e6 100644 --- a/src/operator/contrib/intgemm/prepare_b_op.cc +++ b/src/operator/contrib/intgemm/prepare_b_op.cc @@ -22,6 +22,7 @@ * \brief Converts B matrices to intgemm's representation. */ +#include "prepare_op-common.h" #include #include #include "../../mshadow_op.h" @@ -35,61 +36,18 @@ namespace mxnet { namespace op { -struct PrepareBParam : public dmlc::Parameter { - float multiplier; - DMLC_DECLARE_PARAMETER(PrepareBParam) { - DMLC_DECLARE_FIELD(multiplier) - .describe("Multiply floats by this constant before casting to int8. Typically you would set this to 127.0 / max absolute value in B."); - } -}; -DMLC_REGISTER_PARAMETER(PrepareBParam); - -inline bool PrepareBOpShape(const nnvm::NodeAttrs& attrs, - mxnet::ShapeVector* in_attrs, - mxnet::ShapeVector* out_attrs) { - // One in, one out. 
- CHECK_EQ(in_attrs->size(), 1U); - CHECK_EQ(out_attrs->size(), 1U); - - SHAPE_ASSIGN_CHECK(*out_attrs, 0, in_attrs->at(0)); - SHAPE_ASSIGN_CHECK(*in_attrs, 0, out_attrs->at(0)); - - const mxnet::TShape &shape = in_attrs->front(); - if (mxnet::ndim_is_known(shape)) { - CHECK_GE(shape.ndim(), 2) << "Matrices have at least two dimensions."; - } - return !mxnet::op::shape_is_none(out_attrs->at(0)); -} - -inline bool PrepareBOpType(const nnvm::NodeAttrs& attrs, - std::vector* in_attrs, - std::vector* out_attrs) { - CHECK_EQ(in_attrs->size(), 1U); - CHECK_EQ(out_attrs->size(), 1U); - - // This routine converts from float to int8. - TYPE_ASSIGN_CHECK(*out_attrs, 0, mshadow::kInt8); - TYPE_ASSIGN_CHECK(*in_attrs, 0, mshadow::kFloat32); - return true; -} - -inline bool PrepareBOpStorageType(const nnvm::NodeAttrs& attrs, - const int dev_mask, - DispatchMode* dispatch_mode, - std::vector* in_attrs, - std::vector* out_attrs) { - CHECK_EQ(in_attrs->size(), 1U); - CHECK_EQ(out_attrs->size(), 1U); - // Dense storage only. - return storage_type_assign(&out_attrs->front(), kDefaultStorage, dispatch_mode, DispatchMode::kFCompute) && - storage_type_assign(&in_attrs->front(), kDefaultStorage, dispatch_mode, DispatchMode::kFCompute); -} +void PrepareBOpForwardCPU(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { + CHECK_EQ(inputs.size(), 1U); + CHECK_EQ(outputs.size(), 1U); + CHECK_EQ(req.size(), 1U); + CHECK_EQ(req[0], kWriteTo) << "intgemm only overwrites"; -namespace { -void PrepareBOpForward(const nnvm::NodeAttrs& attrs, - const OpContext& ctx, - const TBlob& in, - const TBlob& out) { + const TBlob &in = inputs.front(); + const TBlob &out = outputs.front(); CHECK_EQ(in.type_flag_, mshadow::kFloat32); CHECK_EQ(out.type_flag_, mshadow::kInt8); CHECK(in.CheckContiguous()); @@ -101,7 +59,6 @@ void PrepareBOpForward(const nnvm::NodeAttrs& attrs, const float *B = in.dptr(); int8_t *quantB = out.dptr(); - const PrepareBParam& param = nnvm::get(attrs.parsed); // TODO: eliminate transpose here by making a PrepareBColumnMajor. 
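  // (The input stores B column-major: each leading-dimension slice is one
  //  column of B. The loop below rearranges it into the row-major inner x
  //  B_cols buffer that intgemm::Int8::PrepareB expects.)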
intgemm::AlignedVector B_transpose(inner * B_cols); for (size_t i = 0; i < inner; ++i) { @@ -109,35 +66,9 @@ void PrepareBOpForward(const nnvm::NodeAttrs& attrs, B_transpose[i * B_cols + j] = B[i + inner * j]; } } + const PrepareParam& param = nnvm::get(attrs.parsed); ::intgemm::Int8::PrepareB(B_transpose.begin(), quantB, param.multiplier, inner, B_cols); } -} // namespace - -void PrepareBOpForwardExCPU(const nnvm::NodeAttrs& attrs, - const OpContext& ctx, - const std::vector& inputs, - const std::vector& req, - const std::vector& outputs) { - CHECK_EQ(inputs.size(), 1U); - CHECK_EQ(outputs.size(), 1U); - CHECK_EQ(req.size(), 1U); - CHECK_EQ(req[0], kWriteTo) << "intgemm only overwrites"; - CHECK_EQ(inputs[0].storage_type(), kDefaultStorage); - CHECK_EQ(outputs[0].storage_type(), kDefaultStorage); - PrepareBOpForward(attrs, ctx, inputs[0].data(), outputs[0].data()); -} - -void PrepareBOpForwardCPU(const nnvm::NodeAttrs& attrs, - const OpContext& ctx, - const std::vector& inputs, - const std::vector& req, - const std::vector& outputs) { - CHECK_EQ(inputs.size(), 1U); - CHECK_EQ(outputs.size(), 1U); - CHECK_EQ(req.size(), 1U); - CHECK_EQ(req[0], kWriteTo) << "intgemm only overwrites"; - PrepareBOpForward(attrs, ctx, inputs[0], outputs[0]); -} NNVM_REGISTER_OP(_contrib_intgemm_prepareb) .describe(R"code(This operator converts a float32 matrix to intgemm's internal representation of B in preparation for the operation C = AB. B should be provided in column-major order i.e. the last dimension of shape is the number of rows of B. This operator is not meant to be fast; it is meant to be run offline to quantize a model. @@ -146,24 +77,23 @@ The float32 values are multiplied by the provided multiplier before casting to i The internal representation of B is CPU dependent: AVX512BW, AVX2, and SSSE3 have different formats. )code" ADD_FILELINE) -.set_attr_parser(ParamParser) +.set_attr_parser(ParamParser) .set_num_inputs(1) .set_num_outputs(1) .set_attr("FListInputNames", [](const NodeAttrs& attrs) { return std::vector{"B"}; }) -.set_attr("FInferShape", PrepareBOpShape) -.set_attr("FInferType", PrepareBOpType) -.set_attr("FInferStorageType", PrepareBOpStorageType) +.set_attr("FInferShape", PrepareOpShape) +.set_attr("FInferType", PrepareOpType) +.set_attr("FInferStorageType", PrepareOpStorageType) .set_attr("FCompute", PrepareBOpForwardCPU) -.set_attr("FComputeEx", PrepareBOpForwardExCPU) .set_attr("FInplaceOption", [](const NodeAttrs& attrs) { return std::vector >{{0, 0}}; }) .add_argument("B", "NDArray-or-Symbol", "Parameter matrix to be prepared for multiplication.") -.add_arguments(PrepareBParam::__FIELDS__()); +.add_arguments(PrepareParam::__FIELDS__()); } // namespace op } // namespace mxnet diff --git a/src/operator/contrib/intgemm/prepare_op-common.cc b/src/operator/contrib/intgemm/prepare_op-common.cc new file mode 100644 index 000000000000..6f70c186d82d --- /dev/null +++ b/src/operator/contrib/intgemm/prepare_op-common.cc @@ -0,0 +1,78 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file prepare_op-common.cc + * \brief Common functions for intgemm's PrepareA and PrepareB functions. + * These are used to convert float tensors to values suitable for + * multiplication. + */ + +#include "prepare_op-common.h" +#include +#include +#include "../../mshadow_op.h" +#include "../../mxnet_op.h" +#include "../../operator_common.h" +#include "../../tensor/init_op.h" + +namespace mxnet { +namespace op { + +DMLC_REGISTER_PARAMETER(PrepareParam); + +bool PrepareOpShape(const nnvm::NodeAttrs& attrs, + mxnet::ShapeVector* in_attrs, + mxnet::ShapeVector* out_attrs) { + // One in, one out, same size. + CHECK_EQ(in_attrs->size(), 1U); + CHECK_EQ(out_attrs->size(), 1U); + + SHAPE_ASSIGN_CHECK(*out_attrs, 0, in_attrs->at(0)); + SHAPE_ASSIGN_CHECK(*in_attrs, 0, out_attrs->at(0)); + + return shape_is_known(out_attrs->at(0)); +} + +bool PrepareOpType(const nnvm::NodeAttrs& attrs, + std::vector* in_attrs, + std::vector* out_attrs) { + CHECK_EQ(in_attrs->size(), 1U); + CHECK_EQ(out_attrs->size(), 1U); + + // This routine converts from float to int8. + TYPE_ASSIGN_CHECK(*out_attrs, 0, mshadow::kInt8); + TYPE_ASSIGN_CHECK(*in_attrs, 0, mshadow::kFloat32); + return true; +} + +bool PrepareOpStorageType(const nnvm::NodeAttrs& attrs, + const int dev_mask, + DispatchMode* dispatch_mode, + std::vector* in_attrs, + std::vector* out_attrs) { + CHECK_EQ(in_attrs->size(), 1U); + CHECK_EQ(out_attrs->size(), 1U); + // Dense storage only. + return storage_type_assign(&out_attrs->front(), kDefaultStorage, dispatch_mode, DispatchMode::kFCompute) && + storage_type_assign(&in_attrs->front(), kDefaultStorage, dispatch_mode, DispatchMode::kFCompute); +} + +} // namespace op +} // namespace mxnet diff --git a/src/operator/contrib/intgemm/prepare_op-common.h b/src/operator/contrib/intgemm/prepare_op-common.h new file mode 100644 index 000000000000..135d82dfec79 --- /dev/null +++ b/src/operator/contrib/intgemm/prepare_op-common.h @@ -0,0 +1,60 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file prepare_op-common.h + * \brief Common functions for intgemm's PrepareA and PrepareB functions. + * These are used to convert float tensors to values suitable for + * multiplication. 
+ */ + +#include +#include +#include "../../mshadow_op.h" +#include "../../mxnet_op.h" +#include "../../operator_common.h" +#include "../../tensor/init_op.h" + +namespace mxnet { +namespace op { + +struct PrepareParam : public dmlc::Parameter { + float multiplier; + DMLC_DECLARE_PARAMETER(PrepareParam) { + DMLC_DECLARE_FIELD(multiplier) + .describe("Multiply floats by this constant before casting to int8. Typically you would set this to 127.0 / max absolute value."); + } +}; + +bool PrepareOpShape(const nnvm::NodeAttrs& attrs, + mxnet::ShapeVector* in_attrs, + mxnet::ShapeVector* out_attrs); + +bool PrepareOpType(const nnvm::NodeAttrs& attrs, + std::vector* in_attrs, + std::vector* out_attrs); + +bool PrepareOpStorageType(const nnvm::NodeAttrs& attrs, + const int dev_mask, + DispatchMode* dispatch_mode, + std::vector* in_attrs, + std::vector* out_attrs); + +} // namespace op +} // namespace mxnet From ed6be7e41675420f2ad2e3028d99838e15e89dc8 Mon Sep 17 00:00:00 2001 From: Kenneth Heafield Date: Tue, 3 Dec 2019 11:28:34 +0000 Subject: [PATCH 11/65] Disable in-place, zero gradients following existing quantization code --- .../intgemm/intgemm_fully_connected_op.cc | 54 ++++++++++--------- src/operator/contrib/intgemm/prepare_a_op.cc | 7 ++- src/operator/contrib/intgemm/prepare_b_op.cc | 7 ++- 3 files changed, 34 insertions(+), 34 deletions(-) diff --git a/src/operator/contrib/intgemm/intgemm_fully_connected_op.cc b/src/operator/contrib/intgemm/intgemm_fully_connected_op.cc index b7a67279ed0b..e3772d84e84f 100644 --- a/src/operator/contrib/intgemm/intgemm_fully_connected_op.cc +++ b/src/operator/contrib/intgemm/intgemm_fully_connected_op.cc @@ -56,17 +56,15 @@ struct IntgemmFullyConnectedParam : public dmlc::Parameter void IntgemmFullyConnectedSanity(const nnvm::NodeAttrs& attrs, T* in, T* out) { - // 2-3 parameters: A, B, and optional bias + // 3-4 parameters: A, B, scaling, and optional bias const IntgemmFullyConnectedParam& param = nnvm::get(attrs.parsed); - CHECK_EQ(in->size(), param.no_bias ? 2U : 3U); + CHECK_EQ(in->size(), param.no_bias ? 3U : 4U); CHECK_EQ(out->size(), 1U); } } // namespace @@ -81,9 +79,9 @@ inline bool IntgemmFullyConnectedOpShape(const nnvm::NodeAttrs& attrs, // The rest is copied from FullyConnected. using namespace mshadow; if (!param.no_bias) { - CHECK_EQ(in_shape->size(), 3U) << "Input:[data, weight, bias]"; + CHECK_EQ(in_shape->size(), 4U) << "Input:[data, weight, scaling_factor, bias]"; } else { - CHECK_EQ(in_shape->size(), 2U) << "Input:[data, weight]"; + CHECK_EQ(in_shape->size(), 3U) << "Input:[data, weight, scaling_factor]"; } CHECK_EQ(out_shape->size(), 1U); mxnet::TShape dshape = (*in_shape)[0]; @@ -98,10 +96,11 @@ inline bool IntgemmFullyConnectedOpShape(const nnvm::NodeAttrs& attrs, num_input = dshape.ProdShape(1, dshape.ndim()); } SHAPE_ASSIGN_CHECK(*in_shape, 1, Shape2(param.num_hidden, num_input)); + SHAPE_ASSIGN_CHECK(*in_shape, 2, mxnet::TShape(1, 1)); if (!param.no_bias) { - if (!shape_assign(&(*in_shape)[2], Shape1(param.num_hidden)) && - !shape_assign(&(*in_shape)[2], Shape2(param.num_hidden, 1))) { - LOG(FATAL) << "Unexpected shape for bias " << (*in_shape)[2]; + if (!shape_assign(&(*in_shape)[3], Shape1(param.num_hidden)) && + !shape_assign(&(*in_shape)[3], Shape2(param.num_hidden, 1))) { + LOG(FATAL) << "Unexpected shape for bias " << (*in_shape)[3]; } } @@ -127,11 +126,13 @@ bool IntgemmFullyConnectedOpType(const nnvm::NodeAttrs& attrs, // Match the configuration for output. 
TYPE_ASSIGN_CHECK(*out_attrs, 0, param.out_type);
-  if (in_attrs->size() == 3) {
+  if (!param.no_bias) {
     // Bias has same type as output.
-    TYPE_ASSIGN_CHECK(*in_attrs, 2, (*out_attrs)[0]);
-    TYPE_ASSIGN_CHECK(*out_attrs, 0, (*in_attrs)[2]);
+    TYPE_ASSIGN_CHECK(*in_attrs, 3, (*out_attrs)[0]);
+    TYPE_ASSIGN_CHECK(*out_attrs, 0, (*in_attrs)[3]);
   }
+  // Scaling is float32.
+  TYPE_ASSIGN_CHECK(*in_attrs, 2, mshadow::kFloat32);
   // Users have to prepare B.
   TYPE_ASSIGN_CHECK(*in_attrs, 1, mshadow::kInt8);
   // A can be a float (in which case it is automatically quantized) or int8.
@@ -166,7 +167,7 @@ void IntgemmFullyConnectedOpForwardCPU(const nnvm::NodeAttrs& attrs,
   IntgemmFullyConnectedSanity(attrs, &inputs, &outputs);
   const IntgemmFullyConnectedParam& param = nnvm::get(attrs.parsed);
   CHECK_EQ(req.size(), 1U);
-  CHECK_EQ(req[0], kWriteTo) << "intgemm only overwrites";
+  CHECK_EQ(req[0], kWriteTo) << "TODO: doing more than overwriting for intgemm. Note: kWriteInplace = " << kWriteInplace << " kWriteTo = " << kWriteTo;

   const TBlob &A = inputs[0], &B = inputs[1], &C = outputs[0];

@@ -185,16 +186,16 @@ void IntgemmFullyConnectedOpForwardCPU(const nnvm::NodeAttrs& attrs,

   CHECK_EQ(C.shape_.Size(), A_rows * B_cols);

-  bool bias = (inputs.size() == 3);
+  bool bias = !param.no_bias;
   if (bias) {
-    CHECK_EQ(inputs[2].type_flag_, mshadow::kFloat32);
+    CHECK_EQ(inputs[3].type_flag_, mshadow::kFloat32);
     CHECK_EQ(C.type_flag_, mshadow::kFloat32);
-    CHECK_EQ(inputs[2].shape_.Size(), param.num_hidden);
+    CHECK_EQ(inputs[3].shape_.Size(), param.num_hidden);
   }
   CHECK_EQ(inner % ::intgemm::Int8::kBTileRow, 0) << "intgemm requires the inner dimension be a multiple of " << ::intgemm::Int8::kBTileRow;
   CHECK_EQ(B_cols % ::intgemm::Int8::kBTileCol, 0) << "intgemm requires B to have a multiple of " << ::intgemm::Int8::kBTileCol << " columns in the equation C = AB.";

-  float out_float_multiplier = param.out_float_multiplier;
+  float out_float_multiplier = *inputs[2].dptr();

   int8_t *A_quant;
   // TODO report this memory consumption?
@@ -217,11 +218,11 @@ void IntgemmFullyConnectedOpForwardCPU(const nnvm::NodeAttrs& attrs,

   if (bias) {
     if (C.type_flag_ == mshadow::kFloat32) {
-      ::intgemm::callbacks::UnquantizeAndAddBiasAndWrite cb(out_float_multiplier, inputs[2].dptr(), C.dptr());
+      ::intgemm::callbacks::UnquantizeAndAddBiasAndWrite cb(out_float_multiplier, inputs[3].dptr(), C.dptr());
       ::intgemm::Int8::Multiply(A_quant, B_quant, A_rows, inner, B_cols, cb);
     } else {
       // int32
-      ::intgemm::callbacks::AddBiasAndWrite cb(inputs[2].dptr(), C.dptr());
+      ::intgemm::callbacks::AddBiasAndWrite cb(inputs[3].dptr(), C.dptr());
       ::intgemm::Int8::Multiply(A_quant, B_quant, A_rows, inner, B_cols, cb);
     }
   } else {
@@ -246,24 +247,25 @@ The float32 values are multiplied by the provided multiplier before casting to i
 .set_attr_parser(ParamParser)
 .set_num_inputs([](const NodeAttrs& attrs) {
   const IntgemmFullyConnectedParam& params = nnvm::get(attrs.parsed);
-  return params.no_bias ? 2 : 3;
+  return params.no_bias ? 3 : 4;
 })
 .set_num_outputs(1)
 .set_attr("FListInputNames",
   [](const NodeAttrs& attrs) {
-    return std::vector{"A", "B"};
+    const IntgemmFullyConnectedParam& params = nnvm::get(attrs.parsed);
+    return params.no_bias ? 
std::vector{"A", "weight", "scaling"} : std::vector{"A", "weight", "scaling", "bias"}; }) .set_attr("FInferShape", IntgemmFullyConnectedOpShape) .set_attr("FInferType", IntgemmFullyConnectedOpType) //.set_attr("FInferStorageType", IntgemmFullyConnectedOpStorageType) .set_attr("FCompute", IntgemmFullyConnectedOpForwardCPU) -.set_attr("FInplaceOption", - [](const NodeAttrs& attrs) { - return std::vector >{{0, 0}}; - }) -.add_argument("data", "NDArray-or-Symbol", "First (A) argument to multiplication. Tensor of float32 (quantized on the fly) or int8 from intgemm_preparea. If you use a different quantizer, be sure to ban -128. The last dimension must be a multiple of 64.") +.add_argument("A", "NDArray-or-Symbol", "First (A) argument to multiplication. Tensor of float32 (quantized on the fly) or int8 from intgemm_preparea. If you use a different quantizer, be sure to ban -128. The last dimension must be a multiple of 64.") .add_argument("weight", "NDArray-or-Symbol", "Second (B) argument to multiplication. Tensor of int8 from intgemm_prepareb. The last dimension must be a multiple of 64. The product of non-last dimensions must be a multiple of 8.") +.add_argument("scaling", "NDArray-or-Symbol", "Scaling factor to apply if output type is float32.") .add_argument("bias", "NDArray-or-Symbol", "Bias term.") +// TODO(Xinyu): a temp solution to enable GluonCV INT8 flow, +// will be reverted after the improvement of CachedOP is done. +.set_attr("FGradient", MakeZeroGradNodes) .add_arguments(IntgemmFullyConnectedParam::__FIELDS__()); } // namespace op diff --git a/src/operator/contrib/intgemm/prepare_a_op.cc b/src/operator/contrib/intgemm/prepare_a_op.cc index 6dab1d68ea10..7d9b172bc95c 100644 --- a/src/operator/contrib/intgemm/prepare_a_op.cc +++ b/src/operator/contrib/intgemm/prepare_a_op.cc @@ -79,11 +79,10 @@ The float32 values are multiplied by the provided multiplier before casting to i .set_attr("FInferType", PrepareOpType) .set_attr("FInferStorageType", PrepareOpStorageType) .set_attr("FCompute", PrepareAOpForwardCPU) -.set_attr("FInplaceOption", - [](const NodeAttrs& attrs) { - return std::vector >{{0, 0}}; - }) .add_argument("A", "NDArray-or-Symbol", "Activation matrix to be prepared for multiplication.") +// TODO(Xinyu): a temp solution to enable GluonCV INT8 flow, +// will be reverted after the improvement of CachedOP is done. +.set_attr("FGradient", MakeZeroGradNodes) .add_arguments(PrepareParam::__FIELDS__()); } // namespace op diff --git a/src/operator/contrib/intgemm/prepare_b_op.cc b/src/operator/contrib/intgemm/prepare_b_op.cc index ba0752b978e6..8a2833ffc8e2 100644 --- a/src/operator/contrib/intgemm/prepare_b_op.cc +++ b/src/operator/contrib/intgemm/prepare_b_op.cc @@ -88,11 +88,10 @@ The internal representation of B is CPU dependent: AVX512BW, AVX2, and SSSE3 hav .set_attr("FInferType", PrepareOpType) .set_attr("FInferStorageType", PrepareOpStorageType) .set_attr("FCompute", PrepareBOpForwardCPU) -.set_attr("FInplaceOption", - [](const NodeAttrs& attrs) { - return std::vector >{{0, 0}}; - }) .add_argument("B", "NDArray-or-Symbol", "Parameter matrix to be prepared for multiplication.") +// TODO(Xinyu): a temp solution to enable GluonCV INT8 flow, +// will be reverted after the improvement of CachedOP is done. 
+.set_attr("FGradient", MakeZeroGradNodes) .add_arguments(PrepareParam::__FIELDS__()); } // namespace op From 153a628eb5e995f5ccc7364cdaef1ad569b4fce8 Mon Sep 17 00:00:00 2001 From: Kenneth Heafield Date: Tue, 3 Dec 2019 12:05:28 +0000 Subject: [PATCH 12/65] Remove commented out parameter --- src/operator/contrib/intgemm/max_absolute_op.cc | 4 ---- 1 file changed, 4 deletions(-) diff --git a/src/operator/contrib/intgemm/max_absolute_op.cc b/src/operator/contrib/intgemm/max_absolute_op.cc index 488d3f48c2da..f98c6dfa262e 100644 --- a/src/operator/contrib/intgemm/max_absolute_op.cc +++ b/src/operator/contrib/intgemm/max_absolute_op.cc @@ -34,10 +34,6 @@ namespace mxnet { namespace op { -/*struct MaxAbsoluteParam : public dmlc::Parameter { -}; -DMLC_REGISTER_PARAMETER(MaxAbsoluteParam);*/ - inline bool MaxAbsoluteOpShape(const nnvm::NodeAttrs& attrs, mxnet::ShapeVector* in_attrs, mxnet::ShapeVector* out_attrs) { From f1cd4abc8cdf45cdd2778512540453eeb86d7dac Mon Sep 17 00:00:00 2001 From: Kenneth Heafield Date: Tue, 3 Dec 2019 13:00:18 +0000 Subject: [PATCH 13/65] Better documentation/parameter naming for intgemm fully connected --- .../contrib/intgemm/intgemm_fully_connected_op.cc | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/src/operator/contrib/intgemm/intgemm_fully_connected_op.cc b/src/operator/contrib/intgemm/intgemm_fully_connected_op.cc index e3772d84e84f..89dc9e7b4a36 100644 --- a/src/operator/contrib/intgemm/intgemm_fully_connected_op.cc +++ b/src/operator/contrib/intgemm/intgemm_fully_connected_op.cc @@ -238,11 +238,15 @@ void IntgemmFullyConnectedOpForwardCPU(const nnvm::NodeAttrs& attrs, } NNVM_REGISTER_OP(_contrib_intgemm_fully_connected) -.describe(R"code(This operator converts quantizes float32 to int8 while also banning -128. +.describe(R"code(Multiply matrices using 8-bit integers. -It it suitable for preparing an A matrix for use by intgemm's C=AB operation. +The data argument can be either float32 or prepared using intgemm_preparea. -The float32 values are multiplied by the provided multiplier before casting to int8. Typically this is 127.0 / maxabsolute(A). +The weight argument must be prepared using intgemm_prepareb. + +If out_type is float32, then a scaling factor is applied before bias. Typically this is 1/the scaling factor you provided to prepareb/the scaling factor you provided to preparea (if data is quantized). + +The out_type can be int32 or float32. Bias must have the same type. )code" ADD_FILELINE) .set_attr_parser(ParamParser) .set_num_inputs([](const NodeAttrs& attrs) { @@ -253,13 +257,12 @@ The float32 values are multiplied by the provided multiplier before casting to i .set_attr("FListInputNames", [](const NodeAttrs& attrs) { const IntgemmFullyConnectedParam& params = nnvm::get(attrs.parsed); - return params.no_bias ? std::vector{"A", "weight", "scaling"} : std::vector{"A", "weight", "scaling", "bias"}; + return params.no_bias ? std::vector{"data", "weight", "scaling"} : std::vector{"data", "weight", "scaling", "bias"}; }) .set_attr("FInferShape", IntgemmFullyConnectedOpShape) .set_attr("FInferType", IntgemmFullyConnectedOpType) -//.set_attr("FInferStorageType", IntgemmFullyConnectedOpStorageType) .set_attr("FCompute", IntgemmFullyConnectedOpForwardCPU) -.add_argument("A", "NDArray-or-Symbol", "First (A) argument to multiplication. Tensor of float32 (quantized on the fly) or int8 from intgemm_preparea. If you use a different quantizer, be sure to ban -128. 
The last dimension must be a multiple of 64.") +.add_argument("data", "NDArray-or-Symbol", "First (A) argument to multiplication. Tensor of float32 (quantized on the fly) or int8 from intgemm_preparea. If you use a different quantizer, be sure to ban -128. The last dimension must be a multiple of 64.") .add_argument("weight", "NDArray-or-Symbol", "Second (B) argument to multiplication. Tensor of int8 from intgemm_prepareb. The last dimension must be a multiple of 64. The product of non-last dimensions must be a multiple of 8.") .add_argument("scaling", "NDArray-or-Symbol", "Scaling factor to apply if output type is float32.") .add_argument("bias", "NDArray-or-Symbol", "Bias term.") From 8b5d10760e500ce42f0688a7f5426e187b4cf826 Mon Sep 17 00:00:00 2001 From: Kenneth Heafield Date: Tue, 3 Dec 2019 13:21:23 +0000 Subject: [PATCH 14/65] Rename preparea to prepare_data, prepareb to prepare_weight --- .../intgemm/intgemm_fully_connected_op.cc | 10 +++++----- .../{prepare_a_op.cc => prepare_data_op.cc} | 18 +++++++++--------- .../{prepare_b_op.cc => prepare_weight_op.cc} | 16 ++++++++-------- 3 files changed, 22 insertions(+), 22 deletions(-) rename src/operator/contrib/intgemm/{prepare_a_op.cc => prepare_data_op.cc} (83%) rename src/operator/contrib/intgemm/{prepare_b_op.cc => prepare_weight_op.cc} (80%) diff --git a/src/operator/contrib/intgemm/intgemm_fully_connected_op.cc b/src/operator/contrib/intgemm/intgemm_fully_connected_op.cc index 89dc9e7b4a36..8441d3a3af3d 100644 --- a/src/operator/contrib/intgemm/intgemm_fully_connected_op.cc +++ b/src/operator/contrib/intgemm/intgemm_fully_connected_op.cc @@ -240,11 +240,11 @@ void IntgemmFullyConnectedOpForwardCPU(const nnvm::NodeAttrs& attrs, NNVM_REGISTER_OP(_contrib_intgemm_fully_connected) .describe(R"code(Multiply matrices using 8-bit integers. -The data argument can be either float32 or prepared using intgemm_preparea. +The data argument can be either float32 or prepared using intgemm_prepare_data. -The weight argument must be prepared using intgemm_prepareb. +The weight argument must be prepared using intgemm_prepare_weight. -If out_type is float32, then a scaling factor is applied before bias. Typically this is 1/the scaling factor you provided to prepareb/the scaling factor you provided to preparea (if data is quantized). +If out_type is float32, then a scaling factor is applied before bias. Typically this is 1/the scaling factor you provided to prepare_weight/the scaling factor you provided to prepare_data (if data is quantized). The out_type can be int32 or float32. Bias must have the same type. )code" ADD_FILELINE) @@ -262,8 +262,8 @@ The out_type can be int32 or float32. Bias must have the same type. .set_attr("FInferShape", IntgemmFullyConnectedOpShape) .set_attr("FInferType", IntgemmFullyConnectedOpType) .set_attr("FCompute", IntgemmFullyConnectedOpForwardCPU) -.add_argument("data", "NDArray-or-Symbol", "First (A) argument to multiplication. Tensor of float32 (quantized on the fly) or int8 from intgemm_preparea. If you use a different quantizer, be sure to ban -128. The last dimension must be a multiple of 64.") -.add_argument("weight", "NDArray-or-Symbol", "Second (B) argument to multiplication. Tensor of int8 from intgemm_prepareb. The last dimension must be a multiple of 64. The product of non-last dimensions must be a multiple of 8.") +.add_argument("data", "NDArray-or-Symbol", "First argument to multiplication. Tensor of float32 (quantized on the fly) or int8 from intgemm_prepare_data. 
If you use a different quantizer, be sure to ban -128. The last dimension must be a multiple of 64.")
 .add_argument("weight", "NDArray-or-Symbol", "Second argument to multiplication. Tensor of int8 from intgemm_prepare_weight. The last dimension must be a multiple of 64. The product of non-last dimensions must be a multiple of 8.")
 .add_argument("scaling", "NDArray-or-Symbol", "Scaling factor to apply if output type is float32.")
 .add_argument("bias", "NDArray-or-Symbol", "Bias term.")
 // TODO(Xinyu): a temp solution to enable GluonCV INT8 flow,
 // will be reverted after the improvement of CachedOP is done.
 .set_attr("FGradient", MakeZeroGradNodes)
 .add_arguments(IntgemmFullyConnectedParam::__FIELDS__());

 } // namespace op
diff --git a/src/operator/contrib/intgemm/prepare_a_op.cc b/src/operator/contrib/intgemm/prepare_data_op.cc
similarity index 83%
rename from src/operator/contrib/intgemm/prepare_a_op.cc
rename to src/operator/contrib/intgemm/prepare_data_op.cc
index 7d9b172bc95c..88293b1d3e28 100644
--- a/src/operator/contrib/intgemm/prepare_a_op.cc
+++ b/src/operator/contrib/intgemm/prepare_data_op.cc
@@ -18,8 +18,8 @@
  */

 /*!
- * \file prepare_a_op.cc
- * \brief Converts A matrices (typically activations) to intgemm's
+ * \file prepare_data_op.cc
+ * \brief Converts data aka A matrices (typically activations) to intgemm's
  * representation for A in C=AB. This just quantizes to int8 and bans -128.
  * The only difference from Quantize/QuantizeV2 is that it bans -128.
  */
@@ -37,7 +37,7 @@ namespace mxnet {
 namespace op {

-void PrepareAOpForwardCPU(const nnvm::NodeAttrs& attrs,
+void PrepareDataOpForwardCPU(const nnvm::NodeAttrs& attrs,
                           const OpContext& ctx,
                           const std::vector& inputs,
                           const std::vector& req,
                           const std::vector& outputs) {
@@ -53,7 +53,7 @@ void PrepareAOpForwardCPU(const nnvm::NodeAttrs& attrs,
   CHECK(in.CheckContiguous());
   CHECK(out.CheckContiguous());
   size_t size = in.shape_.Size();
-  CHECK_EQ(size % 16, 0) << "intgemm PrepareA requires the size be a multiple of 16.";
+  CHECK_EQ(size % 16, 0) << "intgemm PrepareData requires the size be a multiple of 16.";

   const float *A = in.dptr();
   int8_t *quantA = out.dptr();
@@ -61,10 +61,10 @@ void PrepareAOpForwardCPU(const nnvm::NodeAttrs& attrs,
   ::intgemm::Int8::Quantize(A, quantA, param.multiplier, size);
 }

-NNVM_REGISTER_OP(_contrib_intgemm_preparea)
+NNVM_REGISTER_OP(_contrib_intgemm_prepare_data)
 .describe(R"code(This operator quantizes float32 to int8 while also banning -128.

-It is suitable for preparing an A matrix for use by intgemm's C=AB operation.
+It is suitable for preparing a data matrix for use by intgemm's C=data * weights operation.

 The float32 values are multiplied by the provided multiplier before casting to int8. Typically this is 127.0 / maxabsolute(A).
 )code" ADD_FILELINE)
@@ -73,13 +73,13 @@ The float32 values are multiplied by the provided multiplier before casting to i
 .set_attr("FListInputNames",
   [](const NodeAttrs& attrs) {
-    return std::vector{"A"};
+    return std::vector{"data"};
   })
 .set_attr("FInferShape", PrepareOpShape)
 .set_attr("FInferType", PrepareOpType)
 .set_attr("FInferStorageType", PrepareOpStorageType)
-.set_attr("FCompute", PrepareAOpForwardCPU)
-.add_argument("A", "NDArray-or-Symbol", "Activation matrix to be prepared for multiplication.")
+.set_attr("FCompute", PrepareDataOpForwardCPU)
+.add_argument("data", "NDArray-or-Symbol", "Activation matrix to be prepared for multiplication.")
 // TODO(Xinyu): a temp solution to enable GluonCV INT8 flow,
 // will be reverted after the improvement of CachedOP is done. 
.set_attr("FGradient", MakeZeroGradNodes) diff --git a/src/operator/contrib/intgemm/prepare_b_op.cc b/src/operator/contrib/intgemm/prepare_weight_op.cc similarity index 80% rename from src/operator/contrib/intgemm/prepare_b_op.cc rename to src/operator/contrib/intgemm/prepare_weight_op.cc index 8a2833ffc8e2..5b9135505378 100644 --- a/src/operator/contrib/intgemm/prepare_b_op.cc +++ b/src/operator/contrib/intgemm/prepare_weight_op.cc @@ -36,7 +36,7 @@ namespace mxnet { namespace op { -void PrepareBOpForwardCPU(const nnvm::NodeAttrs& attrs, +void PrepareWeightOpForwardCPU(const nnvm::NodeAttrs& attrs, const OpContext& ctx, const std::vector& inputs, const std::vector& req, @@ -59,7 +59,7 @@ void PrepareBOpForwardCPU(const nnvm::NodeAttrs& attrs, const float *B = in.dptr(); int8_t *quantB = out.dptr(); - // TODO: eliminate transpose here by making a PrepareBColumnMajor. + // TODO: eliminate transpose here by making a PrepareWeightColumnMajor. intgemm::AlignedVector B_transpose(inner * B_cols); for (size_t i = 0; i < inner; ++i) { for (size_t j = 0; j < B_cols; ++j) { @@ -70,25 +70,25 @@ void PrepareBOpForwardCPU(const nnvm::NodeAttrs& attrs, ::intgemm::Int8::PrepareB(B_transpose.begin(), quantB, param.multiplier, inner, B_cols); } -NNVM_REGISTER_OP(_contrib_intgemm_prepareb) -.describe(R"code(This operator converts a float32 matrix to intgemm's internal representation of B in preparation for the operation C = AB. B should be provided in column-major order i.e. the last dimension of shape is the number of rows of B. This operator is not meant to be fast; it is meant to be run offline to quantize a model. +NNVM_REGISTER_OP(_contrib_intgemm_prepare_weight) +.describe(R"code(This operator converts a float32 matrix to intgemm's internal representation of weight matrices in preparation for the operation C = data * weight. weight should be provided in column-major order i.e. the last dimension of shape is the number of rows of weight. This is the usual representation mxnet uses for weights. This operator is not meant to be fast; it is meant to be run offline to quantize a model. The float32 values are multiplied by the provided multiplier before casting to int8. -The internal representation of B is CPU dependent: AVX512BW, AVX2, and SSSE3 have different formats. +The internal representation is CPU dependent: AVX512BW, AVX2, and SSSE3 have different formats. )code" ADD_FILELINE) .set_attr_parser(ParamParser) .set_num_inputs(1) .set_num_outputs(1) .set_attr("FListInputNames", [](const NodeAttrs& attrs) { - return std::vector{"B"}; + return std::vector{"weight"}; }) .set_attr("FInferShape", PrepareOpShape) .set_attr("FInferType", PrepareOpType) .set_attr("FInferStorageType", PrepareOpStorageType) -.set_attr("FCompute", PrepareBOpForwardCPU) -.add_argument("B", "NDArray-or-Symbol", "Parameter matrix to be prepared for multiplication.") +.set_attr("FCompute", PrepareWeightOpForwardCPU) +.add_argument("weight", "NDArray-or-Symbol", "Parameter matrix to be prepared for multiplication.") // TODO(Xinyu): a temp solution to enable GluonCV INT8 flow, // will be reverted after the improvement of CachedOP is done. 
.set_attr("FGradient", MakeZeroGradNodes) From 6e801f4fe3b95f6d6d746cc401725a6c22e92e07 Mon Sep 17 00:00:00 2001 From: Kenneth Heafield Date: Tue, 3 Dec 2019 14:25:14 +0000 Subject: [PATCH 15/65] Allow all request types for max_absolute --- src/operator/contrib/intgemm/max_absolute_op.cc | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/src/operator/contrib/intgemm/max_absolute_op.cc b/src/operator/contrib/intgemm/max_absolute_op.cc index f98c6dfa262e..fe49c4138b2d 100644 --- a/src/operator/contrib/intgemm/max_absolute_op.cc +++ b/src/operator/contrib/intgemm/max_absolute_op.cc @@ -69,14 +69,13 @@ inline bool MaxAbsoluteOpStorageType(const nnvm::NodeAttrs& attrs, } void MaxAbsoluteOpForwardCPU(const nnvm::NodeAttrs& attrs, - const OpContext& ctx, - const std::vector& inputs, - const std::vector& req, - const std::vector& outputs) { + const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { CHECK_EQ(inputs.size(), 1U); CHECK_EQ(outputs.size(), 1U); CHECK_EQ(req.size(), 1U); - CHECK_EQ(req[0], kWriteTo) << "TODO request types other than write"; const TBlob &in = inputs.front(), &out = outputs.front(); CHECK_EQ(in.type_flag_, mshadow::kFloat32); CHECK_EQ(out.type_flag_, mshadow::kFloat32); @@ -86,7 +85,7 @@ void MaxAbsoluteOpForwardCPU(const nnvm::NodeAttrs& attrs, CHECK_EQ(size % (512 / 8 / sizeof(float)), 0) << "The total size of the input must be a multiple of 16."; const float *data = in.dptr(); - *out.dptr() = ::intgemm::MaxAbsolute(data, data + size); + KERNEL_ASSIGN(*out.dptr(), req[0], ::intgemm::MaxAbsolute(data, data + size)); } NNVM_REGISTER_OP(_contrib_intgemm_maxabsolute) @@ -109,7 +108,6 @@ mxnet.nd.contrib.intgemm_maxabsolute(arr) == arr.abs().max() return std::vector >{{0, 0}}; }) .add_argument("data", "NDArray-or-Symbol", "Tensor to compute maximum absolute value of"); -//.add_arguments(MaxAbsoluteParam::__FIELDS__()); } // namespace op } // namespace mxnet From f492f261876684aa05461f7678411df44ad3548d Mon Sep 17 00:00:00 2001 From: Kenneth Heafield Date: Tue, 3 Dec 2019 14:50:36 +0000 Subject: [PATCH 16/65] Clarify error message --- src/operator/contrib/intgemm/prepare_weight_op.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/operator/contrib/intgemm/prepare_weight_op.cc b/src/operator/contrib/intgemm/prepare_weight_op.cc index 5b9135505378..7f13b6bca791 100644 --- a/src/operator/contrib/intgemm/prepare_weight_op.cc +++ b/src/operator/contrib/intgemm/prepare_weight_op.cc @@ -55,7 +55,7 @@ void PrepareWeightOpForwardCPU(const nnvm::NodeAttrs& attrs, size_t B_cols = in.shape_.ProdShape(0, in.shape_.ndim() - 1); size_t inner = in.shape_[in.shape_.ndim() - 1]; CHECK_EQ(inner % ::intgemm::Int8::kBTileRow, 0) << "intgemm requires the inner dimension be a multiple of " << ::intgemm::Int8::kBTileRow; - CHECK_EQ(B_cols % ::intgemm::Int8::kBTileCol, 0) << "intgemm requires B have a multiple of " << ::intgemm::Int8::kBTileCol << " columns inthe equation C = AB."; + CHECK_EQ(B_cols % ::intgemm::Int8::kBTileCol, 0) << "intgemm requires the output dimension (the product of all but the last dimension of the weight matrix) to be a multiple of " << ::intgemm::Int8::kBTileCol << "."; const float *B = in.dptr(); int8_t *quantB = out.dptr(); From 7a02d05d648dd6f38224c9112da1e5eadfe598d3 Mon Sep 17 00:00:00 2001 From: Kenneth Heafield Date: Tue, 3 Dec 2019 14:51:13 +0000 Subject: [PATCH 17/65] Add operator to slice a B matrix --- .../contrib/intgemm/take_weight_op.cc | 133 ++++++++++++++++++ 
1 file changed, 133 insertions(+) create mode 100644 src/operator/contrib/intgemm/take_weight_op.cc diff --git a/src/operator/contrib/intgemm/take_weight_op.cc b/src/operator/contrib/intgemm/take_weight_op.cc new file mode 100644 index 000000000000..46d3afec1bd3 --- /dev/null +++ b/src/operator/contrib/intgemm/take_weight_op.cc @@ -0,0 +1,133 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file select_weight_op.cc + * \brief Takes from the all-but-last dimension of a tensor stored in + * intgemm's weight format. This is particularly useful for output matrices where + * some outputs are excluded. + */ + +#include +#include +#include "../../mshadow_op.h" +#include "../../mxnet_op.h" +#include "../../operator_common.h" +#include "../../tensor/init_op.h" + +#include "../../../../3rdparty/intgemm/intgemm.h" + +namespace mxnet { +namespace op { + +inline bool TakeWeightOpShape(const nnvm::NodeAttrs& shape, + mxnet::ShapeVector* in_shape, + mxnet::ShapeVector* out_shape) { + // 0 is weight, 1 is indices. + CHECK_EQ(in_shape->size(), 2U); + CHECK_EQ(out_shape->size(), 1U); + + mxnet::TShape &weight = (*in_shape)[0]; + mxnet::TShape &indices = (*in_shape)[1]; + mxnet::TShape &out = (*out_shape)[0]; + + // weight matrices should be 2-dimensional by now. + SHAPE_ASSIGN_CHECK(*in_shape, 0, mxnet::TShape(2, -1)); + SHAPE_ASSIGN_CHECK(*out_shape, 0, mxnet::TShape(2, -1)); + // indices are 1-dimensional. + SHAPE_ASSIGN_CHECK(*in_shape, 1, mxnet::TShape(1, -1)); + + SHAPE_ASSIGN_CHECK(*out_shape, 0, mxnet::TShape({indices[0], weight[1]})); + SHAPE_ASSIGN_CHECK(*in_shape, 0, mxnet::TShape({-1, out[1]})); + SHAPE_ASSIGN_CHECK(*in_shape, 1, mxnet::TShape({out[0]})); + + return shape_is_known(weight) && shape_is_known(indices) && shape_is_known(out); +} + +inline bool TakeWeightOpType(const nnvm::NodeAttrs& attrs, + std::vector* in_attrs, + std::vector* out_attrs) { + CHECK_EQ(in_attrs->size(), 2U); + CHECK_EQ(out_attrs->size(), 1U); + + TYPE_ASSIGN_CHECK(*out_attrs, 0, mshadow::kInt8); + TYPE_ASSIGN_CHECK(*in_attrs, 0, mshadow::kInt8); + // TODO 64-bit index support. Though if you're going that far, you're probably overflowing matrix multiply. 
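+  // The int32 indices are reinterpreted as intgemm::Index (uint32_t) in
+  // TakeWeightOpForwardCPU below, which checks that the two types have the same size.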
+ TYPE_ASSIGN_CHECK(*in_attrs, 1, mshadow::kInt32); + return true; +} + +inline bool TakeWeightOpStorageType(const nnvm::NodeAttrs& attrs, + const int dev_mask, + DispatchMode* dispatch_mode, + std::vector* in_attrs, + std::vector* out_attrs) { + *dispatch_mode = DispatchMode::kFCompute; + CHECK_EQ(in_attrs->size(), 2U); + CHECK_EQ(out_attrs->size(), 1U); + (*out_attrs)[0] = kDefaultStorage; + return true; +} + +void TakeWeightOpForwardCPU(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { + CHECK_EQ(inputs.size(), 2U); + CHECK_EQ(outputs.size(), 1U); + CHECK_EQ(req.size(), 1U); + CHECK_EQ(req[0], kWriteTo) << "TODO request types other than write"; + const TBlob &weight = inputs.front(), &indices = inputs[1], &out = outputs.front(); + CHECK_EQ(weight.type_flag_, mshadow::kInt8); + CHECK_EQ(indices.type_flag_, mshadow::kInt32); + CHECK_EQ(out.type_flag_, mshadow::kInt8); + CHECK(weight.CheckContiguous()); + CHECK(indices.CheckContiguous()); + CHECK(out.CheckContiguous()); + size_t B_cols = indices.shape_[0]; + size_t inner = weight.shape_[weight.shape_.ndim() - 1]; + CHECK_EQ(inner % ::intgemm::Int8::kBTileRow, 0) << "intgemm requires the inner dimension be a multiple of " << ::intgemm::Int8::kBTileRow; + CHECK_EQ(B_cols % ::intgemm::Int8::kBTileCol, 0) << "For efficiency, intgemm requires there to be a multiple of " << ::intgemm::Int8::kBTileCol << " indices."; + // mxnet doesn't have a uint32_t type so we'll just pointer cast. But check the sizes are the same. TODO statically. + assert(sizeof(int32_t) == sizeof(::intgemm::Index)); + const ::intgemm::Index *index = reinterpret_cast(indices.dptr()); + + ::intgemm::Int8::SelectColumnsB(weight.dptr(), out.dptr(), inner, index, index + B_cols); +} + +NNVM_REGISTER_OP(_contrib_intgemm_take_weight_op) +.describe(R"code(Index a weight matrix stored in intgemm's weight format. +The indices select the outputs of matrix multiplication, not the inner dot product dimension. +)code" ADD_FILELINE) +.set_num_inputs(2) +.set_num_outputs(1) +.set_attr("FListInputNames", + [](const NodeAttrs& attrs) { + return std::vector{"weight", "indices"}; + }) +.set_attr("FInferShape", TakeWeightOpShape) +.set_attr("FInferType", TakeWeightOpType) +.set_attr("FInferStorageType", TakeWeightOpStorageType) +.set_attr("FCompute", TakeWeightOpForwardCPU) +.add_argument("weight", "NDArray-or-Symbol", "Tensor already in intgemm weight format to select from") +.add_argument("indices", "NDArray-or-Symbol", "indices to select on the 0th dimension of weight"); + +} // namespace op +} // namespace mxnet From 947f9113efd8148c4313357f76d9946da00b1cf4 Mon Sep 17 00:00:00 2001 From: Kenneth Heafield Date: Tue, 3 Dec 2019 14:52:59 +0000 Subject: [PATCH 18/65] Update intgemm with VNNI --- 3rdparty/intgemm | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/3rdparty/intgemm b/3rdparty/intgemm index 12daa143f40f..170a721538e2 160000 --- a/3rdparty/intgemm +++ b/3rdparty/intgemm @@ -1 +1 @@ -Subproject commit 12daa143f40f99861c20564e9c6ad50f947c0ca5 +Subproject commit 170a721538e25541036a4bd8a634bf9d03afdb38 From b28c699c0a19bec0b76af70ae36fcdaae6222347 Mon Sep 17 00:00:00 2001 From: Kenneth Heafield Date: Tue, 3 Dec 2019 15:48:41 +0000 Subject: [PATCH 19/65] Revert "Update intgemm with VNNI". It's not ready for compilers that lack VNNI support yet. This reverts commit 947f9113efd8148c4313357f76d9946da00b1cf4. 
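For reference, VNNI availability can be probed with a tiny translation unit along these lines. This is a hypothetical sketch, not intgemm's actual compile_test_avx512vnni.cc; only the intrinsic and the GCC target attribute are known-real, everything else is illustrative:

#include <immintrin.h>
// A compiler without AVX512VNNI support rejects the target attribute or the
// intrinsic below, so a build system can detect support by compiling this file.
__attribute__((target("avx512f,avx512bw,avx512vnni")))
__m512i Probe(__m512i sum, __m512i a, __m512i b) {
  return _mm512_dpbusds_epi32(sum, a, b);  // VNNI 8-bit multiply-accumulate
}

int main() { return 0; }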
--- 3rdparty/intgemm | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/3rdparty/intgemm b/3rdparty/intgemm
index 170a721538e2..12daa143f40f 160000
--- a/3rdparty/intgemm
+++ b/3rdparty/intgemm
@@ -1 +1 @@
-Subproject commit 170a721538e25541036a4bd8a634bf9d03afdb38
+Subproject commit 12daa143f40f99861c20564e9c6ad50f947c0ca5

From 8f7deb651ff12a655c08d276839d6b1c88ec485a Mon Sep 17 00:00:00 2001
From: Kenneth Heafield
Date: Tue, 3 Dec 2019 16:23:37 +0000
Subject: [PATCH 20/65] Remove op suffix on intgemm_take_weight

--- src/operator/contrib/intgemm/take_weight_op.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/operator/contrib/intgemm/take_weight_op.cc b/src/operator/contrib/intgemm/take_weight_op.cc
index 46d3afec1bd3..f5ca244d2623 100644
--- a/src/operator/contrib/intgemm/take_weight_op.cc
+++ b/src/operator/contrib/intgemm/take_weight_op.cc
@@ -112,7 +112,7 @@ void TakeWeightOpForwardCPU(const nnvm::NodeAttrs& attrs,
   ::intgemm::Int8::SelectColumnsB(weight.dptr(), out.dptr(), inner, index, index + B_cols);
 }

-NNVM_REGISTER_OP(_contrib_intgemm_take_weight_op)
+NNVM_REGISTER_OP(_contrib_intgemm_take_weight)
 .describe(R"code(Index a weight matrix stored in intgemm's weight format.
 The indices select the outputs of matrix multiplication, not the inner dot product dimension.
 )code" ADD_FILELINE)

From 63c1a3b83eb73bf77b96547cd1895a4825de4849 Mon Sep 17 00:00:00 2001
From: Kenneth Heafield
Date: Fri, 20 Dec 2019 16:52:50 +0000
Subject: [PATCH 21/65] Update intgemm

--- 3rdparty/intgemm | 2 +-
 src/operator/contrib/intgemm/intgemm_fully_connected_op.cc | 4 ++--
 src/operator/contrib/intgemm/prepare_weight_op.cc | 4 ++--
 src/operator/contrib/intgemm/take_weight_op.cc | 4 ++--
 4 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/3rdparty/intgemm b/3rdparty/intgemm
index 12daa143f40f..4b04b639796a 160000
--- a/3rdparty/intgemm
+++ b/3rdparty/intgemm
@@ -1 +1 @@
-Subproject commit 12daa143f40f99861c20564e9c6ad50f947c0ca5
+Subproject commit 4b04b639796a78a3c755aa94c2f3a09b0e365a54
diff --git a/src/operator/contrib/intgemm/intgemm_fully_connected_op.cc b/src/operator/contrib/intgemm/intgemm_fully_connected_op.cc
index 8441d3a3af3d..1cfa040eee1c 100644
--- a/src/operator/contrib/intgemm/intgemm_fully_connected_op.cc
+++ b/src/operator/contrib/intgemm/intgemm_fully_connected_op.cc
@@ -192,8 +192,8 @@ void IntgemmFullyConnectedOpForwardCPU(const nnvm::NodeAttrs& attrs,
     CHECK_EQ(C.type_flag_, mshadow::kFloat32);
     CHECK_EQ(inputs[3].shape_.Size(), param.num_hidden);
   }
-  CHECK_EQ(inner % ::intgemm::Int8::kBTileRow, 0) << "intgemm requires the inner dimension be a multiple of " << ::intgemm::Int8::kBTileRow;
-  CHECK_EQ(B_cols % ::intgemm::Int8::kBTileCol, 0) << "intgemm requires B to have a multiple of " << ::intgemm::Int8::kBTileCol << " columns in the equation C = AB.";
+  CHECK_EQ(inner % ::intgemm::Int8::tile_info.b_rows, 0) << "intgemm requires the inner dimension be a multiple of " << ::intgemm::Int8::tile_info.b_rows;
+  CHECK_EQ(B_cols % ::intgemm::Int8::tile_info.b_cols, 0) << "intgemm requires B to have a multiple of " << ::intgemm::Int8::tile_info.b_cols << " columns in the equation C = AB.";

   float out_float_multiplier = *inputs[2].dptr();

diff --git a/src/operator/contrib/intgemm/prepare_weight_op.cc b/src/operator/contrib/intgemm/prepare_weight_op.cc
index 7f13b6bca791..2b2ad408c72b 100644
--- a/src/operator/contrib/intgemm/prepare_weight_op.cc
+++ b/src/operator/contrib/intgemm/prepare_weight_op.cc
@@ -54,8 +54,8 @@ void PrepareWeightOpForwardCPU(const 
nnvm::NodeAttrs& attrs, CHECK(out.CheckContiguous()); size_t B_cols = in.shape_.ProdShape(0, in.shape_.ndim() - 1); size_t inner = in.shape_[in.shape_.ndim() - 1]; - CHECK_EQ(inner % ::intgemm::Int8::kBTileRow, 0) << "intgemm requires the inner dimension be a multiple of " << ::intgemm::Int8::kBTileRow; - CHECK_EQ(B_cols % ::intgemm::Int8::kBTileCol, 0) << "intgemm requires the output dimension (the product of all but the last dimension of the weight matrix) to be a multiple of " << ::intgemm::Int8::kBTileCol << "."; + CHECK_EQ(inner % ::intgemm::Int8::tile_info.b_rows, 0) << "intgemm requires the inner dimension be a multiple of " << ::intgemm::Int8::tile_info.b_rows; + CHECK_EQ(B_cols % ::intgemm::Int8::tile_info.b_cols, 0) << "intgemm requires the output dimension (the product of all but the last dimension of the weight matrix) to be a multiple of " << ::intgemm::Int8::tile_info.b_cols << "."; const float *B = in.dptr(); int8_t *quantB = out.dptr(); diff --git a/src/operator/contrib/intgemm/take_weight_op.cc b/src/operator/contrib/intgemm/take_weight_op.cc index f5ca244d2623..1299fc791c1a 100644 --- a/src/operator/contrib/intgemm/take_weight_op.cc +++ b/src/operator/contrib/intgemm/take_weight_op.cc @@ -103,8 +103,8 @@ void TakeWeightOpForwardCPU(const nnvm::NodeAttrs& attrs, CHECK(out.CheckContiguous()); size_t B_cols = indices.shape_[0]; size_t inner = weight.shape_[weight.shape_.ndim() - 1]; - CHECK_EQ(inner % ::intgemm::Int8::kBTileRow, 0) << "intgemm requires the inner dimension be a multiple of " << ::intgemm::Int8::kBTileRow; - CHECK_EQ(B_cols % ::intgemm::Int8::kBTileCol, 0) << "For efficiency, intgemm requires there to be a multiple of " << ::intgemm::Int8::kBTileCol << " indices."; + CHECK_EQ(inner % ::intgemm::Int8::tile_info.b_rows, 0) << "intgemm requires the inner dimension be a multiple of " << ::intgemm::Int8::tile_info.b_rows; + CHECK_EQ(B_cols % ::intgemm::Int8::tile_info.b_cols, 0) << "For efficiency, intgemm requires there to be a multiple of " << ::intgemm::Int8::tile_info.b_cols << " indices."; // mxnet doesn't have a uint32_t type so we'll just pointer cast. But check the sizes are the same. TODO statically. assert(sizeof(int32_t) == sizeof(::intgemm::Index)); const ::intgemm::Index *index = reinterpret_cast(indices.dptr()); From d777fedc7f0465a1091b41780cc9664f2296dda3 Mon Sep 17 00:00:00 2001 From: Kenneth Heafield Date: Thu, 16 Jan 2020 13:56:52 +0000 Subject: [PATCH 22/65] Update intgemm --- 3rdparty/intgemm | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/3rdparty/intgemm b/3rdparty/intgemm index 4b04b639796a..86feaac3c504 160000 --- a/3rdparty/intgemm +++ b/3rdparty/intgemm @@ -1 +1 @@ -Subproject commit 4b04b639796a78a3c755aa94c2f3a09b0e365a54 +Subproject commit 86feaac3c5049b27f4ef571965242d4a8fb1943c From c6b47a1cfac694e6769e00967e3138e3dfc811e5 Mon Sep 17 00:00:00 2001 From: Kenneth Heafield Date: Mon, 3 Feb 2020 12:36:14 +0000 Subject: [PATCH 23/65] Refactor prepare operations to take scaling as tensors. PrepareBQuantizedTransposed. This will make it easier to store a consistent file on disk. 
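As a minimal sketch of the intended flow with intgemm's C++ API (the function names match this patch; the wrapper, buffers, and shapes are illustrative, not part of the patch):

#include "intgemm.h"

// Offline: quantize to plain int8, identical on every CPU, safe to store.
// At load time: rearrange the already-quantized, transposed (column-major)
// matrix into the CPU-dependent layout that Multiply expects.
void QuantizeStoreThenRearrange(const float* weight, float maxabs,
                                int8_t* consistent, int8_t* cpu_dependent,
                                intgemm::Index inner, intgemm::Index b_cols) {
  intgemm::Int8::Quantize(weight, consistent, 127.0f / maxabs, inner * b_cols);
  intgemm::Int8::PrepareBQuantizedTransposed(consistent, cpu_dependent, inner, b_cols);
}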
--- 3rdparty/intgemm | 2 +-
 .../contrib/intgemm/prepare_data_op.cc | 66 ++++++++--
 .../contrib/intgemm/prepare_op-common.cc | 78 -----------
 .../contrib/intgemm/prepare_op-common.h | 60 ---------
 .../contrib/intgemm/prepare_weight_op.cc | 122 ++++++++++++++----
 5 files changed, 150 insertions(+), 178 deletions(-)
 delete mode 100644 src/operator/contrib/intgemm/prepare_op-common.cc
 delete mode 100644 src/operator/contrib/intgemm/prepare_op-common.h

diff --git a/3rdparty/intgemm b/3rdparty/intgemm
index 86feaac3c504..76a6d9f643c0 160000
--- a/3rdparty/intgemm
+++ b/3rdparty/intgemm
@@ -1 +1 @@
-Subproject commit 86feaac3c5049b27f4ef571965242d4a8fb1943c
+Subproject commit 76a6d9f643c06880549725379b7207a259eb57b5
diff --git a/src/operator/contrib/intgemm/prepare_data_op.cc b/src/operator/contrib/intgemm/prepare_data_op.cc
index 88293b1d3e28..bfa6351a92f4 100644
--- a/src/operator/contrib/intgemm/prepare_data_op.cc
+++ b/src/operator/contrib/intgemm/prepare_data_op.cc
@@ -24,7 +24,6 @@
  * The only difference from Quantize/QuantizeV2 is that it bans -128.
  */

-#include "prepare_op-common.h"
 #include
 #include
 #include "../../mshadow_op.h"
@@ -37,12 +36,54 @@ namespace mxnet {
 namespace op {

+bool PrepareDataOpShape(const nnvm::NodeAttrs& attrs,
+                        mxnet::ShapeVector* in_attrs,
+                        mxnet::ShapeVector* out_attrs) {
+  // data and maximum
+  CHECK_EQ(in_attrs->size(), 2U);
+  CHECK_EQ(out_attrs->size(), 1U);
+
+  SHAPE_ASSIGN_CHECK(*out_attrs, 0, in_attrs->at(0));
+  SHAPE_ASSIGN_CHECK(*in_attrs, 0, out_attrs->at(0));
+
+  SHAPE_ASSIGN_CHECK(*in_attrs, 1, mxnet::TShape(1, 1));
+
+  return shape_is_known(out_attrs->at(0));
+}
+
+bool PrepareDataOpType(const nnvm::NodeAttrs& attrs,
+                       std::vector* in_attrs,
+                       std::vector* out_attrs) {
+  CHECK_EQ(in_attrs->size(), 2U);
+  CHECK_EQ(out_attrs->size(), 1U);
+
+  // This routine converts from float to int8 with a scaling factor.
+  TYPE_ASSIGN_CHECK(*in_attrs, 0, mshadow::kFloat32);
+  TYPE_ASSIGN_CHECK(*in_attrs, 1, mshadow::kFloat32);
+  TYPE_ASSIGN_CHECK(*out_attrs, 0, mshadow::kInt8);
+  return true;
+}
+
+bool PrepareDataOpStorageType(const nnvm::NodeAttrs& attrs,
+                              const int dev_mask,
+                              DispatchMode* dispatch_mode,
+                              std::vector* in_attrs,
+                              std::vector* out_attrs) {
+  CHECK_EQ(in_attrs->size(), 2U);
+  CHECK_EQ(out_attrs->size(), 1U);
+  STORAGE_TYPE_ASSIGN_CHECK(*out_attrs, 0, kDefaultStorage);
+  STORAGE_TYPE_ASSIGN_CHECK(*in_attrs, 0, kDefaultStorage);
+  STORAGE_TYPE_ASSIGN_CHECK(*in_attrs, 1, kDefaultStorage);
+  DISPATCH_MODE_ASSIGN_CHECK(dispatch_mode, 0, DispatchMode::kFComputeEx);
+  return true;
+}
+
 void PrepareDataOpForwardCPU(const nnvm::NodeAttrs& attrs,
                              const OpContext& ctx,
                              const std::vector& inputs,
                              const std::vector& req,
                              const std::vector& outputs) {
-  CHECK_EQ(inputs.size(), 1U);
+  CHECK_EQ(inputs.size(), 2U);
   CHECK_EQ(outputs.size(), 1U);
   CHECK_EQ(req.size(), 1U);
   CHECK_EQ(req[0], kWriteTo) << "intgemm only overwrites";
@@ -57,8 +98,8 @@ void PrepareDataOpForwardCPU(const nnvm::NodeAttrs& attrs,
   const float *A = in.dptr();
   int8_t *quantA = out.dptr();
-  const PrepareParam& param = nnvm::get(attrs.parsed);
-  ::intgemm::Int8::Quantize(A, quantA, param.multiplier, size);
+  const float multiplier = 127.0 / *inputs[1].dptr();
+  ::intgemm::Int8::Quantize(A, quantA, multiplier, size);
 }

 NNVM_REGISTER_OP(_contrib_intgemm_prepare_data)
@@ -66,24 +107,23 @@ NNVM_REGISTER_OP(_contrib_intgemm_prepare_data)
 It is suitable for preparing a data matrix for use by intgemm's C=data * weights operation.
-The float32 values are multiplied by the provided multiplier before casting to int8. Typically this is 127.0 / maxabsolute(A).
+The float32 values are scaled such that maxabs maps to 127. Typically maxabs = maxabsolute(A).
 )code" ADD_FILELINE)
-.set_attr_parser(ParamParser)
-.set_num_inputs(1)
+.set_num_inputs(2)
 .set_num_outputs(1)
 .set_attr("FListInputNames",
   [](const NodeAttrs& attrs) {
-    return std::vector{"data"};
+    return std::vector{"data", "maxabs"};
   })
-.set_attr("FInferShape", PrepareOpShape)
-.set_attr("FInferType", PrepareOpType)
-.set_attr("FInferStorageType", PrepareOpStorageType)
+.set_attr("FInferShape", PrepareDataOpShape)
+.set_attr("FInferType", PrepareDataOpType)
+.set_attr("FInferStorageType", PrepareDataOpStorageType)
 .set_attr("FCompute", PrepareDataOpForwardCPU)
 .add_argument("data", "NDArray-or-Symbol", "Activation matrix to be prepared for multiplication.")
+.add_argument("maxabs", "NDArray-or-Symbol", "Maximum absolute value to be used for scaling. (The values will be multiplied by 127.0 / maxabs.)")
 // TODO(Xinyu): a temp solution to enable GluonCV INT8 flow,
 // will be reverted after the improvement of CachedOP is done.
-.set_attr("FGradient", MakeZeroGradNodes)
-.add_arguments(PrepareParam::__FIELDS__());
+.set_attr("FGradient", MakeZeroGradNodes);

 } // namespace op
 } // namespace mxnet
diff --git a/src/operator/contrib/intgemm/prepare_op-common.cc b/src/operator/contrib/intgemm/prepare_op-common.cc
deleted file mode 100644
index 6f70c186d82d..000000000000
--- a/src/operator/contrib/intgemm/prepare_op-common.cc
+++ /dev/null
@@ -1,78 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-/*!
- * \file prepare_op-common.cc
- * \brief Common functions for intgemm's PrepareA and PrepareB functions.
- * These are used to convert float tensors to values suitable for
- * multiplication.
- */
-
-#include "prepare_op-common.h"
-#include
-#include
-#include "../../mshadow_op.h"
-#include "../../mxnet_op.h"
-#include "../../operator_common.h"
-#include "../../tensor/init_op.h"
-
-namespace mxnet {
-namespace op {
-
-DMLC_REGISTER_PARAMETER(PrepareParam);
-
-bool PrepareOpShape(const nnvm::NodeAttrs& attrs,
-                    mxnet::ShapeVector* in_attrs,
-                    mxnet::ShapeVector* out_attrs) {
-  // One in, one out, same size.
-  CHECK_EQ(in_attrs->size(), 1U);
-  CHECK_EQ(out_attrs->size(), 1U);
-
-  SHAPE_ASSIGN_CHECK(*out_attrs, 0, in_attrs->at(0));
-  SHAPE_ASSIGN_CHECK(*in_attrs, 0, out_attrs->at(0));
-
-  return shape_is_known(out_attrs->at(0));
-}
-
-bool PrepareOpType(const nnvm::NodeAttrs& attrs,
-                   std::vector* in_attrs,
-                   std::vector* out_attrs) {
-  CHECK_EQ(in_attrs->size(), 1U);
-  CHECK_EQ(out_attrs->size(), 1U);
-
-  // This routine converts from float to int8. 
- TYPE_ASSIGN_CHECK(*out_attrs, 0, mshadow::kInt8); - TYPE_ASSIGN_CHECK(*in_attrs, 0, mshadow::kFloat32); - return true; -} - -bool PrepareOpStorageType(const nnvm::NodeAttrs& attrs, - const int dev_mask, - DispatchMode* dispatch_mode, - std::vector* in_attrs, - std::vector* out_attrs) { - CHECK_EQ(in_attrs->size(), 1U); - CHECK_EQ(out_attrs->size(), 1U); - // Dense storage only. - return storage_type_assign(&out_attrs->front(), kDefaultStorage, dispatch_mode, DispatchMode::kFCompute) && - storage_type_assign(&in_attrs->front(), kDefaultStorage, dispatch_mode, DispatchMode::kFCompute); -} - -} // namespace op -} // namespace mxnet diff --git a/src/operator/contrib/intgemm/prepare_op-common.h b/src/operator/contrib/intgemm/prepare_op-common.h deleted file mode 100644 index 135d82dfec79..000000000000 --- a/src/operator/contrib/intgemm/prepare_op-common.h +++ /dev/null @@ -1,60 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -/*! - * \file prepare_op-common.h - * \brief Common functions for intgemm's PrepareA and PrepareB functions. - * These are used to convert float tensors to values suitable for - * multiplication. - */ - -#include -#include -#include "../../mshadow_op.h" -#include "../../mxnet_op.h" -#include "../../operator_common.h" -#include "../../tensor/init_op.h" - -namespace mxnet { -namespace op { - -struct PrepareParam : public dmlc::Parameter { - float multiplier; - DMLC_DECLARE_PARAMETER(PrepareParam) { - DMLC_DECLARE_FIELD(multiplier) - .describe("Multiply floats by this constant before casting to int8. Typically you would set this to 127.0 / max absolute value."); - } -}; - -bool PrepareOpShape(const nnvm::NodeAttrs& attrs, - mxnet::ShapeVector* in_attrs, - mxnet::ShapeVector* out_attrs); - -bool PrepareOpType(const nnvm::NodeAttrs& attrs, - std::vector* in_attrs, - std::vector* out_attrs); - -bool PrepareOpStorageType(const nnvm::NodeAttrs& attrs, - const int dev_mask, - DispatchMode* dispatch_mode, - std::vector* in_attrs, - std::vector* out_attrs); - -} // namespace op -} // namespace mxnet diff --git a/src/operator/contrib/intgemm/prepare_weight_op.cc b/src/operator/contrib/intgemm/prepare_weight_op.cc index 2b2ad408c72b..fc351ab3f1e5 100644 --- a/src/operator/contrib/intgemm/prepare_weight_op.cc +++ b/src/operator/contrib/intgemm/prepare_weight_op.cc @@ -18,11 +18,10 @@ */ /*! - * \file prepare_b_op.cc - * \brief Converts B matrices to intgemm's representation. + * \file prepare_weight_op.cc + * \brief Converts weight matrices to intgemm's representation. 
*/

-#include "prepare_op-common.h"
 #include
 #include
 #include "../../mshadow_op.h"
@@ -36,19 +35,77 @@ namespace mxnet {
 namespace op {

+struct PrepareWeightParam : public dmlc::Parameter {
+  bool already_quantized;
+  DMLC_DECLARE_PARAMETER(PrepareWeightParam) {
+    DMLC_DECLARE_FIELD(already_quantized).set_default(false)
+      .describe("Is the weight matrix already quantized?");
+  }
+};
+DMLC_REGISTER_PARAMETER(PrepareWeightParam);
+
+bool PrepareWeightOpShape(const nnvm::NodeAttrs& attrs,
+                          mxnet::ShapeVector* in_attrs,
+                          mxnet::ShapeVector* out_attrs) {
+  // Optional maximum parameter.
+  CHECK_GE(in_attrs->size(), 1U) << "Need at least weight to quantize.";
+  CHECK_LE(in_attrs->size(), 2U) << "weight and maximum for scaling.";
+  CHECK_EQ(out_attrs->size(), 1U);
+
+  SHAPE_ASSIGN_CHECK(*out_attrs, 0, in_attrs->at(0));
+  SHAPE_ASSIGN_CHECK(*in_attrs, 0, out_attrs->at(0));
+
+  if (in_attrs->size() == 2U) {
+    SHAPE_ASSIGN_CHECK(*in_attrs, 1, mxnet::TShape(1, 1));
+  }
+  return shape_is_known(out_attrs->at(0));
+}
+
+bool PrepareWeightOpType(const nnvm::NodeAttrs& attrs,
+                         std::vector* in_attrs,
+                         std::vector* out_attrs) {
+  TYPE_ASSIGN_CHECK(*out_attrs, 0, mshadow::kInt8);
+  CHECK_GE(in_attrs->size(), 1U) << "Need at least weight to quantize.";
+  CHECK_LE(in_attrs->size(), 2U) << "weight and maximum for scaling.";
+  if (in_attrs->size() == 1U) {
+    TYPE_ASSIGN_CHECK(*in_attrs, 0, mshadow::kInt8);
+  } else if (in_attrs->size() == 2U) {
+    TYPE_ASSIGN_CHECK(*in_attrs, 0, mshadow::kFloat32);
+    TYPE_ASSIGN_CHECK(*in_attrs, 1, mshadow::kFloat32);
+  }
+  return true;
+}
+
+bool PrepareWeightOpStorageType(const nnvm::NodeAttrs& attrs,
+                                const int dev_mask,
+                                DispatchMode* dispatch_mode,
+                                std::vector* in_attrs,
+                                std::vector* out_attrs) {
+  CHECK_GE(in_attrs->size(), 1U) << "Need at least weight to quantize.";
+  CHECK_LE(in_attrs->size(), 2U) << "weight and maximum for scaling.";
+  CHECK_EQ(out_attrs->size(), 1U);
+  STORAGE_TYPE_ASSIGN_CHECK(*out_attrs, 0, kDefaultStorage);
+  STORAGE_TYPE_ASSIGN_CHECK(*in_attrs, 0, kDefaultStorage);
+  if (in_attrs->size() == 2U) {
+    STORAGE_TYPE_ASSIGN_CHECK(*in_attrs, 1, kDefaultStorage);
+  }
+  DISPATCH_MODE_ASSIGN_CHECK(dispatch_mode, 0, DispatchMode::kFComputeEx);
+  return true;
+}
+
 void PrepareWeightOpForwardCPU(const nnvm::NodeAttrs& attrs,
                                const OpContext& ctx,
                                const std::vector& inputs,
                                const std::vector& req,
                                const std::vector& outputs) {
-  CHECK_EQ(inputs.size(), 1U);
+  const PrepareWeightParam& params = nnvm::get(attrs.parsed);
+  CHECK_EQ(inputs.size(), params.already_quantized ? 1U : 2U);
   CHECK_EQ(outputs.size(), 1U);
   CHECK_EQ(req.size(), 1U);
   CHECK_EQ(req[0], kWriteTo) << "intgemm only overwrites";
   const TBlob &in = inputs.front();
   const TBlob &out = outputs.front();
-  CHECK_EQ(in.type_flag_, mshadow::kFloat32);
   CHECK_EQ(out.type_flag_, mshadow::kInt8);
   CHECK(in.CheckContiguous());
   CHECK(out.CheckContiguous());
@@ -57,42 +114,55 @@ void PrepareWeightOpForwardCPU(const nnvm::NodeAttrs& attrs,
   CHECK_EQ(inner % ::intgemm::Int8::tile_info.b_rows, 0) << "intgemm requires the inner dimension be a multiple of " << ::intgemm::Int8::tile_info.b_rows;
   CHECK_EQ(B_cols % ::intgemm::Int8::tile_info.b_cols, 0) << "intgemm requires the output dimension (the product of all but the last dimension of the weight matrix) to be a multiple of " << ::intgemm::Int8::tile_info.b_cols << ".";

-  const float *B = in.dptr();
   int8_t *quantB = out.dptr();

-  // TODO: eliminate transpose here by making a PrepareWeightColumnMajor.
-  intgemm::AlignedVector B_transpose(inner * B_cols);
-  for (size_t i = 0; i < inner; ++i) {
-    for (size_t j = 0; j < B_cols; ++j) {
-      B_transpose[i * B_cols + j] = B[i + inner * j];
+  CHECK(in.type_flag_ == mshadow::kFloat32 || in.type_flag_ == mshadow::kInt8) << "Expected either 32-bit values to be quantized or 8-bit values to rearrange.";
+  if (in.type_flag_ == mshadow::kInt8) {
+    const int8_t *B = in.dptr();
+    ::intgemm::Int8::PrepareBQuantizedTransposed(B, quantB, inner, B_cols);
+  } else if (in.type_flag_ == mshadow::kFloat32) {
+    const float *B = in.dptr();
+    // TODO: eliminate transpose here with https://github.com/kpu/intgemm/pull/56
+    intgemm::AlignedVector B_transpose(inner * B_cols);
+    for (size_t i = 0; i < inner; ++i) {
+      for (size_t j = 0; j < B_cols; ++j) {
+        B_transpose[i * B_cols + j] = B[i + inner * j];
+      }
     }
+    ::intgemm::Int8::PrepareB(B_transpose.begin(), quantB, 127.0 / *inputs[1].dptr(), inner, B_cols);
   }
-  const PrepareParam& param = nnvm::get(attrs.parsed);
-  ::intgemm::Int8::PrepareB(B_transpose.begin(), quantB, param.multiplier, inner, B_cols);
 }

 NNVM_REGISTER_OP(_contrib_intgemm_prepare_weight)
-.describe(R"code(This operator converts a float32 matrix to intgemm's internal representation of weight matrices in preparation for the operation C = data * weight. weight should be provided in column-major order i.e. the last dimension of shape is the number of rows of weight. This is the usual representation mxnet uses for weights. This operator is not meant to be fast; it is meant to be run offline to quantize a model.
+.describe(R"code(This operator converts a weight matrix in column-major format to intgemm's internal fast representation of weight matrices. MXNet customarily stores weight matrices in column-major (transposed) format. This operator is not meant to be fast; it is meant to be run offline to quantize a model.
+
+In other words, it prepares weight for the operation C = data * weight^T.
+
+If the provided weight matrix is float32, it will be quantized first. The quantization function is (int8_t)(127.0 / max * weight) where max is provided as argument 1 (the weight matrix is argument 0). Then the matrix will be rearranged into the CPU-dependent format.

-The float32 values are multiplied by the provided multiplier before casting to int8.
+If the provided weight matrix is already int8, the matrix will only be rearranged into the CPU-dependent format. This way one can quantize with intgemm_prepare_data (which just quantizes), store to disk in a consistent format, then at load time convert to CPU-dependent format with intgemm_prepare_weight.

-The internal representation of B is CPU dependent: AVX512BW, AVX2, and SSSE3 have different formats.
+The internal representation depends on register length. So AVX512, AVX2, and SSSE3 have different formats. AVX512BW and AVX512VNNI have the same representation.
 )code" ADD_FILELINE)
-.set_attr_parser(ParamParser)
-.set_num_inputs(1)
+.set_attr_parser(ParamParser)
+.set_num_inputs([](const NodeAttrs& attrs) {
+  const PrepareWeightParam& params = nnvm::get(attrs.parsed);
+  return params.already_quantized ? 1 : 2;
+})
 .set_num_outputs(1)
-.set_attr("FListInputNames",
-  [](const NodeAttrs& attrs) {
-    return std::vector{"weight"};
-  })
-.set_attr("FInferShape", PrepareOpShape)
-.set_attr("FInferType", PrepareOpType)
-.set_attr("FInferStorageType", PrepareOpStorageType)
+.set_attr("FListInputNames", [](const NodeAttrs& attrs) {
+  const PrepareWeightParam& params = nnvm::get(attrs.parsed);
+  return params.already_quantized ? 
std::vector{"weight"} : std::vector{"weight", "maxabs"}; +}) +.set_attr("FInferShape", PrepareWeightOpShape) +.set_attr("FInferType", PrepareWeightOpType) +.set_attr("FInferStorageType", PrepareWeightOpStorageType) .set_attr("FCompute", PrepareWeightOpForwardCPU) .add_argument("weight", "NDArray-or-Symbol", "Parameter matrix to be prepared for multiplication.") +.add_argument("maxabs", "NDArray-or-Symbol", "Maximum absolute value for scaling. The weights will be multipled by 127.0 / maxabs.") // TODO(Xinyu): a temp solution to enable GluonCV INT8 flow, // will be reverted after the improvement of CachedOP is done. .set_attr("FGradient", MakeZeroGradNodes) -.add_arguments(PrepareParam::__FIELDS__()); +.add_arguments(PrepareWeightParam::__FIELDS__()); } // namespace op } // namespace mxnet From 07cf5777edf90f785f7b22ad39e201fdcdb5997b Mon Sep 17 00:00:00 2001 From: Kenneth Heafield Date: Mon, 3 Feb 2020 12:52:59 +0000 Subject: [PATCH 24/65] Remove unused variable --- src/operator/contrib/intgemm/intgemm_fully_connected_op.cc | 1 - 1 file changed, 1 deletion(-) diff --git a/src/operator/contrib/intgemm/intgemm_fully_connected_op.cc b/src/operator/contrib/intgemm/intgemm_fully_connected_op.cc index 1cfa040eee1c..90f8346e793f 100644 --- a/src/operator/contrib/intgemm/intgemm_fully_connected_op.cc +++ b/src/operator/contrib/intgemm/intgemm_fully_connected_op.cc @@ -37,7 +37,6 @@ namespace mxnet { namespace op { struct IntgemmFullyConnectedParam : public dmlc::Parameter { - float out_float_multiplier; int out_type; int num_hidden; bool no_bias; From 6c1a3886fb0f5a3cd2dd132e3bae483fadafd038 Mon Sep 17 00:00:00 2001 From: Kenneth Heafield Date: Mon, 10 Feb 2020 17:23:41 +0000 Subject: [PATCH 25/65] Makefile compilation for intgemm --- Makefile | 27 ++++++++++++++++++++++++++- 1 file changed, 26 insertions(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 2bb10c4a6582..b2bdc524188e 100644 --- a/Makefile +++ b/Makefile @@ -461,8 +461,27 @@ endif all: lib/libmxnet.a lib/libmxnet.so $(BIN) extra-packages extension_libs SRC = $(wildcard src/*/*/*/*.cc src/*/*/*.cc src/*/*.cc src/*.cc) -OBJ = $(patsubst %.cc, build/%.o, $(SRC)) CUSRC = $(wildcard src/*/*/*/*.cu src/*/*/*.cu src/*/*.cu src/*.cu) + +#intgemm compiler tests for AVX512BW and AVX512VNNI +ifeq ($(USE_INTGEMM), 1) + $(shell mkdir -p build/3rdparty/intgemm/) + $(shell echo '#pragma once' >build/3rdparty/intgemm/intgemm_config.h) + ifneq ($(shell $(CXX) $(CFLAGS) -mavx512f -mavx512bw -mavx512dq $(ROOTDIR)/3rdparty/intgemm/compile_test_avx512.cc 2>/dev/null && echo \\\#define INTGEMM_COMPILER_SUPPORTS_AVX512 >>build/3rdparty/intgemm/intgemm_config.h; echo $$?), 0) + $(warning WARNING: The compiler is too old for AVX512BW; so these instructions will not be used.) + endif + ifneq ($(shell $(CXX) $(CFLAGS) $(ROOTDIR)/3rdparty/intgemm/compile_test_avx512vnni.cc 2>/dev/null && echo \\\#define INTGEMM_COMPILER_SUPPORTS_AVX512VNNI >>build/3rdparty/intgemm/intgemm_config.h; echo $$?), 0) + $(warning WARNING: The compiler is too old for AVX512VNNI, so these instructions will not be used.) + endif + CFLAGS += -Ibuild/3rdparty/intgemm + SRC += 3rdparty/intgemm/intgemm.cc +else + #If we're not using intgemm, remove the operators from src. 
+ INTGEMM_OPS := $(wildcard src/operator/contrib/intgemm/*.cc) + SRC := $(filter-out $(SRC),$(INTGEMM_OPS)) +endif + +OBJ = $(patsubst %.cc, build/%.o, $(SRC)) CUOBJ = $(patsubst %.cu, build/%_gpu.o, $(CUSRC)) ifeq ($(USE_TVM_OP), 1) @@ -479,6 +498,7 @@ ifeq ($(USE_CUDA), 1) endif endif + # extra operators ifneq ($(EXTRA_OPERATORS),) EXTRA_SRC = $(wildcard $(patsubst %, %/*.cc, $(EXTRA_OPERATORS)) $(patsubst %, %/*/*.cc, $(EXTRA_OPERATORS))) @@ -609,6 +629,11 @@ $(DMLC_CORE)/libdmlc.a: DMLCCORE DMLCCORE: + cd $(DMLC_CORE); $(MAKE) libdmlc.a USE_SSE=$(USE_SSE) config=$(ROOTDIR)/$(config); cd $(ROOTDIR) +ifeq ($(USE_INTGEMM), 1) +build/3rdparty/intgemm/intgemm.o: 3rdparty/intgemm/intgemm.cc $(wildcard 3rdparty/intgemm/*.h) $(wildcard 3rdparty/intgemm/*/*.h) + $(CXX) $(CFLAGS) -std=c++11 -c 3rdparty/intgemm/intgemm.cc -o $@ +endif + lib/libtvm_runtime.so: echo "Compile TVM" @mkdir -p $(@D) From 85c5afd2194641c5ac2bb8a0ee0e681164b986f8 Mon Sep 17 00:00:00 2001 From: Kenneth Heafield Date: Mon, 10 Feb 2020 17:43:46 +0000 Subject: [PATCH 26/65] Fix order of arguments to filter-out in Makefile --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index b2bdc524188e..3b0b68ff2e70 100644 --- a/Makefile +++ b/Makefile @@ -478,7 +478,7 @@ ifeq ($(USE_INTGEMM), 1) else #If we're not using intgemm, remove the operators from src. INTGEMM_OPS := $(wildcard src/operator/contrib/intgemm/*.cc) - SRC := $(filter-out $(SRC),$(INTGEMM_OPS)) + SRC := $(filter-out $(INTGEMM_OPS),$(SRC)) endif OBJ = $(patsubst %.cc, build/%.o, $(SRC)) From a17ba6593add4f69b0826e6a1de2e5ab0f0cd2d9 Mon Sep 17 00:00:00 2001 From: Kenneth Heafield Date: Mon, 10 Feb 2020 18:26:04 +0000 Subject: [PATCH 27/65] Lint --- .../intgemm/intgemm_fully_connected_op.cc | 64 ++++++++++++------- .../contrib/intgemm/max_absolute_op.cc | 4 +- .../contrib/intgemm/prepare_data_op.cc | 6 +- .../contrib/intgemm/prepare_weight_op.cc | 27 ++++++-- .../contrib/intgemm/take_weight_op.cc | 28 +++++--- 5 files changed, 88 insertions(+), 41 deletions(-) diff --git a/src/operator/contrib/intgemm/intgemm_fully_connected_op.cc b/src/operator/contrib/intgemm/intgemm_fully_connected_op.cc index 90f8346e793f..79215fa1c4de 100644 --- a/src/operator/contrib/intgemm/intgemm_fully_connected_op.cc +++ b/src/operator/contrib/intgemm/intgemm_fully_connected_op.cc @@ -66,7 +66,7 @@ template void IntgemmFullyConnectedSanity(const nnvm::NodeAttrs& attrs, CHECK_EQ(in->size(), param.no_bias ? 3U : 4U); CHECK_EQ(out->size(), 1U); } -} // namespace +} // namespace inline bool IntgemmFullyConnectedOpShape(const nnvm::NodeAttrs& attrs, mxnet::ShapeVector* in_shape, @@ -143,20 +143,20 @@ bool IntgemmFullyConnectedOpType(const nnvm::NodeAttrs& attrs, namespace { -// TODO: amend AlignedVector to allow a reset. +// This is used to free because AlignedVector does not have Reset. 
class FreeMe { - public: - FreeMe() : mem_(nullptr) {} - ~FreeMe() { std::free(mem_); } - void Reset(int8_t *with) { - std::free(mem_); - mem_ = with; - } - private: - int8_t *mem_; + public: + FreeMe() : mem_(nullptr) {} + ~FreeMe() { std::free(mem_); } + void Reset(int8_t *with) { + std::free(mem_); + mem_ = with; + } + private: + int8_t *mem_; }; -} // namespace +} // namespace void IntgemmFullyConnectedOpForwardCPU(const nnvm::NodeAttrs& attrs, const OpContext& ctx, @@ -166,7 +166,7 @@ void IntgemmFullyConnectedOpForwardCPU(const nnvm::NodeAttrs& attrs, IntgemmFullyConnectedSanity(attrs, &inputs, &outputs); const IntgemmFullyConnectedParam& param = nnvm::get(attrs.parsed); CHECK_EQ(req.size(), 1U); - CHECK_EQ(req[0], kWriteTo) << "TODO: doing more than overwriting for intgemm. Note: kWriteInplace = " << kWriteInplace << " kWriteTo = " << kWriteTo; + CHECK_EQ(req[0], kWriteTo) << "TODO: doing more than overwriting for intgemm."; const TBlob &A = inputs[0], &B = inputs[1], &C = outputs[0]; @@ -191,21 +191,24 @@ void IntgemmFullyConnectedOpForwardCPU(const nnvm::NodeAttrs& attrs, CHECK_EQ(C.type_flag_, mshadow::kFloat32); CHECK_EQ(inputs[3].shape_.Size(), param.num_hidden); } - CHECK_EQ(inner % ::intgemm::Int8::tile_info.b_rows, 0) << "intgemm requires the inner dimension be a multiple of " << ::intgemm::Int8::tile_info.b_rows; - CHECK_EQ(B_cols % ::intgemm::Int8::tile_info.b_cols, 0) << "intgemm requires B have a multiple of " << ::intgemm::Int8::tile_info.b_cols << " columns inthe equation C = AB."; + CHECK_EQ(inner % ::intgemm::Int8::tile_info.b_rows, 0) << + "intgemm requires the inner dimension be a multiple of " << ::intgemm::Int8::tile_info.b_rows; + CHECK_EQ(B_cols % ::intgemm::Int8::tile_info.b_cols, 0) << + "intgemm requires B have a multiple of " << ::intgemm::Int8::tile_info.b_cols << + " columns inthe equation C = AB."; float out_float_multiplier = *inputs[2].dptr(); int8_t *A_quant; - // TODO report this memory consumption? + // TODO(kpuatamazon) report this memory consumption? FreeMe A_quant_store; if (A.type_flag_ == mshadow::kFloat32) { const float *A_raw = A.dptr(); - // Quantize A for the user. TODO: allow scale to be passed in. Should the induced scale be an output? + // Quantize A for the user. + // Future: allow scale to be passed in? Should the induced scale be an output? float scale = 127.0 / ::intgemm::MaxAbsolute(A_raw, A_raw + A.shape_.Size()); out_float_multiplier /= scale; - // TODO report this memory consumption to mxnet? - A_quant = (int8_t*)aligned_alloc(64, A.shape_.Size()); + A_quant = static_cast(aligned_alloc(64, A.shape_.Size())); CHECK(A_quant); A_quant_store.Reset(A_quant); ::intgemm::Int8::PrepareA(A_raw, A_quant, scale, A_rows, inner); @@ -217,7 +220,10 @@ void IntgemmFullyConnectedOpForwardCPU(const nnvm::NodeAttrs& attrs, if (bias) { if (C.type_flag_ == mshadow::kFloat32) { - ::intgemm::callbacks::UnquantizeAndAddBiasAndWrite cb(out_float_multiplier, inputs[3].dptr(), C.dptr()); + ::intgemm::callbacks::UnquantizeAndAddBiasAndWrite cb( + out_float_multiplier, + inputs[3].dptr(), + C.dptr()); ::intgemm::Int8::Multiply(A_quant, B_quant, A_rows, inner, B_cols, cb); } else { // int32 @@ -256,13 +262,25 @@ The out_type can be int32 or float32. Bias must have the same type. .set_attr("FListInputNames", [](const NodeAttrs& attrs) { const IntgemmFullyConnectedParam& params = nnvm::get(attrs.parsed); - return params.no_bias ? std::vector{"data", "weight", "scaling"} : std::vector{"data", "weight", "scaling", "bias"}; + return params.no_bias ? 
+ std::vector{"data", "weight", "scaling"} : + std::vector{"data", "weight", "scaling", "bias"}; }) .set_attr("FInferShape", IntgemmFullyConnectedOpShape) .set_attr("FInferType", IntgemmFullyConnectedOpType) .set_attr("FCompute", IntgemmFullyConnectedOpForwardCPU) -.add_argument("data", "NDArray-or-Symbol", "First argument to multiplication. Tensor of float32 (quantized on the fly) or int8 from intgemm_prepare_data. If you use a different quantizer, be sure to ban -128. The last dimension must be a multiple of 64.") -.add_argument("weight", "NDArray-or-Symbol", "Second argument to multiplication. Tensor of int8 from intgemm_prepare_weight. The last dimension must be a multiple of 64. The product of non-last dimensions must be a multiple of 8.") +.add_argument( + "data", + "NDArray-or-Symbol", + "First argument to multiplication. Tensor of float32 (quantized on the fly) or int8 from " + "intgemm_prepare_data. If you use a different quantizer, be sure to ban -128. The last " + "dimension must be a multiple of 64.") +.add_argument( + "weight", + "NDArray-or-Symbol", + "Second argument to multiplication. Tensor of int8 from intgemm_prepare_weight. The last " + "dimension must be a multiple of 64. The product of non-last dimensions must be a multiple " + "of 8.") .add_argument("scaling", "NDArray-or-Symbol", "Scaling factor to apply if output type is float32.") .add_argument("bias", "NDArray-or-Symbol", "Bias term.") // TODO(Xinyu): a temp solution to enable GluonCV INT8 flow, diff --git a/src/operator/contrib/intgemm/max_absolute_op.cc b/src/operator/contrib/intgemm/max_absolute_op.cc index fe49c4138b2d..475f93bb73dc 100644 --- a/src/operator/contrib/intgemm/max_absolute_op.cc +++ b/src/operator/contrib/intgemm/max_absolute_op.cc @@ -82,7 +82,8 @@ void MaxAbsoluteOpForwardCPU(const nnvm::NodeAttrs& attrs, CHECK(in.CheckContiguous()); CHECK(out.CheckContiguous()); size_t size = in.shape_.Size(); - CHECK_EQ(size % (512 / 8 / sizeof(float)), 0) << "The total size of the input must be a multiple of 16."; + CHECK_EQ(size % (512 / 8 / sizeof(float)), 0) << + "The total size of the input must be a multiple of 16."; const float *data = in.dptr(); KERNEL_ASSIGN(*out.dptr(), req[0], ::intgemm::MaxAbsolute(data, data + size)); @@ -92,7 +93,6 @@ NNVM_REGISTER_OP(_contrib_intgemm_maxabsolute) .describe(R"code(Compute the maximum absolute value in a tensor of float32 fast on a CPU. The tensor's total size must be a multiple of 16 and aligned to a multiple of 64 bytes. mxnet.nd.contrib.intgemm_maxabsolute(arr) == arr.abs().max() )code" ADD_FILELINE) -//.set_attr_parser(ParamParser) .set_num_inputs(1) .set_num_outputs(1) .set_attr("FListInputNames", diff --git a/src/operator/contrib/intgemm/prepare_data_op.cc b/src/operator/contrib/intgemm/prepare_data_op.cc index bfa6351a92f4..95e53fa2af85 100644 --- a/src/operator/contrib/intgemm/prepare_data_op.cc +++ b/src/operator/contrib/intgemm/prepare_data_op.cc @@ -120,7 +120,11 @@ The float32 values are scaled such that maxabs maps to 127. Typically maxabs = m .set_attr("FInferStorageType", PrepareDataOpStorageType) .set_attr("FCompute", PrepareDataOpForwardCPU) .add_argument("data", "NDArray-or-Symbol", "Activation matrix to be prepared for multiplication.") -.add_argument("maxabs", "NDArray-or-Symbol", "Maximum absolute value to be used for scaling. (The values will be multiplied by 127.0 / maxabs.") +.add_argument( + "maxabs", + "NDArray-or-Symbol", + "Maximum absolute value to be used for scaling. 
(The values will be multiplied by 127.0 / " + "maxabs.") // TODO(Xinyu): a temp solution to enable GluonCV INT8 flow, // will be reverted after the improvement of CachedOP is done. .set_attr("FGradient", MakeZeroGradNodes); diff --git a/src/operator/contrib/intgemm/prepare_weight_op.cc b/src/operator/contrib/intgemm/prepare_weight_op.cc index fc351ab3f1e5..68790c30f70a 100644 --- a/src/operator/contrib/intgemm/prepare_weight_op.cc +++ b/src/operator/contrib/intgemm/prepare_weight_op.cc @@ -111,24 +111,33 @@ void PrepareWeightOpForwardCPU(const nnvm::NodeAttrs& attrs, CHECK(out.CheckContiguous()); size_t B_cols = in.shape_.ProdShape(0, in.shape_.ndim() - 1); size_t inner = in.shape_[in.shape_.ndim() - 1]; - CHECK_EQ(inner % ::intgemm::Int8::tile_info.b_rows, 0) << "intgemm requires the inner dimension be a multiple of " << ::intgemm::Int8::tile_info.b_rows; - CHECK_EQ(B_cols % ::intgemm::Int8::tile_info.b_cols, 0) << "intgemm requires the output dimension (the product of all but the last dimension of the weight matrix) to be a multiple of " << ::intgemm::Int8::tile_info.b_cols << "."; + CHECK_EQ(inner % ::intgemm::Int8::tile_info.b_rows, 0) << + "intgemm requires the inner dimension be a multiple of " << ::intgemm::Int8::tile_info.b_rows; + CHECK_EQ(B_cols % ::intgemm::Int8::tile_info.b_cols, 0) << + "intgemm requires the output dimension (the product of all but the last dimension of the " + "weight matrix) to be a multiple of " << ::intgemm::Int8::tile_info.b_cols << "."; int8_t *quantB = out.dptr(); - CHECK(in.type_flag_ == mshadow::kFloat32 || in.type_flag_ == mshadow::kInt8) << "Expected either 32-bit values to be quantized or 8-bit values to rearrange."; + CHECK(in.type_flag_ == mshadow::kFloat32 || in.type_flag_ == mshadow::kInt8) << + "Expected either 32-bit values to be quantized or 8-bit values to rearrange."; if (in.type_flag_ == mshadow::kInt8) { const int8_t *B = in.dptr(); ::intgemm::Int8::PrepareBQuantizedTransposed(B, quantB, inner, B_cols); } else if (in.type_flag_ == mshadow::kFloat32) { const float *B = in.dptr(); - // TODO: eliminate transpose here with https://github.com/kpu/intgemm/pull/56 + // TODO(kpuatamazon): eliminate transpose here with https://github.com/kpu/intgemm/pull/56 intgemm::AlignedVector B_transpose(inner * B_cols); for (size_t i = 0; i < inner; ++i) { for (size_t j = 0; j < B_cols; ++j) { B_transpose[i * B_cols + j] = B[i + inner * j]; } } - ::intgemm::Int8::PrepareB(B_transpose.begin(), quantB, 127.0 / *inputs[1].dptr(), inner, B_cols); + ::intgemm::Int8::PrepareB( + B_transpose.begin(), + quantB, + 127.0 / *inputs[1].dptr(), + inner, + B_cols); } } @@ -151,14 +160,18 @@ The internal representation depends on register length. So AVX512, AVX2, and SS .set_num_outputs(1) .set_attr("FListInputNames", [](const NodeAttrs& attrs) { const PrepareWeightParam& params = nnvm::get(attrs.parsed); - return params.already_quantized ? std::vector{"weight"} : std::vector{"weight", "maxabs"}; + return params.already_quantized ? + std::vector{"weight"} : std::vector{"weight", "maxabs"}; }) .set_attr("FInferShape", PrepareWeightOpShape) .set_attr("FInferType", PrepareWeightOpType) .set_attr("FInferStorageType", PrepareWeightOpStorageType) .set_attr("FCompute", PrepareWeightOpForwardCPU) .add_argument("weight", "NDArray-or-Symbol", "Parameter matrix to be prepared for multiplication.") -.add_argument("maxabs", "NDArray-or-Symbol", "Maximum absolute value for scaling. 
The weights will be multipled by 127.0 / maxabs.") +.add_argument( + "maxabs", + "NDArray-or-Symbol", + "Maximum absolute value for scaling. The weights will be multipled by 127.0 / maxabs.") // TODO(Xinyu): a temp solution to enable GluonCV INT8 flow, // will be reverted after the improvement of CachedOP is done. .set_attr("FGradient", MakeZeroGradNodes) diff --git a/src/operator/contrib/intgemm/take_weight_op.cc b/src/operator/contrib/intgemm/take_weight_op.cc index 1299fc791c1a..2cb87c372c8a 100644 --- a/src/operator/contrib/intgemm/take_weight_op.cc +++ b/src/operator/contrib/intgemm/take_weight_op.cc @@ -68,7 +68,6 @@ inline bool TakeWeightOpType(const nnvm::NodeAttrs& attrs, TYPE_ASSIGN_CHECK(*out_attrs, 0, mshadow::kInt8); TYPE_ASSIGN_CHECK(*in_attrs, 0, mshadow::kInt8); - // TODO 64-bit index support. Though if you're going that far, you're probably overflowing matrix multiply. TYPE_ASSIGN_CHECK(*in_attrs, 1, mshadow::kInt32); return true; } @@ -103,13 +102,23 @@ void TakeWeightOpForwardCPU(const nnvm::NodeAttrs& attrs, CHECK(out.CheckContiguous()); size_t B_cols = indices.shape_[0]; size_t inner = weight.shape_[weight.shape_.ndim() - 1]; - CHECK_EQ(inner % ::intgemm::Int8::tile_info.b_rows, 0) << "intgemm requires the inner dimension be a multiple of " << ::intgemm::Int8::tile_info.b_rows; - CHECK_EQ(B_cols % ::intgemm::Int8::tile_info.b_cols, 0) << "For efficiency, intgemm requires there to be a multiple of " << ::intgemm::Int8::tile_info.b_cols << " indices."; - // mxnet doesn't have a uint32_t type so we'll just pointer cast. But check the sizes are the same. TODO statically. + CHECK_EQ(inner % ::intgemm::Int8::tile_info.b_rows, 0) << + "intgemm requires the inner dimension be a multiple of " << ::intgemm::Int8::tile_info.b_rows; + CHECK_EQ(B_cols % ::intgemm::Int8::tile_info.b_cols, 0) << + "For efficiency, intgemm requires there to be a multiple of " << + ::intgemm::Int8::tile_info.b_cols << " indices."; + // mxnet doesn't have a uint32_t type so we'll just pointer cast. But check the sizes are the + // same. Ideally this should be static. 
assert(sizeof(int32_t) == sizeof(::intgemm::Index)); - const ::intgemm::Index *index = reinterpret_cast(indices.dptr()); - - ::intgemm::Int8::SelectColumnsB(weight.dptr(), out.dptr(), inner, index, index + B_cols); + const ::intgemm::Index *index = + reinterpret_cast(indices.dptr()); + + ::intgemm::Int8::SelectColumnsB( + weight.dptr(), + out.dptr(), + inner, + index, + index + B_cols); } NNVM_REGISTER_OP(_contrib_intgemm_take_weight) @@ -126,7 +135,10 @@ The indices select the outputs of matrix multiplication, not the inner dot produ .set_attr("FInferType", TakeWeightOpType) .set_attr("FInferStorageType", TakeWeightOpStorageType) .set_attr("FCompute", TakeWeightOpForwardCPU) -.add_argument("weight", "NDArray-or-Symbol", "Tensor already in intgemm weight format to select from") +.add_argument( + "weight", + "NDArray-or-Symbol", + "Tensor already in intgemm weight format to select from") .add_argument("indices", "NDArray-or-Symbol", "indices to select on the 0th dimension of weight"); } // namespace op From 2e6bf75d682bc957b72555259a4db673ceeba954 Mon Sep 17 00:00:00 2001 From: Kenneth Heafield Date: Tue, 3 Mar 2020 13:19:27 +0000 Subject: [PATCH 28/65] Quantizer with arbitrarily many arguments and OpenMP support Benchmark program: import mxnet as mx import time def time_procedure(shape, count, proc): data = mx.nd.random_uniform(shape=shape, low=-1.0, high = 1.0) mx.nd.waitall() begin = time.time() for i in range(0, count): proc(data) mx.nd.waitall() return (time.time() - begin) / count shapes = [(1,1), (128, 128), (256,256), (512, 512), (1024, 1024), (2048, 2048), (20971520,), (8, 4096), (4096, 8)] count = 1000 one = mx.nd.ones(shape=(1)) minusone = -one procedures = { "quantize" : (lambda data : mx.nd.contrib.quantize(data, minusone, one)), "quantize_v2" : (lambda data : mx.nd.contrib.quantize_v2(data, min_calib_range = -1.0, max_calib_range = 1.0)), "intgemm" : (lambda data : mx.nd.contrib.intgemm_prepare_data(data, one)), "quantize_v2_fit" : (lambda data : mx.nd.contrib.quantize_v2(data)), "intgemm_fit" : (lambda data : mx.nd.contrib.intgemm_prepare_data(data, mx.nd.contrib.intgemm_maxabsolute(data))), } for s in shapes: print("Shape " + str(s)) stats = {} for name, l in procedures.items(): stats[name] = time_procedure(s, count, l) print("{:.7f} seconds for {}".format(stats[name], name)) best_baseline = min(stats["quantize"], stats["quantize_v2"]) ratio = best_baseline / stats["intgemm"] print("intgemm is {:.1f}x faster with calibration".format(ratio)) fit_ratio = stats["quantize_v2_fit"] / stats["intgemm_fit"] print("intgemm is {:.1f}x faster without calibration".format(fit_ratio)) OMP_NUM_THREADS=24 ./quant_bench.py Shape (1, 1) 0.0001304 seconds for quantize 0.0001076 seconds for quantize_v2 0.0000310 seconds for intgemm 0.0001114 seconds for quantize_v2_fit 0.0000479 seconds for intgemm_fit intgemm is 3.5x faster with calibration intgemm is 2.3x faster without calibration Shape (128, 128) 0.0001649 seconds for quantize 0.0001399 seconds for quantize_v2 0.0000329 seconds for intgemm 0.0001533 seconds for quantize_v2_fit 0.0000502 seconds for intgemm_fit intgemm is 4.2x faster with calibration intgemm is 3.1x faster without calibration Shape (256, 256) 0.0001660 seconds for quantize 0.0001404 seconds for quantize_v2 0.0000335 seconds for intgemm 0.0001599 seconds for quantize_v2_fit 0.0000505 seconds for intgemm_fit intgemm is 4.2x faster with calibration intgemm is 3.2x faster without calibration Shape (512, 512) 0.0001691 seconds for quantize 0.0001434 seconds for quantize_v2 
0.0000342 seconds for intgemm 0.0001813 seconds for quantize_v2_fit 0.0000540 seconds for intgemm_fit intgemm is 4.2x faster with calibration intgemm is 3.4x faster without calibration Shape (1024, 1024) 0.0001920 seconds for quantize 0.0001538 seconds for quantize_v2 0.0000511 seconds for intgemm 0.0002390 seconds for quantize_v2_fit 0.0000827 seconds for intgemm_fit intgemm is 3.0x faster with calibration intgemm is 2.9x faster without calibration Shape (2048, 2048) 0.0002364 seconds for quantize 0.0001989 seconds for quantize_v2 0.0000875 seconds for intgemm 0.0004747 seconds for quantize_v2_fit 0.0001531 seconds for intgemm_fit intgemm is 2.3x faster with calibration intgemm is 3.1x faster without calibration Shape (20971520,) 0.0011446 seconds for quantize 0.0010902 seconds for quantize_v2 0.0008950 seconds for intgemm 0.0023337 seconds for quantize_v2_fit 0.0015005 seconds for intgemm_fit intgemm is 1.2x faster with calibration intgemm is 1.6x faster without calibration Shape (8, 4096) 0.0001636 seconds for quantize 0.0001392 seconds for quantize_v2 0.0000364 seconds for intgemm 0.0001508 seconds for quantize_v2_fit 0.0000651 seconds for intgemm_fit intgemm is 3.8x faster with calibration intgemm is 2.3x faster without calibration Shape (4096, 8) 0.0001642 seconds for quantize 0.0001392 seconds for quantize_v2 0.0000370 seconds for intgemm 0.0001515 seconds for quantize_v2_fit 0.0000654 seconds for intgemm_fit intgemm is 3.8x faster with calibration intgemm is 2.3x faster without calibration OMP_NUM_THREADS=1 ./quant_bench.py Shape (1, 1) 0.0000630 seconds for quantize 0.0000706 seconds for quantize_v2 0.0000294 seconds for intgemm 0.0000632 seconds for quantize_v2_fit 0.0000475 seconds for intgemm_fit intgemm is 2.1x faster with calibration intgemm is 1.3x faster without calibration Shape (128, 128) 0.0000860 seconds for quantize 0.0000898 seconds for quantize_v2 0.0000324 seconds for intgemm 0.0000996 seconds for quantize_v2_fit 0.0000464 seconds for intgemm_fit intgemm is 2.6x faster with calibration intgemm is 2.1x faster without calibration Shape (256, 256) 0.0000976 seconds for quantize 0.0001028 seconds for quantize_v2 0.0000339 seconds for intgemm 0.0001513 seconds for quantize_v2_fit 0.0000521 seconds for intgemm_fit intgemm is 2.9x faster with calibration intgemm is 2.9x faster without calibration Shape (512, 512) 0.0001724 seconds for quantize 0.0001693 seconds for quantize_v2 0.0000839 seconds for intgemm 0.0004351 seconds for quantize_v2_fit 0.0001420 seconds for intgemm_fit intgemm is 2.0x faster with calibration intgemm is 3.1x faster without calibration Shape (1024, 1024) 0.0003559 seconds for quantize 0.0003481 seconds for quantize_v2 0.0002384 seconds for intgemm 0.0013803 seconds for quantize_v2_fit 0.0004667 seconds for intgemm_fit intgemm is 1.5x faster with calibration intgemm is 3.0x faster without calibration Shape (2048, 2048) 0.0011425 seconds for quantize 0.0010880 seconds for quantize_v2 0.0008497 seconds for intgemm 0.0051828 seconds for quantize_v2_fit 0.0018427 seconds for intgemm_fit intgemm is 1.3x faster with calibration intgemm is 2.8x faster without calibration Shape (20971520,) 0.0101917 seconds for quantize 0.0096956 seconds for quantize_v2 0.0071391 seconds for intgemm 0.0305159 seconds for quantize_v2_fit 0.0140535 seconds for intgemm_fit intgemm is 1.4x faster with calibration intgemm is 2.2x faster without calibration Shape (8, 4096) 0.0000880 seconds for quantize 0.0000950 seconds for quantize_v2 0.0000334 seconds for intgemm 0.0001183 
seconds for quantize_v2_fit 0.0000423 seconds for intgemm_fit intgemm is 2.6x faster with calibration intgemm is 2.8x faster without calibration Shape (4096, 8) 0.0000900 seconds for quantize 0.0000949 seconds for quantize_v2 0.0000332 seconds for intgemm 0.0001215 seconds for quantize_v2_fit 0.0000433 seconds for intgemm_fit intgemm is 2.7x faster with calibration intgemm is 2.8x faster without calibration --- 3rdparty/intgemm | 2 +- .../contrib/intgemm/max_absolute_op.cc | 33 ++++++++++++++++--- .../contrib/intgemm/prepare_data_op.cc | 4 +-- 3 files changed, 31 insertions(+), 8 deletions(-) diff --git a/3rdparty/intgemm b/3rdparty/intgemm index 76a6d9f643c0..a520d4362a6e 160000 --- a/3rdparty/intgemm +++ b/3rdparty/intgemm @@ -1 +1 @@ -Subproject commit 76a6d9f643c06880549725379b7207a259eb57b5 +Subproject commit a520d4362a6eb0bd1a9ea418d293ec6d9d0a5075 diff --git a/src/operator/contrib/intgemm/max_absolute_op.cc b/src/operator/contrib/intgemm/max_absolute_op.cc index 475f93bb73dc..edbe9e1872ed 100644 --- a/src/operator/contrib/intgemm/max_absolute_op.cc +++ b/src/operator/contrib/intgemm/max_absolute_op.cc @@ -81,12 +81,37 @@ void MaxAbsoluteOpForwardCPU(const nnvm::NodeAttrs& attrs, CHECK_EQ(out.type_flag_, mshadow::kFloat32); CHECK(in.CheckContiguous()); CHECK(out.CheckContiguous()); - size_t size = in.shape_.Size(); - CHECK_EQ(size % (512 / 8 / sizeof(float)), 0) << - "The total size of the input must be a multiple of 16."; + + const std::size_t size = in.shape_.Size(); + // Doesn't make sense to take MaxAbsolute of nothing. + CHECK_GE(size, 1U); const float *data = in.dptr(); - KERNEL_ASSIGN(*out.dptr(), req[0], ::intgemm::MaxAbsolute(data, data + size)); + // To maintain alignment, be a multiple of AVX512 register size. + const std::size_t kMultiple = 512 / 8 / sizeof(float); + CHECK_EQ(reinterpret_cast(data) % kMultiple, 0) << "Data must be aligned to " << kMultiple << "."; + +#ifdef _OPENMP + float result = 0.0f; + // Every thread needs some work to do. Should probably be more aggressive than this. + int threads = engine::OpenMP::Get()->GetRecommendedOMPThreadCount(); + threads = std::min(threads, (size + kMultiple - 1) / kMultiple); + #pragma omp parallel num_threads(threads) + { + std::size_t num = omp_get_thread_num(); + std::size_t thread_count = omp_get_num_threads(); + std::size_t begin = ((num * size) / thread_count) & ~(kMultiple - 1); + std::size_t end = (((num + 1) * size) / thread_count) & ~(kMultiple - 1); + // Last thread gets the overhang. 
+ if (num == thread_count - 1) end = size; + float local_result = ::intgemm::MaxAbsolute(data + begin, data + end); + #pragma omp critical + result = std::max(result, local_result); + } +#else + float result = ::intgemm::MaxAbsolute(data, data + size); +#endif + KERNEL_ASSIGN(*out.dptr(), req[0], result); } NNVM_REGISTER_OP(_contrib_intgemm_maxabsolute) diff --git a/src/operator/contrib/intgemm/prepare_data_op.cc b/src/operator/contrib/intgemm/prepare_data_op.cc index 95e53fa2af85..0bf40887fb1c 100644 --- a/src/operator/contrib/intgemm/prepare_data_op.cc +++ b/src/operator/contrib/intgemm/prepare_data_op.cc @@ -93,13 +93,11 @@ void PrepareDataOpForwardCPU(const nnvm::NodeAttrs& attrs, CHECK_EQ(out.type_flag_, mshadow::kInt8); CHECK(in.CheckContiguous()); CHECK(out.CheckContiguous()); - size_t size = in.shape_.Size(); - CHECK_EQ(size % 16, 0) << "intgemm PrepareData requires the size be a multiple of 16."; const float *A = in.dptr(); int8_t *quantA = out.dptr(); const float multiplier = 127.0 / *inputs[1].dptr(); - ::intgemm::Int8::Quantize(A, quantA, multiplier, size); + ::intgemm::Int8::Quantize(A, quantA, multiplier, in.shape_.Size()); } NNVM_REGISTER_OP(_contrib_intgemm_prepare_data) From 804d78c5991ec90458470eac7abd4f370cb7b0f2 Mon Sep 17 00:00:00 2001 From: Kenneth Heafield Date: Tue, 3 Mar 2020 15:19:16 +0000 Subject: [PATCH 29/65] Update intgemm with less warnings --- 3rdparty/intgemm | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/3rdparty/intgemm b/3rdparty/intgemm index a520d4362a6e..aa174e619583 160000 --- a/3rdparty/intgemm +++ b/3rdparty/intgemm @@ -1 +1 @@ -Subproject commit a520d4362a6eb0bd1a9ea418d293ec6d9d0a5075 +Subproject commit aa174e619583552dcbd9690af961e11ab75c24e2 From 792bf72d20ac6c3215a6eeb16def605b8c3250fd Mon Sep 17 00:00:00 2001 From: Kenneth Heafield Date: Tue, 17 Mar 2020 11:09:59 +0000 Subject: [PATCH 30/65] Updated intgemm, should fix compiler issues. --- 3rdparty/intgemm | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/3rdparty/intgemm b/3rdparty/intgemm index aa174e619583..8033cdf974f6 160000 --- a/3rdparty/intgemm +++ b/3rdparty/intgemm @@ -1 +1 @@ -Subproject commit aa174e619583552dcbd9690af961e11ab75c24e2 +Subproject commit 8033cdf974f69599edde6ba9126bff01b91f2435 From edc00f61feb53a605b7a04e29a5a24b008383d58 Mon Sep 17 00:00:00 2001 From: Kenneth Heafield Date: Tue, 17 Mar 2020 11:15:24 +0000 Subject: [PATCH 31/65] Whitespace --- src/operator/contrib/intgemm/max_absolute_op.cc | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/operator/contrib/intgemm/max_absolute_op.cc b/src/operator/contrib/intgemm/max_absolute_op.cc index edbe9e1872ed..fd14c321fe19 100644 --- a/src/operator/contrib/intgemm/max_absolute_op.cc +++ b/src/operator/contrib/intgemm/max_absolute_op.cc @@ -89,7 +89,8 @@ void MaxAbsoluteOpForwardCPU(const nnvm::NodeAttrs& attrs, const float *data = in.dptr(); // To maintain alignment, be a multiple of AVX512 register size. const std::size_t kMultiple = 512 / 8 / sizeof(float); - CHECK_EQ(reinterpret_cast(data) % kMultiple, 0) << "Data must be aligned to " << kMultiple << "."; + CHECK_EQ(reinterpret_cast(data) % kMultiple, 0) + << "Data must be aligned to " << kMultiple << " bytes."; #ifdef _OPENMP float result = 0.0f; From 5f3dc65f3d6fbbe18fe9d6dfa4fc52b85dd2b5b7 Mon Sep 17 00:00:00 2001 From: Kenneth Heafield Date: Tue, 17 Mar 2020 12:50:11 +0000 Subject: [PATCH 32/65] gcc < 5 is a lost cause for intrinsics. 
--- CMakeLists.txt | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 80ebf976c9b9..84df716c7816 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -39,7 +39,9 @@ if(USE_MKL_IF_AVAILABLE AND (NOT APPLE) AND (NOT MSVC) AND (CMAKE_HOST_SYSTEM_PR else() option(USE_MKLDNN "Build with MKL-DNN support" OFF) endif() -IF ((CMAKE_HOST_SYSTEM_PROCESSOR STREQUAL "x86_64") AND (NOT CMAKE_CROSSCOMPILING)) +#gcc 4 doesn't support AVX2 and SSSE3 support doesn't work with target attributes so ban gcc < 5 from intgemm. +if ((CMAKE_HOST_SYSTEM_PROCESSOR STREQUAL "x86_64") AND (NOT CMAKE_CROSSCOMPILING) AND + ((NOT CMAKE_COMPILER_IS_GNUCC) OR (CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 5.0))) option(USE_INTGEMM "Build with x86 intgemm library for low-precision multiplication" ON) else() option(USE_INTGEMM "Build with x86 intgemm library for low-precision multiplication" OFF) From a9c26db60f3d08b8d3af34ccd69ade3e8ce843f9 Mon Sep 17 00:00:00 2001 From: Kenneth Heafield Date: Tue, 17 Mar 2020 14:39:47 +0000 Subject: [PATCH 33/65] Exclude intgemm operators when compiling with -DUSE_INTGEMM=OFF --- CMakeLists.txt | 3 +++ 1 file changed, 3 insertions(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index 84df716c7816..7ad97be7dc00 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -544,6 +544,9 @@ FILE(GLOB_RECURSE CUDA "src/*.cu" "src/*.cuh") if (USE_INTGEMM) list(APPEND SOURCE "3rdparty/intgemm/intgemm.cc") +else() + FILE(GLOB_RECURSE INTGEMM_OPERATOR_SOURCE "src/operator/contrib/intgemm/*.cc" "src/operator/contrib/intgemm/*.h") + list(REMOVE_ITEM SOURCE ${INTGEMM_OPERATOR_SOURCE}) endif() # add nnvm to source From a438d3deab17fa58fcdf003f5029d4278400197f Mon Sep 17 00:00:00 2001 From: Kenneth Heafield Date: Mon, 6 Apr 2020 12:46:54 +0000 Subject: [PATCH 34/65] intgemm with OMP support for multiply --- 3rdparty/intgemm | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/3rdparty/intgemm b/3rdparty/intgemm index 8033cdf974f6..468e3d25c65c 160000 --- a/3rdparty/intgemm +++ b/3rdparty/intgemm @@ -1 +1 @@ -Subproject commit 8033cdf974f69599edde6ba9126bff01b91f2435 +Subproject commit 468e3d25c65c873062fb4e5b770deb3d209069cb From f04e70ab6b80295f2e724678843f0fa24d01d305 Mon Sep 17 00:00:00 2001 From: Kenneth Heafield Date: Mon, 13 Apr 2020 13:02:10 +0000 Subject: [PATCH 35/65] Update intgemm, fix old glibc --- 3rdparty/intgemm | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/3rdparty/intgemm b/3rdparty/intgemm index 468e3d25c65c..fb96b0851cf4 160000 --- a/3rdparty/intgemm +++ b/3rdparty/intgemm @@ -1 +1 @@ -Subproject commit 468e3d25c65c873062fb4e5b770deb3d209069cb +Subproject commit fb96b0851cf420ac49c13b361a503afffe386ada From b02dbc3387c38392b1be1570ba1fa7bfe1673d61 Mon Sep 17 00:00:00 2001 From: Kenneth Heafield Date: Mon, 20 Apr 2020 10:04:57 +0000 Subject: [PATCH 36/65] Properly allocate temporary space for quantized A --- .../intgemm/intgemm_fully_connected_op.cc | 29 +++++-------------- 1 file changed, 7 insertions(+), 22 deletions(-) diff --git a/src/operator/contrib/intgemm/intgemm_fully_connected_op.cc b/src/operator/contrib/intgemm/intgemm_fully_connected_op.cc index 79215fa1c4de..8eb61adf278f 100644 --- a/src/operator/contrib/intgemm/intgemm_fully_connected_op.cc +++ b/src/operator/contrib/intgemm/intgemm_fully_connected_op.cc @@ -141,23 +141,6 @@ bool IntgemmFullyConnectedOpType(const nnvm::NodeAttrs& attrs, return ((*in_attrs)[0] == mshadow::kInt8 || (*in_attrs)[0] == mshadow::kFloat32); } -namespace { - -// 
This is used to free because AlignedVector does not have Reset. -class FreeMe { - public: - FreeMe() : mem_(nullptr) {} - ~FreeMe() { std::free(mem_); } - void Reset(int8_t *with) { - std::free(mem_); - mem_ = with; - } - private: - int8_t *mem_; -}; - -} // namespace - void IntgemmFullyConnectedOpForwardCPU(const nnvm::NodeAttrs& attrs, const OpContext& ctx, const std::vector& inputs, @@ -200,17 +183,15 @@ void IntgemmFullyConnectedOpForwardCPU(const nnvm::NodeAttrs& attrs, float out_float_multiplier = *inputs[2].dptr(); int8_t *A_quant; - // TODO(kpuatamazon) report this memory consumption? - FreeMe A_quant_store; + mshadow::Tensor A_quant_store; if (A.type_flag_ == mshadow::kFloat32) { const float *A_raw = A.dptr(); // Quantize A for the user. // Future: allow scale to be passed in? Should the induced scale be an output? float scale = 127.0 / ::intgemm::MaxAbsolute(A_raw, A_raw + A.shape_.Size()); out_float_multiplier /= scale; - A_quant = static_cast(aligned_alloc(64, A.shape_.Size())); - CHECK(A_quant); - A_quant_store.Reset(A_quant); + A_quant_store = ctx.requested[0].get_space_typed(mshadow::Shape1(A.shape_.Size()), ctx.get_stream()); + A_quant = A_quant_store.dptr_; ::intgemm::Int8::PrepareA(A_raw, A_quant, scale, A_rows, inner); } else { CHECK_EQ(A.type_flag_, mshadow::kInt8); @@ -266,6 +247,10 @@ The out_type can be int32 or float32. Bias must have the same type. std::vector{"data", "weight", "scaling"} : std::vector{"data", "weight", "scaling", "bias"}; }) +.set_attr("FResourceRequest", + [](const NodeAttrs& attrs) { + return std::vector{ResourceRequest::kTempSpace}; + }) .set_attr("FInferShape", IntgemmFullyConnectedOpShape) .set_attr("FInferType", IntgemmFullyConnectedOpType) .set_attr("FCompute", IntgemmFullyConnectedOpForwardCPU) From d7cda47b7e74c5a5deac5aa51c4e4d36ed28242e Mon Sep 17 00:00:00 2001 From: Kenneth Heafield Date: Mon, 25 May 2020 09:09:17 +0000 Subject: [PATCH 37/65] Fix compile test path for avx512bw --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 3b0b68ff2e70..ea734ffc67a4 100644 --- a/Makefile +++ b/Makefile @@ -467,7 +467,7 @@ CUSRC = $(wildcard src/*/*/*/*.cu src/*/*/*.cu src/*/*.cu src/*.cu) ifeq ($(USE_INTGEMM), 1) $(shell mkdir -p build/3rdparty/intgemm/) $(shell echo '#pragma once' >build/3rdparty/intgemm/intgemm_config.h) - ifneq ($(shell $(CXX) $(CFLAGS) -mavx512f -mavx512bw -mavx512dq $(ROOTDIR)/3rdparty/intgemm/compile_test_avx512.cc 2>/dev/null && echo \\\#define INTGEMM_COMPILER_SUPPORTS_AVX512 >>build/3rdparty/intgemm/intgemm_config.h; echo $$?), 0) + ifneq ($(shell $(CXX) $(CFLAGS) $(ROOTDIR)/3rdparty/intgemm/compile_test_avx512bw.cc 2>/dev/null && echo \\\#define INTGEMM_COMPILER_SUPPORTS_AVX512 >>build/3rdparty/intgemm/intgemm_config.h; echo $$?), 0) $(warning WARNING: The compiler is too old for AVX512BW; so these instructions will not be used.) 
endif ifneq ($(shell $(CXX) $(CFLAGS) $(ROOTDIR)/3rdparty/intgemm/compile_test_avx512vnni.cc 2>/dev/null && echo \\\#define INTGEMM_COMPILER_SUPPORTS_AVX512VNNI >>build/3rdparty/intgemm/intgemm_config.h; echo $$?), 0) From 7a20b91e6986fd774d85c838c3465d10391a7d82 Mon Sep 17 00:00:00 2001 From: Kenneth Heafield Date: Mon, 25 May 2020 09:12:25 +0000 Subject: [PATCH 38/65] Define AVX512BW symbol --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index ea734ffc67a4..91eef9d63528 100644 --- a/Makefile +++ b/Makefile @@ -467,7 +467,7 @@ CUSRC = $(wildcard src/*/*/*/*.cu src/*/*/*.cu src/*/*.cu src/*.cu) ifeq ($(USE_INTGEMM), 1) $(shell mkdir -p build/3rdparty/intgemm/) $(shell echo '#pragma once' >build/3rdparty/intgemm/intgemm_config.h) - ifneq ($(shell $(CXX) $(CFLAGS) $(ROOTDIR)/3rdparty/intgemm/compile_test_avx512bw.cc 2>/dev/null && echo \\\#define INTGEMM_COMPILER_SUPPORTS_AVX512 >>build/3rdparty/intgemm/intgemm_config.h; echo $$?), 0) + ifneq ($(shell $(CXX) $(CFLAGS) $(ROOTDIR)/3rdparty/intgemm/compile_test_avx512bw.cc 2>/dev/null && echo \\\#define INTGEMM_COMPILER_SUPPORTS_AVX512BW >>build/3rdparty/intgemm/intgemm_config.h; echo $$?), 0) $(warning WARNING: The compiler is too old for AVX512BW; so these instructions will not be used.) endif ifneq ($(shell $(CXX) $(CFLAGS) $(ROOTDIR)/3rdparty/intgemm/compile_test_avx512vnni.cc 2>/dev/null && echo \\\#define INTGEMM_COMPILER_SUPPORTS_AVX512VNNI >>build/3rdparty/intgemm/intgemm_config.h; echo $$?), 0) From acf325df3e2328b21fd05fd776cdcfa9d03362a1 Mon Sep 17 00:00:00 2001 From: Kenneth Heafield Date: Mon, 8 Jun 2020 17:22:22 +0000 Subject: [PATCH 39/65] Whitespace --- src/operator/contrib/intgemm/intgemm_fully_connected_op.cc | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/operator/contrib/intgemm/intgemm_fully_connected_op.cc b/src/operator/contrib/intgemm/intgemm_fully_connected_op.cc index 8eb61adf278f..78ab24dd6f8a 100644 --- a/src/operator/contrib/intgemm/intgemm_fully_connected_op.cc +++ b/src/operator/contrib/intgemm/intgemm_fully_connected_op.cc @@ -190,7 +190,9 @@ void IntgemmFullyConnectedOpForwardCPU(const nnvm::NodeAttrs& attrs, // Future: allow scale to be passed in? Should the induced scale be an output? 
float scale = 127.0 / ::intgemm::MaxAbsolute(A_raw, A_raw + A.shape_.Size()); out_float_multiplier /= scale; - A_quant_store = ctx.requested[0].get_space_typed(mshadow::Shape1(A.shape_.Size()), ctx.get_stream()); + A_quant_store = ctx.requested[0].get_space_typed( + mshadow::Shape1(A.shape_.Size()), + ctx.get_stream()); A_quant = A_quant_store.dptr_; ::intgemm::Int8::PrepareA(A_raw, A_quant, scale, A_rows, inner); } else { From 91c729d04a84b4d289a4d6d37dc517845f2c45bf Mon Sep 17 00:00:00 2001 From: Kenneth Heafield Date: Wed, 22 Jul 2020 17:33:21 +0000 Subject: [PATCH 40/65] Update intgemm including -mno-avx fix --- 3rdparty/intgemm | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/3rdparty/intgemm b/3rdparty/intgemm index fb96b0851cf4..225dd7368549 160000 --- a/3rdparty/intgemm +++ b/3rdparty/intgemm @@ -1 +1 @@ -Subproject commit fb96b0851cf420ac49c13b361a503afffe386ada +Subproject commit 225dd7368549a0cdf69d2d212e0754e79894a92d From 7ab838e792bfe12914f67cc2f0b9f47619bafaf7 Mon Sep 17 00:00:00 2001 From: Kenneth Heafield Date: Mon, 3 Aug 2020 17:15:55 +0000 Subject: [PATCH 41/65] Align to 64 bytes for intgemm too --- CMakeLists.txt | 1 + src/storage/cpu_device_storage.h | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index db0fd78feab8..fb2ded6850d4 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -299,6 +299,7 @@ if(USE_INTGEMM) include_directories(3rdparty/intgemm) #intgemm generates a config header based on AVX512 support in the compiler. include_directories(${CMAKE_CURRENT_BINARY_DIR}/3rdparty/intgemm) + add_definitions(-DMXNET_USE_INTGEMM=1) endif() # Allow Cuda compiles outside of src tree to find things in 'src' and 'include' diff --git a/src/storage/cpu_device_storage.h b/src/storage/cpu_device_storage.h index b81cdbc17e25..27d80f488ad1 100644 --- a/src/storage/cpu_device_storage.h +++ b/src/storage/cpu_device_storage.h @@ -50,7 +50,7 @@ class CPUDeviceStorage { /*! * \brief Alignment of allocation. */ -#if MXNET_USE_MKLDNN == 1 +#if MXNET_USE_MKLDNN == 1 || MXNET_USE_INTGEMM == 1 // MKLDNN requires special alignment. 64 is used by the MKLDNN library in // memory allocation. static constexpr size_t alignment_ = kMKLDNNAlign; From b1a9725223d1713d6b4c52f1a626d1526aa16cf4 Mon Sep 17 00:00:00 2001 From: Kenneth Heafield Date: Mon, 3 Aug 2020 18:57:55 +0000 Subject: [PATCH 42/65] Use intgemm MaxAbsolute's OMP support --- 3rdparty/intgemm | 2 +- .../contrib/intgemm/max_absolute_op.cc | 23 +------------------ 2 files changed, 2 insertions(+), 23 deletions(-) diff --git a/3rdparty/intgemm b/3rdparty/intgemm index 225dd7368549..3e3586c35e57 160000 --- a/3rdparty/intgemm +++ b/3rdparty/intgemm @@ -1 +1 @@ -Subproject commit 225dd7368549a0cdf69d2d212e0754e79894a92d +Subproject commit 3e3586c35e579a1033e284a61fae21f6b6b427a2 diff --git a/src/operator/contrib/intgemm/max_absolute_op.cc b/src/operator/contrib/intgemm/max_absolute_op.cc index fd14c321fe19..5bbd48ff05e1 100644 --- a/src/operator/contrib/intgemm/max_absolute_op.cc +++ b/src/operator/contrib/intgemm/max_absolute_op.cc @@ -83,35 +83,14 @@ void MaxAbsoluteOpForwardCPU(const nnvm::NodeAttrs& attrs, CHECK(out.CheckContiguous()); const std::size_t size = in.shape_.Size(); - // Doesn't make sense to take MaxAbsolute of nothing. - CHECK_GE(size, 1U); const float *data = in.dptr(); // To maintain alignment, be a multiple of AVX512 register size. 
- const std::size_t kMultiple = 512 / 8 / sizeof(float); + const std::size_t kMultiple = 512 / 8; CHECK_EQ(reinterpret_cast(data) % kMultiple, 0) << "Data must be aligned to " << kMultiple << " bytes."; -#ifdef _OPENMP - float result = 0.0f; - // Every thread needs some work to do. Should probably be more aggressive than this. - int threads = engine::OpenMP::Get()->GetRecommendedOMPThreadCount(); - threads = std::min(threads, (size + kMultiple - 1) / kMultiple); - #pragma omp parallel num_threads(threads) - { - std::size_t num = omp_get_thread_num(); - std::size_t thread_count = omp_get_num_threads(); - std::size_t begin = ((num * size) / thread_count) & ~(kMultiple - 1); - std::size_t end = (((num + 1) * size) / thread_count) & ~(kMultiple - 1); - // Last thread gets the overhang. - if (num == thread_count - 1) end = size; - float local_result = ::intgemm::MaxAbsolute(data + begin, data + end); - #pragma omp critical - result = std::max(result, local_result); - } -#else float result = ::intgemm::MaxAbsolute(data, data + size); -#endif KERNEL_ASSIGN(*out.dptr(), req[0], result); } From 784889e67c697ad2ee5c93ff83900b369ab0746a Mon Sep 17 00:00:00 2001 From: Kenneth Heafield Date: Mon, 10 Aug 2020 10:11:12 +0000 Subject: [PATCH 43/65] Update intgemm including MSVC support --- 3rdparty/intgemm | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/3rdparty/intgemm b/3rdparty/intgemm index 3e3586c35e57..1ba086e1e380 160000 --- a/3rdparty/intgemm +++ b/3rdparty/intgemm @@ -1 +1 @@ -Subproject commit 3e3586c35e579a1033e284a61fae21f6b6b427a2 +Subproject commit 1ba086e1e380269471b1d023a4f0781afb6bf7ca From ea3bb9dfaabe35fff17c7e358aa4c90cd25e664d Mon Sep 17 00:00:00 2001 From: Kenneth Heafield Date: Mon, 10 Aug 2020 17:12:00 +0000 Subject: [PATCH 44/65] More checks for 64-byte alignment --- src/operator/contrib/intgemm/prepare_data_op.cc | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/operator/contrib/intgemm/prepare_data_op.cc b/src/operator/contrib/intgemm/prepare_data_op.cc index 0bf40887fb1c..9be491c951f7 100644 --- a/src/operator/contrib/intgemm/prepare_data_op.cc +++ b/src/operator/contrib/intgemm/prepare_data_op.cc @@ -96,6 +96,8 @@ void PrepareDataOpForwardCPU(const nnvm::NodeAttrs& attrs, const float *A = in.dptr(); int8_t *quantA = out.dptr(); + CHECK_EQ(reinterpret_cast(A) % 64, 0); + CHECK_EQ(reinterpret_cast(quantA) % 64, 0); const float multiplier = 127.0 / *inputs[1].dptr(); ::intgemm::Int8::Quantize(A, quantA, multiplier, in.shape_.Size()); } From 7618fb1614c8eb1d9e1803c394b4025d7f0eeedc Mon Sep 17 00:00:00 2001 From: Kenneth Heafield Date: Mon, 10 Aug 2020 17:12:18 +0000 Subject: [PATCH 45/65] Don't take a scaling factor for int32 output --- .../intgemm/intgemm_fully_connected_op.cc | 144 ++++++++++++------ 1 file changed, 94 insertions(+), 50 deletions(-) diff --git a/src/operator/contrib/intgemm/intgemm_fully_connected_op.cc b/src/operator/contrib/intgemm/intgemm_fully_connected_op.cc index 78ab24dd6f8a..b0fc501cfe0f 100644 --- a/src/operator/contrib/intgemm/intgemm_fully_connected_op.cc +++ b/src/operator/contrib/intgemm/intgemm_fully_connected_op.cc @@ -60,30 +60,43 @@ struct IntgemmFullyConnectedParam : public dmlc::Parameter void IntgemmFullyConnectedSanity(const nnvm::NodeAttrs& attrs, T* in, T* out) { +// Parse the above fields into indices for parameters. +// The order is: data weight [scaling] [bias]. 
+struct ParameterIndices { + explicit ParameterIndices(const IntgemmFullyConnectedParam& param) : + data(0), + weight(1), + scaling(param.out_type == mshadow::kFloat32 ? 2 : kInvalid), + bias(param.no_bias ? kInvalid : (HaveScaling() ? 3 : 2)), + count(2U + HaveScaling() + HaveBias()) {} + bool HaveScaling() const { return scaling != kInvalid; } + bool HaveBias() const { return bias != kInvalid; } + const unsigned int data; + const unsigned int weight; + const unsigned int scaling; + const unsigned int bias; + const unsigned int count; + static const unsigned int kInvalid = std::numeric_limits::max(); +}; +template ParameterIndices Sanity(const nnvm::NodeAttrs& attrs, + T* in, + T* out) { // 3-4 parameters: A, B, scaling, and optional bias - const IntgemmFullyConnectedParam& param = nnvm::get(attrs.parsed); - CHECK_EQ(in->size(), param.no_bias ? 3U : 4U); + ParameterIndices ret(nnvm::get(attrs.parsed)); + CHECK_EQ(in->size(), ret.count); CHECK_EQ(out->size(), 1U); + return ret; } } // namespace inline bool IntgemmFullyConnectedOpShape(const nnvm::NodeAttrs& attrs, mxnet::ShapeVector* in_shape, mxnet::ShapeVector* out_shape) { - IntgemmFullyConnectedSanity(attrs, in_shape, out_shape); - // This follows FullyConnectedShape except there's no option to flatten and the bias is implied. + const ParameterIndices indices(Sanity(attrs, in_shape, out_shape)); const IntgemmFullyConnectedParam& param = nnvm::get(attrs.parsed); - - // The rest is copied from FullyConnected. + // This follows FullyConnectedShape except for scaling. using namespace mshadow; - if (!param.no_bias) { - CHECK_EQ(in_shape->size(), 4U) << "Input:[data, weight, scaling_factor, bias]"; - } else { - CHECK_EQ(in_shape->size(), 3U) << "Input:[data, weight, scaling_factor]"; - } - CHECK_EQ(out_shape->size(), 1U); - mxnet::TShape dshape = (*in_shape)[0]; + mxnet::TShape dshape = (*in_shape)[indices.data]; mxnet::TShape oshape = (*out_shape)[0]; // require data to be known if (!mxnet::ndim_is_known(dshape)) return false; @@ -94,12 +107,14 @@ inline bool IntgemmFullyConnectedOpShape(const nnvm::NodeAttrs& attrs, } else { num_input = dshape.ProdShape(1, dshape.ndim()); } - SHAPE_ASSIGN_CHECK(*in_shape, 1, Shape2(param.num_hidden, num_input)); - SHAPE_ASSIGN_CHECK(*in_shape, 2, mxnet::TShape(1, 1)); - if (!param.no_bias) { - if (!shape_assign(&(*in_shape)[3], Shape1(param.num_hidden)) && - !shape_assign(&(*in_shape)[3], Shape2(param.num_hidden, 1))) { - LOG(FATAL) << "Unexpected shape for bias " << (*in_shape)[3]; + SHAPE_ASSIGN_CHECK(*in_shape, indices.weight, Shape2(param.num_hidden, num_input)); + if (indices.HaveScaling()) { + SHAPE_ASSIGN_CHECK(*in_shape, indices.scaling, mxnet::TShape(1, 1)); + } + if (indices.HaveBias()) { + if (!shape_assign(&(*in_shape)[indices.bias], Shape1(param.num_hidden)) && + !shape_assign(&(*in_shape)[indices.bias], Shape2(param.num_hidden, 1))) { + LOG(FATAL) << "Unexpected shape for bias " << (*in_shape)[indices.bias]; } } @@ -112,7 +127,7 @@ inline bool IntgemmFullyConnectedOpShape(const nnvm::NodeAttrs& attrs, } if (oshape.ndim() > 0) { dshape[0] = oshape[0]; - SHAPE_ASSIGN_CHECK(*in_shape, 0, dshape); + SHAPE_ASSIGN_CHECK(*in_shape, indices.data, dshape); } return true; } @@ -120,25 +135,27 @@ inline bool IntgemmFullyConnectedOpShape(const nnvm::NodeAttrs& attrs, bool IntgemmFullyConnectedOpType(const nnvm::NodeAttrs& attrs, std::vector* in_attrs, std::vector* out_attrs) { - IntgemmFullyConnectedSanity(attrs, in_attrs, out_attrs); + const ParameterIndices indices(Sanity(attrs, in_attrs, out_attrs)); 
const IntgemmFullyConnectedParam& param = nnvm::get(attrs.parsed); // Match the configuration for output. TYPE_ASSIGN_CHECK(*out_attrs, 0, param.out_type); - if (!param.no_bias) { + if (indices.HaveBias()) { // Bias has same type as output. - TYPE_ASSIGN_CHECK(*in_attrs, 3, (*out_attrs)[0]); - TYPE_ASSIGN_CHECK(*out_attrs, 0, (*in_attrs)[3]); + TYPE_ASSIGN_CHECK(*in_attrs, indices.bias, (*out_attrs)[0]); + TYPE_ASSIGN_CHECK(*out_attrs, 0, (*in_attrs)[indices.bias]); } // Scaling is float32. - TYPE_ASSIGN_CHECK(*in_attrs, 2, mshadow::kFloat32); - // Users have to prepare B. - TYPE_ASSIGN_CHECK(*in_attrs, 1, mshadow::kInt8); + if (indices.HaveScaling()) { + TYPE_ASSIGN_CHECK(*in_attrs, indices.scaling, mshadow::kFloat32); + } + // Users have to prepare B. It wasn't intended to be efficient. + TYPE_ASSIGN_CHECK(*in_attrs, indices.weight, mshadow::kInt8); // A can be a float (in which case it is automatically quantized) or int8. - if (type_is_none((*in_attrs)[0])) { + if (type_is_none((*in_attrs)[indices.data])) { return false; } - return ((*in_attrs)[0] == mshadow::kInt8 || (*in_attrs)[0] == mshadow::kFloat32); + return ((*in_attrs)[indices.data] == mshadow::kInt8 || (*in_attrs)[indices.data] == mshadow::kFloat32); } void IntgemmFullyConnectedOpForwardCPU(const nnvm::NodeAttrs& attrs, @@ -146,12 +163,12 @@ void IntgemmFullyConnectedOpForwardCPU(const nnvm::NodeAttrs& attrs, const std::vector& inputs, const std::vector& req, const std::vector& outputs) { - IntgemmFullyConnectedSanity(attrs, &inputs, &outputs); + const ParameterIndices indices(Sanity(attrs, &inputs, &outputs)); const IntgemmFullyConnectedParam& param = nnvm::get(attrs.parsed); CHECK_EQ(req.size(), 1U); CHECK_EQ(req[0], kWriteTo) << "TODO: doing more than overwriting for intgemm."; - const TBlob &A = inputs[0], &B = inputs[1], &C = outputs[0]; + const TBlob &A = inputs[indices.data], &B = inputs[indices.weight], &C = outputs[0]; CHECK(A.type_flag_ == mshadow::kInt8 || A.type_flag_ == mshadow::kFloat32); CHECK_EQ(B.type_flag_, mshadow::kInt8); @@ -170,17 +187,21 @@ void IntgemmFullyConnectedOpForwardCPU(const nnvm::NodeAttrs& attrs, bool bias = !param.no_bias; if (bias) { - CHECK_EQ(inputs[3].type_flag_, mshadow::kFloat32); - CHECK_EQ(C.type_flag_, mshadow::kFloat32); - CHECK_EQ(inputs[3].shape_.Size(), param.num_hidden); + CHECK_EQ(inputs[indices.bias].type_flag_, C.type_flag_); + CHECK_EQ(inputs[indices.bias].shape_.Size(), param.num_hidden); } CHECK_EQ(inner % ::intgemm::Int8::tile_info.b_rows, 0) << "intgemm requires the inner dimension be a multiple of " << ::intgemm::Int8::tile_info.b_rows; CHECK_EQ(B_cols % ::intgemm::Int8::tile_info.b_cols, 0) << "intgemm requires B have a multiple of " << ::intgemm::Int8::tile_info.b_cols << - " columns inthe equation C = AB."; + " columns in the equation C = AB."; - float out_float_multiplier = *inputs[2].dptr(); + float out_float_multiplier; + if (indices.HaveScaling()) { + out_float_multiplier = *inputs[indices.scaling].dptr(); + } else { + out_float_multiplier = 0.0; // Unused; stop compiler from complaining. 
+  }
 
   int8_t *A_quant;
   mshadow::Tensor<cpu, 1, int8_t> A_quant_store;
@@ -200,17 +221,30 @@ void IntgemmFullyConnectedOpForwardCPU(const nnvm::NodeAttrs& attrs,
     A_quant = A.dptr<int8_t>();
   }
   const int8_t *B_quant = B.dptr<int8_t>();
+  CHECK_EQ(reinterpret_cast<size_t>(A_quant) % 64, 0) << "Pointers should be aligned to a multiple of 64.";
+  CHECK_EQ(reinterpret_cast<size_t>(B_quant) % 64, 0) << "Pointers should be aligned to a multiple of 64.";
+  if (C.type_flag_ == mshadow::kFloat32) {
+    CHECK_EQ(reinterpret_cast<size_t>(C.dptr<float>()) % 64, 0) <<
+      "Pointers should be aligned to a multiple of 64.";
+  } else {
+    CHECK_EQ(reinterpret_cast<size_t>(C.dptr<int32_t>()) % 64, 0) <<
+      "Pointers should be aligned to a multiple of 64.";
+  }
 
   if (bias) {
     if (C.type_flag_ == mshadow::kFloat32) {
+      CHECK_EQ(reinterpret_cast<size_t>(inputs[indices.bias].dptr<float>()) % 64, 0) <<
+        "Pointers should be aligned to a multiple of 64.";
       ::intgemm::callbacks::UnquantizeAndAddBiasAndWrite cb(
           out_float_multiplier,
-          inputs[3].dptr<float>(),
+          inputs[indices.bias].dptr<float>(),
           C.dptr<float>());
       ::intgemm::Int8::Multiply(A_quant, B_quant, A_rows, inner, B_cols, cb);
     } else {
       // int32
-      ::intgemm::callbacks::AddBiasAndWrite cb(inputs[3].dptr<int32_t>(), C.dptr<int32_t>());
+      CHECK_EQ(reinterpret_cast<size_t>(inputs[indices.bias].dptr<int32_t>()) % 64, 0) <<
+        "Pointers should be aligned to a multiple of 64.";
+      ::intgemm::callbacks::AddBiasAndWrite cb(inputs[indices.bias].dptr<int32_t>(), C.dptr<int32_t>());
       ::intgemm::Int8::Multiply(A_quant, B_quant, A_rows, inner, B_cols, cb);
     }
   } else {
@@ -226,28 +260,38 @@ void IntgemmFullyConnectedOpForwardCPU(const nnvm::NodeAttrs& attrs,
 }
 
 NNVM_REGISTER_OP(_contrib_intgemm_fully_connected)
-.describe(R"code(Multiply matrices using 8-bit integers.
+.describe(R"code(Multiply matrices using 8-bit integers. data * weight.
+
+Input tensor arguments are: data weight [scaling] [bias]
 
-The data argument can be either float32 or prepared using intgemm_prepare_data.
+data: either float32 or prepared using intgemm_prepare_data (in which case it is int8).
 
-The weight argument must be prepared using intgemm_prepare_weight.
+weight: must be prepared using intgemm_prepare_weight.
 
-If out_type is float32, then a scaling factor is applied before bias. Typically this is 1/the scaling factor you provided to prepare_weight/the scaling factor you provided to prepare_data (if data is quantized).
+scaling: present if and only if out_type is float32. If so, this is multiplied by the result before adding bias. Typically:
+scaling = (max passed to intgemm_prepare_weight)/127.0 if data is in float32
+scaling = (max passed to intgemm_prepare_data)/127.0 * (max passed to intgemm_prepare_weight)/127.0 if data is in int8
 
-The out_type can be int32 or float32. Bias must have the same type.
+bias: present if and only if !no_bias. This is added to the output after scaling and has the same number of columns as the output.
+
+out_type: type of the output.
 )code" ADD_FILELINE)
 .set_attr_parser(ParamParser<IntgemmFullyConnectedParam>)
 .set_num_inputs([](const NodeAttrs& attrs) {
-  const IntgemmFullyConnectedParam& params = nnvm::get<IntgemmFullyConnectedParam>(attrs.parsed);
-  return params.no_bias ? 3 : 4;
+  return ParameterIndices(nnvm::get<IntgemmFullyConnectedParam>(attrs.parsed)).count;
 })
 .set_num_outputs(1)
 .set_attr<nnvm::FListInputNames>("FListInputNames", [](const NodeAttrs& attrs) {
-  const IntgemmFullyConnectedParam& params = nnvm::get<IntgemmFullyConnectedParam>(attrs.parsed);
-  return params.no_bias ?
-    std::vector<std::string>{"data", "weight", "scaling"} :
-    std::vector<std::string>{"data", "weight", "scaling", "bias"};
+  std::vector<std::string> ret{"data", "weight"};
+  ParameterIndices indices(nnvm::get<IntgemmFullyConnectedParam>(attrs.parsed));
+  if (indices.HaveScaling()) {
+    ret.push_back("scaling");
+  }
+  if (indices.HaveBias()) {
+    ret.push_back("bias");
+  }
+  return ret;
 })
 .set_attr<FResourceRequest>("FResourceRequest", [](const NodeAttrs& attrs) {

From 9fa5ffffaca8ba1a2b296378474000b8e8b5471c Mon Sep 17 00:00:00 2001
From: Kenneth Heafield
Date: Mon, 10 Aug 2020 17:40:25 +0000
Subject: [PATCH 46/65] Tests for intgemm. Note these currently fail due to
 64-byte alignment being broken in master
 https://github.com/apache/incubator-mxnet/issues/18854

---
 tests/python/unittest/test_contrib_intgemm.py | 201 ++++++++++++++++++
 1 file changed, 201 insertions(+)
 create mode 100644 tests/python/unittest/test_contrib_intgemm.py

diff --git a/tests/python/unittest/test_contrib_intgemm.py b/tests/python/unittest/test_contrib_intgemm.py
new file mode 100644
index 000000000000..8f3afb8411ca
--- /dev/null
+++ b/tests/python/unittest/test_contrib_intgemm.py
@@ -0,0 +1,201 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+import mxnet as mx
+from mxnet.test_utils import same
+from common import with_seed
+import random
+
+@with_seed()
+def test_contrib_intgemm_maxabsolute():
+    if "intgemm_maxabsolute" not in dir(mx.nd.contrib):
+        return
+    shapes = [
+        (3, 2),
+        (9, 17),
+        (2, 7, 1, 8),
+    ]
+    # Test all sizes relevant to register lengths too.
+    for i in range(1, 65):
+        shapes.append((i,))
+    for shape in shapes:
+        m = mx.nd.random_uniform(low=-100.0, high=100.0, shape=shape)
+        fast = mx.nd.contrib.intgemm_maxabsolute(m)
+        slow = mx.nd.max(mx.nd.abs(m))
+        assert same(fast, slow)
+
+@with_seed()
+def test_contrib_intgemm_prepare_data():
+    if "intgemm_prepare_data" not in dir(mx.nd.contrib):
+        return
+    # Try all weird overhang cases
+    shapes = [(i,) for i in range(1, 67)] + [(2,3), (130, 12)]
+    for shape in shapes:
+        for max_quant in [2.0]:#, 1.0, 3.0]:
+            m = mx.nd.random_uniform(low=-3.0, high=3.0, shape=shape)
+            test = mx.nd.contrib.intgemm_prepare_data(m, mx.nd.array([max_quant]))
+            # Reference: scale and round
+            ref = mx.nd.round(m * 127.0 / max_quant)
+            # Clip to [-127, 127]. Because otherwise e.g. -129 casts to +127.
+            ref = mx.nd.broadcast_maximum(ref, mx.nd.array([-127.0]))
+            ref = mx.nd.broadcast_minimum(ref, mx.nd.array([127.0]))
+            # Reference: cast to int8
+            ref = mx.nd.cast(ref, dtype='int8')
+            # Reference: ban -128
+            ref = mx.nd.broadcast_maximum(ref, mx.nd.array([-127], dtype = 'int8'))
+            assert same(test, ref)
+
+@with_seed()
+def test_contrib_intgemm_weight_consistent():
+    # The weight format is actually CPU-dependent so we don't directly test the
+    # output, but indirectly that it works.
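+    # (intgemm tiles weights for the width of the CPU's registers, so the
+    # byte order differs across SSSE3/AVX2/AVX512 kernels; only invariants
+    # like the direct/indirect equality below hold everywhere.)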
+    if "intgemm_prepare_weight" not in dir(mx.nd.contrib):
+        return
+    max_quant = mx.nd.array([2.0])
+    for shape in [(8, 64), (16, 64), (8, 128), (16, 128), (2, 4, 64)]:
+        m = mx.nd.random_uniform(low=-3.0, high=3.0, shape=shape)
+        direct = mx.nd.contrib.intgemm_prepare_weight(m, max_quant)
+        quant = mx.nd.contrib.intgemm_prepare_data(m, max_quant)
+        indirect = mx.nd.contrib.intgemm_prepare_weight(quant, already_quantized=True)
+        #Should get the same data from direct call and already_quantized version.
+        assert same(direct, indirect)
+
+@with_seed()
+def test_contrib_intgemm_take_weight():
+    if "intgemm_take_weight" not in dir(mx.nd.contrib):
+        return
+    indices_to_try = [
+        [0,1,2,3,4,5,6,7],
+        [1,2,1,2,1,2,1,2],
+        [7,6,5,4,3,2,1,0],
+        [3,1,4,1,5,9,2,6],
+        [random.randint(0,15) for i in range(8)],
+        [random.randint(0,15) for i in range(16)],
+        [random.randint(0,15) for i in range(24)]
+    ]
+    # Since random_uniform doesn't support int8, use python
+    m = mx.nd.array([random.randint(-127,127) for i in range(16 * 64)], dtype='int8')
+    m = m.reshape((16, 64))
+    for indices in indices_to_try:
+        indices = mx.nd.array(indices, dtype='int32')
+        # Prepare weight then take.
+        test = mx.nd.contrib.intgemm_prepare_weight(m, already_quantized=True)
+        test = mx.nd.contrib.intgemm_take_weight(test, indices)
+        # Take then prepare.
+        ref = m.take(indices, axis=0)
+        ref = mx.nd.contrib.intgemm_prepare_weight(ref, already_quantized=True)
+        assert same(test, ref)
+
+# Test a particular shape of matrix multiplication.
+def single_multiply_shape(data_rows, inner, weight_cols):
+    # Don't use full range (-127, 127) to avoid saturation.
+    data = [random.randint(-64, 64) for i in range(data_rows * inner)]
+    data = mx.nd.array(data, dtype='int8').reshape((data_rows, inner))
+    weight = [random.randint(-64, 64) for i in range(inner * weight_cols)]
+    weight = mx.nd.array(weight, dtype='int8').reshape((weight_cols, inner))
+    weight_prepared = mx.nd.contrib.intgemm_prepare_weight(weight, already_quantized=True)
+
+    # int32 output, no bias
+    test = mx.nd.contrib.intgemm_fully_connected(data,
+                                                 weight_prepared,
+                                                 no_bias=True,
+                                                 flatten=False,
+                                                 out_type='int32',
+                                                 num_hidden=weight_cols)
+    ref = mx.nd.FullyConnected(mx.nd.cast(data, dtype='float32'),
+                               mx.nd.cast(weight, dtype='float32'),
+                               no_bias=True,
+                               flatten=False,
+                               num_hidden=weight_cols)
+    assert (mx.nd.cast(test, dtype='float32') - ref).norm().asscalar() < 0.01
+
+    # float32 output, no bias
+    scale = 3.0
+    test = mx.nd.contrib.intgemm_fully_connected(data,
+                                                 weight_prepared,
+                                                 mx.nd.array(scale),
+                                                 no_bias=True,
+                                                 flatten=False,
+                                                 out_type='float32',
+                                                 num_hidden=weight_cols)
+    assert (test - ref * scale).norm().asscalar() < 0.01
+
+    # int32 output, bias
+    bias = mx.nd.array([random.randint(-60000, 60000) for i in range(weight_cols)], dtype = 'int32')
+    test = mx.nd.contrib.intgemm_fully_connected(data,
+                                                 weight_prepared,
+                                                 bias,
+                                                 no_bias=False,
+                                                 flatten=False,
+                                                 out_type='int32',
+                                                 num_hidden=weight_cols)
+    ref = mx.nd.FullyConnected(mx.nd.cast(data, dtype='float32'),
+                               mx.nd.cast(weight, dtype='float32'),
+                               mx.nd.cast(bias, dtype='float32'),
+                               no_bias=False,
+                               flatten=False,
+                               num_hidden=weight_cols)
+    assert (mx.nd.cast(test, dtype='float32') - ref).norm().asscalar() < 0.01
+
+    # float32 output, bias
+    # Scaling is applied before bias (and bias is not scaled). So to make the
+    # reference comparison easy, just scale the bias beforehand.
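+    # In other words the op computes roughly C = scaling * (data @ weight.T) + bias,
+    # so "ref * scale" matches once the bias is multiplied by scale as well.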
+    test = mx.nd.contrib.intgemm_fully_connected(data,
+                                                 weight_prepared,
+                                                 mx.nd.array(scale),
+                                                 mx.nd.cast(bias, dtype='float32') * scale,
+                                                 no_bias=False,
+                                                 flatten=False,
+                                                 out_type='float32',
+                                                 num_hidden=weight_cols)
+    assert (test - ref * scale).norm().asscalar() < 0.01
+
+    # float32 input should work the same as manually prepared int8 input.
+    data_float = mx.nd.array([random.uniform(-3.14, 3.14) for i in range(data_rows * inner)])
+    data_float = data_float.reshape(data_rows, inner)
+    direct = mx.nd.contrib.intgemm_fully_connected(data_float,
+                                                   weight_prepared,
+                                                   mx.nd.array(scale),
+                                                   mx.nd.cast(bias, dtype='float32'),
+                                                   no_bias=False,
+                                                   flatten=False,
+                                                   out_type='float32',
+                                                   num_hidden=weight_cols)
+    maxabs = mx.nd.contrib.intgemm_maxabsolute(data_float)
+    data_prepared = mx.nd.contrib.intgemm_prepare_data(data_float, maxabs)
+    cooked = mx.nd.contrib.intgemm_fully_connected(data_prepared,
+                                                   weight_prepared,
+                                                   mx.nd.array(scale * maxabs / 127.0),
+                                                   mx.nd.cast(bias, dtype='float32'),
+                                                   no_bias=False,
+                                                   flatten=False,
+                                                   out_type='float32',
+                                                   num_hidden=weight_cols)
+    assert (direct - cooked).norm().asscalar() < 0.01
+
+
+def test_contrib_intgemm_multiply():
+    if "intgemm_fully_connected" not in dir(mx.nd.contrib):
+        return
+    #The multiplication routine has approximations so everything is tested
+    #deterministically to ensure bounds are met.
+    random.seed(1)
+    for data_rows in range(1, 5):
+        for inner in range(64, 256, 64):
+            for weight_cols in range(8, 24, 8):
+                single_multiply_shape(data_rows, inner, weight_cols)
+

From 920910542726870e75d85db19afb02be729883ef Mon Sep 17 00:00:00 2001
From: Kenneth Heafield
Date: Mon, 10 Aug 2020 17:49:19 +0000
Subject: [PATCH 47/65] whitespace

---
 .../contrib/intgemm/intgemm_fully_connected_op.cc | 15 ++++++++++-----
 1 file changed, 10 insertions(+), 5 deletions(-)

diff --git a/src/operator/contrib/intgemm/intgemm_fully_connected_op.cc b/src/operator/contrib/intgemm/intgemm_fully_connected_op.cc
index b0fc501cfe0f..8e5434a90081 100644
--- a/src/operator/contrib/intgemm/intgemm_fully_connected_op.cc
+++ b/src/operator/contrib/intgemm/intgemm_fully_connected_op.cc
@@ -155,7 +155,8 @@ bool IntgemmFullyConnectedOpType(const nnvm::NodeAttrs& attrs,
   if (type_is_none((*in_attrs)[indices.data])) { return false; }
-  return ((*in_attrs)[indices.data] == mshadow::kInt8 || (*in_attrs)[indices.data] == mshadow::kFloat32);
+  return ((*in_attrs)[indices.data] == mshadow::kInt8 ||
+          (*in_attrs)[indices.data] == mshadow::kFloat32);
 }
 
 void IntgemmFullyConnectedOpForwardCPU(const nnvm::NodeAttrs& attrs,
@@ -200,7 +201,7 @@ void IntgemmFullyConnectedOpForwardCPU(const nnvm::NodeAttrs& attrs,
   if (indices.HaveScaling()) {
     out_float_multiplier = *inputs[indices.scaling].dptr<float>();
   } else {
-    out_float_multiplier = 0.0; // Unused; stop compiler from complaining.
+    out_float_multiplier = 0.0;  // Unused; stop compiler from complaining.
   }
 
   int8_t *A_quant;
@@ -221,8 +222,10 @@ void IntgemmFullyConnectedOpForwardCPU(const nnvm::NodeAttrs& attrs,
     A_quant = A.dptr<int8_t>();
   }
   const int8_t *B_quant = B.dptr<int8_t>();
-  CHECK_EQ(reinterpret_cast<size_t>(A_quant) % 64, 0) << "Pointers should be aligned to a multiple of 64.";
-  CHECK_EQ(reinterpret_cast<size_t>(B_quant) % 64, 0) << "Pointers should be aligned to a multiple of 64.";
+  CHECK_EQ(reinterpret_cast<size_t>(A_quant) % 64, 0) <<
+    "Pointers should be aligned to a multiple of 64.";
+  CHECK_EQ(reinterpret_cast<size_t>(B_quant) % 64, 0) <<
+    "Pointers should be aligned to a multiple of 64.";
   if (C.type_flag_ == mshadow::kFloat32) {
     CHECK_EQ(reinterpret_cast<size_t>(C.dptr<float>()) % 64, 0) <<
       "Pointers should be aligned to a multiple of 64.";
@@ -244,7 +247,9 @@ void IntgemmFullyConnectedOpForwardCPU(const nnvm::NodeAttrs& attrs,
       // int32
       CHECK_EQ(reinterpret_cast<size_t>(inputs[indices.bias].dptr<int32_t>()) % 64, 0) <<
         "Pointers should be aligned to a multiple of 64.";
-      ::intgemm::callbacks::AddBiasAndWrite cb(inputs[indices.bias].dptr<int32_t>(), C.dptr<int32_t>());
+      ::intgemm::callbacks::AddBiasAndWrite cb(
+          inputs[indices.bias].dptr<int32_t>(),
+          C.dptr<int32_t>());
       ::intgemm::Int8::Multiply(A_quant, B_quant, A_rows, inner, B_cols, cb);
     }
   } else {

From a3fa6a049ded114a2220952d636b02c12a8f4845 Mon Sep 17 00:00:00 2001
From: Kenneth Heafield
Date: Mon, 17 Aug 2020 09:19:31 +0000
Subject: [PATCH 48/65] Update intgemm to remove MSVC warnings

---
 3rdparty/intgemm | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/3rdparty/intgemm b/3rdparty/intgemm
index 1ba086e1e380..fdbf2df22648 160000
--- a/3rdparty/intgemm
+++ b/3rdparty/intgemm
@@ -1 +1 @@
-Subproject commit 1ba086e1e380269471b1d023a4f0781afb6bf7ca
+Subproject commit fdbf2df226482f383c5e629bb588121dc5e1dfb6

From c0d93db27a961926721bd490f8559f2be69f4258 Mon Sep 17 00:00:00 2001
From: Kenneth Heafield
Date: Mon, 17 Aug 2020 09:24:50 +0000
Subject: [PATCH 49/65] Also allow intgemm without MKLDNN to have 64-byte
 alignment

---
 include/mxnet/base.h                  | 2 +-
 src/storage/storage_manager_helpers.h | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/include/mxnet/base.h b/include/mxnet/base.h
index aa0021d543a0..addd7665f5be 100644
--- a/include/mxnet/base.h
+++ b/include/mxnet/base.h
@@ -539,7 +539,7 @@ inline std::ostream& operator<<(std::ostream &out, const Context &ctx) {
 #define ADD_FILELINE "\n\nDefined in " __FILE__ ":L" STRINGIZE(__LINE__)
 
-#if MXNET_USE_MKLDNN == 1
+#if MXNET_USE_MKLDNN == 1 || MXNET_USE_INTGEMM == 1
 constexpr size_t kMKLDNNAlign = 64;
 #endif
 
diff --git a/src/storage/storage_manager_helpers.h b/src/storage/storage_manager_helpers.h
index 1fccb5a08f45..829076d1475c 100644
--- a/src/storage/storage_manager_helpers.h
+++ b/src/storage/storage_manager_helpers.h
@@ -120,7 +120,7 @@ class ContextHelperCPU : public ContextHelper {
   }
 
  private:
-#if MXNET_USE_MKLDNN == 1
+#if MXNET_USE_MKLDNN == 1 || MXNET_USE_INTGEMM == 1
   // MKLDNN requires special alignment. 64 is used by the MKLDNN library in
   // memory allocation.
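   // intgemm's kernels likewise check for 64-byte (512-bit vector) aligned
   // pointers, so the same constant safely serves both libraries.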
   static constexpr size_t alignment_ = kMKLDNNAlign;

From 5d6279a491e9365daa649c411839a127d1187705 Mon Sep 17 00:00:00 2001
From: Kenneth Heafield
Date: Mon, 17 Aug 2020 11:33:45 +0000
Subject: [PATCH 50/65] Pass clang lint

---
 3rdparty/intgemm                                           | 2 +-
 src/operator/contrib/intgemm/intgemm_fully_connected_op.cc | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/3rdparty/intgemm b/3rdparty/intgemm
index fdbf2df22648..c23fcde6afd5 160000
--- a/3rdparty/intgemm
+++ b/3rdparty/intgemm
@@ -1 +1 @@
-Subproject commit fdbf2df226482f383c5e629bb588121dc5e1dfb6
+Subproject commit c23fcde6afd5540de2d1fa7c67ce10ca99527654
diff --git a/src/operator/contrib/intgemm/intgemm_fully_connected_op.cc b/src/operator/contrib/intgemm/intgemm_fully_connected_op.cc
index 8e5434a90081..fbb1d23f538a 100644
--- a/src/operator/contrib/intgemm/intgemm_fully_connected_op.cc
+++ b/src/operator/contrib/intgemm/intgemm_fully_connected_op.cc
@@ -291,10 +291,10 @@ out_type: type of the output.
   std::vector<std::string> ret{"data", "weight"};
   ParameterIndices indices(nnvm::get<IntgemmFullyConnectedParam>(attrs.parsed));
   if (indices.HaveScaling()) {
-    ret.push_back("scaling");
+    ret.emplace_back("scaling");
   }
   if (indices.HaveBias()) {
-    ret.push_back("bias");
+    ret.emplace_back("bias");
   }
   return ret;
 })

From 38424013c6e75724b8f42cf83476174a1bc33c23 Mon Sep 17 00:00:00 2001
From: Kenneth Heafield
Date: Mon, 17 Aug 2020 16:04:39 +0000
Subject: [PATCH 51/65] Mention intgemm is MIT

---
 LICENSE | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/LICENSE b/LICENSE
index 9aa20d166394..4a8f8dd5e6e8 100644
--- a/LICENSE
+++ b/LICENSE
@@ -309,6 +309,8 @@ Licensed MIT © Zeno Rocha
     11. mx-theme - For details, see docs/python_docs/themes/mx-theme/LICENSE
         Copyright (c) 2016 myyasuda
+    12. intgemm - Refer to 3rdparty/intgemm/LICENSE
+        Copyright (c) 2017--2019 University of Edinburgh, Nikolay Bogoychev, Mateusz Chudyk, Kenneth Heafield, and Microsoft Corporation
 
 =======================================================================================

From de3c19d3844c4ba69b853cfbcf87fe475f3a07e5 Mon Sep 17 00:00:00 2001
From: Kenneth Heafield
Date: Mon, 17 Aug 2020 23:14:37 +0000
Subject: [PATCH 52/65] Slight fix for compilers without AVX512BW support

---
 3rdparty/intgemm | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/3rdparty/intgemm b/3rdparty/intgemm
index c23fcde6afd5..0f05c3ebd037 160000
--- a/3rdparty/intgemm
+++ b/3rdparty/intgemm
@@ -1 +1 @@
-Subproject commit c23fcde6afd5540de2d1fa7c67ce10ca99527654
+Subproject commit 0f05c3ebd037eacdf8cff165736fea2b0d125023

From 98588da71af0edd37da1877da383404da8bb6cff Mon Sep 17 00:00:00 2001
From: Kenneth Heafield
Date: Mon, 17 Aug 2020 23:14:56 +0000
Subject: [PATCH 53/65] Fix flaky test whereby 0.5 could round either way

---
 tests/python/unittest/test_contrib_intgemm.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/tests/python/unittest/test_contrib_intgemm.py b/tests/python/unittest/test_contrib_intgemm.py
index 8f3afb8411ca..53f1a398d2c4 100644
--- a/tests/python/unittest/test_contrib_intgemm.py
+++ b/tests/python/unittest/test_contrib_intgemm.py
@@ -47,6 +47,10 @@ def test_contrib_intgemm_prepare_data():
     for shape in shapes:
         for max_quant in [2.0]:#, 1.0, 3.0]:
             m = mx.nd.random_uniform(low=-3.0, high=3.0, shape=shape)
+            scaled = m * 127.0 / max_quant
+            # Rounding 0.5 can go up or down. Move values away from 0.5.
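+            # For example, 2.5 can land on 2 under round-half-to-even but on
+            # 3 under round-half-away-from-zero.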
+            too_close = mx.nd.abs(mx.nd.round(scaled) - scaled) > 0.45
+            m += max_quant / 127.0 * 0.05 * too_close
             test = mx.nd.contrib.intgemm_prepare_data(m, mx.nd.array([max_quant]))
             # Reference: scale and round
             ref = mx.nd.round(m * 127.0 / max_quant)

From a436cbd11c65239b84a6a0c2c43a5ca2d98dba2d Mon Sep 17 00:00:00 2001
From: Kenneth Heafield
Date: Mon, 24 Aug 2020 10:22:23 +0000
Subject: [PATCH 54/65] Add npx aliases

---
 src/operator/contrib/intgemm/intgemm_fully_connected_op.cc | 1 +
 src/operator/contrib/intgemm/max_absolute_op.cc            | 1 +
 src/operator/contrib/intgemm/prepare_data_op.cc            | 1 +
 src/operator/contrib/intgemm/prepare_weight_op.cc          | 1 +
 src/operator/contrib/intgemm/take_weight_op.cc             | 1 +
 5 files changed, 5 insertions(+)

diff --git a/src/operator/contrib/intgemm/intgemm_fully_connected_op.cc b/src/operator/contrib/intgemm/intgemm_fully_connected_op.cc
index fbb1d23f538a..3c5c3124fa07 100644
--- a/src/operator/contrib/intgemm/intgemm_fully_connected_op.cc
+++ b/src/operator/contrib/intgemm/intgemm_fully_connected_op.cc
@@ -265,6 +265,7 @@ void IntgemmFullyConnectedOpForwardCPU(const nnvm::NodeAttrs& attrs,
 }
 
 NNVM_REGISTER_OP(_contrib_intgemm_fully_connected)
+.add_alias("_npx_intgemm_fully_connected")
 .describe(R"code(Multiply matrices using 8-bit integers. data * weight.
 
 Input tensor arguments are: data weight [scaling] [bias]
diff --git a/src/operator/contrib/intgemm/max_absolute_op.cc b/src/operator/contrib/intgemm/max_absolute_op.cc
index 5bbd48ff05e1..eab17302c66a 100644
--- a/src/operator/contrib/intgemm/max_absolute_op.cc
+++ b/src/operator/contrib/intgemm/max_absolute_op.cc
@@ -95,6 +95,7 @@ void MaxAbsoluteOpForwardCPU(const nnvm::NodeAttrs& attrs,
 }
 
 NNVM_REGISTER_OP(_contrib_intgemm_maxabsolute)
+.add_alias("_npx_intgemm_maxabsolute")
 .describe(R"code(Compute the maximum absolute value in a tensor of float32 fast on a CPU. The tensor's total size must be a multiple of 16 and aligned to a multiple of 64 bytes.
 mxnet.nd.contrib.intgemm_maxabsolute(arr) == arr.abs().max()
 )code" ADD_FILELINE)
diff --git a/src/operator/contrib/intgemm/prepare_data_op.cc b/src/operator/contrib/intgemm/prepare_data_op.cc
index 9be491c951f7..978c64d7dc2d 100644
--- a/src/operator/contrib/intgemm/prepare_data_op.cc
+++ b/src/operator/contrib/intgemm/prepare_data_op.cc
@@ -103,6 +103,7 @@ void PrepareDataOpForwardCPU(const nnvm::NodeAttrs& attrs,
 }
 
 NNVM_REGISTER_OP(_contrib_intgemm_prepare_data)
+.add_alias("_npx_intgemm_prepare_data")
 .describe(R"code(This operator quantizes float32 to int8 while also banning -128.
 
 It is suitable for preparing a data matrix for use by intgemm's C=data * weights operation.
diff --git a/src/operator/contrib/intgemm/prepare_weight_op.cc b/src/operator/contrib/intgemm/prepare_weight_op.cc
index 68790c30f70a..5de97415783c 100644
--- a/src/operator/contrib/intgemm/prepare_weight_op.cc
+++ b/src/operator/contrib/intgemm/prepare_weight_op.cc
@@ -142,6 +142,7 @@ void PrepareWeightOpForwardCPU(const nnvm::NodeAttrs& attrs,
 }
 
 NNVM_REGISTER_OP(_contrib_intgemm_prepare_weight)
+.add_alias("_npx_intgemm_prepare_weight")
 .describe(R"code(This operator converts a weight matrix in column-major format to intgemm's internal fast representation of weight matrices. MXNet customarily stores weight matrices in column-major (transposed) format. This operator is not meant to be fast; it is meant to be run offline to quantize a model.
 
 In other words, it prepares weight for the operation C = data * weight^T.
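
For reference, the offline preparation flow these aliases expose looks
roughly like the following sketch (array names are illustrative, not part of
the patch):

    maxabs = npx.intgemm_maxabsolute(weight_f32)
    weight_q = npx.intgemm_prepare_weight(weight_f32, maxabs)
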
diff --git a/src/operator/contrib/intgemm/take_weight_op.cc b/src/operator/contrib/intgemm/take_weight_op.cc
index 2cb87c372c8a..c94ecbceb5b3 100644
--- a/src/operator/contrib/intgemm/take_weight_op.cc
+++ b/src/operator/contrib/intgemm/take_weight_op.cc
@@ -122,6 +122,7 @@ void TakeWeightOpForwardCPU(const nnvm::NodeAttrs& attrs,
 }
 
 NNVM_REGISTER_OP(_contrib_intgemm_take_weight)
+.add_alias("_npx_intgemm_take_weight")
 .describe(R"code(Index a weight matrix stored in intgemm's weight format.
 The indices select the outputs of matrix multiplication, not the inner dot product dimension.
 )code" ADD_FILELINE)

From 8e6739b909870de15a6b7759037e66bc78ed715e Mon Sep 17 00:00:00 2001
From: Kenneth Heafield
Date: Mon, 24 Aug 2020 18:14:41 +0000
Subject: [PATCH 55/65] Update tests to support numpy, refactor to
 pytest.mark.parametrize

---
 tests/python/unittest/test_contrib_intgemm.py | 300 +++++++++---------
 1 file changed, 157 insertions(+), 143 deletions(-)

diff --git a/tests/python/unittest/test_contrib_intgemm.py b/tests/python/unittest/test_contrib_intgemm.py
index 53f1a398d2c4..a6c28fdc7448 100644
--- a/tests/python/unittest/test_contrib_intgemm.py
+++ b/tests/python/unittest/test_contrib_intgemm.py
@@ -16,190 +16,204 @@
 # under the License.
 
 import mxnet as mx
-from mxnet.test_utils import same
+from mxnet import np, npx
+from mxnet.test_utils import same, use_np, assert_almost_equal
 from common import with_seed
 import random
+import pytest
 
+@use_np
 @with_seed()
-def test_contrib_intgemm_maxabsolute():
+@pytest.mark.parametrize('shape',
+    [(3, 2), (9,17), (2, 7, 1, 8)] + [(i,) for i in range(1,65)])
+def test_contrib_intgemm_maxabsolute(shape):
     if "intgemm_maxabsolute" not in dir(mx.nd.contrib):
         return
-    shapes = [
-        (3, 2),
-        (9, 17),
-        (2, 7, 1, 8),
-    ]
-    # Test all sizes relevant to register lengths too.
-    for i in range(1, 65):
-        shapes.append((i,))
-    for shape in shapes:
-        m = mx.nd.random_uniform(low=-100.0, high=100.0, shape=shape)
-        fast = mx.nd.contrib.intgemm_maxabsolute(m)
-        slow = mx.nd.max(mx.nd.abs(m))
-        assert same(fast, slow)
+    # mx.nd API
+    m = mx.nd.random_uniform(low=-100.0, high=100.0, shape=shape)
+    fast = mx.nd.contrib.intgemm_maxabsolute(m)
+    slow = mx.nd.max(mx.nd.abs(m))
+    assert same(fast, slow)
+    # np API
+    m = np.random.uniform(low=-100.0, high=100.0, size=shape)
+    fast = npx.intgemm_maxabsolute(m).reshape(())
+    slow = np.max(np.abs(m))
+    assert same(fast, slow)
 
+@use_np
 @with_seed()
-def test_contrib_intgemm_prepare_data():
+@pytest.mark.parametrize('shape', [(i,) for i in range(1, 67)] + [(2,3), (130, 12)])
+@pytest.mark.parametrize('max_quant', [2.0])
+def test_contrib_intgemm_prepare_data(shape, max_quant):
     if "intgemm_prepare_data" not in dir(mx.nd.contrib):
         return
-    # Try all weird overhang cases
-    shapes = [(i,) for i in range(1, 67)] + [(2,3), (130, 12)]
-    for shape in shapes:
-        for max_quant in [2.0]:#, 1.0, 3.0]:
-            m = mx.nd.random_uniform(low=-3.0, high=3.0, shape=shape)
-            scaled = m * 127.0 / max_quant
-            # Rounding 0.5 can go up or down. Move values away from 0.5.
-            too_close = mx.nd.abs(mx.nd.round(scaled) - scaled) > 0.45
-            m += max_quant / 127.0 * 0.05 * too_close
-            test = mx.nd.contrib.intgemm_prepare_data(m, mx.nd.array([max_quant]))
-            # Reference: scale and round
-            ref = mx.nd.round(m * 127.0 / max_quant)
-            # Clip to [-127, 127]. Because otherwise e.g. -129 casts to +127.
-            ref = mx.nd.broadcast_maximum(ref, mx.nd.array([-127.0]))
-            ref = mx.nd.broadcast_minimum(ref, mx.nd.array([127.0]))
-            # Reference: cast to int8
-            ref = mx.nd.cast(ref, dtype='int8')
-            # Reference: ban -128
-            ref = mx.nd.broadcast_maximum(ref, mx.nd.array([-127], dtype = 'int8'))
-            assert same(test, ref)
+    m = mx.nd.random_uniform(low=-3.0, high=3.0, shape=shape)
+    scaled = m * 127.0 / max_quant
+    # Rounding 0.5 can go up or down. Move values away from 0.5.
+    too_close = mx.nd.abs(mx.nd.round(scaled) - scaled) > 0.45
+    m += max_quant / 127.0 * 0.05 * too_close
+
+    # Reference: scale and round
+    ref = mx.nd.round(m * 127.0 / max_quant)
+    # Clip to [-127, 127]. Because otherwise e.g. -129 casts to +127.
+    ref = mx.nd.broadcast_maximum(ref, mx.nd.array([-127.0]))
+    ref = mx.nd.broadcast_minimum(ref, mx.nd.array([127.0]))
+    # Reference: cast to int8
+    ref = mx.nd.cast(ref, dtype='int8')
+    # Reference: ban -128
+    ref = mx.nd.broadcast_maximum(ref, mx.nd.array([-127], dtype = 'int8'))
+
+    test = mx.nd.contrib.intgemm_prepare_data(m, mx.nd.array([max_quant]))
+    assert same(test, ref)
+    test = npx.intgemm_prepare_data(m.as_np_ndarray(), np.array([max_quant]))
+    assert same(test, ref.as_np_ndarray())
 
+@use_np
 @with_seed()
-def test_contrib_intgemm_weight_consistent():
+@pytest.mark.parametrize('shape', [(8, 64), (16, 64), (8, 128), (16, 128), (2, 4, 64)])
+@pytest.mark.parametrize('max_quant', [0.2, 3.0])
+@pytest.mark.parametrize('api', [(mx.nd.contrib, mx.nd), (npx, np)])
+def test_contrib_intgemm_weight_consistent(shape, max_quant, api):
     # The weight format is actually CPU-dependent so we don't directly test the
-    # output, but indirectly that it works.
+    # output, but indirectly test that it works.
     if "intgemm_prepare_weight" not in dir(mx.nd.contrib):
         return
-    max_quant = mx.nd.array([2.0])
-    for shape in [(8, 64), (16, 64), (8, 128), (16, 128), (2, 4, 64)]:
-        m = mx.nd.random_uniform(low=-3.0, high=3.0, shape=shape)
-        direct = mx.nd.contrib.intgemm_prepare_weight(m, max_quant)
-        quant = mx.nd.contrib.intgemm_prepare_data(m, max_quant)
-        indirect = mx.nd.contrib.intgemm_prepare_weight(quant, already_quantized=True)
-        #Should get the same data from direct call and already_quantized version.
-        assert same(direct, indirect)
+    contrib, top = api
+    max_array = top.array([max_quant])
+    if top == mx.nd:
+        m = top.random_uniform(low=-3.0, high=3.0, shape=shape)
+    else:
+        m = np.random.uniform(size=shape)
+    direct = contrib.intgemm_prepare_weight(m, max_array)
+    quant = contrib.intgemm_prepare_data(m, max_array)
+    indirect = contrib.intgemm_prepare_weight(quant, already_quantized=True)
+    # Should get the same data from direct call and already_quantized version.
+    assert same(direct, indirect)
 
+@use_np
 @with_seed()
-def test_contrib_intgemm_take_weight():
-    if "intgemm_take_weight" not in dir(mx.nd.contrib):
-        return
-    indices_to_try = [
+@pytest.mark.parametrize('indices', [
     [0,1,2,3,4,5,6,7],
     [1,2,1,2,1,2,1,2],
     [7,6,5,4,3,2,1,0],
     [3,1,4,1,5,9,2,6],
+    # Since random_uniform doesn't support int8, use python
     [random.randint(0,15) for i in range(8)],
     [random.randint(0,15) for i in range(16)],
    [random.randint(0,15) for i in range(24)]
-    ]
-    # Since random_uniform doesn't support int8, use python
-    m = mx.nd.array([random.randint(-127,127) for i in range(16 * 64)], dtype='int8')
+    ])
+@pytest.mark.parametrize('api', [(mx.nd.contrib, mx.nd), (npx, np)])
+def test_contrib_intgemm_take_weight(indices, api):
+    if "intgemm_take_weight" not in dir(mx.nd.contrib):
+        return
+    contrib, top = api
+    m = top.array([random.randint(-127,127) for i in range(16 * 64)], dtype='int8')
     m = m.reshape((16, 64))
-    for indices in indices_to_try:
-        indices = mx.nd.array(indices, dtype='int32')
-        # Prepare weight then take.
-        test = mx.nd.contrib.intgemm_prepare_weight(m, already_quantized=True)
-        test = mx.nd.contrib.intgemm_take_weight(test, indices)
-        # Take then prepare.
-        ref = m.take(indices, axis=0)
-        ref = mx.nd.contrib.intgemm_prepare_weight(ref, already_quantized=True)
-        assert same(test, ref)
+    indices = top.array(indices, dtype='int32')
+    # Prepare weight then take.
+    test = contrib.intgemm_prepare_weight(m, already_quantized=True)
+    test = contrib.intgemm_take_weight(test, indices)
+    # Take then prepare.
+    ref = m.take(indices, axis=0)
+    ref = contrib.intgemm_prepare_weight(ref, already_quantized=True)
+    assert same(test, ref)
 
+@use_np
+@pytest.mark.parametrize('data_rows', range(1, 5))
+@pytest.mark.parametrize('inner', range(64, 256, 64))
+@pytest.mark.parametrize('weight_cols', range(8, 24, 8))
+@pytest.mark.parametrize('api', [
+    (mx.nd.contrib, mx.nd, mx.nd.FullyConnected, mx.nd.cast),
+    (npx, np, npx.fully_connected, npx.cast)])
+def test_contrib_intgemm_multiply(data_rows, inner, weight_cols, api):
+    if "intgemm_fully_connected" not in dir(mx.nd.contrib):
+        return
+    contrib, top, fully_connected, cast = api
+    #The multiplication routine has approximations so everything is tested
+    #deterministically to ensure bounds are met.
+    random.seed(1)
 
-# Test a particular shape of matrix multiplication.
-def single_multiply_shape(data_rows, inner, weight_cols):
     # Don't use full range (-127, 127) to avoid saturation.
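     # (intgemm accumulates pairs of int8 products in int16 before widening;
     # keeping |values| <= 64 keeps those pairwise sums comfortably in range.)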
     data = [random.randint(-64, 64) for i in range(data_rows * inner)]
-    data = mx.nd.array(data, dtype='int8').reshape((data_rows, inner))
+    data = top.array(data, dtype='int8').reshape((data_rows, inner))
     weight = [random.randint(-64, 64) for i in range(inner * weight_cols)]
-    weight = mx.nd.array(weight, dtype='int8').reshape((weight_cols, inner))
-    weight_prepared = mx.nd.contrib.intgemm_prepare_weight(weight, already_quantized=True)
+    weight = top.array(weight, dtype='int8').reshape((weight_cols, inner))
+    weight_prepared = contrib.intgemm_prepare_weight(weight, already_quantized=True)
 
     # int32 output, no bias
-    test = mx.nd.contrib.intgemm_fully_connected(data,
-                                                 weight_prepared,
-                                                 no_bias=True,
-                                                 flatten=False,
-                                                 out_type='int32',
-                                                 num_hidden=weight_cols)
-    ref = mx.nd.FullyConnected(mx.nd.cast(data, dtype='float32'),
-                               mx.nd.cast(weight, dtype='float32'),
-                               no_bias=True,
-                               flatten=False,
-                               num_hidden=weight_cols)
-    assert (mx.nd.cast(test, dtype='float32') - ref).norm().asscalar() < 0.01
+    test = contrib.intgemm_fully_connected(data,
+                                           weight_prepared,
+                                           no_bias=True,
+                                           flatten=False,
+                                           out_type='int32',
+                                           num_hidden=weight_cols)
+    ref = fully_connected(cast(data, dtype='float32'),
+                          cast(weight, dtype='float32'),
+                          no_bias=True,
+                          flatten=False,
+                          num_hidden=weight_cols)
+    assert_almost_equal(cast(test, dtype='float32'), ref, rtol=0.01, atol=0.01)
 
     # float32 output, no bias
     scale = 3.0
-    test = mx.nd.contrib.intgemm_fully_connected(data,
-                                                 weight_prepared,
-                                                 mx.nd.array(scale),
-                                                 no_bias=True,
-                                                 flatten=False,
-                                                 out_type='float32',
-                                                 num_hidden=weight_cols)
-    assert (test - ref * scale).norm().asscalar() < 0.01
+    test = contrib.intgemm_fully_connected(data,
+                                           weight_prepared,
+                                           top.array([scale]),
+                                           no_bias=True,
+                                           flatten=False,
+                                           out_type='float32',
+                                           num_hidden=weight_cols)
+    assert_almost_equal(test, ref * scale, rtol=0.01, atol=0.01)
 
     # int32 output, bias
-    bias = mx.nd.array([random.randint(-60000, 60000) for i in range(weight_cols)], dtype = 'int32')
-    test = mx.nd.contrib.intgemm_fully_connected(data,
-                                                 weight_prepared,
-                                                 bias,
-                                                 no_bias=False,
-                                                 flatten=False,
-                                                 out_type='int32',
-                                                 num_hidden=weight_cols)
-    ref = mx.nd.FullyConnected(mx.nd.cast(data, dtype='float32'),
-                               mx.nd.cast(weight, dtype='float32'),
-                               mx.nd.cast(bias, dtype='float32'),
+    bias = top.array([random.randint(-60000, 60000) for i in range(weight_cols)], dtype = 'int32')
+    test = contrib.intgemm_fully_connected(data,
+                                           weight_prepared,
+                                           bias,
+                                           no_bias=False,
+                                           flatten=False,
+                                           out_type='int32',
+                                           num_hidden=weight_cols)
+    ref = fully_connected(cast(data, dtype='float32'),
+                          cast(weight, dtype='float32'),
+                          cast(bias, dtype='float32'),
                           no_bias=False,
                           flatten=False,
                           num_hidden=weight_cols)
-    assert (mx.nd.cast(test, dtype='float32') - ref).norm().asscalar() < 0.01
+    assert_almost_equal(cast(test, dtype='float32'), ref, rtol=0.01, atol=0.01)
 
     # float32 output, bias
     # Scaling is applied before bias (and bias is not scaled). So to make the
     # reference comparison easy, just scale the bias beforehand.
-    test = mx.nd.contrib.intgemm_fully_connected(data,
-                                                 weight_prepared,
-                                                 mx.nd.array(scale),
-                                                 mx.nd.cast(bias, dtype='float32') * scale,
-                                                 no_bias=False,
-                                                 flatten=False,
-                                                 out_type='float32',
-                                                 num_hidden=weight_cols)
-    assert (test - ref * scale).norm().asscalar() < 0.01
+    test = contrib.intgemm_fully_connected(data,
+                                           weight_prepared,
+                                           top.array([scale]),
+                                           cast(bias, dtype='float32') * scale,
+                                           no_bias=False,
+                                           flatten=False,
+                                           out_type='float32',
+                                           num_hidden=weight_cols)
+    assert_almost_equal(test, ref * scale, rtol=0.01, atol=0.01)
 
     # float32 input should work the same as manually prepared int8 input.
-    data_float = mx.nd.array([random.uniform(-3.14, 3.14) for i in range(data_rows * inner)])
+    data_float = top.array([random.uniform(-3.14, 3.14) for i in range(data_rows * inner)])
     data_float = data_float.reshape(data_rows, inner)
-    direct = mx.nd.contrib.intgemm_fully_connected(data_float,
-                                                   weight_prepared,
-                                                   mx.nd.array(scale),
-                                                   mx.nd.cast(bias, dtype='float32'),
-                                                   no_bias=False,
-                                                   flatten=False,
-                                                   out_type='float32',
-                                                   num_hidden=weight_cols)
-    maxabs = mx.nd.contrib.intgemm_maxabsolute(data_float)
-    data_prepared = mx.nd.contrib.intgemm_prepare_data(data_float, maxabs)
-    cooked = mx.nd.contrib.intgemm_fully_connected(data_prepared,
-                                                   weight_prepared,
-                                                   mx.nd.array(scale * maxabs / 127.0),
-                                                   mx.nd.cast(bias, dtype='float32'),
-                                                   no_bias=False,
-                                                   flatten=False,
-                                                   out_type='float32',
-                                                   num_hidden=weight_cols)
-    assert (direct - cooked).norm().asscalar() < 0.01
-
-
-def test_contrib_intgemm_multiply():
-    if "intgemm_fully_connected" not in dir(mx.nd.contrib):
-        return
-    #The multiplication routine has approximations so everything is tested
-    #deterministically to ensure bounds are met.
-    random.seed(1)
-    for data_rows in range(1, 5):
-        for inner in range(64, 256, 64):
-            for weight_cols in range(8, 24, 8):
-                single_multiply_shape(data_rows, inner, weight_cols)
-
+    direct = contrib.intgemm_fully_connected(data_float,
+                                             weight_prepared,
+                                             top.array([scale]),
+                                             cast(bias, dtype='float32'),
+                                             no_bias=False,
+                                             flatten=False,
+                                             out_type='float32',
+                                             num_hidden=weight_cols)
+    maxabs = contrib.intgemm_maxabsolute(data_float)
+    data_prepared = contrib.intgemm_prepare_data(data_float, maxabs)
+    cooked = contrib.intgemm_fully_connected(data_prepared,
+                                             weight_prepared,
+                                             top.array(scale * maxabs / 127.0),
+                                             cast(bias, dtype='float32'),
+                                             no_bias=False,
+                                             flatten=False,
+                                             out_type='float32',
+                                             num_hidden=weight_cols)
+    assert_almost_equal(direct, cooked, rtol=0.01, atol=0.01)

From 29cc97022b10fbcb33f3955d2a8d1d686cc69440 Mon Sep 17 00:00:00 2001
From: Kenneth Heafield
Date: Mon, 24 Aug 2020 18:15:33 +0000
Subject: [PATCH 56/65] Remove transpose

---
 .../contrib/intgemm/prepare_weight_op.cc | 17 ++++++++---------
 1 file changed, 8 insertions(+), 9 deletions(-)

diff --git a/src/operator/contrib/intgemm/prepare_weight_op.cc b/src/operator/contrib/intgemm/prepare_weight_op.cc
index 5de97415783c..76054386497b 100644
--- a/src/operator/contrib/intgemm/prepare_weight_op.cc
+++ b/src/operator/contrib/intgemm/prepare_weight_op.cc
@@ -118,22 +118,21 @@ void PrepareWeightOpForwardCPU(const nnvm::NodeAttrs& attrs,
     "weight matrix) to be a multiple of " << ::intgemm::Int8::tile_info.b_cols << ".";
   int8_t *quantB = out.dptr<int8_t>();
+  CHECK_EQ(reinterpret_cast<size_t>(quantB) % 64, 0) <<
+    "Pointers should be aligned to a multiple of 64.";
   CHECK(in.type_flag_ == mshadow::kFloat32 || in.type_flag_ == mshadow::kInt8) <<
     "Expected either 32-bit values to be quantized or 8-bit values to rearrange.";
   if (in.type_flag_ == mshadow::kInt8) {
     const int8_t *B = in.dptr<int8_t>();
+    CHECK_EQ(reinterpret_cast<size_t>(B) % 64, 0) <<
+      "Pointers should be aligned to a multiple of 64.";
     ::intgemm::Int8::PrepareBQuantizedTransposed(B, quantB, inner, B_cols);
   } else if (in.type_flag_ == mshadow::kFloat32) {
     const float *B = in.dptr<float>();
-    // TODO(kpuatamazon): eliminate transpose here with https://github.com/kpu/intgemm/pull/56
-    intgemm::AlignedVector<float> B_transpose(inner * B_cols);
-    for (size_t i = 0; i < inner; ++i) {
-      for (size_t j = 0; j < B_cols; ++j) {
-        B_transpose[i * B_cols + j] = B[i + inner * j];
-      }
-    }
-    ::intgemm::Int8::PrepareB(
-      B_transpose.begin(),
+    CHECK_EQ(reinterpret_cast<size_t>(B) % 64, 0) <<
+      "Pointers should be aligned to a multiple of 64.";
+    ::intgemm::Int8::PrepareBTransposed(
+      B,
       quantB,
       127.0 / *inputs[1].dptr<float>(),
       inner,

From d03342adbe0d68528b3e8e26c8517b4fbdb0eec9 Mon Sep 17 00:00:00 2001
From: Kenneth Heafield
Date: Fri, 28 Aug 2020 11:49:43 +0000
Subject: [PATCH 57/65] gcc7 is already required. You don't need any special
 handling here.

---
 CMakeLists.txt | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 6b835db3a1ee..a2ca899462ed 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -64,9 +64,7 @@ if(USE_MKL_IF_AVAILABLE AND (NOT APPLE) AND (NOT MSVC) AND (CMAKE_HOST_SYSTEM_PR
 else()
   option(USE_MKLDNN "Build with MKL-DNN support" OFF)
 endif()
-#gcc 4 doesn't support AVX2 and SSSE3 support doesn't work with target attributes so ban gcc < 5 from intgemm.
-if ((CMAKE_HOST_SYSTEM_PROCESSOR STREQUAL "x86_64") AND (NOT CMAKE_CROSSCOMPILING) AND
-    ((NOT CMAKE_COMPILER_IS_GNUCC) OR (CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 5.0)))
+if ((CMAKE_HOST_SYSTEM_PROCESSOR STREQUAL "x86_64") AND (NOT CMAKE_CROSSCOMPILING))
   option(USE_INTGEMM "Build with x86 intgemm library for low-precision multiplication" ON)
 else()
   option(USE_INTGEMM "Build with x86 intgemm library for low-precision multiplication" OFF)

From d7a8ef4c6afcd0be7651104a0e418a269bc13f7e Mon Sep 17 00:00:00 2001
From: Kenneth Heafield
Date: Fri, 28 Aug 2020 14:16:56 +0000
Subject: [PATCH 58/65] EXCLUDE_FROM_ALL

---
 CMakeLists.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index a2ca899462ed..e52e198bbf43 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -285,7 +285,7 @@ endif()
 
 if(USE_INTGEMM)
   message(STATUS "Using intgemm")
-  add_subdirectory(3rdparty/intgemm)
+  add_subdirectory(3rdparty/intgemm EXCLUDE_FROM_ALL)
   include_directories(3rdparty/intgemm)
   #intgemm generates a config header based on AVX512 support in the compiler.
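   # (the generated header lands in the build tree, hence the binary-dir include below)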
   include_directories(${CMAKE_CURRENT_BINARY_DIR}/3rdparty/intgemm)

From a5a441ea3e63b97564e560fcebd596ea57fbd6a3 Mon Sep 17 00:00:00 2001
From: Kenneth Heafield
Date: Fri, 28 Aug 2020 14:51:04 +0000
Subject: [PATCH 59/65] Change to downloaded intgemm

---
 .gitmodules                                       |  3 ---
 3rdparty/intgemm                                  |  1 -
 CMakeLists.txt                                    | 18 ++++++++++++++----
 .../intgemm/intgemm_fully_connected_op.cc         |  3 +--
 .../contrib/intgemm/max_absolute_op.cc            |  2 +-
 .../contrib/intgemm/prepare_data_op.cc            |  2 +-
 .../contrib/intgemm/prepare_weight_op.cc          |  3 +--
 src/operator/contrib/intgemm/take_weight_op.cc    |  2 +-
 8 files changed, 19 insertions(+), 15 deletions(-)
 delete mode 160000 3rdparty/intgemm

diff --git a/.gitmodules b/.gitmodules
index dee98eabd157..b215ed430d14 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -25,6 +25,3 @@
 [submodule "3rdparty/nvidia_cub"]
 	path = 3rdparty/nvidia_cub
 	url = https://github.com/NVlabs/cub.git
-[submodule "3rdparty/intgemm"]
-	path = 3rdparty/intgemm
-	url = https://github.com/kpu/intgemm
diff --git a/3rdparty/intgemm b/3rdparty/intgemm
deleted file mode 160000
index 0f05c3ebd037..000000000000
--- a/3rdparty/intgemm
+++ /dev/null
@@ -1 +0,0 @@
-Subproject commit 0f05c3ebd037eacdf8cff165736fea2b0d125023
diff --git a/CMakeLists.txt b/CMakeLists.txt
index e52e198bbf43..e8f2601e491b 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -285,10 +285,20 @@ endif()
 
 if(USE_INTGEMM)
   message(STATUS "Using intgemm")
-  add_subdirectory(3rdparty/intgemm EXCLUDE_FROM_ALL)
-  include_directories(3rdparty/intgemm)
+  include(FetchContent)
+  FetchContent_Declare(
+    intgemm
+    GIT_REPOSITORY https://github.com/kpu/intgemm.git
+    GIT_TAG 0f05c3ebd037eacdf8cff165736fea2b0d125023
+  )
+  FetchContent_GetProperties(intgemm)
+  if(NOT intgemm_POPULATED)
+    FetchContent_Populate(intgemm)
+    add_subdirectory(${intgemm_SOURCE_DIR} ${intgemm_BINARY_DIR} EXCLUDE_FROM_ALL)
+  endif()
+  include_directories(${intgemm_SOURCE_DIR})
   #intgemm generates a config header based on AVX512 support in the compiler.
-  include_directories(${CMAKE_CURRENT_BINARY_DIR}/3rdparty/intgemm)
+  include_directories(${intgemm_BINARY_DIR})
   add_definitions(-DMXNET_USE_INTGEMM=1)
 endif()
@@ -489,7 +499,7 @@ FILE(GLOB_RECURSE SOURCE "src/*.cc" "src/*.h" "include/*.h")
 FILE(GLOB_RECURSE CUDA "src/*.cu" "src/*.cuh")
 
 if (USE_INTGEMM)
-  list(APPEND SOURCE "3rdparty/intgemm/intgemm.cc")
+  list(APPEND SOURCE "${intgemm_SOURCE_DIR}/intgemm.cc")
 else()
   FILE(GLOB_RECURSE INTGEMM_OPERATOR_SOURCE "src/operator/contrib/intgemm/*.cc" "src/operator/contrib/intgemm/*.h")
   list(REMOVE_ITEM SOURCE ${INTGEMM_OPERATOR_SOURCE})
diff --git a/src/operator/contrib/intgemm/intgemm_fully_connected_op.cc b/src/operator/contrib/intgemm/intgemm_fully_connected_op.cc
index 3c5c3124fa07..f092829e3d8e 100644
--- a/src/operator/contrib/intgemm/intgemm_fully_connected_op.cc
+++ b/src/operator/contrib/intgemm/intgemm_fully_connected_op.cc
@@ -30,8 +30,7 @@
 #include "../../operator_common.h"
 #include "../../tensor/init_op.h"
 
-#include "../../../../3rdparty/intgemm/aligned.h"
-#include "../../../../3rdparty/intgemm/intgemm.h"
+#include "intgemm.h"
 
 namespace mxnet {
 namespace op {
diff --git a/src/operator/contrib/intgemm/max_absolute_op.cc b/src/operator/contrib/intgemm/max_absolute_op.cc
index eab17302c66a..0f5ed2589622 100644
--- a/src/operator/contrib/intgemm/max_absolute_op.cc
+++ b/src/operator/contrib/intgemm/max_absolute_op.cc
@@ -29,7 +29,7 @@
 #include "../../operator_common.h"
 #include "../../tensor/init_op.h"
 
-#include "../../../../3rdparty/intgemm/intgemm.h"
+#include "intgemm.h"
 
 namespace mxnet {
 namespace op {
diff --git a/src/operator/contrib/intgemm/prepare_data_op.cc b/src/operator/contrib/intgemm/prepare_data_op.cc
index 978c64d7dc2d..ec1120448cf2 100644
--- a/src/operator/contrib/intgemm/prepare_data_op.cc
+++ b/src/operator/contrib/intgemm/prepare_data_op.cc
@@ -31,7 +31,7 @@
 #include "../../operator_common.h"
 #include "../../tensor/init_op.h"
 
-#include "../../../../3rdparty/intgemm/intgemm.h"
+#include "intgemm.h"
 
 namespace mxnet {
 namespace op {
diff --git a/src/operator/contrib/intgemm/prepare_weight_op.cc b/src/operator/contrib/intgemm/prepare_weight_op.cc
index 76054386497b..5425cac879a5 100644
--- a/src/operator/contrib/intgemm/prepare_weight_op.cc
+++ b/src/operator/contrib/intgemm/prepare_weight_op.cc
@@ -29,8 +29,7 @@
 #include "../../operator_common.h"
 #include "../../tensor/init_op.h"
 
-#include "../../../../3rdparty/intgemm/aligned.h"
-#include "../../../../3rdparty/intgemm/intgemm.h"
+#include "intgemm.h"
 
 namespace mxnet {
 namespace op {
diff --git a/src/operator/contrib/intgemm/take_weight_op.cc b/src/operator/contrib/intgemm/take_weight_op.cc
index c94ecbceb5b3..5c87e5b0a352 100644
--- a/src/operator/contrib/intgemm/take_weight_op.cc
+++ b/src/operator/contrib/intgemm/take_weight_op.cc
@@ -31,7 +31,7 @@
 #include "../../operator_common.h"
 #include "../../tensor/init_op.h"
 
-#include "../../../../3rdparty/intgemm/intgemm.h"
+#include "intgemm.h"
 
 namespace mxnet {
 namespace op {

From 33ad78208c911a5e9d770416266b0d3510767213 Mon Sep 17 00:00:00 2001
From: Kenneth Heafield
Date: Fri, 28 Aug 2020 16:29:17 +0000
Subject: [PATCH 60/65] Change intgemm.cc to linked library

---
 CMakeLists.txt | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index e8f2601e491b..8a5161be17a7 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -498,9 +498,7 @@ endif()
 FILE(GLOB_RECURSE SOURCE "src/*.cc" "src/*.h" "include/*.h")
 FILE(GLOB_RECURSE CUDA "src/*.cu" "src/*.cuh")
 
-if (USE_INTGEMM)
-  list(APPEND SOURCE "${intgemm_SOURCE_DIR}/intgemm.cc")
-else()
+if(NOT USE_INTGEMM)
   FILE(GLOB_RECURSE INTGEMM_OPERATOR_SOURCE "src/operator/contrib/intgemm/*.cc" "src/operator/contrib/intgemm/*.h")
   list(REMOVE_ITEM SOURCE ${INTGEMM_OPERATOR_SOURCE})
 endif()
@@ -781,6 +779,10 @@ if(USE_MKLDNN)
     ${CMAKE_BINARY_DIR}/3rdparty/mkldnn/include/dnnl_version.h
     ${CMAKE_SOURCE_DIR}/include/mkldnn/)
 endif()
+if(USE_INTGEMM)
+  target_link_libraries(mxnet PRIVATE intgemm)
+endif()
+
 function(BuildTVMOP)
   # scope the variables in BuildTVM.cmake to avoid conflict
   include(cmake/BuildTVM.cmake)

From 03732c7bee38d98b4997d138ea82adb1d455551d Mon Sep 17 00:00:00 2001
From: Kenneth Heafield
Date: Fri, 28 Aug 2020 18:06:47 +0000
Subject: [PATCH 61/65] Use target_link_libraries to pick up intgemm
 compilation test header

---
 CMakeLists.txt | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 8a5161be17a7..72ac584177c1 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -289,16 +289,14 @@ if(USE_INTGEMM)
   FetchContent_Declare(
     intgemm
     GIT_REPOSITORY https://github.com/kpu/intgemm.git
-    GIT_TAG 0f05c3ebd037eacdf8cff165736fea2b0d125023
+    GIT_TAG 2ea4d8681eb553bd5311ccfa740ce6b8c0b055e3
   )
   FetchContent_GetProperties(intgemm)
   if(NOT intgemm_POPULATED)
     FetchContent_Populate(intgemm)
-    add_subdirectory(${intgemm_SOURCE_DIR} ${intgemm_BINARY_DIR} EXCLUDE_FROM_ALL)
   endif()
+  add_subdirectory(${intgemm_SOURCE_DIR} ${intgemm_BINARY_DIR} EXCLUDE_FROM_ALL)
   include_directories(${intgemm_SOURCE_DIR})
-  #intgemm generates a config header based on AVX512 support in the compiler.
-  include_directories(${intgemm_BINARY_DIR})
   add_definitions(-DMXNET_USE_INTGEMM=1)
 endif()

From 8aaa23c5b1f1b3dbef78be229abdc83dbe96216d Mon Sep 17 00:00:00 2001
From: Kenneth Heafield
Date: Fri, 28 Aug 2020 18:35:58 +0000
Subject: [PATCH 62/65] Change to a cmake_dependent_option

---
 CMakeLists.txt | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 72ac584177c1..a6e73cdaa71a 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -64,11 +64,7 @@ if(USE_MKL_IF_AVAILABLE AND (NOT APPLE) AND (NOT MSVC) AND (CMAKE_HOST_SYSTEM_PR
 else()
   option(USE_MKLDNN "Build with MKL-DNN support" OFF)
 endif()
-if ((CMAKE_HOST_SYSTEM_PROCESSOR STREQUAL "x86_64") AND (NOT CMAKE_CROSSCOMPILING))
-  option(USE_INTGEMM "Build with x86 intgemm library for low-precision multiplication" ON)
-else()
-  option(USE_INTGEMM "Build with x86 intgemm library for low-precision multiplication" OFF)
-endif()
+cmake_dependent_option(USE_INTGEMM "Build with x86_64 intgemm library for low-precision multiplication" ON "CMAKE_SYSTEM_PROCESSOR STREQUAL x86_64" OFF)
 if(NOT MSVC)
   option(USE_OPERATOR_TUNING "Enable auto-tuning of operators" ON)
 else()

From 8ac7fe6e882046cae32b5884f2e5cd09bdff4f63 Mon Sep 17 00:00:00 2001
From: Kenneth Heafield
Date: Fri, 28 Aug 2020 18:58:36 +0000
Subject: [PATCH 63/65] Revert "Change to downloaded intgemm" and remove
 header reference from CMakeLists.txt

This reverts commit a5a441ea3e63b97564e560fcebd596ea57fbd6a3.
---
 .gitmodules                                       |  3 +++
 3rdparty/intgemm                                  |  1 +
 CMakeLists.txt                                    | 13 +------------
 .../contrib/intgemm/intgemm_fully_connected_op.cc |  3 ++-
 src/operator/contrib/intgemm/max_absolute_op.cc   |  2 +-
 src/operator/contrib/intgemm/prepare_data_op.cc   |  2 +-
 src/operator/contrib/intgemm/prepare_weight_op.cc |  3 ++-
 src/operator/contrib/intgemm/take_weight_op.cc    |  2 +-
 8 files changed, 12 insertions(+), 17 deletions(-)
 create mode 160000 3rdparty/intgemm

diff --git a/.gitmodules b/.gitmodules
index b215ed430d14..dee98eabd157 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -25,3 +25,6 @@
 [submodule "3rdparty/nvidia_cub"]
 	path = 3rdparty/nvidia_cub
 	url = https://github.com/NVlabs/cub.git
+[submodule "3rdparty/intgemm"]
+	path = 3rdparty/intgemm
+	url = https://github.com/kpu/intgemm
diff --git a/3rdparty/intgemm b/3rdparty/intgemm
new file mode 160000
index 000000000000..2ea4d8681eb5
--- /dev/null
+++ b/3rdparty/intgemm
@@ -0,0 +1 @@
+Subproject commit 2ea4d8681eb553bd5311ccfa740ce6b8c0b055e3
diff --git a/CMakeLists.txt b/CMakeLists.txt
index a6e73cdaa71a..577d121e5c9a 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -281,18 +281,7 @@ endif()
 
 if(USE_INTGEMM)
   message(STATUS "Using intgemm")
-  include(FetchContent)
-  FetchContent_Declare(
-    intgemm
-    GIT_REPOSITORY https://github.com/kpu/intgemm.git
-    GIT_TAG 2ea4d8681eb553bd5311ccfa740ce6b8c0b055e3
-  )
-  FetchContent_GetProperties(intgemm)
-  if(NOT intgemm_POPULATED)
-    FetchContent_Populate(intgemm)
-  endif()
-  add_subdirectory(${intgemm_SOURCE_DIR} ${intgemm_BINARY_DIR} EXCLUDE_FROM_ALL)
-  include_directories(${intgemm_SOURCE_DIR})
+  add_subdirectory(3rdparty/intgemm EXCLUDE_FROM_ALL)
   add_definitions(-DMXNET_USE_INTGEMM=1)
 endif()
diff --git a/src/operator/contrib/intgemm/intgemm_fully_connected_op.cc b/src/operator/contrib/intgemm/intgemm_fully_connected_op.cc
index f092829e3d8e..3c5c3124fa07 100644
--- a/src/operator/contrib/intgemm/intgemm_fully_connected_op.cc
+++ b/src/operator/contrib/intgemm/intgemm_fully_connected_op.cc
@@ -30,7 +30,8 @@
 #include "../../operator_common.h"
 #include "../../tensor/init_op.h"
 
-#include "intgemm.h"
+#include "../../../../3rdparty/intgemm/aligned.h"
+#include "../../../../3rdparty/intgemm/intgemm.h"
 
 namespace mxnet {
 namespace op {
diff --git a/src/operator/contrib/intgemm/max_absolute_op.cc b/src/operator/contrib/intgemm/max_absolute_op.cc
index 0f5ed2589622..eab17302c66a 100644
--- a/src/operator/contrib/intgemm/max_absolute_op.cc
+++ b/src/operator/contrib/intgemm/max_absolute_op.cc
@@ -29,7 +29,7 @@
 #include "../../operator_common.h"
 #include "../../tensor/init_op.h"
 
-#include "intgemm.h"
+#include "../../../../3rdparty/intgemm/intgemm.h"
 
 namespace mxnet {
 namespace op {
diff --git a/src/operator/contrib/intgemm/prepare_data_op.cc b/src/operator/contrib/intgemm/prepare_data_op.cc
index ec1120448cf2..978c64d7dc2d 100644
--- a/src/operator/contrib/intgemm/prepare_data_op.cc
+++ b/src/operator/contrib/intgemm/prepare_data_op.cc
@@ -31,7 +31,7 @@
 #include "../../operator_common.h"
 #include "../../tensor/init_op.h"
 
-#include "intgemm.h"
+#include "../../../../3rdparty/intgemm/intgemm.h"
 
 namespace mxnet {
 namespace op {
diff --git a/src/operator/contrib/intgemm/prepare_weight_op.cc b/src/operator/contrib/intgemm/prepare_weight_op.cc
index 5425cac879a5..76054386497b 100644
--- a/src/operator/contrib/intgemm/prepare_weight_op.cc
+++ b/src/operator/contrib/intgemm/prepare_weight_op.cc
@@ -29,8 +29,7 @@
 #include "../../operator_common.h"
 #include "../../tensor/init_op.h"
 
-#include "intgemm.h"
+#include "../../../../3rdparty/intgemm/aligned.h"
+#include "../../../../3rdparty/intgemm/intgemm.h"
 
 namespace mxnet {
 namespace op {
diff --git a/src/operator/contrib/intgemm/take_weight_op.cc b/src/operator/contrib/intgemm/take_weight_op.cc
index 5c87e5b0a352..c94ecbceb5b3 100644
--- a/src/operator/contrib/intgemm/take_weight_op.cc
+++ b/src/operator/contrib/intgemm/take_weight_op.cc
@@ -31,7 +31,7 @@
 #include "../../operator_common.h"
 #include "../../tensor/init_op.h"
 
-#include "intgemm.h"
+#include "../../../../3rdparty/intgemm/intgemm.h"
 
 namespace mxnet {
 namespace op {

From e6ddba887f0e96b8be288b3d3044c9a7e1531fca Mon Sep 17 00:00:00 2001
From: Kenneth Heafield
Date: Mon, 31 Aug 2020 10:13:43 +0000
Subject: [PATCH 64/65] Change to #include <intgemm/intgemm.h>

---
 3rdparty/intgemm                                           | 2 +-
 src/operator/contrib/intgemm/intgemm_fully_connected_op.cc | 3 +--
 src/operator/contrib/intgemm/max_absolute_op.cc            | 2 +-
 src/operator/contrib/intgemm/prepare_data_op.cc            | 2 +-
 src/operator/contrib/intgemm/prepare_weight_op.cc          | 3 +--
 src/operator/contrib/intgemm/take_weight_op.cc             | 2 +-
 6 files changed, 6 insertions(+), 8 deletions(-)

diff --git a/3rdparty/intgemm b/3rdparty/intgemm
index 2ea4d8681eb5..02f671cf537f 160000
--- a/3rdparty/intgemm
+++ b/3rdparty/intgemm
@@ -1 +1 @@
-Subproject commit 2ea4d8681eb553bd5311ccfa740ce6b8c0b055e3
+Subproject commit 02f671cf537fdbc818cf8111d1d9e557a8650d7a
diff --git a/src/operator/contrib/intgemm/intgemm_fully_connected_op.cc b/src/operator/contrib/intgemm/intgemm_fully_connected_op.cc
index 3c5c3124fa07..216f5ce47ecc 100644
--- a/src/operator/contrib/intgemm/intgemm_fully_connected_op.cc
+++ b/src/operator/contrib/intgemm/intgemm_fully_connected_op.cc
@@ -30,8 +30,7 @@
 #include "../../operator_common.h"
 #include "../../tensor/init_op.h"
 
-#include "../../../../3rdparty/intgemm/aligned.h"
-#include "../../../../3rdparty/intgemm/intgemm.h"
+#include "intgemm/intgemm.h"
 
 namespace mxnet {
 namespace op {
diff --git a/src/operator/contrib/intgemm/max_absolute_op.cc b/src/operator/contrib/intgemm/max_absolute_op.cc
index eab17302c66a..01e10b0f9908 100644
--- a/src/operator/contrib/intgemm/max_absolute_op.cc
+++ b/src/operator/contrib/intgemm/max_absolute_op.cc
@@ -29,7 +29,7 @@
 #include "../../operator_common.h"
 #include "../../tensor/init_op.h"
 
-#include "../../../../3rdparty/intgemm/intgemm.h"
+#include "intgemm/intgemm.h"
 
 namespace mxnet {
 namespace op {
diff --git a/src/operator/contrib/intgemm/prepare_data_op.cc b/src/operator/contrib/intgemm/prepare_data_op.cc
index 978c64d7dc2d..1d5719de36d2 100644
--- a/src/operator/contrib/intgemm/prepare_data_op.cc
+++ b/src/operator/contrib/intgemm/prepare_data_op.cc
@@ -31,7 +31,7 @@
 #include "../../operator_common.h"
 #include "../../tensor/init_op.h"
 
-#include "../../../../3rdparty/intgemm/intgemm.h"
+#include "intgemm/intgemm.h"
 
 namespace mxnet {
 namespace op {
diff --git a/src/operator/contrib/intgemm/prepare_weight_op.cc b/src/operator/contrib/intgemm/prepare_weight_op.cc
index 76054386497b..ad106ebca00b 100644
--- a/src/operator/contrib/intgemm/prepare_weight_op.cc
+++ b/src/operator/contrib/intgemm/prepare_weight_op.cc
@@ -29,8 +29,7 @@
 #include "../../operator_common.h"
 #include "../../tensor/init_op.h"
 
-#include "../../../../3rdparty/intgemm/aligned.h"
-#include "../../../../3rdparty/intgemm/intgemm.h"
+#include "intgemm/intgemm.h"
 
 namespace mxnet {
 namespace op {
diff --git a/src/operator/contrib/intgemm/take_weight_op.cc b/src/operator/contrib/intgemm/take_weight_op.cc
index c94ecbceb5b3..09e320e47327 100644
--- a/src/operator/contrib/intgemm/take_weight_op.cc
+++ b/src/operator/contrib/intgemm/take_weight_op.cc
@@ -31,7 +31,7 @@
 #include "../../operator_common.h"
 #include "../../tensor/init_op.h"
 
-#include "../../../../3rdparty/intgemm/intgemm.h"
+#include "intgemm/intgemm.h"
 
 namespace mxnet {
 namespace op {

From 7e7b0c2018ed1cd2973530450b6ef96335176d55 Mon Sep 17 00:00:00 2001
From: Kenneth Heafield
Date: Mon, 31 Aug 2020 10:39:36 +0000
Subject: [PATCH 65/65] Fetch intgemm in build

---
 .gitmodules      |  3 ---
 3rdparty/intgemm |  1 -
 CMakeLists.txt   | 12 +++++++++++-
 3 files changed, 11 insertions(+), 5 deletions(-)
 delete mode 160000 3rdparty/intgemm

diff --git a/.gitmodules b/.gitmodules
index dee98eabd157..b215ed430d14 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -25,6 +25,3 @@
 [submodule "3rdparty/nvidia_cub"]
 	path = 3rdparty/nvidia_cub
 	url = https://github.com/NVlabs/cub.git
-[submodule "3rdparty/intgemm"]
-	path = 3rdparty/intgemm
-	url = https://github.com/kpu/intgemm
diff --git a/3rdparty/intgemm b/3rdparty/intgemm
deleted file mode 160000
index 02f671cf537f..000000000000
--- a/3rdparty/intgemm
+++ /dev/null
@@ -1 +0,0 @@
-Subproject commit 02f671cf537fdbc818cf8111d1d9e557a8650d7a
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 577d121e5c9a..a48a63032098 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -281,7 +281,17 @@ endif()
 
 if(USE_INTGEMM)
   message(STATUS "Using intgemm")
-  add_subdirectory(3rdparty/intgemm EXCLUDE_FROM_ALL)
+  include(FetchContent)
+  FetchContent_Declare(
+    intgemm
+    GIT_REPOSITORY https://github.com/kpu/intgemm.git
+    GIT_TAG 02f671cf537fdbc818cf8111d1d9e557a8650d7a
+  )
+  FetchContent_GetProperties(intgemm)
+  if(NOT intgemm_POPULATED)
+    FetchContent_Populate(intgemm)
+  endif()
+  add_subdirectory(${intgemm_SOURCE_DIR} ${intgemm_BINARY_DIR} EXCLUDE_FROM_ALL)
   add_definitions(-DMXNET_USE_INTGEMM=1)
 endif()
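
Taken together, the series leaves the following end-to-end usage (a sketch
distilled from the tests above; shapes and values are illustrative, not part
of any patch):

    import mxnet as mx

    weight = mx.nd.random_uniform(low=-3.0, high=3.0, shape=(8, 64))
    maxabs = mx.nd.contrib.intgemm_maxabsolute(weight)
    weight_prepared = mx.nd.contrib.intgemm_prepare_weight(weight, maxabs)
    data = mx.nd.random_uniform(low=-1.0, high=1.0, shape=(4, 64))
    # scaling = (max passed to prepare_weight) / 127.0 since data is float32.
    out = mx.nd.contrib.intgemm_fully_connected(data, weight_prepared, maxabs / 127.0,
                                                no_bias=True, flatten=False,
                                                out_type='float32', num_hidden=8)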