From 2cb8faf30aa607c720fa382d1822487a8d1f480b Mon Sep 17 00:00:00 2001 From: Steffen Rochel Date: Thu, 29 Nov 2018 19:24:11 -0800 Subject: [PATCH 01/28] updated to v1.5.0 --- R-package/DESCRIPTION | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/R-package/DESCRIPTION b/R-package/DESCRIPTION index 46702eff9ed7..338dc84d7e7b 100644 --- a/R-package/DESCRIPTION +++ b/R-package/DESCRIPTION @@ -1,17 +1,17 @@ Package: mxnet Type: Package Title: MXNet: A Flexible and Efficient Machine Learning Library for Heterogeneous Distributed Systems -Version: 1.4.0 +Version: 1.5.0 Date: 2017-06-27 Author: Tianqi Chen, Qiang Kou, Tong He Maintainer: Qiang Kou -Repository: DMLC +Repository: Apache Description: MXNet is a deep learning framework designed for both efficiency and flexibility. It allows you to mix the flavours of deep learning programs together to maximize the efficiency and your productivity. License: Apache License (== 2.0) -URL: https://github.com/dmlc/mxnet/tree/master/R-package -BugReports: https://github.com/dmlc/mxnet/issues +URL: https://github.com/apache/incubator-mxnet/tree/master/R-package +BugReports: https://github.com/apache/incubator-mxnet/issues Imports: methods, Rcpp (>= 0.12.1), From e4af8e7bf6954e72fd1f44b5937ef18c3aa098ad Mon Sep 17 00:00:00 2001 From: Steffen Rochel Date: Thu, 29 Nov 2018 19:51:04 -0800 Subject: [PATCH 02/28] Bumped minor version from 1.4.0 to 1.5.0 on master --- contrib/clojure-package/README.md | 16 ++++++++-------- .../examples/cnn-text-classification/project.clj | 2 +- contrib/clojure-package/examples/gan/project.clj | 2 +- .../examples/imclassification/project.clj | 2 +- .../clojure-package/examples/module/project.clj | 2 +- .../examples/multi-label/project.clj | 2 +- .../examples/neural-style/project.clj | 2 +- .../examples/pre-trained-models/project.clj | 2 +- .../examples/profiler/project.clj | 2 +- contrib/clojure-package/examples/rnn/project.clj | 2 +- .../examples/tutorial/project.clj | 6 +++--- .../examples/visualization/project.clj | 2 +- contrib/clojure-package/project.clj | 4 ++-- docs/tutorials/scala/mxnet_scala_on_intellij.md | 4 ++-- include/mxnet/base.h | 2 +- python/mxnet/libinfo.py | 2 +- scala-package/assembly/linux-x86_64-cpu/pom.xml | 8 ++++---- scala-package/assembly/linux-x86_64-gpu/pom.xml | 8 ++++---- scala-package/assembly/osx-x86_64-cpu/pom.xml | 8 ++++---- scala-package/assembly/pom.xml | 2 +- scala-package/core/pom.xml | 6 +++--- scala-package/examples/pom.xml | 6 +++--- scala-package/infer/pom.xml | 4 ++-- scala-package/init-native/linux-x86_64/pom.xml | 4 ++-- scala-package/init-native/osx-x86_64/pom.xml | 4 ++-- scala-package/init-native/pom.xml | 2 +- scala-package/init/pom.xml | 2 +- scala-package/macros/pom.xml | 6 +++--- scala-package/native/linux-x86_64-cpu/pom.xml | 4 ++-- scala-package/native/linux-x86_64-gpu/pom.xml | 4 ++-- scala-package/native/osx-x86_64-cpu/pom.xml | 4 ++-- scala-package/native/pom.xml | 2 +- scala-package/pom.xml | 2 +- scala-package/spark/pom.xml | 4 ++-- snapcraft.yaml | 2 +- .../train_mxnet_legacy_models.sh | 4 ++-- 36 files changed, 70 insertions(+), 70 deletions(-) diff --git a/contrib/clojure-package/README.md b/contrib/clojure-package/README.md index bc6100b86123..10b3ed770582 100644 --- a/contrib/clojure-package/README.md +++ b/contrib/clojure-package/README.md @@ -105,9 +105,9 @@ brew install opencv - Create a new project with `lein new my-mxnet` - Edit your `project.clj` and add one of the following entries to `:dependencies`, based on your system and the compute 
device you want to use:
- - `[org.apache.mxnet.contrib.clojure/clojure-mxnet-linux-cpu "1.4.0"]`
- - `[org.apache.mxnet.contrib.clojure/clojure-mxnet-linux-gpu "1.4.0"]`
- - `[org.apache.mxnet.contrib.clojure/clojure-mxnet-osx-cpu "1.4.0"]`
+ - `[org.apache.mxnet.contrib.clojure/clojure-mxnet-linux-cpu "1.5.0"]`
+ - `[org.apache.mxnet.contrib.clojure/clojure-mxnet-linux-gpu "1.5.0"]`
+ - `[org.apache.mxnet.contrib.clojure/clojure-mxnet-osx-cpu "1.5.0"]`
 
 After making this change and running `lein deps`, you should be able to run example code like this [NDArray Tutorial](https://github.com/apache/incubator-mxnet/blob/master/contrib/clojure-package/examples/tutorial/src/tutorial/ndarray.clj).
 
@@ -116,20 +116,20 @@ After making this change and running `lein deps`, you should be able to run exam
 With this option, you will install a Git revision of the Clojure package source and a [Scala package jar from Maven](https://search.maven.org/search?q=g:org.apache.mxnet) with native dependencies baked in.
 
 - Install additional dependencies as described in [the corresponding section for Option 1](#installing-additional-dependencies),
-- Recursively clone the MXNet repository and checkout the desired revision. Here we assume the `1.4.0` tag and a clone into the `~/mxnet` directory:
+- Recursively clone the MXNet repository and checkout the desired revision. Here we assume the `1.5.0` tag and a clone into the `~/mxnet` directory:
 
   ```bash
   git clone --recursive https://github.com/apache/incubator-mxnet.git ~/mxnet
   cd ~/mxnet
   git tag --list  # Find the tag that matches the Scala package version
-  git checkout tags/1.4.0 -b my_mxnet
+  git checkout tags/1.5.0 -b my_mxnet
   git submodule update --init --recursive
   cd contrib/clojure
   ```

- Edit `project.clj` to include the desired Scala jar from Maven:

-      [org.apache.mxnet/mxnet-full_2.11-linux-x86_64-cpu "1.4.0”]
+      [org.apache.mxnet/mxnet-full_2.11-linux-x86_64-cpu "1.5.0"]

- Run `lein test`. All the tests should run without error.
- At this point you can run `lein install` to build and install the Clojure jar locally.

@@ -147,7 +147,7 @@ The first step is to recursively clone the MXNet repository and checkout the des
 ```bash
 git clone --recursive https://github.com/apache/incubator-mxnet.git ~/mxnet
 cd ~/mxnet
- git checkout tags/1.4.0 -b my_mxnet  # this is optional
+ git checkout tags/1.5.0 -b my_mxnet  # this is optional
 git submodule update --init --recursive
 ```

@@ -176,7 +176,7 @@ The outcome of this step will be a shared library `lib/libmxnet.so` that is used

 #### Building the Clojure jar

-- Enter the `contrib/clojure` directory and edit the `project.clj` file. Add the Scala jar that was just created and installed, e.g., `[org.apache.mxnet/mxnet-full_2.11-osx-x86_64-cpu "1.4.0-SNAPSHOT"]`, to the `:dependencies`.
+- Enter the `contrib/clojure` directory and edit the `project.clj` file. Add the Scala jar that was just created and installed, e.g., `[org.apache.mxnet/mxnet-full_2.11-osx-x86_64-cpu "1.5.0-SNAPSHOT"]`, to the `:dependencies`.
- Run `lein test`. All the tests should run without an error.
- Run `lein install` to build and install the Clojure jar locally.
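
As a quick sanity check after `lein deps` (or after `lein install` for the source-based options above), a REPL snippet along these lines (a minimal sketch using the Clojure package's `ndarray` namespace) confirms that the native library resolved correctly for your platform:

```clojure
(require '[org.apache.clojure-mxnet.ndarray :as ndarray])

;; Create a 2x3 NDArray of ones and read it back as a Clojure vector.
;; A result of [1.0 1.0 1.0 1.0 1.0 1.0] means the native libmxnet
;; bindings loaded successfully.
(def a (ndarray/ones [2 3]))
(ndarray/->vec a)
```
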
diff --git a/contrib/clojure-package/examples/cnn-text-classification/project.clj b/contrib/clojure-package/examples/cnn-text-classification/project.clj index 3eed0ddf9d9c..29ebefe5d200 100644 --- a/contrib/clojure-package/examples/cnn-text-classification/project.clj +++ b/contrib/clojure-package/examples/cnn-text-classification/project.clj @@ -19,6 +19,6 @@ :description "CNN text classification with MXNet" :plugins [[lein-cljfmt "0.5.7"]] :dependencies [[org.clojure/clojure "1.9.0"] - [org.apache.mxnet.contrib.clojure/clojure-mxnet "1.4.0-SNAPSHOT"]] + [org.apache.mxnet.contrib.clojure/clojure-mxnet "1.5.0-SNAPSHOT"]] :pedantic? :skip :main cnn-text-classification.classifier) diff --git a/contrib/clojure-package/examples/gan/project.clj b/contrib/clojure-package/examples/gan/project.clj index 36b7c6cb3089..b8f6903cabba 100644 --- a/contrib/clojure-package/examples/gan/project.clj +++ b/contrib/clojure-package/examples/gan/project.clj @@ -19,6 +19,6 @@ :description "GAN MNIST with MXNet" :plugins [[lein-cljfmt "0.5.7"]] :dependencies [[org.clojure/clojure "1.9.0"] - [org.apache.mxnet.contrib.clojure/clojure-mxnet "1.4.0-SNAPSHOT"] + [org.apache.mxnet.contrib.clojure/clojure-mxnet "1.5.0-SNAPSHOT"] [nu.pattern/opencv "2.4.9-7"]] :main gan.gan-mnist) diff --git a/contrib/clojure-package/examples/imclassification/project.clj b/contrib/clojure-package/examples/imclassification/project.clj index 0dbede5052ac..5f77cf55cf35 100644 --- a/contrib/clojure-package/examples/imclassification/project.clj +++ b/contrib/clojure-package/examples/imclassification/project.clj @@ -19,6 +19,6 @@ :description "Clojure examples for image classification" :plugins [[lein-cljfmt "0.5.7"]] :dependencies [[org.clojure/clojure "1.9.0"] - [org.apache.mxnet.contrib.clojure/clojure-mxnet "1.4.0-SNAPSHOT"]] + [org.apache.mxnet.contrib.clojure/clojure-mxnet "1.5.0-SNAPSHOT"]] :pedantic? :skip :main imclassification.train-mnist) diff --git a/contrib/clojure-package/examples/module/project.clj b/contrib/clojure-package/examples/module/project.clj index a9a0a5f23e6e..b667a2a4e122 100644 --- a/contrib/clojure-package/examples/module/project.clj +++ b/contrib/clojure-package/examples/module/project.clj @@ -19,7 +19,7 @@ :description "Clojure examples for module" :plugins [[lein-cljfmt "0.5.7"]] :dependencies [[org.clojure/clojure "1.9.0"] - [org.apache.mxnet.contrib.clojure/clojure-mxnet "1.4.0-SNAPSHOT"]] + [org.apache.mxnet.contrib.clojure/clojure-mxnet "1.5.0-SNAPSHOT"]] :pedantic? 
:skip :main mnist-mlp) diff --git a/contrib/clojure-package/examples/multi-label/project.clj b/contrib/clojure-package/examples/multi-label/project.clj index 8923738b946d..6e6a14340d36 100644 --- a/contrib/clojure-package/examples/multi-label/project.clj +++ b/contrib/clojure-package/examples/multi-label/project.clj @@ -19,5 +19,5 @@ :description "Example of multi-label classification" :plugins [[lein-cljfmt "0.5.7"]] :dependencies [[org.clojure/clojure "1.9.0"] - [org.apache.mxnet.contrib.clojure/clojure-mxnet "1.4.0-SNAPSHOT"]] + [org.apache.mxnet.contrib.clojure/clojure-mxnet "1.5.0-SNAPSHOT"]] :main multi-label.core) diff --git a/contrib/clojure-package/examples/neural-style/project.clj b/contrib/clojure-package/examples/neural-style/project.clj index 5a8eebea783f..b6d29f7c0e87 100644 --- a/contrib/clojure-package/examples/neural-style/project.clj +++ b/contrib/clojure-package/examples/neural-style/project.clj @@ -19,7 +19,7 @@ :description "Neural Style Transfer with MXNet" :plugins [[lein-cljfmt "0.5.7"]] :dependencies [[org.clojure/clojure "1.9.0"] - [org.apache.mxnet.contrib.clojure/clojure-mxnet "1.4.0-SNAPSHOT"] + [org.apache.mxnet.contrib.clojure/clojure-mxnet "1.5.0-SNAPSHOT"] [net.mikera/imagez "0.12.0"] [thinktopic/think.image "0.4.16"]] :main neural-style.core) diff --git a/contrib/clojure-package/examples/pre-trained-models/project.clj b/contrib/clojure-package/examples/pre-trained-models/project.clj index 58b591ce5307..11e002503464 100644 --- a/contrib/clojure-package/examples/pre-trained-models/project.clj +++ b/contrib/clojure-package/examples/pre-trained-models/project.clj @@ -19,7 +19,7 @@ :description "Example of using pre-trained models with MXNet" :plugins [[lein-cljfmt "0.5.7"]] :dependencies [[org.clojure/clojure "1.9.0"] - [org.apache.mxnet.contrib.clojure/clojure-mxnet "1.4.0-SNAPSHOT"] + [org.apache.mxnet.contrib.clojure/clojure-mxnet "1.5.0-SNAPSHOT"] [net.mikera/imagez "0.12.0"] [thinktopic/think.image "0.4.16"]] :main pre-trained-models.fine-tune) diff --git a/contrib/clojure-package/examples/profiler/project.clj b/contrib/clojure-package/examples/profiler/project.clj index fa30eafa0daf..cc50482d0418 100644 --- a/contrib/clojure-package/examples/profiler/project.clj +++ b/contrib/clojure-package/examples/profiler/project.clj @@ -18,5 +18,5 @@ (defproject profiler "0.1.0-SNAPSHOT" :plugins [[lein-cljfmt "0.5.7"]] :dependencies [[org.clojure/clojure "1.9.0"] - [org.apache.mxnet.contrib.clojure/clojure-mxnet "1.4.0-SNAPSHOT"]] + [org.apache.mxnet.contrib.clojure/clojure-mxnet "1.5.0-SNAPSHOT"]] :main profiler.core) diff --git a/contrib/clojure-package/examples/rnn/project.clj b/contrib/clojure-package/examples/rnn/project.clj index 291f2bd46e3a..64f4c290741c 100644 --- a/contrib/clojure-package/examples/rnn/project.clj +++ b/contrib/clojure-package/examples/rnn/project.clj @@ -19,5 +19,5 @@ :description "RNN example" :plugins [[lein-cljfmt "0.5.7"]] :dependencies [[org.clojure/clojure "1.9.0"] - [org.apache.mxnet.contrib.clojure/clojure-mxnet "1.4.0-SNAPSHOT"]] + [org.apache.mxnet.contrib.clojure/clojure-mxnet "1.5.0-SNAPSHOT"]] :main rnn.train-char-rnn) diff --git a/contrib/clojure-package/examples/tutorial/project.clj b/contrib/clojure-package/examples/tutorial/project.clj index 8a78ec6a6abf..9c4f1b96f9e0 100644 --- a/contrib/clojure-package/examples/tutorial/project.clj +++ b/contrib/clojure-package/examples/tutorial/project.clj @@ -20,6 +20,6 @@ :plugins [[lein-cljfmt "0.5.7"]] :dependencies [[org.clojure/clojure "1.9.0"] ;; Uncomment the one appropriate 
for your machine & configuration: - #_[org.apache.mxnet.contrib.clojure/clojure-mxnet-linux-cpu "1.4.0"] - #_[org.apache.mxnet.contrib.clojure/clojure-mxnet-linux-gpu "1.4.0"] - #_[org.apache.mxnet.contrib.clojure/clojure-mxnet-osx-cpu "1.4.0"]]) + #_[org.apache.mxnet.contrib.clojure/clojure-mxnet-linux-cpu "1.5.0"] + #_[org.apache.mxnet.contrib.clojure/clojure-mxnet-linux-gpu "1.5.0"] + #_[org.apache.mxnet.contrib.clojure/clojure-mxnet-osx-cpu "1.5.0"]]) diff --git a/contrib/clojure-package/examples/visualization/project.clj b/contrib/clojure-package/examples/visualization/project.clj index d56ddfb23f0c..d91ace3188e6 100644 --- a/contrib/clojure-package/examples/visualization/project.clj +++ b/contrib/clojure-package/examples/visualization/project.clj @@ -19,5 +19,5 @@ :description "Visualization example" :plugins [[lein-cljfmt "0.5.7"]] :dependencies [[org.clojure/clojure "1.9.0"] - [org.apache.mxnet.contrib.clojure/clojure-mxnet "1.4.0-SNAPSHOT"]] + [org.apache.mxnet.contrib.clojure/clojure-mxnet "1.5.0-SNAPSHOT"]] :main visualization.core) diff --git a/contrib/clojure-package/project.clj b/contrib/clojure-package/project.clj index ae7ccd67fd9c..12a0504e02d5 100644 --- a/contrib/clojure-package/project.clj +++ b/contrib/clojure-package/project.clj @@ -15,7 +15,7 @@ ;; limitations under the License. ;; -(defproject org.apache.mxnet.contrib.clojure/clojure-mxnet "1.4.0-SNAPSHOT" +(defproject org.apache.mxnet.contrib.clojure/clojure-mxnet "1.5.0-SNAPSHOT" :description "Clojure package for MXNet" :url "https://github.com/apache/incubator-mxnet" :license {:name "Apache License" @@ -29,7 +29,7 @@ ;[org.apache.mxnet/mxnet-full_2.11-linux-x86_64-gpu "1.2.1"] ;;; CI - [org.apache.mxnet/mxnet-full_2.11-linux-x86_64-cpu "1.4.0-SNAPSHOT"] + [org.apache.mxnet/mxnet-full_2.11-linux-x86_64-cpu "1.5.0-SNAPSHOT"] [org.clojure/tools.logging "0.4.0"] [org.apache.logging.log4j/log4j-core "2.8.1"] diff --git a/docs/tutorials/scala/mxnet_scala_on_intellij.md b/docs/tutorials/scala/mxnet_scala_on_intellij.md index 174e3018098b..a0bf24e34e28 100644 --- a/docs/tutorials/scala/mxnet_scala_on_intellij.md +++ b/docs/tutorials/scala/mxnet_scala_on_intellij.md @@ -385,14 +385,14 @@ If you chose to "Build from Source" when following the [install instructions](ht org.apache.mxnet mxnet-core_${scala.version}-${platform}-sources system - /PathToMXNetSource/incubator-mxnet/scala-package/assembly/osx-x86_64-cpu/target/mxnet-full_${scala.version}-osx-x86_64-cpu-1.4.0-SNAPSHOT-sources.jar + /PathToMXNetSource/incubator-mxnet/scala-package/assembly/osx-x86_64-cpu/target/mxnet-full_${scala.version}-osx-x86_64-cpu-1.5.0-SNAPSHOT-sources.jar org.apache.mxnet mxnet-full_${scala.version}-${platform} system - /PathToMXNetSource/incubator-mxnet/scala-package/assembly/osx-x86_64-cpu/target/mxnet-full_${scala.version}-osx-x86_64-cpu-1.4.0-SNAPSHOT.jar + /PathToMXNetSource/incubator-mxnet/scala-package/assembly/osx-x86_64-cpu/target/mxnet-full_${scala.version}-osx-x86_64-cpu-1.5.0-SNAPSHOT.jar ``` diff --git a/include/mxnet/base.h b/include/mxnet/base.h index f773139d6c3e..92d9c2699d63 100644 --- a/include/mxnet/base.h +++ b/include/mxnet/base.h @@ -102,7 +102,7 @@ /*! \brief major version */ #define MXNET_MAJOR 1 /*! \brief minor version */ -#define MXNET_MINOR 4 +#define MXNET_MINOR 5 /*! \brief patch version */ #define MXNET_PATCH 0 /*! 
\brief mxnet version */ diff --git a/python/mxnet/libinfo.py b/python/mxnet/libinfo.py index 0669a03c6520..928580148417 100644 --- a/python/mxnet/libinfo.py +++ b/python/mxnet/libinfo.py @@ -103,4 +103,4 @@ def find_include_path(): # current version -__version__ = "1.4.0" +__version__ = "1.5.0" diff --git a/scala-package/assembly/linux-x86_64-cpu/pom.xml b/scala-package/assembly/linux-x86_64-cpu/pom.xml index fbc0ab027ac7..abefead175c7 100644 --- a/scala-package/assembly/linux-x86_64-cpu/pom.xml +++ b/scala-package/assembly/linux-x86_64-cpu/pom.xml @@ -6,7 +6,7 @@ org.apache.mxnet mxnet-full-parent_2.11 - 1.4.0-SNAPSHOT + 1.5.0-SNAPSHOT ../pom.xml @@ -18,18 +18,18 @@ org.apache.mxnet mxnet-core_${scala.binary.version} - 1.4.0-SNAPSHOT + 1.5.0-SNAPSHOT org.apache.mxnet libmxnet-scala-linux-x86_64-cpu - 1.4.0-SNAPSHOT + 1.5.0-SNAPSHOT so org.apache.mxnet mxnet-infer_${scala.binary.version} - 1.4.0-SNAPSHOT + 1.5.0-SNAPSHOT diff --git a/scala-package/assembly/linux-x86_64-gpu/pom.xml b/scala-package/assembly/linux-x86_64-gpu/pom.xml index a1a94808e918..96ffa38c6af2 100644 --- a/scala-package/assembly/linux-x86_64-gpu/pom.xml +++ b/scala-package/assembly/linux-x86_64-gpu/pom.xml @@ -6,7 +6,7 @@ org.apache.mxnet mxnet-full-parent_2.11 - 1.4.0-SNAPSHOT + 1.5.0-SNAPSHOT ../pom.xml @@ -18,18 +18,18 @@ org.apache.mxnet mxnet-core_${scala.binary.version} - 1.4.0-SNAPSHOT + 1.5.0-SNAPSHOT org.apache.mxnet libmxnet-scala-linux-x86_64-gpu - 1.4.0-SNAPSHOT + 1.5.0-SNAPSHOT so org.apache.mxnet mxnet-infer_${scala.binary.version} - 1.4.0-SNAPSHOT + 1.5.0-SNAPSHOT diff --git a/scala-package/assembly/osx-x86_64-cpu/pom.xml b/scala-package/assembly/osx-x86_64-cpu/pom.xml index bb6af0353762..5c5733a9a4ce 100644 --- a/scala-package/assembly/osx-x86_64-cpu/pom.xml +++ b/scala-package/assembly/osx-x86_64-cpu/pom.xml @@ -6,7 +6,7 @@ org.apache.mxnet mxnet-full-parent_2.11 - 1.4.0-SNAPSHOT + 1.5.0-SNAPSHOT ../pom.xml @@ -18,18 +18,18 @@ org.apache.mxnet mxnet-core_${scala.binary.version} - 1.4.0-SNAPSHOT + 1.5.0-SNAPSHOT org.apache.mxnet libmxnet-scala-osx-x86_64-cpu - 1.4.0-SNAPSHOT + 1.5.0-SNAPSHOT jnilib org.apache.mxnet mxnet-infer_${scala.binary.version} - 1.4.0-SNAPSHOT + 1.5.0-SNAPSHOT diff --git a/scala-package/assembly/pom.xml b/scala-package/assembly/pom.xml index 8de320eb2ade..c1d1a3b8e721 100644 --- a/scala-package/assembly/pom.xml +++ b/scala-package/assembly/pom.xml @@ -6,7 +6,7 @@ org.apache.mxnet mxnet-parent_2.11 - 1.4.0-SNAPSHOT + 1.5.0-SNAPSHOT ../pom.xml diff --git a/scala-package/core/pom.xml b/scala-package/core/pom.xml index 3425bb15f62a..484fbbd96790 100644 --- a/scala-package/core/pom.xml +++ b/scala-package/core/pom.xml @@ -6,7 +6,7 @@ org.apache.mxnet mxnet-parent_2.11 - 1.4.0-SNAPSHOT + 1.5.0-SNAPSHOT ../pom.xml @@ -100,13 +100,13 @@ org.apache.mxnet mxnet-init_${scala.binary.version} - 1.4.0-SNAPSHOT + 1.5.0-SNAPSHOT provided org.apache.mxnet mxnet-macros_${scala.binary.version} - 1.4.0-SNAPSHOT + 1.5.0-SNAPSHOT provided diff --git a/scala-package/examples/pom.xml b/scala-package/examples/pom.xml index 9e8e119c3c4f..8d3d156a0b18 100644 --- a/scala-package/examples/pom.xml +++ b/scala-package/examples/pom.xml @@ -6,7 +6,7 @@ org.apache.mxnet mxnet-parent_2.11 - 1.4.0-SNAPSHOT + 1.5.0-SNAPSHOT ../pom.xml @@ -149,13 +149,13 @@ org.apache.mxnet mxnet-core_${scala.binary.version} - 1.4.0-SNAPSHOT + 1.5.0-SNAPSHOT provided org.apache.mxnet mxnet-infer_${scala.binary.version} - 1.4.0-SNAPSHOT + 1.5.0-SNAPSHOT provided diff --git a/scala-package/infer/pom.xml b/scala-package/infer/pom.xml 
index 3e6980cb6f4b..ac76cdd19f3b 100644 --- a/scala-package/infer/pom.xml +++ b/scala-package/infer/pom.xml @@ -6,7 +6,7 @@ mxnet-parent_2.11 org.apache.mxnet - 1.4.0-SNAPSHOT + 1.5.0-SNAPSHOT ../pom.xml @@ -89,7 +89,7 @@ org.apache.mxnet mxnet-core_${scala.binary.version} - 1.4.0-SNAPSHOT + 1.5.0-SNAPSHOT provided diff --git a/scala-package/init-native/linux-x86_64/pom.xml b/scala-package/init-native/linux-x86_64/pom.xml index 12a36bd6e944..b71d7cf71528 100644 --- a/scala-package/init-native/linux-x86_64/pom.xml +++ b/scala-package/init-native/linux-x86_64/pom.xml @@ -6,7 +6,7 @@ org.apache.mxnet mxnet-scala-init-native-parent - 1.4.0-SNAPSHOT + 1.5.0-SNAPSHOT ../pom.xml @@ -20,7 +20,7 @@ org.apache.mxnet mxnet-init_${scala.binary.version} - 1.4.0-SNAPSHOT + 1.5.0-SNAPSHOT jar compile diff --git a/scala-package/init-native/osx-x86_64/pom.xml b/scala-package/init-native/osx-x86_64/pom.xml index d0290942ef84..b4a0b1d6584a 100644 --- a/scala-package/init-native/osx-x86_64/pom.xml +++ b/scala-package/init-native/osx-x86_64/pom.xml @@ -6,7 +6,7 @@ org.apache.mxnet mxnet-scala-init-native-parent - 1.4.0-SNAPSHOT + 1.5.0-SNAPSHOT ../pom.xml @@ -20,7 +20,7 @@ org.apache.mxnet mxnet-init_${scala.binary.version} - 1.4.0-SNAPSHOT + 1.5.0-SNAPSHOT jar compile diff --git a/scala-package/init-native/pom.xml b/scala-package/init-native/pom.xml index 17a829c0c217..bed216e45035 100644 --- a/scala-package/init-native/pom.xml +++ b/scala-package/init-native/pom.xml @@ -6,7 +6,7 @@ org.apache.mxnet mxnet-parent_2.11 - 1.4.0-SNAPSHOT + 1.5.0-SNAPSHOT ../pom.xml diff --git a/scala-package/init/pom.xml b/scala-package/init/pom.xml index a5b88c308637..4278df6f2e73 100644 --- a/scala-package/init/pom.xml +++ b/scala-package/init/pom.xml @@ -6,7 +6,7 @@ org.apache.mxnet mxnet-parent_2.11 - 1.4.0-SNAPSHOT + 1.5.0-SNAPSHOT diff --git a/scala-package/macros/pom.xml b/scala-package/macros/pom.xml index d435e211ceeb..cd56060b4b36 100644 --- a/scala-package/macros/pom.xml +++ b/scala-package/macros/pom.xml @@ -6,7 +6,7 @@ org.apache.mxnet mxnet-parent_2.11 - 1.4.0-SNAPSHOT + 1.5.0-SNAPSHOT ../pom.xml @@ -53,13 +53,13 @@ org.apache.mxnet mxnet-init_${scala.binary.version} - 1.4.0-SNAPSHOT + 1.5.0-SNAPSHOT provided org.apache.mxnet libmxnet-init-scala-${platform} - 1.4.0-SNAPSHOT + 1.5.0-SNAPSHOT provided ${libtype} diff --git a/scala-package/native/linux-x86_64-cpu/pom.xml b/scala-package/native/linux-x86_64-cpu/pom.xml index ac8e4a45e67a..2415cf7d26db 100644 --- a/scala-package/native/linux-x86_64-cpu/pom.xml +++ b/scala-package/native/linux-x86_64-cpu/pom.xml @@ -6,7 +6,7 @@ org.apache.mxnet mxnet-scala-native-parent - 1.4.0-SNAPSHOT + 1.5.0-SNAPSHOT ../pom.xml @@ -20,7 +20,7 @@ org.apache.mxnet mxnet-core_${scala.binary.version} - 1.4.0-SNAPSHOT + 1.5.0-SNAPSHOT jar compile diff --git a/scala-package/native/linux-x86_64-gpu/pom.xml b/scala-package/native/linux-x86_64-gpu/pom.xml index cdba5774f6a0..0186217234bc 100644 --- a/scala-package/native/linux-x86_64-gpu/pom.xml +++ b/scala-package/native/linux-x86_64-gpu/pom.xml @@ -6,7 +6,7 @@ org.apache.mxnet mxnet-scala-native-parent - 1.4.0-SNAPSHOT + 1.5.0-SNAPSHOT ../pom.xml @@ -20,7 +20,7 @@ org.apache.mxnet mxnet-core_${scala.binary.version} - 1.4.0-SNAPSHOT + 1.5.0-SNAPSHOT jar compile diff --git a/scala-package/native/osx-x86_64-cpu/pom.xml b/scala-package/native/osx-x86_64-cpu/pom.xml index 333486c67392..0ab7ca1dd0f0 100644 --- a/scala-package/native/osx-x86_64-cpu/pom.xml +++ b/scala-package/native/osx-x86_64-cpu/pom.xml @@ -6,7 +6,7 @@ org.apache.mxnet 
mxnet-scala-native-parent - 1.4.0-SNAPSHOT + 1.5.0-SNAPSHOT ../pom.xml @@ -20,7 +20,7 @@ org.apache.mxnet mxnet-core_${scala.binary.version} - 1.4.0-SNAPSHOT + 1.5.0-SNAPSHOT jar compile diff --git a/scala-package/native/pom.xml b/scala-package/native/pom.xml index e267c8d797ab..2f6425d21104 100644 --- a/scala-package/native/pom.xml +++ b/scala-package/native/pom.xml @@ -6,7 +6,7 @@ org.apache.mxnet mxnet-parent_2.11 - 1.4.0-SNAPSHOT + 1.5.0-SNAPSHOT ../pom.xml diff --git a/scala-package/pom.xml b/scala-package/pom.xml index 76bf00b54ba6..151462cbcc68 100644 --- a/scala-package/pom.xml +++ b/scala-package/pom.xml @@ -10,7 +10,7 @@ org.apache.mxnet mxnet-parent_2.11 - 1.4.0-SNAPSHOT + 1.5.0-SNAPSHOT MXNet Scala Package - Parent https://github.com/apache/incubator-mxnet/tree/master/scala-package diff --git a/scala-package/spark/pom.xml b/scala-package/spark/pom.xml index ee4f3efa98e4..2db3bee8c78d 100644 --- a/scala-package/spark/pom.xml +++ b/scala-package/spark/pom.xml @@ -6,7 +6,7 @@ org.apache.mxnet mxnet-parent_2.11 - 1.4.0-SNAPSHOT + 1.5.0-SNAPSHOT ../pom.xml @@ -40,7 +40,7 @@ org.apache.mxnet mxnet-core_${scala.binary.version} - 1.4.0-SNAPSHOT + 1.5.0-SNAPSHOT provided diff --git a/snapcraft.yaml b/snapcraft.yaml index e70bf6e5b4b3..d8d0e301e6b1 100644 --- a/snapcraft.yaml +++ b/snapcraft.yaml @@ -1,5 +1,5 @@ name: mxnet -version: '1.4.0' +version: '1.5.0' summary: MXNet is a deep learning framework designed for efficiency and flexibility. description: | MXNet is a deep learning framework designed for both efficiency and diff --git a/tests/nightly/model_backwards_compatibility_check/train_mxnet_legacy_models.sh b/tests/nightly/model_backwards_compatibility_check/train_mxnet_legacy_models.sh index 02d480d9d3ba..bda47f9e650d 100755 --- a/tests/nightly/model_backwards_compatibility_check/train_mxnet_legacy_models.sh +++ b/tests/nightly/model_backwards_compatibility_check/train_mxnet_legacy_models.sh @@ -61,8 +61,8 @@ echo `pwd` ## This list is sorted in descending order chronologically. ## Sample output for the below git tag command is : 1.2.0 utils 1.1.0 1.0.0 0.12.1 ## so from this sample, we will pick up all the versions matching with the current latest version -## Now while performing inference the latest version could be 1.4.0, which will help in validating models trained -## on 1.1.0 and 1.2.0 by loading them on the latest version (1.4.0) +## Now while performing inference the latest version could be 1.5.0, which will help in validating models trained +## on 1.1.0 and 1.2.0 by loading them on the latest version (1.5.0) ## Over a period of time, the model repository will grow since with every new release we ## upload models trained on newer versions as well through this script previous_versions=($(git tag --sort=-creatordate | grep --invert-match rc)) From 49bbcbcaf8b3482d7152a4ea3e56f84e944c9869 Mon Sep 17 00:00:00 2001 From: Steffen Rochel Date: Sat, 1 Dec 2018 16:52:18 -0800 Subject: [PATCH 03/28] added Anirudh as maintainer for R package ... 
adding something useful and re-trigger PR check --- R-package/DESCRIPTION | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R-package/DESCRIPTION b/R-package/DESCRIPTION index 338dc84d7e7b..da098996c68b 100644 --- a/R-package/DESCRIPTION +++ b/R-package/DESCRIPTION @@ -4,7 +4,7 @@ Title: MXNet: A Flexible and Efficient Machine Learning Library for Heterogeneou Version: 1.5.0 Date: 2017-06-27 Author: Tianqi Chen, Qiang Kou, Tong He -Maintainer: Qiang Kou +Maintainer: Qiang Kou , anirudhacharya Repository: Apache Description: MXNet is a deep learning framework designed for both efficiency and flexibility. It allows you to mix the flavours of deep learning programs From 42c6db0c933be4b1498d8c3e01bf24b5749d3921 Mon Sep 17 00:00:00 2001 From: Steffen Rochel Date: Sun, 2 Dec 2018 13:34:52 -0800 Subject: [PATCH 04/28] Updated license file for clojure, onnx-tensorrt, gtest, R-package --- LICENSE | 94 +++++++++++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 88 insertions(+), 6 deletions(-) diff --git a/LICENSE b/LICENSE index a8b57e583764..2eb9c329e532 100644 --- a/LICENSE +++ b/LICENSE @@ -218,16 +218,20 @@ 1. MXNet Cpp-package - For details, /cpp-package/LICENSE 2. MXNet rcnn - For details, see, example/rcnn/LICENSE 3. scala-package - For details, see, scala-package/LICENSE - 4. Warp-CTC - For details, see, src/operator/contrib/ctc_include/LICENSE + 4. Warp-CTC - For details, see, 3rdparty/ctc_include/LICENSE 5. 3rdparty/dlpack - For details, see, 3rdparty/dlpack/LICENSE 6. 3rdparty/dmlc-core - For details, see, 3rdparty/dmlc-core/LICENSE 7. 3rdparty/mshadow - For details, see, 3rdparty/mshadow/LICENSE 8. 3rdparty/tvm - For details, see, 3rdparty/tvm/LICENSE 9. 3rdparty/tvm/dmlc-core - For details, see, 3rdparty/tvm/dmlc-core/LICENSE - 10. 3rdparty/tvm/nnvm - For details, see, 3rdparty/tvm/nnvm/LICENSE - 11. 3rdparty/ps-lite - For details, see, 3rdparty/ps-lite/LICENSE - 12. 3rdparty/mkldnn - For details, see, 3rdparty/mkldnn/LICENSE - 13. googlemock scripts/generator - For details, see, 3rdparty/googletest/googlemock/scripts/generator/LICENSE + 10. 3rdparty/tvm/dlpack - For details, see, 3rdparty/tvm/3rdparty/dlpack/LICENSE + 11. 3rdparty/tvm/nnvm - For details, see, 3rdparty/tvm/nnvm/LICENSE + 12. 3rdparty/ps-lite - For details, see, 3rdparty/ps-lite/LICENSE + 13. 3rdparty/mkldnn - For details, see, 3rdparty/mkldnn/LICENSE + 14. googlemock scripts/generator - For details, see, 3rdparty/googletest/googlemock/scripts/generator/LICENSE + 15. clojure-package - For details, see, contrib/clojure-package/LICENSE + 16. R-package - For details, see, R-package/LICENSE + 17. ONNX-TensorRT benchmark package - For details, see, 3rdparty/onnx-tensorrt/third_party/onnx/third_party/benchmark/LICENSE ======================================================================================= @@ -239,6 +243,9 @@ 3. tree_lstm - For details, see example/gluon/tree_lstm/LICENSE 4. OpenMP - For details, see 3rdparty/openmp/LICENSE.txt 5. HalideIR - For details, see nnvm/tvm/HalideIR/LICENSE + 6. HalideIR - For details, see 3rdparty/tvm/3rdparty/HalideIR/LICENSE + 7. ONNX-TensorRT - For details, see 3rdparty/onnx-tensorrt/LICENSE + 8. ONNX-TensorRT - For details, see 3rdparty/onnx-tensorrt/third_party/onnx/LICENSE ======================================================================================= @@ -246,7 +253,7 @@ ======================================================================================= 1. 
Moderngpu
-    For details, see, src/operator/contrib/ctc_include/contrib/moderngpu/LICENSE
+    For details, see, 3rdparty/ctc_include/contrib/moderngpu/LICENSE
 
     /******************************************************************************
      * Redistribution and use in source and binary forms, with or without
@@ -559,4 +566,79 @@
     # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
     # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
+    =======================================================================================
+
+    12. Google tests
+    For details, see, 3rdparty/mkldnn/tests/gtests/gtest/LICENSE
+
+    Copyright 2008, Google Inc.
+    All rights reserved.
+
+    Redistribution and use in source and binary forms, with or without
+    modification, are permitted provided that the following conditions are
+    met:
+
+        * Redistributions of source code must retain the above copyright
+    notice, this list of conditions and the following disclaimer.
+        * Redistributions in binary form must reproduce the above
+    copyright notice, this list of conditions and the following disclaimer
+    in the documentation and/or other materials provided with the
+    distribution.
+        * Neither the name of Google Inc. nor the names of its
+    contributors may be used to endorse or promote products derived from
+    this software without specific prior written permission.
+
+    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+    "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+    LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+    A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+    OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+    SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+    LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+    DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+    THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+    (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+    OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+    =======================================================================================
+
+    13. ONNX python bindings
+    For details, see, 3rdparty/onnx-tensorrt/third_party/onnx/third_party/pybind11/LICENSE
+
+    Copyright (c) 2016 Wenzel Jakob, All rights reserved.
+
+    Redistribution and use in source and binary forms, with or without
+    modification, are permitted provided that the following conditions are met:
+
+    1. Redistributions of source code must retain the above copyright notice, this
+       list of conditions and the following disclaimer.
+
+    2. Redistributions in binary form must reproduce the above copyright notice,
+       this list of conditions and the following disclaimer in the documentation
+       and/or other materials provided with the distribution.
+
+    3. Neither the name of the copyright holder nor the names of its contributors
+       may be used to endorse or promote products derived from this software
+       without specific prior written permission.
+
+    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+    ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+    WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+    DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + You are under no obligation whatsoever to provide any bug fixes, patches, or + upgrades to the features, functionality or performance of the source code + ("Enhancements") to anyone; however, if you choose to make your Enhancements + available either publicly, or directly to the author of this software, without + imposing a separate written license agreement for such Enhancements, then you + hereby grant the following license: a non-exclusive, royalty-free perpetual + license to install, use, modify, prepare derivative works, incorporate into + other computer software, distribute, and sublicense such enhancements or + derivative works thereof, in binary and source code form. From 408a55d95c9907dbeee4373c77dc9a9eb0b6095e Mon Sep 17 00:00:00 2001 From: Lin Yuan Date: Thu, 29 Nov 2018 19:23:52 -0800 Subject: [PATCH 05/28] Get the correct include path in pip package (#13452) * add find_include_path API * address reviewer comment * change return type from list to string * add unit test * address reviewer comment * address reviewer comment * address reviewer comment * address reviewer comment * fix include path problem in pip package * add comment * fix lint error * address reviewer comment * address reviewer comment --- python/mxnet/libinfo.py | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/python/mxnet/libinfo.py b/python/mxnet/libinfo.py index 928580148417..ff795f914a4b 100644 --- a/python/mxnet/libinfo.py +++ b/python/mxnet/libinfo.py @@ -96,10 +96,18 @@ def find_include_path(): logging.warning("MXNET_INCLUDE_PATH '%s' doesn't exist", incl_from_env) curr_path = os.path.dirname(os.path.abspath(os.path.expanduser(__file__))) - incl_path = os.path.join(curr_path, '../../include/') - if not os.path.isdir(incl_path): - raise RuntimeError('Cannot find the MXNet include path.\n') - return incl_path + # include path in pip package + pip_incl_path = os.path.join(curr_path, 'include/') + if os.path.isdir(pip_incl_path): + return pip_incl_path + else: + # include path if build from source + src_incl_path = os.path.join(curr_path, '../../include/') + if os.path.isdir(src_incl_path): + return src_incl_path + else: + raise RuntimeError('Cannot find the MXNet include path in either ' + pip_incl_path + + ' or ' + src_incl_path + '\n') # current version From 7b67d8fe7f5d69df9f234e431136bcce2527920b Mon Sep 17 00:00:00 2001 From: Pedro Larroy <928489+larroy@users.noreply.github.com> Date: Fri, 30 Nov 2018 15:50:33 +0100 Subject: [PATCH 06/28] Use ~/.ccache as default ccache directory so is not cache is not erased on reboot (#13431) --- ci/build.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ci/build.py b/ci/build.py index acf277dd2c32..0069392d9a2a 100755 --- a/ci/build.py +++ b/ci/build.py @@ -200,7 +200,7 @@ def default_ccache_dir() -> str: ccache_dir = "/tmp/_mxnet_ccache" os.makedirs(ccache_dir, exist_ok=True) return ccache_dir - return os.path.join(tempfile.gettempdir(), "ci_ccache") + return 
os.path.join(os.path.expanduser("~"), ".ccache") def trim_container_id(cid): From f9e661ee5580a953238c1e226763b66d80ed7c51 Mon Sep 17 00:00:00 2001 From: Chaitanya Prakash Bapat Date: Fri, 30 Nov 2018 09:47:53 -0800 Subject: [PATCH 07/28] Skip flaky test https://github.com/apache/incubator-mxnet/issues/13446 (#13480) --- tests/python/unittest/test_random.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/python/unittest/test_random.py b/tests/python/unittest/test_random.py index 3436e9a9e80e..3026d31c0f96 100644 --- a/tests/python/unittest/test_random.py +++ b/tests/python/unittest/test_random.py @@ -860,6 +860,7 @@ def test_randint_extremes(): assert a>=50000000 and a<=50000010 @with_seed() +@unittest.skip("Flaky test: https://github.com/apache/incubator-mxnet/issues/13446") def test_randint_generator(): ctx = mx.context.current_context() for dtype in ['int32', 'int64']: From b902878fb1f91a806831f383b87e58a33e5d25fa Mon Sep 17 00:00:00 2001 From: "Joshua Z. Zhang" Date: Fri, 30 Nov 2018 10:09:47 -0800 Subject: [PATCH 08/28] Rewrite dataloader with process pool, improves responsiveness and reliability (#13447) * fix recordio.py * rewrite dataloader with pool * fix batch as tuple * fix prefetching * fix pylint * picklable function * use pickle * add missing commit --- python/mxnet/gluon/data/dataloader.py | 223 ++++++++++++++++++++++---- python/mxnet/recordio.py | 17 ++ 2 files changed, 209 insertions(+), 31 deletions(-) diff --git a/python/mxnet/gluon/data/dataloader.py b/python/mxnet/gluon/data/dataloader.py index 86cb835f5128..ad0f534d16dd 100644 --- a/python/mxnet/gluon/data/dataloader.py +++ b/python/mxnet/gluon/data/dataloader.py @@ -36,7 +36,6 @@ from . import sampler as _sampler from ... import nd, context -from ...recordio import MXRecordIO if sys.platform == 'darwin' or sys.platform == 'win32': def rebuild_ndarray(*args): @@ -159,29 +158,9 @@ def _as_in_context(data, ctx): return [_as_in_context(d, ctx) for d in data] return data -def _recursive_fork_recordio(obj, depth, max_depth=1000): - """Recursively find instance of MXRecordIO and reset file handler. - This is required for MXRecordIO which holds a C pointer to a opened file after fork. 
- """ - if depth >= max_depth: - return - if isinstance(obj, MXRecordIO): - obj.close() - obj.open() # re-obtain file hanlder in new process - elif (hasattr(obj, '__dict__')): - for _, v in obj.__dict__.items(): - _recursive_fork_recordio(v, depth + 1, max_depth) - -def worker_loop(dataset, key_queue, data_queue, batchify_fn): - """Worker loop for multiprocessing DataLoader.""" - # re-fork a new recordio handler in new process if applicable - # for a dataset with transform function, the depth of MXRecordIO is 1 - # for a lazy transformer, the depth is 2 - # for a user defined transformer, the depth is unknown, try a reasonable depth - limit = sys.getrecursionlimit() - max_recursion_depth = min(limit - 5, max(10, limit // 2)) - _recursive_fork_recordio(dataset, 0, max_recursion_depth) +def worker_loop_v1(dataset, key_queue, data_queue, batchify_fn): + """Worker loop for multiprocessing DataLoader.""" while True: idx, samples = key_queue.get() if idx is None: @@ -189,7 +168,7 @@ def worker_loop(dataset, key_queue, data_queue, batchify_fn): batch = batchify_fn([dataset[i] for i in samples]) data_queue.put((idx, batch)) -def fetcher_loop(data_queue, data_buffer, pin_memory=False, data_buffer_lock=None): +def fetcher_loop_v1(data_queue, data_buffer, pin_memory=False, data_buffer_lock=None): """Fetcher loop for fetching data from queue and put in reorder dict.""" while True: idx, batch = data_queue.get() @@ -206,10 +185,10 @@ def fetcher_loop(data_queue, data_buffer, pin_memory=False, data_buffer_lock=Non data_buffer[idx] = batch -class _MultiWorkerIter(object): - """Interal multi-worker iterator for DataLoader.""" +class _MultiWorkerIterV1(object): + """Internal multi-worker iterator for DataLoader.""" def __init__(self, num_workers, dataset, batchify_fn, batch_sampler, pin_memory=False, - worker_fn=worker_loop): + worker_fn=worker_loop_v1): assert num_workers > 0, "_MultiWorkerIter is not for {} workers".format(num_workers) self._num_workers = num_workers self._dataset = dataset @@ -237,7 +216,7 @@ def __init__(self, num_workers, dataset, batchify_fn, batch_sampler, pin_memory= self._workers = workers self._fetcher = threading.Thread( - target=fetcher_loop, + target=fetcher_loop_v1, args=(self._data_queue, self._data_buffer, pin_memory, self._data_buffer_lock)) self._fetcher.daemon = True self._fetcher.start() @@ -299,7 +278,7 @@ def shutdown(self): self._shutdown = True -class DataLoader(object): +class DataLoaderV1(object): """Loads data from a dataset and returns mini-batches of data. 
Parameters

@@ -390,8 +369,190 @@ def same_process_iter():
             return same_process_iter()
 
         # multi-worker
-        return _MultiWorkerIter(self._num_workers, self._dataset,
-                                self._batchify_fn, self._batch_sampler, self._pin_memory)
+        return _MultiWorkerIterV1(self._num_workers, self._dataset,
+                                  self._batchify_fn, self._batch_sampler, self._pin_memory)
 
     def __len__(self):
         return len(self._batch_sampler)
 
+_worker_dataset = None
+def _worker_initializer(dataset):
+    """Initializer for the processing pool."""
+    # the global dataset is per-process and only available in worker processes;
+    # this is only necessary to handle MXIndexedRecordIO, because otherwise the
+    # dataset could simply be passed as an argument
+    global _worker_dataset
+    _worker_dataset = dataset
+
+def _worker_fn(samples, batchify_fn):
+    """Function for processing data in a worker process."""
+    # each worker process must fork a new MXIndexedRecordIO handle;
+    # keeping the dataset as a global variable avoids significant overhead
+    # and is safe in the new process
+    global _worker_dataset
+    batch = batchify_fn([_worker_dataset[i] for i in samples])
+    buf = io.BytesIO()
+    ForkingPickler(buf, pickle.HIGHEST_PROTOCOL).dump(batch)
+    return buf.getvalue()
+
+class _MultiWorkerIter(object):
+    """Internal multi-worker iterator for DataLoader."""
+    def __init__(self, worker_pool, batchify_fn, batch_sampler, pin_memory=False,
+                 worker_fn=_worker_fn, prefetch=0):
+        self._worker_pool = worker_pool
+        self._batchify_fn = batchify_fn
+        self._batch_sampler = batch_sampler
+        self._data_buffer = {}
+        self._rcvd_idx = 0
+        self._sent_idx = 0
+        self._iter = iter(self._batch_sampler)
+        self._worker_fn = worker_fn
+        self._pin_memory = pin_memory
+        # pre-fetch
+        for _ in range(prefetch):
+            self._push_next()
+
+    def __len__(self):
+        return len(self._batch_sampler)
+
+    def _push_next(self):
+        """Assign next batch workload to workers."""
+        r = next(self._iter, None)
+        if r is None:
+            return
+        async_ret = self._worker_pool.apply_async(self._worker_fn, (r, self._batchify_fn))
+        self._data_buffer[self._sent_idx] = async_ret
+        self._sent_idx += 1
+
+    def __next__(self):
+        self._push_next()
+        if self._rcvd_idx == self._sent_idx:
+            assert not self._data_buffer, "Data buffer should be empty at this moment"
+            raise StopIteration
+
+        assert self._rcvd_idx < self._sent_idx, "rcvd_idx must be smaller than sent_idx"
+        assert self._rcvd_idx in self._data_buffer, "fatal error with _push_next, rcvd_idx missing"
+        ret = self._data_buffer.pop(self._rcvd_idx)
+        batch = pickle.loads(ret.get())
+        if self._pin_memory:
+            batch = _as_in_context(batch, context.cpu_pinned())
+        batch = batch[0] if len(batch) == 1 else batch
+        self._rcvd_idx += 1
+        return batch
+
+    def next(self):
+        return self.__next__()
+
+    def __iter__(self):
+        return self
+
+
+class DataLoader(object):
+    """Loads data from a dataset and returns mini-batches of data.
+
+    Parameters
+    ----------
+    dataset : Dataset
+        Source dataset. Note that numpy and mxnet arrays can be directly used
+        as a Dataset.
+    batch_size : int
+        Size of mini-batch.
+    shuffle : bool
+        Whether to shuffle the samples.
+    sampler : Sampler
+        The sampler to use. Either specify sampler or shuffle, not both.
+    last_batch : {'keep', 'discard', 'rollover'}
+        How to handle the last batch if batch_size does not evenly divide
+        `len(dataset)`.
+
+        keep - A batch with fewer samples than previous batches is returned.
+        discard - The last batch is discarded if it's incomplete.
+        rollover - The remaining samples are rolled over to the next epoch.
+    batch_sampler : Sampler
+        A sampler that returns mini-batches. Do not specify batch_size,
+        shuffle, sampler, and last_batch if batch_sampler is specified.
+    batchify_fn : callable
+        Callback function to allow users to specify how to merge samples
+        into a batch. Defaults to `default_batchify_fn`::
+
+            def default_batchify_fn(data):
+                if isinstance(data[0], nd.NDArray):
+                    return nd.stack(*data)
+                elif isinstance(data[0], tuple):
+                    data = zip(*data)
+                    return [default_batchify_fn(i) for i in data]
+                else:
+                    data = np.asarray(data)
+                    return nd.array(data, dtype=data.dtype)
+
+    num_workers : int, default 0
+        The number of multiprocessing workers to use for data preprocessing.
+    pin_memory : boolean, default False
+        If ``True``, the dataloader will copy NDArrays into pinned memory
+        before returning them. Copying from CPU pinned memory to GPU is faster
+        than from normal CPU memory.
+    prefetch : int, default is `num_workers * 2`
+        The number of batches to prefetch; only used if `num_workers` > 0.
+        If `prefetch` > 0, worker processes are allowed to prepare batches
+        before they are requested from the iterator.
+        Note that a larger prefetch value gives smoother bootstrapping performance,
+        but consumes more shared memory. A smaller value may forfeit the benefit of
+        using multiple worker processes; consider reducing `num_workers` in that case.
+        The default is `num_workers * 2`.
+    """
+    def __init__(self, dataset, batch_size=None, shuffle=False, sampler=None,
+                 last_batch=None, batch_sampler=None, batchify_fn=None,
+                 num_workers=0, pin_memory=False, prefetch=None):
+        self._dataset = dataset
+        self._pin_memory = pin_memory
+
+        if batch_sampler is None:
+            if batch_size is None:
+                raise ValueError("batch_size must be specified unless " \
+                                 "batch_sampler is specified")
+            if sampler is None:
+                if shuffle:
+                    sampler = _sampler.RandomSampler(len(dataset))
+                else:
+                    sampler = _sampler.SequentialSampler(len(dataset))
+            elif shuffle:
+                raise ValueError("shuffle must not be specified if sampler is specified")
+
+            batch_sampler = _sampler.BatchSampler(
+                sampler, batch_size, last_batch if last_batch else 'keep')
+        elif batch_size is not None or shuffle or sampler is not None or \
+                last_batch is not None:
+            raise ValueError("batch_size, shuffle, sampler and last_batch must " \
+                             "not be specified if batch_sampler is specified.")
+
+        self._batch_sampler = batch_sampler
+        self._num_workers = num_workers if num_workers >= 0 else 0
+        self._worker_pool = None
+        self._prefetch = max(0, int(prefetch) if prefetch is not None else 2 * self._num_workers)
+        if self._num_workers > 0:
+            self._worker_pool = multiprocessing.Pool(
+                self._num_workers, initializer=_worker_initializer, initargs=[self._dataset])
+        if batchify_fn is None:
+            if num_workers > 0:
+                self._batchify_fn = default_mp_batchify_fn
+            else:
+                self._batchify_fn = default_batchify_fn
+        else:
+            self._batchify_fn = batchify_fn
+
+    def __iter__(self):
+        if self._num_workers == 0:
+            def same_process_iter():
+                for batch in self._batch_sampler:
+                    ret = self._batchify_fn([self._dataset[idx] for idx in batch])
+                    if self._pin_memory:
+                        ret = _as_in_context(ret, context.cpu_pinned())
+                    yield ret
+            return same_process_iter()
+
+        # multi-worker
+        return _MultiWorkerIter(self._worker_pool, self._batchify_fn, self._batch_sampler,
+                                pin_memory=self._pin_memory, worker_fn=_worker_fn,
+                                prefetch=self._prefetch)
 
     def __len__(self):
         return len(self._batch_sampler)
diff --git a/python/mxnet/recordio.py b/python/mxnet/recordio.py
index 
2def141c9340..bdc63235d702 100644 --- a/python/mxnet/recordio.py +++ b/python/mxnet/recordio.py @@ -18,6 +18,7 @@ """Read and write for the RecordIO data format.""" from __future__ import absolute_import from collections import namedtuple +from multiprocessing import current_process import ctypes import struct @@ -65,6 +66,7 @@ def __init__(self, uri, flag): self.uri = c_str(uri) self.handle = RecordIOHandle() self.flag = flag + self.pid = None self.is_open = False self.open() @@ -78,6 +80,7 @@ def open(self): self.writable = False else: raise ValueError("Invalid flag %s"%self.flag) + self.pid = current_process().pid self.is_open = True def __del__(self): @@ -109,6 +112,14 @@ def __setstate__(self, d): if is_open: self.open() + def _check_pid(self, allow_reset=False): + """Check process id to ensure integrity, reset if in new process.""" + if not self.pid == current_process().pid: + if allow_reset: + self.reset() + else: + raise RuntimeError("Forbidden operation in multiple processes") + def close(self): """Closes the record file.""" if not self.is_open: @@ -118,6 +129,7 @@ def close(self): else: check_call(_LIB.MXRecordIOReaderFree(self.handle)) self.is_open = False + self.pid = None def reset(self): """Resets the pointer to first item. @@ -156,6 +168,7 @@ def write(self, buf): Buffer to write. """ assert self.writable + self._check_pid(allow_reset=False) check_call(_LIB.MXRecordIOWriterWriteRecord(self.handle, ctypes.c_char_p(buf), ctypes.c_size_t(len(buf)))) @@ -182,6 +195,9 @@ def read(self): Buffer read. """ assert not self.writable + # trying to implicitly read from multiple processes is forbidden, + # there's no elegant way to handle unless lock is introduced + self._check_pid(allow_reset=False) buf = ctypes.c_char_p() size = ctypes.c_size_t() check_call(_LIB.MXRecordIOReaderReadRecord(self.handle, @@ -255,6 +271,7 @@ def seek(self, idx): This function is internally called by `read_idx(idx)` to find the current reader pointer position. It doesn't return anything.""" assert not self.writable + self._check_pid(allow_reset=True) pos = ctypes.c_size_t(self.idx[idx]) check_call(_LIB.MXRecordIOReaderSeek(self.handle, pos)) From 819a04a7ad358d4d70153c3db51c066c3b6af228 Mon Sep 17 00:00:00 2001 From: Aaron Markham Date: Fri, 30 Nov 2018 10:41:07 -0800 Subject: [PATCH 09/28] Fix errors in docstrings for subgraph op; use code directive (#13463) --- src/operator/contrib/dgl_graph.cc | 52 ++++++++++++++++++------------- 1 file changed, 31 insertions(+), 21 deletions(-) diff --git a/src/operator/contrib/dgl_graph.cc b/src/operator/contrib/dgl_graph.cc index 1bb47b89bdea..74ad3d435648 100644 --- a/src/operator/contrib/dgl_graph.cc +++ b/src/operator/contrib/dgl_graph.cc @@ -1118,19 +1118,24 @@ sets of vertices as input. For each set of vertices, it returns a pair of CSR matrices if return_mapping is True: the first matrix contains edges with new edge Ids, the second matrix contains edges with the original edge Ids. -Example:: - x=[[1, 0, 0, 2], - [3, 0, 4, 0], - [0, 5, 0, 0], - [0, 6, 7, 0]] - v = [0, 1, 2] - dgl_subgraph(x, v, return_mapping=True) = - [[1, 0, 0], - [2, 0, 3], - [0, 4, 0]], - [[1, 0, 0], - [3, 0, 4], - [0, 5, 0]] + +Example: + + .. 
code:: python + + x=[[1, 0, 0, 2], + [3, 0, 4, 0], + [0, 5, 0, 0], + [0, 6, 7, 0]] + v = [0, 1, 2] + dgl_subgraph(x, v, return_mapping=True) = + [[1, 0, 0], + [2, 0, 3], + [0, 4, 0]], + [[1, 0, 0], + [3, 0, 4], + [0, 5, 0]] + )code" ADD_FILELINE) .set_attr_parser(ParamParser) .set_num_inputs([](const NodeAttrs& attrs) { @@ -1296,13 +1301,17 @@ NNVM_REGISTER_OP(_contrib_edge_id) stored in a CSR matrix (the value of the CSR stores the edge Id of the graph). output[i] = input[u[i], v[i]] if there is an edge between u[i] and v[i]], otherwise output[i] will be -1. Both u and v should be 1D vectors. -Example:: - x = [[ 1, 0, 0 ], - [ 0, 2, 0 ], - [ 0, 0, 3 ]] - u = [ 0, 0, 1, 1, 2, 2 ] - v = [ 0, 1, 1, 2, 0, 2 ] - edge_id(x, u, v) = [ 1, -1, 2, -1, -1, 3 ] + +Example: + + .. code:: python + + x = [[ 1, 0, 0 ], + [ 0, 2, 0 ], + [ 0, 0, 3 ]] + u = [ 0, 0, 1, 1, 2, 2 ] + v = [ 0, 1, 1, 2, 0, 2 ] + edge_id(x, u, v) = [ 1, -1, 2, -1, -1, 3 ] The storage type of ``edge_id`` output depends on storage types of inputs - edge_id(csr, default, default) = default @@ -1367,7 +1376,8 @@ NNVM_REGISTER_OP(_contrib_dgl_adjacency) .describe(R"code(This operator converts a CSR matrix whose values are edge Ids to an adjacency matrix whose values are ones. The output CSR matrix always has the data value of float32. -Example:: + +Example: x = [[ 1, 0, 0 ], [ 0, 2, 0 ], From ddf6980fc0880d734844fdb7a5f4a7abfe5c174e Mon Sep 17 00:00:00 2001 From: Naveen Swamy Date: Fri, 30 Nov 2018 10:54:12 -0800 Subject: [PATCH 10/28] [MXNET-1158] JVM Memory Management Documentation (#13105) * update train_mnist * Add documentation for JVM Memory Management * update doc * address nit picks * address nit picks * Grammar and clarity edits for memory management doc * Edits for scala memory management * Update memory-management.md * Update memory-management.md * Update memory-management.md * capitalization fix --- .../examples/scripts/run_train_mnist.sh | 24 +++- scala-package/memory-management.md | 118 ++++++++++++++++++ 2 files changed, 138 insertions(+), 4 deletions(-) create mode 100644 scala-package/memory-management.md diff --git a/scala-package/examples/scripts/run_train_mnist.sh b/scala-package/examples/scripts/run_train_mnist.sh index ea53c1ade66f..d27b7cbb3657 100755 --- a/scala-package/examples/scripts/run_train_mnist.sh +++ b/scala-package/examples/scripts/run_train_mnist.sh @@ -19,15 +19,31 @@ set -e +hw_type=cpu +if [[ $1 = gpu ]] +then + hw_type=gpu +fi + +platform=linux-x86_64 + +if [[ $OSTYPE = [darwin]* ]] +then + platform=osx-x86_64 + hw_type=cpu +fi + MXNET_ROOT=$(cd "$(dirname $0)/../../.."; pwd) echo $MXNET_ROOT -CLASS_PATH=$MXNET_ROOT/scala-package/assembly/linux-x86_64-cpu/target/*:$MXNET_ROOT/scala-package/examples/target/*:$MXNET_ROOT/scala-package/examples/target/classes/lib/*:$MXNET_ROOT/scala-package/infer/target/* +CLASS_PATH=$MXNET_ROOT/scala-package/assembly/$platform-$hw_type/target/*:$MXNET_ROOT/scala-package/examples/target/*:$MXNET_ROOT/scala-package/examples/target/classes/lib/* # model dir DATA_PATH=$2 -java -XX:+PrintGC -Xms256M -Xmx512M -Dmxnet.traceLeakedObjects=false -cp $CLASS_PATH \ - org.apache.mxnetexamples.imclassification.TrainMnist \ - --data-dir /home/ubuntu/mxnet_scala/scala-package/examples/mnist/ \ +java -XX:+PrintGC -Dmxnet.traceLeakedObjects=false -cp $CLASS_PATH \ + org.apache.mxnetexamples.imclassification.TrainModel \ + --data-dir $MXNET_ROOT/scala-package/examples/mnist/ \ + --network mlp \ + --num-layers 50 \ --num-epochs 10000000 \ --batch-size 1024 \ No newline at end of file 
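
With the platform and hardware detection added above, the script now takes the device type as its first argument; a usage sketch (the invocation and data path below are illustrative, following the `$1`/`$2` handling in the diff):

```bash
# Train the MLP example on GPU; pass "cpu" (or nothing, since cpu is the
# default) on CPU-only machines. The second argument is captured as DATA_PATH.
./scala-package/examples/scripts/run_train_mnist.sh gpu ~/mxnet-data/mnist
```
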
diff --git a/scala-package/memory-management.md b/scala-package/memory-management.md
new file mode 100644
index 000000000000..33c36b6e6ab0
--- /dev/null
+++ b/scala-package/memory-management.md
@@ -0,0 +1,118 @@
+# JVM Memory Management
+The Scala and Java bindings of Apache MXNet use native memory (memory from the C++ heap in either RAM or GPU memory) for most of the MXNet objects such as NDArray, Symbol, Executor, KVStore, Data Iterators, etc.
+The associated Scala classes act only as wrappers. The operations done on these wrapper objects are then directed to the high performance MXNet C++ backend via the Java Native Interface (JNI). Therefore, the bytes are stored in the C++ native heap which allows for fast access.
+
+However, the JVM Garbage Collector only manages objects allocated in the JVM Heap and is not aware of the memory footprint of these objects in the native memory. Hence, the allocation/deallocation of native memory must be managed by MXNet Scala.
+Allocating native memory is straightforward and is done during the construction of the object by calling the associated C++ API through JNI. However, since JVM languages do not have destructors, the deallocation of these objects must be done explicitly.
+MXNet Scala provides a few easy modes of operation which are explained in detail below.
+
+## Memory Management in Scala
+### 1. [ResourceScope.using](https://github.com/apache/incubator-mxnet/blob/master/scala-package/core/src/main/scala/org/apache/mxnet/ResourceScope.scala#L106) (Recommended)
+`ResourceScope.using` provides the familiar Java try-with-resources primitive in Scala and will automatically manage the memory of all the MXNet objects created in the associated code block (`body`). It works by tracking the allocations performed inside the code block and deallocating them when exiting the block.
+Passing MXNet objects out of a using block can be easily accomplished by simply returning an object or an iterable containing multiple MXNet objects. If you have nested using blocks, then the returned objects will be moved into the parent scope as well.
+
+**Usage**
+```scala
+ResourceScope.using() {
+    val (r3, r4) = ResourceScope.using() {
+        val r1 = NDArray.ones(Shape(2, 2))
+        val r2 = NDArray.ones(Shape(3, 4))
+        val r3 = NDArray.ones(Shape(5, 6))
+        val r4 = NDArray.ones(Shape(7, 8))
+        (r3, r4)
+    }
+    r4
+}
+```
+In the example above, we have two ResourceScopes stacked together. In the inner scope, 4 NDArrays `(r1, r2, r3, r4)` are created and the NDArrays
+`(r3, r4)` are returned. The inner ResourceScope recognizes that it should not deallocate these returned objects and automatically moves `r3` and `r4` to the outer scope. When the outer scope
+returns `r4` from its code block, it will only deallocate `r3` and will remove `r4` from its list of objects to be deallocated. All other objects are automatically released by calling the C++ backend to free the native memory.
+
+**Note:**
+You should consider nesting ResourceScopes when you have layers of functionality in your application code or create a lot of MXNet objects such as NDArrays.
+For example, holding onto all the memory that is created for an entire training loop can result in running out of memory, especially when training on GPUs which might only have 8 to 16 GB.
+It is recommended not to use a single ResourceScope block which spans the entire training code. 
+You should instead nest multiple scopes: an innermost scope where you run forward-backward passes on each batch, a middle scope for each epoch, and an outer scope that runs the entire training script. This is demonstrated in the example below:
+
+```scala
+ResourceScope.using() {
+    val m = Module()
+    m.bind()
+    val k = KVStore(...)
+    ResourceScope.using() {
+        val itr = MXIterator(..)
+        val num_epochs: Int = 100
+        //...
+        for (i <- 0 until num_epochs) {
+            ResourceScope.using() { // epoch scope
+                itr.reset()
+                while (itr.hasNext) {
+                    ResourceScope.using() { // batch scope
+                        val dataBatch = itr.next()
+                        m.forward(dataBatch)
+                        m.backward(dataBatch)
+                        m.update()
+                    }
+                }
+            }
+        }
+    }
+}
+```
+
+### 2. Using Phantom References (Recommended for some use cases)
+
+Apache MXNet uses [Phantom References](https://docs.oracle.com/javase/8/docs/api/java/lang/ref/PhantomReference.html) to track all MXNet objects that have native memory associated with them.
+When the Garbage Collector runs, it identifies unreachable Scala/Java objects in the JVM heap and finalizes them.
+It then enqueues objects which are ready to be reclaimed into a reference queue. We take advantage of this and do a
+pre-mortem cleanup on these wrapper objects by freeing the corresponding native memory as well.
+
+This approach is automatic and does not require any special code to clean up the native memory. However, the Garbage Collector is not aware of the potentially large amount of native memory used, and therefore may not free up memory often enough with its standard behavior.
+You can control the frequency of garbage collection by calling System.gc() at strategic points, such as the end of an epoch or the end of a mini-batch.
+
+This approach could be suitable for some use cases, such as inference on CPUs where you have a large amount of memory (RAM) on your system.
+
+**Note:**
+Calling GC too frequently can also cause your application to perform poorly. This approach might not be suitable
+for use cases which quickly allocate a large number of large NDArrays, such as when training a GAN model.
+
+### 3. Using the dispose Pattern (least recommended)
+
+There might be situations where you want to manually manage the lifecycle of Apache MXNet objects. For such use cases, we have provided the `dispose()` method, which will manually deallocate the associated native memory when called. We have also
+made all MXNet objects [AutoCloseable](https://docs.oracle.com/javase/8/docs/api/java/lang/AutoCloseable.html). If you are using Java 7 or above, you can use them with try-with-resources or call close() in a finally block, as sketched after the example below.
+
+**Note:**
+We recommend you avoid manually managing MXNet objects and instead use `ResourceScope.using`. Manual management makes code less readable and can leak memory if you forget to call dispose (the memory is then reclaimed only when the Garbage Collector processes the object's Phantom Reference).
+
+```scala
+def showDispose(): Unit = {
+    val r = NDArray.ones(Shape(2, 2))
+    r.dispose()
+}
+```
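+
+As noted above, MXNet objects also implement `AutoCloseable`, so try-with-resources can release the native memory for you instead of an explicit `dispose()` call. A minimal sketch from Java (the `NDArray.ones(new Shape(2, 2))` call mirrors the Java examples in the next section and is an assumption here; `close()` is expected to delegate to `dispose()`):
+
+```java
+// Sketch of the AutoCloseable route: the NDArray's native memory is
+// released when the block exits, even if the body throws.
+try (NDArray test = NDArray.ones(new Shape(2, 2))) {
+    // use test here; no explicit dispose() call is needed
+}
+```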
+## Memory Management in Java
+Memory management in MXNet Java is similar to Scala. We recommend you use [ResourceScope](https://github.com/apache/incubator-mxnet/blob/master/scala-package/core/src/main/scala/org/apache/mxnet/ResourceScope.scala#L32) in a `try-with-resources` block or in a `try-finally` block.
+The [try-with-resources](https://docs.oracle.com/javase/tutorial/essential/exceptions/tryResourceClose.html) statement tracks the resources declared in the try block and automatically closes them upon exiting (supported from Java 7 onwards).
+The ResourceScope discussed above implements AutoCloseable and tracks all MXNet objects created at a thread-local scope level.
+
+```java
+try (ResourceScope scope = new ResourceScope()) {
+    NDArray test = NDArray.ones(new Shape(2, 2));
+}
+```
+or
+```java
+ResourceScope scope = new ResourceScope();
+try {
+    NDArray test = NDArray.ones(new Shape(2, 2));
+} finally {
+    scope.close();
+}
+```
+
+**Note:**
+A ResourceScope within a try block tracks all MXNet native object allocations (NDArray, Symbol, Executor, etc.) and deallocates them at
+the end of the try block. This is also true of the objects that are returned, e.g., in the example above, the native memory associated with `test` would be deallocated even if it were to be returned.
+If you use the object outside of the try block, the process might crash due to illegal memory access.
+
+To retain certain objects created within try blocks, you should explicitly remove them from the scope by calling `scope.moveToOuterScope`, as sketched below.
+It is highly recommended to nest multiple try-with-resources ResourceScopes so you do not have to explicitly manage the lifecycle of the native objects.
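+
+A minimal sketch of retaining one result this way (the exact argument convention of `moveToOuterScope`, and the `Shape`/`ones` calls, are assumptions based on the references above):
+
+```java
+NDArray kept = null;
+try (ResourceScope scope = new ResourceScope()) {
+    NDArray tmp = NDArray.ones(new Shape(2, 2));  // released when the scope closes
+    kept = NDArray.ones(new Shape(2, 2));
+    scope.moveToOuterScope(kept);  // assumed to hand 'kept' to the enclosing scope
+}
+// kept remains valid here; dispose it yourself if there is no enclosing scope
+```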
From 4d342ef297aeb5f07ccf13cf1a0e1cdff701097b Mon Sep 17 00:00:00 2001 From: Haibin Lin Date: Fri, 30 Nov 2018 11:58:16 -0800 Subject: [PATCH 11/28] Update row_sparse tutorial (#13414) Update row_sparse tutorial --- docs/tutorials/sparse/row_sparse.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/docs/tutorials/sparse/row_sparse.md b/docs/tutorials/sparse/row_sparse.md index 27cc0d3d903e..46a5edad075e 100644 --- a/docs/tutorials/sparse/row_sparse.md +++ b/docs/tutorials/sparse/row_sparse.md @@ -459,7 +459,7 @@ Note that warning messages will be printed when such a storage fallback event ha
## Sparse Optimizers
-In MXNet, sparse gradient updates are applied when weight, state and gradient are all in `row_sparse` storage.
+In MXNet, sparse gradient updates are applied when gradient is in `row_sparse` storage and the optimizer is created with `lazy_update=True`.
The sparse optimizers only update the row slices of the weight and the states whose indices appear in `gradient.indices`.
For example, the default update rule for SGD optimizer is:
From fb92a66cd621d48d304e6538f62729941e2d2675 Mon Sep 17 00:00:00 2001 From: Sina Afrooze Date: Fri, 30 Nov 2018 13:33:16 -0800 Subject: [PATCH 12/28] Add resiliency to onnx export code (#13426) * Added resiliency to onnx export code - With previous infer-shape implementation, if input shape was list instead of tuple or if extra non-existent parameters were provided, the code would still work. The fixes in this commit make sure that behavior is restored to prevent any compatibility issues with existing export code. * Fixed name of net in unittest * Fix pylint --- .../mxnet/contrib/onnx/mx2onnx/export_onnx.py | 5 +++-- .../onnx/export/mxnet_export_test.py | 21 +++++++++++++++++-- 2 files changed, 22 insertions(+), 4 deletions(-)
diff --git a/python/mxnet/contrib/onnx/mx2onnx/export_onnx.py b/python/mxnet/contrib/onnx/mx2onnx/export_onnx.py index 14c674f56f2d..84db5decd503 100644 --- a/python/mxnet/contrib/onnx/mx2onnx/export_onnx.py +++ b/python/mxnet/contrib/onnx/mx2onnx/export_onnx.py @@ -134,9 +134,10 @@ def get_outputs(sym, params, in_shape, in_label): # remove any input listed in params from sym.list_inputs() and bind them to the input shapes provided # by user. Also remove in_label, which is the name of the label symbol that may have been used # as the label for loss during training.
- inputs = {n: s for n, s in zip([n for n in sym.list_inputs() if n not in params and n != in_label], in_shape)} + inputs = {n: tuple(s) for n, s in zip([n for n in sym.list_inputs() if n not in params and n != in_label], + in_shape)} # Add params and their shape to list of inputs - inputs.update({n: v.shape for n, v in params.items()}) + inputs.update({n: v.shape for n, v in params.items() if n in sym.list_inputs()}) # Provide input data as well as input params to infer_shape() _, out_shapes, _ = sym.infer_shape(**inputs)
diff --git a/tests/python-pytest/onnx/export/mxnet_export_test.py b/tests/python-pytest/onnx/export/mxnet_export_test.py index f4144fd6c7fa..964d0e760cae 100644 --- a/tests/python-pytest/onnx/export/mxnet_export_test.py +++ b/tests/python-pytest/onnx/export/mxnet_export_test.py @@ -286,18 +286,19 @@ def _optional_group(symbols, group=False): return symbols -def _check_onnx_export(net, group_outputs=False): +def _check_onnx_export(net, group_outputs=False, shape_type=tuple, extra_params={}): net.initialize() data = nd.random.uniform(0, 1, (1, 1024)) output = _force_list(net(data)) # initialize weights net_sym = _optional_group(net(sym.Variable('data')), group_outputs) net_params = {name:param._reduce() for name, param in net.collect_params().items()} + net_params.update(extra_params) with tempfile.TemporaryDirectory() as tmpdirname: onnx_file_path = os.path.join(tmpdirname, 'net.onnx') export_path = onnx_mxnet.export_model( sym=net_sym, params=net_params, - input_shape=[data.shape], + input_shape=[shape_type(data.shape)], onnx_file_path=onnx_file_path) assert export_path == onnx_file_path # Try importing the model to symbol @@ -340,6 +341,22 @@ def hybrid_forward(self, F, x): _check_onnx_export(net, group_outputs=True) +@with_seed() +def test_onnx_export_list_shape(): + net = nn.HybridSequential(prefix='list_shape_net') + with net.name_scope(): + net.add(nn.Dense(100, activation='relu'), nn.Dense(10)) + _check_onnx_export(net, shape_type=list) + + +@with_seed() +def test_onnx_export_extra_params(): + net = nn.HybridSequential(prefix='extra_params_net') + with net.name_scope(): + net.add(nn.Dense(100, activation='relu'), nn.Dense(10)) + _check_onnx_export(net, extra_params={'extra_param': nd.array([1, 2])}) + + if __name__ == '__main__': test_models("bvlc_googlenet", (1, 3, 224, 224), (1, 1000)) test_models("bvlc_reference_caffenet", (1, 3, 224, 224), (1, 1000))
From 0bb26acf2709d690d9f5906075abb9cc9be04cd4 Mon Sep 17 00:00:00 2001 From: Lin Yuan Date: Fri, 30 Nov 2018 21:48:20 -0800 Subject: [PATCH 13/28] [MXNET-1185] Support large array in several operators (part 1) (#13418) * fix a few operators with large arrays (# of elements) * fix bug in broadcast_div and add tests * address reviewer comment * add unit test * add empty line * retrigger CI --- src/operator/elemwise_op_common.h | 8 +- src/operator/mxnet_op.h | 68 +++--- src/operator/random/sampler.h | 43 ++-- src/operator/tensor/broadcast_reduce-inl.h | 94 ++++---- .../tensor/elemwise_binary_broadcast_op.h | 14 +- src/operator/tensor/indexing_op.cc | 26 +-- src/operator/tensor/indexing_op.cu | 10 +- src/operator/tensor/indexing_op.h | 39 ++-- src/operator/tensor/init_op.h | 6 +- src/operator/tensor/matrix_op-inl.h | 219 +++++++++--------- tests/nightly/test_large_array.py | 128 +++++++++- 11 files changed, 384 insertions(+), 271 deletions(-)
diff --git a/src/operator/elemwise_op_common.h b/src/operator/elemwise_op_common.h index cf44da699156..4b8663bba6ea 100644 --- a/src/operator/elemwise_op_common.h +++
b/src/operator/elemwise_op_common.h @@ -100,7 +100,7 @@ inline bool ElemwiseStorageAttr(const nnvm::NodeAttrs& attrs, * \tparam rsp whether row sparse stype is supported * \tparam rsp whether csr stype is supported */ -template +template inline bool ElemwiseStorageType(const nnvm::NodeAttrs& attrs, const int dev_mask, DispatchMode* dispatch_mode, @@ -115,7 +115,7 @@ inline bool ElemwiseStorageType(const nnvm::NodeAttrs& attrs, template + index_t n_in = -1, index_t n_out = -1> inline bool ElemwiseAttr(const nnvm::NodeAttrs& attrs, std::vector *in_attrs, std::vector *out_attrs, @@ -154,7 +154,7 @@ inline bool ElemwiseAttr(const nnvm::NodeAttrs& attrs, return true; } -template +template inline bool ElemwiseShape(const nnvm::NodeAttrs& attrs, std::vector *in_attrs, std::vector *out_attrs) { @@ -168,7 +168,7 @@ inline bool ElemwiseShape(const nnvm::NodeAttrs& attrs, attrs, in_attrs, out_attrs, TShape()); } -template +template inline bool ElemwiseType(const nnvm::NodeAttrs& attrs, std::vector *in_attrs, std::vector *out_attrs) { diff --git a/src/operator/mxnet_op.h b/src/operator/mxnet_op.h index 5b106afd8d5b..6cab1990858b 100644 --- a/src/operator/mxnet_op.h +++ b/src/operator/mxnet_op.h @@ -289,8 +289,8 @@ inline int get_num_threads(const int N) { /* \brief Compute flattened index given coordinates and shape. */ template -MSHADOW_XINLINE int ravel(const Shape& coord, const Shape& shape) { - int ret = 0; +MSHADOW_XINLINE index_t ravel(const Shape& coord, const Shape& shape) { + index_t ret = 0; #pragma unroll for (int i = 0; i < ndim; ++i) { ret = ret * shape[i] + (shape[i] > coord[i]) * coord[i]; @@ -301,11 +301,11 @@ MSHADOW_XINLINE int ravel(const Shape& coord, const Shape& shape) { /* Compute coordinates from flattened index given shape */ template -MSHADOW_XINLINE Shape unravel(const int idx, const Shape& shape) { +MSHADOW_XINLINE Shape unravel(const index_t idx, const Shape& shape) { Shape ret; #pragma unroll - for (int i = ndim-1, j = idx; i >=0; --i) { - int tmp = j / shape[i]; + for (index_t i = ndim-1, j = idx; i >=0; --i) { + auto tmp = j / shape[i]; ret[i] = j - tmp*shape[i]; j = tmp; } @@ -315,8 +315,8 @@ MSHADOW_XINLINE Shape unravel(const int idx, const Shape& shape) { /* Compute dot product of two vector */ template -MSHADOW_XINLINE int dot(const Shape& coord, const Shape& stride) { - int ret = 0; +MSHADOW_XINLINE index_t dot(const Shape& coord, const Shape& stride) { + index_t ret = 0; #pragma unroll for (int i = 0; i < ndim; ++i) { ret += coord[i] * stride[i]; @@ -327,12 +327,12 @@ MSHADOW_XINLINE int dot(const Shape& coord, const Shape& stride) { /* Combining unravel and dot */ template -MSHADOW_XINLINE int unravel_dot(const int idx, const Shape& shape, +MSHADOW_XINLINE index_t unravel_dot(const index_t idx, const Shape& shape, const Shape& stride) { - int ret = 0; + index_t ret = 0; #pragma unroll - for (int i = ndim-1, j = idx; i >=0; --i) { - int tmp = j / shape[i]; + for (index_t i = ndim-1, j = idx; i >=0; --i) { + auto tmp = j / shape[i]; ret += (j - tmp*shape[i])*stride[i]; j = tmp; } @@ -433,51 +433,51 @@ struct op_with_req { /*! \brief input is one tensor */ template - MSHADOW_XINLINE static void Map(int i, DType *out, const DType *in) { + MSHADOW_XINLINE static void Map(index_t i, DType *out, const DType *in) { KERNEL_ASSIGN(out[i], req, OP::Map(in[i])); } /*! 
\brief inputs are two tensors */ template - MSHADOW_XINLINE static void Map(int i, DType *out, const DType *lhs, const DType *rhs) { + MSHADOW_XINLINE static void Map(index_t i, DType *out, const DType *lhs, const DType *rhs) { KERNEL_ASSIGN(out[i], req, OP::Map(lhs[i], rhs[i])); } /*! \brief input is tensor and a scalar value */ template - MSHADOW_XINLINE static void Map(int i, DType *out, const DType *in, const DType value) { + MSHADOW_XINLINE static void Map(index_t i, DType *out, const DType *in, const DType value) { KERNEL_ASSIGN(out[i], req, OP::Map(in[i], value)); } /*! \brief input is tensor and two scalar value */ template - MSHADOW_XINLINE static void Map(int i, DType *out, const DType *in, + MSHADOW_XINLINE static void Map(index_t i, DType *out, const DType *in, const DType value_1, const DType value_2) { KERNEL_ASSIGN(out[i], req, OP::Map(in[i], value_1, value_2)); } /*! \brief No inputs (ie fill to constant value) */ template - MSHADOW_XINLINE static void Map(int i, DType *out) { + MSHADOW_XINLINE static void Map(index_t i, DType *out) { KERNEL_ASSIGN(out[i], req, OP::Map()); } /*! \brief input is single scalar value */ template - MSHADOW_XINLINE static void Map(int i, DType *out, const DType value) { + MSHADOW_XINLINE static void Map(index_t i, DType *out, const DType value) { KERNEL_ASSIGN(out[i], req, OP::Map(value)); } /*! \brief inputs are two tensors and a scalar value */ template - MSHADOW_XINLINE static void Map(int i, DType *out, + MSHADOW_XINLINE static void Map(index_t i, DType *out, const DType *input_1, const DType *input_2, const DType value) { KERNEL_ASSIGN(out[i], req, OP::Map(input_1[i], input_2[i], value)); } /*! \brief inputs are three tensors (ie backward grad with binary grad function) */ template - MSHADOW_XINLINE static void Map(int i, DType *out, + MSHADOW_XINLINE static void Map(index_t i, DType *out, const DType *input_1, const DType *input_2, const DType *input_3) { @@ -503,21 +503,21 @@ struct Kernel { * \param args Varargs to eventually pass to the OP::Map() function */ template - inline static bool Launch(mshadow::Stream *, const int N, Args... args) { + inline static bool Launch(mshadow::Stream *, const size_t N, Args... args) { #ifdef _OPENMP const int omp_threads = engine::OpenMP::Get()->GetRecommendedOMPThreadCount(); if (omp_threads < 2) { - for (int i = 0; i < N; ++i) { + for (size_t i = 0; i < N; ++i) { OP::Map(i, args...); } } else { #pragma omp parallel for num_threads(omp_threads) - for (int i = 0; i < N; ++i) { + for (index_t i = 0; i < static_cast(N); ++i) { OP::Map(i, args...); } } #else - for (int i = 0; i < N; ++i) { + for (size_t i = 0; i < N; ++i) { OP::Map(i, args...); } #endif @@ -567,22 +567,22 @@ struct Kernel { * \param args Varargs to eventually pass to the OP::Map() function */ template - static void LaunchTuned(mshadow::Stream *, const int N, Args... args) { + static void LaunchTuned(mshadow::Stream *, const size_t N, Args... 
args) { #ifdef _OPENMP const int omp_threads = engine::OpenMP::Get()->GetRecommendedOMPThreadCount(); if (omp_threads < 2 || !tuned_op::UseOMP( - static_cast(N), static_cast(omp_threads))) { - for (int i = 0; i < N; ++i) { + N, static_cast(omp_threads))) { + for (size_t i = 0; i < N; ++i) { OP::Map(i, args...); } } else { #pragma omp parallel for num_threads(omp_threads) - for (int i = 0; i < N; ++i) { + for (index_t i = 0; i < static_cast(N); ++i) { OP::Map(i, args...); } } #else - for (int i = 0; i < N; ++i) { + for (size_t i = 0; i < N; ++i) { OP::Map(i, args...); } #endif @@ -596,15 +596,15 @@ struct Kernel { * \param args Varargs to eventually pass to the UseOMP() and OP::Map() functions */ template - inline static void LaunchEx(mshadow::Stream *s, const int N, Args... args) { + inline static void LaunchEx(mshadow::Stream *s, const size_t N, Args... args) { #ifdef _OPENMP const int omp_threads = engine::OpenMP::Get()->GetRecommendedOMPThreadCount(); if (omp_threads < 2) { OP::Map(0, N, args...); } else { - const int length = (N + omp_threads - 1) / omp_threads; + const auto length = (N + omp_threads - 1) / omp_threads; #pragma omp parallel for num_threads(omp_threads) - for (int i = 0; i < N; i += length) { + for (index_t i = 0; i < static_cast(N); i += length) { OP::Map(i, i + length > N ? N - i : length, args...); } } @@ -626,7 +626,7 @@ struct Kernel { template static MSHADOW_CINLINE typename std::enable_if::value, bool>::type - Launch(mshadow::Stream *s, const int N, DType *dest, Args... args) { + Launch(mshadow::Stream *s, const size_t N, DType *dest, Args... args) { LaunchTuned(s, N, dest, args...); return true; } @@ -644,7 +644,7 @@ struct Kernel { template static MSHADOW_CINLINE typename std::enable_if::value, bool>::type - Launch(mshadow::Stream *s, const int N, DType *dest, Args... args) { + Launch(mshadow::Stream *s, const size_t N, DType *dest, Args... args) { LaunchTuned(s, N, dest, args...); return true; } @@ -700,7 +700,7 @@ template struct set_to_int : public tunable { // mxnet_op version (when used directly with Kernel<>::Launch()) */ template - MSHADOW_XINLINE static void Map(int i, DType *out) { + MSHADOW_XINLINE static void Map(index_t i, DType *out) { out[i] = DType(val); } // mshadow_op version (when used with op_with_req<>) diff --git a/src/operator/random/sampler.h b/src/operator/random/sampler.h index ca764e706c64..00963a6785ee 100644 --- a/src/operator/random/sampler.h +++ b/src/operator/random/sampler.h @@ -43,32 +43,33 @@ namespace op { template inline static void LaunchRNG(mshadow::Stream *s, common::random::RandGenerator *gen, - const int N, Args... args) { + const index_t N, Args... args) { // minimal check to avoid division by zero, below. // if `N` is zero the map operation is a no-op in any case. if (N <= 0) { return; } - const int nloop = (N + RandGenerator::kMinNumRandomPerThread - 1) / + const index_t nloop = (N + RandGenerator::kMinNumRandomPerThread - 1) / RandGenerator::kMinNumRandomPerThread; - const int nthread = std::min(nloop, RandGenerator::kNumRandomStates); - const int step = (N + nthread - 1) / nthread; + const index_t nthread = std::min(nloop, + static_cast(RandGenerator::kNumRandomStates)); + const index_t step = (N + nthread - 1) / nthread; Kernel::Launch(s, nthread, *gen, N, step, args...); } #define RNG_KERNEL_LOOP(xpu, GType, thread_id, gen, N, step, ...) 
\ - const int start = thread_id * step; \ - const int end = start + step; \ + const index_t start = thread_id * step; \ + const index_t end = start + step; \ typename RandGenerator::Impl genImpl(&gen, thread_id); \ - for (int i = start; i < end && i < N; ++i) { \ + for (index_t i = start; i < end && i < N; ++i) { \ {__VA_ARGS__} \ } template struct SampleUniformKernel { template - MSHADOW_XINLINE static void Map(int id, RandGenerator gen, - const int N, const int step, + MSHADOW_XINLINE static void Map(index_t id, RandGenerator gen, + const index_t N, const index_t step, index_t nParm, index_t nSample, const IType *lower, const IType *upper, OType *out) { RNG_KERNEL_LOOP(xpu, OType, id, gen, N, step, { @@ -127,8 +128,8 @@ struct RandIntSampler { template struct SampleNormalKernel { template - MSHADOW_XINLINE static void Map(int id, RandGenerator gen, - const int N, const int step, + MSHADOW_XINLINE static void Map(index_t id, RandGenerator gen, + const index_t N, const index_t step, index_t nParm, index_t nSample, const IType *mean, const IType *std, OType *out) { RNG_KERNEL_LOOP(xpu, OType, id, gen, N, step, { @@ -154,8 +155,8 @@ struct NormalSampler { template struct SampleExponentialKernel { template - MSHADOW_XINLINE static void Map(int id, RandGenerator gen, - const int N, const int step, + MSHADOW_XINLINE static void Map(index_t id, RandGenerator gen, + const index_t N, const index_t step, index_t nParm, index_t nSample, const IType *lambda, OType *out) { RNG_KERNEL_LOOP(xpu, OType, id, gen, N, step, { @@ -202,8 +203,8 @@ MSHADOW_XINLINE OType SampleGamma(IType a, IType b, typename RandGenerator struct SampleGammaKernel { template - MSHADOW_XINLINE static void Map(int id, RandGenerator gen, - const int N, const int step, + MSHADOW_XINLINE static void Map(index_t id, RandGenerator gen, + const index_t N, const index_t step, index_t nParm, index_t nSample, const IType *alpha, const IType *beta, OType *out) { RNG_KERNEL_LOOP(xpu, FType, id, gen, N, step, { @@ -264,8 +265,8 @@ MSHADOW_XINLINE int SamplePoisson(float lambda, typename RandGenerator struct SamplePoissonKernel { template - MSHADOW_XINLINE static void Map(int id, RandGenerator gen, - const int N, const int step, + MSHADOW_XINLINE static void Map(index_t id, RandGenerator gen, + const index_t N, const index_t step, index_t nParm, index_t nSample, const IType *lambda, OType *out) { RNG_KERNEL_LOOP(xpu, float, id, gen, N, step, { @@ -291,8 +292,8 @@ struct PoissonSampler { template struct SampleNegativeBinomialKernel { template - MSHADOW_XINLINE static void Map(int id, RandGenerator gen, - const int N, const int step, + MSHADOW_XINLINE static void Map(index_t id, RandGenerator gen, + const index_t N, const index_t step, index_t nParm, index_t nSample, const IType *k, const IType *p, OType *out) { RNG_KERNEL_LOOP(xpu, float, id, gen, N, step, { @@ -323,8 +324,8 @@ struct NegativeBinomialSampler { template struct SampleGeneralizedNegativeBinomialKernel { template - MSHADOW_XINLINE static void Map(int id, RandGenerator gen, - const int N, const int step, + MSHADOW_XINLINE static void Map(index_t id, RandGenerator gen, + const index_t N, const index_t step, index_t nParm, index_t nSample, const IType *mu, const IType *alpha, OType *out) { RNG_KERNEL_LOOP(xpu, float, id, gen, N, step, { diff --git a/src/operator/tensor/broadcast_reduce-inl.h b/src/operator/tensor/broadcast_reduce-inl.h index 167fa34b083f..141d2fb83d0d 100644 --- a/src/operator/tensor/broadcast_reduce-inl.h +++ b/src/operator/tensor/broadcast_reduce-inl.h @@ -53,14 
+53,14 @@ MSHADOW_XINLINE Shape calc_stride(const Shape& shape) { } template -MSHADOW_XINLINE void unravel_dot(const int idx, const Shape& shape, - const Shape& stridej, const Shape& stridek, int* j, int* k) { +MSHADOW_XINLINE void unravel_dot(const index_t idx, const Shape& shape, + const Shape& stridej, const Shape& stridek, index_t* j, index_t* k) { *j = 0; *k = 0; #pragma unroll - for (int i = ndim-1, idx_t = idx; i >=0; --i) { - const int tmp = idx_t / shape[i]; - const int coord = idx_t - tmp*shape[i]; + for (index_t i = ndim-1, idx_t = idx; i >=0; --i) { + const auto tmp = idx_t / shape[i]; + const auto coord = idx_t - tmp*shape[i]; *j += coord*stridej[i]; *k += coord*stridek[i]; idx_t = tmp; @@ -68,11 +68,11 @@ MSHADOW_XINLINE void unravel_dot(const int idx, const Shape& shape, } template -MSHADOW_XINLINE Shape unravel(const int idx, const Shape& shape) { +MSHADOW_XINLINE Shape unravel(const index_t idx, const Shape& shape) { Shape ret; #pragma unroll - for (int i = ndim-1, j = idx; i >=0; --i) { - int tmp = j / shape[i]; + for (index_t i = ndim-1, j = idx; i >=0; --i) { + auto tmp = j / shape[i]; ret[i] = j - tmp*shape[i]; j = tmp; } @@ -80,10 +80,10 @@ MSHADOW_XINLINE Shape unravel(const int idx, const Shape& shape) { } template -MSHADOW_XINLINE int ravel(const Shape& coord, const Shape& shape) { - int ret = 0; +MSHADOW_XINLINE index_t ravel(const Shape& coord, const Shape& shape) { + index_t ret = 0; #pragma unroll - for (int i = 0; i < ndim; ++i) { + for (index_t i = 0; i < ndim; ++i) { ret = ret * shape[i] + (shape[i] > 1) * coord[i]; } return ret; @@ -111,12 +111,12 @@ MSHADOW_XINLINE int diff(const Shape& small, const Shape& big, Shape } template -MSHADOW_XINLINE int unravel_dot(const int idx, const Shape& shape, +MSHADOW_XINLINE index_t unravel_dot(const index_t idx, const Shape& shape, const Shape& stride) { - int ret = 0; + index_t ret = 0; #pragma unroll - for (int i = ndim-1, j = idx; i >=0; --i) { - int tmp = j / shape[i]; + for (index_t i = ndim-1, j = idx; i >=0; --i) { + auto tmp = j / shape[i]; ret += (j - tmp*shape[i])*stride[i]; j = tmp; } @@ -124,8 +124,8 @@ MSHADOW_XINLINE int unravel_dot(const int idx, const Shape& shape, } template -MSHADOW_XINLINE int dot(const Shape& coord, const Shape& stride) { - int ret = 0; +MSHADOW_XINLINE index_t dot(const Shape& coord, const Shape& stride) { + index_t ret = 0; #pragma unroll for (int i = 0; i < ndim; ++i) ret += coord[i] * stride[i]; @@ -142,27 +142,27 @@ MSHADOW_XINLINE void assign(DType* dst, const bool addto, const DType src) { } template -MSHADOW_XINLINE void binary_broadcast_assign(const int idx, const bool addto, +MSHADOW_XINLINE void binary_broadcast_assign(const index_t idx, const bool addto, const DType* __restrict lhs, const DType* __restrict rhs, DType* out, const Shape& lshape, const Shape& rshape, const Shape& oshape) { const Shape coord = unravel(idx, oshape); - const int j = ravel(coord, lshape); - const int k = ravel(coord, rshape); + const index_t j = ravel(coord, lshape); + const index_t k = ravel(coord, rshape); assign(&out[idx], addto, OP::Map(lhs[j], rhs[k])); } template -MSHADOW_XINLINE void seq_reduce_assign(const int idx, const int M, const bool addto, +MSHADOW_XINLINE void seq_reduce_assign(const index_t idx, const size_t M, const bool addto, const DType* __restrict big, DType *small, const Shape& bshape, const Shape& sshape, const Shape& rshape, const Shape& rstride) { Shape coord = unravel(idx, sshape); - int j = ravel(coord, bshape); + index_t j = ravel(coord, bshape); DType val, residual; 
Reducer::SetInitValue(val, residual); - for (int k = 0; k < M; ++k) { + for (size_t k = 0; k < M; ++k) { coord = unravel(k, rshape); Reducer::Reduce(val, OP::Map(big[j + dot(coord, rstride)]), residual); } @@ -176,10 +176,10 @@ MSHADOW_XINLINE void seq_reduce_assign(const int idx, const int M, const bool ad #else template -void binary_broadcast_compute(const int N, const bool addto, const DType *lhs, +void binary_broadcast_compute(const size_t N, const bool addto, const DType *lhs, const DType *rhs, DType *out, const Shape lshape, const Shape rshape, const Shape oshape) { - for (int idx = 0; idx < N; ++idx) { + for (size_t idx = 0; idx < N; ++idx) { binary_broadcast_assign(idx, addto, lhs, rhs, out, lshape, rshape, oshape); } } @@ -188,26 +188,26 @@ template void BinaryBroadcastComputeImpl(Stream *s, const OpReqType req, const TBlob& lhs, const TBlob& rhs, const TBlob& out) { if (req == kNullOp) return; - int N = out.shape_.Size(); + size_t N = out.shape_.Size(); binary_broadcast_compute(N, req == kAddTo, lhs.dptr(), rhs.dptr(), out.dptr(), lhs.shape_.get(), rhs.shape_.get(), out.shape_.get()); } template -void seq_reduce_compute(const int N, const int M, const bool addto, +void seq_reduce_compute(const size_t N, const size_t M, const bool addto, const DType *big, DType *small, const Shape bshape, const Shape sshape, const Shape rshape, const Shape rstride) { #pragma omp parallel for num_threads(engine::OpenMP::Get()->GetRecommendedOMPThreadCount()) - for (int idx = 0; idx < N; ++idx) { + for (index_t idx = 0; idx < static_cast(N); ++idx) { seq_reduce_assign(idx, M, addto, big, small, bshape, sshape, rshape, rstride); } } template -void seq_reduce_compute_extra_mem(const int N, const int M, const bool addto, +void seq_reduce_compute_extra_mem(const size_t N, const size_t M, const bool addto, const DType* big, DType* small, const Shape bshape, const Shape sshape, @@ -215,12 +215,12 @@ void seq_reduce_compute_extra_mem(const int N, const int M, const bool addto, const Shape rstride, const index_t* ws_dptr) { #pragma omp parallel for num_threads(engine::OpenMP::Get()->GetRecommendedOMPThreadCount()) - for (int idx = 0; idx < N; ++idx) { + for (index_t idx = 0; idx < static_cast(N); ++idx) { Shape coord = unravel(idx, sshape); - int j = ravel(coord, bshape); + index_t j = ravel(coord, bshape); DType val, residual; Reducer::SetInitValue(val, residual); - for (int k = 0; k < M; ++k) { + for (size_t k = 0; k < M; ++k) { Reducer::Reduce(val, OP::Map(big[j + ws_dptr[k]]), residual); } assign(&small[idx], addto, val); @@ -233,7 +233,7 @@ void Reduce(Stream* s, const TBlob& small, const OpReqType req, if (req == kNullOp) return; Shape rshape, rstride; diff(small.shape_.get(), big.shape_.get(), &rshape, &rstride); - int N = small.shape_.Size(), M = rshape.Size(); + size_t N = small.shape_.Size(), M = rshape.Size(); seq_reduce_compute( N, M, req == kAddTo, big.dptr(), small.dptr(), big.shape_.get(), small.shape_.get(), rshape, rstride); @@ -247,9 +247,9 @@ void ReduceWithExtraMem(Stream* s, const TBlob& small, const OpReqType req, Shape rshape, rstride; diff(small.shape_.get(), big.shape_.get(), &rshape, &rstride); index_t* ws_dptr = reinterpret_cast(workspace.dptr_); - int N = small.shape_.Size(), M = rshape.Size(); + size_t N = small.shape_.Size(), M = rshape.Size(); #pragma omp parallel for num_threads(engine::OpenMP::Get()->GetRecommendedOMPThreadCount()) - for (int k = 0; k < M; k++) { + for (index_t k = 0; k < static_cast(M); k++) { Shape coord = unravel(k, rshape); ws_dptr[k] = dot(coord, 
rstride); } @@ -272,7 +272,7 @@ size_t ReduceWorkspaceSize(Stream *s, const TShape& small, const OpReqType } template -MSHADOW_XINLINE void seq_reduce_assign(const int idx, const int M, const bool addto, +MSHADOW_XINLINE void seq_reduce_assign(const index_t idx, const size_t M, const bool addto, const DType* __restrict big, const DType* __restrict lhs, const DType* __restrict rhs, DType *small, const Shape& big_shape, const Shape& lhs_shape0, @@ -282,20 +282,20 @@ MSHADOW_XINLINE void seq_reduce_assign(const int idx, const int M, const bool ad const Shape& rstride, const Shape& lhs_stride, const Shape& rhs_stride) { Shape coord = unravel(idx, small_shape); - const int idx_big0 = ravel(coord, big_shape); - const int idx_lhs0 = ravel(coord, lhs_shape0); - const int idx_rhs0 = ravel(coord, rhs_shape0); + const index_t idx_big0 = ravel(coord, big_shape); + const index_t idx_lhs0 = ravel(coord, lhs_shape0); + const index_t idx_rhs0 = ravel(coord, rhs_shape0); DType val, residual; Reducer::SetInitValue(val, residual); - for (int k = 0; k < M; ++k) { + for (size_t k = 0; k < M; ++k) { Shape coord_big = unravel(k, rshape); - int idx_big = idx_big0 + dot(coord_big, rstride); + index_t idx_big = idx_big0 + dot(coord_big, rstride); Shape coord_lhs = unravel(k, lhs_shape); - int idx_lhs = idx_lhs0 + dot(coord_lhs, lhs_stride); + index_t idx_lhs = idx_lhs0 + dot(coord_lhs, lhs_stride); Shape coord_rhs = unravel(k, rhs_shape); - int idx_rhs = idx_rhs0 + dot(coord_rhs, rhs_stride); + index_t idx_rhs = idx_rhs0 + dot(coord_rhs, rhs_stride); Reducer::Reduce(val, OP1::Map(big[idx_big], OP2::Map(lhs[idx_lhs], rhs[idx_rhs])), residual); } @@ -304,7 +304,7 @@ MSHADOW_XINLINE void seq_reduce_assign(const int idx, const int M, const bool ad } template -void seq_reduce_compute(const int N, const int M, const bool addto, +void seq_reduce_compute(const size_t N, const size_t M, const bool addto, const DType *big, const DType *lhs, const DType *rhs, DType *small, const Shape big_shape, const Shape small_shape, const Shape rshape, const Shape rstride, @@ -312,7 +312,7 @@ void seq_reduce_compute(const int N, const int M, const bool addto, const Shape rhs_shape, const Shape rhs_stride, const Shape& lhs_shape0, const Shape& rhs_shape0) { #pragma omp parallel for num_threads(engine::OpenMP::Get()->GetRecommendedOMPThreadCount()) - for (int idx = 0; idx < N; ++idx) { + for (index_t idx = 0; idx < static_cast(N); ++idx) { seq_reduce_assign(idx, M, addto, big, lhs, rhs, small, big_shape, lhs_shape0, rhs_shape0, small_shape, rshape, lhs_shape, rhs_shape, rstride, lhs_stride, rhs_stride); @@ -326,8 +326,8 @@ void Reduce(Stream *s, const TBlob& small, const OpReqType req, if (req == kNullOp) return; Shape rshape, rstride; diff(small.shape_.get(), big.shape_.get(), &rshape, &rstride); - int N = small.shape_.Size(); - int M = rshape.Size(); + size_t N = small.shape_.Size(); + size_t M = rshape.Size(); Shape lhs_shape, lhs_stride; diff(small.shape_.get(), lhs.shape_.get(), &lhs_shape, &lhs_stride); diff --git a/src/operator/tensor/elemwise_binary_broadcast_op.h b/src/operator/tensor/elemwise_binary_broadcast_op.h index 391c35117128..304422038b89 100644 --- a/src/operator/tensor/elemwise_binary_broadcast_op.h +++ b/src/operator/tensor/elemwise_binary_broadcast_op.h @@ -190,7 +190,7 @@ namespace mxnet_op { template struct binary_broadcast_kernel { /*! 
\brief Map function for binary_broadcast_kernel */ - MSHADOW_XINLINE static void Map(int base, int length, OpReqType req, + MSHADOW_XINLINE static void Map(index_t base, index_t length, OpReqType req, const Shape &lstride, const Shape &rstride, const Shape &oshape, DType *lhs, DType *rhs, DType *out) { @@ -199,7 +199,7 @@ struct binary_broadcast_kernel { auto ridx = static_cast(dot(coord, rstride)); KERNEL_ASSIGN(out[base], req, OP::Map(lhs[lidx], rhs[ridx])); // starts from 1 to avoid extra inc at end of loop - for (int i = 1; i < length; ++i) { + for (index_t i = 1; i < length; ++i) { inc(&coord, oshape, &lidx, lstride, &ridx, rstride); // When tuning, don't actually run the op, since it's not going to be tuned against // the actual op we'll eventually be using @@ -208,7 +208,7 @@ struct binary_broadcast_kernel { } /*! \brief Map function for binary_broadcast_kernel */ - MSHADOW_XINLINE static void Map(int base, int length, OpReqType req, + MSHADOW_XINLINE static void Map(index_t base, index_t length, OpReqType req, const Shape &lstride, const Shape &rstride, const Shape &oshape, DType lhs, DType *rhs, DType *out) { @@ -217,7 +217,7 @@ struct binary_broadcast_kernel { auto ridx = static_cast(dot(coord, rstride)); KERNEL_ASSIGN(out[base], req, OP::Map(lhs, rhs[ridx])); // starts from 1 to avoid extra inc at end of loop - for (int i = 1; i < length; ++i) { + for (index_t i = 1; i < length; ++i) { inc(&coord, oshape, &lidx, lstride, &ridx, rstride); // When tuning, don't actually run the op, since it's not going to be tuned against // the actual op we'll eventually be using @@ -238,7 +238,7 @@ struct csr_dns_csr_broadcast_kernel { * \param out ptr to the data buffer of the result csr matrix */ template - MSHADOW_XINLINE static void Map(int row, const DType *csr_data, const CType *csr_indices, + MSHADOW_XINLINE static void Map(index_t row, const DType *csr_data, const CType *csr_indices, const RType *csr_indptr, const DType *dns, DType *out) { const nnvm::dim_t curr_row_i = csr_indptr[row]; const nnvm::dim_t next_row_i = csr_indptr[row + 1]; @@ -257,7 +257,7 @@ struct csr_dns_csr_broadcast_kernel { * \param nnz number of non-zero elements in input csr matrix */ template - MSHADOW_XINLINE static void Map(int i, const DType *csr_data, const DType* scalar_ptr, + MSHADOW_XINLINE static void Map(index_t i, const DType *csr_data, const DType* scalar_ptr, DType *out, const nnvm::dim_t nnz) { const DType scale = scalar_ptr[0]; if (i < nnz) { @@ -269,7 +269,7 @@ struct csr_dns_csr_broadcast_kernel { template struct csr_dns_map_kernel { template - MSHADOW_XINLINE static void Map(int row, const DType *csr_data, const CType *csr_indices, + MSHADOW_XINLINE static void Map(index_t row, const DType *csr_data, const CType *csr_indices, const RType *csr_indptr, DType *out, const nnvm::dim_t num_rows, const nnvm::dim_t num_cols) { if (row < num_rows) { diff --git a/src/operator/tensor/indexing_op.cc b/src/operator/tensor/indexing_op.cc index 77236e068f86..c39418dbe41d 100644 --- a/src/operator/tensor/indexing_op.cc +++ b/src/operator/tensor/indexing_op.cc @@ -36,7 +36,7 @@ struct TakeCPU { // K is the number of rows of in_data // i is the index of out_data template - MSHADOW_XINLINE static void Map(int i, DType* out_data, const DType* in_data, + MSHADOW_XINLINE static void Map(index_t i, DType* out_data, const DType* in_data, const IType* idx, const size_t M, const int64_t K) { int64_t j = static_cast(idx[i]); if (clip) { @@ -420,19 +420,19 @@ inline void SparseEmbeddingOpBackwardRspImpl(const bool 
deterministic, template inline typename std::enable_if<(!std::is_same::value), void>::type -GatherNDBackwardImpl(int N, int M, int K, +GatherNDBackwardImpl(index_t N, index_t M, index_t K, const mshadow::Shape<10> strides, DType* out, const DType* data, const IType* indices, mshadow::Stream *s) { #pragma omp parallel for - for (int i = 0; i < N; i++) { - int offset = 0; - for (int j = 0; j < M; ++j) { - offset += strides[j] * static_cast(indices[j*N + i]); + for (index_t i = 0; i < N; i++) { + index_t offset = 0; + for (index_t j = 0; j < M; ++j) { + offset += strides[j] * static_cast(indices[j*N + i]); } - for (int j = 0; j < K; ++j) { + for (index_t j = 0; j < K; ++j) { #pragma omp atomic out[offset + j] += data[i * K + j]; } @@ -441,18 +441,18 @@ GatherNDBackwardImpl(int N, int M, int K, template inline typename std::enable_if::value, void>::type -GatherNDBackwardImpl(int N, int M, int K, +GatherNDBackwardImpl(index_t N, index_t M, index_t K, const mshadow::Shape<10> strides, DType* out, const DType* data, const IType* indices, mshadow::Stream *s) { - for (int i = 0; i < N; i++) { - int offset = 0; - for (int j = 0; j < M; ++j) { - offset += strides[j] * static_cast(indices[j*N + i]); + for (index_t i = 0; i < N; i++) { + index_t offset = 0; + for (index_t j = 0; j < M; ++j) { + offset += strides[j] * static_cast(indices[j*N + i]); } - for (int j = 0; j < K; ++j) { + for (index_t j = 0; j < K; ++j) { out[offset + j] += data[i * K + j]; } } diff --git a/src/operator/tensor/indexing_op.cu b/src/operator/tensor/indexing_op.cu index 0d72b1815fde..bad3e5a1a6c5 100644 --- a/src/operator/tensor/indexing_op.cu +++ b/src/operator/tensor/indexing_op.cu @@ -439,22 +439,22 @@ inline void SparseEmbeddingOpBackwardRspImpl(const bool deterministic, struct backward_gather_nd_gpu { template - MSHADOW_XINLINE static void Map(int i, int N, int M, int K, + MSHADOW_XINLINE static void Map(index_t i, index_t N, index_t M, index_t K, const mshadow::Shape<10> strides, DType* out, const DType* data, const IType* indices) { - int offset = 0; - for (int j = 0; j < M; ++j) { + index_t offset = 0; + for (index_t j = 0; j < M; ++j) { offset += strides[j] * static_cast(indices[j*N + i]); } - for (int j = 0; j < K; ++j) { + for (index_t j = 0; j < K; ++j) { atomicAdd(out + (offset + j), data[i * K + j]); } } }; template -inline void GatherNDBackwardImpl(int N, int M, int K, +inline void GatherNDBackwardImpl(index_t N, index_t M, index_t K, const mshadow::Shape<10> strides, DType* out, const DType* data, diff --git a/src/operator/tensor/indexing_op.h b/src/operator/tensor/indexing_op.h index 92b6e21018e5..fba331e25705 100644 --- a/src/operator/tensor/indexing_op.h +++ b/src/operator/tensor/indexing_op.h @@ -314,7 +314,8 @@ struct Take { * \param axis axis id */ template - MSHADOW_XINLINE static void Map(int i, DType* out_data, const DType* in_data, const IType* idx, + MSHADOW_XINLINE static void Map(index_t i, DType* out_data, const DType* in_data, + const IType* idx, const mshadow::Shape<10> in_stride, const mshadow::Shape<10> out_stride, const int in_ndims, const int out_ndims, const int idx_ndims, @@ -361,7 +362,7 @@ struct TakeRspKernel { * \param nnr number of non-zero rows */ template - MSHADOW_XINLINE static void Map(int i, + MSHADOW_XINLINE static void Map(index_t i, const IType* data, DType* out, const RType* weight_idx, @@ -1395,15 +1396,15 @@ inline bool ScatterNDType(const nnvm::NodeAttrs& attrs, struct scatter_nd { template - MSHADOW_XINLINE static void Map(int i, OpReqType req, int N, int M, int K, + 
MSHADOW_XINLINE static void Map(index_t i, OpReqType req, index_t N, index_t M, index_t K, const mshadow::Shape<10> strides, DType* out, const DType* data, const IType* indices) { - int offset = 0; - for (int j = 0; j < M; ++j) { - offset += strides[j] * static_cast(indices[j*N + i]); + index_t offset = 0; + for (index_t j = 0; j < M; ++j) { + offset += strides[j] * static_cast(indices[j*N + i]); } - for (int j = 0; j < K; ++j) { + for (index_t j = 0; j < K; ++j) { KERNEL_ASSIGN(out[offset+j], req, data[i*K + j]); } } @@ -1416,17 +1417,18 @@ void ScatterNDForward(const nnvm::NodeAttrs& attrs, const std::vector& req, const std::vector& outputs) { using namespace mshadow; + using nnvm::dim_t; CHECK_EQ(inputs.size(), 2U); CHECK_EQ(outputs.size(), 1U); if (req[0] == kNullOp) return; mshadow::Stream *s = ctx.get_stream(); const TShape& oshape = outputs[0].shape_; const TShape& ishape = inputs[1].shape_; - int M = ishape[0]; - int N = ishape.Size() / M; - int K = oshape.ProdShape(M, oshape.ndim()); + dim_t M = ishape[0]; + dim_t N = ishape.Size() / M; + dim_t K = oshape.ProdShape(M, oshape.ndim()); mshadow::Shape<10> strides; - for (int i = M-1, stride = K; i >= 0; stride *= oshape[i], --i) strides[i] = stride; + for (dim_t i = M-1, stride = K; i >= 0; stride *= oshape[i], --i) strides[i] = stride; if (kWriteTo == req[0]) { Fill(s, outputs[0], req[0], 0); } @@ -1441,7 +1443,7 @@ void ScatterNDForward(const nnvm::NodeAttrs& attrs, template inline typename std::enable_if<(!std::is_same::value), void>::type -GatherNDBackwardImpl(int N, int M, int K, +GatherNDBackwardImpl(index_t N, index_t M, index_t K, const mshadow::Shape<10> strides, DType* out, const DType* data, @@ -1450,7 +1452,7 @@ GatherNDBackwardImpl(int N, int M, int K, template inline typename std::enable_if::value, void>::type -GatherNDBackwardImpl(int N, int M, int K, +GatherNDBackwardImpl(index_t N, index_t M, index_t K, const mshadow::Shape<10> strides, DType* out, const DType* data, @@ -1458,7 +1460,7 @@ GatherNDBackwardImpl(int N, int M, int K, mshadow::Stream *s); template -inline void GatherNDBackwardImpl(int N, int M, int K, +inline void GatherNDBackwardImpl(index_t N, index_t M, index_t K, const mshadow::Shape<10> strides, DType* out, const DType* data, @@ -1472,17 +1474,18 @@ void GatherNDBackward(const nnvm::NodeAttrs& attrs, const std::vector& req, const std::vector& outputs) { using namespace mshadow; + using nnvm::dim_t; CHECK_EQ(inputs.size(), 2U); CHECK_EQ(outputs.size(), 1U); if (req[0] == kNullOp) return; mshadow::Stream *s = ctx.get_stream(); const TShape& oshape = outputs[0].shape_; const TShape& ishape = inputs[1].shape_; - int M = ishape[0]; - int N = ishape.Size() / M; - int K = oshape.ProdShape(M, oshape.ndim()); + dim_t M = ishape[0]; + dim_t N = ishape.Size() / M; + dim_t K = oshape.ProdShape(M, oshape.ndim()); mshadow::Shape<10> strides; - for (int i = M-1, stride = K; i >= 0; stride *= oshape[i], --i) strides[i] = stride; + for (dim_t i = M-1, stride = K; i >= 0; stride *= oshape[i], --i) strides[i] = stride; if (kWriteTo == req[0]) { Fill(s, outputs[0], req[0], 0); } diff --git a/src/operator/tensor/init_op.h b/src/operator/tensor/init_op.h index 4e52b087f10a..e9e67cb1a4c5 100644 --- a/src/operator/tensor/init_op.h +++ b/src/operator/tensor/init_op.h @@ -453,7 +453,7 @@ void EyeFill(const nnvm::NodeAttrs& attrs, struct range_fwd { template - MSHADOW_XINLINE static void Map(int i, int repeat, DType start, DType step, + MSHADOW_XINLINE static void Map(index_t i, int repeat, DType start, DType step, int req, 
DType* out) { KERNEL_ASSIGN(out[i], req, start + (i/repeat) * step); } @@ -471,8 +471,8 @@ void RangeCompute(const nnvm::NodeAttrs& attrs, MSHADOW_TYPE_SWITCH(outputs[0].type_flag_, DType, { // Force unsigned params to take two's complement form on ARM to ensure consistency with x86 // results. Casting negative floats to unsigned types is undefined in the CPP standard. - auto step = std::is_signed() ? param.step : static_cast(param.step); - auto start = std::is_signed() ? param.start : static_cast(param.start); + auto step = std::is_signed() ? param.step : static_cast(param.step); + auto start = std::is_signed() ? param.start : static_cast(param.start); Kernel::Launch(s, outputs[0].Size(), static_cast(param.repeat), diff --git a/src/operator/tensor/matrix_op-inl.h b/src/operator/tensor/matrix_op-inl.h index 9c81d87464de..3b229cf38eba 100644 --- a/src/operator/tensor/matrix_op-inl.h +++ b/src/operator/tensor/matrix_op-inl.h @@ -626,9 +626,9 @@ inline void GetIndexRange(const TShape& dshape, const nnvm::Tuple>& param_begin, const nnvm::Tuple>& param_end, const nnvm::Tuple>& param_step, - common::StaticArray* begin, - common::StaticArray* end, - common::StaticArray* step) { + common::StaticArray* begin, + common::StaticArray* end, + common::StaticArray* step) { CHECK_NE(dshape.ndim(), 0U); CHECK_LE(param_begin.ndim(), dshape.ndim()) << "Slicing axis exceeds data dimensions"; @@ -646,8 +646,8 @@ inline void GetIndexRange(const TShape& dshape, } for (index_t i = 0; i < param_begin.ndim(); ++i) { - int b = 0, e = dshape[i], s = 1; - const int len = dshape[i]; + index_t b = 0, e = dshape[i], s = 1; + const index_t len = dshape[i]; if (param_step.ndim() != 0U) { const auto& opt_step_val = param_step[i]; if (opt_step_val.has_value()) { @@ -724,7 +724,7 @@ inline bool SliceOpShape(const nnvm::NodeAttrs& attrs, TShape oshape = dshape; MXNET_NDIM_SWITCH(dshape.ndim(), ndim, { - common::StaticArray begin, end, step; + common::StaticArray begin, end, step; GetIndexRange(dshape, param.begin, param.end, param.step, &begin, &end, &step); for (index_t i = 0; i < param.begin.ndim(); ++i) { const int b = begin[i], e = end[i], s = step[i]; @@ -743,19 +743,19 @@ template struct slice_forward { // i is the i-th row after flattening out into 2D tensor template - MSHADOW_XINLINE static void Map(int i, DType* out, const DType* data, + MSHADOW_XINLINE static void Map(index_t i, DType* out, const DType* data, const mshadow::Shape dshape, const mshadow::Shape oshape, - const common::StaticArray begin, - const common::StaticArray step) { - const int data_last_dim_size = dshape[ndim-1]; - const int out_last_dim_size = oshape[ndim-1]; - const int step_last_dim = step[ndim-1]; - const int begin_last_dim = begin[ndim-1]; - const int j = i % out_last_dim_size; - int irow = 0; // row id of flattend 2D data - int stride = 1; - int idx = i / out_last_dim_size; + const common::StaticArray begin, + const common::StaticArray step) { + const index_t data_last_dim_size = dshape[ndim-1]; + const index_t out_last_dim_size = oshape[ndim-1]; + const index_t step_last_dim = step[ndim-1]; + const index_t begin_last_dim = begin[ndim-1]; + const index_t j = i % out_last_dim_size; + index_t irow = 0; // row id of flattend 2D data + index_t stride = 1; + index_t idx = i / out_last_dim_size; #pragma unroll for (int k = ndim - 2; k >= 0; --k) { irow += stride * ((idx % oshape[k]) * step[k] + begin[k]); @@ -771,20 +771,20 @@ template struct slice_forward { // i is the i-th row after flattening out into 2D tensor template - MSHADOW_XINLINE 
static void Map(int i, DType* out, const DType* data, + MSHADOW_XINLINE static void Map(index_t i, DType* out, const DType* data, const mshadow::Shape dshape, const mshadow::Shape oshape, - const common::StaticArray begin, - const common::StaticArray step) { - const int data_last_dim_size = dshape[ndim-1]; - const int out_last_dim_size = oshape[ndim-1]; - const int step_last_dim = step[ndim-1]; - const int begin_last_dim = begin[ndim-1]; - int out_offset = i * out_last_dim_size; - for (int j = 0; j < out_last_dim_size; ++j) { - int irow = 0; // row id of flattend 2D data - int stride = 1; - int idx = i; + const common::StaticArray begin, + const common::StaticArray step) { + const index_t data_last_dim_size = dshape[ndim-1]; + const index_t out_last_dim_size = oshape[ndim-1]; + const index_t step_last_dim = step[ndim-1]; + const index_t begin_last_dim = begin[ndim-1]; + index_t out_offset = i * out_last_dim_size; + for (index_t j = 0; j < out_last_dim_size; ++j) { + index_t irow = 0; // row id of flattend 2D data + index_t stride = 1; + index_t idx = i; #pragma unroll for (int k = ndim - 2; k >= 0; --k) { irow += stride * ((idx % oshape[k]) * step[k] + begin[k]); @@ -813,11 +813,11 @@ void SliceOpForward(const nnvm::NodeAttrs& attrs, const TBlob& out = outputs[0]; const SliceParam& param = nnvm::get(attrs.parsed); MXNET_NDIM_SWITCH(data.ndim(), ndim, { - common::StaticArray begin, end, step; + common::StaticArray begin, end, step; GetIndexRange(data.shape_, param.begin, param.end, param.step, &begin, &end, &step); MSHADOW_TYPE_SWITCH(out.type_flag_, DType, { MXNET_ASSIGN_REQ_SWITCH(req[0], Req, { - int num_threads = out.shape_.FlatTo2D()[0]; + size_t num_threads = out.shape_.FlatTo2D()[0]; if (std::is_same::value) { num_threads *= out.shape_.get()[ndim - 1]; } @@ -836,20 +836,20 @@ template struct slice_assign { // i is the i-th row after flattening out into 2D tensor template - MSHADOW_XINLINE static void Map(int i, DType* out, const DType* val, + MSHADOW_XINLINE static void Map(index_t i, DType* out, const DType* val, const mshadow::Shape oshape, const mshadow::Shape vshape, - const common::StaticArray begin, - const common::StaticArray step) { - const int data_last_dim_size = oshape[ndim-1]; - const int out_last_dim_size = vshape[ndim-1]; - const int step_last_dim = step[ndim-1]; - const int begin_last_dim = begin[ndim-1]; - int offset = i * out_last_dim_size; - for (int j = 0; j < out_last_dim_size; ++j) { - int irow = 0; // row id of flattend 2D out - int stride = 1; - int idx = i; + const common::StaticArray begin, + const common::StaticArray step) { + const index_t data_last_dim_size = oshape[ndim-1]; + const index_t out_last_dim_size = vshape[ndim-1]; + const index_t step_last_dim = step[ndim-1]; + const index_t begin_last_dim = begin[ndim-1]; + index_t offset = i * out_last_dim_size; + for (index_t j = 0; j < out_last_dim_size; ++j) { + index_t irow = 0; // row id of flattend 2D out + index_t stride = 1; + index_t idx = i; #pragma unroll for (int k = ndim - 2; k >= 0; --k) { irow += stride * ((idx % vshape[k]) * step[k] + begin[k]); @@ -866,19 +866,19 @@ template struct slice_assign { // i is the i-th row after flattening out into 2D tensor template - MSHADOW_XINLINE static void Map(int i, DType* out, const DType* val, + MSHADOW_XINLINE static void Map(index_t i, DType* out, const DType* val, const mshadow::Shape oshape, const mshadow::Shape vshape, - const common::StaticArray begin, - const common::StaticArray step) { - const int data_last_dim_size = oshape[ndim-1]; - const int 
out_last_dim_size = vshape[ndim-1]; - const int step_last_dim = step[ndim-1]; - const int begin_last_dim = begin[ndim-1]; - const int j = i % out_last_dim_size; - int irow = 0; // row id of flattend 2D out - int stride = 1; - int idx = i / out_last_dim_size; + const common::StaticArray begin, + const common::StaticArray step) { + const index_t data_last_dim_size = oshape[ndim-1]; + const index_t out_last_dim_size = vshape[ndim-1]; + const index_t step_last_dim = step[ndim-1]; + const index_t begin_last_dim = begin[ndim-1]; + const index_t j = i % out_last_dim_size; + index_t irow = 0; // row id of flattend 2D out + index_t stride = 1; + index_t idx = i / out_last_dim_size; #pragma unroll for (int k = ndim - 2; k >= 0; --k) { irow += stride * ((idx % vshape[k]) * step[k] + begin[k]); @@ -911,7 +911,7 @@ void SliceOpBackward(const nnvm::NodeAttrs& attrs, LOG(FATAL) << "_slice_backward does not support kWriteInplace"; } MXNET_NDIM_SWITCH(ograd.ndim(), ndim, { - common::StaticArray begin, end, step; + common::StaticArray begin, end, step; GetIndexRange(igrad.shape_, param.begin, param.end, param.step, &begin, &end, &step); MSHADOW_TYPE_SWITCH(ograd.type_flag_, DType, { MXNET_ASSIGN_REQ_SWITCH(req[0], Req, { @@ -937,7 +937,7 @@ inline bool SliceAssignOpShape(const nnvm::NodeAttrs& attrs, TShape vshape = dshape; // vshape is the value shape on the right hand side const SliceParam& param = nnvm::get(attrs.parsed); MXNET_NDIM_SWITCH(dshape.ndim(), ndim, { - common::StaticArray begin, end, step; + common::StaticArray begin, end, step; GetIndexRange(dshape, param.begin, param.end, param.step, &begin, &end, &step); for (index_t i = 0; i < param.begin.ndim(); ++i) { const int b = begin[i], e = end[i], s = step[i]; @@ -975,7 +975,7 @@ void SliceAssignOpForward(const nnvm::NodeAttrs& attrs, const SliceParam& param = nnvm::get(attrs.parsed); MXNET_NDIM_SWITCH(data.ndim(), ndim, { - common::StaticArray begin, end, step; + common::StaticArray begin, end, step; GetIndexRange(data.shape_, param.begin, param.end, param.step, &begin, &end, &step); MSHADOW_TYPE_SWITCH(out.type_flag_, DType, { MXNET_ASSIGN_REQ_SWITCH(req[0], Req, { @@ -1024,20 +1024,20 @@ template struct slice_assign_scalar { // i is the i-th row after flattening out into 2D tensor template - MSHADOW_XINLINE static void Map(int i, DType* out, const DType val, + MSHADOW_XINLINE static void Map(index_t i, DType* out, const DType val, const OpReqType req, const mshadow::Shape oshape, const mshadow::Shape vshape, - const common::StaticArray begin, - const common::StaticArray step) { - const int data_last_dim_size = oshape[ndim-1]; - const int out_last_dim_size = vshape[ndim-1]; - const int step_last_dim = step[ndim-1]; - const int begin_last_dim = begin[ndim-1]; - for (int j = 0; j < out_last_dim_size; ++j) { - int irow = 0; // row id of flattend 2D out - int stride = 1; - int idx = i; + const common::StaticArray begin, + const common::StaticArray step) { + const index_t data_last_dim_size = oshape[ndim-1]; + const index_t out_last_dim_size = vshape[ndim-1]; + const index_t step_last_dim = step[ndim-1]; + const index_t begin_last_dim = begin[ndim-1]; + for (index_t j = 0; j < out_last_dim_size; ++j) { + index_t irow = 0; // row id of flattend 2D out + index_t stride = 1; + index_t idx = i; #pragma unroll for (int k = ndim - 2; k >= 0; --k) { irow += stride * ((idx % vshape[k]) * step[k] + begin[k]); @@ -1076,7 +1076,7 @@ void SliceAssignScalarOpForward(const nnvm::NodeAttrs& attrs, TShape vshape = data.shape_; const SliceAssignScalarParam& param = 
nnvm::get(attrs.parsed); MXNET_NDIM_SWITCH(data.ndim(), ndim, { - common::StaticArray begin, end, step; + common::StaticArray begin, end, step; GetIndexRange(data.shape_, param.begin, param.end, param.step, &begin, &end, &step); for (index_t i = 0; i < param.begin.ndim(); ++i) { const int b = begin[i], e = end[i], s = step[i]; @@ -1107,7 +1107,7 @@ struct SliceAxisParam : public dmlc::Parameter { }; inline void GetSliceAxisParams(const SliceAxisParam& param, const TShape& ishape, - int* axis, int* begin, int* end) { + int* axis, index_t* begin, index_t* end) { *axis = param.axis; if (*axis < 0) { *axis += static_cast(ishape.ndim()); @@ -1115,7 +1115,7 @@ inline void GetSliceAxisParams(const SliceAxisParam& param, const TShape& ishape CHECK(*axis < static_cast(ishape.ndim()) && *axis >= 0) << "Transformed axis must be smaller than the source ndim and larger than zero! Recieved axis=" << param.axis << ", src_ndim=" << ishape.ndim() << ", transformed axis=" << *axis; - int axis_size = static_cast(ishape[*axis]); + index_t axis_size = static_cast(ishape[*axis]); *begin = param.begin; *end = -1; if (*begin < 0) { @@ -1149,7 +1149,8 @@ inline bool SliceAxisShape(const nnvm::NodeAttrs& attrs, CHECK_EQ(in_attrs->size(), 1U); CHECK_EQ(out_attrs->size(), 1U); TShape& ishape = (*in_attrs)[0]; - int axis, begin, end; + int axis; + index_t begin, end; GetSliceAxisParams(param, ishape, &axis, &begin, &end); TShape shape(ishape.ndim()); for (index_t i = 0; i < ishape.ndim(); ++i) { @@ -1173,7 +1174,8 @@ void SliceAxis(const nnvm::NodeAttrs& attrs, using namespace mshadow::expr; const SliceAxisParam& param = nnvm::get(attrs.parsed); mshadow::Stream *s = ctx.get_stream(); - int axis, begin, end; + int axis; + index_t begin, end; GetSliceAxisParams(param, inputs[0].shape_, &axis, &begin, &end); int ndim = static_cast(outputs[0].ndim()); @@ -1207,7 +1209,8 @@ void SliceAxisGrad_(const nnvm::NodeAttrs& attrs, using namespace mshadow::op; using namespace mshadow::expr; mshadow::Stream *s = ctx.get_stream(); - int axis, begin, end; + int axis; + index_t begin, end; GetSliceAxisParams(param, outputs[0].shape_, &axis, &begin, &end); int ndim = static_cast(outputs[0].shape_.ndim()); @@ -1354,7 +1357,7 @@ void SliceLikeForward(const nnvm::NodeAttrs& attrs, SliceLikeInferRanges(ishape, from_shape, param.axes, ¶m_begin, ¶m_end, ¶m_step); MXNET_NDIM_SWITCH(data.ndim(), ndim, { - common::StaticArray begin, end, step; + common::StaticArray begin, end, step; GetIndexRange(data.shape_, param_begin, param_end, param_step, &begin, &end, &step); MSHADOW_TYPE_SWITCH(out.type_flag_, DType, { MXNET_ASSIGN_REQ_SWITCH(req[0], Req, { @@ -1400,7 +1403,7 @@ void SliceLikeBackward(const nnvm::NodeAttrs& attrs, SliceLikeInferRanges(ishape, from_shape, param.axes, ¶m_begin, ¶m_end, ¶m_step); MXNET_NDIM_SWITCH(ograd.ndim(), ndim, { - common::StaticArray begin, end, step; + common::StaticArray begin, end, step; GetIndexRange(ograd.shape_, param_begin, param_end, param_step, &begin, &end, &step); MSHADOW_TYPE_SWITCH(ograd.type_flag_, DType, { MXNET_ASSIGN_REQ_SWITCH(req[0], Req, { @@ -1429,7 +1432,7 @@ struct ClipParam : public dmlc::Parameter { struct clip { template - MSHADOW_XINLINE static void Map(int i, DType* out, const DType* datas, + MSHADOW_XINLINE static void Map(index_t i, DType* out, const DType* datas, DType a_min, DType a_max) { DType data = datas[i]; if (data > a_max) { @@ -1445,7 +1448,7 @@ struct clip { struct clip_grad { template - MSHADOW_XINLINE static void Map(int i, DType* out, const DType* grad, const DType* datas, 
+ MSHADOW_XINLINE static void Map(index_t i, DType* out, const DType* grad, const DType* datas, DType a_min, DType a_max) { DType data = datas[i]; if (data > a_max) { @@ -1934,7 +1937,7 @@ struct reverse { } #ifdef __CUDACC__ template - __device__ static void Map(int index, index_t nreversedim, const DType *src, DType *dst, + __device__ static void Map(index_t index, index_t nreversedim, const DType *src, DType *dst, const index_t * stride_, const index_t * trailing_) { __shared__ index_t stride_share[REVERSE_MAX_DIM]; @@ -1949,7 +1952,7 @@ struct reverse { } #else template - MSHADOW_XINLINE static void Map(int index, index_t nreversedim, const DType *src, DType *dst, + MSHADOW_XINLINE static void Map(index_t index, index_t nreversedim, const DType *src, DType *dst, const index_t * stride_, const index_t * trailing_) { index_t new_idx = ReverseIndex(index, nreversedim, stride_, trailing_); @@ -2141,10 +2144,10 @@ struct SqueezeParam : public dmlc::Parameter { // move all the zeros to the last of the shape array // and keep the relative order of the non-zero values. // Returns the new shape size after moving all zeros to the end. -inline uint32_t SqueezeShapeHelper(TShape* shape) { +inline size_t SqueezeShapeHelper(TShape* shape) { CHECK(shape != nullptr); - uint32_t count = 0; - for (uint32_t i = 0; i < shape->ndim(); ++i) { + size_t count = 0; + for (size_t i = 0; i < shape->ndim(); ++i) { if ((*shape)[i] == 0) { ++count; } else { @@ -2167,7 +2170,7 @@ inline bool SqueezeShape(const nnvm::NodeAttrs& attrs, if (param.axis.has_value()) { // preprocess axis TShape axes = param.axis.value(); - for (uint32_t i = 0; i < axes.ndim(); ++i) { + for (size_t i = 0; i < axes.ndim(); ++i) { if (axes[i] < 0) { axes[i] += dndim; CHECK_GE(axes[i], 0) @@ -2182,11 +2185,11 @@ inline bool SqueezeShape(const nnvm::NodeAttrs& attrs, oshape[axes[i]] = 0; } } else { - for (uint32_t i = 0; i < oshape.ndim(); ++i) { + for (size_t i = 0; i < oshape.ndim(); ++i) { if (oshape[i] == 1) oshape[i] = 0; } } - uint32_t oshape_size = SqueezeShapeHelper(&oshape); + size_t oshape_size = SqueezeShapeHelper(&oshape); if (oshape_size == 0) { // corner case when dshape is (1, 1, 1, 1) oshape[0] = 1; oshape_size = 1; @@ -2229,7 +2232,7 @@ inline bool DepthToSpaceOpShape(const nnvm::NodeAttrs& attrs, expected_out[0] = in_shape[0]; expected_out[1] = in_shape[1] / (block * block); - uint32_t i = 2; + size_t i = 2; while (i < expected_out.ndim()) { expected_out[i] = in_shape[i] * block; ++i; @@ -2259,9 +2262,9 @@ inline bool DepthToSpaceOpType(const nnvm::NodeAttrs& attrs, * \param inp_index index within input tensor from where value is retrieved * \param offset_arr array containing the linear offset of input tensor */ -MSHADOW_XINLINE void update_index(int index_position, int dim_size, int *idx, - int *inp_index, const int* offset_arr) { - int next_idx_val = *idx / dim_size; +MSHADOW_XINLINE void update_index(index_t index_position, index_t dim_size, index_t *idx, + index_t *inp_index, const index_t* offset_arr) { + index_t next_idx_val = *idx / dim_size; *inp_index += (*idx - next_idx_val * dim_size) * offset_arr[index_position]; *idx = next_idx_val; } @@ -2280,9 +2283,9 @@ MSHADOW_XINLINE void update_index(int index_position, int dim_size, int *idx, template struct depth_to_space_forward { template - MSHADOW_XINLINE static void Map(int i, DType* out_data, const DType* in_data, - const int block, const int* size, const int* offset_arr) { - int inp_index = 0, idx = i, dim_size; + MSHADOW_XINLINE static void Map(index_t i, DType* 
out_data, const DType* in_data, + const int block, const index_t* size, const index_t* offset_arr) { + index_t inp_index = 0, idx = i, dim_size; dim_size = block; update_index(2, dim_size, &idx, &inp_index, offset_arr); dim_size = size[3]; @@ -2315,9 +2318,9 @@ struct depth_to_space_forward { template struct compute_offset_for_depth_to_space { template - MSHADOW_XINLINE static void Map(int i, DType* offset_arr, DType* size, const int block, - const int32_t size0, const int32_t size1, const int32_t size2, - const int32_t size3) { + MSHADOW_XINLINE static void Map(index_t i, DType* offset_arr, DType* size, const int block, + const index_t size0, const index_t size1, const index_t size2, + const index_t size3) { size[0] = size0; size[1] = size1; size[2] = size2; @@ -2349,10 +2352,10 @@ void DepthToSpaceOpForward(const nnvm::NodeAttrs& attrs, int block = param.block_size; mshadow::Tensor workspace = - ctx.requested[0].get_space_typed(mshadow::Shape1(sizeof(int32_t) * 10), s); + ctx.requested[0].get_space_typed(mshadow::Shape1(sizeof(index_t) * 10), s); char* workspace_curr_ptr = workspace.dptr_; - int32_t* offset_arr = reinterpret_cast(workspace_curr_ptr); - int32_t* size = reinterpret_cast(workspace_curr_ptr + sizeof(int32_t) * 6); + index_t* offset_arr = reinterpret_cast(workspace_curr_ptr); + index_t* size = reinterpret_cast(workspace_curr_ptr + sizeof(index_t) * 6); MSHADOW_TYPE_SWITCH(out_data.type_flag_, DType, { MXNET_ASSIGN_REQ_SWITCH(req[0], req_type, { @@ -2431,9 +2434,9 @@ inline bool SpaceToDepthOpType(const nnvm::NodeAttrs& attrs, template struct space_to_depth_forward { template - MSHADOW_XINLINE static void Map(int i, DType* out_data, const DType* in_data, const int block, - const int* size, const int* offset_arr) { - int inp_index = 0, idx = i, dim_size; + MSHADOW_XINLINE static void Map(index_t i, DType* out_data, const DType* in_data, const int block, + const index_t* size, const index_t* offset_arr) { + index_t inp_index = 0, idx = i, dim_size; dim_size = size[3] / block; update_index(4, dim_size, &idx, &inp_index, offset_arr); dim_size = size[2] / block; @@ -2466,9 +2469,9 @@ struct space_to_depth_forward { template struct compute_offset_for_space_to_depth { template - MSHADOW_XINLINE static void Map(int i, DType* offset_arr, DType* size, const int block, - const int32_t size0, const int32_t size1, - const int32_t size2, const int32_t size3) { + MSHADOW_XINLINE static void Map(index_t i, DType* offset_arr, DType* size, const int block, + const index_t size0, const index_t size1, + const index_t size2, const index_t size3) { size[0] = size0; size[1] = size1; size[2] = size2; @@ -2500,10 +2503,10 @@ void SpaceToDepthOpForward(const nnvm::NodeAttrs& attrs, int block = param.block_size; mshadow::Tensor workspace = - ctx.requested[0].get_space_typed(mshadow::Shape1(sizeof(int32_t) * 10), s); + ctx.requested[0].get_space_typed(mshadow::Shape1(sizeof(index_t) * 10), s); char* workspace_curr_ptr = workspace.dptr_; - int32_t* offset_arr = reinterpret_cast(workspace_curr_ptr); - int32_t* size = reinterpret_cast(workspace_curr_ptr + sizeof(int32_t) * 6); + index_t* offset_arr = reinterpret_cast(workspace_curr_ptr); + index_t* size = reinterpret_cast(workspace_curr_ptr + sizeof(index_t) * 6); MSHADOW_TYPE_SWITCH(out_data.type_flag_, DType, { MXNET_ASSIGN_REQ_SWITCH(req[0], req_type, { diff --git a/tests/nightly/test_large_array.py b/tests/nightly/test_large_array.py index 121acc174b51..a301362f2db7 100644 --- a/tests/nightly/test_large_array.py +++ b/tests/nightly/test_large_array.py 
@@ -15,20 +15,126 @@ # specific language governing permissions and limitations # under the License. -import unittest import mxnet as mx +import numpy as np from mxnet import gluon, nd +# dimension constants +MEDIUM_X = 10000 +LARGE_X = 100000000 +LARGE_Y = 50000000 +SMALL_Y = 50 +LARGE_SIZE = LARGE_X * SMALL_Y + +def test_gluon_embedding(): + m = gluon.nn.Embedding(SMALL_Y, MEDIUM_X) + m.initialize() + a = nd.zeros((MEDIUM_X, SMALL_Y)) + b = m(a) + assert b.shape == (MEDIUM_X, SMALL_Y, MEDIUM_X) + assert b.asnumpy().size == LARGE_SIZE + +def test_ndarray_zeros(): + a = nd.zeros(shape=(LARGE_X, SMALL_Y)) + assert a[-1][0] == 0 + assert a.shape == (LARGE_X, SMALL_Y) + assert a.size == LARGE_SIZE + +def test_ndarray_ones(): + a = nd.ones(shape=(LARGE_X, SMALL_Y)) + assert a[-1][0] == 1 + assert nd.sum(a).asnumpy() == LARGE_SIZE + +def test_ndarray_random_uniform(): + a = nd.random.uniform(shape=(LARGE_X, SMALL_Y)) + assert a[-1][0] != 0 + +def test_ndarray_empty(): + a = nd.empty((LARGE_X, SMALL_Y)) + assert a.shape == (LARGE_X, SMALL_Y) + +def test_elementwise(): + a = nd.ones(shape=(LARGE_X, SMALL_Y)) + b = nd.ones(shape=(LARGE_X, SMALL_Y)) + res = a + b + assert np.sum(res[-1].asnumpy() == 2) == a.shape[1] + res = a + 1 + assert np.sum(res[-1].asnumpy() == 2) == a.shape[1] + res = nd.sqrt(a + 3) + assert np.sum(res[-1].asnumpy() == 2) == a.shape[1] + +def test_reduce(): + a = nd.ones(shape=(LARGE_X, SMALL_Y)) + assert nd.sum(a).asnumpy() == a.shape[0] * a.shape[1] + +def test_dot(): + a = nd.ones(shape=(LARGE_X, SMALL_Y)) + b = nd.ones(shape=(SMALL_Y, SMALL_Y)) + res = nd.dot(a, b) + assert np.sum(res[-1].asnumpy() == SMALL_Y) == b.shape[1] + +def test_FullyConnected(): + a = nd.ones(shape=(LARGE_X, SMALL_Y)) + b = nd.ones(shape=(SMALL_Y, SMALL_Y)) + res = nd.FullyConnected(a, b, num_hidden=b.shape[1], no_bias=True) + assert np.sum(res[-1].asnumpy() == SMALL_Y) == b.shape[1] + +def test_broadcast(): + a = nd.ones(shape=(LARGE_X, SMALL_Y)) + b = nd.arange(0, LARGE_X).reshape(LARGE_X, 1) + res = nd.broadcast_to(b, shape=(b.shape[0], SMALL_Y)) + assert np.sum(res[-1].asnumpy() == LARGE_X) == res.shape[1] + res = mx.nd.broadcast_like(b, a) + assert np.sum(res[-1].asnumpy() == LARGE_X) == a.shape[1] + +def test_clip(): + a = nd.arange(0, LARGE_X).reshape(LARGE_X, 1) + b = nd.broadcast_to(a, shape=(a.shape[0], SMALL_Y)) + res = nd.clip(b, a_min=100, a_max=1000) + assert np.sum(res[-1].asnumpy() == 1000) == b.shape[1] + +def test_take(): + a = nd.ones(shape=(LARGE_X, SMALL_Y)) + idx = nd.arange(LARGE_X-1000, LARGE_X) + res = nd.take(a, idx) + assert np.sum(res[-1].asnumpy() == 1) == res.shape[1] + +def test_slice(): + a = nd.ones(shape=(LARGE_X, SMALL_Y)) + res = nd.slice(a, begin=(LARGE_X-1000, 1), end=(LARGE_X, SMALL_Y)) + assert np.sum(res[-1].asnumpy() == 1) == res.shape[1] + +def test_slice_assign(): + a = nd.ones(shape=(LARGE_X, SMALL_Y)) + a[LARGE_X-1:LARGE_X] = 1000 + assert np.sum(a[-1].asnumpy() == 1000) == a.shape[1] + +def test_expand_dims(): + a = nd.ones(shape=(LARGE_X, SMALL_Y)) + res = nd.expand_dims(a, axis=1) + assert res.shape == (a.shape[0], 1, a.shape[1]) + +def test_squeeze(): + a = nd.ones(shape=(LARGE_X, SMALL_Y)) + data = nd.expand_dims(a, axis=1) + res = nd.squeeze(data) + assert res.shape == a.shape + +def test_broadcast_div(): + a = nd.ones(shape=(LARGE_X, SMALL_Y)) + b = nd.ones(shape=(LARGE_X, 1)) * 2 + res = a / b + assert np.sum(res[-1].asnumpy() == 0.5) == a.shape[1] + +def test_Dense(ctx=mx.cpu(0)): + data = mx.nd.ones(shape=(50*1000*1000, 100)) + linear = 
gluon.nn.Dense(100) + linear.initialize(ctx=ctx) + res = linear(data) + res.wait_to_read() + assert res.shape == (50000000, 100) -class TestLargeArray(unittest.TestCase): - def test_ndarray2numpy(self): - m = gluon.nn.Embedding(14000, 128) - m.initialize() - ind = nd.zeros((700000, 128)) - x = m(ind) - x.shape - test = x.asnumpy() - assert (x.shape == test.shape) if __name__ == '__main__': - unittest.main() + import nose + nose.runmodule() From aed3079817b6e073da4253f35bb3a72c51976aeb Mon Sep 17 00:00:00 2001 From: Gaurav Gireesh Date: Fri, 30 Nov 2018 22:53:46 -0800 Subject: [PATCH 14/28] [MXNET-1210 ] Gluon Audio - Example (#13325) * Initialized the example * Addressed PR comments, about existing synset.txt file - no overwrite * RST - docstring issues fixed * added README * Addressed PR comments * Addressed PR comments, checking Divide by 0 * Raising error if format is not supported. * changed a line for ndarray of labels * Trigger CI * Trigger CI * PR comments addressed around skip_header argument * Addressed PR comments around librosa import * PR Comments * Passing lazy=lazy from argument * Added PR comments, labels to README.MD * Trigger CI * Addressing PR Comments in README * Modified README.md * Added example under audio folder * Retrigger CI * Retrigger CI --- example/gluon/audio/transforms.py | 205 ++++++++++++++++++ example/gluon/audio/urban_sounds/README.md | 100 +++++++++ example/gluon/audio/urban_sounds/datasets.py | 179 +++++++++++++++ example/gluon/audio/urban_sounds/model.py | 33 +++ example/gluon/audio/urban_sounds/predict.py | 92 ++++++++ .../gluon/audio/urban_sounds/requirements.txt | 2 + example/gluon/audio/urban_sounds/train.py | 157 ++++++++++++++ 7 files changed, 768 insertions(+) create mode 100644 example/gluon/audio/transforms.py create mode 100644 example/gluon/audio/urban_sounds/README.md create mode 100644 example/gluon/audio/urban_sounds/datasets.py create mode 100644 example/gluon/audio/urban_sounds/model.py create mode 100644 example/gluon/audio/urban_sounds/predict.py create mode 100644 example/gluon/audio/urban_sounds/requirements.txt create mode 100644 example/gluon/audio/urban_sounds/train.py diff --git a/example/gluon/audio/transforms.py b/example/gluon/audio/transforms.py new file mode 100644 index 000000000000..8b76d131cdb1 --- /dev/null +++ b/example/gluon/audio/transforms.py @@ -0,0 +1,205 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+# coding: utf-8
+# pylint: disable= arguments-differ
+"""Audio transforms."""
+
+import warnings
+import numpy as np
+try:
+    import librosa
+except ImportError as e:
+    warnings.warn("librosa dependency could not be resolved or \
+    imported, could not provide some/all transforms.")
+
+from mxnet import ndarray as nd
+from mxnet.gluon.block import Block
+
+class MFCC(Block):
+    """Extracts Mel frequency cepstrum coefficients from the audio data file.
+    More details: https://librosa.github.io/librosa/generated/librosa.feature.mfcc.html
+
+    Attributes
+    ----------
+    sampling_rate: int, default 22050
+        sampling rate of the input audio signal
+    num_mfcc: int, default 20
+        number of mfccs to return
+
+
+    Inputs:
+        - **x**: input tensor (samples, ) shape.
+
+    Outputs:
+        - **out**: output array is an NDArray with (num_mfcc, ) shape, one averaged MFCC vector per clip.
+
+    """
+
+    def __init__(self, sampling_rate=22050, num_mfcc=20):
+        self._sampling_rate = sampling_rate
+        self._num_fcc = num_mfcc
+        super(MFCC, self).__init__()
+
+    def forward(self, x):
+        if isinstance(x, np.ndarray):
+            y = x
+        elif isinstance(x, nd.NDArray):
+            y = x.asnumpy()
+        else:
+            warnings.warn("MFCC - allowed datatypes are mx.nd.NDArray and numpy.ndarray")
+            return x
+
+        # average the per-frame MFCC matrix over time, yielding one (num_mfcc, ) vector per clip
+        audio_tmp = np.mean(librosa.feature.mfcc(y=y, sr=self._sampling_rate, n_mfcc=self._num_fcc).T, axis=0)
+        return nd.array(audio_tmp)
+
+
+class Scale(Block):
+    """Scale audio samples from integer PCM values to floating point numbers between
+    -1.0 and 1.0. The default scale_factor of 2**31 corresponds to 32-bit signed integer samples.
+
+    Attributes
+    ----------
+    scale_factor : float
+        The factor to scale the input tensor by.
+
+
+    Inputs:
+        - **x**: input tensor (samples, ) shape.
+
+    Outputs:
+        - **out**: output array is a scaled NDArray with (samples, ) shape.
+
+    Examples
+    --------
+    >>> scale = audio.transforms.Scale(scale_factor=2)
+    >>> audio_samples = mx.nd.array([2,3,4])
+    >>> scale(audio_samples)
+    [1. 1.5 2. ]
+
+
+    """
+
+    def __init__(self, scale_factor=2**31):
+        self.scale_factor = scale_factor
+        super(Scale, self).__init__()
+
+    def forward(self, x):
+        if self.scale_factor == 0:
+            warnings.warn("Scale factor cannot be 0.")
+            return x
+        if isinstance(x, np.ndarray):
+            return nd.array(x/self.scale_factor)
+        return x / self.scale_factor
+
+
+class PadTrim(Block):
+    """Pad/Trim a 1d-NDArray or numpy.ndarray (Signal or Labels)
+
+    Attributes
+    ----------
+    max_len : int
+        Length to which the array will be padded or trimmed to.
+    fill_value: int or float
+        If there is a need of padding, what value to pad at the end of the input array.
+
+
+    Inputs:
+        - **x**: input tensor (samples, ) shape.
+
+    Outputs:
+        - **out**: output array is a padded/trimmed NDArray with (max_len, ) shape.
+
+    Examples
+    --------
+    >>> padtrim = audio.transforms.PadTrim(max_len=9, fill_value=0)
+    >>> audio_samples = mx.nd.array([1,2,3,4,5])
+    >>> padtrim(audio_samples)
+    [1. 2. 3. 4. 5. 0. 0. 0. 0.]
+
+
+    """
+
+    def __init__(self, max_len, fill_value=0):
+        self._max_len = max_len
+        self._fill_value = fill_value
+        super(PadTrim, self).__init__()
+
+    def forward(self, x):
+        if isinstance(x, np.ndarray):
+            x = nd.array(x)
+        if self._max_len > x.size:
+            pad = nd.ones((self._max_len - x.size,)) * self._fill_value
+            x = nd.concat(x, pad, dim=0)
+        elif self._max_len < x.size:
+            x = x[:self._max_len]
+        return x
+
+
+class MEL(Block):
+    """Create MEL Spectrograms from a raw audio signal. This transform is relatively slow.
+
+    Attributes
+    ----------
+    sampling_rate: int, default 22050
+        sampling rate of the input audio signal
+    num_fft: int, default 2048
+        length of the Fast Fourier transform window
+    num_mels: int, default 20
+        number of mel bands to generate
+    hop_length: int, default 512
+        total samples between successive frames
+
+
+    Inputs:
+        - **x**: input tensor (samples, ) shape.
+
+    Outputs:
+        - **out**: output array which consists of mel spectrograms, shape = (n_mels, 1)
+
+    Usage (see librosa.feature.melspectrogram docs):
+        MEL(sampling_rate=16000, num_fft=1600, hop_length=800, num_mels=64)
+
+    Examples
+    --------
+    >>> mel = audio.transforms.MEL()
+    >>> audio_samples = mx.nd.array([1,2,3,4,5])
+    >>> mel(audio_samples)
+    [[3.81801406e+04]
+     [9.86858240e-29]
+     [1.87405472e-29]
+     [2.38637225e-29]
+     [3.94043010e-29]
+     [3.67071565e-29]
+     [7.29390295e-29]
+     [8.84324438e-30]...
+
+
+    """
+
+    def __init__(self, sampling_rate=22050, num_fft=2048, num_mels=20, hop_length=512):
+        self._sampling_rate = sampling_rate
+        self._num_fft = num_fft
+        self._num_mels = num_mels
+        self._hop_length = hop_length
+        super(MEL, self).__init__()
+
+    def forward(self, x):
+        if isinstance(x, nd.NDArray):
+            x = x.asnumpy()
+        specs = librosa.feature.melspectrogram(x, sr=self._sampling_rate,\
+                n_fft=self._num_fft, n_mels=self._num_mels, hop_length=self._hop_length)
+        return nd.array(specs)
diff --git a/example/gluon/audio/urban_sounds/README.md b/example/gluon/audio/urban_sounds/README.md
new file mode 100644
index 000000000000..c85d29db2e5a
--- /dev/null
+++ b/example/gluon/audio/urban_sounds/README.md
@@ -0,0 +1,100 @@
+# Urban Sounds Classification in MXNet Gluon
+
+This example provides an end-to-end pipeline for a common datahack competition - Urban Sounds Classification.
+Below is the link to the competition:
+https://datahack.analyticsvidhya.com/contest/practice-problem-urban-sound-classification/
+
+After logging in, the dataset can be downloaded.
+The details of the dataset and the link to download it are given below:
+
+
+## Urban Sounds Dataset:
+### Description
+   The dataset contains 8732 wav files which are audio samples (<= 4s) of street sounds like engine_idling, car_horn, children_playing, dog_barking and so on.
+   The task is to classify these audio samples into one of the following 10 labels:
+   ```
+   siren,
+   street_music,
+   drilling,
+   dog_bark,
+   children_playing,
+   gun_shot,
+   engine_idling,
+   air_conditioner,
+   jackhammer,
+   car_horn
+   ```
+
+To be able to run this example:
+
+1. `pip install -r requirements.txt`
+
+    If you are in the directory where the requirements.txt file lies,
+    this step installs the required libraries to run the example.
+    The main dependency required is Librosa.
+    The version used to test the example is `0.6.2`.
+    For more details, refer here:
+https://librosa.github.io/librosa/install.html
+
+2. Download the dataset (train.zip, test.zip) required for this example from the location:
+https://drive.google.com/drive/folders/0By0bAi7hOBAFUHVXd1JCN3MwTEU
+
+3. Extract both the zip archives into the **current directory** - after unzipping you would get 2 new folders namely,
+   **Train** and **Test** and two csv files - **train.csv**, **test.csv**
+
+   Assuming you are in a directory *"UrbanSounds"*, after downloading and extracting train.zip, the folder structure should be:
+
+   ```
+   UrbanSounds
+       - Train
+           - 0.wav, 1.wav ...
+       - train.csv
+       - train.py
+       - predict.py ...
+   ```
+
+4. Apache MXNet is installed on the machine. For instructions, go to the link: https://mxnet.incubator.apache.org/install/ A quick way to verify the setup is shown below.
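+
+As a quick sanity check of steps 1-3 (this assumes the extracted `Train` folder with a file `0.wav` in it, as in the folder sketch above), you can confirm that librosa is able to load one of the training clips:
+
+```python
+import librosa
+
+# res_type='kaiser_fast' trades a little resampling accuracy for speed;
+# it is the same setting this example's dataset loader uses.
+samples, sampling_rate = librosa.load("./Train/0.wav", res_type='kaiser_fast')
+print(samples.shape, sampling_rate)  # 1-D float array, resampled to librosa's default 22050 Hz
+```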
+
+
+
+For information on the current design of how the AudioFolderDataset is implemented, refer below:
+https://cwiki.apache.org/confluence/display/MXNET/Gluon+-+Audio
+
+### Usage
+
+For training:
+
+- Arguments
+    - train : The folder/directory that contains the audio (wav) files locally. Default = "./Train"
+    - csv : The file name of the csv file that contains audio file name to label mapping. Default = "train.csv"
+    - epochs : Number of epochs to train the model. Default = 30
+    - batch_size : The batch size for training. Default = 32
+
+
+###### To use the default arguments, use:
+```
+python train.py
+```
+or
+
+###### To pass command-line arguments for training data directory, epochs, batch_size, csv file name, use:
+```
+python train.py --train ./Train --csv train.csv --batch_size 32 --epochs 30
+```
+
+For prediction:
+
+- Arguments
+    - pred : The folder/directory that contains the audio (wav) files which are to be classified. Default = "./Test"
+
+
+###### To use the default arguments, use:
+```
+python predict.py
+```
+or
+
+###### To pass command-line arguments for test data directory, use:
+```
+python predict.py --pred ./Test
+```
\ No newline at end of file
diff --git a/example/gluon/audio/urban_sounds/datasets.py b/example/gluon/audio/urban_sounds/datasets.py
new file mode 100644
index 000000000000..51c040c8f162
--- /dev/null
+++ b/example/gluon/audio/urban_sounds/datasets.py
@@ -0,0 +1,179 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# coding: utf-8
+# pylint: disable=
+""" Audio Dataset container."""
+from __future__ import print_function
+__all__ = ['AudioFolderDataset']
+
+import os
+import warnings
+from itertools import islice
+import csv
+from mxnet.gluon.data import Dataset
+from mxnet import ndarray as nd
+try:
+    import librosa
+except ImportError as e:
+    raise ImportError("librosa dependency could not be resolved or \
+    imported, could not load audio into a numpy array. Run `pip install librosa`.")
+
+
+
+class AudioFolderDataset(Dataset):
+    """A dataset for loading Audio files stored in a folder structure like::
+
+        root/children_playing/0.wav
+        root/siren/23.wav
+        root/drilling/26.wav
+        root/dog_barking/42.wav
+    OR
+    Files (wav) and a csv file that maps file names to labels
+
+    Parameters
+    ----------
+    root : str
+        Path to root directory.
+    train_csv: str, default None
+        train_csv should be populated by the training csv filename
+    file_format: str, default '.wav'
+        The format of the audio files (.wav)
+    skip_header: boolean, default False
+        Whether to skip the first row (header) when reading from the csv file
+
+
+    Attributes
+    ----------
+    synsets : list
+        List of class names. `synsets[i]` is the name for the `i`th label
+    items : list of tuples
+        List of all audio in (filename, label) pairs.
+
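+    Examples
+    --------
+    >>> dataset = AudioFolderDataset('./Train', train_csv='./train.csv', skip_header=True)
+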
+    """
+    def __init__(self, root, train_csv=None, file_format='.wav', skip_header=False):
+        if not librosa:
+            warnings.warn("pip install librosa to continue.")
+            raise RuntimeError("Librosa not installed. Run pip install librosa and retry this step.")
+        self._root = os.path.expanduser(root)
+        self._exts = ['.wav']
+        self._format = file_format
+        self._train_csv = train_csv
+        if file_format.lower() not in self._exts:
+            raise RuntimeError("Format {} not supported currently.".format(file_format))
+        skip_rows = 0
+        if skip_header:
+            skip_rows = 1
+        self._list_audio_files(self._root, skip_rows=skip_rows)
+
+
+    def _list_audio_files(self, root, skip_rows=0):
+        """Populates synsets - a map of index to label for the data items.
+        Populates the data in the dataset, making tuples of (data, label)
+        """
+        self.synsets = []
+        self.items = []
+        if not self._train_csv:
+            # The audio files are organized in a folder structure with
+            # directory name as label and audios in them
+            self._folder_structure(root)
+        else:
+            # train_csv contains mapping between filename and label
+            self._csv_labelled_dataset(root, skip_rows=skip_rows)
+
+        # Generating the synset.txt file now
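+        # predict.py reads synset.txt later to map predicted label indices back to names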
+        if not os.path.exists("./synset.txt"):
+            with open("./synset.txt", "w") as synsets_file:
+                for item in self.synsets:
+                    synsets_file.write(item+os.linesep)
+            print("Synsets file generated as synset.txt")
+        else:
+            warnings.warn("Synset file already exists in the current directory! Not generating synset.txt.")
+
+
+    def _folder_structure(self, root):
+        for folder in sorted(os.listdir(root)):
+            path = os.path.join(root, folder)
+            if not os.path.isdir(path):
+                warnings.warn('Ignoring {}, which is not a directory.'.format(path))
+                continue
+            label = len(self.synsets)
+            self.synsets.append(folder)
+            for filename in sorted(os.listdir(path)):
+                file_name = os.path.join(path, filename)
+                ext = os.path.splitext(file_name)[1]
+                if ext.lower() not in self._exts:
+                    warnings.warn('Ignoring {} of type {}. Only support {}'\
+                                  .format(filename, ext, ', '.join(self._exts)))
+                    continue
+                self.items.append((file_name, label))
+
+
+    def _csv_labelled_dataset(self, root, skip_rows=0):
+        with open(self._train_csv, "r") as traincsv:
+            for line in islice(csv.reader(traincsv), skip_rows, None):
+                filename = os.path.join(root, line[0])
+                label = line[1].strip()
+                if label not in self.synsets:
+                    self.synsets.append(label)
+                if self._format not in filename:
+                    filename = filename+self._format
+                self.items.append((filename, nd.array([self.synsets.index(label)]).reshape((1,))))
+
+
+    def __getitem__(self, idx):
+        """Retrieve the item (data, label) stored at idx in items"""
+        filename, label = self.items[idx]
+        # resampling_type is passed as kaiser_fast for better performance
+        X1, _ = librosa.load(filename, res_type='kaiser_fast')
+        return nd.array(X1), label
+
+
+    def __len__(self):
+        """Retrieves the number of items in the dataset"""
+        return len(self.items)
+
+
+    def transform_first(self, fn, lazy=False):
+        """Returns a new dataset with the first element of each sample
+        transformed by the transformer function `fn`.
+
+        This is useful, for example, when you only want to transform data
+        while keeping label as is.
+        lazy=False is passed to transform_first for this dataset so that all transforms are
+        performed in one shot before training starts rather than during it. This is a performance consideration.
+
+        Parameters
+        ----------
+        fn : callable
+            A transformer function that takes the first element of a sample
+            as input and returns the transformed element.
+        lazy : bool, default False
+            If False, transforms all samples at once. Otherwise,
+            transforms each sample on demand. Note that if `fn`
+            is stochastic, you must set lazy to True or you will
+            get the same result on all epochs.
+
+        Returns
+        -------
+        Dataset
+            The transformed dataset.
+
+        """
+        return super(AudioFolderDataset, self).transform_first(fn, lazy=lazy)
diff --git a/example/gluon/audio/urban_sounds/model.py b/example/gluon/audio/urban_sounds/model.py
new file mode 100644
index 000000000000..af23cb946e2e
--- /dev/null
+++ b/example/gluon/audio/urban_sounds/model.py
@@ -0,0 +1,33 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+"""This module builds an MLP model with a configurable output layer (number of units in the last layer).
+Users can pass any number of units in the last layer. Since this dataset has 10 labels,
+the default value of num_labels = 10
+"""
+import mxnet as mx
+from mxnet import gluon
+
+# Defining a neural network with number of labels
+def get_net(num_labels=10):
+    net = gluon.nn.Sequential()
+    with net.name_scope():
+        net.add(gluon.nn.Dense(256, activation="relu")) # 1st hidden layer (256 nodes)
+        net.add(gluon.nn.Dense(256, activation="relu")) # 2nd hidden layer (256 nodes)
+        net.add(gluon.nn.Dense(num_labels))
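+        # no softmax here: the output layer emits raw logits, and SoftmaxCELoss
+        # (from_logits=False) in train.py applies the softmax during training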
+    net.collect_params().initialize(mx.init.Xavier())
+    return net
diff --git a/example/gluon/audio/urban_sounds/predict.py b/example/gluon/audio/urban_sounds/predict.py
new file mode 100644
index 000000000000..0c3631173667
--- /dev/null
+++ b/example/gluon/audio/urban_sounds/predict.py
@@ -0,0 +1,92 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+""" Prediction module for Urban Sounds Classification"""
+from __future__ import print_function
+import os
+import sys
+import warnings
+import mxnet as mx
+from mxnet import nd
+from model import get_net
+try:
+    import librosa
+except ImportError:
+    raise ImportError("Librosa is not installed! Please run the following command:\
+    `pip install librosa`")
+sys.path.append('../')
+
+def predict(prediction_dir='./Test'):
+    """Runs predictions on the audio files in the directory `prediction_dir`.
+
+    Parameters
+    ----------
+    prediction_dir: string, default ./Test
+        The directory that contains the audio files on which predictions are to be made
+
+    """
+
+    if not os.path.exists(prediction_dir):
+        warnings.warn("The directory on which predictions are to be made is not found!")
+        return
+
+    if len(os.listdir(prediction_dir)) == 0:
+        warnings.warn("The directory on which predictions are to be made is empty! Exiting...")
+        return
+
+    # Loading synsets
+    if not os.path.exists('./synset.txt'):
+        warnings.warn("The synset or labels for the dataset do not exist. Please run the training script first.")
+        return
+
+    with open("./synset.txt", "r") as f:
+        synset = [l.rstrip() for l in f]
+    net = get_net(len(synset))
+    print("Trying to load the model with the saved parameters...")
+    if not os.path.exists("./net.params"):
+        warnings.warn("The model does not have any saved parameters... Cannot proceed! Train the model first")
+        return
+
+    net.load_parameters("./net.params")
+    file_names = os.listdir(prediction_dir)
+    full_file_names = [os.path.join(prediction_dir, item) for item in file_names]
+    from transforms import MFCC
+    mfcc = MFCC()
+    print("\nStarting predictions for audio files in ", prediction_dir, " ....\n")
+    for filename in full_file_names:
+        # Argument kaiser_fast to res_type is faster than 'kaiser_best'. To reduce the load time, passing kaiser_fast.
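+        # librosa.load also resamples every clip to its default rate of 22050 Hz,
+        # matching the default sampling_rate the MFCC transform assumes.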
+        X1, _ = librosa.load(filename, res_type='kaiser_fast')
+        transformed_test_data = mfcc(mx.nd.array(X1))
+        output = net(transformed_test_data.reshape((1, -1)))
+        prediction = nd.argmax(output, axis=1)
+        print(filename, " -> ", synset[int(prediction.asscalar())])
+
+
+if __name__ == '__main__':
+    try:
+        import argparse
+        parser = argparse.ArgumentParser(description="Urban Sounds classification example - MXNet")
+        parser.add_argument('--pred', '-p', help="Enter the folder path that contains your audio files", type=str)
+        args = parser.parse_args()
+        # fall back to the default directory when --pred is not passed
+        pred_dir = args.pred if args.pred else './Test'
+
+    except ImportError:
+        warnings.warn("Argparse module not installed! Passing default arguments.")
+        pred_dir = './Test'
+    predict(prediction_dir=pred_dir)
+    print("Urban sounds classification Prediction DONE!")
diff --git a/example/gluon/audio/urban_sounds/requirements.txt b/example/gluon/audio/urban_sounds/requirements.txt
new file mode 100644
index 000000000000..d885e0beec7e
--- /dev/null
+++ b/example/gluon/audio/urban_sounds/requirements.txt
@@ -0,0 +1,2 @@
+librosa>=0.6.2 # librosa is a library that is used to load the audio (wav) files and provides feature extraction capabilities.
+argparse # used for parsing arguments
\ No newline at end of file
diff --git a/example/gluon/audio/urban_sounds/train.py b/example/gluon/audio/urban_sounds/train.py
new file mode 100644
index 000000000000..c88f9fb55187
--- /dev/null
+++ b/example/gluon/audio/urban_sounds/train.py
@@ -0,0 +1,157 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""The module to run training on the Urban sounds dataset"""
+from __future__ import print_function
+import sys
+import os
+import time
+import warnings
+import mxnet as mx
+from mxnet import gluon, nd, autograd
+from datasets import AudioFolderDataset
+import model
+sys.path.append('../')
+
+def evaluate_accuracy(data_iterator, net):
+    """Function to evaluate accuracy of any data iterator passed to it as an argument"""
+    acc = mx.metric.Accuracy()
+    for data, label in data_iterator:
+        output = net(data)
+        predictions = nd.argmax(output, axis=1)
+        predictions = predictions.reshape((-1, 1))
+        acc.update(preds=predictions, labels=label)
+    return acc.get()[1]
+
+
+def train(train_dir=None, train_csv=None, epochs=30, batch_size=32):
+    """Function responsible for training the model."""
+
+    if not train_dir or not os.path.exists(train_dir) or not train_csv:
+        warnings.warn("No train directory or train csv file could be found.")
+        return
+    # Make a dataset from the local folder containing Audio data
+    print("\nMaking an Audio Dataset...\n")
+    tick = time.time()
+    aud_dataset = AudioFolderDataset(train_dir, train_csv=train_csv, file_format='.wav', skip_header=True)
+    tock = time.time()
+
+    print("Loading the dataset took ", (tock-tick), " seconds.")
+    print("\n=======================================\n")
+    print("Number of output classes = ", len(aud_dataset.synsets))
+    print("\nThe labels are : \n")
+    print(aud_dataset.synsets)
+    # Get the model to train
+    net = model.get_net(len(aud_dataset.synsets))
+    print("\nNeural Network = \n")
+    print(net)
+    print("\nModel - Neural Network Generated!\n")
+    print("=======================================\n")
+
+    # Define the loss - Softmax CE Loss
+    softmax_loss = gluon.loss.SoftmaxCELoss(from_logits=False, sparse_label=True)
+    print("Loss function initialized!\n")
+    print("=======================================\n")
+
+    # Define the trainer with the optimizer
+    trainer = gluon.Trainer(net.collect_params(), 'adadelta')
+    print("Optimizer - Trainer function initialized!\n")
+    print("=======================================\n")
+    print("Loading the dataset to the Gluon's OOTB Dataloader...")
+
+    # Getting the data loader out of the AudioDataset and passing the transform
+    from transforms import MFCC
+    aud_transform = MFCC()
+    tick = time.time()
+
+    audio_train_loader = gluon.data.DataLoader(aud_dataset.transform_first(aud_transform), batch_size=batch_size, shuffle=True)
+    tock = time.time()
+    print("Time taken to load data and apply transform here is ", (tock-tick), " seconds.")
+    print("=======================================\n")
+
+
+    print("Starting the training....\n")
+    # Training loop
+    tick = time.time()
+    num_examples = len(aud_dataset)
+
+    for epoch in range(epochs):
+        cumulative_loss = 0
+        for data, label in audio_train_loader:
+            with autograd.record():
+                output = net(data)
+                loss = softmax_loss(output, label)
+            loss.backward()
+
+            trainer.step(batch_size)
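+            # step(batch_size) normalizes the accumulated gradients by 1/batch_size
+            # before the Adadelta update is applied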
+"""The module to run training on the Urban sounds dataset""" +from __future__ import print_function +import sys +import os +import time +import warnings +import mxnet as mx +from mxnet import gluon, nd, autograd +from datasets import AudioFolderDataset +import model +sys.path.append('../') + +def evaluate_accuracy(data_iterator, net): + """Function to evaluate accuracy of any data iterator passed to it as an argument""" + acc = mx.metric.Accuracy() + for data, label in data_iterator: + output = net(data) + predictions = nd.argmax(output, axis=1) + predictions = predictions.reshape((-1, 1)) + acc.update(preds=predictions, labels=label) + return acc.get()[1] + + +def train(train_dir=None, train_csv=None, epochs=30, batch_size=32): + """Function responsible for running the training the model.""" + + if not train_dir or not os.path.exists(train_dir) or not train_csv: + warnings.warn("No train directory could be found ") + return + # Make a dataset from the local folder containing Audio data + print("\nMaking an Audio Dataset...\n") + tick = time.time() + aud_dataset = AudioFolderDataset(train_dir, train_csv=train_csv, file_format='.wav', skip_header=True) + tock = time.time() + + print("Loading the dataset took ", (tock-tick), " seconds.") + print("\n=======================================\n") + print("Number of output classes = ", len(aud_dataset.synsets)) + print("\nThe labels are : \n") + print(aud_dataset.synsets) + # Get the model to train + net = model.get_net(len(aud_dataset.synsets)) + print("\nNeural Network = \n") + print(net) + print("\nModel - Neural Network Generated!\n") + print("=======================================\n") + + #Define the loss - Softmax CE Loss + softmax_loss = gluon.loss.SoftmaxCELoss(from_logits=False, sparse_label=True) + print("Loss function initialized!\n") + print("=======================================\n") + + #Define the trainer with the optimizer + trainer = gluon.Trainer(net.collect_params(), 'adadelta') + print("Optimizer - Trainer function initialized!\n") + print("=======================================\n") + print("Loading the dataset to the Gluon's OOTB Dataloader...") + + #Getting the data loader out of the AudioDataset and passing the transform + from transforms import MFCC + aud_transform = MFCC() + tick = time.time() + + audio_train_loader = gluon.data.DataLoader(aud_dataset.transform_first(aud_transform), batch_size=32, shuffle=True) + tock = time.time() + print("Time taken to load data and apply transform here is ", (tock-tick), " seconds.") + print("=======================================\n") + + + print("Starting the training....\n") + # Training loop + tick = time.time() + batch_size = batch_size + num_examples = len(aud_dataset) + + for epoch in range(epochs): + cumulative_loss = 0 + for data, label in audio_train_loader: + with autograd.record(): + output = net(data) + loss = softmax_loss(output, label) + loss.backward() + + trainer.step(batch_size) + cumulative_loss += mx.nd.sum(loss).asscalar() + + if epoch%5 == 0: + train_accuracy = evaluate_accuracy(audio_train_loader, net) + print("Epoch {}. 
Loss: {} Train accuracy : {} ".format(epoch, cumulative_loss/num_examples, train_accuracy)) + print("\n------------------------------\n") + + train_accuracy = evaluate_accuracy(audio_train_loader, net) + tock = time.time() + print("\nFinal training accuracy: ", train_accuracy) + + print("Training the sound classification for ", epochs, " epochs, MLP model took ", (tock-tick), " seconds") + print("====================== END ======================\n") + + print("Trying to save the model parameters here...") + net.save_parameters("./net.params") + print("Saved the model parameters in current directory.") + + +if __name__ == '__main__': + training_dir = './Train' + training_csv = './train.csv' + epochs = 30 + batch_size = 32 + + try: + import argparse + parser = argparse.ArgumentParser(description="Urban Sounds classification example - MXNet Gluon") + parser.add_argument('--train', '-t', help="Enter the folder path that contains your audio files", type=str) + parser.add_argument('--csv', '-c', help="Enter the filename of the csv that contains filename\ + to label mapping", type=str) + parser.add_argument('--epochs', '-e', help="Enter the number of epochs \ + you would want to run the training for.", type=int) + parser.add_argument('--batch_size', '-b', help="Enter the batch_size of data", type=int) + args = parser.parse_args() + + if args: + if args.train: + training_dir = args.train + + if args.csv: + training_csv = args.csv + + if args.epochs: + epochs = args.epochs + + if args.batch_size: + batch_size = args.batch_size + + + except ImportError as er: + warnings.warn("Argument parsing module could not be imported \ + Passing default arguments.") + + + train(train_dir=training_dir, train_csv=training_csv, epochs=epochs, batch_size=batch_size) + print("Urban sounds classification Training DONE!") From c9ddcb86eed0ac978b557c715efd9ef86b74c3c7 Mon Sep 17 00:00:00 2001 From: Vandana Kannan Date: Fri, 30 Nov 2018 23:05:51 -0800 Subject: [PATCH 15/28] ONNX export: Instance normalization, Shape (#12920) * ONNX import/export: Make backend_rep common * ONNX export: Instance Normalization * ONNX export: Shape operator --- .../contrib/onnx/mx2onnx/_op_translations.py | 26 +++++ .../onnx/{export => }/backend_rep.py | 32 +++--- tests/python-pytest/onnx/export/backend.py | 4 + .../onnx/export/onnx_backend_test.py | 4 +- .../onnx/import/mxnet_backend.py | 6 +- .../onnx/import/mxnet_backend_rep.py | 98 ------------------- 6 files changed, 54 insertions(+), 116 deletions(-) rename tests/python-pytest/onnx/{export => }/backend_rep.py (78%) delete mode 100644 tests/python-pytest/onnx/import/mxnet_backend_rep.py diff --git a/python/mxnet/contrib/onnx/mx2onnx/_op_translations.py b/python/mxnet/contrib/onnx/mx2onnx/_op_translations.py index e2aab6b1efa7..facdcfedcbca 100644 --- a/python/mxnet/contrib/onnx/mx2onnx/_op_translations.py +++ b/python/mxnet/contrib/onnx/mx2onnx/_op_translations.py @@ -623,6 +623,23 @@ def convert_identity(node, **kwargs): """ return create_basic_op_node('Identity', node, kwargs) +@mx_op.register("InstanceNorm") +def convert_instancenorm(node, **kwargs): + """Map MXNet's InstanceNorm operator attributes to onnx's InstanceNormalization operator + based on the input node's attributes and return the created node. 
+ """ + name, input_nodes, attrs = get_inputs(node, kwargs) + + eps = float(attrs.get("eps", 0.001)) + + node = onnx.helper.make_node( + 'InstanceNormalization', + inputs=input_nodes, + outputs=[name], + name=name, + epsilon=eps) + + return [node] @mx_op.register("LeakyReLU") def convert_leakyrelu(node, **kwargs): @@ -1546,6 +1563,15 @@ def convert_sum(node, **kwargs): ) return [node] + +@mx_op.register("shape_array") +def convert_shape(node, **kwargs): + """Map MXNet's shape_array operator attributes to onnx's Shape operator + and return the created node. + """ + return create_basic_op_node('Shape', node, kwargs) + + @mx_op.register("hard_sigmoid") def convert_hardsigmoid(node, **kwargs): """Map MXNet's hard_sigmoid operator attributes to onnx's HardSigmoid operator diff --git a/tests/python-pytest/onnx/export/backend_rep.py b/tests/python-pytest/onnx/backend_rep.py similarity index 78% rename from tests/python-pytest/onnx/export/backend_rep.py rename to tests/python-pytest/onnx/backend_rep.py index 8729eafea1a1..63836ac848df 100644 --- a/tests/python-pytest/onnx/export/backend_rep.py +++ b/tests/python-pytest/onnx/backend_rep.py @@ -16,16 +16,17 @@ # under the License. # coding: utf-8 -"""backend rep for onnx test infrastructure""" +"""MXNet backend rep for onnx test infrastructure""" try: from onnx.backend.base import BackendRep except ImportError: - raise ImportError("Onnx and protobuf need to be installed") + raise ImportError("Onnx and protobuf need to be installed. Instructions to" + + " install - https://github.com/onnx/onnx#installation") import mxnet as mx # Using these functions for onnx test infrastructure. # Implemented by following onnx docs guide: -# https://github.com/onnx/onnx/blob/master/docs/Implementing%20an%20ONNX%20backend.md +# https://github.com/onnx/onnx/blob/master/docs/ImplementingAnOnnxBackend.md # MXNetBackendRep object will be returned by MXNetBackend's prepare method which is used to # execute a model repeatedly. 
# Inputs will be passed to the run method of MXNetBackendRep class, it will perform computation and @@ -54,9 +55,6 @@ def run(self, inputs, **kwargs): params : numpy array result obtained after running the inference on mxnet """ - data_forward = [] - for val in inputs: - data_forward.append(mx.nd.array(val)) # create module, passing cpu context if self.device == 'CPU': ctx = mx.cpu() @@ -68,17 +66,19 @@ def run(self, inputs, **kwargs): data_names = [graph_input for graph_input in self.symbol.list_inputs() if graph_input not in self.arg_params and graph_input not in self.aux_params] - data_shapes = [] + data_forward = [] for idx, input_name in enumerate(data_names): - data_shapes.append((input_name, inputs[idx].shape)) + val = inputs[idx] + data_forward.append(mx.nd.array(val)) - mod = mx.mod.Module(symbol=self.symbol, data_names=data_names, context=ctx, - label_names=None) - mod.bind(for_training=False, data_shapes=data_shapes, - label_shapes=None) - mod.set_params(arg_params=self.arg_params, aux_params=self.aux_params) + if self.arg_params: + for idx, input_name in enumerate(self.arg_params): + val = self.arg_params[input_name] + data_names.append(input_name) + data_forward.append(mx.nd.array(val)) - # run inference - mod.forward(mx.io.DataBatch(data_forward)) - result = mod.get_outputs()[0].asnumpy() + args = dict(zip(data_names, data_forward)) + exe = self.symbol.bind(ctx, args=args, aux_states=self.aux_params) + exe.forward(is_train=False) + result = exe.outputs[0].asnumpy() return [result] diff --git a/tests/python-pytest/onnx/export/backend.py b/tests/python-pytest/onnx/export/backend.py index e23cc01494e9..3ea1dafca255 100644 --- a/tests/python-pytest/onnx/export/backend.py +++ b/tests/python-pytest/onnx/export/backend.py @@ -17,6 +17,8 @@ # coding: utf-8 """backend wrapper for onnx test infrastructure""" +import os +import sys import numpy as np from mxnet.contrib.onnx.onnx2mx.import_onnx import GraphProto from mxnet.contrib.onnx.mx2onnx.export_onnx import MXNetGraph @@ -25,6 +27,8 @@ from onnx.backend.base import Backend except ImportError: raise ImportError("Onnx and protobuf need to be installed") +CURR_PATH = os.path.dirname(os.path.abspath(os.path.expanduser(__file__))) +sys.path.insert(0, os.path.join(CURR_PATH, '../')) from backend_rep import MXNetBackendRep # Using these functions for onnx test infrastructure. diff --git a/tests/python-pytest/onnx/export/onnx_backend_test.py b/tests/python-pytest/onnx/export/onnx_backend_test.py index ec9ddf23c252..be9273eb6fac 100644 --- a/tests/python-pytest/onnx/export/onnx_backend_test.py +++ b/tests/python-pytest/onnx/export/onnx_backend_test.py @@ -95,7 +95,9 @@ 'test_clip' 'test_cast', 'test_depthtospace', - 'test_hardsigmoid' + 'test_hardsigmoid', + 'test_instancenorm', + 'test_shape' ] BASIC_MODEL_TESTS = [ diff --git a/tests/python-pytest/onnx/import/mxnet_backend.py b/tests/python-pytest/onnx/import/mxnet_backend.py index 10f89ecbbbc7..bd4910b64f85 100644 --- a/tests/python-pytest/onnx/import/mxnet_backend.py +++ b/tests/python-pytest/onnx/import/mxnet_backend.py @@ -17,6 +17,8 @@ # coding: utf-8 """MXNet backend wrapper for onnx test infrastructure""" +import os +import sys from mxnet.contrib.onnx.onnx2mx.import_onnx import GraphProto try: from onnx import helper, TensorProto @@ -24,7 +26,9 @@ except ImportError: raise ImportError("Onnx and protobuf need to be installed. 
Instructions to" + " install - https://github.com/onnx/onnx#installation") -from mxnet_backend_rep import MXNetBackendRep +CURR_PATH = os.path.dirname(os.path.abspath(os.path.expanduser(__file__))) +sys.path.insert(0, os.path.join(CURR_PATH, '../')) +from backend_rep import MXNetBackendRep # MXNetBackend class will take an ONNX model with inputs, perform a computation, # and then return the output. diff --git a/tests/python-pytest/onnx/import/mxnet_backend_rep.py b/tests/python-pytest/onnx/import/mxnet_backend_rep.py deleted file mode 100644 index 938f25d38bf3..000000000000 --- a/tests/python-pytest/onnx/import/mxnet_backend_rep.py +++ /dev/null @@ -1,98 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -# coding: utf-8 -"""MXNet backend rep for onnx test infrastructure""" -try: - from onnx.backend.base import BackendRep -except ImportError: - raise ImportError("Onnx and protobuf need to be installed. Instructions to" - + " install - https://github.com/onnx/onnx#installation") -import mxnet as mx - -# Using these functions for onnx test infrastructure. -# Implemented by following onnx docs guide: -# https://github.com/onnx/onnx/blob/master/docs/ImplementingAnOnnxBackend.md -# MXNetBackendRep object will be returned by MXNetBackend's prepare method which is used to -# execute a model repeatedly. -# Inputs will be passed to the run method of MXNetBackendRep class, it will perform computation and -# retrieve the corresponding results for comparison to the onnx backend. -# https://github.com/onnx/onnx/blob/master/onnx/backend/test/runner/__init__.py. 
- -class MXNetBackendRep(BackendRep): - """Running model inference on mxnet engine and return the result - to onnx test infrastructure for comparison.""" - def __init__(self, symbol, arg_params, aux_params, device): - self.symbol = symbol - self.arg_params = arg_params - self.aux_params = aux_params - self.device = device - - def run(self, inputs, **kwargs): - """Run model inference and return the result - - Parameters - ---------- - inputs : numpy array - input to run a layer on - - Returns - ------- - params : numpy array - result obtained after running the inference on mxnet - """ - data_forward = [] - for val in inputs: - data_forward.append(mx.nd.array(val)) - # create module, passing cpu context - if self.device == 'CPU': - ctx = mx.cpu() - else: - raise NotImplementedError("ONNX tests are run only for CPU context.") - - # To fetch the data names of the input to the model we list the inputs of the symbol graph - # and exclude the argument and auxiliary parameters from the list - data_names = [graph_input for graph_input in self.symbol.list_inputs() - if graph_input not in self.arg_params and graph_input not in self.aux_params] - - data_shapes = [] - for idx, input_name in enumerate(data_names): - data_shapes.append((input_name, inputs[idx].shape)) - - # module bind method requires all data to have same batch size, - # using module if all data have same batch size - if len(set([data_shape[1][0] for data_shape in data_shapes])) == 1: - mod = mx.mod.Module(symbol=self.symbol, data_names=data_names, context=ctx, - label_names=None) - mod.bind(for_training=False, data_shapes=data_shapes, - label_shapes=None) - mod.set_params(arg_params=self.arg_params, aux_params=self.aux_params) - - # run inference - mod.forward(mx.io.DataBatch(data_forward)) - result = mod.get_outputs()[0].asnumpy() - # split operator inference returns 1 less dimension - if self.symbol.name.startswith('split'): - return [i.asnumpy() for i in mod.get_outputs()] - return [result] - # using symbol bind method if data have different batch size - else: - exec1 = self.symbol.bind(ctx, args=dict(zip(data_names, data_forward))) - exec1.forward(is_train=False) - result = exec1.outputs[0].asnumpy() - return [result] - From 9e74dfa82ada90bd924b5985f12834091d5d4a4b Mon Sep 17 00:00:00 2001 From: Vishaal Kapoor <40836875+vishaalkapoor@users.noreply.github.com> Date: Fri, 30 Nov 2018 23:35:42 -0800 Subject: [PATCH 16/28] Clarify dependency on OpenCV in CNN Visualization tutorial. (#13495) --- docs/tutorials/vision/cnn_visualization.md | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/docs/tutorials/vision/cnn_visualization.md b/docs/tutorials/vision/cnn_visualization.md index 63d2b13271ba..5ded6f1587e0 100644 --- a/docs/tutorials/vision/cnn_visualization.md +++ b/docs/tutorials/vision/cnn_visualization.md @@ -1,16 +1,21 @@ # Visualizing Decisions of Convolutional Neural Networks -Convolutional Neural Networks have made a lot of progress in Computer Vision. Their accuracy is as good as humans in some tasks. However it remains hard to explain the predictions of convolutional neural networks, as they lack the interpretability offered by other models, for example decision trees. +Convolutional Neural Networks have made a lot of progress in Computer Vision. Their accuracy is as good as humans in some tasks. However, it remains difficult to explain the predictions of convolutional neural networks, as they lack the interpretability offered by other models such as decision trees. 
-It is often helpful to be able to explain why a model made the prediction it made. For example when a model misclassifies an image, it is hard to say why without visualizing the network's decision.
+It is often helpful to be able to explain why a model made the prediction it made. For example, when a model misclassifies an image, without visualizing the network's decision, it is hard to say why the misclassification was made.

Explaining the misclassification of volcano as spider

-Visualizations also help build confidence about the predictions of a model. For example, even if a model correctly predicts birds as birds, we would want to confirm that the model bases its decision on the features of bird and not on the features of some other object that might occur together with birds in the dataset (like leaves).
+Visualizations can also build confidence about the predictions of a model. For example, even if a model correctly predicts birds as birds, we would want to confirm that the model bases its decision on the features of bird and not on the features of some other object that might occur together with birds in the dataset (like leaves).

-In this tutorial, we show how to visualize the predictions made by convolutional neural networks using [Gradient-weighted Class Activation Mapping](https://arxiv.org/abs/1610.02391). Unlike many other visualization methods, Grad-CAM can be used on a wide variety of CNN model families - CNNs with fully connected layers, CNNs used for structural outputs (e.g. captioning), CNNs used in tasks with multi-model input (e.g. VQA) or reinforcement learning without architectural changes or re-training.
+In this tutorial we show how to visualize the predictions made by convolutional neural networks using [Gradient-weighted Class Activation Mapping](https://arxiv.org/abs/1610.02391). Unlike many other visualization methods, Grad-CAM can be used on a wide variety of CNN model families - CNNs with fully connected layers, CNNs used for structural outputs (e.g. captioning), CNNs used in tasks with multi-modal input (e.g. VQA) or reinforcement learning without architectural changes or re-training.

-In the rest of this notebook, we will explain how to visualize predictions made by [VGG-16](https://arxiv.org/abs/1409.1556). We begin by importing the required dependencies. `gradcam` module contains the implementation of visualization techniques used in this notebook.
+In the rest of this notebook, we will explain how to visualize predictions made by [VGG-16](https://arxiv.org/abs/1409.1556). We begin by importing the required dependencies.
+
+## Prerequisites
+* OpenCV is required by `gradcam` (below) and can be installed with pip using `pip install opencv-python`.
+
+* The `gradcam` module contains the implementation of visualization techniques used in this notebook. `gradcam` can be installed to a temporary directory by executing the following code block.
```python from __future__ import print_function From 049107c62bffc08c78fbbc31ac4f93a233a3ac07 Mon Sep 17 00:00:00 2001 From: Aaron Markham Date: Fri, 30 Nov 2018 23:38:20 -0800 Subject: [PATCH 17/28] clarify ops faq regarding docs strings (#13492) --- docs/faq/add_op_in_backend.md | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/docs/faq/add_op_in_backend.md b/docs/faq/add_op_in_backend.md index ed906da27377..c44a0aa05235 100644 --- a/docs/faq/add_op_in_backend.md +++ b/docs/faq/add_op_in_backend.md @@ -135,7 +135,7 @@ The last line of the above code snippet is a tuple of three lists returned by `d.infer_shape()`. The first list contains all the argument shapes of `a`, `b`, and `c`. The second contains the output shape of `d`. The third one represents the shapes of auxiliary states, which is not used -in this case, and thus is empty. +in this case, and thus is empty. In this example, we only specified values for variable `a`'s first dimension and `c`'s second dimension. The `0` in shape `(2, 0)` indicates that the size of the second dimension is unknown, same meaning for shape `(0, 3)`. @@ -437,10 +437,13 @@ NNVM_REGISTER_OP(quadratic) where :math:`x` is an input tensor and all operations in the function are element-wise. -Example:: - x = [[1, 2], [3, 4]] - y = quadratic(data=x, a=1, b=2, c=3) - y = [[6, 11], [18, 27]] +Example: + + .. code-block:: python + :emphasize-lines: 1,3 + x = [[1, 2], [3, 4]] + y = quadratic(data=x, a=1, b=2, c=3) + y = [[6, 11], [18, 27]] )code" ADD_FILELINE) // 4 .set_attr_parser(ParamParser) // 5 @@ -474,8 +477,11 @@ NNVM_REGISTER_OP(_backward_quadratic) of `Op` type and save it in the operator manager and return a reference of the just created operator object. - Lines 3-4: Add description as an operator attribute -including examples of the operator. The documentation engine would extract +including examples of the operator. The documentation engine will extract this description and display it on the documentation web page. +`emphasize-lines` is optional. +For more examples and troubleshooting with doc strings, refer to the [MXNet +developer wiki's Documentation Guide](https://cwiki.apache.org/confluence/display/MXNET/Documentation+Guide). - Line 5: Set parameter struct parser for the operator. It is used for parsing the parameters `a`, `b`, and `c` input from frontend. - Line 6: Set the number of inputs for the operator. @@ -630,7 +636,7 @@ python tools/flakiness_checker.py test_operator.test_quadratic_function Please note that for `check_symbolic_forward` and `check_symbolic_backward` we pass both the operator symbols and expected results for comparison, for -`check_numeric_gradient` we only pass the operator symbol, as the +`check_numeric_gradient` we only pass the operator symbol, as the `check_numeric_gradient` computes the expected value using finite difference method. Which is why it is highly recommended to add `check_numeric_gradient` test for every operator with backward function implemented as it eliminates From 80e2a1da91a5b72313d3761d7c43ebddd84f5931 Mon Sep 17 00:00:00 2001 From: Da Zheng Date: Fri, 30 Nov 2018 23:46:50 -0800 Subject: [PATCH 18/28] Add graph_compact operator. (#13436) * add graph_compact. * fix. * add doc. * add tests for graph_compact. * address comments. * update docs. 
* trigger CI
---
 docs/api/python/ndarray/contrib.md      |   5 +
 docs/api/python/symbol/contrib.md       |  11 ++
 src/operator/contrib/dgl_graph.cc       | 222 +++++++++++++++++++++++-
 tests/python/unittest/test_dgl_graph.py |  40 +++++
 4 files changed, 276 insertions(+), 2 deletions(-)

diff --git a/docs/api/python/ndarray/contrib.md b/docs/api/python/ndarray/contrib.md
index 709ddae007c5..d7c9021b5957 100644
--- a/docs/api/python/ndarray/contrib.md
+++ b/docs/api/python/ndarray/contrib.md
@@ -61,6 +61,11 @@ In the rest of this document, we list routines provided by the `ndarray.contrib`
     index_copy
     getnnz
     edge_id
+    dgl_csr_neighbor_uniform_sample
+    dgl_csr_neighbor_non_uniform_sample
+    dgl_subgraph
+    dgl_adjacency
+    dgl_graph_compact
 ```

 ## API Reference

diff --git a/docs/api/python/symbol/contrib.md b/docs/api/python/symbol/contrib.md
index c0a4da54cbde..35cd11c89a70 100644
--- a/docs/api/python/symbol/contrib.md
+++ b/docs/api/python/symbol/contrib.md
@@ -55,6 +55,17 @@ In the rest of this document, we list routines provided by the `symbol.contrib`
     foreach
     while_loop
     cond
+    isinf
+    isfinite
+    isnan
+    index_copy
+    getnnz
+    edge_id
+    dgl_csr_neighbor_uniform_sample
+    dgl_csr_neighbor_non_uniform_sample
+    dgl_subgraph
+    dgl_adjacency
+    dgl_graph_compact
 ```

 ## API Reference

diff --git a/src/operator/contrib/dgl_graph.cc b/src/operator/contrib/dgl_graph.cc
index 74ad3d435648..ed7caacfdbae 100644
--- a/src/operator/contrib/dgl_graph.cc
+++ b/src/operator/contrib/dgl_graph.cc
@@ -768,7 +768,10 @@ static void CSRNeighborUniformSampleComputeExCPU(const nnvm::NodeAttrs& attrs,
 NNVM_REGISTER_OP(_contrib_dgl_csr_neighbor_uniform_sample)
 .describe(R"code(This operator samples sub-graph from a csr graph via an uniform probability.
-Example::
+
+Example:
+
+   .. code:: python

   shape = (5, 5)
   data_np = np.array([1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20], dtype=np.int64)
@@ -850,7 +853,10 @@ static void CSRNeighborNonUniformSampleComputeExCPU(const nnvm::NodeAttrs& attrs
 NNVM_REGISTER_OP(_contrib_dgl_csr_neighbor_non_uniform_sample)
 .describe(R"code(This operator samples sub-graph from a csr graph via an uniform probability.
-Example::
+
+Example:
+
+   .. code:: python

   shape = (5, 5)
   prob = mx.nd.array([0.9, 0.8, 0.2, 0.4, 0.1], dtype=np.float32)
@@ -1379,6 +1385,8 @@ the data value of float32.

 Example:

+   .. code:: python
+
   x = [[ 1, 0, 0 ],
        [ 0, 2, 0 ],
        [ 0, 0, 3 ]]
@@ -1400,5 +1408,215 @@ the data value of float32.
 .set_attr<FComputeEx>("FComputeEx<cpu>", DGLAdjacencyForwardEx)
 .add_argument("data", "NDArray-or-Symbol", "Input ndarray");

+///////////////////////// Compact subgraphs ///////////////////////////
+
+struct SubgraphCompactParam : public dmlc::Parameter<SubgraphCompactParam> {
+  int num_args;
+  bool return_mapping;
+  nnvm::Tuple<dgl_id_t> graph_sizes;
+  DMLC_DECLARE_PARAMETER(SubgraphCompactParam) {
+    DMLC_DECLARE_FIELD(num_args).set_lower_bound(2)
+    .describe("Number of input arguments.");
+    DMLC_DECLARE_FIELD(return_mapping)
+    .describe("Return mapping of vid and eid between the subgraph and the parent graph.");
+    DMLC_DECLARE_FIELD(graph_sizes)
+    .describe("the number of vertices in each graph.");
+  }
+};  // struct SubgraphCompactParam
+
+DMLC_REGISTER_PARAMETER(SubgraphCompactParam);
+
+static inline size_t get_num_graphs(const SubgraphCompactParam &params) {
+  // Each CSR needs a 1D array to store the original vertex Id for each row.
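+  // The inputs therefore come in pairs: num_g CSR subgraphs followed by
+  // num_g vertex Id arrays (see FListInputNames below), so halving
+  // num_args yields the number of graphs.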
+  return params.num_args / 2;
+}
+
+static void CompactSubgraph(const NDArray &csr, const NDArray &vids,
+                            const NDArray &out_csr, size_t graph_size) {
+  TBlob in_idx_data = csr.aux_data(csr::kIdx);
+  TBlob in_ptr_data = csr.aux_data(csr::kIndPtr);
+  const dgl_id_t *indices_in = in_idx_data.dptr<dgl_id_t>();
+  const dgl_id_t *indptr_in = in_ptr_data.dptr<dgl_id_t>();
+  const dgl_id_t *row_ids = vids.data().dptr<dgl_id_t>();
+  size_t num_elems = csr.aux_data(csr::kIdx).shape_.Size();
+  // The last element in vids is the actual number of vertices in the subgraph.
+  CHECK_EQ(vids.shape()[0], in_ptr_data.shape_[0]);
+  CHECK_EQ(static_cast<size_t>(row_ids[vids.shape()[0] - 1]), graph_size);
+
+  // Prepare the Id map from the original graph to the subgraph.
+  std::unordered_map<dgl_id_t, dgl_id_t> id_map;
+  id_map.reserve(graph_size);
+  for (size_t i = 0; i < graph_size; i++) {
+    id_map.insert(std::pair<dgl_id_t, dgl_id_t>(row_ids[i], i));
+    CHECK_NE(row_ids[i], -1);
+  }
+
+  TShape nz_shape(1);
+  nz_shape[0] = num_elems;
+  TShape indptr_shape(1);
+  CHECK_EQ(out_csr.shape()[0], graph_size);
+  indptr_shape[0] = graph_size + 1;
+  CHECK_GE(in_ptr_data.shape_[0], indptr_shape[0]);
+
+  out_csr.CheckAndAllocData(nz_shape);
+  out_csr.CheckAndAllocAuxData(csr::kIdx, nz_shape);
+  out_csr.CheckAndAllocAuxData(csr::kIndPtr, indptr_shape);
+
+  dgl_id_t *indices_out = out_csr.aux_data(csr::kIdx).dptr<dgl_id_t>();
+  dgl_id_t *indptr_out = out_csr.aux_data(csr::kIndPtr).dptr<dgl_id_t>();
+  dgl_id_t *sub_eids = out_csr.data().dptr<dgl_id_t>();
+  std::copy(indptr_in, indptr_in + indptr_shape[0], indptr_out);
+  for (int64_t i = 0; i < nz_shape[0]; i++) {
+    dgl_id_t old_id = indices_in[i];
+    auto it = id_map.find(old_id);
+    CHECK(it != id_map.end());
+    indices_out[i] = it->second;
+    sub_eids[i] = i;
+  }
+}
+
+static void SubgraphCompactComputeExCPU(const nnvm::NodeAttrs& attrs,
+                                        const OpContext& ctx,
+                                        const std::vector<NDArray>& inputs,
+                                        const std::vector<OpReqType>& req,
+                                        const std::vector<NDArray>& outputs) {
+  const SubgraphCompactParam& params = nnvm::get<SubgraphCompactParam>(attrs.parsed);
+  int num_g = get_num_graphs(params);
+#pragma omp parallel for
+  for (int i = 0; i < num_g; i++) {
+    CompactSubgraph(inputs[i], inputs[i + num_g], outputs[i], params.graph_sizes[i]);
+  }
+}
+
+static bool SubgraphCompactStorageType(const nnvm::NodeAttrs& attrs,
+                                       const int dev_mask,
+                                       DispatchMode* dispatch_mode,
+                                       std::vector<int> *in_attrs,
+                                       std::vector<int> *out_attrs) {
+  const SubgraphCompactParam& params = nnvm::get<SubgraphCompactParam>(attrs.parsed);
+  size_t num_g = get_num_graphs(params);
+  CHECK_EQ(num_g * 2, in_attrs->size());
+  // These are the input subgraphs.
+  for (size_t i = 0; i < num_g; i++)
+    CHECK_EQ(in_attrs->at(i), kCSRStorage);
+  // These are the vertex Ids in the original graph.
+  for (size_t i = 0; i < num_g; i++)
+    CHECK_EQ(in_attrs->at(i + num_g), kDefaultStorage);
+
+  bool success = true;
+  *dispatch_mode = DispatchMode::kFComputeEx;
+  for (size_t i = 0; i < out_attrs->size(); i++) {
+    if (!type_assign(&(*out_attrs)[i], mxnet::kCSRStorage))
+      success = false;
+  }
+  return success;
+}
+
+static bool SubgraphCompactShape(const nnvm::NodeAttrs& attrs,
+                                 std::vector<TShape> *in_attrs,
+                                 std::vector<TShape> *out_attrs) {
+  const SubgraphCompactParam& params = nnvm::get<SubgraphCompactParam>(attrs.parsed);
+  size_t num_g = get_num_graphs(params);
+  CHECK_EQ(num_g * 2, in_attrs->size());
+  // These are the input subgraphs.
+  for (size_t i = 0; i < num_g; i++) {
+    CHECK_EQ(in_attrs->at(i).ndim(), 2U);
+    CHECK_GE(in_attrs->at(i)[0], params.graph_sizes[i]);
+    CHECK_GE(in_attrs->at(i)[1], params.graph_sizes[i]);
+  }
+  // These are the vertex Ids in the original graph.
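+  // Each Id array is 1-D with at least graph_sizes[i] entries; its last
+  // element records the actual subgraph size (checked in CompactSubgraph).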
+  for (size_t i = 0; i < num_g; i++) {
+    CHECK_EQ(in_attrs->at(i + num_g).ndim(), 1U);
+    CHECK_GE(in_attrs->at(i + num_g)[0], params.graph_sizes[i]);
+  }
+
+  for (size_t i = 0; i < num_g; i++) {
+    TShape gshape(2);
+    gshape[0] = params.graph_sizes[i];
+    gshape[1] = params.graph_sizes[i];
+    out_attrs->at(i) = gshape;
+    if (params.return_mapping)
+      out_attrs->at(i + num_g) = gshape;
+  }
+  return true;
+}
+
+static bool SubgraphCompactType(const nnvm::NodeAttrs& attrs,
+                                std::vector<int> *in_attrs,
+                                std::vector<int> *out_attrs) {
+  for (size_t i = 0; i < in_attrs->size(); i++) {
+    CHECK_EQ(in_attrs->at(i), mshadow::kInt64);
+  }
+  for (size_t i = 0; i < out_attrs->size(); i++) {
+    out_attrs->at(i) = mshadow::kInt64;
+  }
+  return true;
+}
+
+NNVM_REGISTER_OP(_contrib_dgl_graph_compact)
+.describe(R"code(This operator compacts a CSR matrix generated by
+dgl_csr_neighbor_uniform_sample and dgl_csr_neighbor_non_uniform_sample.
+The CSR matrices generated by these two operators may have many empty
+rows at the end and many empty columns. This operator removes these
+empty rows and empty columns.
+
+Example:
+
+   .. code:: python
+
+  shape = (5, 5)
+  data_np = np.array([1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20], dtype=np.int64)
+  indices_np = np.array([1,2,3,4,0,2,3,4,0,1,3,4,0,1,2,4,0,1,2,3], dtype=np.int64)
+  indptr_np = np.array([0,4,8,12,16,20], dtype=np.int64)
+  a = mx.nd.sparse.csr_matrix((data_np, indices_np, indptr_np), shape=shape)
+  seed = mx.nd.array([0,1,2,3,4], dtype=np.int64)
+  out = mx.nd.contrib.dgl_csr_neighbor_uniform_sample(a, seed, num_args=2, num_hops=1,
+          num_neighbor=2, max_num_vertices=6)
+  subg_v = out[0]
+  subg = out[1]
+  compact = mx.nd.contrib.dgl_graph_compact(subg, subg_v,
+          graph_sizes=(subg_v[-1].asnumpy()[0]), return_mapping=False)
+
+  compact.asnumpy()
+  array([[0, 0, 0, 1, 0],
+         [2, 0, 3, 0, 0],
+         [0, 4, 0, 0, 5],
+         [0, 6, 0, 0, 7],
+         [8, 9, 0, 0, 0]])
+
+)code" ADD_FILELINE)
+.set_attr_parser(ParamParser<SubgraphCompactParam>)
+.set_num_inputs([](const NodeAttrs& attrs) {
+  const SubgraphCompactParam& params = nnvm::get<SubgraphCompactParam>(attrs.parsed);
+  return params.num_args;
+})
+.set_num_outputs([](const NodeAttrs& attrs) {
+  const SubgraphCompactParam& params = nnvm::get<SubgraphCompactParam>(attrs.parsed);
+  int num_varray = get_num_graphs(params);
+  if (params.return_mapping)
+    return num_varray * 2;
+  else
+    return num_varray;
+})
+.set_attr<nnvm::FListInputNames>("FListInputNames",
+  [](const NodeAttrs& attrs) {
+  const SubgraphCompactParam& params = nnvm::get<SubgraphCompactParam>(attrs.parsed);
+  std::vector<std::string> names;
+  names.reserve(params.num_args);
+  size_t num_graphs = get_num_graphs(params);
+  for (size_t i = 0; i < num_graphs; i++)
+    names.push_back("graph" + std::to_string(i));
+  for (size_t i = 0; i < num_graphs; ++i)
+    names.push_back("varray" + std::to_string(i));
+  return names;
+})
+.set_attr<FInferStorageType>("FInferStorageType", SubgraphCompactStorageType)
+.set_attr<nnvm::FInferShape>("FInferShape", SubgraphCompactShape)
+.set_attr<nnvm::FInferType>("FInferType", SubgraphCompactType)
+.set_attr<FComputeEx>("FComputeEx<cpu>", SubgraphCompactComputeExCPU)
+.set_attr<std::string>("key_var_num_args", "num_args")
+.add_argument("graph_data", "NDArray-or-Symbol[]", "Input graphs and input vertex Ids.")
+.add_arguments(SubgraphCompactParam::__FIELDS__());
+
 }  // namespace op
 }  // namespace mxnet

diff --git a/tests/python/unittest/test_dgl_graph.py b/tests/python/unittest/test_dgl_graph.py
index f996d7f38de8..069fef6e32f0 100644
--- a/tests/python/unittest/test_dgl_graph.py
+++ b/tests/python/unittest/test_dgl_graph.py
@@ -63,6 +63,18 @@ def check_non_uniform(out, num_hops, max_num_vertices):
     for data in layer:
         assert(data <= num_hops)
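+# Verifies the output of dgl_graph_compact: the compacted CSR must be square
+# with num_nodes rows, share its leading indptr entries with the original
+# subgraph, and every remapped column index must map back to the original
+# vertex Id through id_arr.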
+def check_compact(csr, id_arr, num_nodes): + compact = mx.nd.contrib.dgl_graph_compact(csr, id_arr, graph_sizes=num_nodes, return_mapping=False) + assert compact.shape[0] == num_nodes + assert compact.shape[1] == num_nodes + assert mx.nd.sum(compact.indptr == csr.indptr[0:(num_nodes + 1)]).asnumpy() == num_nodes + 1 + sub_indices = compact.indices.asnumpy() + indices = csr.indices.asnumpy() + id_arr = id_arr.asnumpy() + for i in range(len(sub_indices)): + sub_id = sub_indices[i] + assert id_arr[sub_id] == indices[i] + def test_uniform_sample(): shape = (5, 5) data_np = np.array([1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20], dtype=np.int64) @@ -74,36 +86,64 @@ def test_uniform_sample(): out = mx.nd.contrib.dgl_csr_neighbor_uniform_sample(a, seed, num_args=2, num_hops=1, num_neighbor=2, max_num_vertices=5) assert (len(out) == 3) check_uniform(out, num_hops=1, max_num_vertices=5) + num_nodes = out[0][-1].asnumpy() + assert num_nodes > 0 + assert num_nodes < len(out[0]) + check_compact(out[1], out[0], num_nodes) seed = mx.nd.array([0], dtype=np.int64) out = mx.nd.contrib.dgl_csr_neighbor_uniform_sample(a, seed, num_args=2, num_hops=1, num_neighbor=1, max_num_vertices=4) assert (len(out) == 3) check_uniform(out, num_hops=1, max_num_vertices=4) + num_nodes = out[0][-1].asnumpy() + assert num_nodes > 0 + assert num_nodes < len(out[0]) + check_compact(out[1], out[0], num_nodes) seed = mx.nd.array([0], dtype=np.int64) out = mx.nd.contrib.dgl_csr_neighbor_uniform_sample(a, seed, num_args=2, num_hops=2, num_neighbor=1, max_num_vertices=4) assert (len(out) == 3) check_uniform(out, num_hops=2, max_num_vertices=4) + num_nodes = out[0][-1].asnumpy() + assert num_nodes > 0 + assert num_nodes < len(out[0]) + check_compact(out[1], out[0], num_nodes) seed = mx.nd.array([0,2,4], dtype=np.int64) out = mx.nd.contrib.dgl_csr_neighbor_uniform_sample(a, seed, num_args=2, num_hops=1, num_neighbor=2, max_num_vertices=5) assert (len(out) == 3) check_uniform(out, num_hops=1, max_num_vertices=5) + num_nodes = out[0][-1].asnumpy() + assert num_nodes > 0 + assert num_nodes < len(out[0]) + check_compact(out[1], out[0], num_nodes) seed = mx.nd.array([0,4], dtype=np.int64) out = mx.nd.contrib.dgl_csr_neighbor_uniform_sample(a, seed, num_args=2, num_hops=1, num_neighbor=2, max_num_vertices=5) assert (len(out) == 3) check_uniform(out, num_hops=1, max_num_vertices=5) + num_nodes = out[0][-1].asnumpy() + assert num_nodes > 0 + assert num_nodes < len(out[0]) + check_compact(out[1], out[0], num_nodes) seed = mx.nd.array([0,4], dtype=np.int64) out = mx.nd.contrib.dgl_csr_neighbor_uniform_sample(a, seed, num_args=2, num_hops=2, num_neighbor=2, max_num_vertices=5) assert (len(out) == 3) check_uniform(out, num_hops=2, max_num_vertices=5) + num_nodes = out[0][-1].asnumpy() + assert num_nodes > 0 + assert num_nodes < len(out[0]) + check_compact(out[1], out[0], num_nodes) seed = mx.nd.array([0,4], dtype=np.int64) out = mx.nd.contrib.dgl_csr_neighbor_uniform_sample(a, seed, num_args=2, num_hops=1, num_neighbor=2, max_num_vertices=5) assert (len(out) == 3) check_uniform(out, num_hops=1, max_num_vertices=5) + num_nodes = out[0][-1].asnumpy() + assert num_nodes > 0 + assert num_nodes < len(out[0]) + check_compact(out[1], out[0], num_nodes) def test_non_uniform_sample(): shape = (5, 5) From 1fd7558f090bcd1e5995c529e3f7349b2d0f18e1 Mon Sep 17 00:00:00 2001 From: Marco de Abreu Date: Sat, 1 Dec 2018 21:57:33 +0100 Subject: [PATCH 19/28] Deprecate Jenkinsfile (#13474) --- Jenkinsfile | 1010 
--------------------------------------------------- 1 file changed, 1010 deletions(-) delete mode 100644 Jenkinsfile diff --git a/Jenkinsfile b/Jenkinsfile deleted file mode 100644 index 015ca81bad76..000000000000 --- a/Jenkinsfile +++ /dev/null @@ -1,1010 +0,0 @@ -// -*- mode: groovy -*- - -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. -// -// Jenkins pipeline -// See documents at https://jenkins.io/doc/book/pipeline/jenkinsfile/ - - -/*** - * _____ _ - * | __ \| | - * | |__) | | ___ __ _ ___ ___ - * | ___/| |/ _ \/ _` / __|/ _ \ - * | | | || | __/ (_| \__ \ __/ - * |_|_| ||_|\___|\__,_|___/\___| - * / _` |/ _ \ - * | (_| | (_) |_ - * \__,_|\___/| | - * _ __ ___ | |_ - * | '_ \ / _ \| __| _ _ __ - * | | | | (_) | |_ | (_)/ _| - * |_|_|_|\___/_\__| __| |_| |_ _ _ - * | '_ ` _ \ / _ \ / _` | | _| | | | - * | | | | | | (_) | (_| | | | | |_| | - * |_| |_| |_|\___/ \__,_|_|_| \__, | - * __/ | - * |___/ - * - * This file is about to be deprecated! See https://github.com/apache/incubator-mxnet/pull/13344 - * for more details - */ - - -// mxnet libraries -mx_lib = 'lib/libmxnet.so, lib/libmxnet.a, 3rdparty/dmlc-core/libdmlc.a, 3rdparty/tvm/nnvm/lib/libnnvm.a' - -// Python wheels -mx_pip = 'build/*.whl' - -// for scala build, need to pass extra libs when run with dist_kvstore -mx_dist_lib = 'lib/libmxnet.so, lib/libmxnet.a, 3rdparty/dmlc-core/libdmlc.a, 3rdparty/tvm/nnvm/lib/libnnvm.a, 3rdparty/ps-lite/build/libps.a, deps/lib/libprotobuf-lite.a, deps/lib/libzmq.a' -// mxnet cmake libraries, in cmake builds we do not produce a libnvvm static library by default. -mx_cmake_lib = 'build/libmxnet.so, build/libmxnet.a, build/3rdparty/dmlc-core/libdmlc.a, build/tests/mxnet_unit_tests, build/3rdparty/openmp/runtime/src/libomp.so' -// mxnet cmake libraries, in cmake builds we do not produce a libnvvm static library by default. 
-mx_cmake_lib_debug = 'build/libmxnet.so, build/libmxnet.a, build/3rdparty/dmlc-core/libdmlc.a, build/tests/mxnet_unit_tests' -mx_cmake_mkldnn_lib = 'build/libmxnet.so, build/libmxnet.a, build/3rdparty/dmlc-core/libdmlc.a, build/tests/mxnet_unit_tests, build/3rdparty/openmp/runtime/src/libomp.so, build/3rdparty/mkldnn/src/libmkldnn.so.0' -mx_mkldnn_lib = 'lib/libmxnet.so, lib/libmxnet.a, lib/libiomp5.so, lib/libmkldnn.so.0, lib/libmklml_intel.so, 3rdparty/dmlc-core/libdmlc.a, 3rdparty/tvm/nnvm/lib/libnnvm.a' -mx_tensorrt_lib = 'lib/libmxnet.so, lib/libnvonnxparser_runtime.so.0, lib/libnvonnxparser.so.0, lib/libonnx_proto.so, lib/libonnx.so' -mx_lib_cpp_examples = 'lib/libmxnet.so, lib/libmxnet.a, 3rdparty/dmlc-core/libdmlc.a, 3rdparty/tvm/nnvm/lib/libnnvm.a, 3rdparty/ps-lite/build/libps.a, deps/lib/libprotobuf-lite.a, deps/lib/libzmq.a, build/cpp-package/example/lenet, build/cpp-package/example/alexnet, build/cpp-package/example/googlenet, build/cpp-package/example/lenet_with_mxdataiter, build/cpp-package/example/resnet, build/cpp-package/example/mlp, build/cpp-package/example/mlp_cpu, build/cpp-package/example/mlp_gpu, build/cpp-package/example/test_score, build/cpp-package/example/test_optimizer' -mx_lib_cpp_examples_cpu = 'build/libmxnet.so, build/cpp-package/example/mlp_cpu' - -// timeout in minutes -max_time = 120 - - -// Python unittest for CPU -// Python 2 -def python2_ut(docker_container_name) { - timeout(time: max_time, unit: 'MINUTES') { - utils.docker_run(docker_container_name, 'unittest_ubuntu_python2_cpu', false) - } -} - -// Python 3 -def python3_ut(docker_container_name) { - timeout(time: max_time, unit: 'MINUTES') { - utils.docker_run(docker_container_name, 'unittest_ubuntu_python3_cpu', false) - } -} - -// Python 3 -def python3_ut_asan(docker_container_name) { - timeout(time: max_time, unit: 'MINUTES') { - utils.docker_run(docker_container_name, 'unittest_ubuntu_python3_cpu_asan', false) - } -} - -def python3_ut_mkldnn(docker_container_name) { - timeout(time: max_time, unit: 'MINUTES') { - utils.docker_run(docker_container_name, 'unittest_ubuntu_python3_cpu_mkldnn', false) - } -} - -// GPU test has two parts. 
1) run unittest on GPU, 2) compare the results on -// both CPU and GPU -// Python 2 -def python2_gpu_ut(docker_container_name) { - timeout(time: max_time, unit: 'MINUTES') { - utils.docker_run(docker_container_name, 'unittest_ubuntu_python2_gpu', true) - } -} - -// Python 3 -def python3_gpu_ut(docker_container_name) { - timeout(time: max_time, unit: 'MINUTES') { - utils.docker_run(docker_container_name, 'unittest_ubuntu_python3_gpu', true) - } -} - -// Python 3 NOCUDNN -def python3_gpu_ut_nocudnn(docker_container_name) { - timeout(time: max_time, unit: 'MINUTES') { - utils.docker_run(docker_container_name, 'unittest_ubuntu_python3_gpu_nocudnn', true) - } -} - -def deploy_docs() { - parallel 'Docs': { - node(NODE_LINUX_CPU) { - ws('workspace/docs') { - timeout(time: max_time, unit: 'MINUTES') { - utils.init_git() - utils.docker_run('ubuntu_cpu', 'deploy_docs', false) - sh "ci/other/ci_deploy_doc.sh ${env.BRANCH_NAME} ${env.BUILD_NUMBER}" - } - } - } - }, - 'Julia docs': { - node(NODE_LINUX_CPU) { - ws('workspace/julia-docs') { - timeout(time: max_time, unit: 'MINUTES') { - utils.unpack_and_init('cpu', mx_lib) - utils.docker_run('ubuntu_cpu', 'deploy_jl_docs', false) - } - } - } - } -} - -node('utility') { - // Loading the utilities requires a node context unfortunately - checkout scm - utils = load('ci/Jenkinsfile_utils.groovy') -} -utils.assign_node_labels(utility: 'utility', linux_cpu: 'mxnetlinux-cpu', linux_gpu: 'mxnetlinux-gpu', linux_gpu_p3: 'mxnetlinux-gpu-p3', windows_cpu: 'mxnetwindows-cpu', windows_gpu: 'mxnetwindows-gpu') - -utils.main_wrapper( -core_logic: { - stage('Sanity Check') { - parallel 'Lint': { - node(NODE_LINUX_CPU) { - ws('workspace/sanity-lint') { - utils.init_git() - utils.docker_run('ubuntu_cpu', 'sanity_check', false) - } - } - }, - 'RAT License': { - node(NODE_LINUX_CPU) { - ws('workspace/sanity-rat') { - utils.init_git() - utils.docker_run('ubuntu_rat', 'nightly_test_rat_check', false) - } - } - } - } - - stage('Build') { - parallel 'CPU: CentOS 7': { - node(NODE_LINUX_CPU) { - ws('workspace/build-centos7-cpu') { - timeout(time: max_time, unit: 'MINUTES') { - utils.init_git() - utils.docker_run('centos7_cpu', 'build_centos7_cpu', false) - utils.pack_lib('centos7_cpu', mx_dist_lib, true) - } - } - } - }, - 'CPU: CentOS 7 MKLDNN': { - node(NODE_LINUX_CPU) { - ws('workspace/build-centos7-mkldnn') { - timeout(time: max_time, unit: 'MINUTES') { - utils.init_git() - utils.docker_run('centos7_cpu', 'build_centos7_mkldnn', false) - utils.pack_lib('centos7_mkldnn', mx_lib, true) - } - } - } - }, - 'GPU: CentOS 7': { - node(NODE_LINUX_CPU) { - ws('workspace/build-centos7-gpu') { - timeout(time: max_time, unit: 'MINUTES') { - utils.init_git() - utils.docker_run('centos7_gpu', 'build_centos7_gpu', false) - utils.pack_lib('centos7_gpu', mx_lib, true) - } - } - } - }, - 'CPU: Openblas': { - node(NODE_LINUX_CPU) { - ws('workspace/build-cpu-openblas') { - timeout(time: max_time, unit: 'MINUTES') { - utils.init_git() - utils.docker_run('ubuntu_cpu', 'build_ubuntu_cpu_openblas', false) - utils.pack_lib('cpu', mx_dist_lib, true) - } - } - } - }, - 'CPU: ASAN': { - node(NODE_LINUX_CPU) { - ws('workspace/build-cpu-asan') { - timeout(time: max_time, unit: 'MINUTES') { - utils.init_git() - utils.docker_run('ubuntu_cpu', 'build_ubuntu_cpu_cmake_asan', false) - utils.pack_lib('cpu_asan', mx_lib_cpp_examples_cpu) - } - } - } - }, - 'CPU: Openblas, debug': { - node(NODE_LINUX_CPU) { - ws('workspace/build-cpu-openblas') { - timeout(time: max_time, unit: 'MINUTES') { - utils.init_git() - 
utils.docker_run('ubuntu_cpu', 'build_ubuntu_cpu_cmake_debug', false) - utils.pack_lib('cpu_debug', mx_cmake_lib_debug, true) - } - } - } - }, - 'CPU: Clang 3.9': { - node(NODE_LINUX_CPU) { - ws('workspace/build-cpu-clang39') { - timeout(time: max_time, unit: 'MINUTES') { - utils.init_git() - utils.docker_run('ubuntu_cpu', 'build_ubuntu_cpu_clang39', false) - } - } - } - }, - 'CPU: Clang 6': { - node(NODE_LINUX_CPU) { - ws('workspace/build-cpu-clang60') { - timeout(time: max_time, unit: 'MINUTES') { - utils.init_git() - utils.docker_run('ubuntu_cpu', 'build_ubuntu_cpu_clang60', false) - } - } - } - }, - 'CPU: Clang Tidy': { - node(NODE_LINUX_CPU) { - ws('workspace/build-cpu-clang60_tidy') { - timeout(time: max_time, unit: 'MINUTES') { - utils.init_git() - utils.docker_run('ubuntu_cpu', 'build_ubuntu_cpu_clang_tidy', false) - } - } - } - }, - 'CPU: Clang 3.9 MKLDNN': { - node(NODE_LINUX_CPU) { - ws('workspace/build-cpu-mkldnn-clang39') { - timeout(time: max_time, unit: 'MINUTES') { - utils.init_git() - utils.docker_run('ubuntu_cpu', 'build_ubuntu_cpu_clang39_mkldnn', false) - utils.pack_lib('mkldnn_cpu_clang3', mx_mkldnn_lib, true) - } - } - } - }, - 'CPU: Clang 6 MKLDNN': { - node(NODE_LINUX_CPU) { - ws('workspace/build-cpu-mkldnn-clang60') { - timeout(time: max_time, unit: 'MINUTES') { - utils.init_git() - utils.docker_run('ubuntu_cpu', 'build_ubuntu_cpu_clang60_mkldnn', false) - utils.pack_lib('mkldnn_cpu_clang6', mx_mkldnn_lib, true) - } - } - } - }, - 'CPU: MKLDNN': { - node(NODE_LINUX_CPU) { - ws('workspace/build-mkldnn-cpu') { - timeout(time: max_time, unit: 'MINUTES') { - utils.init_git() - utils.docker_run('ubuntu_cpu', 'build_ubuntu_cpu_mkldnn', false) - utils.pack_lib('mkldnn_cpu', mx_mkldnn_lib, true) - } - } - } - }, - 'GPU: MKLDNN': { - node(NODE_LINUX_CPU) { - ws('workspace/build-mkldnn-gpu') { - timeout(time: max_time, unit: 'MINUTES') { - utils.init_git() - utils.docker_run('ubuntu_build_cuda', 'build_ubuntu_gpu_mkldnn', false) - utils.pack_lib('mkldnn_gpu', mx_mkldnn_lib, true) - } - } - } - }, - 'GPU: MKLDNN_CUDNNOFF': { - node(NODE_LINUX_CPU) { - ws('workspace/build-mkldnn-gpu-nocudnn') { - timeout(time: max_time, unit: 'MINUTES') { - utils.init_git() - utils.docker_run('ubuntu_build_cuda', 'build_ubuntu_gpu_mkldnn_nocudnn', false) - utils.pack_lib('mkldnn_gpu_nocudnn', mx_mkldnn_lib, true) - } - } - } - }, - 'GPU: CUDA9.1+cuDNN7': { - node(NODE_LINUX_CPU) { - ws('workspace/build-gpu') { - timeout(time: max_time, unit: 'MINUTES') { - utils.init_git() - utils.docker_run('ubuntu_build_cuda', 'build_ubuntu_gpu_cuda91_cudnn7', false) - utils.pack_lib('gpu', mx_lib_cpp_examples, true) - } - } - } - }, - 'Amalgamation MIN': { - node(NODE_LINUX_CPU) { - ws('workspace/amalgamationmin') { - timeout(time: max_time, unit: 'MINUTES') { - utils.init_git() - utils.docker_run('ubuntu_cpu', 'build_ubuntu_amalgamation_min', false) - } - } - } - }, - 'Amalgamation': { - node(NODE_LINUX_CPU) { - ws('workspace/amalgamation') { - timeout(time: max_time, unit: 'MINUTES') { - utils.init_git() - utils.docker_run('ubuntu_cpu', 'build_ubuntu_amalgamation', false) - } - } - } - }, - - 'GPU: CMake MKLDNN': { - node(NODE_LINUX_CPU) { - ws('workspace/build-cmake-mkldnn-gpu') { - timeout(time: max_time, unit: 'MINUTES') { - utils.init_git() - utils.docker_run('ubuntu_gpu', 'build_ubuntu_gpu_cmake_mkldnn', false) - utils.pack_lib('cmake_mkldnn_gpu', mx_cmake_mkldnn_lib, true) - } - } - } - }, - 'GPU: CMake': { - node(NODE_LINUX_CPU) { - ws('workspace/build-cmake-gpu') { - timeout(time: max_time, unit: 
'MINUTES') { - utils.init_git() - utils.docker_run('ubuntu_gpu', 'build_ubuntu_gpu_cmake', false) - utils.pack_lib('cmake_gpu', mx_cmake_lib, true) - } - } - } - }, - 'TensorRT': { - node(NODE_LINUX_CPU) { - ws('workspace/build-tensorrt') { - timeout(time: max_time, unit: 'MINUTES') { - utils.init_git() - utils.docker_run('ubuntu_gpu_tensorrt', 'build_ubuntu_gpu_tensorrt', false) - utils.pack_lib('tensorrt', mx_tensorrt_lib, true) - } - } - } - }, - 'Build CPU windows':{ - node(NODE_WINDOWS_CPU) { - timeout(time: max_time, unit: 'MINUTES') { - ws('workspace/build-cpu') { - withEnv(['OpenBLAS_HOME=C:\\mxnet\\openblas', 'OpenCV_DIR=C:\\mxnet\\opencv_vc14', 'CUDA_PATH=C:\\CUDA\\v8.0']) { - utils.init_git_win() - powershell 'python ci/build_windows.py -f WIN_CPU' - stash includes: 'windows_package.7z', name: 'windows_package_cpu' - } - } - } - } - }, - - 'Build GPU windows':{ - node(NODE_WINDOWS_CPU) { - timeout(time: max_time, unit: 'MINUTES') { - ws('workspace/build-gpu') { - withEnv(['OpenBLAS_HOME=C:\\mxnet\\openblas', 'OpenCV_DIR=C:\\mxnet\\opencv_vc14', 'CUDA_PATH=C:\\CUDA\\v8.0']) { - utils.init_git_win() - powershell 'python ci/build_windows.py -f WIN_GPU' - stash includes: 'windows_package.7z', name: 'windows_package_gpu' - } - } - } - } - }, - 'Build GPU MKLDNN windows':{ - node(NODE_WINDOWS_CPU) { - timeout(time: max_time, unit: 'MINUTES') { - ws('workspace/build-gpu') { - withEnv(['OpenBLAS_HOME=C:\\mxnet\\openblas', 'OpenCV_DIR=C:\\mxnet\\opencv_vc14', 'CUDA_PATH=C:\\CUDA\\v8.0','BUILD_NAME=vc14_gpu_mkldnn']) { - utils.init_git_win() - powershell 'python ci/build_windows.py -f WIN_GPU_MKLDNN' - stash includes: 'windows_package.7z', name: 'windows_package_gpu_mkldnn' - } - } - } - } - }, - //'NVidia Jetson / ARMv8':{ - // node(NODE_LINUX_CPU) { - // ws('workspace/build-jetson-armv8') { - // timeout(time: max_time, unit: 'MINUTES') { - // utils.init_git() - // utils.docker_run('jetson', 'build_jetson', false) - // } - // } - // } - //}, - 'ARMv7':{ - node(NODE_LINUX_CPU) { - ws('workspace/build-ARMv7') { - timeout(time: max_time, unit: 'MINUTES') { - utils.init_git() - utils.docker_run('armv7', 'build_armv7', false) - utils.pack_lib('armv7', mx_pip) - } - } - } - }, - 'ARMv6':{ - node(NODE_LINUX_CPU) { - ws('workspace/build-ARMv6') { - timeout(time: max_time, unit: 'MINUTES') { - utils.init_git() - utils.docker_run('armv6', 'build_armv6', false) - } - } - } - }, - 'ARMv8':{ - node(NODE_LINUX_CPU) { - ws('workspace/build-ARMv8') { - timeout(time: max_time, unit: 'MINUTES') { - utils.init_git() - utils.docker_run('armv8', 'build_armv8', false) - } - } - } - }, - 'Android / ARMv8':{ - node(NODE_LINUX_CPU) { - ws('workspace/android64') { - timeout(time: max_time, unit: 'MINUTES') { - utils.init_git() - utils.docker_run('android_armv8', 'build_android_armv8', false) - } - } - } - }, - 'Android / ARMv7':{ - node(NODE_LINUX_CPU) { - ws('workspace/androidv7') { - timeout(time: max_time, unit: 'MINUTES') { - utils.init_git() - utils.docker_run('android_armv7', 'build_android_armv7', false) - } - } - } - } - - } // End of stage('Build') - - stage('Tests') { - parallel 'Python2: CPU': { - node(NODE_LINUX_CPU) { - ws('workspace/ut-python2-cpu') { - try { - utils.unpack_and_init('cpu', mx_lib, true) - python2_ut('ubuntu_cpu') - utils.publish_test_coverage() - } finally { - utils.collect_test_results_unix('nosetests_unittest.xml', 'nosetests_python2_cpu_unittest.xml') - utils.collect_test_results_unix('nosetests_train.xml', 'nosetests_python2_cpu_train.xml') - 
utils.collect_test_results_unix('nosetests_quantization.xml', 'nosetests_python2_cpu_quantization.xml') - } - } - } - }, - 'Python3: CPU': { - node(NODE_LINUX_CPU) { - ws('workspace/ut-python3-cpu') { - try { - utils.unpack_and_init('cpu', mx_lib, true) - python3_ut('ubuntu_cpu') - utils.publish_test_coverage() - } finally { - utils.collect_test_results_unix('nosetests_unittest.xml', 'nosetests_python3_cpu_unittest.xml') - utils.collect_test_results_unix('nosetests_quantization.xml', 'nosetests_python3_cpu_quantization.xml') - } - } - } - }, - 'CPU ASAN': { - node(NODE_LINUX_CPU) { - ws('workspace/ut-python3-cpu-asan') { - utils.unpack_and_init('cpu_asan', mx_lib_cpp_examples_cpu) - utils.docker_run('ubuntu_cpu', 'integrationtest_ubuntu_cpu_asan', false) - } - } - }, - 'Python3: CPU debug': { - node(NODE_LINUX_CPU) { - ws('workspace/ut-python3-cpu-debug') { - try { - utils.unpack_and_init('cpu_debug', mx_cmake_lib_debug, true) - python3_ut('ubuntu_cpu') - } finally { - utils.collect_test_results_unix('nosetests_unittest.xml', 'nosetests_python3_cpu_debug_unittest.xml') - utils.collect_test_results_unix('nosetests_quantization.xml', 'nosetests_python3_cpu_debug_quantization.xml') - } - } - } - }, - 'Python2: GPU': { - node(NODE_LINUX_GPU) { - ws('workspace/ut-python2-gpu') { - try { - utils.unpack_and_init('gpu', mx_lib, true) - python2_gpu_ut('ubuntu_gpu') - utils.publish_test_coverage() - } finally { - utils.collect_test_results_unix('nosetests_gpu.xml', 'nosetests_python2_gpu.xml') - } - } - } - }, - 'Python3: GPU': { - node(NODE_LINUX_GPU) { - ws('workspace/ut-python3-gpu') { - try { - utils.unpack_and_init('gpu', mx_lib, true) - python3_gpu_ut('ubuntu_gpu') - utils.publish_test_coverage() - } finally { - utils.collect_test_results_unix('nosetests_gpu.xml', 'nosetests_python3_gpu.xml') - } - } - } - }, - 'Python2: Quantize GPU': { - node(NODE_LINUX_GPU_P3) { - ws('workspace/ut-python2-quantize-gpu') { - timeout(time: max_time, unit: 'MINUTES') { - try { - utils.unpack_and_init('gpu', mx_lib, true) - utils.docker_run('ubuntu_gpu', 'unittest_ubuntu_python2_quantization_gpu', true) - utils.publish_test_coverage() - } finally { - utils.collect_test_results_unix('nosetests_quantization_gpu.xml', 'nosetests_python2_quantize_gpu.xml') - } - } - } - } - }, - 'Python3: Quantize GPU': { - node(NODE_LINUX_GPU_P3) { - ws('workspace/ut-python3-quantize-gpu') { - timeout(time: max_time, unit: 'MINUTES') { - try { - utils.unpack_and_init('gpu', mx_lib, true) - utils.docker_run('ubuntu_gpu', 'unittest_ubuntu_python3_quantization_gpu', true) - utils.publish_test_coverage() - } finally { - utils.collect_test_results_unix('nosetests_quantization_gpu.xml', 'nosetests_python3_quantize_gpu.xml') - } - } - } - } - }, - 'Python2: MKLDNN-CPU': { - node(NODE_LINUX_CPU) { - ws('workspace/ut-python2-mkldnn-cpu') { - try { - utils.unpack_and_init('mkldnn_cpu', mx_mkldnn_lib, true) - python2_ut('ubuntu_cpu') - utils.publish_test_coverage() - } finally { - utils.collect_test_results_unix('nosetests_unittest.xml', 'nosetests_python2_mkldnn_cpu_unittest.xml') - utils.collect_test_results_unix('nosetests_train.xml', 'nosetests_python2_mkldnn_cpu_train.xml') - utils.collect_test_results_unix('nosetests_quantization.xml', 'nosetests_python2_mkldnn_cpu_quantization.xml') - } - } - } - }, - 'Python2: MKLDNN-GPU': { - node(NODE_LINUX_GPU) { - ws('workspace/ut-python2-mkldnn-gpu') { - try { - utils.unpack_and_init('mkldnn_gpu', mx_mkldnn_lib, true) - python2_gpu_ut('ubuntu_gpu') - utils.publish_test_coverage() - } finally { - 
utils.collect_test_results_unix('nosetests_gpu.xml', 'nosetests_python2_mkldnn_gpu.xml') - } - } - } - }, - 'Python3: MKLDNN-CPU': { - node(NODE_LINUX_CPU) { - ws('workspace/ut-python3-mkldnn-cpu') { - try { - utils.unpack_and_init('mkldnn_cpu', mx_mkldnn_lib, true) - python3_ut_mkldnn('ubuntu_cpu') - utils.publish_test_coverage() - } finally { - utils.collect_test_results_unix('nosetests_unittest.xml', 'nosetests_python3_mkldnn_cpu_unittest.xml') - utils.collect_test_results_unix('nosetests_mkl.xml', 'nosetests_python3_mkldnn_cpu_mkl.xml') - } - } - } - }, - 'Python3: MKLDNN-GPU': { - node(NODE_LINUX_GPU) { - ws('workspace/ut-python3-mkldnn-gpu') { - try { - utils.unpack_and_init('mkldnn_gpu', mx_mkldnn_lib, true) - python3_gpu_ut('ubuntu_gpu') - utils.publish_test_coverage() - } finally { - utils.collect_test_results_unix('nosetests_gpu.xml', 'nosetests_python3_mkldnn_gpu.xml') - } - } - } - }, - 'Python3: MKLDNN-GPU-NOCUDNN': { - node(NODE_LINUX_GPU) { - ws('workspace/ut-python3-mkldnn-gpu-nocudnn') { - try { - utils.unpack_and_init('mkldnn_gpu_nocudnn', mx_mkldnn_lib, true) - python3_gpu_ut_nocudnn('ubuntu_gpu') - utils.publish_test_coverage() - } finally { - utils.collect_test_results_unix('nosetests_gpu.xml', 'nosetests_python3_mkldnn_gpu_nocudnn.xml') - } - } - } - }, - 'Python3: CentOS 7 CPU': { - node(NODE_LINUX_CPU) { - ws('workspace/build-centos7-cpu') { - timeout(time: max_time, unit: 'MINUTES') { - try { - utils.unpack_and_init('centos7_cpu', mx_lib, true) - utils.docker_run('centos7_cpu', 'unittest_centos7_cpu', false) - utils.publish_test_coverage() - } finally { - utils.collect_test_results_unix('nosetests_unittest.xml', 'nosetests_python3_centos7_cpu_unittest.xml') - utils.collect_test_results_unix('nosetests_train.xml', 'nosetests_python3_centos7_cpu_train.xml') - } - } - } - } - }, - 'Python3: CentOS 7 GPU': { - node(NODE_LINUX_GPU) { - ws('workspace/build-centos7-gpu') { - timeout(time: max_time, unit: 'MINUTES') { - try { - utils.unpack_and_init('centos7_gpu', mx_lib, true) - utils.docker_run('centos7_gpu', 'unittest_centos7_gpu', true) - utils.publish_test_coverage() - } finally { - utils.collect_test_results_unix('nosetests_gpu.xml', 'nosetests_python3_centos7_gpu.xml') - } - } - } - } - }, - 'Python3: TensorRT GPU': { - node(NODE_LINUX_GPU_P3) { - ws('workspace/build-tensorrt') { - timeout(time: max_time, unit: 'MINUTES') { - try { - utils.unpack_and_init('tensorrt', mx_tensorrt_lib, true) - utils.docker_run('ubuntu_gpu_tensorrt', 'unittest_ubuntu_tensorrt_gpu', true) - utils.publish_test_coverage() - } finally { - utils.collect_test_results_unix('nosetests_tensorrt.xml', 'nosetests_python3_tensorrt_gpu.xml') - } - } - } - } - }, - 'Scala: CPU': { - node(NODE_LINUX_CPU) { - ws('workspace/ut-scala-cpu') { - timeout(time: max_time, unit: 'MINUTES') { - utils.unpack_and_init('cpu', mx_dist_lib, true) - utils.docker_run('ubuntu_cpu', 'unittest_ubuntu_cpu_scala', false) - utils.publish_test_coverage() - } - } - } - }, - 'Scala: CentOS CPU': { - node(NODE_LINUX_CPU) { - ws('workspace/ut-scala-centos7-cpu') { - timeout(time: max_time, unit: 'MINUTES') { - utils.unpack_and_init('centos7_cpu', mx_dist_lib, true) - utils.docker_run('centos7_cpu', 'unittest_centos7_cpu_scala', false) - utils.publish_test_coverage() - } - } - } - }, - 'Clojure: CPU': { - node(NODE_LINUX_CPU) { - ws('workspace/ut-clojure-cpu') { - timeout(time: max_time, unit: 'MINUTES') { - utils.unpack_and_init('cpu', mx_dist_lib, true) - utils.docker_run('ubuntu_cpu', 'unittest_ubuntu_cpu_clojure', false) - 
utils.publish_test_coverage() - } - } - } - }, - 'Perl: CPU': { - node(NODE_LINUX_CPU) { - ws('workspace/ut-perl-cpu') { - timeout(time: max_time, unit: 'MINUTES') { - utils.unpack_and_init('cpu', mx_lib, true) - utils.docker_run('ubuntu_cpu', 'unittest_ubuntu_cpugpu_perl', false) - utils.publish_test_coverage() - } - } - } - }, - 'Perl: GPU': { - node(NODE_LINUX_GPU) { - ws('workspace/ut-perl-gpu') { - timeout(time: max_time, unit: 'MINUTES') { - utils.unpack_and_init('gpu', mx_lib, true) - utils.docker_run('ubuntu_gpu', 'unittest_ubuntu_cpugpu_perl', true) - utils.publish_test_coverage() - } - } - } - }, - 'Cpp: GPU': { - node(NODE_LINUX_GPU) { - ws('workspace/ut-cpp-gpu') { - timeout(time: max_time, unit: 'MINUTES') { - utils.unpack_and_init('cmake_gpu', mx_cmake_lib, true) - utils.docker_run('ubuntu_gpu', 'unittest_ubuntu_gpu_cpp', true) - utils.publish_test_coverage() - } - } - } - }, - 'Cpp: MKLDNN+GPU': { - node(NODE_LINUX_GPU) { - ws('workspace/ut-cpp-mkldnn-gpu') { - timeout(time: max_time, unit: 'MINUTES') { - utils.unpack_and_init('cmake_mkldnn_gpu', mx_cmake_mkldnn_lib, true) - utils.docker_run('ubuntu_gpu', 'unittest_ubuntu_gpu_cpp', true) - utils.publish_test_coverage() - } - } - } - }, - 'R: CPU': { - node(NODE_LINUX_CPU) { - ws('workspace/ut-r-cpu') { - timeout(time: max_time, unit: 'MINUTES') { - utils.unpack_and_init('cpu', mx_lib, true) - utils.docker_run('ubuntu_cpu', 'unittest_ubuntu_cpu_R', false) - utils.publish_test_coverage() - } - } - } - }, - 'R: GPU': { - node(NODE_LINUX_GPU) { - ws('workspace/ut-r-gpu') { - timeout(time: max_time, unit: 'MINUTES') { - utils.unpack_and_init('gpu', mx_lib, true) - utils.docker_run('ubuntu_gpu', 'unittest_ubuntu_gpu_R', true) - utils.publish_test_coverage() - } - } - } - }, - 'Julia 0.6: CPU': { - node(NODE_LINUX_CPU) { - ws('workspace/ut-julia06-cpu') { - timeout(time: max_time, unit: 'MINUTES') { - utils.unpack_and_init('cpu', mx_lib) - utils.docker_run('ubuntu_cpu', 'unittest_ubuntu_cpu_julia06', false) - } - } - } - }, - - 'Python 2: CPU Win':{ - node(NODE_WINDOWS_CPU) { - timeout(time: max_time, unit: 'MINUTES') { - ws('workspace/ut-python-cpu') { - try { - utils.init_git_win() - unstash 'windows_package_cpu' - powershell 'ci/windows/test_py2_cpu.ps1' - } finally { - utils.collect_test_results_windows('nosetests_unittest.xml', 'nosetests_unittest_windows_python2_cpu.xml') - } - } - } - } - }, - 'Python 3: CPU Win': { - node(NODE_WINDOWS_CPU) { - timeout(time: max_time, unit: 'MINUTES') { - ws('workspace/ut-python-cpu') { - try { - utils.init_git_win() - unstash 'windows_package_cpu' - powershell 'ci/windows/test_py3_cpu.ps1' - } finally { - utils.collect_test_results_windows('nosetests_unittest.xml', 'nosetests_unittest_windows_python3_cpu.xml') - } - } - } - } - }, - 'Python 2: GPU Win':{ - node(NODE_WINDOWS_GPU) { - timeout(time: max_time, unit: 'MINUTES') { - ws('workspace/ut-python-gpu') { - try { - utils.init_git_win() - unstash 'windows_package_gpu' - powershell 'ci/windows/test_py2_gpu.ps1' - } finally { - utils.collect_test_results_windows('nosetests_forward.xml', 'nosetests_gpu_forward_windows_python2_gpu.xml') - utils.collect_test_results_windows('nosetests_operator.xml', 'nosetests_gpu_operator_windows_python2_gpu.xml') - } - } - } - } - }, - 'Python 3: GPU Win':{ - node(NODE_WINDOWS_GPU) { - timeout(time: max_time, unit: 'MINUTES') { - ws('workspace/ut-python-gpu') { - try { - utils.init_git_win() - unstash 'windows_package_gpu' - powershell 'ci/windows/test_py3_gpu.ps1' - } finally { - 
utils.collect_test_results_windows('nosetests_forward.xml', 'nosetests_gpu_forward_windows_python3_gpu.xml') - utils.collect_test_results_windows('nosetests_operator.xml', 'nosetests_gpu_operator_windows_python3_gpu.xml') - } - } - } - } - }, - 'Python 3: MKLDNN-GPU Win':{ - node(NODE_WINDOWS_GPU) { - timeout(time: max_time, unit: 'MINUTES') { - ws('workspace/ut-python-gpu') { - try { - utils.init_git_win() - unstash 'windows_package_gpu_mkldnn' - powershell 'ci/windows/test_py3_gpu.ps1' - } finally { - utils.collect_test_results_windows('nosetests_forward.xml', 'nosetests_gpu_forward_windows_python3_gpu_mkldnn.xml') - utils.collect_test_results_windows('nosetests_operator.xml', 'nosetests_gpu_operator_windows_python3_gpu_mkldnn.xml') - } - } - } - } - }, - 'Onnx CPU': { - node(NODE_LINUX_CPU) { - ws('workspace/it-onnx-cpu') { - timeout(time: max_time, unit: 'MINUTES') { - utils.unpack_and_init('cpu', mx_lib, true) - utils.docker_run('ubuntu_cpu', 'integrationtest_ubuntu_cpu_onnx', false) - utils.publish_test_coverage() - } - } - } - }, - 'Python GPU': { - node(NODE_LINUX_GPU) { - ws('workspace/it-python-gpu') { - timeout(time: max_time, unit: 'MINUTES') { - utils.unpack_and_init('gpu', mx_lib, true) - utils.docker_run('ubuntu_gpu', 'integrationtest_ubuntu_gpu_python', true) - utils.publish_test_coverage() - } - } - } - }, - 'cpp-package GPU': { - node(NODE_LINUX_GPU) { - ws('workspace/it-cpp-package') { - timeout(time: max_time, unit: 'MINUTES') { - utils.unpack_and_init('gpu', mx_lib_cpp_examples, true) - utils.docker_run('ubuntu_gpu', 'integrationtest_ubuntu_gpu_cpp_package', true) - utils.publish_test_coverage() - } - } - } - }, - // Disabled due to: https://github.com/apache/incubator-mxnet/issues/11407 - // 'Caffe GPU': { - // node(NODE_LINUX_GPU) { - // ws('workspace/it-caffe') { - // timeout(time: max_time, unit: 'MINUTES') { - // utils.init_git() - // utils.unpack_lib('gpu', mx_lib) - // utils.docker_run('ubuntu_gpu', 'integrationtest_ubuntu_gpu_caffe', true) - // utils.publish_test_coverage() - // } - // } - // } - // }, - 'dist-kvstore tests GPU': { - node(NODE_LINUX_GPU) { - ws('workspace/it-dist-kvstore') { - timeout(time: max_time, unit: 'MINUTES') { - utils.unpack_and_init('gpu', mx_lib, true) - utils.docker_run('ubuntu_gpu', 'integrationtest_ubuntu_gpu_dist_kvstore', true) - utils.publish_test_coverage() - } - } - } - }, - /* Disabled due to master build failure: - * http://jenkins.mxnet-ci.amazon-ml.com/blue/organizations/jenkins/incubator-mxnet/detail/master/1221/pipeline/ - * https://github.com/apache/incubator-mxnet/issues/11801 - - 'dist-kvstore tests CPU': { - node(NODE_LINUX_CPU) { - ws('workspace/it-dist-kvstore') { - timeout(time: max_time, unit: 'MINUTES') { - utils.unpack_and_init('cpu', mx_lib, true) - utils.docker_run('ubuntu_cpu', 'integrationtest_ubuntu_cpu_dist_kvstore', false) - utils.publish_test_coverage() - } - } - } - }, */ - 'Scala: GPU': { - node(NODE_LINUX_GPU) { - ws('workspace/ut-scala-gpu') { - timeout(time: max_time, unit: 'MINUTES') { - utils.unpack_and_init('gpu', mx_dist_lib, true) - utils.docker_run('ubuntu_gpu', 'integrationtest_ubuntu_gpu_scala', true) - utils.publish_test_coverage() - } - } - } - }, - 'ARMv7 QEMU': { - node(NODE_LINUX_CPU) { - ws('workspace/ut-armv7-qemu') { - timeout(time: max_time, unit: 'MINUTES') { - utils.unpack_and_init('armv7', mx_pip) - sh "ci/build.py --docker-registry ${env.DOCKER_CACHE_REGISTRY} -p test.arm_qemu ./runtime_functions.py run_ut_py3_qemu" - } - } - } - } - } - - stage('Deploy') { - deploy_docs() - } 
-}
-,
-failure_handler: {
-  // Only send email if master or release branches failed
-  if (currentBuild.result == "FAILURE" && (env.BRANCH_NAME == "master" || env.BRANCH_NAME.startsWith("v"))) {
-    emailext body: 'Build for MXNet branch ${BRANCH_NAME} has broken. Please view the build at ${BUILD_URL}', replyTo: '${EMAIL}', subject: '[BUILD FAILED] Branch ${BRANCH_NAME} build ${BUILD_NUMBER}', to: '${EMAIL}'
-  }
-}
-)

From d8029c84d07ec98a8706ff10e09c7664c4a3e9b3 Mon Sep 17 00:00:00 2001
From: Steffen Rochel
Date: Sat, 1 Dec 2018 20:26:45 -0800
Subject: [PATCH 20/28] update github location for sampled_block.py (#13508)

Updated to https://github.com/dmlc/gluon-nlp/blob/master/src/gluonnlp/model/sampled_block.py
---
 example/recommenders/README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/example/recommenders/README.md b/example/recommenders/README.md
index 4b1d5ca6da14..628182c849b8 100644
--- a/example/recommenders/README.md
+++ b/example/recommenders/README.md
@@ -17,7 +17,7 @@ The examples are driven by notebook files.

 ### Negative Sampling
 * A previous version of this example had an example of negative sampling. For example of negative sampling, please refer to:
-  [Gluon NLP Sampled Block](https://github.com/dmlc/gluon-nlp/blob/master/gluonnlp/model/sampled_block.py)
+  [Gluon NLP Sampled Block](https://github.com/dmlc/gluon-nlp/blob/master/src/gluonnlp/model/sampled_block.py)

 ## Acknowledgements

From 96f5beb904f73cf30db45516af8253edcb04a763 Mon Sep 17 00:00:00 2001
From: Nicolas Modrzyk
Date: Sun, 2 Dec 2018 23:34:36 +0900
Subject: [PATCH 21/28] #13453 [Clojure] - Add Spec Validations to the Optimizer namespace (#13499)

---
 .../org/apache/clojure_mxnet/optimizer.clj    | 52 +++++++++++++++++--
 .../apache/clojure_mxnet/optimizer_test.clj   | 10 ++++
 2 files changed, 57 insertions(+), 5 deletions(-)

diff --git a/contrib/clojure-package/src/org/apache/clojure_mxnet/optimizer.clj b/contrib/clojure-package/src/org/apache/clojure_mxnet/optimizer.clj
index f18ff40f5698..f77f5532bfb1 100644
--- a/contrib/clojure-package/src/org/apache/clojure_mxnet/optimizer.clj
+++ b/contrib/clojure-package/src/org/apache/clojure_mxnet/optimizer.clj
@@ -17,7 +17,19 @@

 (ns org.apache.clojure-mxnet.optimizer
   (:refer-clojure :exclude [update])
-  (:import (org.apache.mxnet.optimizer SGD DCASGD NAG AdaDelta RMSProp AdaGrad Adam SGLD)))
+  (:require
+   [clojure.spec.alpha :as s]
+   [org.apache.clojure-mxnet.util :as util])
+  (:import
+   (org.apache.mxnet.optimizer SGD DCASGD NAG AdaDelta RMSProp AdaGrad Adam SGLD)
+   (org.apache.mxnet FactorScheduler)))
+
+(s/def ::learning-rate float?)
+(s/def ::momentum float?)
+(s/def ::wd float?)
+(s/def ::clip-gradient float?)
+(s/def ::lr-scheduler #(instance? FactorScheduler %))
+(s/def ::sgd-opts (s/keys :opt-un [::learning-rate ::momentum ::wd ::clip-gradient ::lr-scheduler]))

 (defn sgd
   "A very simple SGD optimizer with momentum and weight regularization."
@@ -26,10 +38,14 @@
           momentum 0.0
           wd 0.0001
           clip-gradient 0}}]
+   (util/validate! ::sgd-opts opts "Incorrect sgd optimizer options")
   (new SGD (float learning-rate) (float momentum) (float wd) (float clip-gradient) lr-scheduler))
  ([]
   (sgd {})))

+(s/def ::lambda float?)
+(s/def ::dcasgd-opts (s/keys :opt-un [::learning-rate ::momentum ::lambda ::wd ::clip-gradient ::lr-scheduler]))
+
 (defn dcasgd
   "DCASGD optimizer with momentum and weight regularization.
   Implementation of paper 'Asynchronous Stochastic Gradient Descent with
@@ -40,10 +56,13 @@
           lambda 0.04
           wd 0.0
           clip-gradient 0}}]
+   (util/validate! ::dcasgd-opts opts "Incorrect dcasgd optimizer options")
   (new DCASGD
        (float learning-rate) (float lambda) (float momentum) (float wd) (float clip-gradient) lr-scheduler))
  ([]
   (dcasgd {})))

+(s/def ::nag-opts (s/keys :opt-un [::learning-rate ::momentum ::wd ::clip-gradient ::lr-scheduler]))
+
 (defn nag
   "SGD with nesterov.
    It is implemented according to
@@ -53,10 +72,16 @@
           momentum 0.0
           wd 0.0001
           clip-gradient 0}}]
+   (util/validate! ::nag-opts opts "Incorrect nag optimizer options")
   (new NAG (float learning-rate) (float momentum) (float wd) (float clip-gradient) lr-scheduler))
  ([]
   (nag {})))

+(s/def ::rho float?)
+(s/def ::rescale-gradient float?)
+(s/def ::epsilon float?)
+(s/def ::ada-delta-opts (s/keys :opt-un [::rho ::rescale-gradient ::epsilon ::wd ::clip-gradient]))
+
 (defn ada-delta
   "AdaDelta optimizer as described in Matthew D. Zeiler, 2012.
    http://arxiv.org/abs/1212.5701"
@@ -66,10 +91,15 @@
           epsilon 1e-8
           wd 0.0
           clip-gradient 0}}]
+   (util/validate! ::ada-delta-opts opts "Incorrect ada-delta optimizer options")
   (new AdaDelta (float rho) (float rescale-gradient) (float epsilon) (float wd) (float clip-gradient)))
  ([]
   (ada-delta {})))

+(s/def ::gamma1 float?)
+(s/def ::gamma2 float?)
+(s/def ::rms-prop-opts (s/keys :opt-un [::learning-rate ::rescale-gradient ::gamma1 ::gamma2 ::wd ::clip-gradient]))
+
 (defn rms-prop
   "RMSProp optimizer as described in Tieleman & Hinton, 2012.
    http://arxiv.org/pdf/1308.0850v5.pdf Eq(38) - Eq(45) by Alex Graves, 2013.
@@ -80,18 +110,21 @@
   - wd L2 regularization coefficient add to all the weights
   - clip-gradient clip gradient in range [-clip_gradient, clip_gradient]
   - lr-scheduler The learning rate scheduler"
-  ([{:keys [learning-rate rescale-gradient gamma1 gamma2 wd lr-scheduler clip-gradient]
+  ([{:keys [learning-rate rescale-gradient gamma1 gamma2 wd lr-scheduler clip-gradient] :as opts
     :or {learning-rate 0.002
          rescale-gradient 1.0
          gamma1 0.95
          gamma2 0.9
          wd 0.0
          clip-gradient 0}}]
+   (util/validate! ::rms-prop-opts opts "Incorrect rms-prop optimizer options")
   (new RMSProp (float learning-rate) (float rescale-gradient) (float gamma1) (float gamma2) (float wd) lr-scheduler (float clip-gradient)))
  ([]
   (rms-prop {})))

+(s/def ::ada-grad-opts (s/keys :opt-un [::learning-rate ::rescale-gradient ::epsilon ::wd]))
+
 (defn ada-grad
   " AdaGrad optimizer as described in Duchi, Hazan and Singer, 2011.
    http://www.jmlr.org/papers/volume12/duchi11a/duchi11a.pdf
@@ -101,15 +134,20 @@
     Default value is set to 1e-7.
   - rescale-gradient rescaling factor of gradient.
   - wd L2 regularization coefficient add to all the weights"
-  ([{:keys [learning-rate rescale-gradient epsilon wd]
+  ([{:keys [learning-rate rescale-gradient epsilon wd] :as opts
     :or {learning-rate 0.05
         rescale-gradient 1.0
        epsilon 1e-7
        wd 0.0}}]
+   (util/validate! ::ada-grad-opts opts "Incorrect ada-grad optimizer options")
   (new AdaGrad (float learning-rate) (float rescale-gradient) (float epsilon) (float wd)))
  ([]
   (ada-grad {})))

+(s/def ::beta1 float?)
+(s/def ::beta2 float?)
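+;; ::decay-factor is listed in ::adam-opts below but is never registered as a
+;; spec; without this definition a supplied :decay-factor value would go
+;; unvalidated.
+(s/def ::decay-factor float?)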
+(s/def ::adam-opts (s/keys :opt-un [::learning-rate ::beta1 ::beta2 ::epsilon ::decay-factor ::wd ::clip-gradient ::lr-scheduler])) + (defn adam "Adam optimizer as described in [King2014] @@ -125,7 +163,7 @@ - wd L2 regularization coefficient add to all the weights - clip-gradient clip gradient in range [-clip_gradient, clip_gradient] - lr-scheduler The learning rate scheduler" - ([{:keys [learning-rate beta1 beta2 epsilon decay-factor wd clip-gradient lr-scheduler] + ([{:keys [learning-rate beta1 beta2 epsilon decay-factor wd clip-gradient lr-scheduler] :as opts :or {learning-rate 0.002 beta1 0.9 beta2 0.999 @@ -133,11 +171,14 @@ decay-factor (- 1 1e-8) wd 0 clip-gradient 0}}] + (util/validate! ::adam-opts opts "Incorrect adam optimizer options") (new Adam (float learning-rate) (float beta1) (float beta2) (float epsilon) (float decay-factor) (float wd) (float clip-gradient) lr-scheduler)) ([] (adam {}))) +(s/def ::sgld-opts (s/keys :opt-un [::learning-rate ::rescale-gradient ::wd ::clip-gradient ::lr-scheduler])) + (defn sgld "Stochastic Langevin Dynamics Updater to sample from a distribution. @@ -146,11 +187,12 @@ - wd L2 regularization coefficient add to all the weights - clip-gradient Float, clip gradient in range [-clip_gradient, clip_gradient] - lr-scheduler The learning rate scheduler" - ([{:keys [learning-rate rescale-gradient wd clip-gradient lr-scheduler] + ([{:keys [learning-rate rescale-gradient wd clip-gradient lr-scheduler] :as opts :or {learning-rate 0.01 rescale-gradient 1 wd 0.0001 clip-gradient 0}}] + (util/validate! ::sgld-opts opts "Incorrect sgld optimizer options") (new SGLD (float learning-rate) (float rescale-gradient) (float wd) (float clip-gradient) lr-scheduler)) ([] diff --git a/contrib/clojure-package/test/org/apache/clojure_mxnet/optimizer_test.clj b/contrib/clojure-package/test/org/apache/clojure_mxnet/optimizer_test.clj index f6461b10f028..599a0672bea5 100644 --- a/contrib/clojure-package/test/org/apache/clojure_mxnet/optimizer_test.clj +++ b/contrib/clojure-package/test/org/apache/clojure_mxnet/optimizer_test.clj @@ -44,3 +44,13 @@ ["sgld" optimizer/sgld]]] (doseq [opt opts] (test-optimizer opt)))) + +(deftest test-optimizers-parameters-specs + (is (thrown? Exception (optimizer/sgd {:wd 'a}))) + (is (thrown? Exception (optimizer/dcasgd {:lambda 'a}))) + (is (thrown? Exception (optimizer/nag {:momentum 'a}))) + (is (thrown? Exception (optimizer/ada-delta {:epsilon 'a}))) + (is (thrown? Exception (optimizer/rms-prop {:gamma1 'a}))) + (is (thrown? Exception (optimizer/ada-grad {:rescale-gradient 'a}))) + (is (thrown? Exception (optimizer/adam {:beta1 'a}))) + (is (thrown? Exception (optimizer/sgld {:lr-scheduler 0.1})))) \ No newline at end of file From 09b660719a9b28c79994304a268414bade420caa Mon Sep 17 00:00:00 2001 From: Vandana Kannan Date: Sun, 2 Dec 2018 20:30:35 -0800 Subject: [PATCH 22/28] ONNX export: Logical operators (#12852) --- .../contrib/onnx/mx2onnx/_op_translations.py | 32 +++++++++++++++ .../onnx/export/mxnet_export_test.py | 39 +++++++++++++++++++ tests/python-pytest/onnx/import/test_cases.py | 1 - 3 files changed, 71 insertions(+), 1 deletion(-) diff --git a/python/mxnet/contrib/onnx/mx2onnx/_op_translations.py b/python/mxnet/contrib/onnx/mx2onnx/_op_translations.py index facdcfedcbca..86767a667128 100644 --- a/python/mxnet/contrib/onnx/mx2onnx/_op_translations.py +++ b/python/mxnet/contrib/onnx/mx2onnx/_op_translations.py @@ -1613,3 +1613,35 @@ def convert_broadcast_equal(node, **kwargs): and return the created node. 
""" return create_basic_op_node('Equal', node, kwargs) + + +@mx_op.register("broadcast_logical_and") +def convert_broadcast_logical_and(node, **kwargs): + """Map MXNet's broadcast logical and operator attributes to onnx's Add operator + and return the created node. + """ + return create_basic_op_node('And', node, kwargs) + + +@mx_op.register("broadcast_logical_or") +def convert_broadcast_logical_or(node, **kwargs): + """Map MXNet's broadcast logical or operator attributes to onnx's Or operator + and return the created node. + """ + return create_basic_op_node('Or', node, kwargs) + + +@mx_op.register("broadcast_logical_xor") +def convert_broadcast_logical_xor(node, **kwargs): + """Map MXNet's broadcast logical xor operator attributes to onnx's Xor operator + and return the created node. + """ + return create_basic_op_node('Xor', node, kwargs) + + +@mx_op.register("logical_not") +def convert_logical_not(node, **kwargs): + """Map MXNet's logical not operator attributes to onnx's Not operator + and return the created node. + """ + return create_basic_op_node('Not', node, kwargs) diff --git a/tests/python-pytest/onnx/export/mxnet_export_test.py b/tests/python-pytest/onnx/export/mxnet_export_test.py index 964d0e760cae..6b858f05e24f 100644 --- a/tests/python-pytest/onnx/export/mxnet_export_test.py +++ b/tests/python-pytest/onnx/export/mxnet_export_test.py @@ -268,6 +268,45 @@ def test_ops(op_name, inputs, input_tensors, numpy_op): test_ops("Equal", input_data, input_tensor, np.equal(input_data[0], input_data[1]).astype(np.float32)) + +def get_int_inputs(interval, shape): + """Helper to get integer input of given shape and range""" + assert len(interval) == len(shape) + inputs = [] + input_tensors = [] + for idx in range(len(interval)): + low, high = interval[idx] + inputs.append(np.random.randint(low, high, size=shape[idx]).astype("float32")) + input_tensors.append(helper.make_tensor_value_info("input"+str(idx+1), + TensorProto.FLOAT, shape=shape[idx])) + return inputs, input_tensors + + +@with_seed() +def test_logical_ops(): + """Test for logical and, or, not, xor operators""" + def test_ops(op_name, inputs, input_tensors, numpy_op): + outputs = [helper.make_tensor_value_info("output", TensorProto.FLOAT, shape=np.shape(inputs[0]))] + nodes = [helper.make_node(op_name, ["input"+str(i+1) for i in range(len(inputs))], ["output"])] + graph = helper.make_graph(nodes, + op_name + "_test", + input_tensors, + outputs) + model = helper.make_model(graph) + bkd_rep = backend.prepare(model) + output = bkd_rep.run(inputs) + npt.assert_almost_equal(output[0], numpy_op) + input_data, input_tensor = get_int_inputs([(0, 2), (0, 2)], [(3, 4, 5), (3, 4, 5)]) + test_ops("And", input_data, input_tensor, + np.logical_and(input_data[0], input_data[1]).astype(np.float32)) + test_ops("Or", input_data, input_tensor, + np.logical_or(input_data[0], input_data[1]).astype(np.float32)) + test_ops("Xor", input_data, input_tensor, + np.logical_xor(input_data[0], input_data[1]).astype(np.float32)) + test_ops("Not", [input_data[0]], [input_tensor[0]], + np.logical_not(input_data[0]).astype(np.float32)) + + def _assert_sym_equal(lhs, rhs): assert lhs.list_inputs() == rhs.list_inputs() # input names must be identical assert len(lhs.list_outputs()) == len(rhs.list_outputs()) # number of outputs must be identical diff --git a/tests/python-pytest/onnx/import/test_cases.py b/tests/python-pytest/onnx/import/test_cases.py index aed68ffa114c..f41fe92352db 100644 --- a/tests/python-pytest/onnx/import/test_cases.py +++ 
b/tests/python-pytest/onnx/import/test_cases.py @@ -55,7 +55,6 @@ 'test_argmax', 'test_argmin', 'test_min', - 'test_logical_', # enabling partial test cases for matmul 'test_matmul_3d', 'test_matmul_4d', From dd9d80ca18e898b4443488eba3fd7e8bcf0ff225 Mon Sep 17 00:00:00 2001 From: Pedro Larroy <928489+larroy@users.noreply.github.com> Date: Mon, 3 Dec 2018 15:17:09 +0100 Subject: [PATCH 23/28] Fix cmake options parsing in dev_menu (#13458) Add GPU+MKLDNN unittests to dev_menu --- cmake/cmake_options.yml | 63 +++++++++++++++++++++-------------------- dev_menu.py | 20 +++++++------ 2 files changed, 44 insertions(+), 39 deletions(-) diff --git a/cmake/cmake_options.yml b/cmake/cmake_options.yml index 6fbf4e1d0617..01446f7b8f28 100644 --- a/cmake/cmake_options.yml +++ b/cmake/cmake_options.yml @@ -16,34 +16,35 @@ # under the License. --- # CMake configuration -USE_CUDA: OFF # Build with CUDA support -USE_OLDCMAKECUDA: OFF # Build with old cmake cuda -USE_NCCL: OFF # Use NVidia NCCL with CUDA -USE_OPENCV: ON # Build with OpenCV support -USE_OPENMP: ON # Build with Openmp support -USE_CUDNN: ON # Build with cudnn support) # one could set CUDNN_ROOT for search path -USE_SSE: ON # Build with x86 SSE instruction support IF NOT ARM -USE_F16C: ON # Build with x86 F16C instruction support) # autodetects support if ON -USE_LAPACK: ON # Build with lapack support -USE_MKL_IF_AVAILABLE: ON # Use MKL if found -USE_MKLML_MKL: ON # Use MKLDNN variant of MKL (if MKL found) IF USE_MKL_IF_AVAILABLE AND (NOT APPLE) -USE_MKLDNN: ON # Use MKLDNN variant of MKL (if MKL found) IF USE_MKL_IF_AVAILABLE AND (NOT APPLE) -USE_OPERATOR_TUNING: ON # Enable auto-tuning of operators IF NOT MSVC -USE_GPERFTOOLS: ON # Build with GPerfTools support (if found) -USE_JEMALLOC: ON # Build with Jemalloc support -USE_PROFILER: ON # Build with Profiler support -USE_DIST_KVSTORE: OFF # Build with DIST_KVSTORE support -USE_PLUGINS_WARPCTC: OFF # Use WARPCTC Plugins -USE_PLUGIN_CAFFE: OFF # Use Caffe Plugin -USE_CPP_PACKAGE: OFF # Build C++ Package -USE_MXNET_LIB_NAMING: ON # Use MXNet library naming conventions. -USE_GPROF: OFF # Compile with gprof (profiling) flag -USE_CXX14_IF_AVAILABLE: OFF # Build with C++14 if the compiler supports it -USE_VTUNE: OFF # Enable use of Intel Amplifier XE (VTune)) # one could set VTUNE_ROOT for search path -ENABLE_CUDA_RTC: ON # Build with CUDA runtime compilation support -BUILD_CPP_EXAMPLES: ON # Build cpp examples -INSTALL_EXAMPLES: OFF # Install the example source files. -USE_SIGNAL_HANDLER: OFF # Print stack traces on segfaults. -USE_TENSORRT: OFF # Enable infeference optimization with TensorRT. -USE_ASAN: OFF # Enable Clang/GCC ASAN sanitizers. 
-ENABLE_TESTCOVERAGE: OFF # Enable compilation with test coverage metric output
+USE_CUDA: "ON" # Build with CUDA support
+USE_OLDCMAKECUDA: "OFF" # Build with old cmake cuda
+USE_NCCL: "OFF" # Use NVidia NCCL with CUDA
+USE_OPENCV: "ON" # Build with OpenCV support
+USE_OPENMP: "ON" # Build with Openmp support
+USE_CUDNN: "ON" # Build with cudnn support) # one could set CUDNN_ROOT for search path
+USE_SSE: "ON" # Build with x86 SSE instruction support IF NOT ARM
+USE_F16C: "ON" # Build with x86 F16C instruction support) # autodetects support if "ON"
+USE_LAPACK: "ON" # Build with lapack support
+USE_MKL_IF_AVAILABLE: "ON" # Use MKL if found
+USE_MKLML_MKL: "ON" # Use MKLDNN variant of MKL (if MKL found) IF USE_MKL_IF_AVAILABLE AND (NOT APPLE)
+USE_MKLDNN: "ON" # Use MKLDNN variant of MKL (if MKL found) IF USE_MKL_IF_AVAILABLE AND (NOT APPLE)
+USE_OPERATOR_TUNING: "ON" # Enable auto-tuning of operators IF NOT MSVC
+USE_GPERFTOOLS: "ON" # Build with GPerfTools support (if found)
+USE_JEMALLOC: "ON" # Build with Jemalloc support
+USE_PROFILER: "ON" # Build with Profiler support
+USE_DIST_KVSTORE: "OFF" # Build with DIST_KVSTORE support
+USE_PLUGINS_WARPCTC: "OFF" # Use WARPCTC Plugins
+USE_PLUGIN_CAFFE: "OFF" # Use Caffe Plugin
+USE_CPP_PACKAGE: "OFF" # Build C++ Package
+USE_MXNET_LIB_NAMING: "ON" # Use MXNet library naming conventions.
+USE_GPROF: "OFF" # Compile with gprof (profiling) flag
+USE_CXX14_IF_AVAILABLE: "OFF" # Build with C++14 if the compiler supports it
+USE_VTUNE: "OFF" # Enable use of Intel Amplifier XE (VTune)) # one could set VTUNE_ROOT for search path
+ENABLE_CUDA_RTC: "ON" # Build with CUDA runtime compilation support
+BUILD_CPP_EXAMPLES: "ON" # Build cpp examples
+INSTALL_EXAMPLES: "OFF" # Install the example source files.
+USE_SIGNAL_HANDLER: "ON" # Print stack traces on segfaults.
+USE_TENSORRT: "OFF" # Enable inference optimization with TensorRT.
+USE_ASAN: "OFF" # Enable Clang/GCC ASAN sanitizers. 
+ENABLE_TESTCOVERAGE: "OFF" # Enable compilation with test coverage metric output +CMAKE_BUILD_TYPE: "Debug" diff --git a/dev_menu.py b/dev_menu.py index 27db9e8aca6f..0fd78cb222e3 100755 --- a/dev_menu.py +++ b/dev_menu.py @@ -46,8 +46,12 @@ def __call__(self): resp = input("Please answer yes or no: ") class CMake(object): - def __init__(self, cmake_options_yaml='cmake/cmake_options.yml'): - self.cmake_options_yaml = cmake_options_yaml + def __init__(self, cmake_options_yaml='cmake_options.yml', cmake_options_yaml_default='cmake/cmake_options.yml'): + if os.path.exists(cmake_options_yaml): + self.cmake_options_yaml = cmake_options_yaml + else: + self.cmake_options_yaml = cmake_options_yaml_default + logging.info('Using {} for CMake configuration'.format(self.cmake_options_yaml)) self.cmake_options = None self.read_config() @@ -58,13 +62,8 @@ def read_config(self): def _cmdlineflags(self): res = [] - def _bool_ON_OFF(x): - if x: - return 'ON' - else: - return 'OFF' for opt,v in self.cmake_options.items(): - res.append('-D{}={}'.format(opt,_bool_ON_OFF(v))) + res.append('-D{}={}'.format(opt,v)) return res def cmake_command(self) -> str: @@ -103,6 +102,11 @@ def __call__(self, build_dir='build', generator='Ninja', build_cmd='ninja'): "ci/build.py --platform ubuntu_gpu /work/runtime_functions.sh build_ubuntu_gpu", "ci/build.py --nvidiadocker --platform ubuntu_gpu /work/runtime_functions.sh unittest_ubuntu_python3_gpu", ]), + ('[Docker] Python3 GPU+MKLDNN unittests', + [ + "ci/build.py --platform ubuntu_gpu /work/runtime_functions.sh build_ubuntu_gpu_cmake_mkldnn", + "ci/build.py --nvidiadocker --platform ubuntu_gpu /work/runtime_functions.sh unittest_ubuntu_python3_gpu", + ]), ('[Docker] Python3 CPU Intel MKLDNN unittests', [ "ci/build.py --platform ubuntu_cpu /work/runtime_functions.sh build_ubuntu_cpu_mkldnn", From b901d5262e9d964d64998a6fff6d28c9ffc9d353 Mon Sep 17 00:00:00 2001 From: Anirudh Subramanian Date: Mon, 3 Dec 2018 13:50:12 -0800 Subject: [PATCH 24/28] Revert "Manually track num_max_thread (#12380)" (#13501) This reverts commit 75410210e07a5fab5e044348aee276d578d5857e. 
--- src/engine/openmp.cc | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/src/engine/openmp.cc b/src/engine/openmp.cc index 64899b09660e..8fe3939892d2 100644 --- a/src/engine/openmp.cc +++ b/src/engine/openmp.cc @@ -73,14 +73,18 @@ void OpenMP::set_reserve_cores(int cores) { CHECK_GE(cores, 0); reserve_cores_ = cores; #ifdef _OPENMP - omp_thread_max_ = std::max(omp_thread_max_ - reserve_cores_, 1); + if (reserve_cores_ >= omp_thread_max_) { + omp_set_num_threads(1); + } else { + omp_set_num_threads(omp_thread_max_ - reserve_cores_); + } #endif } int OpenMP::GetRecommendedOMPThreadCount(bool exclude_reserved) const { #ifdef _OPENMP if (omp_num_threads_set_in_environment_) { - return omp_thread_max_; + return omp_get_max_threads(); } if (enabled_) { int thread_count = omp_get_max_threads(); @@ -97,8 +101,10 @@ int OpenMP::GetRecommendedOMPThreadCount(bool exclude_reserved) const { } return omp_thread_max_; } -#endif return 1; +#else + return 1; +#endif } OpenMP *__init_omp__ = OpenMP::Get(); From c44bc853e70a8ba27d9fb8479d39466b7594cd6b Mon Sep 17 00:00:00 2001 From: Alexander Zai Date: Mon, 3 Dec 2018 15:19:40 -0800 Subject: [PATCH 25/28] Feature/mkldnn static 2 (#13503) * build mkldnn as static lib * update makefile to statically build mkldnn * build static mkldnn * fix static name * fix static name * update static for mac * rename mkldnn dep in ci * remove moving mkldnn dynamic lib * remove commented code * remove mkldnn dnaymic for unitest * force static for mkldnn lib * remove dynamic mkldnn bind * only link windows * add mkldnn.mk * try force linking * remove mkldnn dynanmic check * remove test mkldnn install * fix spacing * fix index * add artifacts * add comment about windows * remove static * update makefile --- CMakeLists.txt | 1 + Makefile | 9 +++- ci/docker/runtime_functions.sh | 3 -- ci/jenkins/Jenkins_steps.groovy | 8 ++-- mkldnn.mk | 12 ++++-- tests/cpp/unittest.mk | 8 ++-- tests/python/mkl/test_mkldnn.py | 6 +-- tests/python/mkl/test_mkldnn_install.py | 56 ------------------------- 8 files changed, 26 insertions(+), 77 deletions(-) delete mode 100644 tests/python/mkl/test_mkldnn_install.py diff --git a/CMakeLists.txt b/CMakeLists.txt index 3b8bbd2e0272..161705643194 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -227,6 +227,7 @@ if(USE_MKLDNN) include(cmake/DownloadMKLML.cmake) # CPU architecture (e.g., C5) can't run on another architecture (e.g., g3). 
if(NOT MSVC)
+    set(MKLDNN_LIBRARY_TYPE "STATIC" CACHE INTERNAL "" FORCE)
     set(ARCH_OPT_FLAGS "-mtune=generic")
   else()
     set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} /EHsc")
diff --git a/Makefile b/Makefile
index 16ea59f3d585..e424904ad785 100644
--- a/Makefile
+++ b/Makefile
@@ -131,8 +131,13 @@ ifeq ($(USE_MKLDNN), 1)
 		CFLAGS += -I$(MKLROOT)/include
 		LDFLAGS += -L$(MKLROOT)/lib
 	endif
-	CFLAGS += -I$(MKLDNNROOT)/include
-	LDFLAGS += -L$(MKLDNNROOT)/lib -lmkldnn -Wl,-rpath,'$${ORIGIN}'
+	# MKLDNN needs to be dynamically linked on Windows, as not all VS compilers support static linking
+	ifneq ($(UNAME_S), Windows)
+		LIB_DEP += $(MKLDNNROOT)/lib/libmkldnn.a
+	else
+		CFLAGS += -I$(MKLDNNROOT)/include
+		LDFLAGS += -L$(MKLDNNROOT)/lib -lmkldnn -Wl,-rpath,'$${ORIGIN}'
+	endif
 endif
 
 # setup opencv
diff --git a/ci/docker/runtime_functions.sh b/ci/docker/runtime_functions.sh
index 1fc10bf0e085..5a44cccc6aa0 100755
--- a/ci/docker/runtime_functions.sh
+++ b/ci/docker/runtime_functions.sh
@@ -629,9 +629,6 @@ build_ubuntu_gpu_cmake_mkldnn() {
         /work/mxnet
 
     ninja -v
-    # libmkldnn.so.0 is a link file. We need an actual binary file named libmkldnn.so.0.
-    cp 3rdparty/mkldnn/src/libmkldnn.so.0 3rdparty/mkldnn/src/libmkldnn.so.0.tmp
-    mv 3rdparty/mkldnn/src/libmkldnn.so.0.tmp 3rdparty/mkldnn/src/libmkldnn.so.0
 }
 
 build_ubuntu_gpu_cmake() {
diff --git a/ci/jenkins/Jenkins_steps.groovy b/ci/jenkins/Jenkins_steps.groovy
index f48a26737308..309775c88c85 100644
--- a/ci/jenkins/Jenkins_steps.groovy
+++ b/ci/jenkins/Jenkins_steps.groovy
@@ -23,19 +23,19 @@ utils = load('ci/Jenkinsfile_utils.groovy')
 
 // mxnet libraries
-mx_lib = 'lib/libmxnet.so, lib/libmxnet.a, 3rdparty/dmlc-core/libdmlc.a, 3rdparty/tvm/nnvm/lib/libnnvm.a'
+mx_lib = 'lib/libmxnet.so, lib/libmxnet.a, lib/libiomp5.so, lib/libmklml_intel.so, 3rdparty/dmlc-core/libdmlc.a, 3rdparty/tvm/nnvm/lib/libnnvm.a'
 
 // Python wheels
 mx_pip = 'build/*.whl'
 
 // for scala build, need to pass extra libs when run with dist_kvstore
-mx_dist_lib = 'lib/libmxnet.so, lib/libmxnet.a, 3rdparty/dmlc-core/libdmlc.a, 3rdparty/tvm/nnvm/lib/libnnvm.a, 3rdparty/ps-lite/build/libps.a, deps/lib/libprotobuf-lite.a, deps/lib/libzmq.a'
+mx_dist_lib = 'lib/libmxnet.so, lib/libmxnet.a, 3rdparty/dmlc-core/libdmlc.a, 3rdparty/tvm/nnvm/lib/libnnvm.a, 3rdparty/ps-lite/build/libps.a, deps/lib/libprotobuf-lite.a, deps/lib/libzmq.a, lib/libmkldnn.a'
 
 // mxnet cmake libraries, in cmake builds we do not produce a libnvvm static library by default.
 mx_cmake_lib = 'build/libmxnet.so, build/libmxnet.a, build/3rdparty/dmlc-core/libdmlc.a, build/tests/mxnet_unit_tests, build/3rdparty/openmp/runtime/src/libomp.so'
 // mxnet cmake libraries, in cmake builds we do not produce a libnvvm static library by default.
mx_cmake_lib_debug = 'build/libmxnet.so, build/libmxnet.a, build/3rdparty/dmlc-core/libdmlc.a, build/tests/mxnet_unit_tests' -mx_cmake_mkldnn_lib = 'build/libmxnet.so, build/libmxnet.a, build/3rdparty/dmlc-core/libdmlc.a, build/tests/mxnet_unit_tests, build/3rdparty/openmp/runtime/src/libomp.so, build/3rdparty/mkldnn/src/libmkldnn.so.0' -mx_mkldnn_lib = 'lib/libmxnet.so, lib/libmxnet.a, lib/libiomp5.so, lib/libmkldnn.so.0, lib/libmklml_intel.so, 3rdparty/dmlc-core/libdmlc.a, 3rdparty/tvm/nnvm/lib/libnnvm.a' +mx_cmake_mkldnn_lib = 'build/libmxnet.so, build/libmxnet.a, build/3rdparty/dmlc-core/libdmlc.a, build/tests/mxnet_unit_tests, build/3rdparty/openmp/runtime/src/libomp.so' +mx_mkldnn_lib = 'lib/libmxnet.so, lib/libmxnet.a, lib/libiomp5.so, lib/libmklml_intel.so, 3rdparty/dmlc-core/libdmlc.a, 3rdparty/tvm/nnvm/lib/libnnvm.a' mx_tensorrt_lib = 'lib/libmxnet.so, lib/libnvonnxparser_runtime.so.0, lib/libnvonnxparser.so.0, lib/libonnx_proto.so, lib/libonnx.so' mx_lib_cpp_examples = 'lib/libmxnet.so, lib/libmxnet.a, 3rdparty/dmlc-core/libdmlc.a, 3rdparty/tvm/nnvm/lib/libnnvm.a, 3rdparty/ps-lite/build/libps.a, deps/lib/libprotobuf-lite.a, deps/lib/libzmq.a, build/cpp-package/example/*' mx_lib_cpp_examples_cpu = 'build/libmxnet.so, build/cpp-package/example/*' diff --git a/mkldnn.mk b/mkldnn.mk index d79bbe7d2a0e..5af3e9b1d741 100644 --- a/mkldnn.mk +++ b/mkldnn.mk @@ -19,14 +19,20 @@ ifeq ($(USE_MKLDNN), 1) MKLDNN_SUBMODDIR = $(ROOTDIR)/3rdparty/mkldnn MKLDNN_BUILDDIR = $(MKLDNN_SUBMODDIR)/build MXNET_LIBDIR = $(ROOTDIR)/lib + MKLDNN_LIBRARY_TYPE=STATIC ifeq ($(UNAME_S), Darwin) OMP_LIBFILE = $(MKLDNNROOT)/lib/libiomp5.dylib MKLML_LIBFILE = $(MKLDNNROOT)/lib/libmklml.dylib - MKLDNN_LIBFILE = $(MKLDNNROOT)/lib/libmkldnn.0.dylib + MKLDNN_LIBFILE = $(MKLDNNROOT)/lib/libmkldnn.a +else ifeq ($(UNAME_S), Windows) + OMP_LIBFILE = $(MKLDNNROOT)/lib/libiomp5.so + MKLML_LIBFILE = $(MKLDNNROOT)/lib/libmklml_intel.so + MKLDNN_LIBFILE = $(MKLDNNROOT)/lib/libmkldnn.so + MKLDNN_LIBRARY_TYPE=SHARED else OMP_LIBFILE = $(MKLDNNROOT)/lib/libiomp5.so MKLML_LIBFILE = $(MKLDNNROOT)/lib/libmklml_intel.so - MKLDNN_LIBFILE = $(MKLDNNROOT)/lib/libmkldnn.so.0 + MKLDNN_LIBFILE = $(MKLDNNROOT)/lib/libmkldnn.a endif endif @@ -37,7 +43,7 @@ mkldnn_build: $(MKLDNN_LIBFILE) $(MKLDNN_LIBFILE): mkdir -p $(MKLDNNROOT) cd $(MKLDNN_SUBMODDIR) && rm -rf external && cd scripts && ./prepare_mkl.sh && cd .. && cp -a external/*/* $(MKLDNNROOT)/. 
- cmake $(MKLDNN_SUBMODDIR) -DCMAKE_INSTALL_PREFIX=$(MKLDNNROOT) -B$(MKLDNN_BUILDDIR) -DARCH_OPT_FLAGS="-mtune=generic" -DWITH_TEST=OFF -DWITH_EXAMPLE=OFF + cmake $(MKLDNN_SUBMODDIR) -DCMAKE_INSTALL_PREFIX=$(MKLDNNROOT) -B$(MKLDNN_BUILDDIR) -DARCH_OPT_FLAGS="-mtune=generic" -DWITH_TEST=OFF -DWITH_EXAMPLE=OFF -DMKLDNN_LIBRARY_TYPE=$(MKLDNN_LIBRARY_TYPE) $(MAKE) -C $(MKLDNN_BUILDDIR) VERBOSE=1 $(MAKE) -C $(MKLDNN_BUILDDIR) install mkdir -p $(MXNET_LIBDIR) diff --git a/tests/cpp/unittest.mk b/tests/cpp/unittest.mk index 746ee2f096f1..665ce6982874 100644 --- a/tests/cpp/unittest.mk +++ b/tests/cpp/unittest.mk @@ -41,22 +41,22 @@ gtest-all.o : $(GTEST_SRCS_) gtest.a : gtest-all.o $(AR) $(ARFLAGS) $@ $^ -build/tests/cpp/%.o : tests/cpp/%.cc | mkldnn +build/tests/cpp/%.o : tests/cpp/%.cc @mkdir -p $(@D) $(CXX) -std=c++11 $(TEST_CFLAGS) -I$(GTEST_INC) -MM -MT tests/cpp/$* $< > build/tests/cpp/$*.d $(CXX) -c -std=c++11 $(TEST_CFLAGS) -I$(GTEST_INC) -o build/tests/cpp/$*.o $(filter %.cc %.a, $^) -build/tests/cpp/operator/%.o : tests/cpp/operator/%.cc | mkldnn +build/tests/cpp/operator/%.o : tests/cpp/operator/%.cc @mkdir -p $(@D) $(CXX) -std=c++11 $(TEST_CFLAGS) -I$(GTEST_INC) -MM -MT tests/cpp/operator/$* $< > build/tests/cpp/operator/$*.d $(CXX) -c -std=c++11 $(TEST_CFLAGS) -I$(GTEST_INC) -o build/tests/cpp/operator/$*.o $(filter %.cc %.a, $^) -build/tests/cpp/storage/%.o : tests/cpp/storage/%.cc | mkldnn +build/tests/cpp/storage/%.o : tests/cpp/storage/%.cc @mkdir -p $(@D) $(CXX) -std=c++11 $(TEST_CFLAGS) -I$(GTEST_INC) -MM -MT tests/cpp/storage/$* $< > build/tests/cpp/storage/$*.d $(CXX) -c -std=c++11 $(TEST_CFLAGS) -I$(GTEST_INC) -o build/tests/cpp/storage/$*.o $(filter %.cc %.a, $^) -build/tests/cpp/engine/%.o : tests/cpp/engine/%.cc | mkldnn +build/tests/cpp/engine/%.o : tests/cpp/engine/%.cc @mkdir -p $(@D) $(CXX) -std=c++11 $(TEST_CFLAGS) -I$(GTEST_INC) -MM -MT tests/cpp/engine/$* $< > build/tests/cpp/engine/$*.d $(CXX) -c -std=c++11 $(TEST_CFLAGS) -I$(GTEST_INC) -o build/tests/cpp/engine/$*.o $(filter %.cc %.a, $^) diff --git a/tests/python/mkl/test_mkldnn.py b/tests/python/mkl/test_mkldnn.py index c6c0a0832f1f..d9d3abfc3ced 100644 --- a/tests/python/mkl/test_mkldnn.py +++ b/tests/python/mkl/test_mkldnn.py @@ -27,7 +27,6 @@ from mxnet import gluon from mxnet.gluon import nn from mxnet.test_utils import * -import test_mkldnn_install as install curr_path = os.path.dirname(os.path.abspath(os.path.expanduser(__file__))) sys.path.append(os.path.join(curr_path, '../unittest/')) from common import with_seed @@ -441,7 +440,4 @@ def backward(self, req, out_grad, in_data, out_data, in_grad, aux): custom = mx.symbol.Custom(name='custom', data=conv, op_type='custom') exec1 = custom.bind(mx.cpu(), args={'data': mx.nd.ones([10,3,96,96]), 'conv_weight': mx.nd.ones([8,3,5,5])}) exec1.forward()[0].wait_to_read() - - -if __name__ == '__main__': - install.test_mkldnn_install() + diff --git a/tests/python/mkl/test_mkldnn_install.py b/tests/python/mkl/test_mkldnn_install.py deleted file mode 100644 index c2f26df72f2e..000000000000 --- a/tests/python/mkl/test_mkldnn_install.py +++ /dev/null @@ -1,56 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. 
You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -""" -MKL-DNN related test cases -""" - -import sys -import os -import logging - - -def test_mkldnn_install(): - """ - This test will verify that MXNet is built/installed correctly when - compiled with Intel MKL-DNN library. The method will try to import - the mxnet module and see if the mkldnn library is mapped to this - process's address space. - """ - logging.basicConfig(level=logging.INFO) - - if not sys.platform.startswith('linux'): - logging.info("Bypass mkldnn install test for non-Linux OS") - return - - try: - #pylint: disable=unused-variable - import mxnet as mx - except (ImportError, OSError) as e: - assert 0, "Import mxnet error: %s. Please double check your build/" \ - "install steps or environment variable settings" % str(e) - - pid = os.getpid() - rc = os.system("cat /proc/" + str(pid) + - "/maps | grep libmkldnn > /dev/null") - - if rc == 0: - logging.info("MXNet is built/installed correctly with MKL-DNN") - else: - assert 0, "MXNet is built/installed incorrectly with MKL-DNN, please " \ - "double check your build/install steps or environment " \ - "variable settings" From 41f3f9805aafdc36873a1543eb69249988d1e3ca Mon Sep 17 00:00:00 2001 From: Aaron Markham Date: Mon, 3 Dec 2018 17:27:41 -0800 Subject: [PATCH 26/28] fix toctree Sphinx errors (#13489) * fix toctree errors * nudging file for CI --- docs/api/index.md | 2 ++ docs/tutorials/index.md | 3 ++- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/docs/api/index.md b/docs/api/index.md index eff6807678ea..9e7a58f7778c 100644 --- a/docs/api/index.md +++ b/docs/api/index.md @@ -1,11 +1,13 @@ # MXNet APIs + ```eval_rst .. toctree:: :maxdepth: 1 c++/index.md clojure/index.md + java/index.md julia/index.md perl/index.md python/index.md diff --git a/docs/tutorials/index.md b/docs/tutorials/index.md index 52e2be8f6a2b..7d102bb88f89 100644 --- a/docs/tutorials/index.md +++ b/docs/tutorials/index.md @@ -3,12 +3,13 @@ ```eval_rst .. 
toctree::
    :hidden:
-    
+
     basic/index.md
     c++/index.md
     control_flow/index.md
     embedded/index.md
     gluon/index.md
+    java/index.md
     nlp/index.md
     onnx/index.md
     python/index.md

From e533304cdfbfcfd5cb3e2ec3dc9225174ec85586 Mon Sep 17 00:00:00 2001
From: Jose Luis Contreras
Date: Tue, 4 Dec 2018 14:12:36 +0100
Subject: [PATCH 27/28] Disabled flaky test
 test_gluon_data.test_recordimage_dataset_with_data_loader_multiworker (#13527)

---
 tests/python/unittest/test_gluon_data.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tests/python/unittest/test_gluon_data.py b/tests/python/unittest/test_gluon_data.py
index e4206095f9ba..d043a7c6b802 100644
--- a/tests/python/unittest/test_gluon_data.py
+++ b/tests/python/unittest/test_gluon_data.py
@@ -78,6 +78,7 @@ def _dataset_transform_fn(x, y):
     return x, y
 
 @with_seed()
+@unittest.skip("Flaky test: https://github.com/apache/incubator-mxnet/issues/13484")
 def test_recordimage_dataset_with_data_loader_multiworker():
     recfile = prepare_record()
     dataset = gluon.data.vision.ImageRecordDataset(recfile)

From 7f3a591c16bbc9767a319d8976216e570945307e Mon Sep 17 00:00:00 2001
From: Pedro Larroy <928489+larroy@users.noreply.github.com>
Date: Tue, 4 Dec 2018 18:48:39 +0100
Subject: [PATCH 28/28] [MXNET-1234] Fix shape inference problems in
 Activation backward (#13409)

* Provide a failing test for ReLU activation shape inference bug

* Fix Activation backward shape inference

fixes: #13333

* Add softsign Activation to test_gluon.py

* Use activation in GPU if we are using CUDNN and not MKLDNN as it's happening right now

* Don't disable MKLDNN
---
 src/operator/elemwise_op_common.h     | 20 ++++---
 src/operator/nn/activation-inl.h      | 12 ++--
 src/operator/nn/activation.cc         | 79 ++++++++++++++++-----------
 src/operator/nn/activation.cu         | 30 ++++++----
 tests/cpp/operator/activation_perf.cc | 26 +++++++--
 tests/python/unittest/test_gluon.py   | 12 ++--
 6 files changed, 109 insertions(+), 70 deletions(-)

diff --git a/src/operator/elemwise_op_common.h b/src/operator/elemwise_op_common.h
index 4b8663bba6ea..e622ce216ad0 100644
--- a/src/operator/elemwise_op_common.h
+++ b/src/operator/elemwise_op_common.h
@@ -128,29 +128,33 @@ inline bool ElemwiseAttr(const nnvm::NodeAttrs& attrs,
   if (n_out != -1)
     out_size = static_cast<size_t>(n_out);
 
-  auto deduce = [&](std::vector<AttrType> *vec, size_t size, const char *name) {
+  CHECK_LE(in_size, in_attrs->size());
+  CHECK_LE(out_size, out_attrs->size());
+  auto deduce = [&](const std::vector<AttrType>& vec, size_t size, const char *name) {
     for (size_t i = 0; i < size; ++i) {
-      CHECK(assign(&dattr, (*vec)[i]))
+      CHECK(assign(&dattr, vec.at(i)))
         << "Incompatible attr in node " << attrs.name << " at " << i << "-th "
         << name << ": " << "expected " << attr_string(dattr)
-        << ", got " << attr_string((*vec)[i]);
+        << ", got " << attr_string(vec.at(i));
     }
   };
-  deduce(in_attrs, in_size, "input");
-  if (reverse_infer) deduce(out_attrs, out_size, "output");
+  deduce(*in_attrs, in_size, "input");
+  if (reverse_infer)
+    deduce(*out_attrs, out_size, "output");
 
   auto write = [&](std::vector<AttrType> *vec, size_t size, const char *name) {
     for (size_t i = 0; i < size; ++i) {
-      CHECK(assign(&(*vec)[i], dattr))
+      CHECK(assign(&(vec->at(i)), dattr))
         << "Incompatible attr in node " << attrs.name << " at " << i << "-th "
         << name << ": " << "expected " << attr_string(dattr)
         << ", got " << attr_string((*vec)[i]);
     }
   };
   write(in_attrs, in_size, "input");
   write(out_attrs, out_size, "output");
 
-  if (is_none(dattr)) return false;
+  if (is_none(dattr))
+    return false;
   return true;
 }

diff --git a/src/operator/nn/activation-inl.h b/src/operator/nn/activation-inl.h
index 2705177f951d..1d8e4c2b6cda 100644
--- a/src/operator/nn/activation-inl.h
+++ b/src/operator/nn/activation-inl.h
@@ -48,6 +48,9 @@ enum ActivationOpInputs {kData};
 enum ActivationOpOutputs {kOut};
 enum ActivationOpResource {kTempSpace};
 enum ActivationOpType {kReLU, kSigmoid, kTanh, kSoftReLU, kSoftSign};
+
+// Get the number of inputs to the gradient depending on the activation type
+int GradNumInputs(int act_type);
 }  // activation

 struct ActivationParam : public dmlc::Parameter<ActivationParam> {
@@ -199,13 +202,8 @@ void ActivationGradCompute(const nnvm::NodeAttrs& attrs,
                            const std::vector<OpReqType>& req,
                            const std::vector<TBlob>& outputs) {
   const ActivationParam& param = nnvm::get<ActivationParam>(attrs.parsed);
-#if (MXNET_USE_CUDNN == 1 || MXNET_USE_MKLDNN == 1)
-  bool relu = param.act_type == activation::kReLU;
-  CHECK_EQ(inputs.size(), relu ? 2U : 3U);
-#else
-  bool softsign = param.act_type == activation::kSoftSign;
-  CHECK_EQ(inputs.size(), softsign ? 3U : 2U);
-#endif
+  const int act_type = param.act_type;
+  CHECK_EQ(inputs.size(), activation::GradNumInputs(act_type));
   CHECK_EQ(outputs.size(), 1U);
   CHECK_EQ(req.size(), 1U);
   ActivationGradComputeImpl<xpu>(attrs, ctx, inputs, req, outputs);
diff --git a/src/operator/nn/activation.cc b/src/operator/nn/activation.cc
index ba44ebd4ed4d..305eeab21176 100644
--- a/src/operator/nn/activation.cc
+++ b/src/operator/nn/activation.cc
@@ -30,13 +30,34 @@
 #if MXNET_USE_MKLDNN == 1
 #include "./mkldnn/mkldnn_base-inl.h"
 #include "./mkldnn/mkldnn_ops-inl.h"
-#endif  // MXNET_USE_MKLDNN
+#endif  // MXNET_USE_MKLDNN == 1
 #include "../operator_common.h"
 #include "../../common/utils.h"

 namespace mxnet {
 namespace op {

+namespace activation {
+
+int GradNumInputs(int act_type) {
+  // check activation.cu \sa ActivationGradCompute
+  switch (act_type) {
+    case kReLU:
+      return 2;
+    case kSoftReLU:
+    case kSoftSign:
+    case kTanh:
+    case kSigmoid:
+      return 3;
+    default:
+      CHECK(false) << "missing activation type";
+  }
+  // unreachable
+  return -1;
+}
+
+}  // namespace activation
+
 DMLC_REGISTER_PARAMETER(ActivationParam);

 // This will determine the order of the inputs for backward computation.
@@ -44,24 +65,28 @@ struct ActivationGrad {
   const char *op_name;
   std::vector<nnvm::NodeEntry> operator()(const nnvm::NodePtr& n,
                                           const std::vector<nnvm::NodeEntry>& ograds) const {
+    // ograds, output...
     std::vector<nnvm::NodeEntry> heads(ograds.begin(), ograds.end());
     heads.emplace_back(nnvm::NodeEntry{n, activation::kOut, 0});

     const NodeAttrs& attrs = n->attrs;
+    using namespace activation;
     int act_type = dmlc::get<ActivationParam>(attrs.parsed).act_type;
-    if (act_type == activation::kSoftSign) {
-      // for softsign need the inputs to compute the activation.
-      heads.push_back(n->inputs[activation::kData]);
-    }
-
-#if (MXNET_USE_CUDNN == 1 || MXNET_USE_MKLDNN == 1)
     // for ReLU, no need to pass input data. This enables inplace optimization during the
     // forward pass.
-    if (act_type != activation::kReLU &&
-        act_type != activation::kSoftSign) {
-      heads.push_back(n->inputs[activation::kData]);
+    // check activation.cu \sa ActivationGradCompute
+    switch (act_type) {
+      case kReLU:
+        break;
+      case kSoftReLU:
+      case kSoftSign:
+      case kTanh:
+      case kSigmoid:
+        heads.push_back(n->inputs[activation::kData]);
+        break;
+      default:
+        CHECK(false) << "missing activation type";
     }
-#endif
     return MakeGradNode(op_name, n, heads, n->attrs.dict);
   }
 };
@@ -89,21 +114,19 @@ void ActivationGradComputeExCPU(const nnvm::NodeAttrs& attrs,
                                 const std::vector<OpReqType>& req,
                                 const std::vector<NDArray>& outputs) {
   const ActivationParam& param = nnvm::get<ActivationParam>(attrs.parsed);
-  bool relu = param.act_type == activation::kReLU;
-  CHECK_EQ(inputs.size(), relu ? 2U : 3U);
+  CHECK_EQ(inputs.size(), activation::GradNumInputs(param.act_type));
   if (SupportMKLDNN(inputs[0])) {
     MKLDNN_OPCHECK_INIT(true, outputs.size(), inputs, outputs);
     // XXX: for y = relu(x), y is passed as "in_data" to Backward()
-    MKLDNNActivationBackward(attrs, ctx, inputs[0], relu ? inputs[1] : inputs[2], req[0],
+    const bool relu = param.act_type == activation::kReLU;
+    MKLDNNActivationBackward(attrs, ctx, inputs.at(0), relu ? inputs.at(1) : inputs.at(2), req[0],
                              outputs[0]);
-    MKLDNN_OPCHECK_RUN(ActivationGradCompute<cpu>, attrs, ctx, inputs, req, outputs);
+    MKLDNN_OPCHECK_RUN(ActivationGradCompute<cpu>, attrs, ctx, inputs, req, outputs);
     return;
   }
   FallBackCompute(ActivationGradComputeImpl<cpu>, attrs, ctx, inputs, req, outputs);
 }
-#endif
-#if MXNET_USE_MKLDNN == 1
 inline static bool ActivationStorageType(const nnvm::NodeAttrs& attrs,
                                          const int dev_mask,
                                          DispatchMode* dispatch_mode,
@@ -122,16 +145,12 @@ inline static bool BackwardActStorageType(const nnvm::NodeAttrs& attrs,
                                           std::vector<int> *in_attrs,
                                           std::vector<int> *out_attrs) {
   const ActivationParam& param = nnvm::get<ActivationParam>(attrs.parsed);
-  if (param.act_type != activation::kReLU) {
-    CHECK_EQ(in_attrs->size(), 3U);
-  } else {
-    // for ReLU activation, the backward pass only needs ograd and output
-    CHECK_EQ(in_attrs->size(), 2U);
-  }
+  CHECK_EQ(in_attrs->size(), activation::GradNumInputs(param.act_type));
   return MKLDNNStorageType(attrs, dev_mask, SupportMKLDNNAct(param),
                            dispatch_mode, in_attrs, out_attrs);
 }
-#endif
+#endif  // MXNET_USE_MKLDNN == 1
+
 MXNET_OPERATOR_REGISTER_UNARY(Activation)
 .describe(R"code(Applies an activation function element-wise to the input.
@@ -163,18 +182,16 @@ The following activation functions are supported:

 NNVM_REGISTER_OP(_backward_Activation)
 .set_num_inputs([](const nnvm::NodeAttrs& attrs) {
-    int act_type = dmlc::get<ActivationParam>(attrs.parsed).act_type;
-    // for ReLU activation, the backward pass only needs ograd and output
-    if (act_type == activation::kReLU) return 2;
-    return 3;
-  })
+    const int act_type = dmlc::get<ActivationParam>(attrs.parsed).act_type;
+    return activation::GradNumInputs(act_type);
+})
 .set_num_outputs(1)
 .set_attr<nnvm::TIsBackward>("TIsBackward", true)
 #if MXNET_USE_MKLDNN == 1
 .set_attr<FInferStorageType>("FInferStorageType", BackwardActStorageType)
 #endif
-.set_attr<nnvm::FInferShape>("FInferShape", ElemwiseShape<3, 1>)
-.set_attr<nnvm::FInferType>("FInferType", ElemwiseType<3, 1>)
+.set_attr<nnvm::FInferShape>("FInferShape", ElemwiseShape<-1, 1>)
+.set_attr<nnvm::FInferType>("FInferType", ElemwiseType<-1, 1>)
 .set_attr<nnvm::FInplaceOption>("FInplaceOption", [](const NodeAttrs& attrs){
   return std::vector<std::pair<int, int> >{{0, 0}};
 })
diff --git a/src/operator/nn/activation.cu b/src/operator/nn/activation.cu
index 8892cc34f710..ec7db844b100 100644
--- a/src/operator/nn/activation.cu
+++ b/src/operator/nn/activation.cu
@@ -54,12 +54,13 @@ void ActivationCompute(const nnvm::NodeAttrs& attrs,
   CHECK_EQ(inputs.size(), 1U);
   CHECK_EQ(outputs.size(), 1U);
   const ActivationParam& param = nnvm::get<ActivationParam>(attrs.parsed);
+  const int act_type = param.act_type;
   // SoftReLU and kSoftSign are both not supported by CUDNN yet
-  if (param.act_type == activation::kSoftReLU) {
+  if (act_type == activation::kSoftReLU) {
     ActivationForward(ctx, inputs[0], req[0], outputs[0]);
-  } else if (param.act_type == activation::kSoftSign) {
+  } else if (act_type == activation::kSoftSign) {
     ActivationForward(ctx, inputs[0], req[0], outputs[0]);
   } else {
@@ -76,23 +77,28 @@ void ActivationGradCompute(const nnvm::NodeAttrs& attrs,
                            const std::vector<OpReqType>& req,
                            const std::vector<TBlob>& outputs) {
   const ActivationParam& param = nnvm::get<ActivationParam>(attrs.parsed);
-  bool relu = param.act_type == activation::kReLU;
-  CHECK_EQ(inputs.size(), relu ? 2U : 3U);
+  const int act_type = param.act_type;
+  CHECK_EQ(inputs.size(), activation::GradNumInputs(act_type));
   CHECK_EQ(outputs.size(), 1U);
   CHECK_EQ(req.size(), 1U);

   // both SoftReLU and SoftSign not supported by CUDNN yet
-  if (param.act_type == activation::kSoftReLU) {
+  if (act_type == activation::kSoftReLU) {
     ActivationBackward(
-      ctx, inputs[0], inputs[1], req[0], outputs[0]);
-  } else if (param.act_type == activation::kSoftSign) {
+      ctx, inputs.at(0), inputs.at(1), req[0], outputs[0]);
+  } else if (act_type == activation::kSoftSign) {
     ActivationBackward(
-      ctx, inputs[0], inputs[2], req[0], outputs[0]);
-  } else {
-    MSHADOW_REAL_TYPE_SWITCH(inputs[0].type_flag_, DType, {
+      ctx, inputs.at(0), inputs.at(2), req[0], outputs[0]);
+  } else if (act_type == activation::kReLU) {
+    MSHADOW_REAL_TYPE_SWITCH(inputs.at(0).type_flag_, DType, {
       // XXX: for y = relu(x), y is passed as "in_data" to Backward()
-      get_cudnn_op<DType>(param).Backward(ctx, inputs[0], relu ? inputs[1] : inputs[2],
-                                          inputs[1], req[0], outputs[0]);
+      get_cudnn_op<DType>(param).Backward(ctx, inputs.at(0), inputs.at(1),
+                                          inputs.at(1), req[0], outputs[0]);
+    });
+  } else {
+    MSHADOW_REAL_TYPE_SWITCH(inputs.at(0).type_flag_, DType, {
+      get_cudnn_op<DType>(param).Backward(ctx, inputs.at(0), inputs.at(2),
+                                          inputs.at(1), req[0], outputs[0]);
     });
   }
 }
diff --git a/tests/cpp/operator/activation_perf.cc b/tests/cpp/operator/activation_perf.cc
index 1bd8ca89c9f5..bba8a3ec5722 100644
--- a/tests/cpp/operator/activation_perf.cc
+++ b/tests/cpp/operator/activation_perf.cc
@@ -38,13 +38,27 @@ const kwargs_t basic_activation_args = { };
  * \brief Generic bidirectional sanity test
  */
 TEST(ACTIVATION_PERF, ExecuteBidirectional) {
+  using namespace std;
   TShape shape({5, 5});
-  kwargs_t kwargs = basic_activation_args;
-  kwargs.push_back({"act_type", "tanh"});
-
-  test::op::CoreOperatorRunner<float> runner;
-  runner.RunBidirectional(false, { shape }, test::op::CoreOpExecutor<float>::ArgsWithOpName(
-    kwargs, "Activation", "_backward_Activation"), 1);
+  vector<string> activations = {
+    "relu",
+    "sigmoid",
+    "tanh",
+    "softrelu",
+    "softsign"
+  };
+  for (const string& activation : activations) {
+    kwargs_t activation_args = {{"act_type", activation}};
+    test::op::CoreOperatorRunner<float> runner;
+    runner.RunBidirectional(false, { shape }, test::op::CoreOpExecutor<float>::ArgsWithOpName(
+        activation_args, "Activation", "_backward_Activation"), 1);
+  }
+  for (const string& activation : activations) {
+    kwargs_t activation_args = {{"act_type", activation}};
+    test::op::CoreOperatorRunner<float> runner;
+    runner.RunBidirectional(true, { shape }, test::op::CoreOpExecutor<float>::ArgsWithOpName(
+        activation_args, "Activation", "_backward_Activation"), 1);
+  }
 }

 /*!
diff --git a/tests/python/unittest/test_gluon.py b/tests/python/unittest/test_gluon.py
index 3049674821c9..abe6b136fe0c 100644
--- a/tests/python/unittest/test_gluon.py
+++ b/tests/python/unittest/test_gluon.py
@@ -2411,7 +2411,7 @@ def hybrid_forward(self, F, x):
             x_reshape = x.reshape(self.reshape)
             out = self.act(x_reshape)
             return out
-        acts = ["relu", "sigmoid", "tanh", "softrelu"]
+        acts = ["relu", "sigmoid", "tanh", "softrelu", "softsign"]
         for act in acts:
             x = mx.nd.random.uniform(-1, 1, shape=(4, 16, 32, 32))
             shape = (4, 32, 32, -1)
@@ -2433,7 +2433,7 @@ def hybrid_forward(self, F, x):
             out = self.act(x_slice)
             return out
-        acts = ["relu", "sigmoid", "tanh", "softrelu"]
+        acts = ["relu", "sigmoid", "tanh", "softrelu", "softsign"]
         for act in acts:
             x = mx.nd.random.uniform(-1, 1, shape=(8, 32, 64, 64))
             slice = [(0, 16, 32, 32), (4, 32, 64, 64)]
@@ -2457,7 +2457,7 @@ def hybrid_forward(self, F, x):
             y_reshape = y.reshape(self.reshape[1])
             out = self.act1(y_reshape)
             return out
-        acts = ["relu", "sigmoid", "tanh", "softrelu"]
+        acts = ["relu", "sigmoid", "tanh", "softrelu", "softsign"]
         for idx0, act0 in enumerate(acts):
             for idx1, act1 in enumerate(acts):
                 if idx1 == idx0:
@@ -2484,7 +2484,7 @@ def hybrid_forward(self, F, x):
             y_slice = y.slice(begin=self.slice[1][0], end=self.slice[1][1])
             out = self.act1(y_slice)
             return out
-        acts = ["relu", "sigmoid", "tanh", "softrelu"]
+        acts = ["relu", "sigmoid", "tanh", "softrelu", "softsign"]
         for idx0, act0 in enumerate(acts):
             for idx1, act1 in enumerate(acts):
                 if idx1 == idx0:
@@ -2512,7 +2512,7 @@ def hybrid_forward(self, F, x):
             y_slice = y.slice(begin=self.slice[0], end=self.slice[1])
             out = self.act1(y_slice)
             return out
-        acts = ["relu", "sigmoid", "tanh", "softrelu"]
+        acts = ["relu", "sigmoid", "tanh", "softrelu", "softsign"]
         for idx0, act0 in enumerate(acts):
             for idx1, act1 in enumerate(acts):
                 if idx1 == idx0:
@@ -2541,7 +2541,7 @@ def hybrid_forward(self, F, x):
             y_reshape = y.reshape(self.reshape)
             out = self.act1(y_reshape)
             return out
-        acts = ["relu", "sigmoid", "tanh", "softrelu"]
+        acts = ["relu", "sigmoid", "tanh", "softrelu", "softsign"]
         for idx0, act0 in enumerate(acts):
             for idx1, act1 in enumerate(acts):
                 if idx1 == idx0:
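
For context on the test changes above: they add "softsign" to every activation
list, exercising the new variadic backward shape inference (ElemwiseShape<-1, 1>)
through reshape and slice. A minimal standalone sketch of that test pattern
(assumes MXNet ~1.5 with Gluon; the block name and shapes are illustrative,
not taken from the patch):

    import mxnet as mx
    from mxnet import autograd, gluon
    from mxnet.gluon import nn

    class ReshapeActivation(gluon.HybridBlock):
        # hypothetical helper: reshape, then apply a configurable activation
        def __init__(self, act_type, target_shape, **kwargs):
            super(ReshapeActivation, self).__init__(**kwargs)
            self.target_shape = target_shape
            with self.name_scope():
                self.act = nn.Activation(act_type)

        def hybrid_forward(self, F, x):
            return self.act(x.reshape(self.target_shape))

    for act in ["relu", "sigmoid", "tanh", "softrelu", "softsign"]:
        net = ReshapeActivation(act, (4, 32, 32, -1))
        net.initialize()
        net.hybridize()
        x = mx.nd.random.uniform(-1, 1, shape=(4, 16, 32, 32))
        x.attach_grad()
        with autograd.record():
            y = net(x)
        y.backward()  # backward shape inference must succeed for every act_type
        assert x.grad.shape == x.shape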