From 4c17c030208c67b2f68808d12d5a79996cfaf4ba Mon Sep 17 00:00:00 2001 From: Haibin Lin Date: Sat, 27 Jan 2018 18:22:50 -0800 Subject: [PATCH 01/18] Bump 1.1 (#192) * bump * also update base.h * revert website changes * Update index.html --- R-package/DESCRIPTION | 2 +- docs/_static/mxnet-theme/index.html | 6 +++--- include/mxnet/base.h | 4 ++-- python/mxnet/libinfo.py | 2 +- scala-package/assembly/linux-x86_64-cpu/pom.xml | 6 +++--- scala-package/assembly/linux-x86_64-gpu/pom.xml | 6 +++--- scala-package/assembly/osx-x86_64-cpu/pom.xml | 6 +++--- scala-package/assembly/pom.xml | 2 +- scala-package/core/pom.xml | 6 +++--- scala-package/examples/pom.xml | 4 ++-- scala-package/init-native/linux-x86_64/pom.xml | 4 ++-- scala-package/init-native/osx-x86_64/pom.xml | 4 ++-- scala-package/init-native/pom.xml | 2 +- scala-package/init/pom.xml | 2 +- scala-package/macros/pom.xml | 6 +++--- scala-package/native/linux-x86_64-cpu/pom.xml | 4 ++-- scala-package/native/linux-x86_64-gpu/pom.xml | 4 ++-- scala-package/native/osx-x86_64-cpu/pom.xml | 4 ++-- scala-package/native/pom.xml | 2 +- scala-package/pom.xml | 2 +- scala-package/spark/pom.xml | 4 ++-- snapcraft.yaml | 2 +- 22 files changed, 42 insertions(+), 42 deletions(-) diff --git a/R-package/DESCRIPTION b/R-package/DESCRIPTION index 2996eed5db81..0ec7f3667cba 100644 --- a/R-package/DESCRIPTION +++ b/R-package/DESCRIPTION @@ -1,7 +1,7 @@ Package: mxnet Type: Package Title: MXNet: A Flexible and Efficient Machine Learning Library for Heterogeneous Distributed Systems -Version: 1.0.1 +Version: 1.1.0 Date: 2017-06-27 Author: Tianqi Chen, Qiang Kou, Tong He Maintainer: Qiang Kou diff --git a/docs/_static/mxnet-theme/index.html b/docs/_static/mxnet-theme/index.html index 9dfb7d6268fa..3b48832a03cd 100644 --- a/docs/_static/mxnet-theme/index.html +++ b/docs/_static/mxnet-theme/index.html @@ -21,9 +21,9 @@
-        Apache MXNet 1.0.1 Released
-        We're excited to announce the release of MXNet 1.0.1! Check out the release notes for latest updates.
-        Learn More
+        Apache MXNet 1.1.0 Released
+        We're excited to announce the release of MXNet 1.1.0! Check out the release notes for latest updates.
+        Learn More
         MXNet Model Server
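The base.h hunk that follows bumps the packed version constant. As a quick sanity check of that encoding (an illustrative Python sketch, not part of the patch):

```python
# Mirrors the MXNET_VERSION macro arithmetic in include/mxnet/base.h:
# MXNET_MAJOR*10000 + MXNET_MINOR*100 + MXNET_PATCH
def mxnet_version(major, minor, patch):
    return major * 10000 + minor * 100 + patch

assert mxnet_version(1, 0, 1) == 10001  # before this patch
assert mxnet_version(1, 1, 0) == 10100  # after this patch
```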

diff --git a/include/mxnet/base.h b/include/mxnet/base.h index 262c348eb88f..faf2fe1097ad 100644 --- a/include/mxnet/base.h +++ b/include/mxnet/base.h @@ -111,9 +111,9 @@ /*! \brief major version */ #define MXNET_MAJOR 1 /*! \brief minor version */ -#define MXNET_MINOR 0 +#define MXNET_MINOR 1 /*! \brief patch version */ -#define MXNET_PATCH 1 +#define MXNET_PATCH 0 /*! \brief mxnet version */ #define MXNET_VERSION (MXNET_MAJOR*10000 + MXNET_MINOR*100 + MXNET_PATCH) /*! \brief helper for making version number */ diff --git a/python/mxnet/libinfo.py b/python/mxnet/libinfo.py index 9ab0f5960a83..8ccac29d5f78 100644 --- a/python/mxnet/libinfo.py +++ b/python/mxnet/libinfo.py @@ -61,4 +61,4 @@ def find_lib_path(): # current version -__version__ = "1.0.1" +__version__ = "1.1.0" diff --git a/scala-package/assembly/linux-x86_64-cpu/pom.xml b/scala-package/assembly/linux-x86_64-cpu/pom.xml index 75f2d2cdcb4b..cbcd7acdafdc 100644 --- a/scala-package/assembly/linux-x86_64-cpu/pom.xml +++ b/scala-package/assembly/linux-x86_64-cpu/pom.xml @@ -6,7 +6,7 @@ ml.dmlc.mxnet mxnet-full-parent_2.11 - 1.0.1-SNAPSHOT + 1.1.0-SNAPSHOT ../pom.xml @@ -18,12 +18,12 @@ ml.dmlc.mxnet mxnet-core_${scala.binary.version} - 1.0.1-SNAPSHOT + 1.1.0-SNAPSHOT ml.dmlc.mxnet libmxnet-scala-linux-x86_64-cpu - 1.0.1-SNAPSHOT + 1.1.0-SNAPSHOT so diff --git a/scala-package/assembly/linux-x86_64-gpu/pom.xml b/scala-package/assembly/linux-x86_64-gpu/pom.xml index 7c7162dbec2e..cfe22e7eea71 100644 --- a/scala-package/assembly/linux-x86_64-gpu/pom.xml +++ b/scala-package/assembly/linux-x86_64-gpu/pom.xml @@ -6,7 +6,7 @@ ml.dmlc.mxnet mxnet-full-parent_2.11 - 1.0.1-SNAPSHOT + 1.1.0-SNAPSHOT ../pom.xml @@ -18,12 +18,12 @@ ml.dmlc.mxnet mxnet-core_${scala.binary.version} - 1.0.1-SNAPSHOT + 1.1.0-SNAPSHOT ml.dmlc.mxnet libmxnet-scala-linux-x86_64-gpu - 1.0.1-SNAPSHOT + 1.1.0-SNAPSHOT so diff --git a/scala-package/assembly/osx-x86_64-cpu/pom.xml b/scala-package/assembly/osx-x86_64-cpu/pom.xml index 0b5c4e20b49d..7f7f1ab75c3f 100644 --- a/scala-package/assembly/osx-x86_64-cpu/pom.xml +++ b/scala-package/assembly/osx-x86_64-cpu/pom.xml @@ -6,7 +6,7 @@ ml.dmlc.mxnet mxnet-full-parent_2.11 - 1.0.1-SNAPSHOT + 1.1.0-SNAPSHOT ../pom.xml @@ -18,12 +18,12 @@ ml.dmlc.mxnet mxnet-core_${scala.binary.version} - 1.0.1-SNAPSHOT + 1.1.0-SNAPSHOT ml.dmlc.mxnet libmxnet-scala-osx-x86_64-cpu - 1.0.1-SNAPSHOT + 1.1.0-SNAPSHOT jnilib diff --git a/scala-package/assembly/pom.xml b/scala-package/assembly/pom.xml index efa3b75b155c..a755d7cb8465 100644 --- a/scala-package/assembly/pom.xml +++ b/scala-package/assembly/pom.xml @@ -6,7 +6,7 @@ ml.dmlc.mxnet mxnet-parent_2.11 - 1.0.1-SNAPSHOT + 1.1.0-SNAPSHOT ../pom.xml diff --git a/scala-package/core/pom.xml b/scala-package/core/pom.xml index b7219064a58d..0df704750571 100644 --- a/scala-package/core/pom.xml +++ b/scala-package/core/pom.xml @@ -6,7 +6,7 @@ ml.dmlc.mxnet mxnet-parent_2.11 - 1.0.1-SNAPSHOT + 1.1.0-SNAPSHOT ../pom.xml @@ -71,13 +71,13 @@ ml.dmlc.mxnet mxnet-init_${scala.binary.version} - 1.0.1-SNAPSHOT + 1.1.0-SNAPSHOT provided ml.dmlc.mxnet mxnet-macros_${scala.binary.version} - 1.0.1-SNAPSHOT + 1.1.0-SNAPSHOT provided diff --git a/scala-package/examples/pom.xml b/scala-package/examples/pom.xml index 87ce89847246..a23b7b91f6bd 100644 --- a/scala-package/examples/pom.xml +++ b/scala-package/examples/pom.xml @@ -6,7 +6,7 @@ ml.dmlc.mxnet mxnet-parent_2.11 - 1.0.1-SNAPSHOT + 1.1.0-SNAPSHOT ../pom.xml @@ -118,7 +118,7 @@ ml.dmlc.mxnet mxnet-core_${scala.binary.version} - 1.0.1-SNAPSHOT + 1.1.0-SNAPSHOT 
provided diff --git a/scala-package/init-native/linux-x86_64/pom.xml b/scala-package/init-native/linux-x86_64/pom.xml index 0d15a6cb117e..848d1e14ca77 100644 --- a/scala-package/init-native/linux-x86_64/pom.xml +++ b/scala-package/init-native/linux-x86_64/pom.xml @@ -6,7 +6,7 @@ ml.dmlc.mxnet mxnet-scala-init-native-parent - 1.0.1-SNAPSHOT + 1.1.0-SNAPSHOT ../pom.xml @@ -20,7 +20,7 @@ ml.dmlc.mxnet mxnet-init_${scala.binary.version} - 1.0.1-SNAPSHOT + 1.1.0-SNAPSHOT jar compile diff --git a/scala-package/init-native/osx-x86_64/pom.xml b/scala-package/init-native/osx-x86_64/pom.xml index e2af2de982d8..f6b865f145c5 100644 --- a/scala-package/init-native/osx-x86_64/pom.xml +++ b/scala-package/init-native/osx-x86_64/pom.xml @@ -6,7 +6,7 @@ ml.dmlc.mxnet mxnet-scala-init-native-parent - 1.0.1-SNAPSHOT + 1.1.0-SNAPSHOT ../pom.xml @@ -20,7 +20,7 @@ ml.dmlc.mxnet mxnet-init_${scala.binary.version} - 1.0.1-SNAPSHOT + 1.1.0-SNAPSHOT jar compile diff --git a/scala-package/init-native/pom.xml b/scala-package/init-native/pom.xml index 7ca19f1b07c7..672409981b2c 100644 --- a/scala-package/init-native/pom.xml +++ b/scala-package/init-native/pom.xml @@ -6,7 +6,7 @@ ml.dmlc.mxnet mxnet-parent_2.11 - 1.0.1-SNAPSHOT + 1.1.0-SNAPSHOT ../pom.xml diff --git a/scala-package/init/pom.xml b/scala-package/init/pom.xml index 972b8948413a..26130a437391 100644 --- a/scala-package/init/pom.xml +++ b/scala-package/init/pom.xml @@ -6,7 +6,7 @@ ml.dmlc.mxnet mxnet-parent_2.11 - 1.0.1-SNAPSHOT + 1.1.0-SNAPSHOT diff --git a/scala-package/macros/pom.xml b/scala-package/macros/pom.xml index dea908a8dc22..65dcbbc8f706 100644 --- a/scala-package/macros/pom.xml +++ b/scala-package/macros/pom.xml @@ -6,7 +6,7 @@ ml.dmlc.mxnet mxnet-parent_2.11 - 1.0.1-SNAPSHOT + 1.1.0-SNAPSHOT ../pom.xml @@ -41,13 +41,13 @@ ml.dmlc.mxnet mxnet-init_${scala.binary.version} - 1.0.1-SNAPSHOT + 1.1.0-SNAPSHOT provided ml.dmlc.mxnet libmxnet-init-scala-${platform} - 1.0.1-SNAPSHOT + 1.1.0-SNAPSHOT provided ${libtype} diff --git a/scala-package/native/linux-x86_64-cpu/pom.xml b/scala-package/native/linux-x86_64-cpu/pom.xml index 3bcb0123e84c..efaeedd8959b 100644 --- a/scala-package/native/linux-x86_64-cpu/pom.xml +++ b/scala-package/native/linux-x86_64-cpu/pom.xml @@ -6,7 +6,7 @@ ml.dmlc.mxnet mxnet-scala-native-parent - 1.0.1-SNAPSHOT + 1.1.0-SNAPSHOT ../pom.xml @@ -20,7 +20,7 @@ ml.dmlc.mxnet mxnet-core_${scala.binary.version} - 1.0.1-SNAPSHOT + 1.1.0-SNAPSHOT jar compile diff --git a/scala-package/native/linux-x86_64-gpu/pom.xml b/scala-package/native/linux-x86_64-gpu/pom.xml index 85ff4d25d3dc..0befa0c730a8 100644 --- a/scala-package/native/linux-x86_64-gpu/pom.xml +++ b/scala-package/native/linux-x86_64-gpu/pom.xml @@ -6,7 +6,7 @@ ml.dmlc.mxnet mxnet-scala-native-parent - 1.0.1-SNAPSHOT + 1.1.0-SNAPSHOT ../pom.xml @@ -20,7 +20,7 @@ ml.dmlc.mxnet mxnet-core_${scala.binary.version} - 1.0.1-SNAPSHOT + 1.1.0-SNAPSHOT jar compile diff --git a/scala-package/native/osx-x86_64-cpu/pom.xml b/scala-package/native/osx-x86_64-cpu/pom.xml index 809da667e28e..49812246b311 100644 --- a/scala-package/native/osx-x86_64-cpu/pom.xml +++ b/scala-package/native/osx-x86_64-cpu/pom.xml @@ -6,7 +6,7 @@ ml.dmlc.mxnet mxnet-scala-native-parent - 1.0.1-SNAPSHOT + 1.1.0-SNAPSHOT ../pom.xml @@ -20,7 +20,7 @@ ml.dmlc.mxnet mxnet-core_${scala.binary.version} - 1.0.1-SNAPSHOT + 1.1.0-SNAPSHOT jar compile diff --git a/scala-package/native/pom.xml b/scala-package/native/pom.xml index 55fb05380248..b772245ceb4c 100644 --- a/scala-package/native/pom.xml +++ 
b/scala-package/native/pom.xml @@ -6,7 +6,7 @@ ml.dmlc.mxnet mxnet-parent_2.11 - 1.0.1-SNAPSHOT + 1.1.0-SNAPSHOT ../pom.xml diff --git a/scala-package/pom.xml b/scala-package/pom.xml index 5680c83bb4eb..8599f7da8c4c 100644 --- a/scala-package/pom.xml +++ b/scala-package/pom.xml @@ -5,7 +5,7 @@ 4.0.0 ml.dmlc.mxnet mxnet-parent_2.11 - 1.0.1-SNAPSHOT + 1.1.0-SNAPSHOT MXNet Scala Package - Parent https://github.com/dmlc/mxnet/tree/master/scala-package MXNet Scala Package diff --git a/scala-package/spark/pom.xml b/scala-package/spark/pom.xml index 3863c774b0ac..da5c6e2e54ef 100644 --- a/scala-package/spark/pom.xml +++ b/scala-package/spark/pom.xml @@ -6,7 +6,7 @@ ml.dmlc.mxnet mxnet-parent_2.11 - 1.0.1-SNAPSHOT + 1.1.0-SNAPSHOT ../pom.xml @@ -21,7 +21,7 @@ ml.dmlc.mxnet mxnet-core_${scala.binary.version} - 1.0.1-SNAPSHOT + 1.1.0-SNAPSHOT provided diff --git a/snapcraft.yaml b/snapcraft.yaml index 8a0dd4528e1b..b17c73bd6a91 100644 --- a/snapcraft.yaml +++ b/snapcraft.yaml @@ -1,5 +1,5 @@ name: mxnet -version: '1.0.1' +version: '1.1.0' summary: MXNet is a deep learning framework designed for efficiency and flexibility. description: | MXNet is a deep learning framework designed for both efficiency and From 9a5819687f16ea7cd611bca7b4bcb809d4186d9d Mon Sep 17 00:00:00 2001 From: Haibin Lin Date: Sat, 27 Jan 2018 18:18:35 -0800 Subject: [PATCH 02/18] update news.md (#191) * Update NEWS.md * Update README.md --- NEWS.md | 37 +++++++++++++++++++++++++++++++++++++ README.md | 1 + 2 files changed, 38 insertions(+) diff --git a/NEWS.md b/NEWS.md index fc6b10188fc7..6e116c57002a 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,5 +1,42 @@ MXNet Change Log ================ +## 1.1.0 +### Usability Improvements +- Improved the usability of examples and tutorials +### Bug-fixes +- Fixed I/O multiprocessing for too many open file handles (#8904), race condition (#8995), deadlock (#9126). +- Fixed image IO integration with OpenCV 3.3 (#8757). +- Fixed Gluon block printing (#8956). +- Fixed float16 argmax when there is negative input. (#9149) +- Fixed random number generator to ensure sufficient randomness. (#9119, #9256, #9300) +- Fixed custom op multi-GPU scaling (#9283) +- Fixed gradient of gather_nd when duplicate entries exist in index. (#9200) +- Fixed overriden contexts in Module `group2ctx` option when using multiple contexts (#8867) +### New Features +- Added experimental API in `contrib.text` for building vocabulary, and loading pre-trained word embeddings, with built-in support for 307 GloVe and FastText pre-trained embeddings. (#8763) +- Added experimental structural blocks in `gluon.contrib`: `Concurrent`, `HybridConcurrent`, `Identity`. (#9427) +- Added `sparse.dot(dense, csr)` operator (#8938) +- Added `Khatri-Rao` operator (#7781) +- Added `FTML` and `Signum` optimizer (#9220, #9262) +- Added `ENABLE_CUDA_RTC` build option (#9428) +### API Changes +- Added zero gradients to rounding operators including `rint`, `ceil`, `floor`, `trunc`, and `fix` (#9040) +- Added `use_global_stats` in `nn.BatchNorm` (#9420) +- Added `axis` argument to `SequenceLast`, `SequenceMask` and `SequenceReverse` operators (#9306) +- Added `lazy_update` option for standard `SGD` & `Adam` optimizer with `row_sparse` gradients (#9468, #9189) +- Added `select` option in `Block.collect_params` to support regex (#9348) +- Added support for (one-to-one and sequence-to-one) inference on explicit unrolled RNN models in R (#9022) +### Depreciations +- The Scala API name space is still called `ml.dmlc`. 
The name space is likely be changed in a future release to `org.apache` and might brake existing applications and scripts (#9579, #9324) +### Performance Improvements +- Improved GPU inference speed by 20% when batch size is 1 (#9055) +- Improved `SequenceLast` operator speed (#9306) +- Added multithreading for the class of broadcast_reduce operators on CPU (#9444) +- Improved batching for GEMM/TRSM operators with large matrices on GPU (#8846) +### Known Issues +- "Predict with pre-trained models" tutorial is broken + + ## 1.0.0 ### Performance - Enhanced the performance of `sparse.dot` operator. diff --git a/README.md b/README.md index feff02914276..dbae65deaaa0 100644 --- a/README.md +++ b/README.md @@ -22,6 +22,7 @@ deep learning systems, and interesting insights of DL systems for hackers. What's New ---------- +* [Version 1.1.0 Release](https://github.com/apache/incubator-mxnet/releases/tag/1.1.0) - MXNet 1.1.0 Release. * [Version 1.0.0 Release](https://github.com/apache/incubator-mxnet/releases/tag/1.0.0) - MXNet 1.0.0 Release. * [Version 0.12.1 Release](https://github.com/apache/incubator-mxnet/releases/tag/0.12.1) - MXNet 0.12.1 Patch Release. * [Version 0.12.0 Release](https://github.com/apache/incubator-mxnet/releases/tag/0.12.0) - MXNet 0.12.0 Release. From 32caa1033631668016cf60755ae71b4b3186996c Mon Sep 17 00:00:00 2001 From: Ziyue Huang Date: Sun, 28 Jan 2018 06:24:33 +0800 Subject: [PATCH 03/18] refactor regression ops to nnvm interface (#9540) * refactor regression ops * fix err for instantiation of minus_sign * remove useless header file init_op.h * replace with macro and address other comments * update * minor revise docs * add mae test --- src/operator/operator_tune.cc | 2 + src/operator/regression_output-inl.h | 228 ++++++++++--------------- src/operator/regression_output.cc | 107 ++++++------ src/operator/regression_output.cu | 41 ++--- tests/python/unittest/test_operator.py | 4 +- 5 files changed, 170 insertions(+), 212 deletions(-) diff --git a/src/operator/operator_tune.cc b/src/operator/operator_tune.cc index 7cdf7a2078cc..e0f8306565d9 100644 --- a/src/operator/operator_tune.cc +++ b/src/operator/operator_tune.cc @@ -286,12 +286,14 @@ IMPLEMENT_BINARY_WORKLOAD_FWD(mxnet::op::mshadow_op::plus); // NOLINT() IMPLEMENT_BINARY_WORKLOAD_FWD(mxnet::op::mshadow_op::minus); // NOLINT() IMPLEMENT_BINARY_WORKLOAD_FWD(mxnet::op::mshadow_op::mul); // NOLINT() IMPLEMENT_BINARY_WORKLOAD_FWD(mxnet::op::mshadow_op::div); // NOLINT() +IMPLEMENT_BINARY_WORKLOAD_FWD(mxnet::op::mshadow_op::minus_sign); // NOLINT() IMPLEMENT_BINARY_WORKLOAD_FWD(mxnet::op::mshadow_op::rminus); // NOLINT() IMPLEMENT_BINARY_WORKLOAD_BWD(mxnet::op::mshadow_op::rdiv); // NOLINT() IMPLEMENT_BINARY_WORKLOAD_BWD(mxnet::op::mshadow_op::plus); // NOLINT() IMPLEMENT_BINARY_WORKLOAD_BWD(mxnet::op::mshadow_op::minus); // NOLINT() IMPLEMENT_BINARY_WORKLOAD_BWD(mxnet::op::mshadow_op::mul); // NOLINT() IMPLEMENT_BINARY_WORKLOAD_BWD(mxnet::op::mshadow_op::div); // NOLINT() +IMPLEMENT_BINARY_WORKLOAD_BWD(mxnet::op::mshadow_op::minus_sign); // NOLINT() IMPLEMENT_BINARY_WORKLOAD_BWD(mxnet::op::mshadow_op::rminus); // NOLINT() IMPLEMENT_BINARY_WORKLOAD_FWD(mxnet::op::mshadow_op::rdiv); // NOLINT() IMPLEMENT_BINARY_WORKLOAD_FWD(mxnet::op::mshadow_op::div_grad); // NOLINT() diff --git a/src/operator/regression_output-inl.h b/src/operator/regression_output-inl.h index 08b2f0a4a813..4642f8dc4679 100644 --- a/src/operator/regression_output-inl.h +++ b/src/operator/regression_output-inl.h @@ -18,28 +18,28 @@ */ /*! 
- * Copyright (c) 2015 by Contributors * \file regression_ouput-inl.h * \brief Regression output operator. - */ +*/ #ifndef MXNET_OPERATOR_REGRESSION_OUTPUT_INL_H_ #define MXNET_OPERATOR_REGRESSION_OUTPUT_INL_H_ -#include -#include -#include -#include +#include #include #include +#include "./mshadow_op.h" +#include "./mxnet_op.h" #include "./operator_common.h" namespace mxnet { namespace op { +/*! + * \brief regression namespace + */ namespace reg_enum { enum RegressionOutputOpInputs {kData, kLabel}; enum RegressionOutputOutputs {kOut}; -enum RegressionOutputType {kLinear, kLogistic, kMAE}; } // reg_enum struct RegressionOutputParam : public dmlc::Parameter { @@ -50,146 +50,90 @@ struct RegressionOutputParam : public dmlc::Parameter { }; }; -// Special Operator to output regression value in forward -// And get gradient in calculation. -template -class RegressionOutputOp : public Operator { - public: - explicit RegressionOutputOp(RegressionOutputParam param) : param_(param) {} - - virtual void Forward(const OpContext &ctx, - const std::vector &in_data, - const std::vector &req, - const std::vector &out_data, - const std::vector &aux_args) { - using namespace mshadow; - using namespace mshadow::expr; - CHECK_EQ(in_data.size(), 2U) << "RegressionOutputOp Input: [data, label]"; - CHECK_EQ(out_data.size(), 1U) << "RegressionOutputOp Output: [output]"; - Stream *s = ctx.get_stream(); - Tensor data = in_data[reg_enum::kData].FlatTo2D(s); - Tensor out = out_data[reg_enum::kOut].FlatTo2D(s); - Assign(out, req[reg_enum::kOut], F(data)); - } - - virtual void Backward(const OpContext &ctx, - const std::vector &out_grad, - const std::vector &in_data, - const std::vector &out_data, - const std::vector &req, - const std::vector &in_grad, - const std::vector &aux_args) { - using namespace mshadow; - using namespace mshadow::expr; - CHECK_EQ(in_data.size(), 2U); - CHECK_EQ(out_grad.size(), 1U); - CHECK_GE(in_grad.size(), 1U); - CHECK_GE(req.size(), 1U); - Stream *s = ctx.get_stream(); - real_t num_output = - in_data[reg_enum::kLabel].Size()/in_data[reg_enum::kLabel].shape_[0]; - Tensor out = out_data[reg_enum::kOut].FlatTo2D(s); - Tensor grad = in_grad[reg_enum::kData].FlatTo2D(s); - Tensor label = in_data[reg_enum::kLabel] - .get_with_shape(out.shape_, s); - Assign(grad, req[reg_enum::kData], param_.grad_scale/num_output* - F(out, reshape(label, grad.shape_))); - } - - private: - RegressionOutputParam param_; -}; - -// Decalre Factory function, used for dispatch specialization -template -Operator* CreateRegressionOutputOp(reg_enum::RegressionOutputType type, - RegressionOutputParam param); - -#if DMLC_USE_CXX11 -template -class RegressionOutputProp : public OperatorProperty { - public: - std::vector ListArguments() const override { - return {"data", "label"}; - } - - void Init(const std::vector >& kwargs) override { - param_.Init(kwargs); - } - - std::map GetParams() const override { - return param_.__DICT__(); - } - - bool InferShape(std::vector *in_shape, - std::vector *out_shape, - std::vector *aux_shape) const override { - using namespace mshadow; - CHECK_EQ(in_shape->size(), 2) << "Input:[data, label]"; - const TShape &dshape = in_shape->at(0); - if (dshape.ndim() == 0) return false; - auto &lshape = (*in_shape)[1]; - if (lshape.ndim() == 0) { - // special treatment for 1D output, to allow 1D label by default. 
- // Think about change convention later - if (dshape.ndim() == 2 && dshape[1] == 1) { - lshape = Shape1(dshape[0]); - } else { - lshape = dshape; - } - } else if (lshape[0] != dshape[0] || lshape.Size() != dshape.Size()) { - std::ostringstream os; - os << "Shape inconsistent, Provided=" << lshape << ',' - << " inferred shape=" << dshape; - throw ::mxnet::op::InferShapeError(os.str(), 1); - } - out_shape->clear(); - out_shape->push_back(dshape); - return true; - } - - OperatorProperty* Copy() const override { - auto ptr = new RegressionOutputProp(); - ptr->param_ = param_; - return ptr; - } - - std::string TypeString() const override { - switch (type) { - case reg_enum::kLinear: return "LinearRegressionOutput"; - case reg_enum::kLogistic: return "LogisticRegressionOutput"; - case reg_enum::kMAE: return "MAERegressionOutput"; - default: LOG(FATAL) << "unknown type"; return ""; +inline bool RegressionOpShape(const nnvm::NodeAttrs& attrs, + std::vector *in_attrs, + std::vector *out_attrs) { + using namespace mshadow; + CHECK_EQ(in_attrs->size(), 2U) << "Input:[data, label]"; + const TShape &dshape = in_attrs->at(0); + if (dshape.ndim() == 0) return false; + auto &lshape = (*in_attrs)[1]; + if (lshape.ndim() == 0) { + // special treatment for 1D output, to allow 1D label by default. + // Think about change convention later + if (dshape.ndim() == 2 && dshape[1] == 1) { + lshape = Shape1(dshape[0]); + } else { + lshape = dshape; } + } else if (lshape[0] != dshape[0] || lshape.Size() != dshape.Size()) { + std::ostringstream os; + os << "Shape inconsistent, Provided=" << lshape << ',' + << " inferred shape=" << dshape; + throw ::mxnet::op::InferShapeError(os.str(), 1); } - - std::vector DeclareBackwardDependency( - const std::vector &out_grad, - const std::vector &in_data, - const std::vector &out_data) const override { - return {in_data[reg_enum::kLabel], out_data[reg_enum::kOut]}; - } - - std::vector > BackwardInplaceOption( - const std::vector &out_grad, - const std::vector &in_data, - const std::vector &out_data, - const std::vector &in_grad) const override { - return {{out_data[reg_enum::kOut], in_grad[reg_enum::kData]}}; - } - - std::vector > ForwardInplaceOption( - const std::vector &in_data, - const std::vector &out_data) const override { - return {{in_data[reg_enum::kData], out_data[reg_enum::kOut]}}; + out_attrs->clear(); + out_attrs->push_back(dshape); + return true; +} + +template +void RegressionForward(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { + mshadow::Stream *s = ctx.get_stream(); + MSHADOW_REAL_TYPE_SWITCH(inputs[reg_enum::kData].type_flag_, DType, { + MXNET_ASSIGN_REQ_SWITCH(req[reg_enum::kOut], Req, { + const DType* in_data = inputs[reg_enum::kData].dptr(); + DType* out_data = outputs[reg_enum::kOut].dptr(); + using namespace mxnet_op; + Kernel, xpu>::Launch( + s, outputs[reg_enum::kOut].Size(), out_data, in_data); + }); + }); +} + +template +void RegressionBackward(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { + const RegressionOutputParam& param = nnvm::get(attrs.parsed); + mshadow::Stream *s = ctx.get_stream(); + // inputs are in_label, out_data + // outputs are data_grad, label_grad + MSHADOW_REAL_TYPE_SWITCH(inputs[1].type_flag_, DType, { + MXNET_ASSIGN_REQ_SWITCH(req[0], Req, { + const DType* in_label = inputs[0].dptr(); + const DType* out_data = inputs[1].dptr(); + DType* 
data_grad = outputs[0].dptr(); + const real_t num_output = inputs[0].Size()/inputs[0].shape_[0]; + using namespace mxnet_op; + Kernel, xpu>::Launch( + s, outputs[0].Size(), data_grad, out_data, in_label); + Kernel, xpu>::Launch( + s, outputs[0].Size(), data_grad, data_grad, + static_cast(param.grad_scale/num_output)); + }); + }); +} + +struct RegressionOpGrad { + const char *op_name; + std::vector operator()(const nnvm::NodePtr& n, + const std::vector& ograds) const { + std::vector heads; + heads.push_back(n->inputs[reg_enum::kLabel]); + heads.emplace_back(nnvm::NodeEntry{n, reg_enum::kOut, 0}); + return MakeGradNode(op_name, n, heads, n->attrs.dict); } +}; - Operator* CreateOperator(Context ctx) const override; - protected: - RegressionOutputParam param_; -}; -#endif // DMLC_USE_CXX11 } // namespace op } // namespace mxnet + #endif // MXNET_OPERATOR_REGRESSION_OUTPUT_INL_H_ diff --git a/src/operator/regression_output.cc b/src/operator/regression_output.cc index 2f8042e9e831..7b0fbae3bccb 100644 --- a/src/operator/regression_output.cc +++ b/src/operator/regression_output.cc @@ -18,61 +18,71 @@ */ /*! - * Copyright (c) 2015 by Contributors - * \file regression_output.cc - * \brief regression output operator + * \file regression_ouput.cc + * \brief Regression output operator. */ + #include "./regression_output-inl.h" -#include "./mshadow_op.h" + +#define MXNET_OPERATOR_REGISTER_REGRESSION_FWD(__name$, __kernel$, __bwdop$) \ + NNVM_REGISTER_OP(__name$) \ + .set_num_inputs(2) \ + .set_num_outputs(1) \ + .set_attr("FListInputNames", \ + [](const NodeAttrs& attrs) { \ + return std::vector{"data", "label"}; \ + }) \ + .set_attr("FInferShape", RegressionOpShape) \ + .set_attr("FGradient", RegressionOpGrad{__bwdop$}) \ + .set_attr("FInplaceOption", \ + [](const NodeAttrs& attrs){ \ + return std::vector >{{0, 0}}; \ + }) \ + .set_attr("FCompute", RegressionForward) \ + .add_argument("data", "NDArray-or-Symbol", "Input data to the function.") \ + .add_argument("label", "NDArray-or-Symbol", "Input label to the function.") \ + .add_arguments(RegressionOutputParam::__FIELDS__()) + +#define MXNET_OPERATOR_REGISTER_REGRESSION_BWD(__name$, __kernel$) \ + NNVM_REGISTER_OP(__name$) \ + .set_num_inputs(2) \ + .set_num_outputs(2) \ + .set_attr_parser(ParamParser) \ + .set_attr("TIsBackward", true) \ + .set_attr("FInplaceOption", \ + [](const NodeAttrs& attrs){ \ + return std::vector >{{1, 0}}; \ + }) \ + .set_attr("FCompute", RegressionBackward) namespace mxnet { namespace op { -template<> -Operator *CreateRegressionOutputOp(reg_enum::RegressionOutputType type, - RegressionOutputParam param) { - switch (type) { - case reg_enum::kLinear: - return new RegressionOutputOp(param); - case reg_enum::kLogistic: - return new RegressionOutputOp(param); - case reg_enum::kMAE: - return new RegressionOutputOp(param); - default: - LOG(FATAL) << "unknown activation type " << type; - } - return nullptr; -} - -// DO_BIND_DISPATCH comes from operator_common.h -template -Operator *RegressionOutputProp::CreateOperator(Context ctx) const { - DO_BIND_DISPATCH(CreateRegressionOutputOp, type, param_); -} DMLC_REGISTER_PARAMETER(RegressionOutputParam); -MXNET_REGISTER_OP_PROPERTY(LinearRegressionOutput, RegressionOutputProp) +MXNET_OPERATOR_REGISTER_REGRESSION_FWD(LinearRegressionOutput, + mshadow_op::identity, "_backward_linear_reg_out") .describe(R"code(Computes and optimizes for squared loss during backward propagation. Just outputs ``data`` during forward propagation. 
If :math:`\hat{y}_i` is the predicted value of the i-th sample, and :math:`y_i` is the corresponding target value, then the squared loss estimated over :math:`n` samples is defined as -:math:`\text{SquaredLoss}(y, \hat{y} ) = \frac{1}{n} \sum_{i=0}^{n-1} \left( y_i - \hat{y}_i \right)^2` +:math:`\text{SquaredLoss}(\textbf{Y}, \hat{\textbf{Y}} ) = \frac{1}{n} \sum_{i=0}^{n-1} \lVert \textbf{y}_i - \hat{\textbf{y}}_i \rVert_2` .. note:: Use the LinearRegressionOutput as the final output layer of a net. -By default, gradients of this loss function are scaled by factor `1/n`, where n is the number of training examples. -The parameter `grad_scale` can be used to change this scale to `grad_scale/n`. +By default, gradients of this loss function are scaled by factor `1/m`, where m is the number of regression outputs of a training example. +The parameter `grad_scale` can be used to change this scale to `grad_scale/m`. + +)code" ADD_FILELINE); -)code" ADD_FILELINE) -.add_argument("data", "NDArray-or-Symbol", "Input data to the function.") -.add_argument("label", "NDArray-or-Symbol", "Input label to the function.") -.add_arguments(RegressionOutputParam::__FIELDS__()); +MXNET_OPERATOR_REGISTER_REGRESSION_BWD(_backward_linear_reg_out, mshadow_op::minus); -MXNET_REGISTER_OP_PROPERTY(MAERegressionOutput, RegressionOutputProp) +MXNET_OPERATOR_REGISTER_REGRESSION_FWD(MAERegressionOutput, + mshadow_op::identity, "_backward_mae_reg_out") .describe(R"code(Computes mean absolute error of the input. MAE is a risk metric corresponding to the expected value of the absolute error. @@ -80,24 +90,24 @@ MAE is a risk metric corresponding to the expected value of the absolute error. If :math:`\hat{y}_i` is the predicted value of the i-th sample, and :math:`y_i` is the corresponding target value, then the mean absolute error (MAE) estimated over :math:`n` samples is defined as -:math:`\text{MAE}(y, \hat{y} ) = \frac{1}{n} \sum_{i=0}^{n-1} \left| y_i - \hat{y}_i \right|` +:math:`\text{MAE}(\textbf{Y}, \hat{\textbf{Y}} ) = \frac{1}{n} \sum_{i=0}^{n-1} \lVert \textbf{y}_i - \hat{\textbf{y}}_i \rVert_1` .. note:: Use the MAERegressionOutput as the final output layer of a net. -By default, gradients of this loss function are scaled by factor `1/n`, where n is the number of training examples. -The parameter `grad_scale` can be used to change this scale to `grad_scale/n`. +By default, gradients of this loss function are scaled by factor `1/m`, where m is the number of regression outputs of a training example. +The parameter `grad_scale` can be used to change this scale to `grad_scale/m`. -)code" ADD_FILELINE) -.add_argument("data", "NDArray-or-Symbol", "Input data to the function.") -.add_argument("label", "NDArray-or-Symbol", "Input label to the function.") -.add_arguments(RegressionOutputParam::__FIELDS__()); +)code" ADD_FILELINE); -MXNET_REGISTER_OP_PROPERTY(LogisticRegressionOutput, RegressionOutputProp) +MXNET_OPERATOR_REGISTER_REGRESSION_BWD(_backward_mae_reg_out, mshadow_op::minus_sign); + +MXNET_OPERATOR_REGISTER_REGRESSION_FWD(LogisticRegressionOutput, + mshadow_op::sigmoid, "_backward_logistic_reg_out") .describe(R"code(Applies a logistic function to the input. The logistic function, also known as the sigmoid function, is computed as -:math:`\frac{1}{1+exp(-x)}`. +:math:`\frac{1}{1+exp(-\textbf{x})}`. Commonly, the sigmoid is used to squash the real-valued output of a linear model :math:wTx+b into the [0,1] range so that it can be interpreted as a probability. 
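Before the remaining hunks, a numeric cross-check of the backward kernels registered above: the MAE gradient reduces to the sign of the residual. The NumPy sketch below mirrors the unit test added at the end of this patch and is not itself part of the diff:

```python
import numpy as np

# d/dpred |pred - label| is +1 where pred > label and -1 otherwise,
# matching mshadow_op::minus_sign used by _backward_mae_reg_out.
pred = np.array([0.2, 0.9, 0.4])
label = np.array([0.5, 0.1, 0.3])
grad = np.where(pred > label, np.ones(pred.shape), -np.ones(pred.shape))
# grad == [-1.,  1.,  1.]
```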
@@ -106,13 +116,12 @@ It is suitable for binary classification or probability prediction tasks. .. note:: Use the LogisticRegressionOutput as the final output layer of a net. -By default, gradients of this loss function are scaled by factor `1/n`, where n is the number of training examples. -The parameter `grad_scale` can be used to change this scale to `grad_scale/n`. +By default, gradients of this loss function are scaled by factor `1/m`, where m is the number of regression outputs of a training example. +The parameter `grad_scale` can be used to change this scale to `grad_scale/m`. + +)code" ADD_FILELINE); -)code" ADD_FILELINE) -.add_argument("data", "NDArray-or-Symbol", "Input data to the function.") -.add_argument("label", "NDArray-or-Symbol", "Input label to the function.") -.add_arguments(RegressionOutputParam::__FIELDS__()); +MXNET_OPERATOR_REGISTER_REGRESSION_BWD(_backward_logistic_reg_out, mshadow_op::minus); } // namespace op } // namespace mxnet diff --git a/src/operator/regression_output.cu b/src/operator/regression_output.cu index cb951f1fd29f..e3a2e7ea2b2b 100644 --- a/src/operator/regression_output.cu +++ b/src/operator/regression_output.cu @@ -18,31 +18,32 @@ */ /*! - * Copyright (c) 2015 by Contributors - * \file regression_output.cu - * \brief regression output operator + * \file regression_ouput.cu + * \brief Regression output operator. */ #include "./regression_output-inl.h" -#include "./mshadow_op.h" + namespace mxnet { namespace op { -template<> -Operator *CreateRegressionOutputOp(reg_enum::RegressionOutputType type, - RegressionOutputParam param) { - switch (type) { - case reg_enum::kLinear: - return new RegressionOutputOp(param); - case reg_enum::kLogistic: - return new RegressionOutputOp(param); - case reg_enum::kMAE: - return new RegressionOutputOp(param); - default: - LOG(FATAL) << "unknown activation type " << type; - } - return NULL; -} +NNVM_REGISTER_OP(LinearRegressionOutput) +.set_attr("FCompute", RegressionForward); + +NNVM_REGISTER_OP(_backward_linear_reg_out) +.set_attr("FCompute", RegressionBackward); + +NNVM_REGISTER_OP(MAERegressionOutput) +.set_attr("FCompute", RegressionForward); + +NNVM_REGISTER_OP(_backward_mae_reg_out) +.set_attr("FCompute", RegressionBackward); + +NNVM_REGISTER_OP(LogisticRegressionOutput) +.set_attr("FCompute", RegressionForward); + +NNVM_REGISTER_OP(_backward_logistic_reg_out) +.set_attr("FCompute", RegressionBackward); + } // namespace op } // namespace mxnet - diff --git a/tests/python/unittest/test_operator.py b/tests/python/unittest/test_operator.py index 742d05518425..640cd347dc16 100644 --- a/tests/python/unittest/test_operator.py +++ b/tests/python/unittest/test_operator.py @@ -244,7 +244,9 @@ def test_regression(): check_regression(mx.symbol.LinearRegressionOutput, lambda x: x, lambda x, y : x - y) - + check_regression(mx.symbol.MAERegressionOutput, + lambda x: x, + lambda x, y : np.where(x > y, np.ones(x.shape), -np.ones(x.shape))) def check_softmax_grad(xpu): x = mx.sym.Variable('x') From 3ba84d83105bbc8825ac858f4c9cf81f9ca03d18 Mon Sep 17 00:00:00 2001 From: Haibin Lin Date: Sat, 27 Jan 2018 23:18:11 -0800 Subject: [PATCH 04/18] Update KEYS --- KEYS | 60 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 60 insertions(+) diff --git a/KEYS b/KEYS index d646bb7c3f11..5e5769be542e 100644 --- a/KEYS +++ b/KEYS @@ -363,3 +363,63 @@ iEVpHzOV7gd75fJbOvoNxNZj20Yj5sg8OCwbv8PxLXEcBFs7hhjQMhVRsjpNYzAR Iw== =rMlc -----END PGP PUBLIC KEY BLOCK----- + +pub rsa4096 2018-01-28 [SC] + 
7302629A6791AC2C3593B9A0015ED8A29C815704 +uid [ultimate] Haibin Lin (CODE SIGNING KEY) +sig 3 015ED8A29C815704 2018-01-28 Haibin Lin (CODE SIGNING KEY) +sub rsa4096 2018-01-28 [E] +sig 015ED8A29C815704 2018-01-28 Haibin Lin (CODE SIGNING KEY) + +-----BEGIN PGP PUBLIC KEY BLOCK----- + +mQINBFptdRQBEACxk4vidIZ9n14poPFKMayxA0P4o92pboPzYf5rqzgD5cjcVBxk +uuWqDEEbj3wYCdluTw4sO4jENIBstUY0pIJtuUIsGW9KU14DKsnO+Od6cj/4bAub +/1/otUJ0D+xDHx3tYEhYEOOOvk1UI7Dd0nxh2K3ymYEZMfki1iMSABwj8Vm0nK2Q +ZuyswLUbssfZqLaOQ+HuTN54houLqHDuHYz+pyttDH+sxL7c4uJykgJyDx8c0ENW +0Atejqk8fyKNtVlizcehQff/t7NdKxkpgA3J4ZV2sYvjrD54CiMkhU51At6YO3HX +L/bPK3deXiNGzHA45mrX8eewgCw92YwdXWQ4OI40smRFm6dBiebXDwfjJkucTMnk +7RQJSbOE4VezyzwrqKZTHUPBvnwskDXBeNHIdaBwicYkGP8/p1HLvIj0XCC32yHz +5jaj9qEuTlE1tW0FPpqAFRUNlnVF/wMaDqyV6MdQI9mE6jzzI+8ja9Vi750bs8Ew +dvyrxf4UcJjc/aMGKcHkxMM6n1aVH/Jl1G7YC8d5K5QXJxuabBc3tp5PI2p6iBdy +nNpJKmJLKNVCm8rXu0XbSQxoM6QBOF6IlIjExtKXUqKUSs426p81V8dnRCQFg8fP +Ha7hxYaO2hJHxNx4lIgVgZZj61q5EIpmyNZ4gITkCu6kiGDoBxyruGrlXQARAQAB +tDFIYWliaW4gTGluIChDT0RFIFNJR05JTkcgS0VZKSA8aGFpYmluQGFwYWNoZS5v +cmc+iQJOBBMBCAA4FiEEcwJimmeRrCw1k7mgAV7YopyBVwQFAlptdRQCGwMFCwkI +BwIGFQgJCgsCBBYCAwECHgECF4AACgkQAV7YopyBVwTKwxAAmy+i4ql+pz6tK9P4 +XYYEkRUPqoXKWamoQWukpnVZmZPPuRr3SPCgBUTLOxm6RSTiuFxahHN+zGHBrpNA +tLv5uyfVS26e3ugjCeZ+NllMLQ7MB+yVDlb7QFOYWDSZ1iTG1kJ1/I038IZJhM5t +TVAYVICQlUNbi9AI3iHWRzRQswZxFWuuMwTUDsP7yvcIgwMh6keUmNhyRe+GTPFJ +qwroW+fXbLZ59YqGt/eLvg6kodgia1deBRygjcbAH0B0I8TpcV/IQAXC7Vvji0fB +fLoCcPaUTTTKInejIrSLkOunooVNbIBHfxpBtl6ilWygkFb1TMfNI8BeXKPPnudk +2MERTn7poYAS8TJYjLomknrjnaIccQyicLxxs4nh3nC9xvZ2CGr0hmuOv2rM/spj +/KJzbsdLsFfkyMVPKZR8LALYl1YDsOvAzAmEXtcP3S681sHbfyJbKobn4UmvejCH +9GHJGm+KlBRSpzKTEt6gqsM5DCKjiSiPomC2XAw7ztqTsf1NCeIDuPniIMANEK9b +pdS5GpBy2XCvmv7epyamZOHQ57t0//9n3qfH3qDHXFaTMC1EEKvZl7q5Wpx3b+H/ +WkJqCf1cMG0fU/7aPAo2zygNYtPnNqyGYs9RMicrj/lnw7Oz8RiygXPgNLXMkci+ +aTftZm5DKZgWikAYTOhBxV1GEbS5Ag0EWm11FAEQAOBSel/yRYwgxalZfajTv52w +v61UrZBQMuWNxbGHWdQBZnO0BiijgS+u1AWfpAia1ig+Dqfa5U8w/jqbBG63VvwE +x8PapVuvXJisxhekGFysQxWf0NCVIY9rTHUs529kN/kbZq2XzWnr4aI6f44YYjEa +lFAnVL/JJ7ewERbI0XHy3d99LoHYKq9ttc9w4CB2dVN4o5g1wyJxG5uzdNcQO6MP ++QPWPUBkBDIWEWtYeXJVTjuCW9VscFfvgnGSDyBPTeXyN3rup9mu3P1g9PopobkV +cczTNwSqy4vO+vIYgXUAP98cbbJzE6LZIYEpUPki7ooWIk9MDo3oKCLnJE0TOxCv +R5ZYyIRJkM5Jtt1RdZvKpLRlRGFTx1uW2pHYJMz2VS+rUPy7NLBcLR1N1LnjOot0 +mb4cE0sJDgT1ONqg79sUGRRBCdda291FomZjjb3UW+mM76h9TSgg8OijTzjQJMmn +sO/Tx69FMdc1VqJ5nI0SThDwP33EQDthvlobUNrU/mEwI0t3Qsukx+Fi5n/hf3x8 +dInzmCSQ4yLsTZttNTF6+YPDuxuMgTzR0P0e/ilSt576FXjWqWXGtA0noXjEtUim +j7xXbc4WeKWQjV+jMTIrgxbrE2Cr6x/P+rPGqydpmKH+yNMW4IJs5LWk/SFFHPKM +liSWetxGdjsxn1aX0h4jABEBAAGJAjYEGAEIACAWIQRzAmKaZ5GsLDWTuaABXtii +nIFXBAUCWm11FAIbDAAKCRABXtiinIFXBECxEACPTAw0o68QEme78qQXi0ls4yxB +tVPB4DED3ReGNsnUDzmx0MHmzEUv3vJFfOzpeq/bn5ZxGG70k1HcIUF3c15xz9CK +A3WpxAxwzRHHIPS+xVN6OQXwilo0+lfKNitQgUMVl9QwG8KgNT1sBCm61c4yzqCV +aRDzuNLnXJpweClLE/QfjZjudGa41yBAp+XVTF/ke1l4OuWCi9udycfNE0LgmoMS +uyE2g61oTWyxCfKwdmct30YRkligQ8w80KoW/reBEFURS+KWcMSH8rJaVdv8zdAD +NRktfLtHgZcq3w1WkVX09PhVQK4HTrFBRHit6BvogRMl2de5ByADCjjVCysNfmWf +qLHor5+LM2KOTBoOptidG6r9bpKoJNKk0a4evmfXRRe79UoAqcbM3UWZBc48M2qv +tNzRbAcf0S+ltgw7xEW8rge6Vcz9lLTDuBjC7Mg3m1Q2gQO94RgZifNrmVAF31cY +iBRNeLQGCrOV/Vt8XhD5Un90dA2aKLrW90IG9houHfKNj3vpRU32Qbb9kLpk1MZY +fDiLw4372qAC4NpSLRpCIBbT33VztUOTmZgIg4zJiQGSp89dEVN8OUT/yjKQps39 +9XwxzS2A4J/DXuYUkCUD0/FKn7OEf0beXVyOoQItucTGIePSGkIT79uG9qptpxZL +G4kKPLx5+UhNtHsaNA== +=ZoTi +-----END PGP PUBLIC KEY BLOCK----- From 8b3c9ebb7bb4a9e8ee88e7222a718f7fa1c9a6be Mon Sep 17 00:00:00 2001 From: Haibin Lin Date: Sat, 27 
Jan 2018 23:23:13 -0800 Subject: [PATCH 05/18] Update NEWS.md --- NEWS.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/NEWS.md b/NEWS.md index 6e116c57002a..920063ae31fb 100644 --- a/NEWS.md +++ b/NEWS.md @@ -36,6 +36,8 @@ MXNet Change Log ### Known Issues - "Predict with pre-trained models" tutorial is broken +For more information and examples, see [full release notes](https://cwiki.apache.org/confluence/display/MXNET/Apache+MXNet+%28incubating%29+1.1.0+Release+Notes) + ## 1.0.0 ### Performance From 832cf82688d9929bf804bd40ef3b4fb94dd2bb56 Mon Sep 17 00:00:00 2001 From: thinksanky <31976455+thinksanky@users.noreply.github.com> Date: Mon, 29 Jan 2018 14:13:49 -0800 Subject: [PATCH 06/18] fixed links that were missng ndarray folder path (#9618) --- docs/community/contribute.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/community/contribute.md b/docs/community/contribute.md index 5bb790eed7d8..9c3c3e187029 100644 --- a/docs/community/contribute.md +++ b/docs/community/contribute.md @@ -103,7 +103,7 @@ or is conceptual, add it in the C++ documentation. Make sure your example works by running a Python version of the example. * If a concrete and simple language-specific example can further clarify the API and the API arguments, add the example in language-specific files. -* Refer to these examples for guidance:- [Embedding](http://mxnet.io/api/python/ndarray.html#mxnet.ndarray.Embedding) , [ROIPooling](http://mxnet.io/api/python/ndarray.html#mxnet.ndarray.ROIPooling) , [Reshape](http://mxnet.io/api/python/ndarray.html#mxnet.ndarray.Reshape). +* Refer to these examples for guidance:- [Embedding](http://mxnet.io/api/python/ndarray/ndarray.html#mxnet.ndarray.Embedding) , [ROIPooling](http://mxnet.io/api/python/ndarray/ndarray.html#mxnet.ndarray.ROIPooling) , [Reshape](http://mxnet.io/api/python/ndarray/ndarray.html#mxnet.ndarray.Reshape). ### Testing and Rendering * Make sure not to break any coding standards. Run From 7d6fab98824e61c7e05cb08b3554858e1037e8cb Mon Sep 17 00:00:00 2001 From: thinksanky <31976455+thinksanky@users.noreply.github.com> Date: Tue, 6 Feb 2018 09:17:06 -0800 Subject: [PATCH 07/18] Fixed 4 broken links (#9698) * Fixed 4 broken links * fixed pylint for long line disable and 1 broken link --- docs/faq/finetune.md | 2 +- docs/faq/multi_devices.md | 2 +- docs/tutorials/index.md | 4 ++-- python/mxnet/gluon/trainer.py | 4 ++-- 4 files changed, 6 insertions(+), 6 deletions(-) diff --git a/docs/faq/finetune.md b/docs/faq/finetune.md index 2c6c7e340279..533c3caf52a9 100644 --- a/docs/faq/finetune.md +++ b/docs/faq/finetune.md @@ -15,7 +15,7 @@ with these pretrained weights when training on our new task. This process is commonly called _fine-tuning_. There are a number of variations of fine-tuning. Sometimes, the initial neural network is used only as a _feature extractor_. That means that we freeze every layer prior to the output layer and simply learn -a new output layer. In [another document](https://github.com/dmlc/mxnet-notebooks/blob/master/python/faq/predict.ipynb), we explained how to +a new output layer. In [another document](https://github.com/dmlc/mxnet-notebooks/blob/master/python/how_to/predict.ipynb), we explained how to do this kind of feature extraction. Another approach is to update all of the network's weights for the new task, and that's the approach we demonstrate in this document. 
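A minimal sketch of the feature-extractor variant described in the fine-tuning text above, written against the symbolic Module API the tutorial uses; the checkpoint prefix and layer name are illustrative assumptions:

```python
import mxnet as mx

# Load a pretrained checkpoint; 'resnet-50' and epoch 0 are assumptions.
sym, arg_params, aux_params = mx.model.load_checkpoint('resnet-50', 0)

# Reuse everything up to the flattened features; learn a new output layer.
features = sym.get_internals()['flatten0_output']  # layer name is an assumption
net = mx.sym.FullyConnected(features, num_hidden=10, name='fc_new')
net = mx.sym.SoftmaxOutput(net, name='softmax')

# Freezing the copied weights gives pure feature extraction; omitting
# fixed_param_names instead updates all weights, the approach the text demonstrates.
mod = mx.mod.Module(net, context=mx.cpu(),
                    fixed_param_names=[k for k in arg_params if 'fc_new' not in k])
```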
diff --git a/docs/faq/multi_devices.md b/docs/faq/multi_devices.md index 5d538bca56af..b9cb3ea2916b 100644 --- a/docs/faq/multi_devices.md +++ b/docs/faq/multi_devices.md @@ -210,4 +210,4 @@ export PS_VERBOSE=1; python ../../tools/launch.py ... ### More - See more launch options by `python ../../tools/launch.py -h` -- See more options of [ps-lite](http://ps-lite.readthedocs.org/en/latest/faq.html) +- See more options of [ps-lite](https://ps-lite.readthedocs.io/en/latest) diff --git a/docs/tutorials/index.md b/docs/tutorials/index.md index aca091c41c3f..3eff299d7787 100644 --- a/docs/tutorials/index.md +++ b/docs/tutorials/index.md @@ -134,7 +134,7 @@ The Gluon and Module tutorials are in Python, but you can also find a variety of - [Imperative tensor operations on CPU/GPU](http://mxnet.incubator.apache.org/tutorials/basic/ndarray.html) -- [NDArray Indexing](http://mxnet.incubator.apache.org/tutorials/basic/ndarray_indexing.html) +- [NDArray Indexing](../tutorials/basic/ndarray_indexing.html) - [Symbol API](http://mxnet.incubator.apache.org/tutorials/basic/symbol.html) @@ -174,7 +174,7 @@ The Gluon and Module tutorials are in Python, but you can also find a variety of
-- [Connectionist Temporal Classification](http://mxnet.incubator.apache.org/tutorials/speech_recognition/ctc.html) +- [Connectionist Temporal Classification](../tutorials/speech_recognition/ctc.html) - [Distributed key-value store](http://mxnet.incubator.apache.org/tutorials/python/kvstore.html) diff --git a/python/mxnet/gluon/trainer.py b/python/mxnet/gluon/trainer.py index 71c144f80cf2..c8822bb02c1d 100644 --- a/python/mxnet/gluon/trainer.py +++ b/python/mxnet/gluon/trainer.py @@ -16,7 +16,7 @@ # under the License. # coding: utf-8 -# pylint: disable= +# pylint: disable=line-too-long """Parameter optimizer.""" __all__ = ['Trainer'] @@ -34,7 +34,7 @@ class Trainer(object): The set of parameters to optimize. optimizer : str or Optimizer The optimizer to use. See - `help `_ + `help `_ on Optimizer for a list of available optimizers. optimizer_params : dict Key-word arguments to be passed to optimizer constructor. For example, From 31104c9d4b050883467f45f8bf9a164acb93976f Mon Sep 17 00:00:00 2001 From: Haibin Lin Date: Tue, 6 Feb 2018 15:22:53 -0800 Subject: [PATCH 08/18] Update NEWS.md --- NEWS.md | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/NEWS.md b/NEWS.md index 920063ae31fb..a51b514c1a51 100644 --- a/NEWS.md +++ b/NEWS.md @@ -12,6 +12,7 @@ MXNet Change Log - Fixed custom op multi-GPU scaling (#9283) - Fixed gradient of gather_nd when duplicate entries exist in index. (#9200) - Fixed overriden contexts in Module `group2ctx` option when using multiple contexts (#8867) +- Fixed `swap_axes` operator with "add_to" gradient req (#9541) ### New Features - Added experimental API in `contrib.text` for building vocabulary, and loading pre-trained word embeddings, with built-in support for 307 GloVe and FastText pre-trained embeddings. (#8763) - Added experimental structural blocks in `gluon.contrib`: `Concurrent`, `HybridConcurrent`, `Identity`. (#9427) @@ -26,7 +27,7 @@ MXNet Change Log - Added `lazy_update` option for standard `SGD` & `Adam` optimizer with `row_sparse` gradients (#9468, #9189) - Added `select` option in `Block.collect_params` to support regex (#9348) - Added support for (one-to-one and sequence-to-one) inference on explicit unrolled RNN models in R (#9022) -### Depreciations +### Deprecations - The Scala API name space is still called `ml.dmlc`. The name space is likely be changed in a future release to `org.apache` and might brake existing applications and scripts (#9579, #9324) ### Performance Improvements - Improved GPU inference speed by 20% when batch size is 1 (#9055) @@ -35,6 +36,7 @@ MXNet Change Log - Improved batching for GEMM/TRSM operators with large matrices on GPU (#8846) ### Known Issues - "Predict with pre-trained models" tutorial is broken +- "example/numpy-ops/ndarray_softmax.py" is broken For more information and examples, see [full release notes](https://cwiki.apache.org/confluence/display/MXNET/Apache+MXNet+%28incubating%29+1.1.0+Release+Notes) From 8cc5e97b95421692fe20c7c4575db48564761dda Mon Sep 17 00:00:00 2001 From: Haibin Lin Date: Tue, 6 Feb 2018 09:25:50 -0800 Subject: [PATCH 09/18] Update NOTICE (#9706) --- NOTICE | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/NOTICE b/NOTICE index a12b99f5b593..98321cba7c07 100644 --- a/NOTICE +++ b/NOTICE @@ -1,5 +1,5 @@ Apache MXNET (incubating) - Copyright 2017- The Apache Software Foundation + Copyright 2017-2018 The Apache Software Foundation This product includes software developed at The Apache Software Foundation (http://www.apache.org/). 
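For reference, the `gluon.Trainer` whose docstring PATCH 07 touches resolves optimizers by name and forwards `optimizer_params` to the optimizer constructor. A minimal hedged usage sketch, not part of these diffs:

```python
import mxnet as mx
from mxnet import gluon

net = gluon.nn.Dense(1)
net.initialize()
# 'sgd' is looked up in mxnet.optimizer; the dict is passed to its constructor.
trainer = gluon.Trainer(net.collect_params(), 'sgd',
                        {'learning_rate': 0.1, 'wd': 1e-4})

with mx.autograd.record():
    loss = net(mx.nd.ones((4, 2))).sum()
loss.backward()
trainer.step(batch_size=4)
```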
From 4878fde51ba2b5c4cd0b59d7d1208319340def5a Mon Sep 17 00:00:00 2001 From: Sheng Zha Date: Wed, 7 Feb 2018 10:07:58 -0800 Subject: [PATCH 10/18] revert acc changes (#9731) * Revert "avoid per-batch blocking in metric (#9636)" This reverts commit 3fe694e7b1ed7fa6a2dcfeddeac44c14ab77b015. * Revert "proper flatten in acc (#9619)" This reverts commit ed823b2e187eb859d9475eb651465edf714c6c5f. * Revert "use nd for accuracy calculation (#9583)" This reverts commit f5f1b91ff972ad70e9131d3cd1d7408ddddb7684. * keep doc change --- python/mxnet/metric.py | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/python/mxnet/metric.py b/python/mxnet/metric.py index f1cdae26a235..8bb3f6ee0a81 100644 --- a/python/mxnet/metric.py +++ b/python/mxnet/metric.py @@ -391,16 +391,13 @@ def update(self, labels, preds): for label, pred_label in zip(labels, preds): if pred_label.shape != label.shape: pred_label = ndarray.argmax(pred_label, axis=self.axis) - pred_label = pred_label.astype('int32') - label = label.astype('int32') + pred_label = pred_label.asnumpy().astype('int32') + label = label.asnumpy().astype('int32') check_label_shapes(label, pred_label) - if pred_label.context != label.context: - pred_label = pred_label.as_in_context(label.context) - - self.sum_metric += (pred_label.flatten() == label.flatten()).sum().asscalar() - self.num_inst += numpy.prod(pred_label.shape) + self.sum_metric += (pred_label.flat == label.flat).sum() + self.num_inst += len(pred_label.flat) @register From e49a1de9167c1c6041c3e7c1635eadc3e34534fb Mon Sep 17 00:00:00 2001 From: Yizhi Liu Date: Wed, 7 Feb 2018 10:10:20 -0800 Subject: [PATCH 11/18] PGP keys add liuyizhi AT apache.org (#9728) --- KEYS | 59 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 59 insertions(+) diff --git a/KEYS b/KEYS index 5e5769be542e..77710ad5c440 100644 --- a/KEYS +++ b/KEYS @@ -423,3 +423,62 @@ fDiLw4372qAC4NpSLRpCIBbT33VztUOTmZgIg4zJiQGSp89dEVN8OUT/yjKQps39 G4kKPLx5+UhNtHsaNA== =ZoTi -----END PGP PUBLIC KEY BLOCK----- + +pub rsa4096 2018-02-07 [SC] + F42C1A6E634C105E8D985105CA751254E97B9FE4 +uid [ultimate] Yizhi Liu +sig 3 CA751254E97B9FE4 2018-02-07 Yizhi Liu +sub rsa4096 2018-02-07 [E] +sig CA751254E97B9FE4 2018-02-07 Yizhi Liu + +-----BEGIN PGP PUBLIC KEY BLOCK----- + +mQINBFp6j2ABEADO07fnGhxTkPfmRsJS65Cif6ywUVRl2ZXKi/N7DjKJdl+Ej5lG +Oaw5cExaP0RD5iT5ZCAzfUS7UFULybEcbqgnm/RzaCrz9mx3gLa8Jx9XncagwJQU +9GbvJxzlX8itgY9vezK1q7Ec/iwCA66suzLeY8cA68EvWwmjR/1WlE9W/gov9mSl +Cu7QRIP9DuUHyL9ZZtYTwYTKsSaRCTv42xvkxAQ/ifinYZn31uQmW41Gqt2YFNWl +fp1uA97dmyAKcIeOCkvpQyChspJLIcA5lQrH6RV+oyuhRoSw/ZXPwdRXS1+79arC +e1vsMUeZPzkzSXcrpkzROOzVx1WlXR8WYcWtaXkPsgn7Icym0ngnwRbuY0JACT2F +8MWgBlC3LQj0mrhCcr26v9ettcmeulmuY/WLIi9oDtgq2yHSbz2na+qbPRd5vDS1 +i+nD62xvejZXC7xInaoyB0f6QYpgXQKyEFO/uCUGFXCBwYAPe4XwkNozR1GmOTKP +9ZnriJax/BIPva5JiqK5pkqOxuiGuPNdW7Bj/HvQr7F1s5LoG+Q8YSVo/KEJ2oo3 +IwU6FWZwq+KY/bVTO3fRCBD7Fgu8Eu9zw8ANIvpuq+BDo3yoUCoS83Ok+favH0K/ +jwBVFyu+/rnJc4wn7Px9/zdniaSTuxK6pAyTiUtVy6Gp73Roik5Dhu3nqwARAQAB +tB9ZaXpoaSBMaXUgPGxpdXlpemhpQGFwYWNoZS5vcmc+iQJOBBMBCAA4FiEE9Cwa +bmNMEF6NmFEFynUSVOl7n+QFAlp6j2ACGwMFCwkIBwIGFQoJCAsCBBYCAwECHgEC +F4AACgkQynUSVOl7n+TMiA/+LB2vDz8ZzMRTwlnWxxhiKU8+P5QEvC7sgwg6REiq +jEfo+Abcf/erRzMSnX0G4G6xauty4NDtieUI3X/mDKUS96yqo8Ij5NO02ltI0isG +6edlyyjrs01yiGHNKjTDkU1f1Af+wW8/h9By6cf2x4u9VWfSUjzwkcrr1qorP0AU +1cSXVDJNxnKKHbdsBlVC7UkUX1ZMBQq3inFIox5y1cSL34joUGRcyFtqZDoTvYMI +ZgAiJJw1JmpQU2bte3T1/70j6za81/09ev/kN9HIfeK2Mh0IVTttvBdggmQZHhKq +5tL70v93RUoCaRmJCvyUTaSe1o57phzOeUj8FmFhvqugnrtfYaygdvjrOZYXo5R1 
+8jXiQG0lNQPuxh9Vr6dca85aP12yB6kK8/d+09PaEtirqwW32YcoNeiHtPWvEIas +tcO+bAE6OKFWHE+3mYKr2m4hAH6CWDOa+x6p9JyciTKxEgaaXcj/q458r2S79iMe +JknzLKw9zLPjHAm0tb45x893xnjNSSDd8DhjwwwZKCt/pZs2E0pyp08DF1a8uCId +oQ0s4eo5Yr7tJxGpAWgd/VcrlHBmmGdqdMMUhS02BjuyVDXc+T3fbE1a5QIpHoqj +l7lyeY+VLnOUt9Y+RyWKsDONsB3QcuMRaQQWGf7eeILMIZ+Y33qpt0/55qLbzsEY +/265Ag0EWnqPYAEQAMLE3QGCRBZU4nGKyOIpIsWpolG8f5vnAZJwsC6g4ya3odsH +uUknDo7Puhp7RCIxHuEtSBTf+20nFifX7GCgHAKn/mGWDk9mNWmsGpVzXcHNO0TK +Tod6V9FE5SC3CVggK8U1PesXh0PoV2AMWq1AmzWJyivHFRefuPilu+NVRE/Mj6ZW +bs3ApixMml/0S1Y7L5btNjG1DCZbs6i70nSuUXXXM/D0jkCYljYf8wtruzj1MN97 +NZP2nvGjyBkGw9tNxyWYirZ5jJOlzbee4rags9agxETrZ4z9S3QAFcQaKNI32Hyu +SJELgIcx5U/uB2f19GQX/33kk26OrTAW6INUCRK6ji2y0F8IxfrHd0WXj/RFrV/o +kQyEai5x8oC1+Rik62CEnI9EfL/WU/toHtSeFBfNrtTKa3WiXnQDfHmJBe1wfvOm +M3QjH2ApPBwUXXblm7wBCPEjQJs+B0FIrlpJdN+KaGMMSHsz90f9QMF6GH/pgDPG +7K1IBsP3ZqDzJi7CLnLTAf0FreLuKLix349Y4X603uNd6Fx6vK3BGWB3ZyH7D1vC +MBBytDdb66nmQQ3QZjJBU8FCGuBwd8q32bVKbIOQTQiMUbUGe3xZozC82mB3glEU +CO46OElD0j56GC3XGVK4utPexIX9hcQ+uSXStrwhgHd76/iFCsb1F9wR16EVABEB +AAGJAjYEGAEIACAWIQT0LBpuY0wQXo2YUQXKdRJU6Xuf5AUCWnqPYAIbDAAKCRDK +dRJU6Xuf5KqtEADHxHPTbl1lT/QZZ+Y+SSuDpPF4uMjUP1TPyt6LGK9O/C0raIxa +bpCtuit9VPwcubH/krVQxqIkje1rI6kjl/+krrwnnNhjUozoQh4y0e90atgu9pho +QGjb12vhl5P95OB/YX8ZRJ2Bt7aSTfZiUUbL0OwwgontgLFNyz9/FNp/9eSrxOco +Mazkt5D6SrW0IBW9l5SZeNDc9yYw0CMg/5YZ5Rv++APgXHWc/WjuDMHje7hi2VFM +12VXF+gWQZy842n5IQzRPx7Pav32iByN00qKLNUUIwgoEQwZMStC9xjooGSmqOVU +WnMYBLiUgNTySgOhu73hZVo8VNpOseatlaIRGC2ukn8AF5TlXMKf7O9L24x6bp3B +d7M5KUNCUDgwn0mjVjsGEcT41Rc9XtglB7aLTiKhE/LqGi1f+BQolr6nGLEQ+oVu +b3bqratmjAE7Pw7Byzup78JPVMt8vNdjwGYg3yHW4atLS1qUQ9VNYo2l4b+DxcCv +FxV/mAfa+07j1Z9Ep4/Pw35uanSfOo0ylGmHp/h9yh27vrF1EzwshB7DlJoo5Kfn +IxR3jVTKye+UerEtN8yATW8CRIKO3IobUfLMDdPCLO7uzoW95cI35Y0l8JgK2NeU +6tVZptP5mDogeAbq8PlimrXuzG9Bokct2SOO6Z51i6rSDo/ALj440EvWNw== +=E4W5 +-----END PGP PUBLIC KEY BLOCK----- From ead9b1af1f48d6bf4430ab96c60a085caa46088a Mon Sep 17 00:00:00 2001 From: Chris Olivier Date: Wed, 7 Feb 2018 14:34:39 -0800 Subject: [PATCH 12/18] Add my key (#9736) --- KEYS | 112 +++++++++++++++++++++++++++++------------------------------ 1 file changed, 55 insertions(+), 57 deletions(-) diff --git a/KEYS b/KEYS index 77710ad5c440..fa885848d948 100644 --- a/KEYS +++ b/KEYS @@ -304,66 +304,64 @@ IjljtjhIMhMLB5rf8BPCZ6og5fKqUF5LOp8DujG2DGa9ZhYWTzOO/UGZP60qGTot GZZVNUU0hQYfulYDY5E8fJ4Olzpf5OE= =WmLB -----END PGP PUBLIC KEY BLOCK----- -pub rsa4096 2017-11-21 [SC] - 331E9A5ED727FADD429B2894F2F1EAB589EBCFB1 -uid [ultimate] Haibin Lin -sig 3 F2F1EAB589EBCFB1 2017-11-21 Haibin Lin -sub rsa4096 2017-11-21 [E] -sig F2F1EAB589EBCFB1 2017-11-21 Haibin Lin +pub 4096R/C622DF82 2017-11-29 +uid Chris Olivier (CODE SIGNING KEY) +sig 3 C622DF82 2017-11-29 Chris Olivier (CODE SIGNING KEY) +sub 4096R/A70D0AA4 2017-11-29 +sig C622DF82 2017-11-29 Chris Olivier (CODE SIGNING KEY) -----BEGIN PGP PUBLIC KEY BLOCK----- +Version: GnuPG v1 -mQINBFoTp3YBEACiGa++rsTjQal+33xADuWxzN9L8bTkMu4uFJqYvyNP2z1Q0fcM -DFjLJcvsc3ODSlkDGlkrtFpYlqkBTFERABU19TcAQ5FYFu1uULUybtHm55h6OKAm -1qfSRcKvdidDRytf7XAnhK/jvjtY71EQZUz2OtvKj0p93C22JcaJasKjHEF+8Jv0 -1rvV4BsZcY3hl9ORbv+nvBB6PX6zkpfhh0edVl50yzJEM34dtBZ1CTVlcJhIj0yo -LEZkt+zKEz5C3/D5OgM2DoclUInAvPeIGXvOgoQi9he4YjMppC3fmcA9O+sJ8XFh -dqNxcI+ddcvg84g4ntC2iJb8OOX75xkkoIsJXhZgwxBbdnwINNY6Eqqyx2lMvGRI -BLTSxLKsfX/mCmW9mwNrKxfrBIb107ldxwfo+13/Vh45nIlhM0yxfhlukHmYEHp+ -G+T+aD67t0HHZHr27M2x0qTdKkRoI+7xYTUvu+OmObJej48UDhi4GMAjQ61TeLm1 -OyetyMoKpB+Cah1n0O5j6nDPRJBS9OPi361DIZRhlg4IkrbIP5MHs+Zvof8O04xq 
-GRfYAqEhT6rP98TidpHVhFEV3CrDLVDJLZ3Vqglj2iyNOjEjF1GJJBaFWUoXhKPs -WVZMfgpkaXRwng6r6ieRmmt/Ci//JV6ztkwKk7e0OQJBqbwA0A7lqx7j2QARAQAB -tCVIYWliaW4gTGluIDxsaW5oYWliaW4uZXJpY0BnbWFpbC5jb20+iQJOBBMBCAA4 -FiEEMx6aXtcn+t1CmyiU8vHqtYnrz7EFAloTp3YCGwMFCwkIBwIGFQgJCgsCBBYC -AwECHgECF4AACgkQ8vHqtYnrz7GFWA//Z6YTxtlZSHFlqkAFFOsDtV3DghSC8zJe -LRm508fZn53e9a3fUvT9U1sUfW8DI69GRK+IBkvP5hcmMb1U4N3MxzX4YC/13wMY -3BtUbCIpD8uBJOtuC7fPAH//Ij/4wv4Fp1/3WL6y04+mJIayMyKqmc3nBLD0rVWC -AHEsPR7tiDDMltrzxMNHIJCDaiClJzKiCrQ4owKBOnY2TU/E64xyk5IwAczz2lCY -712h6+q2mO7F672Yt6b6pqmugnFqWdqUj9dx1V9x//4y/k0DefF7G/1Lk1lh4Eyo -aUx3jve/74Y87ICW1AhR2/TvdfWbsAkPyfy98k1SLR/9BulSIXIFeduxaFl7M3D8 -98aB5pqO8tPl2BFUJwh/uywDx0994MjQ8Xvrjmb9WJOAx9OyokivVCvmqJOkBzve -Fk/4KUHTFTGQCoXbbBlIQTC9hBd8c1S4t0gFGbcjlqTvr/ZnTdpSgbzZ/96/SVRm -dYOgjjpkrBOZgJPwsmmRQ2MufeZUtmkFSqdIRLGBNTefsMDDCGvyNeR/XCgM5Zfy -39PX/GHFKgq5Ei2ywEyZOGLCK5MwA12fMExYoedazFFjv6ApGpz+j831A2z/crEo -bRpVvd+rFzGnCKDq5viUD7cRzIPLVltYCNEayEgWta4KI+00/ayaaT6sM7N7oM32 -r01Wv02FvdG5Ag0EWhOndgEQAPiiTvmo9fZNW/5IxL7kDR6u9FEmEb2EZI+KxzbN -RYYY0IPsnA8TY9Rzj9D7xV8Vmf2Pd5SUyCtVwLfBKhadLh755NeehNXWIbW802gH -bvbykL/Zcn98oiLOVfK/Op/6MVpDuGXZ6CpDbQDSn6ne6/CWQnoz1+Wo+wbs1TOy -AhO6xKa20NtGIZrfZD01dSzRC5DMJD3GK1j6HdVUz5piwiTsGvGRJ3ZLfObdlHGn -CTMA39Jb8zQ0QtWPsOre0Nz2JQ53awMBaUhan5MeoOYp6ccsgD1BigyxmKb8iIDN -NM/Iwi0Ib5L4AiGh6fQFf0WF8p74yIn1WgFcWxJXR1ZzvMDDHXqq97SQtbr9FKhu -xrceh/92Ga4ruAJRCbMtmOTUP4APTeT4csANdgJxtW+I4QAp01BQSl75pB2QDlam -+tqePQDboAGc78Ck6096wML0ZMKDDxXPrI67uppuM02FYuJ41ZQjOytigeoGS88g -ByZwPcFIT+5XgtNC0BH7U9VIkiap5U00lykzEjcRjrZTtKqHdeFPbSEpv1QfIcLG -Ra439g9acRHX82sVzhzZk5uu9QKyDN1EpuWoLOaOrICHcMSC7GkVXS8+/7TX0vAN -vn/51fb+tHJekGfaPhsPuIbSba2kmUy8sSS/6JJHkJ1aEFigAPbwUbZTqNlb4IRm -FBVBABEBAAGJAjYEGAEIACAWIQQzHppe1yf63UKbKJTy8eq1ievPsQUCWhOndgIb -DAAKCRDy8eq1ievPsbrpEACQ8HqAvq3NuiM00WyHla7VtghCWVEmRozbYc4dR7u+ -sTQrVgbLfgR5zeSWCMHpEcaN/RS58O/i1Dk0DLHTu3NrarzrkEPlHwIgJQ7orxFD -YW3Z2Ytk40uKex4ou/8VzvXTpj1u8d/GHgGdvChBmtw5FaMgc8PBi4FnlIS5cAGU -1ca1RwMX0WpFsp9HgrQLVxgkDs/m7oRSmC5GvPDIpb5S9QFzJKYKTJxSfXXO6hCk -FGAGHWjVC26a/wSUtZQfb3G9sYZJuKUOwr4tpz1y6Ronc34cZYi1FlKWJuz01w4s -4PKjFG/wbYSd+QLfftyyVPMLdY+wCwc8O59QqKx5Rj8HQLxIwSL3chhmdAHCmejM -zKCpkFyLOc6+Wjet6hD6X3EsjIee1AAy22D24EaLJsju9zR/khJFS4K76aQX7dYN -aB3C7S5HGxvYGSqfnn4eBaEzrSOde7HEcqYpYKxS+jB1c4X4W91NSTsqDd0QJMVF -35eKfhWj+X6jWIC+48kfzypXdOCnPbto7wrr40yYCHw3XSXj40H5dWSsWEZVmS+s -Dzz6zy9maHVyXa/rNsL7OjqimtKad65r/wfSFPPIcR1jJfP4GMNHV0TYqxdyDaXg -iEVpHzOV7gd75fJbOvoNxNZj20Yj5sg8OCwbv8PxLXEcBFs7hhjQMhVRsjpNYzAR -Iw== -=rMlc +mQINBFoe7TkBEACjmUNxxx+nvLv7+NO+J/dkzR+/imuMPp6Op1wP6dXWQ+gj5n9V +ILMaGQcFp6LLaoFZQgGjFVsM6FZ4d92yT9pIbXg0VNCGGfmnQEQEwE3ZWdeQeOdz +tbAqOiX/ZSw6C7gsUivMKBbMY3iJ46dgE2eqYD7aRyOYbP2mgX4mkmez/05bM48n +8cuLj6dZgt8DduUwabYhrm2/xpSpaHnYKqwE3E0bhek0EEZ9Plmo6c+qW0v01iw0 +Y9EJQgA1Ulayh8Ub4JX1ZlqP/zOCmNB1tnNS7Y93sEDVZ942T/f+xX2Myk71dq4j ++Fm+q9V62Xu+zrHgdsMj8G84bsR8iGRmJHI0ZpjRN3e8Uvh+r4+h4UhFiSO3MP8G +zX7BJvckOp0/zRKeMuq4UFBL+6lgYJWiN5KyG41v4zNiuuoNL9DE9brIRfwwrFxX +6BCab/wjp0emCMjESnw2P6NT8tan/zRZWlsymJRtiG6NdveN++9g7wNrg2lT693m +8Dkbe2BXAXcgw6tcJg6kSgzLByMzCv4VOm+k36GmViq5RQs++BTLAWDPea4g2PyZ +GSKygTLzNIEYMgOs6+6+qHaWfZNdGXcnmE+gRgqBt7DOHi2BOw+UVdp5VtwmD/w8 +TH1vFdjzV+PKdDWEiv4D1UYgozA/YKnNK2brxa547JrMwwwHg76Qe+gM1QARAQAB +tDlDaHJpcyBPbGl2aWVyIChDT0RFIFNJR05JTkcgS0VZKSA8Y2pvbGl2aWVyMDFA +YXBhY2hlLm9yZz6JAjgEEwECACIFAloe7TkCGwMGCwkIBwMCBhUIAgkKCwQWAgMB +Ah4BAheAAAoJECHz+avGIt+CWZcP/08h6cvwLlyfMXolOyHIS/qXs+NAoLyIc3eL +1aDXdI88pawjH9cYgHEop95Wz9yhsDoGmGNYTQ0C92h+3/LlU4DvwElKtk/8sSbN 
+sHwDPU1v39PpP4YZsxE5+KALk+0wuA8//1XGgxC6kSH6Nqo5UUC+Wr1WhbR8pZv5 +C/q1PtiqECQadRNXMEs/cKM8ahOA/radvjxgrTFSaNjmFdZC/YIy1Od5dX5Vtzk9 +VKSdOsGXbwoVUIFx9g3iA7Am+8KeZBX10LNjsWrhuKynUEbZ2w33OOM+ZtDH/Zb1 +j+gKfAPKqoqSJQaFrFTryLsaols8bjVJ44/E/32ps1QtCJDdhQk1+52C8+6javbt +YB6FaYItkM87cUtQbeVuNZ+MfSzNaapuWH1ScAAAbe5igWJmL/9tALafvx9WZ6AG +bJzoaSGff0q19xpkoI0bLdGj30+HMdqgA9018yGBao4RvIzWLqjyZoe3bDcB+4Pe +QL04Nxx3Ona8W6mLdZCRZq0uMND9pkL0M1qdlA20j1VLG9RCr4nqk4APppHnJ9rm +7ZwiIi0O20TSzFS1dbRwpZQKjTONNCMFsDxYV6nHT6CCfyX85nndFXM+BQbCnsbJ +E/mzy6ivkLfqSRBh0vf63GnIrSu1CeKbZGkP5QDPMAbDenn9QSBG+U6pxYh0yjuM +g+QcnXZ1uQINBFoe7TkBEADHk435eYpwkhfa0iLhqDaqHzQTuddQ2G5kf8CjQusd +zngJ48cQ1iUAp5SHEOAq/wXLLmW63jPHfDbkj0pM3E2rUwOKgpYWjnBnzZweUuBv +I4M45XSUszRfjzgYXpzIGlu3B6ytq5rkM5drvaAm9qtQI8Xe6fwuokLsM38Z7mCV +JyZ7ah27HoCDZiLulZ+Zvy7b6DIC9hPZw+BDwAz2jYZHorzS3aShFd8qBDEgYBtj +3Sflr/8iZU/OAa6yuiwUtbZxLcfvE2CCRqS5UR2LhpoUzmLXrBTBSrAjMKDE93Dl +SrYKzHdTil9LxQUejlbO8ai8i57+KqaO9lWsiczP6Vcwn7ITkTWpc+ek/9gHvoyb +Tvd5axVkJIUT62SmFPZill1O6wOK1ITZVoC835xxKSvf/j+t5X0Np4cjY9iDT6h1 +9XmtTwl4RXDvsgDeYWAkOu/M164tvLZLbq5ep8/sllihDVKWyl7yWSDQdXevBuZh +WfOvnOv34F/+PkXtWuE10gvDOdp8lLgXglWlrx7VKb0C01r0L/KAMj6y2irOmLRG +4fakix+uc/VXoRQWjtOMo9w8n3kaTIRsGGf0v6CBWDjrTu7xHmOFLTcMr2DMgNj5 +DdqAC5Mya/xAliW5QQIli3FJ2Phkb3zd7nRr9+N06V77cRrqnyw4Cu9/6qPj3Q25 +ywARAQABiQIfBBgBAgAJBQJaHu05AhsMAAoJECHz+avGIt+CukUP/A+p/KulcEd7 +343GOG/Rlw0IQFZmo4VWHvzJ3+DHspsvP/mrApeQfc7NSjUY0jeCvkgBEBLAaKlD +I1jQ7zQmwONcPbdVMDsST90ILCeuWUsUG6B4EZzrOPKftEcpbqatyoHzDD+lZwgk +6RnR5vyVZMctHdxsOT5ewnmFUIAG+/3Er/nvmENBdPQ7DaePgER/3NyhFrPycnT2 +4KzdA2R4Q44F4KZ2f/Qo4afKi7FDBYaJqk1PNiN4X9+60DqVUVqfutev+0k26Zou +LddJRogXdkZtdcTAFEbTHAj0h5QdY/max9TKKiiLGy6FHd+UXUmawCTtHV5V3dO3 +SrcxQKb2cHK2TQbciCifujPYRfsfwBwPs8ufbA1yauV23UEjrnv4T3tCilL0nnJc +281NOzJG6qI6Pz82FA0oC+ISS5lVfH3akpr6A9VGPttXlD9z7ixjZIRUGRTjTTdE +VEqLKiyrEdWKiqhpdViGpGKn34GZgOsgwToe7tHyYdhLMeD+RknRuFruOgbP4ji4 +KcgcfFIMGxMAEhklBK0Q7CJWP6h51U+PTAbWxKQ1dxvvfmKuuSOfMHqPjm9kFlNd +xglurVnmS7UpZmfQx061L9clHAqNyy7L/4ZvMTsm3T8g/rJI4JRxYhbW0b3nPlN3 +eXjHA3OoJ8lc9pGCShw56YNir2ieQ57S +=dkF7 -----END PGP PUBLIC KEY BLOCK----- - pub rsa4096 2018-01-28 [SC] 7302629A6791AC2C3593B9A0015ED8A29C815704 uid [ultimate] Haibin Lin (CODE SIGNING KEY) @@ -423,7 +421,6 @@ fDiLw4372qAC4NpSLRpCIBbT33VztUOTmZgIg4zJiQGSp89dEVN8OUT/yjKQps39 G4kKPLx5+UhNtHsaNA== =ZoTi -----END PGP PUBLIC KEY BLOCK----- - pub rsa4096 2018-02-07 [SC] F42C1A6E634C105E8D985105CA751254E97B9FE4 uid [ultimate] Yizhi Liu @@ -482,3 +479,4 @@ IxR3jVTKye+UerEtN8yATW8CRIKO3IobUfLMDdPCLO7uzoW95cI35Y0l8JgK2NeU 6tVZptP5mDogeAbq8PlimrXuzG9Bokct2SOO6Z51i6rSDo/ALj440EvWNw== =E4W5 -----END PGP PUBLIC KEY BLOCK----- + From 6deba93e3f6a118c11eee8aa678bd0235b15748a Mon Sep 17 00:00:00 2001 From: mbaijal <30911248+mbaijal@users.noreply.github.com> Date: Wed, 7 Feb 2018 16:17:31 -0800 Subject: [PATCH 13/18] [REVIEW REQUIRED] Revert PR #9484 & add additional dependency licenses to LICENSE file (#9701) * Revert "[Review Required] Fixing Licenses: Cleaning up the Top Level LICENSE file (#9484)" This reverts commit 8930d96b265560a797c5554a9617f607cea7740f. 
* Some more LICENSE fixes * Adding some more packages to the LICENSE file * Adding dependencies of dependencies --- LICENSE | 312 ++++++++++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 278 insertions(+), 34 deletions(-) diff --git a/LICENSE b/LICENSE index d3b3d6f9dd0f..e7d50c377232 100644 --- a/LICENSE +++ b/LICENSE @@ -201,43 +201,145 @@ See the License for the specific language governing permissions and limitations under the License. - ======================================================================= + ====================================================================================== Apache MXNET (incubating) Subcomponents: - The Apache MXNET (incubating) project contains subcomponents with separate - copyright notices and license terms. Your use of the source code for the these + The Apache MXNET (incubating) project contains subcomponents with separate copyright + notices and license terms. Your use of the source code for the these subcomponents is subject to the terms and conditions of the following - licenses - - - ======================================================================== - 1. Apache-2.0 license as above, wherever applicable - ======================================================================== - - ======================================================================== - 2. MIT license wherever applicable - ======================================================================== - Permission is hereby granted, free of charge, to any person obtaining a - copy of this software and associated documentation files (the "Software"), - to deal in the Software without restriction, including without limitation - the rights to use, copy, modify, merge, publish, distribute, sublicense, - and/or sell copies of the Software, and to permit persons to whom the - Software is furnished to do so, subject to the following conditions: - - The above copyright notice and this permission notice shall be included - in all copies or substantial portions of the Software. - - THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR - OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, - ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - OTHER DEALINGS IN THE SOFTWARE. - - - ======================================================================== - 3. BSD License wherever applicable - ======================================================================== + licenses. + + ======================================================================================= + Apache-2.0 licenses + ======================================================================================= + + The following components are provided under an Apache 2.0 license. + + 1. MXNet Cpp-package - For details, /cpp-package/LICENSE + 2. MXNet rcnn - For details, see, example/rcnn/LICENSE + 3. scala-package - For details, see, scala-package/LICENSE + 4. Warp-CTC - For details, see, src/operator/contrib/ctc_include/LICENSE + 5. dlpack - For details, see, dlpack/LICENSE + 6. dmlc-core - For details, see, dmlc-core/LICENSE + 7. mshadow - For details, see, mshadow/LICENSE + 8. nnvm/dmlc-core - For details, see, nnvm/dmlc-core/LICENSE + 9. nnvm - For details, see, nnvm/LICENSE + 10. 
nnvm-fusion - For details, see, nnvm/plugin/nnvm-fusion/LICENSE + 11. ps-lite - For details, see, ps-lite/LICENSE + 12. nnvm/tvm - For details, see, nnvm/tvm/LICENSE + 13. googlemock scripts/generator - For details, see, 3rdparty/googletest/googlemock/scripts/generator/LICENSE + + + ======================================================================================= + MIT licenses + ======================================================================================= + + 1. Fast R-CNN - For details, see example/rcnn/LICENSE + 2. Faster R-CNN - For details, see example/rcnn/LICENSE + 3. tree_lstm - For details, see example/gluon/tree_lstm/LICENSE + 4. OpenMP - For details, see 3rdparty/openmp/LICENSE.txt + 5. HalideIR - For details, see nnvm/tvm/HalideIR/LICENSE + + + ======================================================================================= + NVIDIA Licenses + ======================================================================================= + + 1. Moderngpu + For details, see, src/operator/contrib/ctc_include/contrib/moderngpu/LICENSE + + /****************************************************************************** + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + + 2. CUB Library + For details, see, 3rdparty/cub/LICENSE.TXT + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of the NVIDIA CORPORATION nor the + names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. 
+ + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + + + ======================================================================================= + Other Licenses + ======================================================================================= + + 1. Caffe + For details, see, example/rcnn/LICENSE + + LICENSE + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + + 1. Redistributions of source code must retain the above copyright notice, this + list of conditions and the following disclaimer. + 2. Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + CONTRIBUTION AGREEMENT + + By contributing to the BVLC/caffe repository through pull-request, comment, + or otherwise, the contributor releases their content to the + license and copyright terms herein. + + ======================================================================================= + + 2. MS COCO API + For details, see, example/rcnn/LICENSE + Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: @@ -262,6 +364,148 @@ of the authors and should not be interpreted as representing official policies, either expressed or implied, of the FreeBSD Project. + ======================================================================================= + + 3. Sphinx JavaScript utilties for the full-text search + For details, see, docs/_static/searchtools_custom.js + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. 
+ + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + ======================================================================================= + + 4. FindCrypto.cmake + For details, see, dmlc-core/cmake/Modules/FindCrypto.cmake, + Redistribution and use is allowed according to the terms of the BSD license. + + ======================================================================================= + + 5. Googlemock + For details, see, 3rdparty/googletest/googlemock/LICENSE + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above + copyright notice, this list of conditions and the following disclaimer + in the documentation and/or other materials provided with the + distribution. + * Neither the name of Google Inc. nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + ======================================================================================= + + 6. Googletest + For details, see, 3rdparty/googletest/googletest/LICENSE + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above + copyright notice, this list of conditions and the following disclaimer + in the documentation and/or other materials provided with the + distribution. 
+ * Neither the name of Google Inc. nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + ======================================================================================= + + 7. OpenMP Testsuite + For details, see, 3rdparty/openmp/testsuite/LICENSE + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + o Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + + o Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + o Neither the name of the University of Houston System nor the names of its + contributors may be used to + endorse or promote products derived from this software without specific + prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED + TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + ======================================================================================= + + 8. Semaphore implementation in blockingconcurrentqueue.h + This file uses a semaphore implementation under the terms of its separate zlib license. + For details, see, dmlc-core/include/dmlc/blockingconcurrentqueue.h + + ======================================================================================= + + 9. blockingconcurrentqueue.h + This file is Distributed under the terms of the simplified BSD license. 
+ For details, see, dmlc-core/include/dmlc/blockingconcurrentqueue.h + + ======================================================================================= From 07a83a0325a3d782513a04f47d711710972cb144 Mon Sep 17 00:00:00 2001 From: Sheng Zha Date: Fri, 9 Feb 2018 11:38:27 -0800 Subject: [PATCH 14/18] update navbar model zoo link (#9749) * update navbar model zoo link * update --- docs/_static/mxnet-theme/navbar.html | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/_static/mxnet-theme/navbar.html b/docs/_static/mxnet-theme/navbar.html index bcf87f1ca7f4..bcc89896e40a 100644 --- a/docs/_static/mxnet-theme/navbar.html +++ b/docs/_static/mxnet-theme/navbar.html @@ -34,7 +34,7 @@

  • FAQ
  • Architecture
  • Examples
- • Model Zoo
+ • Gluon Model Zoo

@@ -83,7 +83,7 @@

  • FAQ
  • Architecture
  • Examples
- • Model Zoo
+ • Gluon Model Zoo
  • Architecture
  • From 9f387cffe7dc5fbfa5e74ac780df272f4686f13d Mon Sep 17 00:00:00 2001 From: ZiyueHuang Date: Sat, 10 Mar 2018 13:44:41 +0000 Subject: [PATCH 15/18] initial commit --- src/operator/optimizer_op-inl.h | 106 +++++++++++++++++++++++- src/operator/optimizer_op.cu | 75 +++++++++++++++++ tests/python/unittest/test_optimizer.py | 16 ++-- 3 files changed, 188 insertions(+), 9 deletions(-) diff --git a/src/operator/optimizer_op-inl.h b/src/operator/optimizer_op-inl.h index 60981aa6d2e2..5a434250a177 100644 --- a/src/operator/optimizer_op-inl.h +++ b/src/operator/optimizer_op-inl.h @@ -787,6 +787,73 @@ struct AdamDnsRspDnsKernel { } }; +template +struct AdamDnsRspDnsKernelV1 { + template + MSHADOW_XINLINE static void Map(int i, const nnvm::dim_t row_length, DType* out_data, + DType* mean_data, DType* var_data, const DType* weight_data, const IType* grad_idx, + const DType* grad_data, const DType clip_gradient, const DType beta1, const DType beta2, + const DType lr, const DType wd, const DType epsilon, const DType rescale_grad) { + using nnvm::dim_t; + using namespace mshadow_op; + const dim_t row_id = i / row_length; + const dim_t col_id = i % row_length; + const dim_t row_offset = grad_idx[row_id] * row_length; + // index in data/mean/var + const dim_t data_i = row_offset + col_id; + // index in grad + DType grad_rescaled = grad_data[i] * rescale_grad + weight_data[data_i] * wd; + if (clip_gradient >= 0.0f) { + grad_rescaled = clip::Map(grad_rescaled, clip_gradient); + } + mean_data[data_i] = beta1 * mean_data[data_i] + (1.f - beta1) * grad_rescaled; + var_data[data_i] = beta2 * var_data[data_i] + + (1.f - beta2) * grad_rescaled * grad_rescaled; + KERNEL_ASSIGN(out_data[data_i], req, weight_data[data_i] - lr * mean_data[data_i] / + (square_root::Map(var_data[data_i]) + epsilon)); + } +}; + +template +inline void AdamUpdateDnsRspDnsImplV1(const AdamParam& param, + const OpContext& ctx, + const TBlob& weight, + const NDArray& grad, + const TBlob& mean, + const TBlob& var, + const OpReqType& req, + TBlob *out) { + using namespace mxnet_op; + using namespace rowsparse; + Stream* s = ctx.get_stream(); + if (!grad.storage_initialized() || req == kNullOp) return; + CHECK_EQ(req, kWriteInplace) << "kWriteInplace is expected for sparse adam_update"; + CHECK_GT(weight.shape_.Size(), 0); + CHECK_GT(mean.shape_.Size(), 0); + CHECK_GT(var.shape_.Size(), 0); + + MSHADOW_REAL_TYPE_SWITCH(weight.type_flag_, DType, { + MSHADOW_IDX_TYPE_SWITCH(grad.aux_type(kIdx), IType, { + MXNET_ASSIGN_REQ_SWITCH(req, req_type, { + const DType* weight_data = weight.dptr(); + const IType* grad_idx = grad.aux_data(kIdx).dptr(); + const DType* grad_val = grad.data().dptr(); + DType* mean_data = mean.dptr(); + DType* var_data = var.dptr(); + DType* out_data = out->dptr(); + nnvm::dim_t num_rows = grad.aux_shape(kIdx)[0]; + const auto row_length = weight.shape_.ProdShape(1, weight.ndim()); + + Kernel, xpu>::Launch(s, num_rows * row_length, row_length, + out_data, mean_data, var_data, weight_data, grad_idx, grad_val, + static_cast(param.clip_gradient), static_cast(param.beta1), + static_cast(param.beta2), static_cast(param.lr), + static_cast(param.wd), static_cast(param.epsilon), + static_cast(param.rescale_grad)); + }); + }); + }); +} template inline void AdamUpdateDnsRspDnsImpl(const AdamParam& param, @@ -854,10 +921,47 @@ inline void AdamUpdateRspRspRspImpl(const AdamParam& param, } TBlob out_blob = out->data(); // reuse dns rsp implementation when storage_shape == shape - AdamUpdateDnsRspDnsImpl(param, ctx, weight.data(), grad, 
mean.data(), + int version = dmlc::GetEnv("ADAM_VERSION", 0); + if (version == 0) { + AdamUpdateDnsRspDnsImpl(param, ctx, weight.data(), grad, mean.data(), var.data(), req, &out_blob); + } else if (version == 1) { + AdamUpdateDnsRspDnsImplV1(param, ctx, weight.data(), grad, mean.data(), var.data(), req, &out_blob); + } else { + LOG(FATAL) << "NOT IMPLEMENTED VERSION" << version; + } } +template +struct AdamStdDnsRspDnsKernelV1 { + template + MSHADOW_XINLINE static void Map(int i, const nnvm::dim_t row_length, DType* out_data, + DType* mean_data, DType* var_data, const DType* weight_data, const IType* grad_idx, + const DType* grad_data, const RType* prefix_sum, const DType clip_gradient, + const DType beta1, const DType beta2, const DType lr, const DType wd, + const DType epsilon, const DType rescale_grad) { + using namespace mshadow_op; + using nnvm::dim_t; + const dim_t row_id = i / row_length; + const dim_t col_id = i % row_length; + const bool non_zero = (row_id == 0) ? prefix_sum[0] > 0 + : prefix_sum[row_id] > prefix_sum[row_id - 1]; + const RType grad_offset = (prefix_sum[row_id] - 1) * row_length + col_id; + DType grad_rescaled = non_zero ? static_cast(grad_data[grad_offset] * rescale_grad + + weight_data[i] * wd) + : static_cast(weight_data[i] * wd); + if (clip_gradient >= 0.0f) { + grad_rescaled = clip::Map(grad_rescaled, clip_gradient); + } + mean_data[i] = beta1 * mean_data[i] + (1.f - beta1) * grad_rescaled; + var_data[i] = beta2 * var_data[i] + + (1.f - beta2) * square::Map(grad_rescaled); + KERNEL_ASSIGN(out_data[i], req, weight_data[i] - lr * mean_data[i] / + (square_root::Map(var_data[i]) + epsilon)); + } +}; + + template struct AdamStdDnsRspDnsKernel { template diff --git a/src/operator/optimizer_op.cu b/src/operator/optimizer_op.cu index 1bd6117432bf..049f65e395d4 100644 --- a/src/operator/optimizer_op.cu +++ b/src/operator/optimizer_op.cu @@ -94,6 +94,74 @@ void SGDMomStdUpdateDnsRspDnsImpl(const SGDMomParam& param, }); } +void AdamStdUpdateDnsRspDnsImplV1(const AdamParam& param, + const OpContext& ctx, + const TBlob& weight, + const NDArray& grad, + const TBlob& mean, + const TBlob& var, + const OpReqType& req, + TBlob *out) { + using namespace mxnet_op; + using namespace rowsparse; + using namespace mshadow; + Stream* s = ctx.get_stream(); + if (req == kNullOp) return; + CHECK_EQ(req, kWriteInplace) << "kWriteInplace is expected for sparse adam_update"; + CHECK_GT(weight.shape_.Size(), 0); + CHECK_GT(mean.shape_.Size(), 0); + CHECK_GT(var.shape_.Size(), 0); + + MSHADOW_REAL_TYPE_SWITCH(weight.type_flag_, DType, { + MSHADOW_IDX_TYPE_SWITCH(grad.aux_type(kIdx), IType, { + MXNET_ASSIGN_REQ_SWITCH(req, req_type, { + const DType* weight_data = weight.dptr(); + const IType* grad_idx = grad.aux_data(kIdx).dptr(); + const DType* grad_val = grad.data().dptr(); + DType* mean_data = mean.dptr(); + DType* var_data = var.dptr(); + DType* out_data = out->dptr(); + nnvm::dim_t num_rows = weight.shape_[0]; + nnvm::dim_t row_length = weight.shape_.ProdShape(1, weight.ndim()); + nnvm::dim_t* prefix_sum = NULL; + void* d_temp_storage = NULL; + size_t temp_storage_bytes = 0; + cub::DeviceScan::InclusiveSum(d_temp_storage, + temp_storage_bytes, + prefix_sum, + prefix_sum, + num_rows, + Stream::GetStream(s)); + Tensor workspace = ctx.requested[0] + .get_space_typed(Shape1(num_rows * sizeof(nnvm::dim_t) + + temp_storage_bytes), s); + prefix_sum = reinterpret_cast(workspace.dptr_); + d_temp_storage = workspace.dptr_ + num_rows*sizeof(nnvm::dim_t); + // mark row flags + Fill(s, 
TBlob(prefix_sum, Shape1(num_rows), gpu::kDevMask), kWriteTo, 0); + if (grad.storage_initialized()) { + Kernel::Launch(s, grad.aux_shape(kIdx)[0], + prefix_sum, grad_idx); + // calculate inclusive prefix sum + cub::DeviceScan::InclusiveSum(d_temp_storage, + temp_storage_bytes, + prefix_sum, + prefix_sum, + num_rows, + Stream::GetStream(s)); + } + + Kernel, gpu>::Launch(s, num_rows * row_length, row_length, + out_data, mean_data, var_data, weight_data, grad_idx, grad_val, prefix_sum, + static_cast(param.clip_gradient), static_cast(param.beta1), + static_cast(param.beta2), static_cast(param.lr), + static_cast(param.wd), static_cast(param.epsilon), + static_cast(param.rescale_grad)); + }); + }); + }); +} + template<> void AdamStdUpdateDnsRspDnsImpl(const AdamParam& param, const OpContext& ctx, @@ -103,6 +171,10 @@ void AdamStdUpdateDnsRspDnsImpl(const AdamParam& param, const TBlob& var, const OpReqType& req, TBlob *out) { + int version = dmlc::GetEnv("ADAM_VERSION", 4); + if (version == 5) { + AdamStdUpdateDnsRspDnsImplV1(param, ctx, weight, grad, mean, var, req, out); + } else if (version == 4) { using namespace mxnet_op; using namespace rowsparse; using namespace mshadow; @@ -161,6 +233,9 @@ void AdamStdUpdateDnsRspDnsImpl(const AdamParam& param, }); }); }); + } else { + LOG(FATAL) << "NOT IMPLEMENTED"; + } } NNVM_REGISTER_OP(signsgd_update) diff --git a/tests/python/unittest/test_optimizer.py b/tests/python/unittest/test_optimizer.py index 26ff48babcd4..b68c92ffdb76 100644 --- a/tests/python/unittest/test_optimizer.py +++ b/tests/python/unittest/test_optimizer.py @@ -428,13 +428,13 @@ def test_ftml(): class PyAdam(mx.optimizer.Optimizer): """python reference implemenation of adam""" def __init__(self, learning_rate=0.001, beta1=0.9, beta2=0.999, epsilon=1e-8, - decay_factor=(1 - 1e-8), sparse_update=False, **kwargs): + decay_factor=(1 - 1e-8), lazy_update=False, **kwargs): super(PyAdam, self).__init__(learning_rate=learning_rate, **kwargs) self.beta1 = beta1 self.beta2 = beta2 self.epsilon = epsilon self.decay_factor = decay_factor - self.sparse_update = sparse_update + self.lazy_update = lazy_update def create_state(self, index, weight): """Create additional optimizer state: mean, variance @@ -480,7 +480,7 @@ def update(self, index, weight, grad, state): # check row slices of all zeros all_zeros = mx.test_utils.almost_equal(grad[row].asnumpy(), np.zeros_like(grad[row].asnumpy())) # skip zeros during sparse update - if all_zeros and self.sparse_update: + if all_zeros and self.lazy_update: continue grad[row] = grad[row] * self.rescale_grad + wd * weight[row] # clip gradients @@ -520,7 +520,7 @@ def test_adam(): not kwarg['multi_precision'])): continue compare_optimizer(opt1(**kwarg), opt2(**kwarg), shape, dtype) - compare_optimizer(opt1(sparse_update=True, **kwarg), opt2(**kwarg), shape, + compare_optimizer(opt1(lazy_update=True, **kwarg), opt2(**kwarg), shape, dtype, w_stype='row_sparse', g_stype='row_sparse') compare_optimizer(opt1(**kwarg), opt2(lazy_update=False, **kwarg), shape, dtype, w_stype='row_sparse', g_stype='row_sparse') @@ -763,12 +763,12 @@ class PyFtrl(mx.optimizer.Optimizer): \\eta_{t,i} = \\frac{learningrate}{\\beta+\\sqrt{\\sum_{s=1}^tg_{s,i}^t}} """ - def __init__(self, lamda1=0.01, learning_rate=0.1, beta=1, sparse_update=False, **kwargs): + def __init__(self, lamda1=0.01, learning_rate=0.1, beta=1, lazy_update=False, **kwargs): super(PyFtrl, self).__init__(**kwargs) self.lamda1 = lamda1 self.beta = beta self.lr = learning_rate - self.sparse_update = sparse_update + 
self.lazy_update = lazy_update def create_state(self, index, weight): return (mx.nd.zeros(weight.shape, weight.context, dtype=weight.dtype), # dn @@ -783,7 +783,7 @@ def update(self, index, weight, grad, state): dn, n = state for row in range(num_rows): all_zeros = mx.test_utils.almost_equal(grad[row].asnumpy(), np.zeros_like(grad[row].asnumpy())) - if all_zeros and self.sparse_update: + if all_zeros and self.lazy_update: continue grad[row] = grad[row] * self.rescale_grad if self.clip_gradient is not None: @@ -813,7 +813,7 @@ def test_ftrl(): {'clip_gradient': 0.5, 'wd': 0.07, 'lamda1': 1.0}] for kwarg in kwargs: compare_optimizer(opt1(**kwarg), opt2(**kwarg), shape, np.float32) - compare_optimizer(opt1(sparse_update=True, **kwarg), opt2(**kwarg), shape, + compare_optimizer(opt1(lazy_update=True, **kwarg), opt2(**kwarg), shape, np.float32, w_stype='row_sparse', g_stype='row_sparse') def test_nadam(): From ae77b611a2b10cdc80d5a6e2279474af10e62662 Mon Sep 17 00:00:00 2001 From: ZiyueHuang Date: Sat, 10 Mar 2018 14:11:13 +0000 Subject: [PATCH 16/18] clean up --- src/operator/optimizer_op-inl.h | 105 ++++++------------------------ src/operator/optimizer_op.cu | 110 ++++++++++---------------------- 2 files changed, 50 insertions(+), 165 deletions(-) diff --git a/src/operator/optimizer_op-inl.h b/src/operator/optimizer_op-inl.h index 5a434250a177..667ab17ee8cc 100644 --- a/src/operator/optimizer_op-inl.h +++ b/src/operator/optimizer_op-inl.h @@ -756,7 +756,7 @@ inline void AdamUpdate(const nnvm::NodeAttrs& attrs, * The kernel assumes dense weight/mean/var, and row_sparse gradient */ template -struct AdamDnsRspDnsKernel { +struct AdamDnsRspDnsKernelByRow { template MSHADOW_XINLINE static void Map(int i, const nnvm::dim_t row_length, DType* out_data, DType* mean_data, DType* var_data, const DType* weight_data, const IType* grad_idx, @@ -788,7 +788,7 @@ struct AdamDnsRspDnsKernel { }; template -struct AdamDnsRspDnsKernelV1 { +struct AdamDnsRspDnsKernelByElem { template MSHADOW_XINLINE static void Map(int i, const nnvm::dim_t row_length, DType* out_data, DType* mean_data, DType* var_data, const DType* weight_data, const IType* grad_idx, @@ -814,47 +814,6 @@ struct AdamDnsRspDnsKernelV1 { } }; -template -inline void AdamUpdateDnsRspDnsImplV1(const AdamParam& param, - const OpContext& ctx, - const TBlob& weight, - const NDArray& grad, - const TBlob& mean, - const TBlob& var, - const OpReqType& req, - TBlob *out) { - using namespace mxnet_op; - using namespace rowsparse; - Stream* s = ctx.get_stream(); - if (!grad.storage_initialized() || req == kNullOp) return; - CHECK_EQ(req, kWriteInplace) << "kWriteInplace is expected for sparse adam_update"; - CHECK_GT(weight.shape_.Size(), 0); - CHECK_GT(mean.shape_.Size(), 0); - CHECK_GT(var.shape_.Size(), 0); - - MSHADOW_REAL_TYPE_SWITCH(weight.type_flag_, DType, { - MSHADOW_IDX_TYPE_SWITCH(grad.aux_type(kIdx), IType, { - MXNET_ASSIGN_REQ_SWITCH(req, req_type, { - const DType* weight_data = weight.dptr(); - const IType* grad_idx = grad.aux_data(kIdx).dptr(); - const DType* grad_val = grad.data().dptr(); - DType* mean_data = mean.dptr(); - DType* var_data = var.dptr(); - DType* out_data = out->dptr(); - nnvm::dim_t num_rows = grad.aux_shape(kIdx)[0]; - const auto row_length = weight.shape_.ProdShape(1, weight.ndim()); - - Kernel, xpu>::Launch(s, num_rows * row_length, row_length, - out_data, mean_data, var_data, weight_data, grad_idx, grad_val, - static_cast(param.clip_gradient), static_cast(param.beta1), - static_cast(param.beta2), static_cast(param.lr), - 
static_cast(param.wd), static_cast(param.epsilon), - static_cast(param.rescale_grad)); - }); - }); - }); -} - template inline void AdamUpdateDnsRspDnsImpl(const AdamParam& param, const OpContext& ctx, @@ -884,12 +843,21 @@ inline void AdamUpdateDnsRspDnsImpl(const AdamParam& param, DType* out_data = out->dptr(); nnvm::dim_t num_rows = grad.aux_shape(kIdx)[0]; const auto row_length = weight.shape_.ProdShape(1, weight.ndim()); - Kernel, xpu>::Launch(s, num_rows, row_length, - out_data, mean_data, var_data, weight_data, grad_idx, grad_val, - static_cast(param.clip_gradient), static_cast(param.beta1), - static_cast(param.beta2), static_cast(param.lr), - static_cast(param.wd), static_cast(param.epsilon), - static_cast(param.rescale_grad)); + if (std::is_same::value) { + Kernel, xpu>::Launch(s, num_rows * row_length, + row_length, out_data, mean_data, var_data, weight_data, grad_idx, grad_val, + static_cast(param.clip_gradient), static_cast(param.beta1), + static_cast(param.beta2), static_cast(param.lr), + static_cast(param.wd), static_cast(param.epsilon), + static_cast(param.rescale_grad)); + } else { + Kernel, xpu>::Launch(s, num_rows, row_length, + out_data, mean_data, var_data, weight_data, grad_idx, grad_val, + static_cast(param.clip_gradient), static_cast(param.beta1), + static_cast(param.beta2), static_cast(param.lr), + static_cast(param.wd), static_cast(param.epsilon), + static_cast(param.rescale_grad)); + } }); }); }); @@ -921,47 +889,10 @@ inline void AdamUpdateRspRspRspImpl(const AdamParam& param, } TBlob out_blob = out->data(); // reuse dns rsp implementation when storage_shape == shape - int version = dmlc::GetEnv("ADAM_VERSION", 0); - if (version == 0) { - AdamUpdateDnsRspDnsImpl(param, ctx, weight.data(), grad, mean.data(), + AdamUpdateDnsRspDnsImpl(param, ctx, weight.data(), grad, mean.data(), var.data(), req, &out_blob); - } else if (version == 1) { - AdamUpdateDnsRspDnsImplV1(param, ctx, weight.data(), grad, mean.data(), var.data(), req, &out_blob); - } else { - LOG(FATAL) << "NOT IMPLEMENTED VERSION" << version; - } } -template -struct AdamStdDnsRspDnsKernelV1 { - template - MSHADOW_XINLINE static void Map(int i, const nnvm::dim_t row_length, DType* out_data, - DType* mean_data, DType* var_data, const DType* weight_data, const IType* grad_idx, - const DType* grad_data, const RType* prefix_sum, const DType clip_gradient, - const DType beta1, const DType beta2, const DType lr, const DType wd, - const DType epsilon, const DType rescale_grad) { - using namespace mshadow_op; - using nnvm::dim_t; - const dim_t row_id = i / row_length; - const dim_t col_id = i % row_length; - const bool non_zero = (row_id == 0) ? prefix_sum[0] > 0 - : prefix_sum[row_id] > prefix_sum[row_id - 1]; - const RType grad_offset = (prefix_sum[row_id] - 1) * row_length + col_id; - DType grad_rescaled = non_zero ? 
static_cast(grad_data[grad_offset] * rescale_grad - + weight_data[i] * wd) - : static_cast(weight_data[i] * wd); - if (clip_gradient >= 0.0f) { - grad_rescaled = clip::Map(grad_rescaled, clip_gradient); - } - mean_data[i] = beta1 * mean_data[i] + (1.f - beta1) * grad_rescaled; - var_data[i] = beta2 * var_data[i] + - (1.f - beta2) * square::Map(grad_rescaled); - KERNEL_ASSIGN(out_data[i], req, weight_data[i] - lr * mean_data[i] / - (square_root::Map(var_data[i]) + epsilon)); - } -}; - - template struct AdamStdDnsRspDnsKernel { template diff --git a/src/operator/optimizer_op.cu b/src/operator/optimizer_op.cu index 049f65e395d4..f294664bd75c 100644 --- a/src/operator/optimizer_op.cu +++ b/src/operator/optimizer_op.cu @@ -94,73 +94,34 @@ void SGDMomStdUpdateDnsRspDnsImpl(const SGDMomParam& param, }); } -void AdamStdUpdateDnsRspDnsImplV1(const AdamParam& param, - const OpContext& ctx, - const TBlob& weight, - const NDArray& grad, - const TBlob& mean, - const TBlob& var, - const OpReqType& req, - TBlob *out) { - using namespace mxnet_op; - using namespace rowsparse; - using namespace mshadow; - Stream* s = ctx.get_stream(); - if (req == kNullOp) return; - CHECK_EQ(req, kWriteInplace) << "kWriteInplace is expected for sparse adam_update"; - CHECK_GT(weight.shape_.Size(), 0); - CHECK_GT(mean.shape_.Size(), 0); - CHECK_GT(var.shape_.Size(), 0); - - MSHADOW_REAL_TYPE_SWITCH(weight.type_flag_, DType, { - MSHADOW_IDX_TYPE_SWITCH(grad.aux_type(kIdx), IType, { - MXNET_ASSIGN_REQ_SWITCH(req, req_type, { - const DType* weight_data = weight.dptr(); - const IType* grad_idx = grad.aux_data(kIdx).dptr(); - const DType* grad_val = grad.data().dptr(); - DType* mean_data = mean.dptr(); - DType* var_data = var.dptr(); - DType* out_data = out->dptr(); - nnvm::dim_t num_rows = weight.shape_[0]; - nnvm::dim_t row_length = weight.shape_.ProdShape(1, weight.ndim()); - nnvm::dim_t* prefix_sum = NULL; - void* d_temp_storage = NULL; - size_t temp_storage_bytes = 0; - cub::DeviceScan::InclusiveSum(d_temp_storage, - temp_storage_bytes, - prefix_sum, - prefix_sum, - num_rows, - Stream::GetStream(s)); - Tensor workspace = ctx.requested[0] - .get_space_typed(Shape1(num_rows * sizeof(nnvm::dim_t) + - temp_storage_bytes), s); - prefix_sum = reinterpret_cast(workspace.dptr_); - d_temp_storage = workspace.dptr_ + num_rows*sizeof(nnvm::dim_t); - // mark row flags - Fill(s, TBlob(prefix_sum, Shape1(num_rows), gpu::kDevMask), kWriteTo, 0); - if (grad.storage_initialized()) { - Kernel::Launch(s, grad.aux_shape(kIdx)[0], - prefix_sum, grad_idx); - // calculate inclusive prefix sum - cub::DeviceScan::InclusiveSum(d_temp_storage, - temp_storage_bytes, - prefix_sum, - prefix_sum, - num_rows, - Stream::GetStream(s)); - } - - Kernel, gpu>::Launch(s, num_rows * row_length, row_length, - out_data, mean_data, var_data, weight_data, grad_idx, grad_val, prefix_sum, - static_cast(param.clip_gradient), static_cast(param.beta1), - static_cast(param.beta2), static_cast(param.lr), - static_cast(param.wd), static_cast(param.epsilon), - static_cast(param.rescale_grad)); - }); - }); - }); -} +template +struct AdamStdDnsRspDnsKernelByElem { + template + MSHADOW_XINLINE static void Map(int i, const nnvm::dim_t row_length, DType* out_data, + DType* mean_data, DType* var_data, const DType* weight_data, const IType* grad_idx, + const DType* grad_data, const RType* prefix_sum, const DType clip_gradient, + const DType beta1, const DType beta2, const DType lr, const DType wd, + const DType epsilon, const DType rescale_grad) { + using namespace mshadow_op; + 
using nnvm::dim_t; + const dim_t row_id = i / row_length; + const dim_t col_id = i % row_length; + const bool non_zero = (row_id == 0) ? prefix_sum[0] > 0 + : prefix_sum[row_id] > prefix_sum[row_id - 1]; + const RType grad_offset = (prefix_sum[row_id] - 1) * row_length + col_id; + DType grad_rescaled = non_zero ? static_cast(grad_data[grad_offset] * rescale_grad + + weight_data[i] * wd) + : static_cast(weight_data[i] * wd); + if (clip_gradient >= 0.0f) { + grad_rescaled = clip::Map(grad_rescaled, clip_gradient); + } + mean_data[i] = beta1 * mean_data[i] + (1.f - beta1) * grad_rescaled; + var_data[i] = beta2 * var_data[i] + + (1.f - beta2) * square::Map(grad_rescaled); + KERNEL_ASSIGN(out_data[i], req, weight_data[i] - lr * mean_data[i] / + (square_root::Map(var_data[i]) + epsilon)); + } +}; template<> void AdamStdUpdateDnsRspDnsImpl(const AdamParam& param, @@ -171,10 +132,6 @@ void AdamStdUpdateDnsRspDnsImpl(const AdamParam& param, const TBlob& var, const OpReqType& req, TBlob *out) { - int version = dmlc::GetEnv("ADAM_VERSION", 4); - if (version == 5) { - AdamStdUpdateDnsRspDnsImplV1(param, ctx, weight, grad, mean, var, req, out); - } else if (version == 4) { using namespace mxnet_op; using namespace rowsparse; using namespace mshadow; @@ -194,8 +151,8 @@ void AdamStdUpdateDnsRspDnsImpl(const AdamParam& param, DType* mean_data = mean.dptr(); DType* var_data = var.dptr(); DType* out_data = out->dptr(); - nnvm::dim_t num_rows = weight.shape_[0]; - nnvm::dim_t row_length = weight.shape_.ProdShape(1, weight.ndim()); + const nnvm::dim_t num_rows = weight.shape_[0]; + const nnvm::dim_t row_length = weight.shape_.ProdShape(1, weight.ndim()); nnvm::dim_t* prefix_sum = NULL; void* d_temp_storage = NULL; size_t temp_storage_bytes = 0; @@ -224,8 +181,8 @@ void AdamStdUpdateDnsRspDnsImpl(const AdamParam& param, Stream::GetStream(s)); } - Kernel, gpu>::Launch(s, num_rows, row_length, - out_data, mean_data, var_data, weight_data, grad_idx, grad_val, prefix_sum, + Kernel, gpu>::Launch(s, weight.shape_.Size(), + row_length, out_data, mean_data, var_data, weight_data, grad_idx, grad_val, prefix_sum, static_cast(param.clip_gradient), static_cast(param.beta1), static_cast(param.beta2), static_cast(param.lr), static_cast(param.wd), static_cast(param.epsilon), @@ -233,9 +190,6 @@ void AdamStdUpdateDnsRspDnsImpl(const AdamParam& param, }); }); }); - } else { - LOG(FATAL) << "NOT IMPLEMENTED"; - } } NNVM_REGISTER_OP(signsgd_update) From 63ea004952c0ee6f99ea8d7600f179b8704e81a5 Mon Sep 17 00:00:00 2001 From: ZiyueHuang Date: Wed, 21 Mar 2018 20:42:42 +0000 Subject: [PATCH 17/18] refactor --- benchmark/python/sparse/updater.py | 78 ++++++++++++++++++++++++++++++ src/operator/optimizer_op-inl.h | 67 ++++++------------------- src/operator/optimizer_op.cc | 39 ++++++++++++++- src/operator/optimizer_op.cu | 4 +- 4 files changed, 134 insertions(+), 54 deletions(-) create mode 100644 benchmark/python/sparse/updater.py diff --git a/benchmark/python/sparse/updater.py b/benchmark/python/sparse/updater.py new file mode 100644 index 000000000000..72f2bfd04a27 --- /dev/null +++ b/benchmark/python/sparse/updater.py @@ -0,0 +1,78 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +import time +import mxnet as mx +from mxnet.ndarray.sparse import adam_update +import numpy as np +import argparse + +mx.random.seed(0) +np.random.seed(0) + +parser = argparse.ArgumentParser(description='Benchmark adam updater') +parser.add_argument('--dim-in', type=int, default=240000, help='weight.shape[0]') +parser.add_argument('--dim-out', type=int, default=512, help='weight.shape[1]') +parser.add_argument('--nnr', type=int, default=5000, help='grad.indices.shape[0]') +parser.add_argument('--repeat', type=int, default=1000, help='num repeat') +parser.add_argument('--dense-grad', action='store_true', + help='if set to true, both gradient and weight are dense.') +parser.add_argument('--dense-state', action='store_true', + help='if set to true, states are dense, indicating standard update') +parser.add_argument('--cpu', action='store_true') + + +args = parser.parse_args() +dim_in = args.dim_in +dim_out = args.dim_out +nnr = args.nnr +ctx = mx.cpu() if args.cpu else mx.gpu() + +ones = mx.nd.ones((dim_in, dim_out), ctx=ctx) + +if not args.dense_grad: + weight = ones.tostype('row_sparse') + indices = np.arange(dim_in) + np.random.shuffle(indices) + indices = np.unique(indices[:nnr]) + indices = mx.nd.array(indices, ctx=ctx) + grad = mx.nd.sparse.retain(weight, indices) +else: + weight = ones.copy() + grad = ones.copy() + +if args.dense_state: + mean = ones.copy() +else: + mean = ones.tostype('row_sparse') + +var = mean.copy() + +# warmup +for i in range(10): + adam_update(weight, grad, mean, var, out=weight, lr=1, wd=0, beta1=0.9, + beta2=0.99, rescale_grad=0.5, epsilon=1e-8) +weight.wait_to_read() + +# measure speed +a = time.time() +for i in range(args.repeat): + adam_update(weight, grad, mean, var, out=weight, lr=1, wd=0, beta1=0.9, + beta2=0.99, rescale_grad=0.5, epsilon=1e-8) +weight.wait_to_read() +b = time.time() +print(b - a) diff --git a/src/operator/optimizer_op-inl.h b/src/operator/optimizer_op-inl.h index 667ab17ee8cc..a7ec1c14a16e 100644 --- a/src/operator/optimizer_op-inl.h +++ b/src/operator/optimizer_op-inl.h @@ -749,6 +749,9 @@ inline void AdamUpdate(const nnvm::NodeAttrs& attrs, }); } +template +struct AdamDnsRspDnsKernel; + /*! * Note: this kernel performs sparse adam update. 
For each row-slice in row_sparse * gradient, it finds the corresponding elements in weight, mean and var and performs @@ -756,7 +759,7 @@ inline void AdamUpdate(const nnvm::NodeAttrs& attrs, * The kernel assumes dense weight/mean/var, and row_sparse gradient */ template -struct AdamDnsRspDnsKernelByRow { +struct AdamDnsRspDnsKernel { template MSHADOW_XINLINE static void Map(int i, const nnvm::dim_t row_length, DType* out_data, DType* mean_data, DType* var_data, const DType* weight_data, const IType* grad_idx, @@ -787,8 +790,9 @@ struct AdamDnsRspDnsKernelByRow { } }; + template -struct AdamDnsRspDnsKernelByElem { +struct AdamDnsRspDnsKernel { template MSHADOW_XINLINE static void Map(int i, const nnvm::dim_t row_length, DType* out_data, DType* mean_data, DType* var_data, const DType* weight_data, const IType* grad_idx, @@ -843,21 +847,16 @@ inline void AdamUpdateDnsRspDnsImpl(const AdamParam& param, DType* out_data = out->dptr(); nnvm::dim_t num_rows = grad.aux_shape(kIdx)[0]; const auto row_length = weight.shape_.ProdShape(1, weight.ndim()); + size_t num_threads = num_rows; if (std::is_same::value) { - Kernel, xpu>::Launch(s, num_rows * row_length, - row_length, out_data, mean_data, var_data, weight_data, grad_idx, grad_val, - static_cast(param.clip_gradient), static_cast(param.beta1), - static_cast(param.beta2), static_cast(param.lr), - static_cast(param.wd), static_cast(param.epsilon), - static_cast(param.rescale_grad)); - } else { - Kernel, xpu>::Launch(s, num_rows, row_length, - out_data, mean_data, var_data, weight_data, grad_idx, grad_val, - static_cast(param.clip_gradient), static_cast(param.beta1), - static_cast(param.beta2), static_cast(param.lr), - static_cast(param.wd), static_cast(param.epsilon), - static_cast(param.rescale_grad)); + num_threads = num_rows * row_length; } + Kernel, xpu>::Launch(s, num_threads, + row_length, out_data, mean_data, var_data, weight_data, grad_idx, grad_val, + static_cast(param.clip_gradient), static_cast(param.beta1), + static_cast(param.beta2), static_cast(param.lr), + static_cast(param.wd), static_cast(param.epsilon), + static_cast(param.rescale_grad)); }); }); }); @@ -893,42 +892,8 @@ inline void AdamUpdateRspRspRspImpl(const AdamParam& param, var.data(), req, &out_blob); } -template -struct AdamStdDnsRspDnsKernel { - template - MSHADOW_XINLINE static void Map(int i, const nnvm::dim_t row_length, DType* out_data, - DType* mean_data, DType* var_data, const DType* weight_data, const IType* grad_idx, - const DType* grad_data, const RType* prefix_sum, const DType clip_gradient, - const DType beta1, const DType beta2, const DType lr, const DType wd, - const DType epsilon, const DType rescale_grad) { - using namespace mshadow_op; - const bool non_zero = (i == 0) ? prefix_sum[0] > 0 - : prefix_sum[i] > prefix_sum[i-1]; - - const index_t row_i = i * row_length; - const RType grad_i = (prefix_sum[i]-1) * row_length; - for (index_t j = 0; j < row_length; j++) { - const index_t data_i = row_i + j; - const DType grad_rescaled = non_zero ? 
static_cast( - grad_data[grad_i + j] * rescale_grad + - weight_data[data_i] * wd) - : static_cast(weight_data[data_i] * wd); - if (clip_gradient >= 0.0f) { - mean_data[data_i] = beta1 * mean_data[data_i] + (1.f - beta1) * - clip::Map(grad_rescaled, clip_gradient); - var_data[data_i] = beta2 * var_data[data_i] + (1.f - beta2) * square::Map( - clip::Map(grad_rescaled, clip_gradient)); - } else { - mean_data[data_i] = beta1 * mean_data[data_i] + (1.f - beta1) * grad_rescaled; - var_data[data_i] = beta2 * var_data[data_i] + - (1.f - beta2) * square::Map(grad_rescaled); - } - KERNEL_ASSIGN(out_data[data_i], req, weight_data[data_i] - lr * mean_data[data_i] / - (square_root::Map(var_data[data_i]) + epsilon)); - } - } -}; - +template +struct AdamStdDnsRspDnsKernel; template void AdamStdUpdateDnsRspDnsImpl(const AdamParam& param, diff --git a/src/operator/optimizer_op.cc b/src/operator/optimizer_op.cc index 136769a1bf01..c4f205a2e951 100644 --- a/src/operator/optimizer_op.cc +++ b/src/operator/optimizer_op.cc @@ -148,6 +148,43 @@ void SGDMomStdUpdateDnsRspDnsImpl(const SGDMomParam& param, }); } +template +struct AdamStdDnsRspDnsKernel { + template + MSHADOW_XINLINE static void Map(int i, const nnvm::dim_t row_length, DType* out_data, + DType* mean_data, DType* var_data, const DType* weight_data, const IType* grad_idx, + const DType* grad_data, const RType* prefix_sum, const DType clip_gradient, + const DType beta1, const DType beta2, const DType lr, const DType wd, + const DType epsilon, const DType rescale_grad) { + using namespace mshadow_op; + const bool non_zero = (i == 0) ? prefix_sum[0] > 0 + : prefix_sum[i] > prefix_sum[i-1]; + + const index_t row_i = i * row_length; + const RType grad_i = (prefix_sum[i]-1) * row_length; + for (index_t j = 0; j < row_length; j++) { + const index_t data_i = row_i + j; + const DType grad_rescaled = non_zero ? 
static_cast( + grad_data[grad_i + j] * rescale_grad + + weight_data[data_i] * wd) + : static_cast(weight_data[data_i] * wd); + if (clip_gradient >= 0.0f) { + mean_data[data_i] = beta1 * mean_data[data_i] + (1.f - beta1) * + clip::Map(grad_rescaled, clip_gradient); + var_data[data_i] = beta2 * var_data[data_i] + (1.f - beta2) * square::Map( + clip::Map(grad_rescaled, clip_gradient)); + } else { + mean_data[data_i] = beta1 * mean_data[data_i] + (1.f - beta1) * grad_rescaled; + var_data[data_i] = beta2 * var_data[data_i] + + (1.f - beta2) * square::Map(grad_rescaled); + } + KERNEL_ASSIGN(out_data[data_i], req, weight_data[data_i] - lr * mean_data[data_i] / + (square_root::Map(var_data[data_i]) + epsilon)); + } + } +}; + + template<> void AdamStdUpdateDnsRspDnsImpl(const AdamParam& param, const OpContext& ctx, @@ -193,7 +230,7 @@ void AdamStdUpdateDnsRspDnsImpl(const AdamParam& param, } } - Kernel, cpu>::Launch(s, num_rows, row_length, + Kernel, cpu>::Launch(s, num_rows, row_length, out_data, mean_data, var_data, weight_data, grad_idx, grad_val, prefix_sum, static_cast(param.clip_gradient), static_cast(param.beta1), static_cast(param.beta2), static_cast(param.lr), diff --git a/src/operator/optimizer_op.cu b/src/operator/optimizer_op.cu index f294664bd75c..533a15b99924 100644 --- a/src/operator/optimizer_op.cu +++ b/src/operator/optimizer_op.cu @@ -95,7 +95,7 @@ void SGDMomStdUpdateDnsRspDnsImpl(const SGDMomParam& param, } template -struct AdamStdDnsRspDnsKernelByElem { +struct AdamStdDnsRspDnsKernel { template MSHADOW_XINLINE static void Map(int i, const nnvm::dim_t row_length, DType* out_data, DType* mean_data, DType* var_data, const DType* weight_data, const IType* grad_idx, @@ -181,7 +181,7 @@ void AdamStdUpdateDnsRspDnsImpl(const AdamParam& param, Stream::GetStream(s)); } - Kernel, gpu>::Launch(s, weight.shape_.Size(), + Kernel, gpu>::Launch(s, weight.shape_.Size(), row_length, out_data, mean_data, var_data, weight_data, grad_idx, grad_val, prefix_sum, static_cast(param.clip_gradient), static_cast(param.beta1), static_cast(param.beta2), static_cast(param.lr), From 44cf37135d0a690c2b5bae97b439a2a66609499c Mon Sep 17 00:00:00 2001 From: ZiyueHuang Date: Wed, 21 Mar 2018 22:55:18 +0000 Subject: [PATCH 18/18] fix test --- tests/python/unittest/test_optimizer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/python/unittest/test_optimizer.py b/tests/python/unittest/test_optimizer.py index 1242a20d16cc..bbd7845f66f3 100644 --- a/tests/python/unittest/test_optimizer.py +++ b/tests/python/unittest/test_optimizer.py @@ -638,7 +638,7 @@ def test_adam(): compare_optimizer(opt1(**kwarg), opt2(**kwarg), shape, dtype, rtol=1e-4, atol=2e-5) # atol 2e-5 needed to pass with seed 781809840 - compare_optimizer(opt1(sparse_update=True, **kwarg), opt2(**kwarg), shape, + compare_optimizer(opt1(lazy_update=True, **kwarg), opt2(**kwarg), shape, dtype, w_stype='row_sparse', g_stype='row_sparse', rtol=1e-4, atol=2e-5) compare_optimizer(opt1(**kwarg), opt2(lazy_update=False, **kwarg), shape,
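For reference, the "lazy" row-sparse Adam update that the kernels, benchmark, and tests in the patches above exercise can be summarized in a short NumPy sketch. This is an illustrative reconstruction, not MXNet code: the function name lazy_adam_update and the (grad_rows, grad_vals) pair standing in for a row_sparse gradient's indices and values are hypothetical, and lr is assumed to be bias-corrected by the caller, as it is for the C++ kernels.

import numpy as np

def lazy_adam_update(weight, grad_rows, grad_vals, mean, var, lr,
                     beta1=0.9, beta2=0.999, epsilon=1e-8,
                     wd=0.0, rescale_grad=1.0, clip_gradient=-1.0):
    # Only rows that appear in the sparse gradient are touched; every other
    # row of weight/mean/var is skipped entirely. This is what distinguishes
    # the lazy update from the standard (dense) update, which also applies
    # weight decay and mean/var decay to the untouched rows.
    for r, g_row in zip(grad_rows, grad_vals):
        g = g_row * rescale_grad + wd * weight[r]
        if clip_gradient >= 0.0:  # a negative clip_gradient disables clipping
            g = np.clip(g, -clip_gradient, clip_gradient)
        mean[r] = beta1 * mean[r] + (1.0 - beta1) * g
        var[r] = beta2 * var[r] + (1.0 - beta2) * g * g
        weight[r] -= lr * mean[r] / (np.sqrt(var[r]) + epsilon)

The per-row body mirrors the arithmetic in AdamDnsRspDnsKernel; the element-wise GPU variant flattens the same loop over num_rows * row_length threads instead of num_rows.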